Source code for stream.preprocessor._tf_idf
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute class-based TF-IDF scores for a collection of documents.

    Each entry of ``documents`` is treated as one class (for example, all
    texts of a cluster joined into a single string). English stop words are
    removed before counting.

    Args:
        documents: Iterable of strings, one per class.
        m: Numerator of the IDF ratio ``log(m / term_total)``. Presumably the
            total number of original (un-joined) documents -- confirm with
            the caller.
        ngram_range (tuple, optional): ``(min_n, max_n)`` n-gram sizes passed
            to ``CountVectorizer``. Defaults to (1, 1).

    Returns:
        tuple: ``(tf_idf, count)`` where ``tf_idf`` is a dense float array of
        shape ``(n_terms, n_documents)`` and ``count`` is the fitted
        ``CountVectorizer``.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english")
    # Learn the vocabulary and build the document-term matrix in one pass.
    t = count.fit_transform(documents).toarray()

    # Number of counted terms per document.
    w = t.sum(axis=1)
    # A document made only of stop words has w == 0; leave its term
    # frequencies at zero instead of producing a column of NaN (0 / 0).
    tf = np.divide(
        t.T,
        w,
        out=np.zeros(t.T.shape, dtype=float),
        where=w != 0,
    )

    # Every vocabulary term occurs at least once in `documents`, so the
    # per-term totals are never zero here.
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)

    tf_idf = np.multiply(tf, idf)
    return tf_idf, count
def extract_tfidf_topics(tf_idf, count, docs_per_topic, n=10):
    """Return the ``n`` highest-scoring words for every topic.

    Args:
        tf_idf: 2-D score array indexed ``[word, topic]``, such as the one
            returned by ``c_tf_idf``. It is transposed internally so that
            each row holds the scores of one topic.
        count: Fitted vectorizer exposing ``get_feature_names_out()``; entry
            ``j`` of its result names the word stored in row ``j`` of
            ``tf_idf``.
        docs_per_topic: Object with a ``predictions`` attribute holding the
            topic labels; label ``i`` is paired with column ``i`` of
            ``tf_idf``.
        n (int, optional): Number of words to keep per topic. Defaults to 10.
            Values less than or equal to zero yield empty word lists.

    Returns:
        dict: Maps each topic label to a list of ``(word, score)`` tuples
        ordered from highest to lowest score.
    """
    words = count.get_feature_names_out()
    labels = list(docs_per_topic.predictions)
    scores_by_topic = tf_idf.T

    # Slicing with ``[-n:]`` would return *every* column for n == 0 (since
    # -0 == 0), so reverse the ascending order first and take a prefix.
    top_k = max(n, 0)
    ranked = scores_by_topic.argsort()[:, ::-1][:, :top_k]

    top_n_words = {
        label: [(words[j], scores_by_topic[i][j]) for j in ranked[i]]
        for i, label in enumerate(labels)
    }
    return top_n_words
def extract_topic_sizes(df):
    """
    Count how many documents belong to each topic.

    The frame is grouped on its 'Topic' column and the non-null entries of
    the 'docs' column are counted per group. The counts are returned as a
    two-column frame, largest topic first.

    Parameters:
        df (pandas.DataFrame): Must contain a 'Topic' column (topic IDs) and
                               a 'docs' column (the documents assigned to
                               those topics).

    Returns:
        pandas.DataFrame: Columns 'Topic' and 'Size', sorted by 'Size' in
                          descending order. The index is not reset after
                          sorting.
    """
    column_names = {"Topic": "Topic", "docs": "Size"}

    # One row per topic holding the number of non-null documents.
    docs_per_topic = df.groupby(["Topic"])["docs"].count()
    sizes = docs_per_topic.reset_index().rename(columns=column_names)

    return sizes.sort_values(by="Size", ascending=False)