# Source code for stream.preprocessor._tf_idf

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer


def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute class-based TF-IDF scores for a collection of text entries.

    Every entry of ``documents`` is vectorized as a single unit (for
    class-based TF-IDF this is presumably all documents of one cluster joined
    into one string -- confirm against the caller). English stop words are
    removed by the vectorizer.

    The term frequency of a term in an entry is its count divided by the total
    number of counted terms in that entry. The inverse document frequency of a
    term is ``log(m / total count of that term over all entries)``.

    Args:
        documents: Iterable of raw text strings handed to ``CountVectorizer``.
        m: Numerator of the IDF ratio (presumably the total number of
            original, un-clustered documents -- confirm against the caller).
        ngram_range (tuple, optional): ``(min_n, max_n)`` n-gram sizes
            forwarded to ``CountVectorizer``. Defaults to (1, 1).

    Returns:
        tuple: ``(tf_idf, count)`` where ``tf_idf`` is a 2-D float array of
        shape ``(n_terms, n_entries)`` and ``count`` is the fitted
        ``CountVectorizer`` (its vocabulary gives the row order of
        ``tf_idf``).
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english")
    # One pass learns the vocabulary and builds the count matrix,
    # shape (n_entries, n_terms).
    t = count.fit_transform(documents).toarray()

    # Total number of counted terms per entry.
    w = t.sum(axis=1)

    # An entry that contains only stop words has w == 0; dividing would give
    # 0/0 = NaN for its whole column, and NaN sorts as the *largest* value in
    # a later argsort. Leave such columns at 0 instead.
    tf = np.divide(
        t.T,
        w,
        out=np.zeros(t.T.shape, dtype=float),
        where=w != 0,
    )

    # Every vocabulary term occurs at least once in the fitted entries, so
    # sum_t is never zero here.
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)

    # idf is a column vector and broadcasts across the entries of each term.
    tf_idf = np.multiply(tf, idf)
    return tf_idf, count
def extract_tfidf_topics(tf_idf, count, docs_per_topic, n=10):
    """Pick the ``n`` highest-scoring words for every topic.

    Args:
        tf_idf: 2-D score array with one row per vocabulary term and one
            column per topic (presumably the array returned by ``c_tf_idf``
            -- confirm against the caller). Column ``i`` is paired with the
            ``i``-th label in ``docs_per_topic.predictions``.
        count: Fitted vectorizer exposing ``get_feature_names_out()``; the
            returned names must be in the same order as the rows of
            ``tf_idf``.
        docs_per_topic: Object with a ``predictions`` attribute that iterates
            over the topic labels (e.g. a DataFrame column).
        n (int, optional): Number of words to keep per topic. Values of 0 or
            less yield an empty word list for every topic. Defaults to 10.

    Returns:
        dict: Maps each topic label to a list of ``(word, score)`` tuples
        ordered from highest to lowest score, with at most ``n`` items.
    """
    words = count.get_feature_names_out()
    labels = list(docs_per_topic.predictions)

    # A slice of the form [-0:] selects *everything*, so n == 0 would return
    # the full vocabulary instead of nothing; negative n is equally
    # meaningless. Handle both explicitly.
    if n <= 0:
        return {label: [] for label in labels}

    # One row per topic, one column per term.
    scores_by_topic = tf_idf.T

    # argsort is ascending: keep the last n columns, then flip so that the
    # best-scoring term comes first.
    top_indices = scores_by_topic.argsort()[:, -n:][:, ::-1]

    return {
        label: [(words[j], scores_by_topic[i][j]) for j in top_indices[i]]
        for i, label in enumerate(labels)
    }
def extract_topic_sizes(df):
    """Count how many documents belong to each topic.

    Rows are grouped by the ``Topic`` column and the non-null entries of the
    ``docs`` column are counted per group.

    Args:
        df (pandas.DataFrame): Frame with at least a ``Topic`` column (topic
            identifier) and a ``docs`` column (one document per row).

    Returns:
        pandas.DataFrame: Two columns, ``Topic`` and ``Size``, sorted by
        ``Size`` from largest to smallest.
    """
    # Series indexed by topic id holding the per-topic document count.
    docs_per_topic = df.groupby(by=["Topic"])["docs"].count()

    # Move the topic id back into a regular column and label the count.
    size_table = docs_per_topic.reset_index()
    size_table = size_table.rename(columns={"docs": "Size"})

    return size_table.sort_values("Size", ascending=False)