Source code for stream.preprocessor.topic_extraction

import re
from itertools import compress

import numpy as np
import pandas as pd
from gensim.models.keyedvectors import Word2VecKeyedVectors
from nltk import pos_tag
from nltk.corpus import brown as nltk_words
from nltk.corpus import words as eng_dict
from numpy.linalg import norm
from octis.dataset.dataset import Dataset as OCDataset

from ._embedder import BaseEmbedder


class TopicExtractor:
    """Extract the most representative words of each topic.

    Topic centroids are computed as weighted averages of document embeddings
    and compared, via cosine similarity, to the embeddings of candidate words
    taken from a reference vocabulary.

    Args:
        dataset: Dataset object; only its ``get_vocabulary()`` method is used
            here (when ``corpus="octis"``).
        topic_assignments: Indexable by topic id. ``topic_assignments[t]`` is
            converted to a 1-D array and broadcast against the rows of the
            document embeddings (presumably one weight per document --
            confirm against the caller).
        n_topics (int): Number of topics to build centroids for.
        embedding_model: Passed unchanged to ``BaseEmbedder``.
    """

    # Removes every run of non-alphanumeric characters (and trailing spaces).
    _NON_ALNUM = re.compile(r"[^a-zA-Z0-9]+\s*")
    # OCTIS datasets whose vocabularies are merged into the candidate words.
    _OCTIS_DATASETS = ("20NewsGroup", "M10", "BBC_News")

    def __init__(
        self,
        dataset,
        topic_assignments,
        n_topics,
        embedding_model,
    ):
        self.dataset = dataset
        self.topic_assignments = topic_assignments
        self.embedder = BaseEmbedder(embedding_model)
        self.n_topics = n_topics

    def _noun_extractor_haystack(self, embeddings, n, corpus="octis", only_nouns=True):
        """Return the ``n`` words closest to each topic centroid.

        Candidate words are collected from the chosen corpus, optionally
        restricted to nouns, embedded, and ranked per topic by cosine
        similarity to the topic centroid. The similarity is computed directly
        with NumPy as::

            np.inner(centroids, words) / np.multiply.outer(
                norm(centroids, axis=1), norm(words, axis=1)
            )

        Args:
            embeddings: Document embeddings, one row per document, used to
                compute the topic centroids.
            n (int): Number of top words to return per topic.
            corpus (str): Source of candidate words: ``"brown"`` (NLTK Brown
                corpus), ``"words"`` (NLTK English word list) or ``"octis"``
                (vocabularies of several OCTIS datasets plus the vocabulary
                of ``self.dataset``).
            only_nouns (bool): If True, keep only words whose POS tag starts
                with ``"NN"``.

        Returns:
            tuple: ``(topics, centroids)`` where ``topics`` maps each topic
            index to a list of ``(word, score)`` pairs sorted by decreasing
            similarity, and ``centroids`` is the list of topic centroids.

        Raises:
            ValueError: If ``corpus`` is not one of the supported names or if
                ``embeddings`` is empty.
        """
        word_list = self._candidate_words(corpus, only_nouns)

        # Embed the candidate words.
        nouns = self.embedder.create_word_embeddings(word_list)

        if isinstance(self.embedder.embedder, Word2VecKeyedVectors):
            # Drop the words for which no embedding was returned (null entries).
            found = ~pd.isnull(nouns)
            word_list = list(compress(word_list, list(found)))
            nouns = nouns[found]

        # A 1-D (object) array of per-word vectors must become a 2-D matrix
        # before norms can be taken along axis 1.
        if hasattr(nouns, "shape") and len(nouns.shape) < 2:
            nouns = np.stack(list(nouns))

        mean_embeddings = self._topic_centroids(embeddings)
        top_indices, top_scores = self._rank_by_cosine(mean_embeddings, nouns, n)

        topics_ = {}
        for topic_id, (indices, scores) in enumerate(zip(top_indices, top_scores)):
            # Words are kept as NumPy string scalars, as in earlier versions.
            words = np.array([word_list[i] for i in indices])
            topics_[topic_id] = list(zip(words, scores))

        # Return the topics and the centroid of each topic.
        return topics_, mean_embeddings

    def _candidate_words(self, corpus, only_nouns):
        """Build the sorted, de-duplicated list of cleaned candidate words."""
        if corpus == "brown":
            raw_words = nltk_words.words()
        elif corpus == "words":
            raw_words = eng_dict.words()
        elif corpus == "octis":
            data = OCDataset()
            raw_words = []
            for dataset_name in self._OCTIS_DATASETS:
                data.fetch_dataset(dataset_name)
                raw_words.extend(data.get_vocabulary())
            raw_words.extend(self.dataset.get_vocabulary())
        else:
            raise ValueError(
                "There are no words to be extracted for the Topics: Please specify a corpus"
            )

        cleaned = [self._NON_ALNUM.sub("", word.lower().strip()) for word in raw_words]

        if only_nouns:
            # Tag the full cleaned sequence (the tagger uses neighbouring
            # tokens), then keep the non-empty nouns.
            candidates = {
                word for word, tag in pos_tag(cleaned) if word and tag[:2] == "NN"
            }
        else:
            # No tags are needed, so the expensive tagging step is skipped.
            candidates = {word for word in cleaned if word}

        # Sorting makes the word order, and thus tie-breaking between equally
        # similar words, reproducible across runs.
        return sorted(candidates)

    def _topic_centroids(self, embeddings):
        """Return one weighted-average document embedding per topic."""
        n_docs = len(embeddings)
        if n_docs == 0:
            raise ValueError(
                "Cannot compute topic centroids without document embeddings."
            )

        centroids = []
        for topic in range(self.n_topics):
            weights = np.array(self.topic_assignments[topic])[:, np.newaxis]
            # The weighted sum is divided by the number of documents, not by
            # the sum of the weights; cosine similarity is scale-invariant.
            centroids.append(np.multiply(weights, embeddings).sum(axis=0) / n_docs)
        return centroids

    @staticmethod
    def _rank_by_cosine(centroids, word_vectors, n):
        """Return indices and scores of the ``n`` most similar words per centroid."""
        # Zero-norm vectors give 0/0 = NaN; silence the warning and deal with
        # those entries explicitly below.
        with np.errstate(divide="ignore", invalid="ignore"):
            similarity = np.inner(centroids, word_vectors) / np.multiply.outer(
                norm(centroids, axis=1), norm(word_vectors, axis=1)
            )

        # NaN sorts last in ascending order and would therefore come first
        # after flipping; rank undefined similarities below every real score.
        sort_key = np.where(np.isnan(similarity), -np.inf, similarity)
        order = np.flip(np.argsort(sort_key, axis=1), axis=1)[:, :n]
        scores = np.take_along_axis(similarity, order, axis=1)
        return order, scores