# Source code for stream.preprocessor.topic_extraction
import re
from itertools import compress
import numpy as np
import pandas as pd
from gensim.models.keyedvectors import Word2VecKeyedVectors
from nltk import pos_tag
from nltk.corpus import brown as nltk_words
from nltk.corpus import words as eng_dict
from numpy.linalg import norm
from octis.dataset.dataset import Dataset as OCDataset
from ._embedder import BaseEmbedder
class TopicExtractor:
    """Extract the most representative words of each topic.

    A topic is represented by a centroid in embedding space, built from the
    document embeddings weighted by ``topic_assignments``. Candidate words are
    embedded with the same embedder and ranked by cosine similarity to each
    centroid.

    Args:
        dataset: Object exposing ``get_vocabulary()``; its vocabulary is added
            to the candidate words when ``corpus="octis"``.
        topic_assignments: Indexable by topic id ``t``; ``topic_assignments[t]``
            must hold one weight per document.
        n_topics (int): Number of topics to extract words for.
        embedding_model: Passed unchanged to ``BaseEmbedder``.
    """

    # Everything that is not an ASCII letter or digit (plus whitespace that
    # directly follows it) is removed from candidate words.
    _NON_ALNUM = re.compile(r"[^a-zA-Z0-9]+\s*")

    def __init__(
        self,
        dataset,
        topic_assignments,
        n_topics,
        embedding_model,
    ):
        self.dataset = dataset
        self.topic_assignments = topic_assignments
        self.embedder = BaseEmbedder(embedding_model)
        self.n_topics = n_topics

    def _noun_extractor_haystack(self, embeddings, n, corpus="octis", only_nouns=True):
        """Return the ``n`` words closest to every topic centroid.

        The candidate vocabulary is loaded from ``corpus``, normalised,
        optionally restricted to nouns, embedded, and compared to each topic
        centroid via cosine similarity. The similarity is computed directly
        with numpy instead of ``sklearn.metrics.pairwise.cosine_similarity``
        because this is faster:

            np.inner(centroids, nouns) / np.multiply.outer(
                norm(centroids, axis=1), norm(nouns, axis=1)
            )

        Args:
            embeddings: Document embeddings, one row per document, used to
                compute the topic centroids.
            n (int): Number of top words to return per topic.
            corpus (str): Source of candidate words: ``"brown"`` (NLTK Brown
                corpus), ``"words"`` (NLTK English word list) or ``"octis"``
                (vocabularies of several OCTIS datasets plus
                ``self.dataset``).
            only_nouns (bool): If True, keep only words whose POS tag starts
                with ``"NN"``.

        Returns:
            tuple: ``(topics, centroids)``. ``topics`` maps each topic index
            to a list of ``(word, similarity)`` tuples sorted by decreasing
            similarity; ``centroids`` is a list with one centroid vector per
            topic.

        Raises:
            ValueError: If ``corpus`` is not one of the supported names.
        """
        word_list = self._clean_words(self._load_vocabulary(corpus))

        if only_nouns:
            # Tagging happens before de-duplication, exactly as before, so
            # the tagger sees the same token sequence.
            word_list = [word for word, pos in pos_tag(word_list) if pos[:2] == "NN"]
        # When all words are kept the tags would be thrown away, so the
        # (expensive) tagger is not run at all.

        # Order-preserving de-duplication: unlike ``list(set(...))`` the
        # candidate order, and therefore tie-breaking, is reproducible.
        word_list = list(dict.fromkeys(word_list))

        word_list, word_vectors = self._embed_words(word_list)
        mean_embeddings = self._topic_centroids(embeddings)
        topics = self._rank_words(mean_embeddings, word_vectors, word_list, n)

        # return topics and centroid of topics
        return topics, mean_embeddings

    def _load_vocabulary(self, corpus):
        """Return the raw candidate words for the requested ``corpus``."""
        if corpus == "brown":
            return list(nltk_words.words())
        if corpus == "words":
            return list(eng_dict.words())
        if corpus == "octis":
            data = OCDataset()
            words = []
            for dataset_name in ("20NewsGroup", "M10", "BBC_News"):
                data.fetch_dataset(dataset_name)
                # extend a fresh list instead of growing the list object
                # returned by get_vocabulary()
                words.extend(data.get_vocabulary())
            # NOTE: further datasets (e.g. Reuters, custom folders) could be
            # added here.
            words.extend(self.dataset.get_vocabulary())
            return words
        raise ValueError(
            "There are no words to be extracted for the Topics: Please specify a corpus"
        )

    @classmethod
    def _clean_words(cls, words):
        """Lower-case, strip and remove non-alphanumerics; drop empty results."""
        cleaned = (cls._NON_ALNUM.sub("", word.lower().strip()) for word in words)
        # Tokens consisting only of punctuation collapse to "" and carry no
        # meaning as topic words.
        return [word for word in cleaned if word]

    def _embed_words(self, word_list):
        """Embed ``word_list``; return the kept words and a 2-D vector array."""
        word_vectors = self.embedder.create_word_embeddings(word_list)
        if isinstance(self.embedder.embedder, Word2VecKeyedVectors):
            # Drop words for which the embedder returned a null entry
            # (presumably out-of-vocabulary words -- confirm in BaseEmbedder).
            known = ~pd.isnull(word_vectors)
            word_list = list(compress(word_list, known))
            word_vectors = word_vectors[known]
        if word_vectors.ndim < 2:
            # A 1-D array of per-word vectors is stacked into a matrix.
            word_vectors = np.stack(list(word_vectors))
        return word_list, word_vectors

    def _topic_centroids(self, embeddings):
        """Return one weighted mean document embedding per topic."""
        embeddings = np.asarray(embeddings)
        n_documents = len(embeddings)
        centroids = []
        for topic in range(self.n_topics):
            weights = np.asarray(self.topic_assignments[topic])
            weighted_sum = (weights[:, np.newaxis] * embeddings).sum(axis=0)
            # Normalised by the number of documents (not by the sum of the
            # weights); the cosine ranking is unaffected by this scale.
            centroids.append(weighted_sum / n_documents)
        return centroids

    @staticmethod
    def _rank_words(centroids, word_vectors, word_list, n):
        """Map each topic index to its ``n`` most similar ``(word, score)`` pairs."""
        # Zero-norm vectors produce 0/0; silence the warning and handle the
        # resulting NaN explicitly below.
        with np.errstate(divide="ignore", invalid="ignore"):
            similarity = np.inner(centroids, word_vectors) / np.multiply.outer(
                norm(centroids, axis=1), norm(word_vectors, axis=1)
            )
        # argsort places NaN last, so a plain flip would rank NaN *first*.
        # Treat undefined similarities as the worst possible score instead.
        ranking_key = np.where(np.isnan(similarity), -np.inf, similarity)
        order = np.flip(np.argsort(ranking_key, axis=1), axis=1)[:, :n]
        scores = np.take_along_axis(similarity, order, axis=1)
        return {
            topic: [(word_list[i], score) for i, score in zip(indices, row_scores)]
            for topic, (indices, row_scores) in enumerate(zip(order, scores))
        }