import warnings

import gensim
import nltk
import numpy as np
from nltk.corpus import stopwords
from octis.evaluation_metrics.metrics import AbstractMetric
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity

from ._helper_funcs import (cos_sim_pw, embed_corpus, embed_stopwords,
                            embed_topic, update_corpus_dic_list)
from .constants import NLTK_STOPWORD_LANGUAGE, SENTENCE_TRANSFORMER_MODEL
# Combine the stopword vocabularies shipped with gensim, NLTK and scikit-learn
# into a single list without duplicates.
gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
nltk_stopwords = stopwords.words(NLTK_STOPWORD_LANGUAGE)

# NOTE: the assignment below rebinds ``stopwords``; from here on the name
# refers to the merged list, no longer to the ``nltk.corpus`` reader.
_all_stopword_candidates = [
    *nltk_stopwords,
    *gensim_stopwords,
    *ENGLISH_STOP_WORDS,
]
stopwords = list(set(_all_stopword_candidates))
class Embedding_Topic_Diversity(AbstractMetric):
    """
    A metric class to calculate the diversity of topics based on word embeddings. It computes
    the mean cosine similarity of the mean vectors of the top words of all topics, providing
    a measure of how diverse the topics are in the embedding space.

    Attributes:
        n_words (int): The number of top words to consider for each topic.
        corpus_dict (dict): A dictionary mapping each word in the corpus to its embedding.
        embeddings (numpy.ndarray or None): The top-word embedding tensor of the most
            recently scored model output; None until a score has been computed.
    """

    def __init__(
        self,
        dataset,
        n_words=10,
        embedder=None,
        emb_filename=None,
        emb_path="Embeddings/",
        expansion_path="Embeddings/",
        expansion_filename=None,
        expansion_word_list=None,
    ):
        """
        Initializes the Embedding_Topic_Diversity object with a dataset, number of words,
        embedding model, and paths for storing embeddings.

        Parameters:
            dataset: The dataset to be used for embedding topic diversity calculation.
            n_words (int, optional): The number of top words to consider for each topic.
                Defaults to 10.
            embedder (SentenceTransformer, optional): The embedding model to use.
                Defaults to None, in which case
                SentenceTransformer(SENTENCE_TRANSFORMER_MODEL) is created.
            emb_filename (str, optional): Filename to store embeddings. Defaults to None.
            emb_path (str, optional): Path to store embeddings. Defaults to "Embeddings/".
            expansion_path (str, optional): Path for expansion embeddings. Defaults to "Embeddings/".
            expansion_filename (str, optional): Filename for expansion embeddings. Defaults to None.
            expansion_word_list (list, optional): List of words for expansion. Defaults to None.
        """
        if embedder is None:
            # Build the default model lazily. A model instance placed in the
            # signature is created once, when the class is defined, i.e. on
            # import, even if the caller supplies an embedder of their own.
            embedder = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)

        tw_emb = embed_corpus(
            dataset,
            embedder,
            emb_filename=emb_filename,
            emb_path=emb_path,
        )
        if expansion_word_list is not None:
            # Extend the corpus dictionary with the additional words.
            tw_emb = update_corpus_dic_list(
                expansion_word_list,
                tw_emb,
                embedder,
                emb_filename=expansion_filename,
                emb_path=expansion_path,
            )

        self.n_words = n_words
        self.corpus_dict = tw_emb
        self.embeddings = None

    def _topic_means(self, model_output):
        """
        Computes the weighted mean embedding of the top words of every topic.

        The embedding tensor that was used is stored in ``self.embeddings``.

        Parameters:
            model_output (dict): The output of a topic model, containing a list of topics
                ("topics") and a topic-word matrix ("topic-word-matrix").

        Returns:
            numpy.ndarray: One weighted mean vector per topic.
        """
        topics_tw = model_output["topics"]

        # NOTE(review): this takes the first ``n_words`` columns of the
        # topic-word matrix. They are the weights of the top words only if the
        # columns are ordered like the entries of model_output["topics"];
        # confirm against the model that produces the output.
        topic_weights = model_output["topic-word-matrix"][:, : self.n_words]

        # Normalize every row so that its (non-NaN) weights sum to one. NaN
        # weights stay NaN and propagate into the mean of that topic.
        topic_weights = topic_weights / np.nansum(
            topic_weights, axis=1, keepdims=True
        )

        # Embed the top words and arrange them as a tensor that is presumably of
        # size (n_topics, n_topwords, n_embedding_dims); depends on embed_topic.
        emb_tw = embed_topic(topics_tw, self.corpus_dict, self.n_words)
        emb_tw = np.dstack(emb_tw).transpose(2, 0, 1)[:, : self.n_words, :]
        self.embeddings = emb_tw

        # Scale each word vector by its weight; as the weights sum to one, the
        # sum over the word axis is the weighted average.
        weighted_vecs = topic_weights[:, :, None] * emb_tw
        return np.sum(weighted_vecs, axis=1)

    def score(self, model_output):
        """
        Calculates the overall diversity score for the given model output.

        This method computes the diversity of the topics by averaging the cosine similarity
        of the mean vectors of the top words of each topic. A lower score indicates higher diversity.

        Parameters:
            model_output (dict): The output of a topic model, containing a list of topics
                and a topic-word matrix.

        Returns:
            float: The overall diversity score for all topics, i.e. the value that
                ``cos_sim_pw`` returns for the topic mean vectors.
        """
        return float(cos_sim_pw(self._topic_means(model_output)))

    def score_per_topic(self, model_output):
        """
        Calculates diversity scores for each topic individually based on embedding similarities.

        This method computes the diversity of each topic by calculating the cosine similarity
        of its mean vector with the mean vectors of other topics.

        Parameters:
            model_output (dict): The output of a topic model, containing a list of topics
                and a topic-word matrix.

        Returns:
            dict: Maps a label for each topic (the first half of its words, joined by
                ", ") to the mean similarity with all other topics, rounded to 5
                decimals. Topics that produce the same label overwrite each other.
        """
        topics_tw = model_output["topics"]
        topic_means = self._topic_means(model_output)

        # Pairwise cosine similarity of the topic means.
        sim = cosine_similarity(topic_means)

        # Average similarity of each topic to every *other* topic: remove the
        # self-similarity of 1 from the row sum and divide by the number of
        # remaining topics. With a single topic this is 0 / 0, i.e. NaN.
        sim_mean = (np.sum(sim, axis=1) - 1) / (len(sim) - 1)

        results = {}
        for topic_words, topic_sim in zip(topics_tw, sim_mean):
            # Only the first half of the words is used as the label.
            label = ", ".join(topic_words[: len(topic_words) // 2])
            results[label] = np.around(topic_sim, 5)
        return results
class Expressivity(AbstractMetric):
    """
    A metric class to calculate the expressivity of topics by measuring the cosine similarity
    between the mean vector of the top words in a topic and the mean vector of the embeddings
    of the stop words. Lower similarities suggest higher expressivity, indicating that the
    topic's top words are distinct from common stopwords.

    Attributes:
        stopword_list (list): A list of stopwords to use for comparison.
        n_words (int): The number of top words to consider for each topic.
        corpus_dict (dict): A dictionary mapping each word in the corpus to its embedding.
        embeddings (numpy.ndarray or None): The cached embeddings for the top words of
            the topics; None until they have been computed.
        stopword_emb (numpy.ndarray): The embeddings for the stopwords.
        stopword_mean (numpy.ndarray): The mean vector of the embeddings of the stopwords.
    """

    def __init__(
        self,
        dataset,
        stopword_list=stopwords,
        n_words=10,
        embedder=None,
        emb_filename=None,
        emb_path="Embeddings/",
        expansion_path="Embeddings/",
        expansion_filename=None,
        expansion_word_list=None,
    ):
        """
        Initializes the Expressivity object with a dataset, a list of stopwords, number of
        words, embedding model, and paths for storing embeddings.

        Parameters:
            dataset: The dataset to be used for expressivity calculation.
            stopword_list (list, optional): A list of stopwords for comparison. Defaults to
                the module-level merged stopword list.
            n_words (int, optional): The number of top words to consider for each topic. Defaults to 10.
            embedder (SentenceTransformer, optional): The embedding model to use.
                Defaults to None, in which case
                SentenceTransformer(SENTENCE_TRANSFORMER_MODEL) is created.
            emb_filename (str, optional): Filename to store embeddings. Defaults to None.
            emb_path (str, optional): Path to store embeddings. Defaults to "Embeddings/".
            expansion_path (str, optional): Path for expansion embeddings. Defaults to "Embeddings/".
            expansion_filename (str, optional): Filename for expansion embeddings. Defaults to None.
            expansion_word_list (list, optional): List of words for expansion. Defaults to None.
        """
        if embedder is None:
            # Build the default model lazily. A model instance placed in the
            # signature is created once, when the class is defined, i.e. on
            # import, even if the caller supplies an embedder of their own.
            embedder = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)

        tw_emb = embed_corpus(
            dataset,
            embedder,
            emb_filename=emb_filename,
            emb_path=emb_path,
        )
        if expansion_word_list is not None:
            # Extend the corpus dictionary with the additional words.
            tw_emb = update_corpus_dic_list(
                expansion_word_list,
                tw_emb,
                embedder,
                emb_filename=expansion_filename,
                emb_path=expansion_path,
            )

        self.stopword_list = stopword_list
        self.n_words = n_words
        self.corpus_dict = tw_emb
        self.embeddings = None

        # Embed all stopwords (presumably one row per stopword) and keep their
        # centroid as the reference vector for the metric.
        self.stopword_emb = embed_stopwords(stopword_list, embedder)
        self.stopword_mean = np.mean(np.array(self.stopword_emb), axis=0)

    def score(self, model_output, new_Embeddings=True):
        """
        Calculates the overall expressivity score for the given model output.

        This method computes the expressivity of the topics by averaging the cosine similarity
        between the mean vectors of the top words of each topic and the mean vector of
        the stopwords. A lower score indicates higher expressivity.

        Parameters:
            model_output (dict): The output of a topic model, containing a list of topics
                and a topic-word matrix.
            new_Embeddings (bool, optional): Whether to recalculate embeddings. Defaults to True.

        Returns:
            float: The mean of the per-topic expressivity scores.
        """
        # score_per_topic discards the cached embeddings itself when
        # new_Embeddings is set, so no separate reset is needed here.
        per_topic = self.score_per_topic(model_output, new_Embeddings)
        return float(np.mean(list(per_topic.values())))

    def score_per_topic(self, model_output, new_Embeddings=True):
        """
        Calculates expressivity scores for each topic individually based on embedding similarities.

        This method computes the expressivity of each topic by calculating the cosine similarity
        of its mean vector with the mean vector of the stopwords. Topics whose mean vector
        contains NaNs are skipped and a warning is issued.

        Parameters:
            model_output (dict): The output of a topic model, containing a list of topics
                and a topic-word matrix.
            new_Embeddings (bool, optional): Whether to recalculate embeddings. Defaults to True.

        Returns:
            dict: Maps a label for each scored topic (the first half of its words, joined
                by ", ") to its similarity with the stopword mean, rounded to 5 decimals.
                Topics that produce the same label overwrite each other.
        """
        if new_Embeddings:
            self.embeddings = None

        topics_tw = model_output["topics"]

        # NOTE(review): this takes the first ``n_words`` columns of the
        # topic-word matrix. They are the weights of the top words only if the
        # columns are ordered like the entries of model_output["topics"];
        # confirm against the model that produces the output.
        topic_weights = model_output["topic-word-matrix"][:, : self.n_words]

        # Normalize every row so that its (non-NaN) weights sum to one.
        topic_weights = topic_weights / np.nansum(
            topic_weights, axis=1, keepdims=True
        )

        if self.embeddings is None:
            # Embed the top words and arrange them as a tensor that is presumably
            # of size (n_topics, n_topwords, n_embedding_dims).
            emb_tw = embed_topic(topics_tw, self.corpus_dict, self.n_words)
            self.embeddings = np.dstack(emb_tw).transpose(2, 0, 1)[
                :, : self.n_words, :
            ]
        emb_tw = self.embeddings

        # Scale each word vector by its weight; as the weights sum to one, the
        # sum over the word axis is the weighted average.
        topic_means = np.sum(topic_weights[:, :, None] * emb_tw, axis=1)

        if np.isnan(topic_means).any():
            warnings.warn(
                "There are some nans in the topic means", stacklevel=2
            )

        stopword_ref = self.stopword_mean.reshape(1, -1)

        results = {}
        # Iterate topics and means together so that a skipped (NaN) topic never
        # shifts the scores of later topics onto the wrong words.
        for topic_words, mean in zip(topics_tw, topic_means):
            if np.isnan(mean).any():
                continue
            similarity = cosine_similarity(mean.reshape(1, -1), stopword_ref)[0, 0]
            # Only the first half of the words is used as the label.
            label = ", ".join(topic_words[: len(topic_words) // 2])
            results[label] = np.around(similarity, 5)
        return results