# Source code for stream.metrics.diversity_metrics

import gensim
import nltk
import numpy as np
from nltk.corpus import stopwords
from octis.evaluation_metrics.metrics import AbstractMetric
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity

from ._helper_funcs import (cos_sim_pw, embed_corpus, embed_stopwords,
                            embed_topic, update_corpus_dic_list)
from .constants import NLTK_STOPWORD_LANGUAGE, SENTENCE_TRANSFORMER_MODEL

# Stopword vocabularies taken from gensim and NLTK; scikit-learn's English
# stopword set is folded in when the merged list is built below.
gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
nltk_stopwords = stopwords.words(NLTK_STOPWORD_LANGUAGE)
# NOTE: from here on the module-level name `stopwords` no longer refers to the
# NLTK corpus reader imported above; it is rebound to the de-duplicated union
# of the three stopword sources (as a plain list).
stopwords = list(
    set([*nltk_stopwords, *gensim_stopwords, *ENGLISH_STOP_WORDS])
)


class Embedding_Topic_Diversity(AbstractMetric):
    """
    A metric class to calculate the diversity of topics based on word embeddings.

    Every topic is represented by the weighted mean of the embeddings of its top
    words; the metric is then derived from the cosine similarities between these
    topic mean vectors. Lower similarity means more diverse topics.

    Attributes:
        n_words (int): The number of top words to consider for each topic.
        corpus_dict (dict): The word-embedding lookup returned by ``embed_corpus``
            (optionally extended by ``update_corpus_dic_list``).
        embeddings (numpy.ndarray or None): The top-word embedding tensor of the
            most recent ``score_per_topic`` call; ``None`` before the first call.
    """

    def __init__(
        self,
        dataset,
        n_words=10,
        embedder=None,
        emb_filename=None,
        emb_path="Embeddings/",
        expansion_path="Embeddings/",
        expansion_filename=None,
        expansion_word_list=None,
    ):
        """
        Initializes the Embedding_Topic_Diversity object with a dataset, number of words,
        embedding model, and paths for storing embeddings.

        Parameters:
            dataset: The dataset to be used for embedding topic diversity calculation.
            n_words (int, optional): The number of top words to consider for each topic.
                Defaults to 10.
            embedder (SentenceTransformer, optional): The embedding model to use.
                Defaults to None, in which case
                ``SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)`` is created.
            emb_filename (str, optional): Filename to store embeddings. Defaults to None.
            emb_path (str, optional): Path to store embeddings. Defaults to "Embeddings/".
            expansion_path (str, optional): Path for expansion embeddings. Defaults to "Embeddings/".
            expansion_filename (str, optional): Filename for expansion embeddings. Defaults to None.
            expansion_word_list (list, optional): List of words for expansion. Defaults to None.
        """
        # Build the default model lazily: a default-argument expression is
        # evaluated once when the class body is executed, which would load the
        # model at import time even if the caller supplies its own embedder.
        if embedder is None:
            embedder = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)

        tw_emb = embed_corpus(
            dataset,
            embedder,
            emb_filename=emb_filename,
            emb_path=emb_path,
        )

        if expansion_word_list is not None:
            tw_emb = update_corpus_dic_list(
                expansion_word_list,
                tw_emb,
                embedder,
                emb_filename=expansion_filename,
                emb_path=expansion_path,
            )

        self.n_words = n_words
        self.corpus_dict = tw_emb
        self.embeddings = None

    def _weighted_topic_means(self, model_output):
        """
        Computes the weighted mean embedding of the top words of every topic.

        Parameters:
            model_output (dict): The output of a topic model; the keys "topics"
                and "topic-word-matrix" are read.

        Returns:
            tuple: ``(emb_tw, topic_means)`` where ``emb_tw`` is the stacked
            top-word embedding tensor and ``topic_means`` holds one weighted
            mean vector per topic.
        """
        topics_tw = model_output["topics"]
        # NOTE(review): this takes the first `n_words` columns of the matrix;
        # they are the top-word weights only if the columns are ordered
        # accordingly - confirm against the producer of `model_output`.
        topic_weights = model_output["topic-word-matrix"][:, : self.n_words]

        # Normalize every row to sum to one; NaN entries are ignored in the
        # row sum (a row containing a NaN weight still yields a NaN mean).
        topic_weights = topic_weights / np.nansum(
            topic_weights, axis=1, keepdims=True
        )

        emb_tw = embed_topic(topics_tw, self.corpus_dict, self.n_words)
        # Stack the per-topic arrays along a new last axis and move that axis
        # to the front; presumably each entry is (n_topwords, emb_dim), giving
        # (n_topics, n_topwords, emb_dim) - TODO confirm with `embed_topic`.
        emb_tw = np.dstack(emb_tw).transpose(2, 0, 1)[:, : self.n_words, :]

        # Weight every word vector and sum over the word axis, which yields
        # the weighted average because the weights sum to one.
        topic_means = np.sum(topic_weights[:, :, None] * emb_tw, axis=1)
        return emb_tw, topic_means

    def score(self, model_output):
        """
        Calculates the overall diversity score for the given model output.

        The weighted mean vectors of the topics are passed to ``cos_sim_pw`` and
        its result is returned as a float (presumably the mean pairwise cosine
        similarity; see that helper). A lower score indicates higher diversity.

        Parameters:
            model_output (dict): The output of a topic model, containing a list of topics
                                 and a topic-word matrix.

        Returns:
            float: The overall diversity score for all topics.
        """
        _, topic_means = self._weighted_topic_means(model_output)
        return float(cos_sim_pw(topic_means))

    def score_per_topic(self, model_output):
        """
        Calculates diversity scores for each topic individually based on embedding similarities.

        For every topic, the cosine similarity of its weighted mean vector to the
        mean vector of every other topic is averaged. The top-word embedding
        tensor used is stored in ``self.embeddings``.

        Parameters:
            model_output (dict): The output of a topic model, containing a list of topics
                                 and a topic-word matrix.

        Returns:
            dict: Maps a label (the first half of a topic's words, joined by ", ")
            to that topic's average similarity, rounded to 5 decimals. With a
            single topic there are no other topics to compare to and the value
            is NaN.
        """
        topics_tw = model_output["topics"]
        emb_tw, topic_means = self._weighted_topic_means(model_output)
        self.embeddings = emb_tw

        # Pairwise cosine similarity of the topic means.
        sim = cosine_similarity(topic_means)
        # Average over the *other* topics: drop each topic's self-similarity
        # by subtracting the actual diagonal instead of assuming it equals 1
        # (it is 0 for an all-zero mean vector).
        sim_mean = (np.sum(sim, axis=1) - np.diag(sim)) / (len(sim) - 1)

        results = {}
        for topic_words, topic_sim in zip(topics_tw, sim_mean):
            # Only the first half of the words is used as the label.
            label = ", ".join(topic_words[: len(topic_words) // 2])
            results[label] = np.around(topic_sim, 5)

        return results


class Expressivity(AbstractMetric):
    """
    A metric class to calculate the expressivity of topics by comparing the weighted
    mean vector of the top words of a topic with the mean vector of the stopword
    embeddings.

    The comparison is a cosine similarity, so a lower score suggests higher
    expressivity: the topic's top words are further away from common stopwords.

    Attributes:
        stopword_list (list): A list of stopwords to use for comparison.
        n_words (int): The number of top words to consider for each topic.
        corpus_dict (dict): The word-embedding lookup returned by ``embed_corpus``
            (optionally extended by ``update_corpus_dic_list``).
        embeddings (numpy.ndarray or None): Cached embeddings for the top words of
            the topics; ``None`` until ``score_per_topic`` has computed them.
        stopword_emb: The embeddings for the stopwords as returned by ``embed_stopwords``.
        stopword_mean (numpy.ndarray): The mean vector of the embeddings of the stopwords.
    """

    def __init__(
        self,
        dataset,
        stopword_list=stopwords,
        n_words=10,
        embedder=None,
        emb_filename=None,
        emb_path="Embeddings/",
        expansion_path="Embeddings/",
        expansion_filename=None,
        expansion_word_list=None,
    ):
        """
        Initializes the Expressivity object with a dataset, a list of stopwords, number of
        words, embedding model, and paths for storing embeddings.

        Parameters:
            dataset: The dataset to be used for expressivity calculation.
            stopword_list (list, optional): A list of stopwords for comparison.
                Defaults to the module-level ``stopwords`` list.
            n_words (int, optional): The number of top words to consider for each topic. Defaults to 10.
            embedder (SentenceTransformer, optional): The embedding model to use.
                Defaults to None, in which case
                ``SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)`` is created.
            emb_filename (str, optional): Filename to store embeddings. Defaults to None.
            emb_path (str, optional): Path to store embeddings. Defaults to "Embeddings/".
            expansion_path (str, optional): Path for expansion embeddings. Defaults to "Embeddings/".
            expansion_filename (str, optional): Filename for expansion embeddings. Defaults to None.
            expansion_word_list (list, optional): List of words for expansion. Defaults to None.
        """
        # Build the default model lazily: a default-argument expression is
        # evaluated once when the class body is executed, which would load the
        # model at import time even if the caller supplies its own embedder.
        if embedder is None:
            embedder = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)

        tw_emb = embed_corpus(
            dataset,
            embedder,
            emb_filename=emb_filename,
            emb_path=emb_path,
        )

        if expansion_word_list is not None:
            tw_emb = update_corpus_dic_list(
                expansion_word_list,
                tw_emb,
                embedder,
                emb_filename=expansion_filename,
                emb_path=expansion_path,
            )

        self.stopword_list = stopword_list
        self.n_words = n_words
        self.corpus_dict = tw_emb
        self.embeddings = None

        # Embed all stopwords and average over the stopword axis
        # (presumably (n_stopwords, emb_dim) -> (emb_dim,); see `embed_stopwords`).
        self.stopword_emb = embed_stopwords(stopword_list, embedder)
        self.stopword_mean = np.mean(np.array(self.stopword_emb), axis=0)

    def score(self, model_output, new_Embeddings=True):
        """
        Calculates the overall expressivity score for the given model output.

        This is the mean of the per-topic values returned by ``score_per_topic``.
        A lower score indicates higher expressivity.

        Parameters:
            model_output (dict): The output of a topic model, containing a list of topics
                                 and a topic-word matrix.
            new_Embeddings (bool, optional): Whether to recalculate embeddings. Defaults to True.

        Returns:
            float: The overall expressivity score for all topics.
        """
        # `score_per_topic` resets the embedding cache itself when
        # `new_Embeddings` is true, so no separate reset is needed here.
        per_topic = self.score_per_topic(model_output, new_Embeddings)
        return float(np.mean(list(per_topic.values())))

    def score_per_topic(self, model_output, new_Embeddings=True):
        """
        Calculates expressivity scores for each topic individually.

        For every topic, the cosine similarity between its weighted mean vector and
        the mean vector of the stopwords is computed. Topics whose mean vector
        contains NaN are skipped.

        Parameters:
            model_output (dict): The output of a topic model, containing a list of topics
                                 and a topic-word matrix.
            new_Embeddings (bool, optional): Whether to recalculate embeddings instead of
                reusing the ones cached in ``self.embeddings``. Defaults to True.

        Returns:
            dict: Maps a label (the first half of a topic's words, joined by ", ")
            to that topic's similarity to the stopword mean, rounded to 5 decimals.
        """
        if new_Embeddings:
            self.embeddings = None

        topics_tw = model_output["topics"]
        # NOTE(review): this takes the first `n_words` columns of the matrix;
        # they are the top-word weights only if the columns are ordered
        # accordingly - confirm against the producer of `model_output`.
        topic_weights = model_output["topic-word-matrix"][:, : self.n_words]

        # Normalize every row to sum to one; NaN entries are ignored in the
        # row sum (a row containing a NaN weight still yields a NaN mean).
        topic_weights = topic_weights / np.nansum(
            topic_weights, axis=1, keepdims=True
        )

        if self.embeddings is None:
            emb_tw = embed_topic(topics_tw, self.corpus_dict, self.n_words)
            # Stack the per-topic arrays along a new last axis and move that
            # axis to the front; presumably each entry is (n_topwords, emb_dim),
            # giving (n_topics, n_topwords, emb_dim) - TODO confirm.
            emb_tw = np.dstack(emb_tw).transpose(2, 0, 1)[:, : self.n_words, :]
            self.embeddings = emb_tw
        else:
            emb_tw = self.embeddings

        # Weight every word vector and sum over the word axis, which yields
        # the weighted average because the weights sum to one.
        topic_means = np.sum(topic_weights[:, :, None] * emb_tw, axis=1)

        # One flag per topic: does its mean vector contain a NaN?
        nan_mask = np.isnan(topic_means).any(axis=1)
        if nan_mask.any():
            print("There are some nans in the topic means")

        stopword_mean = self.stopword_mean.reshape(1, -1)

        results = {}
        # Walk topics and their means together so that, after a NaN topic is
        # skipped, every remaining score is still labelled with its own words.
        for topic_words, mean, has_nan in zip(topics_tw, topic_means, nan_mask):
            if has_nan:
                continue
            similarity = cosine_similarity(mean.reshape(1, -1), stopword_mean)[0, 0]
            # Only the first half of the words is used as the label.
            label = ", ".join(topic_words[: len(topic_words) // 2])
            results[label] = np.around(similarity, 5)

        return results