Source code for stream.preprocessor._embedder

import re
from collections.abc import Iterable
from typing import List

import numpy as np
import pandas as pd
from gensim.models.keyedvectors import Word2VecKeyedVectors
from tqdm import tqdm


class GensimBackend:
    """
    Gensim Embedding Model

    Creates document embeddings by mean-pooling the Gensim Word2Vec vectors
    of the words in each document.

    Args:
        embedding_model (Word2VecKeyedVectors): A Gensim Word2Vec model for
            word embeddings.

    Attributes:
        embedding_model (Word2VecKeyedVectors): The Gensim Word2Vec model
            used for embeddings.

    Methods:
        encode(documents: List[str], verbose: bool = False) -> np.ndarray:
            Embed a list of documents/words into a matrix of embeddings.
    """

    def __init__(self, embedding_model: Word2VecKeyedVectors):
        """
        Initialize the GensimBackend with a Word2VecKeyedVectors model.

        Args:
            embedding_model (Word2VecKeyedVectors): A Gensim Word2Vec model
                for word embeddings.

        Raises:
            ValueError: If the provided model is not a Word2VecKeyedVectors
                instance.
        """
        super().__init__()
        if not isinstance(embedding_model, Word2VecKeyedVectors):
            raise ValueError(
                "Please select a correct Gensim model: \n"
                "`import gensim.downloader as api` \n"
                "`ft = api.load('fasttext-wiki-news-subwords-300')`"
            )
        self.embedding_model = embedding_model

    def encode(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """
        Embed a list of documents/words by averaging their word vectors.

        Each document is split on single spaces. Tokens for which the model
        raises ``KeyError`` are skipped, and the remaining word vectors are
        averaged element-wise.

        Args:
            documents (List[str]): A list of documents or words to be embedded.
            verbose (bool, optional): If True, show a tqdm progress bar.

        Returns:
            np.ndarray: Object-dtype array with one entry per document. An
            entry is the mean word vector of the document, or NaN when none
            of its tokens could be looked up in the model.
        """
        embeddings = []
        for doc in tqdm(documents, disable=not verbose, position=0, leave=True):
            word_vectors = []
            for word in doc.split(" "):
                try:
                    word_vectors.append(self.embedding_model.get_vector(word))
                except KeyError:
                    # Token is unknown to the model: leave it out of the mean.
                    continue

            if word_vectors:
                # Pool word embeddings
                embeddings.append(np.mean(word_vectors, axis=0))
            else:
                # np.mean([]) would also produce NaN, but only after emitting a
                # RuntimeWarning; write the same placeholder explicitly.
                embeddings.append(np.float64(np.nan))

        # dtype=object because entries may mix vectors and NaN placeholders.
        return np.array(embeddings, dtype=object)
class BaseEmbedder:
    """
    Base Embedder Class

    This class provides a base for creating document and word embeddings
    using different models.

    Args:
        embedding_model: The embedding model used for generating embeddings.

    Attributes:
        embedder: The backend whose ``encode`` method produces embeddings.
        embedding_model: The embedding model passed to the constructor.
        corpus_embeddings: Result of the last ``create_doc_embeddings`` call
            (only set once that method has run).
        word_embedding: Result of the last successful
            ``create_word_embeddings`` call (only set once that method has
            succeeded).

    Methods:
        _check_documents_type(documents: List[str]):
            Check if the provided documents are of the correct type.
        _clean_docs(documents: List[str]):
            Clean and preprocess a list of documents.
        create_doc_embeddings(documents: List[str], progress: bool = False):
            Create document embeddings.
        create_word_embeddings(word: List[str]):
            Create word embeddings.
    """

    # Compiled once at class creation instead of once per document.
    _PUNCTUATION_RE = re.compile(r"[/(){}\[\]\|@,;]")
    _BACKSLASH_RE = re.compile(r"\\")
    _APOSTROPHE_RE = re.compile("'")
    # NOTE(review): replacing " " with " " is a no-op as written; it looks like
    # it was meant to collapse repeated spaces -- confirm the intended pattern.
    _SPACE_RE = re.compile(" ")

    def __init__(self, embedding_model):
        """
        Initialize the BaseEmbedder with an embedding model.

        A ``Word2VecKeyedVectors`` model is wrapped in a ``GensimBackend``;
        any other object is used as the embedder directly, so it must provide
        an ``encode`` method.

        Args:
            embedding_model: The embedding model used for generating embeddings.
        """
        if isinstance(embedding_model, Word2VecKeyedVectors):
            self.embedder = GensimBackend(embedding_model)
        else:
            self.embedder = embedding_model
        self.embedding_model = embedding_model

    def _check_documents_type(self, documents):
        """
        Check if the provided documents are of the correct type.

        Args:
            documents: The documents to check. Must be a non-string iterable
                whose items are all strings (an empty iterable passes).

        Raises:
            TypeError: If ``documents`` is a plain string, is not iterable, or
                contains at least one non-string item.
        """
        if isinstance(documents, str) or not isinstance(documents, Iterable):
            raise TypeError(
                "Make sure that the documents variable is an iterable containing strings only."
            )
        # Every item has to be a string; a single string among other types is
        # not enough.
        if not all(isinstance(doc, str) for doc in documents):
            raise TypeError(
                "Make sure that the iterable only contains strings.")

    def _clean_docs(self, documents: List[str]):
        """
        Clean and preprocess a list of documents.

        Removes the characters ``/(){}[]|@,;``, backslashes and apostrophes
        from every document, then lowercases it.

        Args:
            documents (List[str]): List of documents to clean.

        Returns:
            pd.DataFrame: A DataFrame with the cleaned, lowercased documents in
            ``docs``, a running index in ``ID`` and an empty ``Topic`` column.
        """
        frame = pd.DataFrame(
            {"docs": documents, "ID": range(len(documents)), "Topic": None}
        )

        def _strip(text):
            text = self._PUNCTUATION_RE.sub("", text)
            text = self._BACKSLASH_RE.sub("", text)
            text = self._APOSTROPHE_RE.sub("", text)
            return self._SPACE_RE.sub(" ", text)

        # Assign the whole column at once: element-wise writes through
        # frame["docs"].loc[i] are chained assignments, which pandas does not
        # guarantee to propagate back into the frame.
        frame["docs"] = frame["docs"].map(_strip).str.lower()
        return frame

    def _clean_docs_(self, text):
        """
        Remove backslashes and apostrophes from a single document.

        Unlike ``_clean_docs`` this keeps punctuation and does not lowercase.

        Args:
            text (str): The document to clean.

        Returns:
            str: The cleaned document.
        """
        text = self._BACKSLASH_RE.sub("", text)
        text = self._APOSTROPHE_RE.sub("", text)
        text = self._SPACE_RE.sub(" ", text)
        return text

    def create_doc_embeddings(self, documents: List[str], progress=False):
        """
        Create document embeddings for a list of documents.

        The documents are validated, cleaned with ``_clean_docs_``, lowercased
        and passed to the backend's ``encode`` method. The embeddings are also
        stored on ``self.corpus_embeddings``.

        Args:
            documents (List[str]): List of documents to create embeddings for.
            progress (bool, optional): Intended to control the verbosity of
                the process. NOTE(review): currently not forwarded to the
                backend, so it has no effect.

        Returns:
            tuple: ``(embeddings, documents)`` where ``embeddings`` is whatever
            the backend's ``encode`` returns and ``documents`` is a
            pd.DataFrame with the cleaned and lowercased documents.

        Raises:
            TypeError: If ``documents`` is not an iterable of strings.
        """
        self._check_documents_type(documents)

        frame = pd.DataFrame(
            {"docs": documents, "ID": range(len(documents)), "Topic": None}
        )
        frame["docs"] = frame["docs"].apply(self._clean_docs_).str.lower()

        self.corpus_embeddings = self.embedder.encode(frame["docs"])
        return self.corpus_embeddings, frame

    def create_word_embeddings(self, word: List[str]):
        """
        Create word embeddings for a list of words.

        On success the embeddings are also stored on ``self.word_embedding``.

        Args:
            word (List[str]): List of words to create embeddings for.

        Returns:
            The backend's embeddings for the given words, or None if the
            backend raised ``KeyError`` while encoding them.
        """
        try:
            embedding = self.embedder.encode(word)
        except KeyError:
            # Best effort: signal the failed lookup with None instead of raising.
            return None
        self.word_embedding = embedding
        return embedding