Source code for stream.preprocessor._embedder
import re
from collections.abc import Iterable
from typing import List
import numpy as np
import pandas as pd
from gensim.models.keyedvectors import Word2VecKeyedVectors
from tqdm import tqdm
class GensimBackend:
    """
    Gensim Embedding Model

    This class provides functionality to create document embeddings using
    Gensim Word2Vec embeddings. A document embedding is the mean of the
    vectors of its in-vocabulary words.

    Args:
        embedding_model (Word2VecKeyedVectors): A Gensim Word2Vec model for word embeddings.

    Attributes:
        embedding_model (Word2VecKeyedVectors): The Gensim Word2Vec model used for embeddings.

    Methods:
        encode(documents: List[str], verbose: bool = False) -> np.ndarray:
            Embed a list of documents/words into a matrix of embeddings.
    """

    def __init__(self, embedding_model: "Word2VecKeyedVectors"):
        """
        Initialize the GensimBackend with a Word2VecKeyedVectors model.

        Args:
            embedding_model (Word2VecKeyedVectors): A Gensim Word2Vec model for word embeddings.

        Raises:
            ValueError: If the provided model is not a Word2VecKeyedVectors instance.
        """
        super().__init__()
        if not isinstance(embedding_model, Word2VecKeyedVectors):
            raise ValueError(
                "Please select a correct Gensim model: \n"
                "`import gensim.downloader as api` \n"
                "`ft = api.load('fasttext-wiki-news-subwords-300')`"
            )
        self.embedding_model = embedding_model

    def encode(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """
        Embed a list of documents/words into an n-dimensional matrix of embeddings.

        Each document is split on single spaces; words the model does not
        know are skipped, and the remaining word vectors are averaged.
        A document without any known word is embedded as a zero vector so
        that every row has the same length.

        Args:
            documents (List[str]): A list of documents or words to be embedded.
            verbose (bool, optional): If True, show a tqdm progress bar.

        Returns:
            np.ndarray: Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`.
        """
        # Fallback row for documents with no in-vocabulary word. Without it,
        # np.mean([]) yields a scalar NaN and the result becomes ragged.
        empty_vector = np.zeros(self.embedding_model.vector_size)

        # Only build the progress bar when it is actually requested.
        doc_iter = tqdm(documents, position=0, leave=True) if verbose else documents

        embeddings = []
        for doc in doc_iter:
            # Extract word embeddings, ignoring out-of-vocabulary tokens
            # (including the empty tokens produced by repeated spaces).
            word_vectors = []
            for word in doc.split(" "):
                try:
                    word_vectors.append(self.embedding_model.get_vector(word))
                except KeyError:
                    continue

            # Pool word embeddings
            if word_vectors:
                embeddings.append(np.mean(word_vectors, axis=0))
            else:
                embeddings.append(empty_vector)

        # dtype=object is kept so the returned array type matches what
        # existing callers already receive.
        return np.array(embeddings, dtype=object)
class BaseEmbedder:
    """
    Base Embedder Class

    This class provides a base for creating document and word embeddings using different models.

    Args:
        embedding_model: The embedding model used for generating embeddings.

    Attributes:
        embedder: The specific backend embedder used for generating embeddings.
            A Word2VecKeyedVectors model is wrapped in a GensimBackend; any
            other object is used as-is and must provide an `encode` method.
        embedding_model: The embedding model used for generating embeddings.

    Methods:
        _check_documents_type(documents: List[str]): Check if the provided documents are of the correct type.
        _clean_docs(documents: List[str]): Clean and preprocess a list of documents.
        create_doc_embeddings(documents: List[str], progress: bool = False): Create document embeddings.
        create_word_embeddings(word: List[str]): Create word embeddings.
    """

    # Characters removed from every document: / ( ) { } [ ] | @ , ; \ '
    _STRIP_CHARS_RE = re.compile(r"[/(){}\[\]\|@,;\\']")
    # Runs of two or more spaces, collapsed to a single space. (The previous
    # single-space-to-single-space substitution had no effect.)
    _MULTI_SPACE_RE = re.compile(r" {2,}")

    def __init__(self, embedding_model):
        """
        Initialize the BaseEmbedder with an embedding model.

        Args:
            embedding_model: The embedding model used for generating embeddings.
        """
        if isinstance(embedding_model, Word2VecKeyedVectors):
            self.embedder = GensimBackend(embedding_model)
        else:
            self.embedder = embedding_model
        self.embedding_model = embedding_model

    def _check_documents_type(self, documents):
        """
        Check if the provided documents are of the correct type.

        Args:
            documents: The documents to check. Must be a non-string iterable
                whose elements are all strings.

        Raises:
            TypeError: If `documents` is a single string, is not iterable,
                or contains any non-string element.
        """
        if not isinstance(documents, Iterable) or isinstance(documents, str):
            raise TypeError(
                "Make sure that the documents variable is an iterable containing strings only."
            )
        # Every element must be a string, not merely one of them.
        if not all(isinstance(doc, str) for doc in documents):
            raise TypeError(
                "Make sure that the iterable only contains strings.")

    def _clean_docs(self, documents: List[str]):
        """
        Clean and preprocess a list of documents.

        Args:
            documents (List[str]): List of documents to clean.

        Returns:
            pd.DataFrame: A DataFrame with columns `docs` (cleaned and
            lowercased documents), `ID` (position of the document) and
            `Topic` (initialised to None).
        """
        frame = pd.DataFrame(
            {"docs": documents, "ID": range(len(documents)), "Topic": None}
        )
        # Assign the whole column at once instead of writing element-wise
        # through chained indexing, which pandas does not guarantee to
        # propagate back to the frame.
        frame["docs"] = frame["docs"].apply(self._clean_docs_).str.lower()
        return frame

    def _clean_docs_(self, text):
        """
        Strip unwanted characters from a single document.

        Args:
            text (str): The document to clean.

        Returns:
            str: The document without the characters ``/(){}[]|@,;\\'`` and
            with runs of spaces collapsed to one space. Case is unchanged.
        """
        text = self._STRIP_CHARS_RE.sub("", text)
        return self._MULTI_SPACE_RE.sub(" ", text)

    def create_doc_embeddings(self, documents: List[str], progress=False):
        """
        Create document embeddings for a list of documents.

        The embeddings are also stored on `self.corpus_embeddings`.

        Args:
            documents (List[str]): List of documents to create embeddings for.
            progress (bool, optional): Currently unused by this method; kept
                for interface compatibility.

        Returns:
            np.ndarray: Document embeddings as returned by the embedder's `encode`.
            pd.DataFrame: A DataFrame with cleaned and lowercased documents.

        Raises:
            TypeError: If `documents` is not an iterable of strings.
        """
        self._check_documents_type(documents)
        cleaned = self._clean_docs(documents)
        self.corpus_embeddings = self.embedder.encode(cleaned["docs"])
        return self.corpus_embeddings, cleaned

    def create_word_embeddings(self, word: List[str]):
        """
        Create word embeddings for a list of words.

        On success the embeddings are also stored on `self.word_embedding`.

        Args:
            word (List[str]): List of words to create embeddings for.

        Returns:
            np.ndarray: Word embeddings, or None if the embedder raised a
            KeyError while encoding.
        """
        try:
            self.word_embedding = self.embedder.encode(
                word,
            )
        except KeyError:
            # Best-effort lookup: a KeyError from the embedder yields None
            # rather than propagating to the caller.
            return None
        return self.word_embedding