from datetime import datetime
import pandas as pd
import pyarrow as pa
from datasets import Dataset
from loguru import logger
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, Trainer, TrainingArguments
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from ..preprocessor._tf_idf import c_tf_idf, extract_tfidf_topics
from ..utils.dataset import TMDataset
from .abstract_helper_models.base import BaseModel, TrainingStatus
time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
MODEL_NAME = "DCTE"
EMBEDDING_MODEL_NAME = "paraphrase-MiniLM-L3-v2"
# logger.add(f"{MODEL_NAME}_{time}.log", backtrace=True, diagnose=True)
[docs]class DCTE(BaseModel):
"""
A document classification and topic extraction class that utilizes the SetFitModel for
document classification and TF-IDF for topic extraction.
This class inherits from the AbstractModel class and is designed for supervised
document classification and unsupervised topic modeling.
Attributes:
n_topics (int): The number of topics to identify in the dataset.
model (SetFitModel): The SetFitModel used for document classification.
batch_size (int): The batch size used in training.
num_iterations (int): The number of iterations for SetFit training.
num_epochs (int): The number of epochs for SetFit training.
"""
def __init__(
self,
model: str = EMBEDDING_MODEL_NAME,
):
"""
Initializes the DCTE model with specified number of topics, embedding model,
SetFit model, batch size, number of iterations, and number of epochs.
Parameters:
model (str, optional): The identifier of the SetFit model to be used.
Defaults to "all-MiniLM-L6-v2".
"""
self.n_topics = None
self.model = SetFitModel.from_pretrained(
f"sentence-transformers/{model}")
self._status = TrainingStatus.NOT_STARTED
def _prepare_data(self, val_split: float):
"""
Prepares the dataset for clustering.
"""
assert hasattr(self, "train_dataset") and isinstance(
self.train_dataset, TMDataset
), "The dataset must be an instance of TMDataset."
self.train_dataset.get_dataframe()
self.dataframe = self.train_dataset.dataframe
self.dataframe["label"] = preprocessing.LabelEncoder().fit_transform(
self.dataframe["label_text"]
)
print(
"--- a train-validation split of 0.8 to 0.2 is performed --- \n---change 'val_split' if needed"
)
train_df, val_df = train_test_split(
self.dataframe, test_size=val_split)
# convert to Huggingface dataset
self.train_ds = Dataset(pa.Table.from_pandas(train_df))
self.val_ds = Dataset(pa.Table.from_pandas(val_df))
def _get_topic_document_matrix(self):
assert (
self.trained
), "Model must be trained before accessing the topic-document matrix."
# Safely get the topic-document matrix with a default value of None if not found
return self.output.get("topic-document-matrix", None)
def _get_topics(self, predict_df: pd.DataFrame, top_words: int):
docs_per_topic = predict_df.groupby(["predictions"], as_index=False).agg(
{"text": " ".join}
)
tfidf, count = c_tf_idf(
docs_per_topic["text"].values, m=len(predict_df))
topics = extract_tfidf_topics(
tfidf,
count,
docs_per_topic,
n=top_words,
)
new_topics = {}
words_list = []
for k in predict_df["predictions"].unique():
words = [
word
for t in topics[k][0:top_words]
for word in t
if isinstance(word, str)
]
weights = [
weight
for t in topics[k][0:top_words]
for weight in t
if isinstance(weight, float)
]
weights = [weight / sum(weights) for weight in weights]
new_topics[k] = list(zip(words, weights))
words_list.append(words)
res_dic = {}
res_dic["topics"] = words_list
res_dic["topic-word-matrix"] = tfidf.T
res_dic["topic_dict"] = topics
one_hot_encoder = OneHotEncoder(
sparse=False
) # Use sparse=False to get a dense array
predictions_one_hot = one_hot_encoder.fit_transform(
predict_df[["predictions"]])
# Transpose the one-hot encoded matrix to get shape (k, n)
topic_document_matrix = predictions_one_hot.T
res_dic["topic-document-matrix"] = topic_document_matrix
return res_dic
[docs] def train_model(
self,
train_dataset,
predict_dataset,
val_split: float = 0.2,
n_top_words: int = 10,
**training_args,
):
"""
Trains the DCTE model using the given training dataset and then performs
prediction and topic extraction on the specified prediction dataset.
The method uses the SetFitTrainer for training and evaluates the model's performance.
It then applies the trained model for prediction and extracts topics using TF-IDF.
Parameters:
train_dataset: The dataset used for training the model.
predict_dataset: The dataset on which to perform prediction and topic extraction.
val_split (float, optional): The fraction of the training data to use as
validation data. Defaults to 0.2.
top_words (int, optional): The number of top words to extract for each topic.
Defaults to 10.
Returns:
dict: A dictionary containing the extracted topics and the topic-word matrix.
"""
# Set default training arguments
default_args = {
"batch_size": self.batch_size,
"num_epochs": self.num_epochs,
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
"num_iterations": self.num_iterations,
"load_best_model_at_end": True,
"loss": CosineSimilarityLoss,
}
# Update default arguments with any user-provided arguments
default_args.update(training_args)
# Use the updated arguments
args = TrainingArguments(**default_args)
self.train_dataset = train_dataset
self._prepare_data(val_split=val_split)
assert hasattr(self, "train_ds") and hasattr(
self, "val_ds"
), "The training and Validation datasets have to be processed before training"
self.trainer = Trainer(
model=self.model,
args=args,
train_dataset=self.train_ds,
eval_dataset=self.val_ds,
)
# train
self.trainer.train()
# evaluate accuracy
metrics = self.trainer.evaluate()
print("--- finished training ---")
print(metrics)
predict_df = pd.DataFrame({"tokens": predict_dataset.get_corpus()})
predict_df["text"] = [" ".join(words)
for words in predict_df["tokens"]]
self.labels = self.model(predict_df["text"])
predict_df["predictions"] = self.labels
self.output = self._get_topics(predict_df, n_top_words)
self.trained = True
return self.output