# Source code for stream.preprocessor._preprocessor

import re
import unicodedata
from collections import Counter
from typing import List, Set

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from tqdm import tqdm


class TextPreprocessor:
    """
    Configurable text-cleaning pipeline built on NLTK and langdetect.

    A text is first cleaned at string level (case folding, contraction
    expansion, HTML/accents/special characters/digits/punctuation removal),
    then tokenized, optionally lemmatized and stemmed, and finally filtered
    by stop words, running word frequency, word length, an allow-list
    dictionary and character content.

    Parameters
    ----------
    **kwargs
        Optional settings (defaults in parentheses):

        language ("en")
            Language code compared against ``langdetect.detect``; texts
            detected as a different language are returned unprocessed.
        remove_stopwords (False), lowercase (True),
        remove_punctuation (True), remove_numbers (True),
        lemmatize (False), stem (False), expand_contractions (True),
        remove_html_tags (True), remove_special_chars (True),
        remove_accents (True), detokenize (False),
        remove_words_with_numbers (False),
        remove_words_with_special_chars (False)
            Switches for the individual pipeline steps.
        custom_stopwords ([])
            Extra stop words merged into the stop-word set.
        min_word_freq (2), max_word_freq (None)
            Bounds on the running word count; ``None`` disables a bound.
        min_word_length (3), max_word_length (None)
            Bounds on token length; ``None`` disables a bound.
        dictionary ([])
            If non-empty, only tokens contained in it are kept.

    Notes
    -----
    ``word_freq`` is a running counter that grows with every processed
    text, so the frequency filters depend on processing order: with the
    default ``min_word_freq=2`` the first occurrence of any word is dropped.
    """

    # ISO 639-1 codes (the format ``self.language`` is compared in against
    # langdetect's result) mapped to the file names used by NLTK's stopwords
    # corpus, which are full English language names.
    _NLTK_STOPWORD_LANGUAGES = {
        "ar": "arabic",
        "da": "danish",
        "de": "german",
        "el": "greek",
        "en": "english",
        "es": "spanish",
        "fi": "finnish",
        "fr": "french",
        "hu": "hungarian",
        "id": "indonesian",
        "it": "italian",
        "nl": "dutch",
        "no": "norwegian",
        "pt": "portuguese",
        "ro": "romanian",
        "ru": "russian",
        "sv": "swedish",
        "tr": "turkish",
    }

    def __init__(self, **kwargs):
        self.language = kwargs.get("language", "en")
        self.remove_stopwords = kwargs.get("remove_stopwords", False)
        self.lowercase = kwargs.get("lowercase", True)
        self.remove_punctuation = kwargs.get("remove_punctuation", True)
        self.remove_numbers = kwargs.get("remove_numbers", True)
        self.lemmatize = kwargs.get("lemmatize", False)
        self.stem = kwargs.get("stem", False)
        self.expand_contractions = kwargs.get("expand_contractions", True)
        self.remove_html_tags = kwargs.get("remove_html_tags", True)
        self.remove_special_chars = kwargs.get("remove_special_chars", True)
        self.remove_accents = kwargs.get("remove_accents", True)
        self.custom_stopwords = set(kwargs.get("custom_stopwords", []))
        self.detokenize = kwargs.get("detokenize", False)
        self.min_word_freq = kwargs.get("min_word_freq", 2)
        self.max_word_freq = kwargs.get("max_word_freq", None)
        self.min_word_length = kwargs.get("min_word_length", 3)
        self.max_word_length = kwargs.get("max_word_length", None)
        self.dictionary = set(kwargs.get("dictionary", []))
        self.remove_words_with_numbers = kwargs.get(
            "remove_words_with_numbers", False
        )
        self.remove_words_with_special_chars = kwargs.get(
            "remove_words_with_special_chars", False
        )

        if self.language != "en" and self.remove_stopwords:
            # NLTK expects e.g. "german", not "de"; unknown values are passed
            # through unchanged so full language names keep working.
            corpus_language = self._NLTK_STOPWORD_LANGUAGES.get(
                self.language, self.language
            )
        else:
            corpus_language = "english"
        self.stop_words = set(stopwords.words(corpus_language))
        self.stop_words.update(self.custom_stopwords)

        if self.lemmatize:
            self.lemmatizer = WordNetLemmatizer()
        if self.stem:
            self.stemmer = PorterStemmer()

        self.contractions_dict = self._load_contractions()
        self.word_freq = Counter()

    def _load_contractions(self):
        """Return the mapping of (lower-case) contractions to expansions."""
        return {
            "can't": "cannot",
            "won't": "will not",
            "n't": " not",
            "'re": " are",
            "'s": " is",
            "'d": " would",
            "'ll": " will",
            "'t": " not",
            "'ve": " have",
            "'m": " am",
        }

    def _expand_contractions(self, text):
        """
        Replace every contraction from ``contractions_dict`` in ``text``.

        Matching is case-insensitive; the replacement is looked up by the
        lower-cased match.
        """
        if not self.contractions_dict:
            # An empty alternation would match the empty string everywhere.
            return text

        # Longest keys first so a short key can never pre-empt a longer one
        # that starts at the same position; escape keys so they are literal.
        alternatives = sorted(self.contractions_dict, key=len, reverse=True)
        pattern = re.compile(
            "({})".format("|".join(re.escape(key) for key in alternatives)),
            flags=re.IGNORECASE,
        )

        def expand_match(match):
            found = match.group(0)
            # Fall back to the matched text when the table has no
            # lower-case entry, instead of handing None back to re.sub.
            return self.contractions_dict.get(found.lower(), found)

        return pattern.sub(expand_match, text)

    def _remove_html_tags(self, text):
        """Replace every ``<...>`` span with a single space."""
        return re.sub(r"<.*?>", " ", text)

    def _remove_special_characters(self, text):
        """Replace every character outside ASCII letters/digits/whitespace."""
        return re.sub(r"[^a-zA-Z0-9\s]", " ", text)

    def _remove_accents(self, text):
        """
        Strip diacritics by NFD-decomposing and dropping non-ASCII characters.

        Characters without an ASCII base form are removed entirely.
        """
        decomposed = unicodedata.normalize("NFD", text)
        return decomposed.encode("ascii", "ignore").decode("ascii")

    def _normalize_string(self, text):
        """Apply all enabled string-level cleaning steps to ``text``."""
        text = text.strip()
        if self.lowercase:
            text = text.lower()
        if self.expand_contractions:
            text = self._expand_contractions(text)
        if self.remove_html_tags:
            text = self._remove_html_tags(text)
        # Accents must be folded BEFORE special characters are blanked out;
        # otherwise accented letters are replaced by spaces first and words
        # such as "café" are truncated to "caf".
        if self.remove_accents:
            text = self._remove_accents(text)
        if self.remove_special_chars:
            text = self._remove_special_characters(text)
        if self.remove_numbers:
            text = re.sub(r"\d+", " ", text)
        if self.remove_punctuation:
            text = re.sub(r"[^\w\s]", " ", text)
        return text

    def _filter_words(self, words):
        """Apply the frequency, length, dictionary and content filters."""
        if self.min_word_freq is not None:
            words = [
                word for word in words
                if self.word_freq[word] >= self.min_word_freq
            ]
        if self.max_word_freq is not None:
            words = [
                word for word in words
                if self.word_freq[word] <= self.max_word_freq
            ]
        if self.min_word_length is not None:
            words = [
                word for word in words if len(word) >= self.min_word_length
            ]
        if self.max_word_length is not None:
            words = [
                word for word in words if len(word) <= self.max_word_length
            ]
        if self.dictionary:
            words = [word for word in words if word in self.dictionary]
        if self.remove_words_with_numbers:
            words = [
                word for word in words
                if not any(char.isdigit() for char in word)
            ]
        if self.remove_words_with_special_chars:
            words = [
                word for word in words
                if not re.search(r"[^a-zA-Z0-9\s]", word)
            ]
        return words

    def _clean_text(self, text):
        """
        Run the full pipeline on one text and return the cleaned string.

        Side effect: updates the running ``word_freq`` counter.
        """
        text = self._normalize_string(text)
        raw_tokens = word_tokenize(text)

        normalized = raw_tokens
        if self.lemmatize:
            normalized = [self.lemmatizer.lemmatize(tok) for tok in normalized]
        if self.stem:
            normalized = [self.stemmer.stem(tok) for tok in normalized]

        # Count the normalized forms: these are the strings the frequency
        # filters look up. Counting raw tokens would give every stem that
        # differs from its surface form a count of zero.
        self.word_freq.update(normalized)

        if self.remove_stopwords:
            # Stop-word membership is decided on the raw token.
            words = [
                norm
                for raw, norm in zip(raw_tokens, normalized)
                if raw not in self.stop_words
            ]
        else:
            words = normalized

        words = self._filter_words(words)

        if self.detokenize:
            text = TreebankWordDetokenizer().detokenize(words)
        else:
            text = " ".join(words)

        # Collapse runs of whitespace.
        return re.sub(r"\s+", " ", text)

    def preprocess_text(self, text):
        """
        Clean a single text.

        If langdetect identifies a language other than ``self.language`` the
        text is returned unchanged; if detection fails the text is cleaned
        anyway.
        """
        try:
            if detect(text) != self.language:
                return text
        except LangDetectException:
            # Detection failed; fall through and clean the text regardless.
            pass
        return self._clean_text(text)

    def preprocess_dataframe(self, df, text_column):
        """
        Clean ``df[text_column]`` in place and return ``df``.
        """
        df[text_column] = df[text_column].apply(self.preprocess_text)
        return df

    def preprocess_documents(self, documents: List[str]) -> List[str]:
        """
        Clean each document in order, showing a tqdm progress bar.
        """
        return [
            self.preprocess_text(doc)
            for doc in tqdm(documents, desc="Preprocessing documents")
        ]

    def add_custom_stopwords(self, stopwords: Set[str]):
        """
        Add custom stopwords to the preprocessor.

        Parameters
        ----------
        stopwords : set
            Set of custom stopwords to be added.
        """
        self.custom_stopwords.update(stopwords)
        self.stop_words.update(stopwords)

    def remove_custom_stopwords(self, stopwords: Set[str]):
        """
        Remove custom stopwords from the preprocessor.

        The words are removed from the active stop-word set as well, whether
        or not they were originally added as custom stopwords.

        Parameters
        ----------
        stopwords : set
            Set of custom stopwords to be removed.
        """
        self.custom_stopwords.difference_update(stopwords)
        self.stop_words.difference_update(stopwords)