From 44d6f507ed25ebc5879c264e8387f97f03845671 Mon Sep 17 00:00:00 2001
From: hossein
Date: Mon, 13 Feb 2023 14:33:03 +0330
Subject: [PATCH] remove stop words before calculating similarities

---
Review notes (this section is not part of the commit message):
- stop words are removed from every keyphrase individually instead of from one
  ' | '-joined string; the joined form left dangling '|' tokens whenever a
  keyphrase started or ended with a stop word
- the blob hashes on the `index` line were not recomputed after this revision

 .../keyphrase_count_vectorizer.py             | 33 +++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
index 0a355bc..4e4260a 100644
--- a/keyphrase_vectorizers/keyphrase_count_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -145,6 +145,28 @@ def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm'
         self.binary = binary
         self.dtype = dtype
 
+    def remove_stopwords(self, text: str) -> str:
+        """
+        Remove the stop words from a whitespace-separated text.
+
+        Parameters
+        ----------
+        text : str
+            Text to filter, e.g. a single keyphrase.
+
+        Returns
+        -------
+        text : str
+            The words of `text` that are not in `self.stop_words`, joined by single spaces.
+        """
+
+        if self.stop_words is None:
+            return text
+
+        # NOTE(review): assumes self.stop_words is a collection of words; if it holds a
+        # language name such as 'english', `in` does substring matching - confirm against __init__
+        return ' '.join(word for word in text.split() if word not in self.stop_words)
+
     def fit(self, raw_documents: List[str]) -> object:
         """
         Learn the keyphrases that match the defined part-of-speech pattern from the list of raw documents.
@@ -170,7 +192,14 @@ def fit(self, raw_documents: List[str]) -> object:
 
-        # remove keyphrases that have more than 8 words, as they are probably no real keyphrases
+        # remove keyphrases that have more than 5 words, as they are probably no real keyphrases
         # additionally this prevents memory issues during transformation to a document-keyphrase matrix
-        self.keyphrases = [keyphrase for keyphrase in self.keyphrases if len(keyphrase.split()) <= 8]
+        self.keyphrases = [keyphrase for keyphrase in self.keyphrases if len(keyphrase.split()) <= 5]
+
+        # remove stop words per keyphrase, so that no separator handling is needed
+        if self.stop_words is not None:
+            self.keyphrases = [self.remove_stopwords(keyphrase) for keyphrase in self.keyphrases]
+
+        # stop word removal can leave empty or duplicate keyphrases: drop them and keep a sorted order
+        self.keyphrases = sorted({keyphrase for keyphrase in self.keyphrases if keyphrase})
 
         # compute document frequencies of keyphrases
         if self.max_df or self.min_df: