From 44d6f507ed25ebc5879c264e8387f97f03845671 Mon Sep 17 00:00:00 2001
From: hossein <hse.khalilian08@gmail.com>
Date: Mon, 13 Feb 2023 14:33:03 +0330
Subject: [PATCH] remove stop words before calculating similarities

---
 .../keyphrase_count_vectorizer.py                 | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
index 0a355bc..4e4260a 100644
--- a/keyphrase_vectorizers/keyphrase_count_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -145,6 +145,11 @@ def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm'
         self.binary = binary
         self.dtype = dtype
 
+    def remove_stopwords(self, text):
+        """Return ``text`` minus the single-space-delimited tokens found in ``self.stop_words``."""
+        kept_tokens = (token for token in text.split(' ') if token not in self.stop_words)
+        return ' '.join(kept_tokens)
+
     def fit(self, raw_documents: List[str]) -> object:
         """
         Learn the keyphrases that match the defined part-of-speech pattern from the list of raw documents.
@@ -170,7 +175,15 @@ def fit(self, raw_documents: List[str]) -> object:
 
         # remove keyphrases that have more than 8 words, as they are probably no real keyphrases
         # additionally this prevents memory issues during transformation to a document-keyphrase matrix
-        self.keyphrases = [keyphrase for keyphrase in self.keyphrases if len(keyphrase.split()) <= 8]
+        self.keyphrases = [keyphrase for keyphrase in self.keyphrases if len(keyphrase.split()) <= 5]
+
+
+        keys = ' | '.join([key for key in self.keyphrases])
+        if self.stop_words is not None:
+            keys = self.remove_stopwords(keys)
+        keys = keys.replace(' | | ', ' | ')
+        self.keyphrases = list(np.unique(keys.split(' | ')))
+
 
         # compute document frequencies of keyphrases
         if self.max_df or self.min_df: