
Commit 5e28116

Merge remote-tracking branch 'origin/master'

2 parents 3dc3212 + 0ca2367

File tree

9 files changed: +120 -21 lines

.readthedocs.yaml

Lines changed: 1 addition & 2 deletions

@@ -9,7 +9,7 @@ version: 2
 sphinx:
   configuration: docs/conf.py
   builder: html
-  fail_on_warning: true
+  fail_on_warning: false
 
 # Optionally build your docs in additional formats such as PDF
 formats: all
@@ -18,7 +18,6 @@ formats: all
 python:
   install:
     - requirements: docs/requirements.txt
-    - requirements: requirements.txt
    - method: pip
      path: .
      extra_requirements:

README.md

Lines changed: 11 additions & 10 deletions

@@ -255,7 +255,7 @@ vectorizer = KeyphraseTfidfVectorizer()
 
 # Print parameters
 print(vectorizer.get_params())
->> > {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': <
+>>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': <
 class 'numpy.int64'>, 'lowercase': True, 'max_df': None
@@ -434,7 +434,7 @@ vectorizer.fit(docs)
 keyphrases = vectorizer.get_feature_names_out()
 print(keyphrases)
 
->>>['output value' 'information retrieval' 'algorithm' 'vector' 'groups'
+>>> ['output value' 'information retrieval' 'algorithm' 'vector' 'groups'
 'main topics' 'task' 'precise summary' 'supervised learning'
 'inductive bias' 'information retrieval environment'
 'supervised learning algorithm' 'function' 'input' 'pair'
@@ -735,12 +735,12 @@ vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3)
 
 # initial vectorizer fit
 vectorizer.fit_transform([docs[0]]).toarray()
->> > array([[1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3,
+>>> array([[1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3,
 1, 1, 1]])
 
 # check learned keyphrases
 print(vectorizer.get_feature_names_out())
->> > ['output pairs', 'output value', 'function', 'optimal scenario',
+>>> ['output pairs', 'output value', 'function', 'optimal scenario',
 'pair', 'supervised learning', 'supervisory signal', 'algorithm',
 'supervised learning algorithm', 'way', 'training examples',
 'input object', 'example', 'machine', 'output',
@@ -751,12 +751,12 @@ print(vectorizer.get_feature_names_out())
 # learn additional keyphrases from new documents with partial fit
 vectorizer.partial_fit([docs[1]])
 vectorizer.transform([docs[1]]).toarray()
->> > array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+>>> array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5, 1]])
 
 # check learned keyphrases, including newly learned ones
 print(vectorizer.get_feature_names_out())
->> > ['output pairs', 'output value', 'function', 'optimal scenario',
+>>> ['output pairs', 'output value', 'function', 'optimal scenario',
 'pair', 'supervised learning', 'supervisory signal', 'algorithm',
 'supervised learning algorithm', 'way', 'training examples',
 'input object', 'example', 'machine', 'output',
@@ -771,16 +771,16 @@ print(vectorizer.get_feature_names_out())
 # update list of learned keyphrases according to 'delete_min_df'
 vectorizer.update_bow([docs[1]])
 vectorizer.transform([docs[1]]).toarray()
->> > array([[5, 5]])
+>>> array([[5, 5]])
 
 # check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain)
 print(vectorizer.get_feature_names_out())
->> > ['keywords', 'document']
+>>> ['keywords', 'document']
 
 # update again and check the impact of 'decay' on the learned document-keyphrase matrix
 vectorizer.update_bow([docs[1]])
 vectorizer.X_.toarray()
->> > array([[7.5, 7.5]])
+>>> array([[7.5, 7.5]])
 ```
 
 <a name="#citation-information"/></a>
@@ -790,7 +790,8 @@ vectorizer.X_.toarray()
 [Back to Table of Contents](#toc)
 
 When citing KeyphraseVectorizers or PatternRank in academic papers and theses, please use this BibTeX entry:
-```
+
+```plaintext
 @conference{schopf_etal_kdir22,
 author={Tim Schopf and Simon Klimek and Florian Matthes},
 title={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},
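
The README excerpt above shows the online workflow with `decay=0.5` and `delete_min_df=3`: after the first `update_bow` call the matrix is `[[5, 5]]`, and a second `update_bow` call on the same document yields `[[7.5, 7.5]]`. A minimal sketch of the arithmetic that would produce these numbers, assuming the matrix is updated as "decayed previous matrix plus the new batch counts" (the exact rule is not shown in this diff):

```python
import numpy as np

# Assumed update rule (not confirmed by the diff): X_new = decay * X_old + new_counts
decay = 0.5
previous_counts = np.array([[5.0, 5.0]])  # matrix after the first update_bow call (README output)
new_counts = np.array([[5.0, 5.0]])       # the same document is passed again, so the counts repeat

updated = decay * previous_counts + new_counts
print(updated)  # [[7.5 7.5]] -- matches the README output in the hunk above
```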

docs/requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@ docutils>=0.16
 numpy>=1.18.5
 spacy>=3.0.1
 spacy-transformers>=1.1.6
+spacy-curated-transformers>=0.2.2
 nltk>=3.6.1
 scikit-learn>=1.0
 scipy>=1.7.3

keyphrase_vectorizers/keyphrase_count_vectorizer.py

Lines changed: 10 additions & 6 deletions

@@ -39,7 +39,7 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
     must be customized accordingly.
     Additionally, the ``pos_pattern`` parameter has to be customized as the `spaCy part-of-speech tags`_ differ between languages.
     Without customizing, the words will be tagged with wrong part-of-speech tags and no stopwords will be considered.
-    In addition, you have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
+    In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
 
     Parameters
     ----------
@@ -458,11 +458,15 @@ def update_bow(self, raw_documents: List[str]) -> csr_matrix:
         that do not exceed `self.delete_min_df` are removed from its
         vocabulary and bag-of-keywords matrix.
 
-        Arguments:
-            raw_documents: A list of documents
+        Parameters
+        ----------
+        raw_documents : iterable
+            An iterable of strings.
 
-        Returns:
-            X_: Bag-of-keywords matrix
+        Returns
+        -------
+        X_ : scipy.sparse.csr_matrix
+            Bag-of-keywords matrix
         """
 
         if hasattr(self, "X_"):
@@ -501,4 +505,4 @@ def _clean_bow(self) -> None:
         x = np.array(self.keyphrases)
         mask = np.full(len(self.keyphrases), True, dtype=bool)
         mask[indices] = False
-        self.keyphrases = list(x[~mask])
+        self.keyphrases = list(x[~mask])
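
The docstring hunks above soften the ``spacy_exclude`` guidance for non-English pipelines and switch ``update_bow`` to numpydoc-style documentation. Below is a minimal sketch of the kind of customization the docstring describes; apart from ``pos_pattern`` and ``spacy_exclude``, which appear in this commit, the keyword arguments and values are assumptions rather than something taken from the diff:

```python
from keyphrase_vectorizers import KeyphraseCountVectorizer

# Hypothetical German setup; assumes the spaCy model is installed
# (python -m spacy download de_core_news_sm).
vectorizer = KeyphraseCountVectorizer(
    spacy_pipeline='de_core_news_sm',  # assumed parameter name and model
    stop_words='german',               # assumed: stopword list matching the language
    pos_pattern='<ADJ.*>*<N.*>+',      # POS pattern adapted to the German tag set
    spacy_exclude=['parser', 'ner'],   # keep only the components the POS tagger needs
)

docs = ['Maschinelles Lernen ist ein Teilgebiet der künstlichen Intelligenz.']
vectorizer.fit(docs)
print(vectorizer.get_feature_names_out())
```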

keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):
     must be customized accordingly.
     Additionally, the ``pos_pattern`` parameter has to be customized as the `spaCy part-of-speech tags`_ differ between languages.
     Without customizing, the words will be tagged with wrong part-of-speech tags and no stopwords will be considered.
-    In addition, you have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
+    In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
 
     Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency.
     This is a common term weighting scheme in information retrieval,

keyphrase_vectorizers/keyphrase_vectorizer_mixin.py

Lines changed: 1 addition & 1 deletion

@@ -428,7 +428,7 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: Union[str, L
         else:
             pos_tuples = custom_pos_tagger(raw_documents=document_list)
 
-        # get the original documents after they were processed by spaCy
+        # get the original documents after they were processed by a tokenizer and a POS tagger
         processed_docs = []
         for tup in pos_tuples:
             token = tup[0]
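
The reworded comment above reflects that `pos_tuples` may come from a user-supplied `custom_pos_tagger` rather than spaCy. Based only on how the hunk consumes it (`custom_pos_tagger(raw_documents=document_list)` and `token = tup[0]`), such a callable appears to return a flat list of (token, POS tag) tuples. A hedged sketch using NLTK as a stand-in tagger; the exact expected signature and tag set are assumptions:

```python
from typing import List, Tuple

import nltk


def custom_pos_tagger(raw_documents: List[str]) -> List[Tuple[str, str]]:
    """Illustrative only: returns one flat list of (token, POS tag) tuples for all
    documents, mirroring how `pos_tuples` is consumed in the hunk above. NLTK emits
    Penn Treebank tags, which may not match the `pos_pattern` the vectorizer expects."""
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)

    pos_tuples = []
    for doc in raw_documents:
        pos_tuples.extend(nltk.pos_tag(nltk.word_tokenize(doc)))
    return pos_tuples


# Assumed wiring, following the 'custom_pos_tagger' parameter visible in the README diff:
# vectorizer = KeyphraseCountVectorizer(custom_pos_tagger=custom_pos_tagger)
```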

tests/requirements.txt

Lines changed: 4 additions & 1 deletion

@@ -1,4 +1,7 @@
 pytest>=7.0.1
 keybert>=0.5.0
 flair==0.11.3
-scipy==1.7.3
+scipy==1.7.3
+bertopic>=0.16.1
+scikit-learn>=1.0.1
+umap-learn==0.5.4

tests/test_vectorizers.py

Lines changed: 46 additions & 0 deletions

@@ -2,9 +2,11 @@
 
 import flair
 import spacy
+from bertopic import BERTopic
 from flair.models import SequenceTagger
 from flair.tokenization import SegtokSentenceSplitter
 from keybert import KeyBERT
+from sklearn.datasets import fetch_20newsgroups
 
 import tests.utils as utils
 from keyphrase_vectorizers import KeyphraseCountVectorizer, KeyphraseTfidfVectorizer
@@ -132,3 +134,47 @@ def custom_pos_tagger(raw_documents: List[str], tagger: flair.models.SequenceTag
     keyphrases = vectorizer.get_feature_names_out()
 
     assert sorted(keyphrases) == sorted_english_test_keyphrases
+
+
+def test_online_vectorizer():
+    first_doc_count_matrix = utils.get_sorted_english_first_doc_count_matrix()
+    second_doc_count_matrix = utils.get_sorted_english_second_doc_count_matrix()
+    first_doc_test_keyphrases = utils.get_english_first_doc_test_keyphrases()
+    english_keyphrases = utils.get_english_test_keyphrases()
+    frequencies_after_min_df = utils.get_frequencies_after_min_df()
+    frequent_keyphrases_after_min_df = utils.get_frequent_keyphrases_after_min_df()
+    frequencies_after_bow = utils.get_frequencies_after_bow()
+
+    # initial vectorizer fit
+    vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3)
+
+    assert [sorted(count_list) for count_list in
+            vectorizer.fit_transform([english_docs[0]]).toarray()] == first_doc_count_matrix
+    assert sorted(vectorizer.get_feature_names_out()) == first_doc_test_keyphrases
+
+    # learn additional keyphrases from new documents with partial fit
+    vectorizer.partial_fit([english_docs[1]])
+
+    assert [sorted(count_list) for count_list in
+            vectorizer.transform([english_docs[1]]).toarray()] == second_doc_count_matrix
+    assert sorted(vectorizer.get_feature_names_out()) == english_keyphrases
+
+    # update list of learned keyphrases according to 'delete_min_df'
+    vectorizer.update_bow([english_docs[1]])
+    assert (vectorizer.transform([english_docs[1]]).toarray() == frequencies_after_min_df).all()
+
+    # check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain)
+    assert sorted(vectorizer.get_feature_names_out()) == frequent_keyphrases_after_min_df
+
+    # update again and check the impact of 'decay' on the learned document-keyphrase matrix
+    vectorizer.update_bow([english_docs[1]])
+    assert (vectorizer.X_.toarray() == frequencies_after_bow).all()
+
+
+def test_bertopic():
+    data = fetch_20newsgroups(subset='train')
+    texts = data.data[:100]
+    topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer())
+    topics, probs = topic_model.fit_transform(documents=texts)
+    new_topics = topic_model.reduce_outliers(texts, topics)
+    topic_model.update_topics(texts, topics=new_topics)

tests/utils.py

Lines changed: 45 additions & 0 deletions

@@ -1,3 +1,4 @@
+import numpy as np
 def get_english_test_docs():
     english_docs = ["""Supervised learning is the machine learning task of learning a function that
             maps an input to an output based on example input-output pairs. It infers a
@@ -56,6 +57,17 @@ def get_english_test_keyphrases():
     return sorted_english_test_keyphrases
 
 
+def get_english_first_doc_test_keyphrases():
+    sorted_english_first_doc_test_keyphrases = ['algorithm', 'class labels', 'example', 'function', 'inductive bias',
+                                                'input', 'input object', 'machine', 'new examples', 'optimal scenario',
+                                                'output', 'output pairs', 'output value', 'pair', 'set',
+                                                'supervised learning', 'supervised learning algorithm',
+                                                'supervisory signal', 'task', 'training data', 'training examples',
+                                                'unseen instances', 'unseen situations', 'vector', 'way']
+
+    return sorted_english_first_doc_test_keyphrases
+
+
 def get_sorted_english_keyphrases_custom_flair_tagger():
     sorted_english_custom_tagger_keyphrases = ['algorithm', 'class labels', 'document', 'document content',
                                                'document relevance',
@@ -102,6 +114,21 @@ def get_sorted_english_count_matrix():
     return sorted_english_count_matrix
 
 
+def get_sorted_english_first_doc_count_matrix():
+    sorted_english_first_doc_count_matrix = [
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3]]
+
+    return sorted_english_first_doc_count_matrix
+
+
+def get_sorted_english_second_doc_count_matrix():
+    sorted_english_second_doc_count_matrix = [
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 2, 2, 5, 5]]
+
+    return sorted_english_second_doc_count_matrix
+
+
 def get_sorted_french_count_matrix():
     sorted_french_count_matrix = [[1, 1, 1, 1]]
 
@@ -130,3 +157,21 @@ def get_english_keybert_keyphrases():
                                   'document content']]
 
     return english_keybert_keyphrases
+
+
+def get_frequencies_after_min_df():
+    frequency_array = np.array([[5, 5]])
+
+    return frequency_array
+
+
+def get_frequencies_after_bow():
+    frequency_array = np.array([[7.5, 7.5]])
+
+    return frequency_array
+
+
+def get_frequent_keyphrases_after_min_df():
+    keyphrases = ['document', 'keywords']
+
+    return keyphrases