
Commit 5e28116

Merge remote-tracking branch 'origin/master'

2 parents 3dc3212 + 0ca2367

File tree

9 files changed: +120 -21 lines

.readthedocs.yaml

Lines changed: 1 addition & 2 deletions

@@ -9,7 +9,7 @@ version: 2
 sphinx:
   configuration: docs/conf.py
   builder: html
-  fail_on_warning: true
+  fail_on_warning: false
 
 # Optionally build your docs in additional formats such as PDF
 formats: all
@@ -18,7 +18,6 @@ formats: all
 python:
   install:
     - requirements: docs/requirements.txt
-    - requirements: requirements.txt
    - method: pip
      path: .
      extra_requirements:

README.md

Lines changed: 11 additions & 10 deletions

@@ -255,7 +255,7 @@ vectorizer = KeyphraseTfidfVectorizer()
 
 # Print parameters
 print(vectorizer.get_params())
->> > {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': <
+>>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': <
 class 'numpy.int64'>, 'lowercase': True, 'max_df': None
@@ -434,7 +434,7 @@ vectorizer.fit(docs)
 keyphrases = vectorizer.get_feature_names_out()
 print(keyphrases)
 
->>>['output value' 'information retrieval' 'algorithm' 'vector' 'groups'
+>>> ['output value' 'information retrieval' 'algorithm' 'vector' 'groups'
 'main topics' 'task' 'precise summary' 'supervised learning'
 'inductive bias' 'information retrieval environment'
 'supervised learning algorithm' 'function' 'input' 'pair'
@@ -735,12 +735,12 @@ vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3)
 
 # initial vectorizer fit
 vectorizer.fit_transform([docs[0]]).toarray()
->> > array([[1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3,
+>>> array([[1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3,
 1, 1, 1]])
 
 # check learned keyphrases
 print(vectorizer.get_feature_names_out())
->> > ['output pairs', 'output value', 'function', 'optimal scenario',
+>>> ['output pairs', 'output value', 'function', 'optimal scenario',
 'pair', 'supervised learning', 'supervisory signal', 'algorithm',
 'supervised learning algorithm', 'way', 'training examples',
 'input object', 'example', 'machine', 'output',
@@ -751,12 +751,12 @@ print(vectorizer.get_feature_names_out())
 # learn additional keyphrases from new documents with partial fit
 vectorizer.partial_fit([docs[1]])
 vectorizer.transform([docs[1]]).toarray()
->> > array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+>>> array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5, 1]])
 
 # check learned keyphrases, including newly learned ones
 print(vectorizer.get_feature_names_out())
->> > ['output pairs', 'output value', 'function', 'optimal scenario',
+>>> ['output pairs', 'output value', 'function', 'optimal scenario',
 'pair', 'supervised learning', 'supervisory signal', 'algorithm',
 'supervised learning algorithm', 'way', 'training examples',
 'input object', 'example', 'machine', 'output',
@@ -771,16 +771,16 @@ print(vectorizer.get_feature_names_out())
 # update list of learned keyphrases according to 'delete_min_df'
 vectorizer.update_bow([docs[1]])
 vectorizer.transform([docs[1]]).toarray()
->> > array([[5, 5]])
+>>> array([[5, 5]])
 
 # check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain)
 print(vectorizer.get_feature_names_out())
->> > ['keywords', 'document']
+>>> ['keywords', 'document']
 
 # update again and check the impact of 'decay' on the learned document-keyphrase matrix
 vectorizer.update_bow([docs[1]])
 vectorizer.X_.toarray()
->> > array([[7.5, 7.5]])
+>>> array([[7.5, 7.5]])
 ```
 
 <a name="#citation-information"/></a>
@@ -790,7 +790,8 @@ vectorizer.X_.toarray()
 [Back to Table of Contents](#toc)
 
 When citing KeyphraseVectorizers or PatternRank in academic papers and theses, please use this BibTeX entry:
-```
+
+```plaintext
 @conference{schopf_etal_kdir22,
 author={Tim Schopf and Simon Klimek and Florian Matthes},
 title={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},
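
The README excerpt above shows the online workflow with `decay=0.5` and `delete_min_df=3`: after the first `update_bow` call the matrix is `[[5, 5]]`, and a second `update_bow` call on the same document yields `[[7.5, 7.5]]`. A minimal sketch of the arithmetic that would produce these numbers, assuming the matrix is updated as "decayed previous matrix plus the new batch counts" (the exact rule is not shown in this diff):

```python
import numpy as np

# Assumed update rule (not confirmed by the diff): X_new = decay * X_old + new_counts
decay = 0.5
previous_counts = np.array([[5.0, 5.0]])  # matrix after the first update_bow call (README output)
new_counts = np.array([[5.0, 5.0]])       # the same document is passed again, so the counts repeat

updated = decay * previous_counts + new_counts
print(updated)  # [[7.5 7.5]] -- matches the README output in the hunk above
```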

docs/requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@ docutils>=0.16
 numpy>=1.18.5
 spacy>=3.0.1
 spacy-transformers>=1.1.6
+spacy-curated-transformers>=0.2.2
 nltk>=3.6.1
 scikit-learn>=1.0
 scipy>=1.7.3

keyphrase_vectorizers/keyphrase_count_vectorizer.py

Lines changed: 10 additions & 6 deletions

@@ -39,7 +39,7 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
     must be customized accordingly.
     Additionally, the ``pos_pattern`` parameter has to be customized as the `spaCy part-of-speech tags`_ differ between languages.
     Without customizing, the words will be tagged with wrong part-of-speech tags and no stopwords will be considered.
-    In addition, you have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
+    In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
 
     Parameters
     ----------
@@ -458,11 +458,15 @@ def update_bow(self, raw_documents: List[str]) -> csr_matrix:
         that do not exceed `self.delete_min_df` are removed from its
         vocabulary and bag-of-keywords matrix.
 
-        Arguments:
-            raw_documents: A list of documents
+        Parameters
+        ----------
+        raw_documents : iterable
+            An iterable of strings.
 
-        Returns:
-            X_: Bag-of-keywords matrix
+        Returns
+        -------
+        X_ : scipy.sparse.csr_matrix
+            Bag-of-keywords matrix
         """
 
         if hasattr(self, "X_"):
@@ -501,4 +505,4 @@ def _clean_bow(self) -> None:
         x = np.array(self.keyphrases)
         mask = np.full(len(self.keyphrases), True, dtype=bool)
         mask[indices] = False
-        self.keyphrases = list(x[~mask])
+        self.keyphrases = list(x[~mask])
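
The docstring hunks above soften the ``spacy_exclude`` guidance for non-English pipelines and switch ``update_bow`` to numpydoc-style documentation. Below is a minimal sketch of the kind of customization the docstring describes; apart from ``pos_pattern`` and ``spacy_exclude``, which appear in this commit, the keyword arguments and values are assumptions rather than something taken from the diff:

```python
from keyphrase_vectorizers import KeyphraseCountVectorizer

# Hypothetical German setup; assumes the spaCy model is installed
# (python -m spacy download de_core_news_sm).
vectorizer = KeyphraseCountVectorizer(
    spacy_pipeline='de_core_news_sm',  # assumed parameter name and model
    stop_words='german',               # assumed: stopword list matching the language
    pos_pattern='<ADJ.*>*<N.*>+',      # POS pattern adapted to the German tag set
    spacy_exclude=['parser', 'ner'],   # keep only the components the POS tagger needs
)

docs = ['Maschinelles Lernen ist ein Teilgebiet der künstlichen Intelligenz.']
vectorizer.fit(docs)
print(vectorizer.get_feature_names_out())
```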

keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):
     must be customized accordingly.
     Additionally, the ``pos_pattern`` parameter has to be customized as the `spaCy part-of-speech tags`_ differ between languages.
     Without customizing, the words will be tagged with wrong part-of-speech tags and no stopwords will be considered.
-    In addition, you have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
+    In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
 
     Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency.
     This is a common term weighting scheme in information retrieval,

keyphrase_vectorizers/keyphrase_vectorizer_mixin.py

Lines changed: 1 addition & 1 deletion

@@ -428,7 +428,7 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: Union[str, L
         else:
             pos_tuples = custom_pos_tagger(raw_documents=document_list)
 
-        # get the original documents after they were processed by spaCy
+        # get the original documents after they were processed by a tokenizer and a POS tagger
         processed_docs = []
         for tup in pos_tuples:
             token = tup[0]
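
The reworded comment above reflects that `pos_tuples` may come from a user-supplied `custom_pos_tagger` rather than spaCy. Based only on how the hunk consumes it (`custom_pos_tagger(raw_documents=document_list)` and `token = tup[0]`), such a callable appears to return a flat list of (token, POS tag) tuples. A hedged sketch using NLTK as a stand-in tagger; the exact expected signature and tag set are assumptions:

```python
from typing import List, Tuple

import nltk


def custom_pos_tagger(raw_documents: List[str]) -> List[Tuple[str, str]]:
    """Illustrative only: returns one flat list of (token, POS tag) tuples for all
    documents, mirroring how `pos_tuples` is consumed in the hunk above. NLTK emits
    Penn Treebank tags, which may not match the `pos_pattern` the vectorizer expects."""
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)

    pos_tuples = []
    for doc in raw_documents:
        pos_tuples.extend(nltk.pos_tag(nltk.word_tokenize(doc)))
    return pos_tuples


# Assumed wiring, following the 'custom_pos_tagger' parameter visible in the README diff:
# vectorizer = KeyphraseCountVectorizer(custom_pos_tagger=custom_pos_tagger)
```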

tests/requirements.txt

Lines changed: 4 additions & 1 deletion

@@ -1,4 +1,7 @@
 pytest>=7.0.1
 keybert>=0.5.0
 flair==0.11.3
-scipy==1.7.3
+scipy==1.7.3
+bertopic>=0.16.1
+scikit-learn>=1.0.1
+umap-learn==0.5.4

tests/test_vectorizers.py

Lines changed: 46 additions & 0 deletions

@@ -2,9 +2,11 @@
 
 import flair
 import spacy
+from bertopic import BERTopic
 from flair.models import SequenceTagger
 from flair.tokenization import SegtokSentenceSplitter
 from keybert import KeyBERT
+from sklearn.datasets import fetch_20newsgroups
 
 import tests.utils as utils
 from keyphrase_vectorizers import KeyphraseCountVectorizer, KeyphraseTfidfVectorizer
@@ -132,3 +134,47 @@ def custom_pos_tagger(raw_documents: List[str], tagger: flair.models.SequenceTag
     keyphrases = vectorizer.get_feature_names_out()
 
     assert sorted(keyphrases) == sorted_english_test_keyphrases
+
+
+def test_online_vectorizer():
+    first_doc_count_matrix = utils.get_sorted_english_first_doc_count_matrix()
+    second_doc_count_matrix = utils.get_sorted_english_second_doc_count_matrix()
+    first_doc_test_keyphrases = utils.get_english_first_doc_test_keyphrases()
+    english_keyphrases = utils.get_english_test_keyphrases()
+    frequencies_after_min_df = utils.get_frequencies_after_min_df()
+    frequent_keyphrases_after_min_df = utils.get_frequent_keyphrases_after_min_df()
+    frequencies_after_bow = utils.get_frequencies_after_bow()
+
+    # initial vectorizer fit
+    vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3)
+
+    assert [sorted(count_list) for count_list in
+            vectorizer.fit_transform([english_docs[0]]).toarray()] == first_doc_count_matrix
+    assert sorted(vectorizer.get_feature_names_out()) == first_doc_test_keyphrases
+
+    # learn additional keyphrases from new documents with partial fit
+    vectorizer.partial_fit([english_docs[1]])
+
+    assert [sorted(count_list) for count_list in
+            vectorizer.transform([english_docs[1]]).toarray()] == second_doc_count_matrix
+    assert sorted(vectorizer.get_feature_names_out()) == english_keyphrases
+
+    # update list of learned keyphrases according to 'delete_min_df'
+    vectorizer.update_bow([english_docs[1]])
+    assert (vectorizer.transform([english_docs[1]]).toarray() == frequencies_after_min_df).all()
+
+    # check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain)
+    assert sorted(vectorizer.get_feature_names_out()) == frequent_keyphrases_after_min_df
+
+    # update again and check the impact of 'decay' on the learned document-keyphrase matrix
+    vectorizer.update_bow([english_docs[1]])
+    assert (vectorizer.X_.toarray() == frequencies_after_bow).all()
+
+
+def test_bertopic():
+    data = fetch_20newsgroups(subset='train')
+    texts = data.data[:100]
+    topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer())
+    topics, probs = topic_model.fit_transform(documents=texts)
+    new_topics = topic_model.reduce_outliers(texts, topics)
+    topic_model.update_topics(texts, topics=new_topics)

tests/utils.py

Lines changed: 45 additions & 0 deletions

@@ -1,3 +1,4 @@
+import numpy as np
 def get_english_test_docs():
     english_docs = ["""Supervised learning is the machine learning task of learning a function that
             maps an input to an output based on example input-output pairs. It infers a
@@ -56,6 +57,17 @@ def get_english_test_keyphrases():
     return sorted_english_test_keyphrases
 
 
+def get_english_first_doc_test_keyphrases():
+    sorted_english_first_doc_test_keyphrases = ['algorithm', 'class labels', 'example', 'function', 'inductive bias',
+                                                'input', 'input object', 'machine', 'new examples', 'optimal scenario',
+                                                'output', 'output pairs', 'output value', 'pair', 'set',
+                                                'supervised learning', 'supervised learning algorithm',
+                                                'supervisory signal', 'task', 'training data', 'training examples',
+                                                'unseen instances', 'unseen situations', 'vector', 'way']
+
+    return sorted_english_first_doc_test_keyphrases
+
+
 def get_sorted_english_keyphrases_custom_flair_tagger():
     sorted_english_custom_tagger_keyphrases = ['algorithm', 'class labels', 'document', 'document content',
                                                'document relevance',
@@ -102,6 +114,21 @@ def get_sorted_english_count_matrix():
     return sorted_english_count_matrix
 
 
+def get_sorted_english_first_doc_count_matrix():
+    sorted_english_first_doc_count_matrix = [
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3]]
+
+    return sorted_english_first_doc_count_matrix
+
+
+def get_sorted_english_second_doc_count_matrix():
+    sorted_english_second_doc_count_matrix = [
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 2, 2, 5, 5]]
+
+    return sorted_english_second_doc_count_matrix
+
+
 def get_sorted_french_count_matrix():
     sorted_french_count_matrix = [[1, 1, 1, 1]]
 
@@ -130,3 +157,21 @@ def get_english_keybert_keyphrases():
                                   'document content']]
 
     return english_keybert_keyphrases
+
+
+def get_frequencies_after_min_df():
+    frequency_array = np.array([[5, 5]])
+
+    return frequency_array
+
+
+def get_frequencies_after_bow():
+    frequency_array = np.array([[7.5, 7.5]])
+
+    return frequency_array
+
+
+def get_frequent_keyphrases_after_min_df():
+    keyphrases = ['document', 'keywords']
+
+    return keyphrases