Skip to content

Commit 15ea0cd

Browse files
author
Maarten Grootendorst
authored
v0.9.3 - Quickfix (#284)
* Fix #282, #285, #288
1 parent b3aa266 commit 15ea0cd

File tree

5 files changed

+30
-22
lines changed

5 files changed

+30
-22
lines changed

bertopic/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from bertopic._bertopic import BERTopic
22

3-
__version__ = "0.9.2"
3+
__version__ = "0.9.3"
44

55
__all__ = [
66
"BERTopic",

bertopic/_bertopic.py

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,13 @@
1+
import yaml
12
import warnings
23
warnings.filterwarnings("ignore", category=FutureWarning)
34
warnings.filterwarnings("ignore", category=UserWarning)
45

6+
try:
7+
yaml._warnings_enabled["YAMLLoadWarning"] = False
8+
except (KeyError, AttributeError, TypeError) as e:
9+
pass
10+
511
import re
612
import joblib
713
import inspect
@@ -162,7 +168,6 @@ def __init__(self,
162168
self.topics = None
163169
self.topic_mapper = None
164170
self.topic_sizes = None
165-
self.mapped_topics = None
166171
self.merged_topics = None
167172
self.topic_embeddings = None
168173
self.topic_sim_matrix = None
@@ -372,10 +377,8 @@ def transform(self,
372377
else:
373378
probabilities = None
374379

375-
if self.mapped_topics:
376-
predictions = self._map_predictions(predictions)
377-
probabilities = self._map_probabilities(probabilities)
378-
380+
probabilities = self._map_probabilities(probabilities, original_topics=True)
381+
predictions = self._map_predictions(predictions)
379382
return predictions, probabilities
380383

381384
def topics_over_time(self,
@@ -780,7 +783,7 @@ def get_topic_freq(self, topic: int = None) -> Union[pd.DataFrame, int]:
780783
return pd.DataFrame(self.topic_sizes.items(), columns=['Topic', 'Count']).sort_values("Count",
781784
ascending=False)
782785

783-
def get_representative_docs(self, topic: int) -> List[str]:
786+
def get_representative_docs(self, topic: int = None) -> List[str]:
784787
""" Extract representative documents per topic
785788
786789
Arguments:
@@ -1338,13 +1341,12 @@ def _extract_embeddings(self,
13381341

13391342
def _map_predictions(self, predictions: List[int]) -> List[int]:
13401343
""" Map predictions to the correct topics if topics were reduced """
1341-
if self.mapped_topics:
1342-
return [self.mapped_topics[prediction]
1343-
if prediction in self.mapped_topics
1344-
else prediction
1345-
for prediction in predictions]
1346-
else:
1347-
return predictions
1344+
mappings = self.topic_mapper.get_mappings(original_topics=True)
1345+
mapped_predictions = [mappings[prediction]
1346+
if prediction in mappings
1347+
else -1
1348+
for prediction in predictions]
1349+
return mapped_predictions
13481350

13491351
def _reduce_dimensionality(self,
13501352
embeddings: Union[np.ndarray, csr_matrix],
@@ -1786,9 +1788,6 @@ def _sort_mappings_by_frequency(self, documents: pd.DataFrame) -> pd.DataFrame:
17861788
"""
17871789
self._update_topic_size(documents)
17881790

1789-
if not self.mapped_topics:
1790-
self.mapped_topics = {topic: topic for topic in set(self.hdbscan_model.labels_)}
1791-
17921791
# Map topics based on frequency
17931792
df = pd.DataFrame(self.topic_sizes.items(), columns=["Old_Topic", "Size"]).sort_values("Size", ascending=False)
17941793
df = df[df.Old_Topic != -1]

docs/changelog.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
## **Version 0.9.3**
2+
*Release date: 17 October, 2021*
3+
4+
* Fix [#282](https://github.com/MaartenGr/BERTopic/issues/282)
5+
* As it turns out, the old implementation of topic mapping was still present in the `transform` function
6+
* Fix [#285](https://github.com/MaartenGr/BERTopic/issues/285)
7+
* Fix getting all representative docs
8+
* Fix [#288](https://github.com/MaartenGr/BERTopic/issues/288)
9+
* Works around a recent issue with the `pyyaml` package that surfaces in Google Colab
10+
11+
112
## **Version 0.9.2**
213
*Release date: 12 October, 2021*
314

setup.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
"scikit-learn>=0.22.2.post1",
2020
"tqdm>=4.41.1",
2121
"sentence-transformers>=0.4.1",
22-
"plotly>=4.7.0,<4.14.3"
22+
"plotly>=4.7.0,<4.14.3",
23+
"pyyaml<6.0"
2324
]
2425

2526
flair_packages = [
@@ -46,14 +47,13 @@
4647

4748
dev_packages = docs_packages + test_packages + extra_packages
4849

49-
5050
with open("README.md", "r") as fh:
5151
long_description = fh.read()
5252

5353
setup(
5454
name="bertopic",
5555
packages=find_packages(exclude=["notebooks", "docs"]),
56-
version="0.9.2",
56+
version="0.9.3",
5757
author="Maarten P. Grootendorst",
5858
author_email="maartengrootendorst@gmail.com",
5959
description="BERTopic performs topic Modeling with state-of-the-art transformer models.",

tests/test_topic_representation.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,7 @@ def test_topic_reduction(reduced_topics):
101101
assert old_freq.Count.sum() == new_freq.Count.sum()
102102
assert len(old_freq.Topic.unique()) == len(old_freq)
103103
assert len(new_freq.Topic.unique()) == len(new_freq)
104-
assert isinstance(model.mapped_topics, dict)
105104
assert not set(model.get_topic_freq().Topic).difference(set(new_documents.Topic))
106-
assert model.mapped_topics
107105

108106

109107
def test_topic_reduction_edge_cases():

0 commit comments

Comments (0)