v0.9.3 - Quickfix (#284)

Maarten Grootendorst · web-flow · commit 15ea0cd804d3 · 2021-10-17T08:26:59.000+02:00
* Fix #282, #285, #288
diff --git a/bertopic/__init__.py b/bertopic/__init__.py
@@ -1,6 +1,6 @@
 from bertopic._bertopic import BERTopic
 
-__version__ = "0.9.2"
+__version__ = "0.9.3"
 
 __all__ = [
     "BERTopic",
diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
@@ -1,7 +1,13 @@
+import yaml
 import warnings
 warnings.filterwarnings("ignore", category=FutureWarning)
 warnings.filterwarnings("ignore", category=UserWarning)
 
+try:
+    yaml._warnings_enabled["YAMLLoadWarning"] = False
+except (KeyError, AttributeError, TypeError) as e:
+    pass
+
 import re
 import joblib
 import inspect
@@ -162,7 +168,6 @@ def __init__(self,
         self.topics = None
         self.topic_mapper = None
         self.topic_sizes = None
-        self.mapped_topics = None
         self.merged_topics = None
         self.topic_embeddings = None
         self.topic_sim_matrix = None
@@ -372,10 +377,8 @@ def transform(self,
         else:
             probabilities = None
 
-        if self.mapped_topics:
-            predictions = self._map_predictions(predictions)
-            probabilities = self._map_probabilities(probabilities)
-
+        probabilities = self._map_probabilities(probabilities, original_topics=True)
+        predictions = self._map_predictions(predictions)
         return predictions, probabilities
 
     def topics_over_time(self,
@@ -780,7 +783,7 @@ def get_topic_freq(self, topic: int = None) -> Union[pd.DataFrame, int]:
             return pd.DataFrame(self.topic_sizes.items(), columns=['Topic', 'Count']).sort_values("Count",
                                                                                                   ascending=False)
 
-    def get_representative_docs(self, topic: int) -> List[str]:
+    def get_representative_docs(self, topic: int = None) -> List[str]:
         """ Extract representative documents per topic
 
         Arguments:
@@ -1338,13 +1341,12 @@ def _extract_embeddings(self,
 
     def _map_predictions(self, predictions: List[int]) -> List[int]:
         """ Map predictions to the correct topics if topics were reduced """
-        if self.mapped_topics:
-            return [self.mapped_topics[prediction]
-                    if prediction in self.mapped_topics
-                    else prediction
-                    for prediction in predictions]
-        else:
-            return predictions
+        mappings = self.topic_mapper.get_mappings(original_topics=True)
+        mapped_predictions = [mappings[prediction]
+                              if prediction in mappings
+                              else -1
+                              for prediction in predictions]
+        return mapped_predictions
 
     def _reduce_dimensionality(self,
                                embeddings: Union[np.ndarray, csr_matrix],
@@ -1786,9 +1788,6 @@ def _sort_mappings_by_frequency(self, documents: pd.DataFrame) -> pd.DataFrame:
         """
         self._update_topic_size(documents)
 
-        if not self.mapped_topics:
-            self.mapped_topics = {topic: topic for topic in set(self.hdbscan_model.labels_)}
-
         # Map topics based on frequency
         df = pd.DataFrame(self.topic_sizes.items(), columns=["Old_Topic", "Size"]).sort_values("Size", ascending=False)
         df = df[df.Old_Topic != -1]
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -1,3 +1,14 @@
+## **Version 0.9.3**
+*Release date:  17 October, 2021*
+
+* Fix [#282](https://github.com/MaartenGr/BERTopic/issues/282)
+    * As it turns out, the old implementation of topic mapping was still being used in the `transform` function
+* Fix [#285](https://github.com/MaartenGr/BERTopic/issues/285)
+    * Fix getting all representative docs
+* Fix [#288](https://github.com/MaartenGr/BERTopic/issues/288)
+    * Caused by a recent issue with the `pyyaml` package that occurs in Google Colab
+
+
 ## **Version 0.9.2**
 *Release date:  12 October, 2021*
 
diff --git a/setup.py b/setup.py
@@ -19,7 +19,8 @@
     "scikit-learn>=0.22.2.post1",
     "tqdm>=4.41.1",
     "sentence-transformers>=0.4.1",
-    "plotly>=4.7.0,<4.14.3"
+    "plotly>=4.7.0,<4.14.3",
+    "pyyaml<6.0"
 ]
 
 flair_packages = [
@@ -46,14 +47,13 @@
 
 dev_packages = docs_packages + test_packages + extra_packages
 
-
 with open("README.md", "r") as fh:
     long_description = fh.read()
 
 setup(
     name="bertopic",
     packages=find_packages(exclude=["notebooks", "docs"]),
-    version="0.9.2",
+    version="0.9.3",
     author="Maarten P. Grootendorst",
     author_email="maartengrootendorst@gmail.com",
     description="BERTopic performs topic Modeling with state-of-the-art transformer models.",
diff --git a/tests/test_topic_representation.py b/tests/test_topic_representation.py
@@ -101,9 +101,7 @@ def test_topic_reduction(reduced_topics):
     assert old_freq.Count.sum() == new_freq.Count.sum()
     assert len(old_freq.Topic.unique()) == len(old_freq)
     assert len(new_freq.Topic.unique()) == len(new_freq)
-    assert isinstance(model.mapped_topics, dict)
     assert not set(model.get_topic_freq().Topic).difference(set(new_documents.Topic))
-    assert model.mapped_topics
 
 
 def test_topic_reduction_edge_cases():