@@ -856,8 +856,9 @@ def reduce_topics(self,
         documents = pd.DataFrame({"Document": docs, "Topic": topics})

         # Reduce number of topics
-        self._extract_topics(documents)
         documents = self._reduce_topics(documents)
+        self.merged_topics = None
+        self._map_representative_docs()

         # Extract topics and map probabilities
         new_topics = documents.Topic.to_list()
@@ -960,6 +961,7 @@ def visualize_topics_over_time(self,
                                    topics_over_time: pd.DataFrame,
                                    top_n_topics: int = None,
                                    topics: List[int] = None,
+                                   normalize_frequency: bool = False,
                                    width: int = 1250,
                                    height: int = 450) -> go.Figure:
         """ Visualize topics over time
965967 """ Visualize topics over time
@@ -969,6 +971,7 @@ def visualize_topics_over_time(self,
                 corresponding topic representation
             top_n_topics: To visualize the most frequent topics instead of all
             topics: Select which topics you would like to be visualized
+            normalize_frequency: Whether to normalize each topic's frequency individually
             width: The width of the figure.
             height: The height of the figure.

@@ -996,13 +999,15 @@ def visualize_topics_over_time(self,
                                                     topics_over_time=topics_over_time,
                                                     top_n_topics=top_n_topics,
                                                     topics=topics,
+                                                    normalize_frequency=normalize_frequency,
                                                     width=width,
                                                     height=height)

     def visualize_topics_per_class(self,
                                    topics_per_class: pd.DataFrame,
                                    top_n_topics: int = 10,
                                    topics: List[int] = None,
+                                   normalize_frequency: bool = False,
                                    width: int = 1250,
                                    height: int = 900) -> go.Figure:
         """ Visualize topics per class
@@ -1012,6 +1017,7 @@ def visualize_topics_per_class(self,
                 corresponding topic representation
             top_n_topics: To visualize the most frequent topics instead of all
             topics: Select which topics you would like to be visualized
+            normalize_frequency: Whether to normalize each topic's frequency individually
             width: The width of the figure.
             height: The height of the figure.

@@ -1039,6 +1045,7 @@ def visualize_topics_per_class(self,
                                                     topics_per_class=topics_per_class,
                                                     top_n_topics=top_n_topics,
                                                     topics=topics,
+                                                    normalize_frequency=normalize_frequency,
                                                     width=width,
                                                     height=height)

@@ -1491,7 +1498,7 @@ def _map_representative_docs(self):
         representative_docs = self.representative_docs.copy()

         # Remove topics that were merged as the most frequent
-        # topic or the topics they were merged into contain as they contain
+        # topic or the topics they were merged into as they contain
         # better representative documents
         if self.merged_topics:
             for topic_to_remove in self.merged_topics:
@@ -1742,7 +1749,7 @@ def _auto_reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
         if self.topic_embeddings is not None:
             embeddings = np.array(self.topic_embeddings)
         else:
-            embeddings = self.c_tf_idf
+            embeddings = self.c_tf_idf.toarray()
         norm_data = normalize(embeddings, norm='l2')
         predictions = hdbscan.HDBSCAN(min_cluster_size=2,
                                       metric='euclidean',
@@ -1828,13 +1835,14 @@ def _map_probabilities(self, probabilities: Union[np.ndarray, None]) -> Union[np
             mapped_probabilities: Updated probabilities
         """
         # Map array of probabilities (probability for assigned topic per document)
-        if len(probabilities.shape) == 2 and self.get_topic(-1):
-            mapped_probabilities = np.zeros((probabilities.shape[0],
-                                             len(set(self.mapped_topics.values())) - 1))
-            for from_topic, to_topic in self.mapped_topics.items():
-                if to_topic != -1 and from_topic != -1:
-                    mapped_probabilities[:, to_topic] += probabilities[:, from_topic]
-            return mapped_probabilities
+        if probabilities is not None:
+            if len(probabilities.shape) == 2 and self.get_topic(-1):
+                mapped_probabilities = np.zeros((probabilities.shape[0],
+                                                 len(set(self.mapped_topics.values())) - 1))
+                for from_topic, to_topic in self.mapped_topics.items():
+                    if to_topic != -1 and from_topic != -1:
+                        mapped_probabilities[:, to_topic] += probabilities[:, from_topic]
+                return mapped_probabilities

         return probabilities
0 commit comments