Skip to content

Commit 0b32167

Browse files
author
Maarten Grootendorst
authored
v0.9.1 (#211)
Fix #210, #208, #205, and #213
1 parent 80c9fa1 commit 0b32167

File tree

6 files changed

+35
-22
lines changed

6 files changed

+35
-22
lines changed

README.md

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,6 @@ pip install bertopic[spacy]
3838
pip install bertopic[use]
3939
```
4040

41-
To install all backends:
42-
43-
```bash
44-
pip install bertopic[all]
45-
```
46-
47-
4841
## Getting Started
4942
For an in-depth overview of the features of BERTopic
5043
you can check the full documentation [here](https://maartengr.github.io/BERTopic/) or you can follow along

bertopic/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from bertopic._bertopic import BERTopic
22

3-
__version__ = "0.9.0"
3+
__version__ = "0.9.1"
44

55
__all__ = [
66
"BERTopic",

bertopic/_bertopic.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -856,8 +856,9 @@ def reduce_topics(self,
856856
documents = pd.DataFrame({"Document": docs, "Topic": topics})
857857

858858
# Reduce number of topics
859-
self._extract_topics(documents)
860859
documents = self._reduce_topics(documents)
860+
self.merged_topics = None
861+
self._map_representative_docs()
861862

862863
# Extract topics and map probabilities
863864
new_topics = documents.Topic.to_list()
@@ -960,6 +961,7 @@ def visualize_topics_over_time(self,
960961
topics_over_time: pd.DataFrame,
961962
top_n_topics: int = None,
962963
topics: List[int] = None,
964+
normalize_frequency: bool = False,
963965
width: int = 1250,
964966
height: int = 450) -> go.Figure:
965967
""" Visualize topics over time
@@ -969,6 +971,7 @@ def visualize_topics_over_time(self,
969971
corresponding topic representation
970972
top_n_topics: To visualize the most frequent topics instead of all
971973
topics: Select which topics you would like to be visualized
974+
normalize_frequency: Whether to normalize each topic's frequency individually
972975
width: The width of the figure.
973976
height: The height of the figure.
974977
@@ -996,13 +999,15 @@ def visualize_topics_over_time(self,
996999
topics_over_time=topics_over_time,
9971000
top_n_topics=top_n_topics,
9981001
topics=topics,
1002+
normalize_frequency=normalize_frequency,
9991003
width=width,
10001004
height=height)
10011005

10021006
def visualize_topics_per_class(self,
10031007
topics_per_class: pd.DataFrame,
10041008
top_n_topics: int = 10,
10051009
topics: List[int] = None,
1010+
normalize_frequency: bool = False,
10061011
width: int = 1250,
10071012
height: int = 900) -> go.Figure:
10081013
""" Visualize topics per class
@@ -1012,6 +1017,7 @@ def visualize_topics_per_class(self,
10121017
corresponding topic representation
10131018
top_n_topics: To visualize the most frequent topics instead of all
10141019
topics: Select which topics you would like to be visualized
1020+
normalize_frequency: Whether to normalize each topic's frequency individually
10151021
width: The width of the figure.
10161022
height: The height of the figure.
10171023
@@ -1039,6 +1045,7 @@ def visualize_topics_per_class(self,
10391045
topics_per_class=topics_per_class,
10401046
top_n_topics=top_n_topics,
10411047
topics=topics,
1048+
normalize_frequency=normalize_frequency,
10421049
width=width,
10431050
height=height)
10441051

@@ -1491,7 +1498,7 @@ def _map_representative_docs(self):
14911498
representative_docs = self.representative_docs.copy()
14921499

14931500
# Remove topics that were merged as the most frequent
1494-
# topic or the topics they were merged into contain as they contain
1501+
# topic or the topics they were merged into as they contain
14951502
# better representative documents
14961503
if self.merged_topics:
14971504
for topic_to_remove in self.merged_topics:
@@ -1742,7 +1749,7 @@ def _auto_reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
17421749
if self.topic_embeddings is not None:
17431750
embeddings = np.array(self.topic_embeddings)
17441751
else:
1745-
embeddings = self.c_tf_idf
1752+
embeddings = self.c_tf_idf.toarray()
17461753
norm_data = normalize(embeddings, norm='l2')
17471754
predictions = hdbscan.HDBSCAN(min_cluster_size=2,
17481755
metric='euclidean',
@@ -1828,13 +1835,14 @@ def _map_probabilities(self, probabilities: Union[np.ndarray, None]) -> Union[np
18281835
mapped_probabilities: Updated probabilities
18291836
"""
18301837
# Map array of probabilities (probability for assigned topic per document)
1831-
if len(probabilities.shape) == 2 and self.get_topic(-1):
1832-
mapped_probabilities = np.zeros((probabilities.shape[0],
1833-
len(set(self.mapped_topics.values()))-1))
1834-
for from_topic, to_topic in self.mapped_topics.items():
1835-
if to_topic != -1 and from_topic != -1:
1836-
mapped_probabilities[:, to_topic] += probabilities[:, from_topic]
1837-
return mapped_probabilities
1838+
if probabilities is not None:
1839+
if len(probabilities.shape) == 2 and self.get_topic(-1):
1840+
mapped_probabilities = np.zeros((probabilities.shape[0],
1841+
len(set(self.mapped_topics.values()))-1))
1842+
for from_topic, to_topic in self.mapped_topics.items():
1843+
if to_topic != -1 and from_topic != -1:
1844+
mapped_probabilities[:, to_topic] += probabilities[:, from_topic]
1845+
return mapped_probabilities
18381846

18391847
return probabilities
18401848

bertopic/plotting/_distribution.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def visualize_distribution(topic_model,
3535
<iframe src="../../tutorial/visualization/probabilities.html"
3636
style="width:1000px; height: 500px; border: 0px;"></iframe>
3737
"""
38-
if len(probabilities.shape) != 2:
38+
if len(probabilities.shape) != 1:
3939
raise ValueError("This visualization cannot be used if you have set `calculate_probabilities` to False "
4040
"as it uses the topic probabilities of all topics. ")
4141
if len(probabilities[probabilities > min_probability]) == 0:

docs/changelog.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,16 @@
1+
## **Version 0.9.1**
2+
*Release date: 1 September, 2021*
3+
4+
A release focused on fixing several issues:
5+
6+
**Fixes**:
7+
8+
* Fix TypeError when auto-reducing topics ([#210](https://github.com/MaartenGr/BERTopic/issues/210))
9+
* Fix mapping representative docs when reducing topics ([#208](https://github.com/MaartenGr/BERTopic/issues/208))
10+
* Fix visualization issues with probabilities ([#205](https://github.com/MaartenGr/BERTopic/issues/205))
11+
* Fix missing `normalize_frequency` param in plots ([#213](https://github.com/MaartenGr/BERTopic/issues/213))
12+
13+
114
## **Version 0.9**
215
*Release date: 9 August, 2021*
316

setup.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
setup(
5454
name="bertopic",
5555
packages=find_packages(exclude=["notebooks", "docs"]),
56-
version="0.9.0",
56+
version="0.9.1",
5757
author="Maarten P. Grootendorst",
5858
author_email="maartengrootendorst@gmail.com",
5959
description="BERTopic performs topic Modeling with state-of-the-art transformer models.",
@@ -89,8 +89,7 @@
8989
"flair": flair_packages,
9090
"spacy": spacy_packages,
9191
"use": use_packages,
92-
"gensim": gensim_packages,
93-
"all": extra_packages
92+
"gensim": gensim_packages
9493
},
9594
python_requires='>=3.6',
9695
)

0 commit comments

Comments
 (0)