Skip to content

Commit 2f47b92

Browse files
author
Maarten Grootendorst
authored
v0.10.0 (#492)
* Use any dimensionality reduction technique instead of UMAP
* Use any clustering technique instead of HDBSCAN
* Add a CountVectorizer page with tips and tricks on how to create topic representations that fit your use case
* Added pages on how to use other dimensionality reduction and clustering algorithms
* Additional instructions on how to reduce outliers in the FAQ
* Fixed `None` being returned for probabilities when transforming unseen documents
* Replaced all instances of `arg:` with `Arguments:` for consistency
* Before saving a fitted BERTopic instance, we remove the stopwords in the fitted CountVectorizer model as it can get quite large due to the number of words that end up in stopwords if `min_df` is set to a value larger than 1
* Set `"hdbscan>=0.8.28"` to prevent numpy issues
* Update gensim dependency to `>=4.0.0` (#371)
* Fix topic 0 not appearing in visualizations (#472)
* Fix #506
* Fix #429
1 parent 681ac26 commit 2f47b92

File tree

26 files changed

+933
-271
lines changed

26 files changed

+933
-271
lines changed

.flake8

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[flake8]
2+
max-line-length = 160

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,4 @@ venv.bak/
7575

7676
.idea
7777
.idea/
78+
.vscode

bertopic/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from bertopic._bertopic import BERTopic
22

3-
__version__ = "0.9.4"
3+
__version__ = "0.10.0"
44

55
__all__ = [
66
"BERTopic",

bertopic/_bertopic.py

Lines changed: 86 additions & 52 deletions
Large diffs are not rendered by default.

bertopic/_utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,13 @@ def check_embeddings_shape(embeddings, docs):
5353

5454
def check_is_fitted(model):
5555
""" Checks if the model was fitted by verifying the presence of self.matches
56+
5657
Arguments:
5758
model: BERTopic instance for which the check is performed.
59+
5860
Returns:
5961
None
62+
6063
Raises:
6164
ValueError: If the matches were not found.
6265
"""

bertopic/backend/_gensim.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ def embed(self,
4848
Document/words embeddings with shape (n, m) with `n` documents/words
4949
that each have an embeddings size of `m`
5050
"""
51-
vector_shape = self.embedding_model.word_vec(list(self.embedding_model.vocab.keys())[0]).shape
52-
empty_vector = np.zeros(vector_shape[0])
51+
vector_shape = self.embedding_model.get_vector(list(self.embedding_model.index_to_key)[0]).shape[0]
52+
empty_vector = np.zeros(vector_shape)
5353

5454
embeddings = []
5555
for doc in tqdm(documents, disable=not verbose, position=0, leave=True):
@@ -58,7 +58,7 @@ def embed(self,
5858
# Extract word embeddings
5959
for word in doc.split(" "):
6060
try:
61-
word_embedding = self.embedding_model.word_vec(word)
61+
word_embedding = self.embedding_model.get_vector(word)
6262
doc_embedding.append(word_embedding)
6363
except KeyError:
6464
doc_embedding.append(empty_vector)

bertopic/plotting/_barchart.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,14 @@ def visualize_barchart(topic_model,
4646
colors = itertools.cycle(["#D55E00", "#0072B2", "#CC79A7", "#E69F00", "#56B4E9", "#009E73", "#F0E442"])
4747

4848
# Select topics based on top_n and topics args
49+
freq_df = topic_model.get_topic_freq()
50+
freq_df = freq_df.loc[freq_df.Topic != -1, :]
4951
if topics is not None:
5052
topics = list(topics)
5153
elif top_n_topics is not None:
52-
topics = topic_model.get_topic_freq().Topic.to_list()[1:top_n_topics + 1]
54+
topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
5355
else:
54-
topics = topic_model.get_topic_freq().Topic.to_list()[1:7]
56+
topics = sorted(freq_df.Topic.to_list()[0:6])
5557

5658
# Initialize figure
5759
subplot_titles = [f"Topic {topic}" for topic in topics]

bertopic/plotting/_heatmap.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,14 @@ def visualize_heatmap(topic_model,
5656
embeddings = topic_model.c_tf_idf
5757

5858
# Select topics based on top_n and topics args
59+
freq_df = topic_model.get_topic_freq()
60+
freq_df = freq_df.loc[freq_df.Topic != -1, :]
5961
if topics is not None:
6062
topics = list(topics)
6163
elif top_n_topics is not None:
62-
topics = sorted(topic_model.get_topic_freq().Topic.to_list()[1:top_n_topics + 1])
64+
topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
6365
else:
64-
topics = sorted(list(topic_model.get_topics().keys()))
66+
topics = sorted(freq_df.Topic.to_list())
6567

6668
# Order heatmap by similar clusters of topics
6769
if n_clusters:

bertopic/plotting/_hierarchy.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,14 @@ def visualize_hierarchy(topic_model,
5757
embeddings = topic_model.c_tf_idf
5858

5959
# Select topics based on top_n and topics args
60+
freq_df = topic_model.get_topic_freq()
61+
freq_df = freq_df.loc[freq_df.Topic != -1, :]
6062
if topics is not None:
61-
topics = sorted(list(topics))
63+
topics = list(topics)
6264
elif top_n_topics is not None:
63-
topics = sorted(topic_model.get_topic_freq().Topic.to_list()[1:top_n_topics + 1])
65+
topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
6466
else:
65-
topics = sorted(list(topic_model.get_topics().keys()))
67+
topics = sorted(freq_df.Topic.to_list())
6668

6769
# Select embeddings
6870
all_topics = sorted(list(topic_model.get_topics().keys()))

bertopic/plotting/_topics.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,14 @@ def visualize_topics(topic_model,
4343
style="width:1000px; height: 680px; border: 0px;""></iframe>
4444
"""
4545
# Select topics based on top_n and topics args
46+
freq_df = topic_model.get_topic_freq()
47+
freq_df = freq_df.loc[freq_df.Topic != -1, :]
4648
if topics is not None:
4749
topics = list(topics)
4850
elif top_n_topics is not None:
49-
topics = sorted(topic_model.get_topic_freq().Topic.to_list()[1:top_n_topics + 1])
51+
topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
5052
else:
51-
topics = sorted(list(topic_model.get_topics().keys()))
53+
topics = sorted(freq_df.Topic.to_list())
5254

5355
# Extract topic words and their frequencies
5456
topic_list = sorted(topics)

0 commit comments

Comments
 (0)