Skip to content

Commit 07b4428

Browse files
committed
Fix topic_sizes_ not updated in zero-shot topic modeling (#2384)
1 parent 144ab7b commit 07b4428

File tree

3 files changed

+194
-0
lines changed

3 files changed

+194
-0
lines changed

bertopic/_bertopic.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,9 @@ def fit_transform(
505505
documents = assigned_documents
506506
embeddings = assigned_embeddings
507507

508+
# Update topic_sizes_ when all documents are assigned to zero-shot topics
509+
self._update_topic_size(documents)
510+
508511
# Sort and Map Topic IDs by their frequency
509512
if not self.nr_topics:
510513
documents = self._sort_mappings_by_frequency(documents)

docs/changelog.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ hide:
55

66
# Changelog
77

8+
## **Unreleased**
9+
10+
<h3><b>Fixes:</b></h3>
11+
12+
* Fix `topic_sizes_` not being updated in zero-shot topic modeling when using `nr_topics` parameter ([#2384](https://github.com/MaartenGr/BERTopic/issues/2384))
13+
814
## **Version 0.17.3**
915
*Release date: 8 July, 2025*
1016

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
"""
2+
Tests for zero-shot topic modeling functionality.
3+
4+
This module tests various aspects of zero-shot topic modeling, including
5+
edge cases with the nr_topics parameter and topic_sizes_ consistency.
6+
"""
7+
8+
from bertopic import BERTopic
9+
from umap import UMAP
10+
11+
12+
def test_zeroshot_with_nr_topics():
    """Check that `topic_sizes_` is populated when zero-shot topics are combined with `nr_topics`."""
    zeroshot_topics = ["Technology and Programming"]
    docs = [
        "This is about machine learning and artificial intelligence",
        "Deep learning neural networks are powerful",
        "Python programming for data science",
        "Machine learning algorithms and models",
        "Artificial intelligence and deep learning",
        "Data science with Python programming",
        "Neural networks and machine learning",
        "Programming in Python for AI",
        "Deep learning models and algorithms",
        "Artificial intelligence programming",
    ]

    topic_model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=0.1,
        nr_topics=2,
        min_topic_size=2,
    )
    # Only the topic assignments are inspected; the probabilities are discarded.
    topics, _ = topic_model.fit_transform(docs)

    # topic_sizes_ must exist and hold at least one topic.
    assert topic_model.topic_sizes_ is not None
    assert len(topic_model.topic_sizes_) > 0

    # The per-topic sizes must add up to the number of input documents.
    assert sum(topic_model.topic_sizes_.values()) == len(docs)

    # Every topic that was assigned to a document must have a size entry.
    for assigned_topic in set(topics):
        assert assigned_topic in topic_model.topic_sizes_
46+
47+
48+
def test_zeroshot_all_documents_assigned():
    """Test edge case where all documents are assigned to zero-shot topics.

    A very low `zeroshot_min_similarity` is used so that the single zero-shot
    topic is intended to absorb every document. After fitting, `topic_sizes_`
    must be set and its counts must add up to the number of documents.
    """
    docs = [
        "Technology is advancing rapidly",
        "Software development is important",
        "Programming languages are evolving",
        "Computer science research continues",
        "Digital transformation is happening",
        "Innovation in technology sector",
        "Software engineering best practices",
        "Modern programming techniques",
        "Computer systems and architecture",
        "Digital solutions and platforms",
        "Technology trends and developments",
        "Software design patterns",
        "Programming paradigms evolution",
        "Computing infrastructure advances",
        "Digital innovation strategies",
    ]

    zeroshot_topics = ["Technology"]
    # Fixed random_state keeps the dimensionality reduction reproducible.
    umap_model = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric="cosine", random_state=42)

    topic_model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=0.05,
        nr_topics=2,
        min_topic_size=1,
        umap_model=umap_model,
    )

    topic_model.fit_transform(docs)

    # Check for None first: calling .values() on a missing topic_sizes_ would
    # raise AttributeError and hide the failure this assertion is meant to report.
    assert topic_model.topic_sizes_ is not None

    # Verify all documents are accounted for
    total_in_sizes = sum(topic_model.topic_sizes_.values())
    assert total_in_sizes == len(docs)
85+
86+
87+
def test_zeroshot_topic_info_consistency():
    """Check that `get_topic_info()` reports the same per-topic counts as `topic_sizes_`."""
    zeroshot_topics = ["Artificial Intelligence"]
    docs = [
        "AI and machine learning research",
        "Deep learning neural networks",
        "Neural network architectures",
        "Machine learning algorithms",
        "Artificial intelligence systems",
        "Deep learning models training",
        "Neural network optimization",
        "Machine learning applications",
        "AI research and development",
        "Deep learning frameworks",
    ]

    # Fixed random_state keeps the dimensionality reduction reproducible.
    reducer = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric="cosine", random_state=42)
    topic_model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=0.1,
        nr_topics=2,
        min_topic_size=1,
        umap_model=reducer,
    )
    topic_model.fit_transform(docs)

    # The topic overview must contain at least one row.
    info = topic_model.get_topic_info()
    assert not info.empty
    assert info.shape[0] > 0

    # Each topic in topic_sizes_ must appear in the overview with an identical count.
    counts_by_topic = dict(zip(info.Topic, info.Count))
    for topic_id, expected_count in topic_model.topic_sizes_.items():
        assert topic_id in counts_by_topic
        assert counts_by_topic[topic_id] == expected_count
124+
125+
126+
def test_github_issue_2384_reproduction():
    """Regression test built from the reproduction case in GitHub issue #2384."""
    # Three distinct sentences repeated 50 times, one zero-shot label per sentence.
    base_docs = ["I need help with my voucher", "Gift card not working", "Customer service was poor"]
    docs = base_docs * 50
    zeroshot_topics = ["Voucher inquiries", "Gift card issues", "Customer service feedback"]

    model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=-1,
        nr_topics=4,
    )
    model.fit_transform(docs)

    # topic_sizes_ must be populated after fitting.
    assert model.topic_sizes_ is not None
    assert len(model.topic_sizes_) > 0

    # get_topic_info() must return a non-empty table.
    info = model.get_topic_info()
    assert not info.empty
    assert info.shape[0] > 0

    # The per-topic sizes must add up to the number of input documents.
    assert sum(model.topic_sizes_.values()) == len(docs)

    # topic_representations_ must remain available (guards against a regression).
    assert model.topic_representations_ is not None
    assert len(model.topic_representations_) > 0
156+
157+
158+
def test_zeroshot_nr_topics_consistency():
    """Fit the same zero-shot setup with and without `nr_topics` and compare the bookkeeping."""
    base_docs = ["I need help with my voucher", "Gift card not working", "Customer service was poor"]
    docs = base_docs * 20
    zeroshot_topics = ["Voucher inquiries", "Gift card issues", "Customer service feedback"]

    # Baseline: no topic reduction requested.
    model_without = BERTopic(zeroshot_topic_list=zeroshot_topics, zeroshot_min_similarity=-1)
    model_without.fit_transform(docs)

    # Same configuration, but with nr_topics set.
    model_with = BERTopic(zeroshot_topic_list=zeroshot_topics, zeroshot_min_similarity=-1, nr_topics=4)
    model_with.fit_transform(docs)

    # topic_sizes_ must be set on both models.
    assert model_without.topic_sizes_ is not None
    assert model_with.topic_sizes_ is not None

    # On both models the sizes must add up to the number of input documents.
    assert sum(model_without.topic_sizes_.values()) == len(docs)
    assert sum(model_with.topic_sizes_.values()) == len(docs)

    # get_topic_info() must return a non-empty table for both models.
    assert not model_without.get_topic_info().empty
    assert not model_with.get_topic_info().empty

0 commit comments

Comments
 (0)