Commit 97e4d13

Fix topic_sizes_ not updated in zero-shot topic modeling (#2384)
1 parent 144ab7b commit 97e4d13

File tree: 3 files changed (+205, -0 lines)


bertopic/_bertopic.py

Lines changed: 3 additions & 0 deletions
@@ -504,6 +504,9 @@ def fit_transform(
             # All documents matches zero-shot topics
             documents = assigned_documents
             embeddings = assigned_embeddings
+
+            # Update topic_sizes_ when all documents are assigned to zero-shot topics
+            self._update_topic_size(documents)

         # Sort and Map Topic IDs by their frequency
         if not self.nr_topics:

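The single added call is enough because `_update_topic_size` recomputes the per-topic document counts from the current assignments. Its body is not part of this diff, so the following is only a sketch of the assumed behaviour (counting the Topic column of BERTopic's internal documents DataFrame), not the library's verbatim code:

import collections

import pandas as pd


def update_topic_size_sketch(documents: pd.DataFrame) -> collections.Counter:
    # Assumed behaviour of BERTopic._update_topic_size: rebuild topic_sizes_ as a
    # {topic_id: document_count} mapping from the per-document topic assignments.
    return collections.Counter(documents["Topic"].tolist())


documents = pd.DataFrame({"Document": ["a", "b", "c"], "Topic": [0, 0, 1]})
print(update_topic_size_sketch(documents))  # Counter({0: 2, 1: 1})

Before this commit, the branch where every document is assigned to a zero-shot topic skipped that recount, leaving `topic_sizes_` out of sync with the assignments.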
docs/changelog.md

Lines changed: 6 additions & 0 deletions
@@ -5,6 +5,12 @@ hide:

 # Changelog

+## **Unreleased**
+
+<h3><b>Fixes:</a></b></h3>
+
+* Fix `topic_sizes_` not being updated in zero-shot topic modeling when using `nr_topics` parameter ([#2384](https://github.com/MaartenGr/BERTopic/issues/2384))
+
 ## **Version 0.17.3**
 *Release date: 8 July, 2025*

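For context on when the patched branch is reached: zero-shot assignment compares document embeddings against embeddings of the predefined topic names and assigns a document whenever the similarity clears `zeroshot_min_similarity`. Because cosine similarity never drops below -1, the tests below that pass `zeroshot_min_similarity=-1` push every document into a zero-shot topic, leaving nothing for clustering and exercising the all-assigned branch. A small numpy illustration of that selection rule (an illustration only, not BERTopic's implementation):

import numpy as np


def assign_zeroshot(doc_emb: np.ndarray, topic_emb: np.ndarray, min_similarity: float) -> np.ndarray:
    """Return the best zero-shot topic per document, or -1 when below the threshold."""
    d = doc_emb / np.linalg.norm(doc_emb, axis=1, keepdims=True)
    t = topic_emb / np.linalg.norm(topic_emb, axis=1, keepdims=True)
    sims = d @ t.T                        # cosine similarities, bounded by [-1, 1]
    best = sims.argmax(axis=1)
    return np.where(sims.max(axis=1) >= min_similarity, best, -1)


rng = np.random.default_rng(42)
doc_emb, topic_emb = rng.normal(size=(6, 16)), rng.normal(size=(3, 16))
print(assign_zeroshot(doc_emb, topic_emb, min_similarity=-1))   # every document assigned
print(assign_zeroshot(doc_emb, topic_emb, min_similarity=0.9))  # most documents left unassigned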
New test file

Lines changed: 196 additions & 0 deletions
@@ -0,0 +1,196 @@
"""
Tests for zero-shot topic modeling functionality.

This module tests various aspects of zero-shot topic modeling, including
edge cases with the nr_topics parameter and topic_sizes_ consistency.
"""

import numpy as np
from bertopic import BERTopic
from umap import UMAP


def test_zeroshot_with_nr_topics():
    """Test zero-shot topic modeling with nr_topics parameter."""
    docs = [
        "This is about machine learning and artificial intelligence",
        "Deep learning neural networks are powerful",
        "Python programming for data science",
        "Machine learning algorithms and models",
        "Artificial intelligence and deep learning",
        "Data science with Python programming",
        "Neural networks and machine learning",
        "Programming in Python for AI",
        "Deep learning models and algorithms",
        "Artificial intelligence programming"
    ]

    zeroshot_topics = ["Technology and Programming"]

    topic_model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=0.1,
        nr_topics=2,
        min_topic_size=2
    )

    topics, probs = topic_model.fit_transform(docs)

    # Verify topic_sizes_ is properly populated
    assert topic_model.topic_sizes_ is not None
    assert len(topic_model.topic_sizes_) > 0

    # Verify total document count matches
    total_in_sizes = sum(topic_model.topic_sizes_.values())
    assert total_in_sizes == len(docs)

    # Verify all topics are accounted for
    for topic in set(topics):
        assert topic in topic_model.topic_sizes_


def test_zeroshot_all_documents_assigned():
    """Test edge case where all documents are assigned to zero-shot topics."""
    docs = [
        "Technology is advancing rapidly",
        "Software development is important",
        "Programming languages are evolving",
        "Computer science research continues",
        "Digital transformation is happening",
        "Innovation in technology sector",
        "Software engineering best practices",
        "Modern programming techniques",
        "Computer systems and architecture",
        "Digital solutions and platforms",
        "Technology trends and developments",
        "Software design patterns",
        "Programming paradigms evolution",
        "Computing infrastructure advances",
        "Digital innovation strategies"
    ]

    zeroshot_topics = ["Technology"]
    umap_model = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric='cosine', random_state=42)

    topic_model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=0.05,
        nr_topics=2,
        min_topic_size=1,
        umap_model=umap_model
    )

    topics, probs = topic_model.fit_transform(docs)

    # Verify all documents are accounted for
    total_in_sizes = sum(topic_model.topic_sizes_.values())
    assert total_in_sizes == len(docs)
    assert topic_model.topic_sizes_ is not None


def test_zeroshot_topic_info_consistency():
    """Test consistency between topic_sizes_ and get_topic_info()."""
    docs = [
        "AI and machine learning research",
        "Deep learning neural networks",
        "Neural network architectures",
        "Machine learning algorithms",
        "Artificial intelligence systems",
        "Deep learning models training",
        "Neural network optimization",
        "Machine learning applications",
        "AI research and development",
        "Deep learning frameworks"
    ]
    zeroshot_topics = ["Artificial Intelligence"]
    umap_model = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric='cosine', random_state=42)

    topic_model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=0.1,
        nr_topics=2,
        min_topic_size=1,
        umap_model=umap_model
    )

    topics, probs = topic_model.fit_transform(docs)

    # Verify topic info consistency
    topic_info = topic_model.get_topic_info()
    assert not topic_info.empty
    assert topic_info.shape[0] > 0

    # Verify topic_sizes_ and topic_info are consistent
    topic_info_counts = dict(zip(topic_info.Topic, topic_info.Count))
    for topic_id, count in topic_model.topic_sizes_.items():
        assert topic_id in topic_info_counts
        assert topic_info_counts[topic_id] == count


def test_github_issue_2384_reproduction():
    """Test exact reproduction case from GitHub issue #2384."""
    # Exact reproduction case from GitHub issue #2384
    docs = ["I need help with my voucher", "Gift card not working", "Customer service was poor"] * 50
    zeroshot_topics = ["Voucher inquiries", "Gift card issues", "Customer service feedback"]

    model = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=-1,
        nr_topics=4,
    )

    topics, _ = model.fit_transform(docs)

    # Verify the fix
    assert model.topic_sizes_ is not None
    assert len(model.topic_sizes_) > 0

    # Verify get_topic_info() works
    topic_info = model.get_topic_info()
    assert not topic_info.empty
    assert topic_info.shape[0] > 0

    # Verify total document count matches
    total_docs_in_sizes = sum(model.topic_sizes_.values())
    assert total_docs_in_sizes == len(docs)

    # Verify topic_representations_ still works (no regression)
    assert model.topic_representations_ is not None
    assert len(model.topic_representations_) > 0


def test_zeroshot_nr_topics_consistency():
    """Test consistency between using nr_topics and not using it."""
    docs = ["I need help with my voucher", "Gift card not working", "Customer service was poor"] * 20
    zeroshot_topics = ["Voucher inquiries", "Gift card issues", "Customer service feedback"]

    # Test without nr_topics
    model_without = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=-1
    )
    topics_without, _ = model_without.fit_transform(docs)

    # Test with nr_topics
    model_with = BERTopic(
        zeroshot_topic_list=zeroshot_topics,
        zeroshot_min_similarity=-1,
        nr_topics=4
    )
    topics_with, _ = model_with.fit_transform(docs)

    # Both should have properly populated topic_sizes_
    assert model_without.topic_sizes_ is not None
    assert model_with.topic_sizes_ is not None

    # Both should have same total document count
    total_without = sum(model_without.topic_sizes_.values())
    total_with = sum(model_with.topic_sizes_.values())
    assert total_without == len(docs)
    assert total_with == len(docs)

    # Both should have working get_topic_info()
    info_without = model_without.get_topic_info()
    info_with = model_with.get_topic_info()
    assert not info_without.empty
    assert not info_with.empty

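The path of the new test module is not shown above, so a keyword selector is a convenient way to run just these tests locally. A sketch assuming a standard pytest setup and network access for the sentence-transformers model BERTopic downloads by default:

import sys

import pytest

# "-k 'zeroshot or github'" matches the new test functions by name without needing
# the module's exact path; "-x" stops on the first failure.
sys.exit(pytest.main(["-k", "zeroshot or github", "-x"]))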