Skip to content

Commit 04c52a8

Browse files
authored
Fix #1749 (#2267)
1 parent 6faf0de commit 04c52a8

File tree

1 file changed

+17
-7
lines changed

1 file changed

+17
-7
lines changed

bertopic/_bertopic.py

Lines changed: 17 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -4509,7 +4509,6 @@ def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False)
45094509
"""
45104510
topics = documents.Topic.tolist().copy()
45114511
unique_topics = sorted(list(documents.Topic.unique()))[self._outliers :]
4512-
max_topic = unique_topics[-1]
45134512

45144513
# Find similar topics
45154514
embeddings = select_topic_representation(
@@ -4529,12 +4528,23 @@ def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False)
45294528
min_cluster_size=2, metric="euclidean", cluster_selection_method="eom", n_jobs=-1
45304529
).fit_predict(norm_data[self._outliers :])
45314530

4532-
# Map similar topics
4533-
mapped_topics = {
4534-
unique_topics[index]: prediction + max_topic
4535-
for index, prediction in enumerate(predictions)
4536-
if prediction != -1
4537-
}
4531+
# Map clusters to their lowest topic_id
4532+
cluster_to_lowest = {}
4533+
for cluster, topic_id in zip(predictions, unique_topics):
4534+
if cluster != -1: # Ignore unclustered items
4535+
if cluster not in cluster_to_lowest:
4536+
cluster_to_lowest[cluster] = topic_id
4537+
else:
4538+
cluster_to_lowest[cluster] = min(cluster_to_lowest[cluster], topic_id)
4539+
4540+
# Map each topic_id to the lowest topic_id in its cluster
4541+
mapped_topics = {}
4542+
for cluster, topic_id in zip(predictions, unique_topics):
4543+
if cluster == -1:
4544+
mapped_topics[topic_id] = topic_id # No clustering, stays the same
4545+
else:
4546+
mapped_topics[topic_id] = cluster_to_lowest[cluster]
4547+
45384548
documents.Topic = documents.Topic.map(mapped_topics).fillna(documents.Topic).astype(int)
45394549
mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, documents.Topic.tolist())}
45404550

0 commit comments

Comments (0)