Fix #1749: map auto-reduced topics to the lowest topic id in their cluster (#2267)

MaartenGr · web-flow · commit 04c52a83c086 · 2025-07-04T14:42:22.000+02:00
diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
@@ -4509,7 +4509,6 @@ def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False)
         """
         topics = documents.Topic.tolist().copy()
         unique_topics = sorted(list(documents.Topic.unique()))[self._outliers :]
-        max_topic = unique_topics[-1]
 
         # Find similar topics
         embeddings = select_topic_representation(
@@ -4529,12 +4528,23 @@ def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False)
                 min_cluster_size=2, metric="euclidean", cluster_selection_method="eom", n_jobs=-1
             ).fit_predict(norm_data[self._outliers :])
 
-        # Map similar topics
-        mapped_topics = {
-            unique_topics[index]: prediction + max_topic
-            for index, prediction in enumerate(predictions)
-            if prediction != -1
-        }
+        # Map clusters to their lowest topic_id
+        cluster_to_lowest = {}
+        for cluster, topic_id in zip(predictions, unique_topics):
+            if cluster != -1:  # Ignore unclustered items
+                if cluster not in cluster_to_lowest:
+                    cluster_to_lowest[cluster] = topic_id
+                else:
+                    cluster_to_lowest[cluster] = min(cluster_to_lowest[cluster], topic_id)
+
+        # Map each topic_id to the lowest topic_id in its cluster
+        mapped_topics = {}
+        for cluster, topic_id in zip(predictions, unique_topics):
+            if cluster == -1:
+                mapped_topics[topic_id] = topic_id  # No clustering, stays the same
+            else:
+                mapped_topics[topic_id] = cluster_to_lowest[cluster]
+
         documents.Topic = documents.Topic.map(mapped_topics).fillna(documents.Topic).astype(int)
         mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, documents.Topic.tolist())}