@@ -4509,7 +4509,6 @@ def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False)
45094509 """
45104510 topics = documents .Topic .tolist ().copy ()
45114511 unique_topics = sorted (list (documents .Topic .unique ()))[self ._outliers :]
4512- max_topic = unique_topics [- 1 ]
45134512
45144513 # Find similar topics
45154514 embeddings = select_topic_representation (
@@ -4529,12 +4528,23 @@ def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False)
45294528 min_cluster_size = 2 , metric = "euclidean" , cluster_selection_method = "eom" , n_jobs = - 1
45304529 ).fit_predict (norm_data [self ._outliers :])
45314530
4532- # Map similar topics
4533- mapped_topics = {
4534- unique_topics [index ]: prediction + max_topic
4535- for index , prediction in enumerate (predictions )
4536- if prediction != - 1
4537- }
4531+ # Map clusters to their lowest topic_id
4532+ cluster_to_lowest = {}
4533+ for cluster , topic_id in zip (predictions , unique_topics ):
4534+ if cluster != - 1 : # Ignore unclustered items
4535+ if cluster not in cluster_to_lowest :
4536+ cluster_to_lowest [cluster ] = topic_id
4537+ else :
4538+ cluster_to_lowest [cluster ] = min (cluster_to_lowest [cluster ], topic_id )
4539+
4540+ # Map each topic_id to the lowest topic_id in its cluster
4541+ mapped_topics = {}
4542+ for cluster , topic_id in zip (predictions , unique_topics ):
4543+ if cluster == - 1 :
4544+ mapped_topics [topic_id ] = topic_id # No clustering, stays the same
4545+ else :
4546+ mapped_topics [topic_id ] = cluster_to_lowest [cluster ]
4547+
45384548 documents .Topic = documents .Topic .map (mapped_topics ).fillna (documents .Topic ).astype (int )
45394549 mapped_topics = {from_topic : to_topic for from_topic , to_topic in zip (topics , documents .Topic .tolist ())}
45404550
0 commit comments