From 4a94ac071503062bc6ca935ef0ad1ac3a8db002c Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Tue, 27 May 2025 19:13:07 +0530
Subject: [PATCH 01/13] Reuse precomputed embeddings

---
 bertopic/_bertopic.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index 92fe0855..ea062153 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -4051,6 +4051,7 @@ def _extract_topics(
             documents,
             fine_tune_representation=fine_tune_representation,
             calculate_aspects=fine_tune_representation,
+            embeddings=embeddings,
         )
         self._create_topic_vectors(documents=documents, embeddings=embeddings, mappings=mappings)

@@ -4311,6 +4312,7 @@ def _extract_words_per_topic(
         c_tf_idf: csr_matrix = None,
         fine_tune_representation: bool = True,
         calculate_aspects: bool = True,
+        embeddings: np.ndarray = None,
     ) -> Mapping[str, List[Tuple[str, float]]]:
         """Based on tf_idf scores per topic, extract the top n words per topic.
@@ -4362,7 +4364,7 @@ def _extract_words_per_topic(
             for tuner in self.representation_model:
                 topics = tuner.extract_topics(self, documents, c_tf_idf, topics)
         elif fine_tune_representation and isinstance(self.representation_model, BaseRepresentation):
-            topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics)
+            topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics, embeddings)
         elif fine_tune_representation and isinstance(self.representation_model, dict):
             if self.representation_model.get("Main"):
                 main_model = self.representation_model["Main"]
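
A note on what this first patch sets up: `_extract_topics` already receives the document embeddings, and the change threads them through `_extract_words_per_topic` down to the representation model. The standalone mock below (not BERTopic's actual classes; all names are illustrative) sketches the forwarding pattern, where a representation model that understands the extra argument can skip re-encoding:

    import numpy as np
    from typing import Optional

    class ReusingRepresentation:
        # Mock tuner: reports whether it received precomputed embeddings.
        def extract_topics(self, topics, embeddings=None):
            print("reusing precomputed embeddings" if embeddings is not None
                  else "re-encoding documents")
            return topics

    def extract_words_per_topic(topics, representation_model,
                                embeddings: Optional[np.ndarray] = None):
        # The patch adds exactly this kind of pass-through of `embeddings`.
        return representation_model.extract_topics(topics, embeddings=embeddings)

    extract_words_per_topic({0: ["cat", "dog"]}, ReusingRepresentation(),
                            embeddings=np.zeros((10, 384)))
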
From c974679a28d99f2539ab6d7465a34b482ce8c02f Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Tue, 27 May 2025 19:23:10 +0530
Subject: [PATCH 02/13] Use precomputed

---
 bertopic/representation/_keybert.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index f91c01cc..48e7df0d 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -71,6 +71,7 @@ def extract_topics(
         documents: pd.DataFrame,
         c_tf_idf: csr_matrix,
         topics: Mapping[str, List[Tuple[str, float]]],
+        embeddings: np.ndarray = None,
     ) -> Mapping[str, List[Tuple[str, float]]]:
         """Extract topics.
@@ -79,6 +80,8 @@ def extract_topics(
             documents: All input documents
             c_tf_idf: The topic c-TF-IDF representation
             topics: The candidate topics as calculated with c-TF-IDF
+            embeddings: Pre-trained document embeddings. These can be used
+                instead of the sentence-transformer model

         Returns:
             updated_topics: Updated topic representations
@@ -88,12 +91,17 @@ def extract_topics(
             c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs
         )

+        # If document embeddings are precomputed extract the embeddings of the represenantative documents based on repr_doc_indices
+        repr_embeddings = None
+        if embeddings is not None:
+            repr_embeddings = [embeddings[index] for index in np.concatenate(repr_doc_indices)]
+
         # We extract the top n words per class
         topics = self._extract_candidate_words(topic_model, c_tf_idf, topics)

         # We calculate the similarity between word and document embeddings and create
         # topic embeddings from the representative document embeddings
-        sim_matrix, words = self._extract_embeddings(topic_model, topics, representative_docs, repr_doc_indices)
+        sim_matrix, words = self._extract_embeddings(topic_model, topics, representative_docs, repr_doc_indices,repr_embeddings)

         # Find the best matching words based on the similarity matrix for each topic
         updated_topics = self._extract_top_words(words, topics, sim_matrix)
@@ -150,6 +158,7 @@ def _extract_embeddings(
         topics: Mapping[str, List[Tuple[str, float]]],
         representative_docs: List[str],
         repr_doc_indices: List[List[int]],
+        repr_embeddings: np.ndarray = None,
     ) -> Union[np.ndarray, List[str]]:
         """Extract the representative document embeddings and create topic embeddings.
         Then extract word embeddings and calculate the cosine similarity between topic
@@ -162,13 +171,18 @@ def _extract_embeddings(
             representative_docs: A flat list of representative documents
             repr_doc_indices: The indices of representative documents
                 that belong to each topic
+            repr_embeddings: Embeddings of respective representative_docs

         Returns:
             sim: The similarity matrix between word and topic embeddings
             vocab: The complete vocabulary of input documents
         """
         # Calculate representative docs embeddings and create topic embeddings
-        repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)
+        # If there are no precomputed embeddings, only then create embeddings
+        if repr_embeddings is None:
+            logger.info("Embedding - Transforming representative documents to embeddings.")
+            repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)
+
         topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]

         # Calculate word embeddings and extract best matching with updated topic_embeddings
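
The core of the patch above is the select-then-pool step. Here is a self-contained sketch of those mechanics with synthetic data (shapes and indices are made up for illustration; this is not BERTopic code). It also makes the patch's implicit assumption visible: the values in `repr_doc_indices` are used to address rows of the precomputed `embeddings` matrix directly, so the two must share the same ordering.

    import numpy as np

    rng = np.random.default_rng(0)
    embeddings = rng.normal(size=(100, 8))     # one row per input document
    repr_doc_indices = [[0, 1, 2], [3, 4, 5]]  # two topics, three representative docs each

    # Mirrors the added code: select representative-document embeddings by index
    repr_embeddings = [embeddings[index] for index in np.concatenate(repr_doc_indices)]

    # Mirrors the existing pooling step: mean over each topic's contiguous range
    topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]
    print(len(topic_embeddings), topic_embeddings[0].shape)  # 2 topics, 8-dim vectors
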
From 1356f0f8eb2950609f0c380a5cb92b7778a27818 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Tue, 27 May 2025 19:26:56 +0530
Subject: [PATCH 03/13] added argument description for embeddings

---
 bertopic/_bertopic.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index ea062153..f29fa02e 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -4328,6 +4328,8 @@ def _extract_words_per_topic(
             fine_tune_representation: If True, the topic representation will be fine-tuned using representation models.
                 If False, the topic representation will remain as the base c-TF-IDF representation.
             calculate_aspects: Whether to calculate additional topic aspects
+            embeddings: Pre-trained document embeddings. These can be used
+                instead of the sentence-transformer model

         Returns:
             topics: The top words per topic

From 1819c0322c44054def81f2bf9cc582ad0e800ca8 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 09:28:58 +0530
Subject: [PATCH 04/13] Update _keybert.py

---
 bertopic/representation/_keybert.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index 48e7df0d..ba366b5a 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -180,7 +180,6 @@ def _extract_embeddings(
         # Calculate representative docs embeddings and create topic embeddings
         # If there are no precomputed embeddings, only then create embeddings
         if repr_embeddings is None:
-            logger.info("Embedding - Transforming representative documents to embeddings.")
             repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)

From c5a2fbba21aa4ed1c4b95d280ea543262465fa15 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 09:33:35 +0530
Subject: [PATCH 05/13] Update _keybert.py

---
 bertopic/representation/_keybert.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index ba366b5a..f093acae 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -101,8 +101,9 @@ def extract_topics(
         # We calculate the similarity between word and document embeddings and create
         # topic embeddings from the representative document embeddings
-        sim_matrix, words = self._extract_embeddings(topic_model, topics, representative_docs, repr_doc_indices,repr_embeddings)
-
+        sim_matrix, words = self._extract_embeddings(
+            topic_model, topics, representative_docs, repr_doc_indices, repr_embeddings
+        )
         # Find the best matching words based on the similarity matrix for each topic
         updated_topics = self._extract_top_words(words, topics, sim_matrix)

From 776927520fdcb9ba6755a68aa4cb9b72356608e6 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 09:39:04 +0530
Subject: [PATCH 06/13] Update _keybert.py

---
 bertopic/representation/_keybert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index f093acae..36a15ab3 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -102,8 +102,8 @@ def extract_topics(
         # We calculate the similarity between word and document embeddings and create
         # topic embeddings from the representative document embeddings
         sim_matrix, words = self._extract_embeddings(
-                topic_model, topics, representative_docs, repr_doc_indices, repr_embeddings
-            )
+            topic_model, topics, representative_docs, repr_doc_indices, repr_embeddings
+        )
         # Find the best matching words based on the similarity matrix for each topic
         updated_topics = self._extract_top_words(words, topics, sim_matrix)
From b9bff48e2e679c34180c4d320b2a50ae2fd7e2c9 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 09:47:54 +0530
Subject: [PATCH 07/13] Update _keybert.py to trim white space

---
 bertopic/representation/_keybert.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index 36a15ab3..cfdb0c6d 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -182,9 +182,10 @@ def _extract_embeddings(
         # If there are no precomputed embeddings, only then create embeddings
         if repr_embeddings is None:
             repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)
+
         topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]
-
+
         # Calculate word embeddings and extract best matching with updated topic_embeddings
         vocab = list(set([word for words in topics.values() for word in words]))
         word_embeddings = topic_model._extract_embeddings(vocab, method="document", verbose=False)

From fcf6c8de8b03dd73845b2b60f99fe8cc6430fe97 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 09:52:00 +0530
Subject: [PATCH 08/13] Update _keybert.py

---
 bertopic/representation/_keybert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index cfdb0c6d..176e66a9 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -183,8 +183,8 @@ def _extract_embeddings(
         if repr_embeddings is None:
             repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)
-
+
         topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]

         # Calculate word embeddings and extract best matching with updated topic_embeddings
         vocab = list(set([word for words in topics.values() for word in words]))

From 741eb2cc08373b2987103982f45f552eea7cc2a3 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 09:53:22 +0530
Subject: [PATCH 09/13] Update _keybert.py

---
 bertopic/representation/_keybert.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index 176e66a9..f729e2a4 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -184,7 +184,6 @@ def _extract_embeddings(
             repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)

         topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]
-

         # Calculate word embeddings and extract best matching with updated topic_embeddings
         vocab = list(set([word for words in topics.values() for word in words]))

From ac16aa5b458bf4402dc0bc4fb9e5a90745926bd0 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 12:24:05 +0530
Subject: [PATCH 10/13] Update _keybert.py for trailing spaces

---
 bertopic/representation/_keybert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index f729e2a4..14e38549 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -184,7 +184,7 @@ def _extract_embeddings(
             repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)

         topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]
-
+
         # Calculate word embeddings and extract best matching with updated topic_embeddings
         vocab = list(set([word for words in topics.values() for word in words]))
         word_embeddings = topic_model._extract_embeddings(vocab, method="document", verbose=False)
From 0531b6d506d97675311dcaf36689363cb6f40460 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 13:51:07 +0530
Subject: [PATCH 11/13] specifically pass embeddings to keybert instance

---
 bertopic/_bertopic.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index f29fa02e..f4c731b4 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -59,7 +59,7 @@
 from bertopic.representation._mmr import mmr
 from bertopic.backend._utils import select_backend
 from bertopic.vectorizers import ClassTfidfTransformer
-from bertopic.representation import BaseRepresentation
+from bertopic.representation import BaseRepresentation, KeyBERTInspired
 from bertopic.dimensionality import BaseDimensionalityReduction
 from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan
 from bertopic._utils import (
@@ -4365,8 +4365,10 @@ def _extract_words_per_topic(
         elif fine_tune_representation and isinstance(self.representation_model, list):
             for tuner in self.representation_model:
                 topics = tuner.extract_topics(self, documents, c_tf_idf, topics)
-        elif fine_tune_representation and isinstance(self.representation_model, BaseRepresentation):
+        elif fine_tune_representation and isinstance(self.representation_model, KeyBERTInspired):
             topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics, embeddings)
+        elif fine_tune_representation and isinstance(self.representation_model, BaseRepresentation):
+            topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics)
         elif fine_tune_representation and isinstance(self.representation_model, dict):
             if self.representation_model.get("Main"):
                 main_model = self.representation_model["Main"]
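
The reason for the narrower isinstance check above, illustrated with a compact standalone mock (these are not BERTopic's classes): existing third-party models subclass the base class with the original four-argument `extract_topics`, so forwarding a fifth `embeddings` argument to all of them would raise a TypeError; only the KeyBERT-style model opts in to the extended signature.

    class BaseRepresentation:
        # Original interface: no embeddings parameter.
        def extract_topics(self, topic_model, documents, c_tf_idf, topics):
            return topics

    class KeyBERTLike(BaseRepresentation):
        # Extended interface that can reuse precomputed embeddings.
        def extract_topics(self, topic_model, documents, c_tf_idf, topics, embeddings=None):
            return topics

    for model in (KeyBERTLike(), BaseRepresentation()):
        if isinstance(model, KeyBERTLike):
            # Safe to forward embeddings: the subclass accepts the extra argument.
            model.extract_topics(None, None, None, {}, embeddings=None)
        else:
            # Legacy models keep the four-argument call; a fifth would raise TypeError.
            model.extract_topics(None, None, None, {})
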
From 83451b96ec9ebd3b6472bb4cfa139195e69935c2 Mon Sep 17 00:00:00 2001
From: Saikumar
Date: Mon, 2 Jun 2025 16:37:23 +0530
Subject: [PATCH 12/13] updated for typos and description

---
 bertopic/_bertopic.py               | 6 +++---
 bertopic/representation/_keybert.py | 7 +++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index f4c731b4..dbdccbaf 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -59,7 +59,7 @@
 from bertopic.representation._mmr import mmr
 from bertopic.backend._utils import select_backend
 from bertopic.vectorizers import ClassTfidfTransformer
-from bertopic.representation import BaseRepresentation, KeyBERTInspired
+from bertopic.representation import BaseRepresentation, KeyBERTInspired, MaximalMarginalRelevance
 from bertopic.dimensionality import BaseDimensionalityReduction
 from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan
 from bertopic._utils import (
@@ -4329,7 +4329,7 @@ def _extract_words_per_topic(
                 If False, the topic representation will remain as the base c-TF-IDF representation.
             calculate_aspects: Whether to calculate additional topic aspects
             embeddings: Pre-trained document embeddings. These can be used
-                instead of the sentence-transformer model
+                instead of an embedding model

         Returns:
             topics: The top words per topic
@@ -4365,7 +4365,7 @@ def _extract_words_per_topic(
         elif fine_tune_representation and isinstance(self.representation_model, list):
             for tuner in self.representation_model:
                 topics = tuner.extract_topics(self, documents, c_tf_idf, topics)
-        elif fine_tune_representation and isinstance(self.representation_model, KeyBERTInspired):
+        elif fine_tune_representation and isinstance(self.representation_model, (KeyBERTInspired, MaximalMarginalRelevance)):
             topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics, embeddings)
         elif fine_tune_representation and isinstance(self.representation_model, BaseRepresentation):
             topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics)
diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index 14e38549..10812369 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -81,7 +81,7 @@ def extract_topics(
             c_tf_idf: The topic c-TF-IDF representation
             topics: The candidate topics as calculated with c-TF-IDF
             embeddings: Pre-trained document embeddings. These can be used
-                instead of the sentence-transformer model
+                instead of an embedding model

         Returns:
             updated_topics: Updated topic representations
@@ -91,7 +91,7 @@ def extract_topics(
             c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs
         )

-        # If document embeddings are precomputed extract the embeddings of the represenantative documents based on repr_doc_indices
+        # If document embeddings are precomputed, extract the embeddings of the representative documents based on repr_doc_indices
         repr_embeddings = None
         if embeddings is not None:
             repr_embeddings = [embeddings[index] for index in np.concatenate(repr_doc_indices)]
@@ -178,8 +178,7 @@ def _extract_embeddings(
             sim: The similarity matrix between word and topic embeddings
             vocab: The complete vocabulary of input documents
         """
-        # Calculate representative docs embeddings and create topic embeddings
-        # If there are no precomputed embeddings, only then create embeddings
+        # Calculate representative document embeddings if there are no precomputed embeddings.
         if repr_embeddings is None:
             repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)

From 525f81aae6c8dc86dd96cd634682effce855d065 Mon Sep 17 00:00:00 2001
From: Saikumar
Date: Mon, 2 Jun 2025 16:44:21 +0530
Subject: [PATCH 13/13] removed mmr changes as they aren't needed

---
 bertopic/_bertopic.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index dbdccbaf..61418526 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -59,7 +59,7 @@
 from bertopic.representation._mmr import mmr
 from bertopic.backend._utils import select_backend
 from bertopic.vectorizers import ClassTfidfTransformer
-from bertopic.representation import BaseRepresentation, KeyBERTInspired, MaximalMarginalRelevance
+from bertopic.representation import BaseRepresentation, KeyBERTInspired
 from bertopic.dimensionality import BaseDimensionalityReduction
 from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan
 from bertopic._utils import (
@@ -4365,7 +4365,7 @@ def _extract_words_per_topic(
         elif fine_tune_representation and isinstance(self.representation_model, list):
             for tuner in self.representation_model:
                 topics = tuner.extract_topics(self, documents, c_tf_idf, topics)
-        elif fine_tune_representation and isinstance(self.representation_model, (KeyBERTInspired, MaximalMarginalRelevance)):
+        elif fine_tune_representation and isinstance(self.representation_model, KeyBERTInspired):
             topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics, embeddings)
         elif fine_tune_representation and isinstance(self.representation_model, BaseRepresentation):
             topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics)
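
Taken together, the series allows the corpus to be encoded exactly once. A usage sketch under the standard BERTopic quick-start setup (dataset, model name, and corpus slice are illustrative; vocabulary words are still encoded separately, as before):

    from sklearn.datasets import fetch_20newsgroups
    from sentence_transformers import SentenceTransformer
    from bertopic import BERTopic
    from bertopic.representation import KeyBERTInspired

    docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes")).data[:1000]

    # Encode the corpus once, up front.
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedding_model.encode(docs, show_progress_bar=False)

    # With these patches, KeyBERTInspired reuses the precomputed rows for its
    # representative documents instead of encoding them a second time.
    topic_model = BERTopic(embedding_model=embedding_model,
                           representation_model=KeyBERTInspired())
    topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)
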