From 4a94ac071503062bc6ca935ef0ad1ac3a8db002c Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Tue, 27 May 2025 19:13:07 +0530
Subject: [PATCH 01/13] Reuse precomputed embeddings

---
 bertopic/_bertopic.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index 92fe0855..ea062153 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -4051,6 +4051,7 @@ def _extract_topics(
             documents,
             fine_tune_representation=fine_tune_representation,
             calculate_aspects=fine_tune_representation,
+            embeddings=embeddings,
         )
         self._create_topic_vectors(documents=documents, embeddings=embeddings, mappings=mappings)

@@ -4311,6 +4312,7 @@ def _extract_words_per_topic(
         c_tf_idf: csr_matrix = None,
         fine_tune_representation: bool = True,
         calculate_aspects: bool = True,
+        embeddings: np.ndarray = None,
     ) -> Mapping[str, List[Tuple[str, float]]]:
         """Based on tf_idf scores per topic, extract the top n words per topic.
@@ -4362,7 +4364,7 @@ def _extract_words_per_topic(
             for tuner in self.representation_model:
                 topics = tuner.extract_topics(self, documents, c_tf_idf, topics)
         elif fine_tune_representation and isinstance(self.representation_model, BaseRepresentation):
-            topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics)
+            topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics, embeddings)
         elif fine_tune_representation and isinstance(self.representation_model, dict):
             if self.representation_model.get("Main"):
                 main_model = self.representation_model["Main"]
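
A note on what this first patch sets up: `_extract_topics` already receives the document embeddings, and the change threads them through `_extract_words_per_topic` down to the representation model. The standalone mock below (not BERTopic's actual classes; all names are illustrative) sketches the forwarding pattern, where a representation model that understands the extra argument can skip re-encoding:

    import numpy as np
    from typing import Optional

    class ReusingRepresentation:
        # Mock tuner: reports whether it received precomputed embeddings.
        def extract_topics(self, topics, embeddings=None):
            print("reusing precomputed embeddings" if embeddings is not None
                  else "re-encoding documents")
            return topics

    def extract_words_per_topic(topics, representation_model,
                                embeddings: Optional[np.ndarray] = None):
        # The patch adds exactly this kind of pass-through of `embeddings`.
        return representation_model.extract_topics(topics, embeddings=embeddings)

    extract_words_per_topic({0: ["cat", "dog"]}, ReusingRepresentation(),
                            embeddings=np.zeros((10, 384)))
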
From c974679a28d99f2539ab6d7465a34b482ce8c02f Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Tue, 27 May 2025 19:23:10 +0530
Subject: [PATCH 02/13] Use precomputed

---
 bertopic/representation/_keybert.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index f91c01cc..48e7df0d 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -71,6 +71,7 @@ def extract_topics(
         documents: pd.DataFrame,
         c_tf_idf: csr_matrix,
         topics: Mapping[str, List[Tuple[str, float]]],
+        embeddings: np.ndarray = None,
     ) -> Mapping[str, List[Tuple[str, float]]]:
         """Extract topics.
@@ -79,6 +80,8 @@ def extract_topics(
             documents: All input documents
             c_tf_idf: The topic c-TF-IDF representation
             topics: The candidate topics as calculated with c-TF-IDF
+            embeddings: Pre-trained document embeddings. These can be used
+                instead of the sentence-transformer model

         Returns:
             updated_topics: Updated topic representations
@@ -88,12 +91,17 @@ def extract_topics(
             c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs
         )

+        # If document embeddings are precomputed extract the embeddings of the represenantative documents based on repr_doc_indices
+        repr_embeddings = None
+        if embeddings is not None:
+            repr_embeddings = [embeddings[index] for index in np.concatenate(repr_doc_indices)]
+
         # We extract the top n words per class
         topics = self._extract_candidate_words(topic_model, c_tf_idf, topics)

         # We calculate the similarity between word and document embeddings and create
         # topic embeddings from the representative document embeddings
-        sim_matrix, words = self._extract_embeddings(topic_model, topics, representative_docs, repr_doc_indices)
+        sim_matrix, words = self._extract_embeddings(topic_model, topics, representative_docs, repr_doc_indices,repr_embeddings)

         # Find the best matching words based on the similarity matrix for each topic
         updated_topics = self._extract_top_words(words, topics, sim_matrix)
@@ -150,6 +158,7 @@ def _extract_embeddings(
         topics: Mapping[str, List[Tuple[str, float]]],
         representative_docs: List[str],
         repr_doc_indices: List[List[int]],
+        repr_embeddings: np.ndarray = None,
     ) -> Union[np.ndarray, List[str]]:
         """Extract the representative document embeddings and create topic embeddings.
         Then extract word embeddings and calculate the cosine similarity between topic
@@ -162,13 +171,18 @@ def _extract_embeddings(
             representative_docs: A flat list of representative documents
             repr_doc_indices: The indices of representative documents
                 that belong to each topic
+            repr_embeddings: Embeddings of respective representative_docs

         Returns:
             sim: The similarity matrix between word and topic embeddings
             vocab: The complete vocabulary of input documents
         """
         # Calculate representative docs embeddings and create topic embeddings
-        repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)
+        # If there are no precomputed embeddings, only then create embeddings
+        if repr_embeddings is None:
+            logger.info("Embedding - Transforming representative documents to embeddings.")
+            repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)
+
         topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]

         # Calculate word embeddings and extract best matching with updated topic_embeddings
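
The core of the patch above is the select-then-pool step. Here is a self-contained sketch of those mechanics with synthetic data (shapes and indices are made up for illustration; this is not BERTopic code). It also makes the patch's implicit assumption visible: the values in `repr_doc_indices` are used to address rows of the precomputed `embeddings` matrix directly, so the two must share the same ordering.

    import numpy as np

    rng = np.random.default_rng(0)
    embeddings = rng.normal(size=(100, 8))     # one row per input document
    repr_doc_indices = [[0, 1, 2], [3, 4, 5]]  # two topics, three representative docs each

    # Mirrors the added code: select representative-document embeddings by index
    repr_embeddings = [embeddings[index] for index in np.concatenate(repr_doc_indices)]

    # Mirrors the existing pooling step: mean over each topic's contiguous range
    topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]
    print(len(topic_embeddings), topic_embeddings[0].shape)  # 2 topics, 8-dim vectors
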
From 1356f0f8eb2950609f0c380a5cb92b7778a27818 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Tue, 27 May 2025 19:26:56 +0530
Subject: [PATCH 03/13] added argument description for embeddings

---
 bertopic/_bertopic.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index ea062153..f29fa02e 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -4328,6 +4328,8 @@ def _extract_words_per_topic(
             fine_tune_representation: If True, the topic representation will be fine-tuned using representation models.
                 If False, the topic representation will remain as the base c-TF-IDF representation.
             calculate_aspects: Whether to calculate additional topic aspects
+            embeddings: Pre-trained document embeddings. These can be used
+                instead of the sentence-transformer model

         Returns:
             topics: The top words per topic

From 1819c0322c44054def81f2bf9cc582ad0e800ca8 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 09:28:58 +0530
Subject: [PATCH 04/13] Update _keybert.py

---
 bertopic/representation/_keybert.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index 48e7df0d..ba366b5a 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -180,7 +180,6 @@ def _extract_embeddings(
         # Calculate representative docs embeddings and create topic embeddings
         # If there are no precomputed embeddings, only then create embeddings
         if repr_embeddings is None:
-            logger.info("Embedding - Transforming representative documents to embeddings.")
             repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)

From c5a2fbba21aa4ed1c4b95d280ea543262465fa15 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 09:33:35 +0530
Subject: [PATCH 05/13] Update _keybert.py

---
 bertopic/representation/_keybert.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index ba366b5a..f093acae 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -101,8 +101,9 @@ def extract_topics(
         # We calculate the similarity between word and document embeddings and create
         # topic embeddings from the representative document embeddings
-        sim_matrix, words = self._extract_embeddings(topic_model, topics, representative_docs, repr_doc_indices,repr_embeddings)
-
+        sim_matrix, words = self._extract_embeddings(
+            topic_model, topics, representative_docs, repr_doc_indices, repr_embeddings
+        )
         # Find the best matching words based on the similarity matrix for each topic
         updated_topics = self._extract_top_words(words, topics, sim_matrix)

From 776927520fdcb9ba6755a68aa4cb9b72356608e6 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 09:39:04 +0530
Subject: [PATCH 06/13] Update _keybert.py

---
 bertopic/representation/_keybert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index f093acae..36a15ab3 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -102,8 +102,8 @@ def extract_topics(
         # We calculate the similarity between word and document embeddings and create
         # topic embeddings from the representative document embeddings
         sim_matrix, words = self._extract_embeddings(
-                topic_model, topics, representative_docs, repr_doc_indices, repr_embeddings
-            )
+            topic_model, topics, representative_docs, repr_doc_indices, repr_embeddings
+        )
         # Find the best matching words based on the similarity matrix for each topic
         updated_topics = self._extract_top_words(words, topics, sim_matrix)
From b9bff48e2e679c34180c4d320b2a50ae2fd7e2c9 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 09:47:54 +0530
Subject: [PATCH 07/13] Update _keybert.py to trim white space

---
 bertopic/representation/_keybert.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index 36a15ab3..cfdb0c6d 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -182,9 +182,10 @@ def _extract_embeddings(
         # If there are no precomputed embeddings, only then create embeddings
         if repr_embeddings is None:
             repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)
+
         topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]
-
+
         # Calculate word embeddings and extract best matching with updated topic_embeddings
         vocab = list(set([word for words in topics.values() for word in words]))
         word_embeddings = topic_model._extract_embeddings(vocab, method="document", verbose=False)

From fcf6c8de8b03dd73845b2b60f99fe8cc6430fe97 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 09:52:00 +0530
Subject: [PATCH 08/13] Update _keybert.py

---
 bertopic/representation/_keybert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index cfdb0c6d..176e66a9 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -183,8 +183,8 @@ def _extract_embeddings(
         if repr_embeddings is None:
             repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)
-
+
         topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]

         # Calculate word embeddings and extract best matching with updated topic_embeddings
         vocab = list(set([word for words in topics.values() for word in words]))

From 741eb2cc08373b2987103982f45f552eea7cc2a3 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 09:53:22 +0530
Subject: [PATCH 09/13] Update _keybert.py

---
 bertopic/representation/_keybert.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index 176e66a9..f729e2a4 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -184,7 +184,6 @@ def _extract_embeddings(
             repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)

         topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]
-

         # Calculate word embeddings and extract best matching with updated topic_embeddings
         vocab = list(set([word for words in topics.values() for word in words]))

From ac16aa5b458bf4402dc0bc4fb9e5a90745926bd0 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 12:24:05 +0530
Subject: [PATCH 10/13] Update _keybert.py for trailing spaces

---
 bertopic/representation/_keybert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index f729e2a4..14e38549 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -184,7 +184,7 @@ def _extract_embeddings(
             repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)

         topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]
-
+
         # Calculate word embeddings and extract best matching with updated topic_embeddings
         vocab = list(set([word for words in topics.values() for word in words]))
         word_embeddings = topic_model._extract_embeddings(vocab, method="document", verbose=False)
From 0531b6d506d97675311dcaf36689363cb6f40460 Mon Sep 17 00:00:00 2001
From: saikumaru <44021002+saikumaru@users.noreply.github.com>
Date: Wed, 28 May 2025 13:51:07 +0530
Subject: [PATCH 11/13] specifically pass embeddings to keybert instance

---
 bertopic/_bertopic.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index f29fa02e..f4c731b4 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -59,7 +59,7 @@
 from bertopic.representation._mmr import mmr
 from bertopic.backend._utils import select_backend
 from bertopic.vectorizers import ClassTfidfTransformer
-from bertopic.representation import BaseRepresentation
+from bertopic.representation import BaseRepresentation, KeyBERTInspired
 from bertopic.dimensionality import BaseDimensionalityReduction
 from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan
 from bertopic._utils import (
@@ -4365,8 +4365,10 @@ def _extract_words_per_topic(
         elif fine_tune_representation and isinstance(self.representation_model, list):
             for tuner in self.representation_model:
                 topics = tuner.extract_topics(self, documents, c_tf_idf, topics)
-        elif fine_tune_representation and isinstance(self.representation_model, BaseRepresentation):
+        elif fine_tune_representation and isinstance(self.representation_model, KeyBERTInspired):
             topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics, embeddings)
+        elif fine_tune_representation and isinstance(self.representation_model, BaseRepresentation):
+            topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics)
         elif fine_tune_representation and isinstance(self.representation_model, dict):
             if self.representation_model.get("Main"):
                 main_model = self.representation_model["Main"]
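
The reason for the narrower isinstance check above, illustrated with a compact standalone mock (these are not BERTopic's classes): existing third-party models subclass the base class with the original four-argument `extract_topics`, so forwarding a fifth `embeddings` argument to all of them would raise a TypeError; only the KeyBERT-style model opts in to the extended signature.

    class BaseRepresentation:
        # Original interface: no embeddings parameter.
        def extract_topics(self, topic_model, documents, c_tf_idf, topics):
            return topics

    class KeyBERTLike(BaseRepresentation):
        # Extended interface that can reuse precomputed embeddings.
        def extract_topics(self, topic_model, documents, c_tf_idf, topics, embeddings=None):
            return topics

    for model in (KeyBERTLike(), BaseRepresentation()):
        if isinstance(model, KeyBERTLike):
            # Safe to forward embeddings: the subclass accepts the extra argument.
            model.extract_topics(None, None, None, {}, embeddings=None)
        else:
            # Legacy models keep the four-argument call; a fifth would raise TypeError.
            model.extract_topics(None, None, None, {})
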
From 83451b96ec9ebd3b6472bb4cfa139195e69935c2 Mon Sep 17 00:00:00 2001
From: Saikumar
Date: Mon, 2 Jun 2025 16:37:23 +0530
Subject: [PATCH 12/13] updated for typos and description

---
 bertopic/_bertopic.py               | 6 +++---
 bertopic/representation/_keybert.py | 7 +++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index f4c731b4..dbdccbaf 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -59,7 +59,7 @@
 from bertopic.representation._mmr import mmr
 from bertopic.backend._utils import select_backend
 from bertopic.vectorizers import ClassTfidfTransformer
-from bertopic.representation import BaseRepresentation, KeyBERTInspired
+from bertopic.representation import BaseRepresentation, KeyBERTInspired, MaximalMarginalRelevance
 from bertopic.dimensionality import BaseDimensionalityReduction
 from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan
 from bertopic._utils import (
@@ -4329,7 +4329,7 @@ def _extract_words_per_topic(
                 If False, the topic representation will remain as the base c-TF-IDF representation.
             calculate_aspects: Whether to calculate additional topic aspects
             embeddings: Pre-trained document embeddings. These can be used
-                instead of the sentence-transformer model
+                instead of an embedding model

         Returns:
             topics: The top words per topic
@@ -4365,7 +4365,7 @@ def _extract_words_per_topic(
         elif fine_tune_representation and isinstance(self.representation_model, list):
             for tuner in self.representation_model:
                 topics = tuner.extract_topics(self, documents, c_tf_idf, topics)
-        elif fine_tune_representation and isinstance(self.representation_model, KeyBERTInspired):
+        elif fine_tune_representation and isinstance(self.representation_model, (KeyBERTInspired, MaximalMarginalRelevance)):
             topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics, embeddings)
         elif fine_tune_representation and isinstance(self.representation_model, BaseRepresentation):
             topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics)
diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py
index 14e38549..10812369 100644
--- a/bertopic/representation/_keybert.py
+++ b/bertopic/representation/_keybert.py
@@ -81,7 +81,7 @@ def extract_topics(
             c_tf_idf: The topic c-TF-IDF representation
             topics: The candidate topics as calculated with c-TF-IDF
             embeddings: Pre-trained document embeddings. These can be used
-                instead of the sentence-transformer model
+                instead of an embedding model

         Returns:
             updated_topics: Updated topic representations
@@ -91,7 +91,7 @@ def extract_topics(
             c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs
         )

-        # If document embeddings are precomputed extract the embeddings of the represenantative documents based on repr_doc_indices
+        # If document embeddings are precomputed, extract the embeddings of the representative documents based on repr_doc_indices
         repr_embeddings = None
         if embeddings is not None:
             repr_embeddings = [embeddings[index] for index in np.concatenate(repr_doc_indices)]
@@ -178,8 +178,7 @@ def _extract_embeddings(
             sim: The similarity matrix between word and topic embeddings
             vocab: The complete vocabulary of input documents
         """
-        # Calculate representative docs embeddings and create topic embeddings
-        # If there are no precomputed embeddings, only then create embeddings
+        # Calculate representative document embeddings if there are no precomputed embeddings.
         if repr_embeddings is None:
             repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)

From 525f81aae6c8dc86dd96cd634682effce855d065 Mon Sep 17 00:00:00 2001
From: Saikumar
Date: Mon, 2 Jun 2025 16:44:21 +0530
Subject: [PATCH 13/13] removed mmr changes as they aren't needed

---
 bertopic/_bertopic.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index dbdccbaf..61418526 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -59,7 +59,7 @@
 from bertopic.representation._mmr import mmr
 from bertopic.backend._utils import select_backend
 from bertopic.vectorizers import ClassTfidfTransformer
-from bertopic.representation import BaseRepresentation, KeyBERTInspired, MaximalMarginalRelevance
+from bertopic.representation import BaseRepresentation, KeyBERTInspired
 from bertopic.dimensionality import BaseDimensionalityReduction
 from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan
 from bertopic._utils import (
@@ -4365,7 +4365,7 @@ def _extract_words_per_topic(
         elif fine_tune_representation and isinstance(self.representation_model, list):
             for tuner in self.representation_model:
                 topics = tuner.extract_topics(self, documents, c_tf_idf, topics)
-        elif fine_tune_representation and isinstance(self.representation_model, (KeyBERTInspired, MaximalMarginalRelevance)):
+        elif fine_tune_representation and isinstance(self.representation_model, KeyBERTInspired):
             topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics, embeddings)
         elif fine_tune_representation and isinstance(self.representation_model, BaseRepresentation):
             topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics)
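
Taken together, the series allows the corpus to be encoded exactly once. A usage sketch under the standard BERTopic quick-start setup (dataset, model name, and corpus slice are illustrative; vocabulary words are still encoded separately, as before):

    from sklearn.datasets import fetch_20newsgroups
    from sentence_transformers import SentenceTransformer
    from bertopic import BERTopic
    from bertopic.representation import KeyBERTInspired

    docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes")).data[:1000]

    # Encode the corpus once, up front.
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedding_model.encode(docs, show_progress_bar=False)

    # With these patches, KeyBERTInspired reuses the precomputed rows for its
    # representative documents instead of encoding them a second time.
    topic_model = BERTopic(embedding_model=embedding_model,
                           representation_model=KeyBERTInspired())
    topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)
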