From ecae7fe87a49fa6eb78d1b0e69692c4538164e2f Mon Sep 17 00:00:00 2001
From: Shuang Chen <shuang@sightly.com>
Date: Mon, 31 Mar 2025 23:42:03 -0400
Subject: [PATCH 1/7] initial_try_merge_to_outlier

---
 bertopic/_bertopic.py               |  79 ++++++++
 dev_test_sc.ipynb                   | 267 ++++++++++++++++++++++++++++
 tests/test_reduction/test_delete.py |  84 +++++++++
 3 files changed, 430 insertions(+)
 create mode 100644 dev_test_sc.ipynb
 create mode 100644 tests/test_reduction/test_delete.py

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index 6a75171b..99e33240 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -2168,6 +2168,85 @@ def merge_topics(
         self._save_representative_docs(documents)
         self.probabilities_ = self._map_probabilities(self.probabilities_)
 
+    def delete_topics(
+        self,
+        topics_to_delete: List[int],
+    ) -> None:
+        """Delete specified topics from the topic model.
+
+        This method allows you to remove topics from the model by mapping them to a special
+        label (-1) and updating the internal topic representation accordingly. It also
+        updates the topic sizes and any relevant attributes to reflect the changes.
+
+        Arguments:
+            topics_to_delete: A list of topic IDs to be deleted from the model.
+
+        Examples:
+        To delete topics 1 and 2 from the model:
+
+        ```python
+        topic_model.delete_topics([1, 2])
+        ```
+        """
+        check_is_fitted(self)
+
+        # First map deleted topics to -1
+        initial_mapping = {topic: -1 if topic in topics_to_delete else topic for topic in set(self.topics_)}
+        initial_mapping[-1] = -1
+
+        # Update topics to mark deletions
+        self.topics_ = [initial_mapping[topic] for topic in self.topics_]
+        self._update_topic_size(pd.DataFrame({"Topic": self.topics_}))
+
+        # Create size-based mapping for remaining topics
+        df = pd.DataFrame(self.topic_sizes_.items(), columns=["Old_Topic", "Size"]).sort_values("Size", ascending=False)
+        df = df[df.Old_Topic != -1]  # Exclude outliers
+        final_mapping = {**{-1: -1}, **dict(zip(df.Old_Topic, range(len(df))))}
+
+        # Update topics with final mapping
+        self.topics_ = [final_mapping[topic] for topic in self.topics_]
+        self.topic_mapper_.add_mappings(final_mapping, topic_model=self)
+        self._update_topic_size(pd.DataFrame({"Topic": self.topics_}))
+
+        # Update probabilities if they exist
+        if self.probabilities_ is not None:
+            self.probabilities_ = self._map_probabilities(self.probabilities_)
+
+        # Update dictionary-based attributes
+        for attr in ["topic_representations_", "topic_aspects_"]:
+            if hasattr(self, attr) and getattr(self, attr) is not None:
+                old_dict = getattr(self, attr)
+                if attr == "topic_aspects_":
+                    # Handle nested dictionary for aspects
+                    new_dict = {
+                        aspect: {
+                            final_mapping[old_topic]: content
+                            for old_topic, content in topics.items()
+                            if old_topic not in topics_to_delete
+                        }
+                        for aspect, topics in old_dict.items()
+                    }
+                else:
+                    # Handle flat dictionary
+                    new_dict = {
+                        final_mapping[old_topic]: content
+                        for old_topic, content in old_dict.items()
+                        if old_topic not in topics_to_delete
+                    }
+                setattr(self, attr, new_dict)
+
+        # Update array-based attributes using masks
+        for attr in ["topic_embeddings_", "c_tf_idf_"]:
+            if hasattr(self, attr) and getattr(self, attr) is not None:
+                matrix = getattr(self, attr)
+                mask = np.array([topic not in topics_to_delete for topic in range(matrix.shape[0])])
+                setattr(self, attr, matrix[mask])
+
+        # Update ctfidf model
+        if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None:
+            mask = np.array([topic not in topics_to_delete for topic in range(self.ctfidf_model._idf_diag.shape[0])])
+            self.ctfidf_model._idf_diag = self.ctfidf_model._idf_diag[mask]
+
     def reduce_topics(
         self,
         docs: List[str],
diff --git a/dev_test_sc.ipynb b/dev_test_sc.ipynb
new file mode 100644
index 00000000..8307bb19
--- /dev/null
+++ b/dev_test_sc.ipynb
@@ -0,0 +1,267 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%autoreload 2\n",
+    "from bertopic import BERTopic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "=== BERTopic Delete Topics Test Results ===\n",
+      "\n",
+      "Initializing and fitting BERTopic model...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-03-31 21:49:21,445 - BERTopic - Embedding - Transforming documents to embeddings.\n",
+      "Batches: 100%|██████████| 16/16 [00:12<00:00,  1.29it/s]\n",
+      "2025-03-31 21:49:34,425 - BERTopic - Embedding - Completed ✓\n",
+      "2025-03-31 21:49:34,426 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n",
+      "2025-03-31 21:49:35,290 - BERTopic - Dimensionality - Completed ✓\n",
+      "2025-03-31 21:49:35,291 - BERTopic - Cluster - Start clustering the reduced embeddings\n",
+      "2025-03-31 21:49:35,307 - BERTopic - Cluster - Completed ✓\n",
+      "2025-03-31 21:49:35,309 - BERTopic - Representation - Fine-tuning topics using representation models.\n",
+      "2025-03-31 21:49:35,493 - BERTopic - Representation - Completed ✓\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Initial State:\n",
+      "Total topics: 73\n",
+      "Topic sizes (top 5): {0: 11, 1: 11, 2: 10, 3: 10, 4: 9}\n",
+      "Matrix shapes - c_tf_idf: (74, 19877), embeddings: (74, 384)\n",
+      "\n",
+      "Deleting topics: [3, 4, 6]\n",
+      "Original sizes of topics to delete: [10, 9, 9]\n",
+      "\n",
+      "✓ Topic deletion completed\n",
+      "\n",
+      "=== Validation Results ===\n",
+      "\n",
+      "1. Topic Counts:\n",
+      "   Before: 73 topics\n",
+      "   After: 70 topics\n",
+      "   Expected: 70 topics\n",
+      "\n",
+      "2. Size-based Ordering:\n",
+      "   Topic IDs by size: [0, 1, 2, 4, 3, 5, 8, 6, 7, 13, 10, 9, 12, 11, 17, 15, 16, 14, 20, 21, 22, 18, 19, 23, 26, 28, 24, 27, 25, 29, 35, 36, 41, 37, 31, 30, 40, 38, 34, 33, 32, 42, 39, 43, 45, 50, 51, 49, 52, 53, 44, 48, 57, 47, 54, 55, 56, 46, 58, 61, 60, 59, 62, 63, 64, 65, 66, 67, 68, 69]\n",
+      "   Sizes: [11, 11, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]\n",
+      "   Correctly ordered by size: False\n",
+      "   WARNING: Topics not properly ordered by size!\n",
+      "\n",
+      "3. Matrix Shapes:\n",
+      "   c_tf_idf - Before: (74, 19877), After: (71, 19877)\n",
+      "   embeddings - Before: (74, 384), After: (71, 384)\n",
+      "\n",
+      "4. Topic Representations:\n",
+      "   Before: 74 representations\n",
+      "   After: 71 representations\n",
+      "\n",
+      "5. Outlier Topic (-1):\n",
+      "   Present in topics_: True\n",
+      "   Present in sizes: True\n",
+      "   Present in representations: True\n",
+      "\n",
+      "6. Topic Deletion and Reordering:\n",
+      "   Expected topic count: 70\n",
+      "   Actual topic count: 70\n",
+      "   Sequential topic numbering: True\n",
+      "   Sizes match: True\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.datasets import fetch_20newsgroups\n",
+    "from umap import UMAP\n",
+    "from hdbscan import HDBSCAN\n",
+    "\n",
+    "def create_topic_model():\n",
+    "    \"\"\"Create and fit a BERTopic model\"\"\"\n",
+    "    # Create sample data\n",
+    "    docs = fetch_20newsgroups(subset='all')['data'][:500]\n",
+    "    \n",
+    "    # Initialize BERTopic with specific models\n",
+    "    umap = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=True, random_state=42)\n",
+    "    hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='leaf', prediction_data=True)\n",
+    "    topic_model = BERTopic(umap_model=umap, hdbscan_model=hdbscan_model, embedding_model='all-MiniLM-L6-v2', verbose=True)\n",
+    "    \n",
+    "    # Fit the model\n",
+    "    topics, probs = topic_model.fit_transform(docs)\n",
+    "    return topic_model\n",
+    "\n",
+    "def run_deletion_tests():\n",
+    "    \"\"\"Run comprehensive tests for topic deletion and print detailed comparisons\"\"\"\n",
+    "    print(\"\\n=== BERTopic Delete Topics Test Results ===\\n\")\n",
+    "    \n",
+    "    # Setup model\n",
+    "    print(\"Initializing and fitting BERTopic model...\")\n",
+    "    topic_model = create_topic_model()\n",
+    "    \n",
+    "    # Record initial state\n",
+    "    initial_state = {\n",
+    "        'topic_sizes': topic_model.topic_sizes_.copy(),\n",
+    "        'topics_set': set(topic_model.topics_),\n",
+    "        'c_tf_idf_shape': topic_model.c_tf_idf_.shape,\n",
+    "        'embeddings_shape': topic_model.topic_embeddings_.shape,\n",
+    "        'representations_count': len(topic_model.topic_representations_),\n",
+    "    }\n",
+    "    \n",
+    "    # Get topics sorted by size (excluding -1)\n",
+    "    sorted_topics = sorted(\n",
+    "        [(topic, size) for topic, size in initial_state['topic_sizes'].items() if topic != -1],\n",
+    "        key=lambda x: x[1],\n",
+    "        reverse=True\n",
+    "    )\n",
+    "    \n",
+    "    print(\"\\nInitial State:\")\n",
+    "    print(f\"Total topics: {len(sorted_topics)}\")\n",
+    "    print(f\"Topic sizes (top 5): {dict(sorted_topics[:5])}\")\n",
+    "    print(f\"Matrix shapes - c_tf_idf: {initial_state['c_tf_idf_shape']}, embeddings: {initial_state['embeddings_shape']}\")\n",
+    "    \n",
+    "    # Select topics to delete (4th, 5th, 6th largest)\n",
+    "    topics_to_delete = [item[0] for item in sorted_topics[3:6]]\n",
+    "    print(f\"\\nDeleting topics: {topics_to_delete}\")\n",
+    "    print(f\"Original sizes of topics to delete: {[initial_state['topic_sizes'][t] for t in topics_to_delete]}\")\n",
+    "    \n",
+    "    # Perform deletion\n",
+    "    try:\n",
+    "        topic_model.delete_topics(topics_to_delete)\n",
+    "        print(\"\\n✓ Topic deletion completed\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"\\n❌ Error during topic deletion: {str(e)}\")\n",
+    "        return\n",
+    "    \n",
+    "    # Analyze results\n",
+    "    print(\"\\n=== Validation Results ===\\n\")\n",
+    "    \n",
+    "    # 1. Check topic counts\n",
+    "    new_topics = set(topic_model.topics_) - {-1}\n",
+    "    print(f\"1. Topic Counts:\")\n",
+    "    print(f\"   Before: {len(sorted_topics)} topics\")\n",
+    "    print(f\"   After: {len(new_topics)} topics\")\n",
+    "    print(f\"   Expected: {len(sorted_topics) - len(topics_to_delete)} topics\")\n",
+    "    \n",
+    "    # 2. Check size ordering\n",
+    "    new_sorted_topics = sorted(\n",
+    "        [(topic, size) for topic, size in topic_model.topic_sizes_.items() if topic != -1],\n",
+    "        key=lambda x: x[1],\n",
+    "        reverse=True\n",
+    "    )\n",
+    "    \n",
+    "    print(\"\\n2. Size-based Ordering:\")\n",
+    "    print(f\"   Topic IDs by size: {[t[0] for t in new_sorted_topics]}\")\n",
+    "    print(f\"   Sizes: {[t[1] for t in new_sorted_topics]}\")\n",
+    "    is_ordered = all(i == t[0] for i, t in enumerate(new_sorted_topics))\n",
+    "    print(f\"   Correctly ordered by size: {is_ordered}\")\n",
+    "    if not is_ordered:\n",
+    "        print(\"   WARNING: Topics not properly ordered by size!\")\n",
+    "    \n",
+    "    # 3. Check matrix shapes\n",
+    "    print(\"\\n3. Matrix Shapes:\")\n",
+    "    print(f\"   c_tf_idf - Before: {initial_state['c_tf_idf_shape']}, After: {topic_model.c_tf_idf_.shape}\")\n",
+    "    print(f\"   embeddings - Before: {initial_state['embeddings_shape']}, After: {topic_model.topic_embeddings_.shape}\")\n",
+    "    \n",
+    "    # 4. Check representations\n",
+    "    print(\"\\n4. Topic Representations:\")\n",
+    "    print(f\"   Before: {initial_state['representations_count']} representations\")\n",
+    "    print(f\"   After: {len(topic_model.topic_representations_)} representations\")\n",
+    "    \n",
+    "    # 5. Check outlier topic\n",
+    "    print(\"\\n5. Outlier Topic (-1):\")\n",
+    "    print(f\"   Present in topics_: {-1 in topic_model.topics_}\")\n",
+    "    print(f\"   Present in sizes: {-1 in topic_model.topic_sizes_}\")\n",
+    "    print(f\"   Present in representations: {-1 in topic_model.topic_representations_}\")\n",
+    "    \n",
+    "    # 6. Verify topic deletion and reordering\n",
+    "    print(\"\\n6. Topic Deletion and Reordering:\")\n",
+    "    expected_topic_count = len(sorted_topics) - len(topics_to_delete)\n",
+    "    actual_topic_count = len([t for t in topic_model.topic_sizes_.keys() if t != -1])\n",
+    "    \n",
+    "    print(f\"   Expected topic count: {expected_topic_count}\")\n",
+    "    print(f\"   Actual topic count: {actual_topic_count}\")\n",
+    "    \n",
+    "    # Check sequential numbering\n",
+    "    expected_topic_numbers = set(range(expected_topic_count))\n",
+    "    actual_topic_numbers = set(t for t in topic_model.topic_sizes_.keys() if t != -1)\n",
+    "    sequential_numbering = expected_topic_numbers == actual_topic_numbers\n",
+    "    \n",
+    "    print(f\"   Sequential topic numbering: {sequential_numbering}\")\n",
+    "    if not sequential_numbering:\n",
+    "        print(f\"   Expected topics: {sorted(expected_topic_numbers)}\")\n",
+    "        print(f\"   Actual topics: {sorted(actual_topic_numbers)}\")\n",
+    "    \n",
+    "    # Check sizes match (excluding deleted topics)\n",
+    "    expected_sizes = sorted([size for topic, size in sorted_topics if topic not in topics_to_delete], reverse=True)\n",
+    "    actual_sizes = sorted([size for topic, size in topic_model.topic_sizes_.items() if topic != -1], reverse=True)\n",
+    "    sizes_match = expected_sizes == actual_sizes\n",
+    "    \n",
+    "    print(f\"   Sizes match: {sizes_match}\")\n",
+    "    if not sizes_match:\n",
+    "        print(f\"   Expected sizes: {expected_sizes}\")\n",
+    "        print(f\"   Actual sizes: {actual_sizes}\")\n",
+    "\n",
+    "    # Update the validations dictionary\n",
+    "    validations = {\n",
+    "        \"Topic count correct\": actual_topic_count == expected_topic_count,\n",
+    "        \"Size ordering correct\": is_ordered,\n",
+    "        \"Matrix shapes consistent\": topic_model.c_tf_idf_.shape[0] == topic_model.topic_embeddings_.shape[0],\n",
+    "        \"Sequential topic numbering\": sequential_numbering,\n",
+    "        \"Topic sizes preserved\": sizes_match,\n",
+    "        \"Outlier preserved\": all([-1 in topic_model.topics_, -1 in topic_model.topic_sizes_, -1 in topic_model.topic_representations_])\n",
+    "    }\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    run_deletion_tests()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "bertopic-dev",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tests/test_reduction/test_delete.py b/tests/test_reduction/test_delete.py
new file mode 100644
index 00000000..b5ae7661
--- /dev/null
+++ b/tests/test_reduction/test_delete.py
@@ -0,0 +1,84 @@
+import copy
+import pytest
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_delete(model, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    nr_topics = len(set(topic_model.topics_))
+    length_documents = len(topic_model.topics_)
+    
+    print("\n" + "="*50)
+    print(f"Testing model: {model}")
+    print(f"Initial number of topics: {nr_topics}")
+    print(f"Initial topics: {sorted(list(set(topic_model.topics_)))}")
+    print(f"Number of documents: {length_documents}")
+    print("="*50)
+
+    # First deletion
+    topics_to_delete = [1, 2]
+    print(f"\nFirst deletion - attempting to delete topics: {topics_to_delete}")
+    topic_model.delete_topics(topics_to_delete)
+    
+    print(f"Topics after first deletion: {sorted(list(set(topic_model.topics_)))}")
+    print(f"Number of topics after first deletion: {len(set(topic_model.topics_))}")
+    
+    mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_))
+    print(f"Topic mappings after first deletion: {mappings}")
+    
+    mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_]
+    print(f"First 10 mapped labels: {mapped_labels[:10]}")
+    print(f"First 10 model topics: {topic_model.topics_[:10]}")
+
+    print("\nFirst deletion - Assertions:")
+    print(f"Expected topics: {nr_topics - 2}, Actual topics: {len(set(topic_model.topics_))}")
+    print(f"Expected documents: {length_documents}, Actual documents: {topic_model.get_topic_info().Count.sum()}")
+    
+    assert nr_topics == len(set(topic_model.topics_)) + 2
+    assert topic_model.get_topic_info().Count.sum() == length_documents
+    if model == "online_topic_model":
+        assert mapped_labels == topic_model.topics_[950:]
+    else:
+        assert mapped_labels == topic_model.topics_
+
+    # Find two existing topics for second deletion
+    remaining_topics = sorted(list(set(topic_model.topics_)))
+    remaining_topics = [t for t in remaining_topics if t != -1]  # Exclude outlier topic
+    topics_to_delete = remaining_topics[:2]  # Take first two remaining topics
+    
+    print(f"\nSecond deletion - attempting to delete topics: {topics_to_delete}")
+    print(f"All remaining topics before second deletion: {remaining_topics}")
+    
+    # Second deletion
+    topic_model.delete_topics(topics_to_delete)
+    
+    print(f"Topics after second deletion: {sorted(list(set(topic_model.topics_)))}")
+    print(f"Number of topics after second deletion: {len(set(topic_model.topics_))}")
+    
+    mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_))
+    print(f"Topic mappings after second deletion: {mappings}")
+    
+    mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_]
+    print(f"First 10 mapped labels: {mapped_labels[:10]}")
+    print(f"First 10 model topics: {topic_model.topics_[:10]}")
+
+    print("\nSecond deletion - Assertions:")
+    print(f"Expected topics: {nr_topics - 4}, Actual topics: {len(set(topic_model.topics_))}")
+    print(f"Expected documents: {length_documents}, Actual documents: {topic_model.get_topic_info().Count.sum()}")
+    
+    assert nr_topics == len(set(topic_model.topics_)) + 4
+    assert topic_model.get_topic_info().Count.sum() == length_documents
+    if model == "online_topic_model":
+        assert mapped_labels == topic_model.topics_[950:]
+    else:
+        assert mapped_labels == topic_model.topics_

From b8e01c5928497c3af3d78e8be5f8337a07b090b1 Mon Sep 17 00:00:00 2001
From: Shuang Chen <shuang@sightly.com>
Date: Tue, 1 Apr 2025 22:46:52 -0400
Subject: [PATCH 2/7] Updated delete_topics and test_delete.py to account for
 models without -1 topics

---
 bertopic/_bertopic.py               | 108 +++++++----
 dev_test_sc.ipynb                   | 267 ----------------------------
 tests/test_reduction/test_delete.py |  55 ++----
 3 files changed, 84 insertions(+), 346 deletions(-)
 delete mode 100644 dev_test_sc.ipynb

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index 99e33240..eab008ae 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -827,7 +827,7 @@ def topics_over_time(
             nr_bins: The number of bins you want to create for the timestamps. The left interval will
                      be chosen as the timestamp. An additional column will be created with the
                      entire interval.
-            datetime_format: The datetime format of the timestamps if they are strings, eg “%d/%m/%Y”.
+            datetime_format: The datetime format of the timestamps if they are strings, eg "%d/%m/%Y".
                              Set this to None if you want to have it automatically detect the format.
                              See strftime documentation for more information on choices:
                              https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.
@@ -1778,8 +1778,7 @@ def get_document_info(
         # the topic distributions
         document_info = topic_model.get_document_info(docs, df=df,
                                                       metadata={"Topic_distribution": distributions})
-        ```
-        """
+                """
         check_documents_type(docs)
         if df is not None:
             document_info = df.copy()
@@ -2172,47 +2171,78 @@ def delete_topics(
         self,
         topics_to_delete: List[int],
     ) -> None:
-        """Delete specified topics from the topic model.
-
-        This method allows you to remove topics from the model by mapping them to a special
-        label (-1) and updating the internal topic representation accordingly. It also
-        updates the topic sizes and any relevant attributes to reflect the changes.
+        check_is_fitted(self)
 
-        Arguments:
-            topics_to_delete: A list of topic IDs to be deleted from the model.
+        topics_df = pd.DataFrame(
+                {
+                    "Topic": self.topics_
+                }
+            )
 
-        Examples:
-        To delete topics 1 and 2 from the model:
-
-        ```python
-        topic_model.delete_topics([1, 2])
-        ```
-        """
-        check_is_fitted(self)
+        # Check if -1 exists in the current topics
+        had_outliers = -1 in set(self.topics_)
 
         # First map deleted topics to -1
-        initial_mapping = {topic: -1 if topic in topics_to_delete else topic for topic in set(self.topics_)}
-        initial_mapping[-1] = -1
+        mapping = {topic: -1 if topic in topics_to_delete else topic for topic in set(self.topics_)}
+        mapping[-1] = -1
+
+        # Track mappings and sizes of topics for merging topic embeddings
+        mappings = defaultdict(list)
+        for key, val in sorted(mapping.items()):
+            mappings[val].append(key)
+        mappings = {
+            topic_to: {
+                "topics_from": topics_from,
+                "topic_sizes": [self.topic_sizes_[topic] for topic in topics_from],
+            }
+            for topic_to, topics_from in mappings.items()
+        }
 
-        # Update topics to mark deletions
-        self.topics_ = [initial_mapping[topic] for topic in self.topics_]
-        self._update_topic_size(pd.DataFrame({"Topic": self.topics_}))
+        # If adding -1 for the first time, initialize its attributes
+        if not had_outliers and any(topic in topics_to_delete for topic in self.topics_):
+            # Initialize c_tf_idf for -1 topic (zeros)
+            if hasattr(self, "c_tf_idf_") and self.c_tf_idf_ is not None:
+                outlier_row = np.zeros((1, self.c_tf_idf_.shape[1]))
+                if isinstance(self.c_tf_idf_, sp.csr_matrix):
+                    outlier_row = sp.csr_matrix(outlier_row)
+                    self.c_tf_idf_ = sp.vstack([outlier_row, self.c_tf_idf_])
+                else:
+                    self.c_tf_idf_ = np.vstack([outlier_row, self.c_tf_idf_])
+
+            # Initialize topic embeddings for -1 topic (zeros)
+            if hasattr(self, "topic_embeddings_") and self.topic_embeddings_ is not None:
+                outlier_embedding = np.zeros((1, self.topic_embeddings_.shape[1]))
+                self.topic_embeddings_ = np.vstack([outlier_embedding, self.topic_embeddings_])
+
+            # Initialize topic representations for -1 topic: ('N/A', 1e-05)]
+            if hasattr(self, "topic_representations_") and self.topic_representations_ is not None:
+                self.topic_representations_[-1] = [('N/A', 1e-05)]
+
+            # Initialize ctfidf model diagonal for -1 topic (ones)
+            if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None and hasattr(self.ctfidf_model, "_idf_diag"):
+                if isinstance(self.ctfidf_model._idf_diag, sp.csr_matrix):
+                    n_features = self.ctfidf_model._idf_diag.shape[1]
+                    outlier_diag = sp.csr_matrix(([1.0], ([0], [0])), shape=(1, n_features))
+                    self.ctfidf_model._idf_diag = sp.vstack([outlier_diag, self.ctfidf_model._idf_diag])
+                else:
+                    outlier_diag = np.ones(1)
+                    self.ctfidf_model._idf_diag = np.concatenate([outlier_diag, self.ctfidf_model._idf_diag])
 
-        # Create size-based mapping for remaining topics
-        df = pd.DataFrame(self.topic_sizes_.items(), columns=["Old_Topic", "Size"]).sort_values("Size", ascending=False)
-        df = df[df.Old_Topic != -1]  # Exclude outliers
-        final_mapping = {**{-1: -1}, **dict(zip(df.Old_Topic, range(len(df))))}
+            # Initialize topic aspects for -1 topic (empty dict for each aspect)
+            if hasattr(self, "topic_aspects_") and self.topic_aspects_ is not None:
+                for aspect in self.topic_aspects_:
+                    self.topic_aspects_[aspect][-1] = {}
 
-        # Update topics with final mapping
-        self.topics_ = [final_mapping[topic] for topic in self.topics_]
-        self.topic_mapper_.add_mappings(final_mapping, topic_model=self)
-        self._update_topic_size(pd.DataFrame({"Topic": self.topics_}))
+        # Continue with the rest of the delete_topics logic
+        topics_df.Topic = topics_df.Topic.map(mapping)
+        self.topic_mapper_.add_mappings(mapping, topic_model=self)
+        topics_df = self._sort_mappings_by_frequency(topics_df)
+        self._update_topic_size(topics_df)
+        self.probabilities_ = self._map_probabilities(self.probabilities_)
 
-        # Update probabilities if they exist
-        if self.probabilities_ is not None:
-            self.probabilities_ = self._map_probabilities(self.probabilities_)
+        final_mapping = self.topic_mapper_.get_mappings()
 
-        # Update dictionary-based attributes
+        # Update dictionary-based attributes to remove deleted topics
         for attr in ["topic_representations_", "topic_aspects_"]:
             if hasattr(self, attr) and getattr(self, attr) is not None:
                 old_dict = getattr(self, attr)
@@ -2220,7 +2250,7 @@ def delete_topics(
                     # Handle nested dictionary for aspects
                     new_dict = {
                         aspect: {
-                            final_mapping[old_topic]: content
+                            (final_mapping[old_topic] if old_topic != -1 else -1): content
                             for old_topic, content in topics.items()
                             if old_topic not in topics_to_delete
                         }
@@ -2229,20 +2259,20 @@ def delete_topics(
                 else:
                     # Handle flat dictionary
                     new_dict = {
-                        final_mapping[old_topic]: content
+                        (final_mapping[old_topic] if old_topic != -1 else -1): content
                         for old_topic, content in old_dict.items()
                         if old_topic not in topics_to_delete
                     }
                 setattr(self, attr, new_dict)
 
-        # Update array-based attributes using masks
+        # Update array-based attributes using masks to remove deleted topics
         for attr in ["topic_embeddings_", "c_tf_idf_"]:
             if hasattr(self, attr) and getattr(self, attr) is not None:
                 matrix = getattr(self, attr)
                 mask = np.array([topic not in topics_to_delete for topic in range(matrix.shape[0])])
                 setattr(self, attr, matrix[mask])
 
-        # Update ctfidf model
+        # Update ctfidf model to remove deleted topics
         if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None:
             mask = np.array([topic not in topics_to_delete for topic in range(self.ctfidf_model._idf_diag.shape[0])])
             self.ctfidf_model._idf_diag = self.ctfidf_model._idf_diag[mask]
diff --git a/dev_test_sc.ipynb b/dev_test_sc.ipynb
deleted file mode 100644
index 8307bb19..00000000
--- a/dev_test_sc.ipynb
+++ /dev/null
@@ -1,267 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%autoreload 2\n",
-    "from bertopic import BERTopic"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "=== BERTopic Delete Topics Test Results ===\n",
-      "\n",
-      "Initializing and fitting BERTopic model...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2025-03-31 21:49:21,445 - BERTopic - Embedding - Transforming documents to embeddings.\n",
-      "Batches: 100%|██████████| 16/16 [00:12<00:00,  1.29it/s]\n",
-      "2025-03-31 21:49:34,425 - BERTopic - Embedding - Completed ✓\n",
-      "2025-03-31 21:49:34,426 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n",
-      "2025-03-31 21:49:35,290 - BERTopic - Dimensionality - Completed ✓\n",
-      "2025-03-31 21:49:35,291 - BERTopic - Cluster - Start clustering the reduced embeddings\n",
-      "2025-03-31 21:49:35,307 - BERTopic - Cluster - Completed ✓\n",
-      "2025-03-31 21:49:35,309 - BERTopic - Representation - Fine-tuning topics using representation models.\n",
-      "2025-03-31 21:49:35,493 - BERTopic - Representation - Completed ✓\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Initial State:\n",
-      "Total topics: 73\n",
-      "Topic sizes (top 5): {0: 11, 1: 11, 2: 10, 3: 10, 4: 9}\n",
-      "Matrix shapes - c_tf_idf: (74, 19877), embeddings: (74, 384)\n",
-      "\n",
-      "Deleting topics: [3, 4, 6]\n",
-      "Original sizes of topics to delete: [10, 9, 9]\n",
-      "\n",
-      "✓ Topic deletion completed\n",
-      "\n",
-      "=== Validation Results ===\n",
-      "\n",
-      "1. Topic Counts:\n",
-      "   Before: 73 topics\n",
-      "   After: 70 topics\n",
-      "   Expected: 70 topics\n",
-      "\n",
-      "2. Size-based Ordering:\n",
-      "   Topic IDs by size: [0, 1, 2, 4, 3, 5, 8, 6, 7, 13, 10, 9, 12, 11, 17, 15, 16, 14, 20, 21, 22, 18, 19, 23, 26, 28, 24, 27, 25, 29, 35, 36, 41, 37, 31, 30, 40, 38, 34, 33, 32, 42, 39, 43, 45, 50, 51, 49, 52, 53, 44, 48, 57, 47, 54, 55, 56, 46, 58, 61, 60, 59, 62, 63, 64, 65, 66, 67, 68, 69]\n",
-      "   Sizes: [11, 11, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]\n",
-      "   Correctly ordered by size: False\n",
-      "   WARNING: Topics not properly ordered by size!\n",
-      "\n",
-      "3. Matrix Shapes:\n",
-      "   c_tf_idf - Before: (74, 19877), After: (71, 19877)\n",
-      "   embeddings - Before: (74, 384), After: (71, 384)\n",
-      "\n",
-      "4. Topic Representations:\n",
-      "   Before: 74 representations\n",
-      "   After: 71 representations\n",
-      "\n",
-      "5. Outlier Topic (-1):\n",
-      "   Present in topics_: True\n",
-      "   Present in sizes: True\n",
-      "   Present in representations: True\n",
-      "\n",
-      "6. Topic Deletion and Reordering:\n",
-      "   Expected topic count: 70\n",
-      "   Actual topic count: 70\n",
-      "   Sequential topic numbering: True\n",
-      "   Sizes match: True\n"
-     ]
-    }
-   ],
-   "source": [
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "from sklearn.datasets import fetch_20newsgroups\n",
-    "from umap import UMAP\n",
-    "from hdbscan import HDBSCAN\n",
-    "\n",
-    "def create_topic_model():\n",
-    "    \"\"\"Create and fit a BERTopic model\"\"\"\n",
-    "    # Create sample data\n",
-    "    docs = fetch_20newsgroups(subset='all')['data'][:500]\n",
-    "    \n",
-    "    # Initialize BERTopic with specific models\n",
-    "    umap = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=True, random_state=42)\n",
-    "    hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='leaf', prediction_data=True)\n",
-    "    topic_model = BERTopic(umap_model=umap, hdbscan_model=hdbscan_model, embedding_model='all-MiniLM-L6-v2', verbose=True)\n",
-    "    \n",
-    "    # Fit the model\n",
-    "    topics, probs = topic_model.fit_transform(docs)\n",
-    "    return topic_model\n",
-    "\n",
-    "def run_deletion_tests():\n",
-    "    \"\"\"Run comprehensive tests for topic deletion and print detailed comparisons\"\"\"\n",
-    "    print(\"\\n=== BERTopic Delete Topics Test Results ===\\n\")\n",
-    "    \n",
-    "    # Setup model\n",
-    "    print(\"Initializing and fitting BERTopic model...\")\n",
-    "    topic_model = create_topic_model()\n",
-    "    \n",
-    "    # Record initial state\n",
-    "    initial_state = {\n",
-    "        'topic_sizes': topic_model.topic_sizes_.copy(),\n",
-    "        'topics_set': set(topic_model.topics_),\n",
-    "        'c_tf_idf_shape': topic_model.c_tf_idf_.shape,\n",
-    "        'embeddings_shape': topic_model.topic_embeddings_.shape,\n",
-    "        'representations_count': len(topic_model.topic_representations_),\n",
-    "    }\n",
-    "    \n",
-    "    # Get topics sorted by size (excluding -1)\n",
-    "    sorted_topics = sorted(\n",
-    "        [(topic, size) for topic, size in initial_state['topic_sizes'].items() if topic != -1],\n",
-    "        key=lambda x: x[1],\n",
-    "        reverse=True\n",
-    "    )\n",
-    "    \n",
-    "    print(\"\\nInitial State:\")\n",
-    "    print(f\"Total topics: {len(sorted_topics)}\")\n",
-    "    print(f\"Topic sizes (top 5): {dict(sorted_topics[:5])}\")\n",
-    "    print(f\"Matrix shapes - c_tf_idf: {initial_state['c_tf_idf_shape']}, embeddings: {initial_state['embeddings_shape']}\")\n",
-    "    \n",
-    "    # Select topics to delete (4th, 5th, 6th largest)\n",
-    "    topics_to_delete = [item[0] for item in sorted_topics[3:6]]\n",
-    "    print(f\"\\nDeleting topics: {topics_to_delete}\")\n",
-    "    print(f\"Original sizes of topics to delete: {[initial_state['topic_sizes'][t] for t in topics_to_delete]}\")\n",
-    "    \n",
-    "    # Perform deletion\n",
-    "    try:\n",
-    "        topic_model.delete_topics(topics_to_delete)\n",
-    "        print(\"\\n✓ Topic deletion completed\")\n",
-    "    except Exception as e:\n",
-    "        print(f\"\\n❌ Error during topic deletion: {str(e)}\")\n",
-    "        return\n",
-    "    \n",
-    "    # Analyze results\n",
-    "    print(\"\\n=== Validation Results ===\\n\")\n",
-    "    \n",
-    "    # 1. Check topic counts\n",
-    "    new_topics = set(topic_model.topics_) - {-1}\n",
-    "    print(f\"1. Topic Counts:\")\n",
-    "    print(f\"   Before: {len(sorted_topics)} topics\")\n",
-    "    print(f\"   After: {len(new_topics)} topics\")\n",
-    "    print(f\"   Expected: {len(sorted_topics) - len(topics_to_delete)} topics\")\n",
-    "    \n",
-    "    # 2. Check size ordering\n",
-    "    new_sorted_topics = sorted(\n",
-    "        [(topic, size) for topic, size in topic_model.topic_sizes_.items() if topic != -1],\n",
-    "        key=lambda x: x[1],\n",
-    "        reverse=True\n",
-    "    )\n",
-    "    \n",
-    "    print(\"\\n2. Size-based Ordering:\")\n",
-    "    print(f\"   Topic IDs by size: {[t[0] for t in new_sorted_topics]}\")\n",
-    "    print(f\"   Sizes: {[t[1] for t in new_sorted_topics]}\")\n",
-    "    is_ordered = all(i == t[0] for i, t in enumerate(new_sorted_topics))\n",
-    "    print(f\"   Correctly ordered by size: {is_ordered}\")\n",
-    "    if not is_ordered:\n",
-    "        print(\"   WARNING: Topics not properly ordered by size!\")\n",
-    "    \n",
-    "    # 3. Check matrix shapes\n",
-    "    print(\"\\n3. Matrix Shapes:\")\n",
-    "    print(f\"   c_tf_idf - Before: {initial_state['c_tf_idf_shape']}, After: {topic_model.c_tf_idf_.shape}\")\n",
-    "    print(f\"   embeddings - Before: {initial_state['embeddings_shape']}, After: {topic_model.topic_embeddings_.shape}\")\n",
-    "    \n",
-    "    # 4. Check representations\n",
-    "    print(\"\\n4. Topic Representations:\")\n",
-    "    print(f\"   Before: {initial_state['representations_count']} representations\")\n",
-    "    print(f\"   After: {len(topic_model.topic_representations_)} representations\")\n",
-    "    \n",
-    "    # 5. Check outlier topic\n",
-    "    print(\"\\n5. Outlier Topic (-1):\")\n",
-    "    print(f\"   Present in topics_: {-1 in topic_model.topics_}\")\n",
-    "    print(f\"   Present in sizes: {-1 in topic_model.topic_sizes_}\")\n",
-    "    print(f\"   Present in representations: {-1 in topic_model.topic_representations_}\")\n",
-    "    \n",
-    "    # 6. Verify topic deletion and reordering\n",
-    "    print(\"\\n6. Topic Deletion and Reordering:\")\n",
-    "    expected_topic_count = len(sorted_topics) - len(topics_to_delete)\n",
-    "    actual_topic_count = len([t for t in topic_model.topic_sizes_.keys() if t != -1])\n",
-    "    \n",
-    "    print(f\"   Expected topic count: {expected_topic_count}\")\n",
-    "    print(f\"   Actual topic count: {actual_topic_count}\")\n",
-    "    \n",
-    "    # Check sequential numbering\n",
-    "    expected_topic_numbers = set(range(expected_topic_count))\n",
-    "    actual_topic_numbers = set(t for t in topic_model.topic_sizes_.keys() if t != -1)\n",
-    "    sequential_numbering = expected_topic_numbers == actual_topic_numbers\n",
-    "    \n",
-    "    print(f\"   Sequential topic numbering: {sequential_numbering}\")\n",
-    "    if not sequential_numbering:\n",
-    "        print(f\"   Expected topics: {sorted(expected_topic_numbers)}\")\n",
-    "        print(f\"   Actual topics: {sorted(actual_topic_numbers)}\")\n",
-    "    \n",
-    "    # Check sizes match (excluding deleted topics)\n",
-    "    expected_sizes = sorted([size for topic, size in sorted_topics if topic not in topics_to_delete], reverse=True)\n",
-    "    actual_sizes = sorted([size for topic, size in topic_model.topic_sizes_.items() if topic != -1], reverse=True)\n",
-    "    sizes_match = expected_sizes == actual_sizes\n",
-    "    \n",
-    "    print(f\"   Sizes match: {sizes_match}\")\n",
-    "    if not sizes_match:\n",
-    "        print(f\"   Expected sizes: {expected_sizes}\")\n",
-    "        print(f\"   Actual sizes: {actual_sizes}\")\n",
-    "\n",
-    "    # Update the validations dictionary\n",
-    "    validations = {\n",
-    "        \"Topic count correct\": actual_topic_count == expected_topic_count,\n",
-    "        \"Size ordering correct\": is_ordered,\n",
-    "        \"Matrix shapes consistent\": topic_model.c_tf_idf_.shape[0] == topic_model.topic_embeddings_.shape[0],\n",
-    "        \"Sequential topic numbering\": sequential_numbering,\n",
-    "        \"Topic sizes preserved\": sizes_match,\n",
-    "        \"Outlier preserved\": all([-1 in topic_model.topics_, -1 in topic_model.topic_sizes_, -1 in topic_model.topic_representations_])\n",
-    "    }\n",
-    "\n",
-    "if __name__ == \"__main__\":\n",
-    "    run_deletion_tests()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "bertopic-dev",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.16"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/tests/test_reduction/test_delete.py b/tests/test_reduction/test_delete.py
index b5ae7661..8c5514e2 100644
--- a/tests/test_reduction/test_delete.py
+++ b/tests/test_reduction/test_delete.py
@@ -17,35 +17,20 @@ def test_delete(model, request):
     topic_model = copy.deepcopy(request.getfixturevalue(model))
     nr_topics = len(set(topic_model.topics_))
     length_documents = len(topic_model.topics_)
-    
-    print("\n" + "="*50)
-    print(f"Testing model: {model}")
-    print(f"Initial number of topics: {nr_topics}")
-    print(f"Initial topics: {sorted(list(set(topic_model.topics_)))}")
-    print(f"Number of documents: {length_documents}")
-    print("="*50)
 
     # First deletion
     topics_to_delete = [1, 2]
-    print(f"\nFirst deletion - attempting to delete topics: {topics_to_delete}")
     topic_model.delete_topics(topics_to_delete)
-    
-    print(f"Topics after first deletion: {sorted(list(set(topic_model.topics_)))}")
-    print(f"Number of topics after first deletion: {len(set(topic_model.topics_))}")
-    
     mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_))
-    print(f"Topic mappings after first deletion: {mappings}")
-    
     mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_]
-    print(f"First 10 mapped labels: {mapped_labels[:10]}")
-    print(f"First 10 model topics: {topic_model.topics_[:10]}")
-
-    print("\nFirst deletion - Assertions:")
-    print(f"Expected topics: {nr_topics - 2}, Actual topics: {len(set(topic_model.topics_))}")
-    print(f"Expected documents: {length_documents}, Actual documents: {topic_model.get_topic_info().Count.sum()}")
     
-    assert nr_topics == len(set(topic_model.topics_)) + 2
-    assert topic_model.get_topic_info().Count.sum() == length_documents
+    if model == "online_topic_model" or model == "kmeans_pca_topic_model":
+        assert nr_topics == len(set(topic_model.topics_)) + 1
+        assert topic_model.get_topic_info().Count.sum() == length_documents
+    else:
+        assert nr_topics == len(set(topic_model.topics_)) + 2
+        assert topic_model.get_topic_info().Count.sum() == length_documents
+
     if model == "online_topic_model":
         assert mapped_labels == topic_model.topics_[950:]
     else:
@@ -55,29 +40,19 @@ def test_delete(model, request):
     remaining_topics = sorted(list(set(topic_model.topics_)))
     remaining_topics = [t for t in remaining_topics if t != -1]  # Exclude outlier topic
     topics_to_delete = remaining_topics[:2]  # Take first two remaining topics
-    
-    print(f"\nSecond deletion - attempting to delete topics: {topics_to_delete}")
-    print(f"All remaining topics before second deletion: {remaining_topics}")
-    
+
     # Second deletion
     topic_model.delete_topics(topics_to_delete)
-    
-    print(f"Topics after second deletion: {sorted(list(set(topic_model.topics_)))}")
-    print(f"Number of topics after second deletion: {len(set(topic_model.topics_))}")
-    
     mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_))
-    print(f"Topic mappings after second deletion: {mappings}")
-    
     mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_]
-    print(f"First 10 mapped labels: {mapped_labels[:10]}")
-    print(f"First 10 model topics: {topic_model.topics_[:10]}")
 
-    print("\nSecond deletion - Assertions:")
-    print(f"Expected topics: {nr_topics - 4}, Actual topics: {len(set(topic_model.topics_))}")
-    print(f"Expected documents: {length_documents}, Actual documents: {topic_model.get_topic_info().Count.sum()}")
-    
-    assert nr_topics == len(set(topic_model.topics_)) + 4
-    assert topic_model.get_topic_info().Count.sum() == length_documents
+    if model == "online_topic_model" or model == "kmeans_pca_topic_model":
+        assert nr_topics == len(set(topic_model.topics_)) + 3
+        assert topic_model.get_topic_info().Count.sum() == length_documents
+    else:
+        assert nr_topics == len(set(topic_model.topics_)) + 4
+        assert topic_model.get_topic_info().Count.sum() == length_documents
+
     if model == "online_topic_model":
         assert mapped_labels == topic_model.topics_[950:]
     else:

From d85959e2b3f1e3190319a16fc99094cd7180223f Mon Sep 17 00:00:00 2001
From: Shuang Chen <shuang@sightly.com>
Date: Thu, 3 Apr 2025 10:23:50 -0400
Subject: [PATCH 3/7] minor refactor/format updates

---
 bertopic/_bertopic.py               | 56 ++++++++++++++---------------
 tests/test_reduction/test_delete.py |  2 +-
 2 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index eab008ae..d672155e 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -1778,7 +1778,7 @@ def get_document_info(
         # the topic distributions
         document_info = topic_model.get_document_info(docs, df=df,
                                                       metadata={"Topic_distribution": distributions})
-                """
+        """
         check_documents_type(docs)
         if df is not None:
             document_info = df.copy()
@@ -2173,31 +2173,11 @@ def delete_topics(
     ) -> None:
         check_is_fitted(self)
 
-        topics_df = pd.DataFrame(
-                {
-                    "Topic": self.topics_
-                }
-            )
+        topics_df = pd.DataFrame({"Topic": self.topics_})
 
         # Check if -1 exists in the current topics
         had_outliers = -1 in set(self.topics_)
 
-        # First map deleted topics to -1
-        mapping = {topic: -1 if topic in topics_to_delete else topic for topic in set(self.topics_)}
-        mapping[-1] = -1
-
-        # Track mappings and sizes of topics for merging topic embeddings
-        mappings = defaultdict(list)
-        for key, val in sorted(mapping.items()):
-            mappings[val].append(key)
-        mappings = {
-            topic_to: {
-                "topics_from": topics_from,
-                "topic_sizes": [self.topic_sizes_[topic] for topic in topics_from],
-            }
-            for topic_to, topics_from in mappings.items()
-        }
-
         # If adding -1 for the first time, initialize its attributes
         if not had_outliers and any(topic in topics_to_delete for topic in self.topics_):
             # Initialize c_tf_idf for -1 topic (zeros)
@@ -2205,21 +2185,23 @@ def delete_topics(
                 outlier_row = np.zeros((1, self.c_tf_idf_.shape[1]))
                 if isinstance(self.c_tf_idf_, sp.csr_matrix):
                     outlier_row = sp.csr_matrix(outlier_row)
-                    self.c_tf_idf_ = sp.vstack([outlier_row, self.c_tf_idf_])
-                else:
-                    self.c_tf_idf_ = np.vstack([outlier_row, self.c_tf_idf_])
+                self.c_tf_idf_ = sp.vstack([outlier_row, self.c_tf_idf_])
 
             # Initialize topic embeddings for -1 topic (zeros)
             if hasattr(self, "topic_embeddings_") and self.topic_embeddings_ is not None:
                 outlier_embedding = np.zeros((1, self.topic_embeddings_.shape[1]))
                 self.topic_embeddings_ = np.vstack([outlier_embedding, self.topic_embeddings_])
 
-            # Initialize topic representations for -1 topic: ('N/A', 1e-05)]
+            # Initialize topic representations for -1 topic: ("N/A - OUTLIER TOPIC", 1e-05)
             if hasattr(self, "topic_representations_") and self.topic_representations_ is not None:
-                self.topic_representations_[-1] = [('N/A', 1e-05)]
+                self.topic_representations_[-1] = [("N/A - OUTLIER TOPIC", 1e-05)]
 
             # Initialize ctfidf model diagonal for -1 topic (ones)
-            if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None and hasattr(self.ctfidf_model, "_idf_diag"):
+            if (
+                hasattr(self, "ctfidf_model")
+                and self.ctfidf_model is not None
+                and hasattr(self.ctfidf_model, "_idf_diag")
+            ):
                 if isinstance(self.ctfidf_model._idf_diag, sp.csr_matrix):
                     n_features = self.ctfidf_model._idf_diag.shape[1]
                     outlier_diag = sp.csr_matrix(([1.0], ([0], [0])), shape=(1, n_features))
@@ -2233,7 +2215,23 @@ def delete_topics(
                 for aspect in self.topic_aspects_:
                     self.topic_aspects_[aspect][-1] = {}
 
-        # Continue with the rest of the delete_topics logic
+        # First map deleted topics to -1
+        mapping = {topic: -1 if topic in topics_to_delete else topic for topic in set(self.topics_)}
+        mapping[-1] = -1
+
+        # Track mappings and sizes of topics for merging topic embeddings
+        mappings = defaultdict(list)
+        for key, val in sorted(mapping.items()):
+            mappings[val].append(key)
+        mappings = {
+            topic_to: {
+                "topics_from": topics_from,
+                "topic_sizes": [self.topic_sizes_[topic] for topic in topics_from],
+            }
+            for topic_to, topics_from in mappings.items()
+        }
+
+        # remove deleted topics and update attributes
         topics_df.Topic = topics_df.Topic.map(mapping)
         self.topic_mapper_.add_mappings(mapping, topic_model=self)
         topics_df = self._sort_mappings_by_frequency(topics_df)
diff --git a/tests/test_reduction/test_delete.py b/tests/test_reduction/test_delete.py
index 8c5514e2..188e1ffb 100644
--- a/tests/test_reduction/test_delete.py
+++ b/tests/test_reduction/test_delete.py
@@ -23,7 +23,7 @@ def test_delete(model, request):
     topic_model.delete_topics(topics_to_delete)
     mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_))
     mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_]
-    
+
     if model == "online_topic_model" or model == "kmeans_pca_topic_model":
         assert nr_topics == len(set(topic_model.topics_)) + 1
         assert topic_model.get_topic_info().Count.sum() == length_documents

From 15e9cfeefe5c5b2b4526bb7e8ed474c22bc26883 Mon Sep 17 00:00:00 2001
From: Shuang Chen <shuang@sightly.com>
Date: Fri, 18 Apr 2025 15:06:13 -0400
Subject: [PATCH 4/7] refactor delete_topics and adjust for custom_labels_

---
 bertopic/_bertopic.py | 106 +++++++++++++++++++++---------------------
 1 file changed, 54 insertions(+), 52 deletions(-)

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index d672155e..8d7712c9 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -2171,6 +2171,14 @@ def delete_topics(
         self,
         topics_to_delete: List[int],
     ) -> None:
+        """Delete topics from the topic model.
+
+        The deleted topics will be mapped to -1 (outlier topic). Core topic attributes
+        like topic embeddings and c-TF-IDF will be automatically updated.
+
+        Arguments:
+            topics_to_delete: List of topics to delete
+        """
         check_is_fitted(self)
 
         topics_df = pd.DataFrame({"Topic": self.topics_})
@@ -2180,37 +2188,25 @@ def delete_topics(
 
         # If adding -1 for the first time, initialize its attributes
         if not had_outliers and any(topic in topics_to_delete for topic in self.topics_):
-            # Initialize c_tf_idf for -1 topic (zeros)
-            if hasattr(self, "c_tf_idf_") and self.c_tf_idf_ is not None:
-                outlier_row = np.zeros((1, self.c_tf_idf_.shape[1]))
-                if isinstance(self.c_tf_idf_, sp.csr_matrix):
-                    outlier_row = sp.csr_matrix(outlier_row)
-                self.c_tf_idf_ = sp.vstack([outlier_row, self.c_tf_idf_])
+            # Initialize c-TF-IDF for -1 topic (zeros)
+            outlier_row = np.zeros((1, self.c_tf_idf_.shape[1]))
+            outlier_row = sp.csr_matrix(outlier_row)
+            self.c_tf_idf_ = sp.vstack([outlier_row, self.c_tf_idf_])
 
             # Initialize topic embeddings for -1 topic (zeros)
-            if hasattr(self, "topic_embeddings_") and self.topic_embeddings_ is not None:
-                outlier_embedding = np.zeros((1, self.topic_embeddings_.shape[1]))
-                self.topic_embeddings_ = np.vstack([outlier_embedding, self.topic_embeddings_])
+            outlier_embedding = np.zeros((1, self.topic_embeddings_.shape[1]))
+            self.topic_embeddings_ = np.vstack([outlier_embedding, self.topic_embeddings_])
 
-            # Initialize topic representations for -1 topic: ("N/A - OUTLIER TOPIC", 1e-05)
-            if hasattr(self, "topic_representations_") and self.topic_representations_ is not None:
-                self.topic_representations_[-1] = [("N/A - OUTLIER TOPIC", 1e-05)]
+            # Initialize topic representations for -1 topic: ("", 1e-05)
+            self.topic_representations_[-1] = [("", 1e-05)]
 
-            # Initialize ctfidf model diagonal for -1 topic (ones)
-            if (
-                hasattr(self, "ctfidf_model")
-                and self.ctfidf_model is not None
-                and hasattr(self.ctfidf_model, "_idf_diag")
-            ):
-                if isinstance(self.ctfidf_model._idf_diag, sp.csr_matrix):
-                    n_features = self.ctfidf_model._idf_diag.shape[1]
-                    outlier_diag = sp.csr_matrix(([1.0], ([0], [0])), shape=(1, n_features))
-                    self.ctfidf_model._idf_diag = sp.vstack([outlier_diag, self.ctfidf_model._idf_diag])
-                else:
-                    outlier_diag = np.ones(1)
-                    self.ctfidf_model._idf_diag = np.concatenate([outlier_diag, self.ctfidf_model._idf_diag])
+            # Initialize ctfidf model diagonal for -1 topic (ones) if it exists
+            if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None:
+                n_features = self.ctfidf_model._idf_diag.shape[1]
+                outlier_diag = sp.csr_matrix(([1.0], ([0], [0])), shape=(1, n_features))
+                self.ctfidf_model._idf_diag = sp.vstack([outlier_diag, self.ctfidf_model._idf_diag])
 
-            # Initialize topic aspects for -1 topic (empty dict for each aspect)
+            # Initialize topic aspects for -1 topic (empty dict for each aspect) if they exist
             if hasattr(self, "topic_aspects_") and self.topic_aspects_ is not None:
                 for aspect in self.topic_aspects_:
                     self.topic_aspects_[aspect][-1] = {}
@@ -2241,36 +2237,42 @@ def delete_topics(
         final_mapping = self.topic_mapper_.get_mappings()
 
         # Update dictionary-based attributes to remove deleted topics
-        for attr in ["topic_representations_", "topic_aspects_"]:
-            if hasattr(self, attr) and getattr(self, attr) is not None:
-                old_dict = getattr(self, attr)
-                if attr == "topic_aspects_":
-                    # Handle nested dictionary for aspects
-                    new_dict = {
-                        aspect: {
-                            (final_mapping[old_topic] if old_topic != -1 else -1): content
-                            for old_topic, content in topics.items()
-                            if old_topic not in topics_to_delete
-                        }
-                        for aspect, topics in old_dict.items()
-                    }
-                else:
-                    # Handle flat dictionary
-                    new_dict = {
-                        (final_mapping[old_topic] if old_topic != -1 else -1): content
-                        for old_topic, content in old_dict.items()
-                        if old_topic not in topics_to_delete
-                    }
-                setattr(self, attr, new_dict)
+        # Handle topic_aspects_ if it exists
+        if hasattr(self, "topic_aspects_") and self.topic_aspects_ is not None:
+            new_aspects = {
+                aspect: {
+                    (final_mapping[old_topic] if old_topic != -1 else -1): content
+                    for old_topic, content in topics.items()
+                    if old_topic not in topics_to_delete
+                }
+                for aspect, topics in self.topic_aspects_.items()
+            }
+            self.topic_aspects_ = new_aspects
+
+        # Update custom labels if they exist
+        if hasattr(self, "custom_labels_") and self.custom_labels_ is not None:
+            new_labels = {
+                (final_mapping[old_topic] if old_topic != -1 else -1): label
+                for old_topic, label in self.custom_labels_.items()
+                if old_topic not in topics_to_delete
+            }
+            self.custom_labels_ = new_labels
+
+        # Update topic representations
+        new_representations = {
+            (final_mapping[old_topic] if old_topic != -1 else -1): content
+            for old_topic, content in self.topic_representations_.items()
+            if old_topic not in topics_to_delete
+        }
+        self.topic_representations_ = new_representations
 
         # Update array-based attributes using masks to remove deleted topics
         for attr in ["topic_embeddings_", "c_tf_idf_"]:
-            if hasattr(self, attr) and getattr(self, attr) is not None:
-                matrix = getattr(self, attr)
-                mask = np.array([topic not in topics_to_delete for topic in range(matrix.shape[0])])
-                setattr(self, attr, matrix[mask])
+            matrix = getattr(self, attr)
+            mask = np.array([topic not in topics_to_delete for topic in range(matrix.shape[0])])
+            setattr(self, attr, matrix[mask])
 
-        # Update ctfidf model to remove deleted topics
+        # Update ctfidf model to remove deleted topics if it exists
         if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None:
             mask = np.array([topic not in topics_to_delete for topic in range(self.ctfidf_model._idf_diag.shape[0])])
             self.ctfidf_model._idf_diag = self.ctfidf_model._idf_diag[mask]

From 31fd95ee3ff7529fab2e84f95488adc7bbe02183 Mon Sep 17 00:00:00 2001
From: Shuang Chen <shuang@sightly.com>
Date: Fri, 25 Apr 2025 20:42:42 -0400
Subject: [PATCH 5/7] debug and update delete_topics

---
 bertopic/_bertopic.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index 8d7712c9..ec04c23d 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -2200,6 +2200,18 @@ def delete_topics(
             # Initialize topic representations for -1 topic: ("", 1e-05)
             self.topic_representations_[-1] = [("", 1e-05)]
 
+            # Initialize representative docs for -1 topic (empty list)
+            self.representative_docs_[-1] = []
+
+            # Initialize representative images for -1 topic if images are being used
+            if self.representative_images_ is not None:
+                outlier_image = np.zeros((1, self.representative_images_.shape[1]))
+                self.representative_images_ = np.vstack([outlier_image, self.representative_images_])
+
+            # Initialize custom labels for -1 topic if they exist
+            if hasattr(self, "custom_labels_") and self.custom_labels_ is not None:
+                self.custom_labels_[-1] = ""
+
             # Initialize ctfidf model diagonal for -1 topic (ones) if it exists
             if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None:
                 n_features = self.ctfidf_model._idf_diag.shape[1]
@@ -2234,7 +2246,7 @@ def delete_topics(
         self._update_topic_size(topics_df)
         self.probabilities_ = self._map_probabilities(self.probabilities_)
 
-        final_mapping = self.topic_mapper_.get_mappings()
+        final_mapping = self.topic_mapper_.get_mappings(original_topics=False)
 
         # Update dictionary-based attributes to remove deleted topics
         # Handle topic_aspects_ if it exists
@@ -2266,6 +2278,20 @@ def delete_topics(
         }
         self.topic_representations_ = new_representations
 
+        # Update representative docs if they exist
+        new_representative_docs = {
+            (final_mapping[old_topic] if old_topic != -1 else -1): docs
+            for old_topic, docs in self.representative_docs_.items()
+            if old_topic not in topics_to_delete
+        }
+        self.representative_docs_ = new_representative_docs
+
+        # Update representative images if they exist
+        if self.representative_images_ is not None:
+            # Create a mask for non-deleted topics
+            mask = np.array([topic not in topics_to_delete for topic in range(len(self.representative_images_))])
+            self.representative_images_ = self.representative_images_[mask] if mask.any() else None
+
         # Update array-based attributes using masks to remove deleted topics
         for attr in ["topic_embeddings_", "c_tf_idf_"]:
             matrix = getattr(self, attr)

From 1bc6593ded1acb3fd6b046a08aedfcff6ca912f1 Mon Sep 17 00:00:00 2001
From: Shuang Chen <shuang@sightly.com>
Date: Tue, 27 May 2025 17:05:10 -0400
Subject: [PATCH 6/7] draft delete adjustment for zero shot

---
 .gitignore                  |    1 +
 bertopic/_bertopic.py       |   13 +-
 delete_topics_test_sc.ipynb | 1277 +++++++++++++++++++++++++++++++++++
 3 files changed, 1288 insertions(+), 3 deletions(-)
 create mode 100644 delete_topics_test_sc.ipynb

diff --git a/.gitignore b/.gitignore
index 77c026df..e7058c9e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,6 +59,7 @@ docs/_build/
 
 # Jupyter Notebook
 .ipynb_checkpoints
+notebooks/
 
 # IPython
 profile_default/
diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index ec04c23d..499fd584 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -18,6 +18,7 @@
 import numpy as np
 import pandas as pd
 import scipy.sparse as sp
+from copy import deepcopy
 
 from tqdm import tqdm
 from pathlib import Path
@@ -2241,7 +2242,7 @@ def delete_topics(
 
         # remove deleted topics and update attributes
         topics_df.Topic = topics_df.Topic.map(mapping)
-        self.topic_mapper_.add_mappings(mapping, topic_model=self)
+        self.topic_mapper_.add_mappings(mapping, topic_model=deepcopy(self))
         topics_df = self._sort_mappings_by_frequency(topics_df)
         self._update_topic_size(topics_df)
         self.probabilities_ = self._map_probabilities(self.probabilities_)
@@ -4921,6 +4922,7 @@ def add_mappings(self, mappings: Mapping[int, int], topic_model: BERTopic):
             for key, value in topics_to_map.items():
                 mapping[value].append(key)
 
+            print(f'len of mapping: {len(mapping)}')
             for topic_to, topics_from in mapping.items():
                 # which of the original topics are zero-shot
                 zeroshot_topic_ids = [
@@ -4935,20 +4937,25 @@ def add_mappings(self, mappings: Mapping[int, int], topic_model: BERTopic):
                     topic_model.zeroshot_topic_list[topic_model._topic_id_to_zeroshot_topic_idx[topic_id]]
                     for topic_id in zeroshot_topic_ids
                 ]
+                print(f'topics_from: {topics_from} and topic_to: {topic_to}')
+                print(f'zeroshot_labels: {zeroshot_labels}')
                 zeroshot_embeddings = topic_model._extract_embeddings(zeroshot_labels)
                 cosine_similarities = cosine_similarity(
                     zeroshot_embeddings, [topic_model.topic_embeddings_[topic_to]]
                 ).flatten()
+                print(f'cosine_similarities: {cosine_similarities}')
                 best_zeroshot_topic_idx = np.argmax(cosine_similarities)
                 best_cosine_similarity = cosine_similarities[best_zeroshot_topic_idx]
-
+                print(f'best_cosine_similarity: {best_cosine_similarity}')
                 if best_cosine_similarity >= topic_model.zeroshot_min_similarity:
                     # Using the topic ID from before mapping, get the idx into the zeroshot topic list
                     new_topic_id_to_zeroshot_topic_idx[topic_to] = topic_model._topic_id_to_zeroshot_topic_idx[
                         zeroshot_topic_ids[best_zeroshot_topic_idx]
                     ]
-
+            print(f'new_topic_id_to_zeroshot_topic_idx: {new_topic_id_to_zeroshot_topic_idx}')
+            # print('running without updating topic_model._topic_id_to_zeroshot_topic_idx!')
             topic_model._topic_id_to_zeroshot_topic_idx = new_topic_id_to_zeroshot_topic_idx
+            print(f'after add_mappings: topic_model._topic_id_to_zeroshot_topic_idx: {topic_model._topic_id_to_zeroshot_topic_idx}')
 
     def add_new_topics(self, mappings: Mapping[int, int]):
         """Add new row(s) of topic mappings.
diff --git a/delete_topics_test_sc.ipynb b/delete_topics_test_sc.ipynb
new file mode 100644
index 00000000..dae2157a
--- /dev/null
+++ b/delete_topics_test_sc.ipynb
@@ -0,0 +1,1277 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !python -m pip install -e \".[dev]\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\sc305\\miniforge3\\envs\\bertopic-sc\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n",
+    "from bertopic import BERTopic"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Test Case from MaartenGr No. 2 - Zero Shot"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Before delete:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-05-27 16:51:28,462 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from datasets import load_dataset\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "from hdbscan import HDBSCAN\n",
+    "from umap import UMAP\n",
+    "\n",
+    "from bertopic import BERTopic\n",
+    "from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance\n",
+    "\n",
+    "docs = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"][\"abstract\"][:20_000]\n",
+    "\n",
+    "# Pre-calculate embeddings\n",
+    "embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n",
+    "# embeddings = embedding_model.encode(docs, show_progress_bar=True)\n",
+    "\n",
+    "# # # Save the embeddings to a file\n",
+    "# np.save(\"test_ArXiv_embeddings_zeroshot_example.npy\", embeddings)\n",
+    "loaded_embeddings = np.load(\"test_ArXiv_embeddings_zeroshot_example.npy\")\n",
+    "\n",
+    "# Use sub-models\n",
+    "umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0, random_state=42)\n",
+    "hdbscan_model = HDBSCAN(min_samples=5, gen_min_span_tree=True, prediction_data=True)\n",
+    "\n",
+    "# Representation models\n",
+    "keybert_model = KeyBERTInspired()\n",
+    "mmr_model = MaximalMarginalRelevance(diversity=0.3)\n",
+    "representation_model = {\n",
+    "  \"KeyBERT\": keybert_model,\n",
+    "  \"MMR\": mmr_model,\n",
+    "}\n",
+    "\n",
+    "# BERTopic\n",
+    "topic_model = BERTopic(\n",
+    "    embedding_model=embedding_model,\n",
+    "    umap_model=umap_model,\n",
+    "    hdbscan_model=hdbscan_model,\n",
+    "    zeroshot_topic_list=[\"topic modeling\", \"large language models\"],\n",
+    "    verbose=True,\n",
+    ").fit(docs, loaded_embeddings)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Topic</th>\n",
+       "      <th>Count</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Representation</th>\n",
+       "      <th>Representative_Docs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>-1</td>\n",
+       "      <td>7624</td>\n",
+       "      <td>-1_the_of_to_and</td>\n",
+       "      <td>[the, of, to, and, in, we, that, is, for, lear...</td>\n",
+       "      <td>[  A crucial task in system identification pro...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
+       "      <td>topic modeling</td>\n",
+       "      <td>[topic, papers, svd, topics, allocation, conta...</td>\n",
+       "      <td>[  Topic models have emerged as fundamental to...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>24</td>\n",
+       "      <td>1_modulation_radio_channel_transmitters</td>\n",
+       "      <td>[modulation, radio, channel, transmitters, sig...</td>\n",
+       "      <td>[  We survey the latest advances in machine le...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2</td>\n",
+       "      <td>7</td>\n",
+       "      <td>2_pain_discomfort_diagnostic_facial</td>\n",
+       "      <td>[pain, discomfort, diagnostic, facial, intensi...</td>\n",
+       "      <td>[  Pain is a complex and subjective experience...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>3</td>\n",
+       "      <td>6</td>\n",
+       "      <td>3_quantum_entanglement_wave_annealers</td>\n",
+       "      <td>[quantum, entanglement, wave, annealers, conva...</td>\n",
+       "      <td>[  Modern deep learning has enabled unpreceden...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Topic  Count                                     Name  \\\n",
+       "0     -1   7624                         -1_the_of_to_and   \n",
+       "1      0      4                           topic modeling   \n",
+       "2      1     24  1_modulation_radio_channel_transmitters   \n",
+       "3      2      7      2_pain_discomfort_diagnostic_facial   \n",
+       "4      3      6    3_quantum_entanglement_wave_annealers   \n",
+       "\n",
+       "                                      Representation  \\\n",
+       "0  [the, of, to, and, in, we, that, is, for, lear...   \n",
+       "1  [topic, papers, svd, topics, allocation, conta...   \n",
+       "2  [modulation, radio, channel, transmitters, sig...   \n",
+       "3  [pain, discomfort, diagnostic, facial, intensi...   \n",
+       "4  [quantum, entanglement, wave, annealers, conva...   \n",
+       "\n",
+       "                                 Representative_Docs  \n",
+       "0  [  A crucial task in system identification pro...  \n",
+       "1  [  Topic models have emerged as fundamental to...  \n",
+       "2  [  We survey the latest advances in machine le...  \n",
+       "3  [  Pain is a complex and subjective experience...  \n",
+       "4  [  Modern deep learning has enabled unpreceden...  "
+      ]
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "topic_model.get_topic_info()[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Topic</th>\n",
+       "      <th>Count</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Representation</th>\n",
+       "      <th>Representative_Docs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "Empty DataFrame\n",
+       "Columns: [Topic, Count, Name, Representation, Representative_Docs]\n",
+       "Index: []"
+      ]
+     },
+     "execution_count": 72,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "original_topic_info = topic_model.get_topic_info()\n",
+    "original_topic_info[original_topic_info['Name']=='large language models']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Topic</th>\n",
+       "      <th>Count</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Representation</th>\n",
+       "      <th>Representative_Docs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>-1</td>\n",
+       "      <td>7624</td>\n",
+       "      <td>-1_the_of_to_and</td>\n",
+       "      <td>[the, of, to, and, in, we, that, is, for, lear...</td>\n",
+       "      <td>[  A crucial task in system identification pro...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>264</th>\n",
+       "      <td>263</td>\n",
+       "      <td>510</td>\n",
+       "      <td>263_generative_gan_gans_generator</td>\n",
+       "      <td>[generative, gan, gans, generator, adversarial...</td>\n",
+       "      <td>[  Generative Adversarial Networks (GANs) are ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>324</th>\n",
+       "      <td>323</td>\n",
+       "      <td>403</td>\n",
+       "      <td>323_quantization_hardware_pruning_gpu</td>\n",
+       "      <td>[quantization, hardware, pruning, gpu, precisi...</td>\n",
+       "      <td>[  Deep neural networks (DNNs) are used by dif...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>99</th>\n",
+       "      <td>98</td>\n",
+       "      <td>269</td>\n",
+       "      <td>98_recommendation_user_items_item</td>\n",
+       "      <td>[recommendation, user, items, item, recommende...</td>\n",
+       "      <td>[  Matrix factorization techniques have been w...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>28</td>\n",
+       "      <td>196</td>\n",
+       "      <td>28_privacy_private_differential_differentially</td>\n",
+       "      <td>[privacy, private, differential, differentiall...</td>\n",
+       "      <td>[  The process of data mining with differentia...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     Topic  Count                                            Name  \\\n",
+       "0       -1   7624                                -1_the_of_to_and   \n",
+       "264    263    510               263_generative_gan_gans_generator   \n",
+       "324    323    403           323_quantization_hardware_pruning_gpu   \n",
+       "99      98    269               98_recommendation_user_items_item   \n",
+       "29      28    196  28_privacy_private_differential_differentially   \n",
+       "\n",
+       "                                        Representation  \\\n",
+       "0    [the, of, to, and, in, we, that, is, for, lear...   \n",
+       "264  [generative, gan, gans, generator, adversarial...   \n",
+       "324  [quantization, hardware, pruning, gpu, precisi...   \n",
+       "99   [recommendation, user, items, item, recommende...   \n",
+       "29   [privacy, private, differential, differentiall...   \n",
+       "\n",
+       "                                   Representative_Docs  \n",
+       "0    [  A crucial task in system identification pro...  \n",
+       "264  [  Generative Adversarial Networks (GANs) are ...  \n",
+       "324  [  Deep neural networks (DNNs) are used by dif...  \n",
+       "99   [  Matrix factorization techniques have been w...  \n",
+       "29   [  The process of data mining with differentia...  "
+      ]
+     },
+     "execution_count": 73,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "original_topic_info.sort_values('Count', ascending=False)[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Topic</th>\n",
+       "      <th>Count</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Representation</th>\n",
+       "      <th>Representative_Docs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
+       "      <td>topic modeling</td>\n",
+       "      <td>[topic, papers, svd, topics, allocation, conta...</td>\n",
+       "      <td>[  Topic models have emerged as fundamental to...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Topic  Count            Name  \\\n",
+       "1      0      4  topic modeling   \n",
+       "\n",
+       "                                      Representation  \\\n",
+       "1  [topic, papers, svd, topics, allocation, conta...   \n",
+       "\n",
+       "                                 Representative_Docs  \n",
+       "1  [  Topic models have emerged as fundamental to...  "
+      ]
+     },
+     "execution_count": 74,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "original_topic_info[original_topic_info['Name']=='topic modeling']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Topic</th>\n",
+       "      <th>Count</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Representation</th>\n",
+       "      <th>Representative_Docs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>24</td>\n",
+       "      <td>1_modulation_radio_channel_transmitters</td>\n",
+       "      <td>[modulation, radio, channel, transmitters, sig...</td>\n",
+       "      <td>[  We survey the latest advances in machine le...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Topic  Count                                     Name  \\\n",
+       "2      1     24  1_modulation_radio_channel_transmitters   \n",
+       "\n",
+       "                                      Representation  \\\n",
+       "2  [modulation, radio, channel, transmitters, sig...   \n",
+       "\n",
+       "                                 Representative_Docs  \n",
+       "2  [  We survey the latest advances in machine le...  "
+      ]
+     },
+     "execution_count": 75,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "original_topic_info[original_topic_info['Name'].str.contains('modulation_radio_channel_transmitters')]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run delete:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Topic</th>\n",
+       "      <th>Count</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Representation</th>\n",
+       "      <th>Representative_Docs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>-1</td>\n",
+       "      <td>7624</td>\n",
+       "      <td>-1_the_of_to_and</td>\n",
+       "      <td>[the, of, to, and, in, we, that, is, for, lear...</td>\n",
+       "      <td>[  A crucial task in system identification pro...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
+       "      <td>topic modeling</td>\n",
+       "      <td>[topic, papers, svd, topics, allocation, conta...</td>\n",
+       "      <td>[  Topic models have emerged as fundamental to...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>24</td>\n",
+       "      <td>1_modulation_radio_channel_transmitters</td>\n",
+       "      <td>[modulation, radio, channel, transmitters, sig...</td>\n",
+       "      <td>[  We survey the latest advances in machine le...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2</td>\n",
+       "      <td>7</td>\n",
+       "      <td>2_pain_discomfort_diagnostic_facial</td>\n",
+       "      <td>[pain, discomfort, diagnostic, facial, intensi...</td>\n",
+       "      <td>[  Pain is a complex and subjective experience...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>3</td>\n",
+       "      <td>6</td>\n",
+       "      <td>3_quantum_entanglement_wave_annealers</td>\n",
+       "      <td>[quantum, entanglement, wave, annealers, conva...</td>\n",
+       "      <td>[  Modern deep learning has enabled unpreceden...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Topic  Count                                     Name  \\\n",
+       "0     -1   7624                         -1_the_of_to_and   \n",
+       "1      0      4                           topic modeling   \n",
+       "2      1     24  1_modulation_radio_channel_transmitters   \n",
+       "3      2      7      2_pain_discomfort_diagnostic_facial   \n",
+       "4      3      6    3_quantum_entanglement_wave_annealers   \n",
+       "\n",
+       "                                      Representation  \\\n",
+       "0  [the, of, to, and, in, we, that, is, for, lear...   \n",
+       "1  [topic, papers, svd, topics, allocation, conta...   \n",
+       "2  [modulation, radio, channel, transmitters, sig...   \n",
+       "3  [pain, discomfort, diagnostic, facial, intensi...   \n",
+       "4  [quantum, entanglement, wave, annealers, conva...   \n",
+       "\n",
+       "                                 Representative_Docs  \n",
+       "0  [  A crucial task in system identification pro...  \n",
+       "1  [  Topic models have emerged as fundamental to...  \n",
+       "2  [  We survey the latest advances in machine le...  \n",
+       "3  [  Pain is a complex and subjective experience...  \n",
+       "4  [  Modern deep learning has enabled unpreceden...  "
+      ]
+     },
+     "execution_count": 77,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "topic_model.get_topic_info()[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# topic_model.merge_topics(docs, [1, 2])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# topic_model.delete_topics([0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "len of mapping: 375\n",
+      "topics_from: [np.int64(0)] and topic_to: 0\n",
+      "zeroshot_labels: ['topic modeling']\n",
+      "cosine_similarities: [0.31793424]\n",
+      "best_cosine_similarity: 0.3179342448711395\n",
+      "new_topic_id_to_zeroshot_topic_idx: {}\n",
+      "after add_mappings: topic_model._topic_id_to_zeroshot_topic_idx: {}\n"
+     ]
+    }
+   ],
+   "source": [
+    "topic_model.delete_topics([1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{0: 0}"
+      ]
+     },
+     "execution_count": 88,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# updated to self.topic_mapper_.add_mappings(mapping, topic_model=deepcopy(self)) to avoid unnecessary updates in add_mappings\n",
+    "topic_model._topic_id_to_zeroshot_topic_idx"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Topic</th>\n",
+       "      <th>Count</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Representation</th>\n",
+       "      <th>Representative_Docs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>-1</td>\n",
+       "      <td>7648</td>\n",
+       "      <td>-1_the_of_to_and</td>\n",
+       "      <td>[the, of, to, and, in, we, that, is, for, lear...</td>\n",
+       "      <td>[  A crucial task in system identification pro...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
+       "      <td>topic modeling</td>\n",
+       "      <td>[topic, papers, svd, topics, allocation, conta...</td>\n",
+       "      <td>[  Topic models have emerged as fundamental to...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>7</td>\n",
+       "      <td>2_pain_discomfort_diagnostic_facial</td>\n",
+       "      <td>[pain, discomfort, diagnostic, facial, intensi...</td>\n",
+       "      <td>[  Pain is a complex and subjective experience...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>6</td>\n",
+       "      <td>3_quantum_entanglement_wave_annealers</td>\n",
+       "      <td>[quantum, entanglement, wave, annealers, conva...</td>\n",
+       "      <td>[  Modern deep learning has enabled unpreceden...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>74</td>\n",
+       "      <td>4_quantum_classical_qubits_states</td>\n",
+       "      <td>[quantum, classical, qubits, states, circuit, ...</td>\n",
+       "      <td>[  Quantum machine learning witnesses an incre...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>369</th>\n",
+       "      <td>369</td>\n",
+       "      <td>8</td>\n",
+       "      <td>369_chaos_initialization_jacobian_isometry</td>\n",
+       "      <td>[chaos, initialization, jacobian, isometry, de...</td>\n",
+       "      <td>[  It is well known that the initialization of...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>370</th>\n",
+       "      <td>370</td>\n",
+       "      <td>37</td>\n",
+       "      <td>370_relu_depth_activation_mathbb</td>\n",
+       "      <td>[relu, depth, activation, mathbb, functions, w...</td>\n",
+       "      <td>[  We study the necessary and sufficient compl...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>371</th>\n",
+       "      <td>371</td>\n",
+       "      <td>16</td>\n",
+       "      <td>371_generalization_nonvacuous_explain_sensitivity</td>\n",
+       "      <td>[generalization, nonvacuous, explain, sensitiv...</td>\n",
+       "      <td>[  Neural networks exhibit good generalization...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>372</th>\n",
+       "      <td>372</td>\n",
+       "      <td>71</td>\n",
+       "      <td>372_minima_mathbf_loss_relu</td>\n",
+       "      <td>[minima, mathbf, loss, relu, activation, layer...</td>\n",
+       "      <td>[  Deep learning models are often successfully...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>373</th>\n",
+       "      <td>373</td>\n",
+       "      <td>5</td>\n",
+       "      <td>373_sgd_minima_saddles_band</td>\n",
+       "      <td>[sgd, minima, saddles, band, descent, degenera...</td>\n",
+       "      <td>[  Recent years have seen a growing interest i...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>374 rows × 5 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     Topic  Count                                               Name  \\\n",
+       "0       -1   7648                                   -1_the_of_to_and   \n",
+       "1        0      4                                     topic modeling   \n",
+       "2        2      7                2_pain_discomfort_diagnostic_facial   \n",
+       "3        3      6              3_quantum_entanglement_wave_annealers   \n",
+       "4        4     74                  4_quantum_classical_qubits_states   \n",
+       "..     ...    ...                                                ...   \n",
+       "369    369      8         369_chaos_initialization_jacobian_isometry   \n",
+       "370    370     37                   370_relu_depth_activation_mathbb   \n",
+       "371    371     16  371_generalization_nonvacuous_explain_sensitivity   \n",
+       "372    372     71                        372_minima_mathbf_loss_relu   \n",
+       "373    373      5                        373_sgd_minima_saddles_band   \n",
+       "\n",
+       "                                        Representation  \\\n",
+       "0    [the, of, to, and, in, we, that, is, for, lear...   \n",
+       "1    [topic, papers, svd, topics, allocation, conta...   \n",
+       "2    [pain, discomfort, diagnostic, facial, intensi...   \n",
+       "3    [quantum, entanglement, wave, annealers, conva...   \n",
+       "4    [quantum, classical, qubits, states, circuit, ...   \n",
+       "..                                                 ...   \n",
+       "369  [chaos, initialization, jacobian, isometry, de...   \n",
+       "370  [relu, depth, activation, mathbb, functions, w...   \n",
+       "371  [generalization, nonvacuous, explain, sensitiv...   \n",
+       "372  [minima, mathbf, loss, relu, activation, layer...   \n",
+       "373  [sgd, minima, saddles, band, descent, degenera...   \n",
+       "\n",
+       "                                   Representative_Docs  \n",
+       "0    [  A crucial task in system identification pro...  \n",
+       "1    [  Topic models have emerged as fundamental to...  \n",
+       "2    [  Pain is a complex and subjective experience...  \n",
+       "3    [  Modern deep learning has enabled unpreceden...  \n",
+       "4    [  Quantum machine learning witnesses an incre...  \n",
+       "..                                                 ...  \n",
+       "369  [  It is well known that the initialization of...  \n",
+       "370  [  We study the necessary and sufficient compl...  \n",
+       "371  [  Neural networks exhibit good generalization...  \n",
+       "372  [  Deep learning models are often successfully...  \n",
+       "373  [  Recent years have seen a growing interest i...  \n",
+       "\n",
+       "[374 rows x 5 columns]"
+      ]
+     },
+     "execution_count": 79,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "after_delete_one_topic_info = topic_model.get_topic_info()\n",
+    "after_delete_one_topic_info"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Topic</th>\n",
+       "      <th>Count</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Representation</th>\n",
+       "      <th>Representative_Docs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
+       "      <td>topic modeling</td>\n",
+       "      <td>[topic, papers, svd, topics, allocation, conta...</td>\n",
+       "      <td>[  Topic models have emerged as fundamental to...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Topic  Count            Name  \\\n",
+       "1      0      4  topic modeling   \n",
+       "\n",
+       "                                      Representation  \\\n",
+       "1  [topic, papers, svd, topics, allocation, conta...   \n",
+       "\n",
+       "                                 Representative_Docs  \n",
+       "1  [  Topic models have emerged as fundamental to...  "
+      ]
+     },
+     "execution_count": 80,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "after_delete_one_topic_info[after_delete_one_topic_info['Name'].str.contains('topic modeling')]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Topic</th>\n",
+       "      <th>Count</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Representation</th>\n",
+       "      <th>Representative_Docs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "Empty DataFrame\n",
+       "Columns: [Topic, Count, Name, Representation, Representative_Docs]\n",
+       "Index: []"
+      ]
+     },
+     "execution_count": 81,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "after_delete_one_topic_info[after_delete_one_topic_info['Name'].str.contains('modulation_radio_channel_transmitters')]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Missing Name suffixes after delete/merge:\n",
+      "modulation_radio_channel_transmitters\n",
+      "\n",
+      "Summary:\n",
+      "Original unique suffix count: 375\n",
+      "After delete/merge unique suffix count: 374\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Extract just the suffix (after the first “_”) from each Name\n",
+    "orig_suffixes = original_topic_info['Name'].str.split('_', n=1).str[1].where(original_topic_info['Name'].str.contains('_'), original_topic_info['Name'])\n",
+    "after_suffixes = after_delete_one_topic_info['Name'].str.split('_', n=1).str[1].where(after_delete_one_topic_info['Name'].str.contains('_'), after_delete_one_topic_info['Name'])\n",
+    "\n",
+    "\n",
+    "# Build unique sets\n",
+    "orig_set = set(orig_suffixes)\n",
+    "after_set = set(after_suffixes)\n",
+    "\n",
+    "# Find any suffixes that were in the original but not after deletion\n",
+    "missing = orig_set - after_set\n",
+    "\n",
+    "# Report\n",
+    "if missing:\n",
+    "    print(\"Missing Name suffixes after delete/merge:\")\n",
+    "    for name in sorted(missing):\n",
+    "        print(name)\n",
+    "else:\n",
+    "    print(\"All Name suffixes are preserved\")\n",
+    "\n",
+    "print(\"\\nSummary:\")\n",
+    "print(f\"Original unique suffix count: {len(orig_set)}\")\n",
+    "print(f\"After delete/merge unique suffix count: {len(after_set)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Topic</th>\n",
+       "      <th>Count</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Representation</th>\n",
+       "      <th>Representative_Docs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>-1</td>\n",
+       "      <td>7624</td>\n",
+       "      <td>-1_the_of_to_and</td>\n",
+       "      <td>[the, of, to, and, in, we, that, is, for, lear...</td>\n",
+       "      <td>[  A crucial task in system identification pro...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>264</th>\n",
+       "      <td>263</td>\n",
+       "      <td>510</td>\n",
+       "      <td>263_generative_gan_gans_generator</td>\n",
+       "      <td>[generative, gan, gans, generator, adversarial...</td>\n",
+       "      <td>[  Generative Adversarial Networks (GANs) are ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>324</th>\n",
+       "      <td>323</td>\n",
+       "      <td>403</td>\n",
+       "      <td>323_quantization_hardware_pruning_gpu</td>\n",
+       "      <td>[quantization, hardware, pruning, gpu, precisi...</td>\n",
+       "      <td>[  Deep neural networks (DNNs) are used by dif...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>99</th>\n",
+       "      <td>98</td>\n",
+       "      <td>269</td>\n",
+       "      <td>98_recommendation_user_items_item</td>\n",
+       "      <td>[recommendation, user, items, item, recommende...</td>\n",
+       "      <td>[  Matrix factorization techniques have been w...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>28</td>\n",
+       "      <td>196</td>\n",
+       "      <td>28_privacy_private_differential_differentially</td>\n",
+       "      <td>[privacy, private, differential, differentiall...</td>\n",
+       "      <td>[  The process of data mining with differentia...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     Topic  Count                                            Name  \\\n",
+       "0       -1   7624                                -1_the_of_to_and   \n",
+       "264    263    510               263_generative_gan_gans_generator   \n",
+       "324    323    403           323_quantization_hardware_pruning_gpu   \n",
+       "99      98    269               98_recommendation_user_items_item   \n",
+       "29      28    196  28_privacy_private_differential_differentially   \n",
+       "\n",
+       "                                        Representation  \\\n",
+       "0    [the, of, to, and, in, we, that, is, for, lear...   \n",
+       "264  [generative, gan, gans, generator, adversarial...   \n",
+       "324  [quantization, hardware, pruning, gpu, precisi...   \n",
+       "99   [recommendation, user, items, item, recommende...   \n",
+       "29   [privacy, private, differential, differentiall...   \n",
+       "\n",
+       "                                   Representative_Docs  \n",
+       "0    [  A crucial task in system identification pro...  \n",
+       "264  [  Generative Adversarial Networks (GANs) are ...  \n",
+       "324  [  Deep neural networks (DNNs) are used by dif...  \n",
+       "99   [  Matrix factorization techniques have been w...  \n",
+       "29   [  The process of data mining with differentia...  "
+      ]
+     },
+     "execution_count": 86,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "original_topic_info.sort_values('Count', ascending=False)[:5]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Topic</th>\n",
+       "      <th>Count</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Representation</th>\n",
+       "      <th>Representative_Docs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>-1</td>\n",
+       "      <td>7648</td>\n",
+       "      <td>-1_the_of_to_and</td>\n",
+       "      <td>[the, of, to, and, in, we, that, is, for, lear...</td>\n",
+       "      <td>[  A crucial task in system identification pro...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>263</th>\n",
+       "      <td>263</td>\n",
+       "      <td>510</td>\n",
+       "      <td>263_generative_gan_gans_generator</td>\n",
+       "      <td>[generative, gan, gans, generator, adversarial...</td>\n",
+       "      <td>[  Generative Adversarial Networks (GANs) are ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>323</th>\n",
+       "      <td>323</td>\n",
+       "      <td>403</td>\n",
+       "      <td>323_quantization_hardware_pruning_gpu</td>\n",
+       "      <td>[quantization, hardware, pruning, gpu, precisi...</td>\n",
+       "      <td>[  Deep neural networks (DNNs) are used by dif...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>98</th>\n",
+       "      <td>98</td>\n",
+       "      <td>269</td>\n",
+       "      <td>98_recommendation_user_items_item</td>\n",
+       "      <td>[recommendation, user, items, item, recommende...</td>\n",
+       "      <td>[  Matrix factorization techniques have been w...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>28</td>\n",
+       "      <td>196</td>\n",
+       "      <td>28_privacy_private_differential_differentially</td>\n",
+       "      <td>[privacy, private, differential, differentiall...</td>\n",
+       "      <td>[  The process of data mining with differentia...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     Topic  Count                                            Name  \\\n",
+       "0       -1   7648                                -1_the_of_to_and   \n",
+       "263    263    510               263_generative_gan_gans_generator   \n",
+       "323    323    403           323_quantization_hardware_pruning_gpu   \n",
+       "98      98    269               98_recommendation_user_items_item   \n",
+       "28      28    196  28_privacy_private_differential_differentially   \n",
+       "\n",
+       "                                        Representation  \\\n",
+       "0    [the, of, to, and, in, we, that, is, for, lear...   \n",
+       "263  [generative, gan, gans, generator, adversarial...   \n",
+       "323  [quantization, hardware, pruning, gpu, precisi...   \n",
+       "98   [recommendation, user, items, item, recommende...   \n",
+       "28   [privacy, private, differential, differentiall...   \n",
+       "\n",
+       "                                   Representative_Docs  \n",
+       "0    [  A crucial task in system identification pro...  \n",
+       "263  [  Generative Adversarial Networks (GANs) are ...  \n",
+       "323  [  Deep neural networks (DNNs) are used by dif...  \n",
+       "98   [  Matrix factorization techniques have been w...  \n",
+       "28   [  The process of data mining with differentia...  "
+      ]
+     },
+     "execution_count": 85,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "after_delete_one_topic_info.sort_values('Count', ascending=False)[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "bertopic-sc",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.22"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 17f0a72ee7580c399eda9d4b120552af75dd0a2f Mon Sep 17 00:00:00 2001
From: Shuang Chen <shuang@sightly.com>
Date: Sun, 27 Jul 2025 12:27:22 -0400
Subject: [PATCH 7/7] finalize delete topics

---
 bertopic/_bertopic.py       |    8 -
 delete_topics_test_sc.ipynb | 1277 -----------------------------------
 2 files changed, 1285 deletions(-)
 delete mode 100644 delete_topics_test_sc.ipynb

diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index 499fd584..bd2fad43 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -4922,7 +4922,6 @@ def add_mappings(self, mappings: Mapping[int, int], topic_model: BERTopic):
             for key, value in topics_to_map.items():
                 mapping[value].append(key)
 
-            print(f'len of mapping: {len(mapping)}')
             for topic_to, topics_from in mapping.items():
                 # which of the original topics are zero-shot
                 zeroshot_topic_ids = [
@@ -4937,25 +4936,18 @@ def add_mappings(self, mappings: Mapping[int, int], topic_model: BERTopic):
                     topic_model.zeroshot_topic_list[topic_model._topic_id_to_zeroshot_topic_idx[topic_id]]
                     for topic_id in zeroshot_topic_ids
                 ]
-                print(f'topics_from: {topics_from} and topic_to: {topic_to}')
-                print(f'zeroshot_labels: {zeroshot_labels}')
                 zeroshot_embeddings = topic_model._extract_embeddings(zeroshot_labels)
                 cosine_similarities = cosine_similarity(
                     zeroshot_embeddings, [topic_model.topic_embeddings_[topic_to]]
                 ).flatten()
-                print(f'cosine_similarities: {cosine_similarities}')
                 best_zeroshot_topic_idx = np.argmax(cosine_similarities)
                 best_cosine_similarity = cosine_similarities[best_zeroshot_topic_idx]
-                print(f'best_cosine_similarity: {best_cosine_similarity}')
                 if best_cosine_similarity >= topic_model.zeroshot_min_similarity:
                     # Using the topic ID from before mapping, get the idx into the zeroshot topic list
                     new_topic_id_to_zeroshot_topic_idx[topic_to] = topic_model._topic_id_to_zeroshot_topic_idx[
                         zeroshot_topic_ids[best_zeroshot_topic_idx]
                     ]
-            print(f'new_topic_id_to_zeroshot_topic_idx: {new_topic_id_to_zeroshot_topic_idx}')
-            # print('running without updating topic_model._topic_id_to_zeroshot_topic_idx!')
             topic_model._topic_id_to_zeroshot_topic_idx = new_topic_id_to_zeroshot_topic_idx
-            print(f'after add_mappings: topic_model._topic_id_to_zeroshot_topic_idx: {topic_model._topic_id_to_zeroshot_topic_idx}')
 
     def add_new_topics(self, mappings: Mapping[int, int]):
         """Add new row(s) of topic mappings.
diff --git a/delete_topics_test_sc.ipynb b/delete_topics_test_sc.ipynb
deleted file mode 100644
index dae2157a..00000000
--- a/delete_topics_test_sc.ipynb
+++ /dev/null
@@ -1,1277 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# !python -m pip install -e \".[dev]\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "c:\\Users\\sc305\\miniforge3\\envs\\bertopic-sc\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    }
-   ],
-   "source": [
-    "%reload_ext autoreload\n",
-    "%autoreload 2\n",
-    "from bertopic import BERTopic"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Test Case from MaartenGr No. 2 - Zero Shot"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Before delete:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2025-05-27 16:51:28,462 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n"
-     ]
-    }
-   ],
-   "source": [
-    "from datasets import load_dataset\n",
-    "from sentence_transformers import SentenceTransformer\n",
-    "from hdbscan import HDBSCAN\n",
-    "from umap import UMAP\n",
-    "\n",
-    "from bertopic import BERTopic\n",
-    "from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance\n",
-    "\n",
-    "docs = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"][\"abstract\"][:20_000]\n",
-    "\n",
-    "# Pre-calculate embeddings\n",
-    "embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n",
-    "# embeddings = embedding_model.encode(docs, show_progress_bar=True)\n",
-    "\n",
-    "# # # Save the embeddings to a file\n",
-    "# np.save(\"test_ArXiv_embeddings_zeroshot_example.npy\", embeddings)\n",
-    "loaded_embeddings = np.load(\"test_ArXiv_embeddings_zeroshot_example.npy\")\n",
-    "\n",
-    "# Use sub-models\n",
-    "umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0, random_state=42)\n",
-    "hdbscan_model = HDBSCAN(min_samples=5, gen_min_span_tree=True, prediction_data=True)\n",
-    "\n",
-    "# Representation models\n",
-    "keybert_model = KeyBERTInspired()\n",
-    "mmr_model = MaximalMarginalRelevance(diversity=0.3)\n",
-    "representation_model = {\n",
-    "  \"KeyBERT\": keybert_model,\n",
-    "  \"MMR\": mmr_model,\n",
-    "}\n",
-    "\n",
-    "# BERTopic\n",
-    "topic_model = BERTopic(\n",
-    "    embedding_model=embedding_model,\n",
-    "    umap_model=umap_model,\n",
-    "    hdbscan_model=hdbscan_model,\n",
-    "    zeroshot_topic_list=[\"topic modeling\", \"large language models\"],\n",
-    "    verbose=True,\n",
-    ").fit(docs, loaded_embeddings)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 71,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Topic</th>\n",
-       "      <th>Count</th>\n",
-       "      <th>Name</th>\n",
-       "      <th>Representation</th>\n",
-       "      <th>Representative_Docs</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>-1</td>\n",
-       "      <td>7624</td>\n",
-       "      <td>-1_the_of_to_and</td>\n",
-       "      <td>[the, of, to, and, in, we, that, is, for, lear...</td>\n",
-       "      <td>[  A crucial task in system identification pro...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>0</td>\n",
-       "      <td>4</td>\n",
-       "      <td>topic modeling</td>\n",
-       "      <td>[topic, papers, svd, topics, allocation, conta...</td>\n",
-       "      <td>[  Topic models have emerged as fundamental to...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>1</td>\n",
-       "      <td>24</td>\n",
-       "      <td>1_modulation_radio_channel_transmitters</td>\n",
-       "      <td>[modulation, radio, channel, transmitters, sig...</td>\n",
-       "      <td>[  We survey the latest advances in machine le...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>2</td>\n",
-       "      <td>7</td>\n",
-       "      <td>2_pain_discomfort_diagnostic_facial</td>\n",
-       "      <td>[pain, discomfort, diagnostic, facial, intensi...</td>\n",
-       "      <td>[  Pain is a complex and subjective experience...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>3</td>\n",
-       "      <td>6</td>\n",
-       "      <td>3_quantum_entanglement_wave_annealers</td>\n",
-       "      <td>[quantum, entanglement, wave, annealers, conva...</td>\n",
-       "      <td>[  Modern deep learning has enabled unpreceden...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   Topic  Count                                     Name  \\\n",
-       "0     -1   7624                         -1_the_of_to_and   \n",
-       "1      0      4                           topic modeling   \n",
-       "2      1     24  1_modulation_radio_channel_transmitters   \n",
-       "3      2      7      2_pain_discomfort_diagnostic_facial   \n",
-       "4      3      6    3_quantum_entanglement_wave_annealers   \n",
-       "\n",
-       "                                      Representation  \\\n",
-       "0  [the, of, to, and, in, we, that, is, for, lear...   \n",
-       "1  [topic, papers, svd, topics, allocation, conta...   \n",
-       "2  [modulation, radio, channel, transmitters, sig...   \n",
-       "3  [pain, discomfort, diagnostic, facial, intensi...   \n",
-       "4  [quantum, entanglement, wave, annealers, conva...   \n",
-       "\n",
-       "                                 Representative_Docs  \n",
-       "0  [  A crucial task in system identification pro...  \n",
-       "1  [  Topic models have emerged as fundamental to...  \n",
-       "2  [  We survey the latest advances in machine le...  \n",
-       "3  [  Pain is a complex and subjective experience...  \n",
-       "4  [  Modern deep learning has enabled unpreceden...  "
-      ]
-     },
-     "execution_count": 71,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "topic_model.get_topic_info()[:5]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Topic</th>\n",
-       "      <th>Count</th>\n",
-       "      <th>Name</th>\n",
-       "      <th>Representation</th>\n",
-       "      <th>Representative_Docs</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "Empty DataFrame\n",
-       "Columns: [Topic, Count, Name, Representation, Representative_Docs]\n",
-       "Index: []"
-      ]
-     },
-     "execution_count": 72,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "original_topic_info = topic_model.get_topic_info()\n",
-    "original_topic_info[original_topic_info['Name']=='large language models']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Topic</th>\n",
-       "      <th>Count</th>\n",
-       "      <th>Name</th>\n",
-       "      <th>Representation</th>\n",
-       "      <th>Representative_Docs</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>-1</td>\n",
-       "      <td>7624</td>\n",
-       "      <td>-1_the_of_to_and</td>\n",
-       "      <td>[the, of, to, and, in, we, that, is, for, lear...</td>\n",
-       "      <td>[  A crucial task in system identification pro...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>264</th>\n",
-       "      <td>263</td>\n",
-       "      <td>510</td>\n",
-       "      <td>263_generative_gan_gans_generator</td>\n",
-       "      <td>[generative, gan, gans, generator, adversarial...</td>\n",
-       "      <td>[  Generative Adversarial Networks (GANs) are ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>324</th>\n",
-       "      <td>323</td>\n",
-       "      <td>403</td>\n",
-       "      <td>323_quantization_hardware_pruning_gpu</td>\n",
-       "      <td>[quantization, hardware, pruning, gpu, precisi...</td>\n",
-       "      <td>[  Deep neural networks (DNNs) are used by dif...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>99</th>\n",
-       "      <td>98</td>\n",
-       "      <td>269</td>\n",
-       "      <td>98_recommendation_user_items_item</td>\n",
-       "      <td>[recommendation, user, items, item, recommende...</td>\n",
-       "      <td>[  Matrix factorization techniques have been w...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>29</th>\n",
-       "      <td>28</td>\n",
-       "      <td>196</td>\n",
-       "      <td>28_privacy_private_differential_differentially</td>\n",
-       "      <td>[privacy, private, differential, differentiall...</td>\n",
-       "      <td>[  The process of data mining with differentia...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "     Topic  Count                                            Name  \\\n",
-       "0       -1   7624                                -1_the_of_to_and   \n",
-       "264    263    510               263_generative_gan_gans_generator   \n",
-       "324    323    403           323_quantization_hardware_pruning_gpu   \n",
-       "99      98    269               98_recommendation_user_items_item   \n",
-       "29      28    196  28_privacy_private_differential_differentially   \n",
-       "\n",
-       "                                        Representation  \\\n",
-       "0    [the, of, to, and, in, we, that, is, for, lear...   \n",
-       "264  [generative, gan, gans, generator, adversarial...   \n",
-       "324  [quantization, hardware, pruning, gpu, precisi...   \n",
-       "99   [recommendation, user, items, item, recommende...   \n",
-       "29   [privacy, private, differential, differentiall...   \n",
-       "\n",
-       "                                   Representative_Docs  \n",
-       "0    [  A crucial task in system identification pro...  \n",
-       "264  [  Generative Adversarial Networks (GANs) are ...  \n",
-       "324  [  Deep neural networks (DNNs) are used by dif...  \n",
-       "99   [  Matrix factorization techniques have been w...  \n",
-       "29   [  The process of data mining with differentia...  "
-      ]
-     },
-     "execution_count": 73,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "original_topic_info.sort_values('Count', ascending=False)[:5]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Topic</th>\n",
-       "      <th>Count</th>\n",
-       "      <th>Name</th>\n",
-       "      <th>Representation</th>\n",
-       "      <th>Representative_Docs</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>0</td>\n",
-       "      <td>4</td>\n",
-       "      <td>topic modeling</td>\n",
-       "      <td>[topic, papers, svd, topics, allocation, conta...</td>\n",
-       "      <td>[  Topic models have emerged as fundamental to...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   Topic  Count            Name  \\\n",
-       "1      0      4  topic modeling   \n",
-       "\n",
-       "                                      Representation  \\\n",
-       "1  [topic, papers, svd, topics, allocation, conta...   \n",
-       "\n",
-       "                                 Representative_Docs  \n",
-       "1  [  Topic models have emerged as fundamental to...  "
-      ]
-     },
-     "execution_count": 74,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "original_topic_info[original_topic_info['Name']=='topic modeling']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Topic</th>\n",
-       "      <th>Count</th>\n",
-       "      <th>Name</th>\n",
-       "      <th>Representation</th>\n",
-       "      <th>Representative_Docs</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>1</td>\n",
-       "      <td>24</td>\n",
-       "      <td>1_modulation_radio_channel_transmitters</td>\n",
-       "      <td>[modulation, radio, channel, transmitters, sig...</td>\n",
-       "      <td>[  We survey the latest advances in machine le...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   Topic  Count                                     Name  \\\n",
-       "2      1     24  1_modulation_radio_channel_transmitters   \n",
-       "\n",
-       "                                      Representation  \\\n",
-       "2  [modulation, radio, channel, transmitters, sig...   \n",
-       "\n",
-       "                                 Representative_Docs  \n",
-       "2  [  We survey the latest advances in machine le...  "
-      ]
-     },
-     "execution_count": 75,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "original_topic_info[original_topic_info['Name'].str.contains('modulation_radio_channel_transmitters')]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Run delete:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 77,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Topic</th>\n",
-       "      <th>Count</th>\n",
-       "      <th>Name</th>\n",
-       "      <th>Representation</th>\n",
-       "      <th>Representative_Docs</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>-1</td>\n",
-       "      <td>7624</td>\n",
-       "      <td>-1_the_of_to_and</td>\n",
-       "      <td>[the, of, to, and, in, we, that, is, for, lear...</td>\n",
-       "      <td>[  A crucial task in system identification pro...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>0</td>\n",
-       "      <td>4</td>\n",
-       "      <td>topic modeling</td>\n",
-       "      <td>[topic, papers, svd, topics, allocation, conta...</td>\n",
-       "      <td>[  Topic models have emerged as fundamental to...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>1</td>\n",
-       "      <td>24</td>\n",
-       "      <td>1_modulation_radio_channel_transmitters</td>\n",
-       "      <td>[modulation, radio, channel, transmitters, sig...</td>\n",
-       "      <td>[  We survey the latest advances in machine le...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>2</td>\n",
-       "      <td>7</td>\n",
-       "      <td>2_pain_discomfort_diagnostic_facial</td>\n",
-       "      <td>[pain, discomfort, diagnostic, facial, intensi...</td>\n",
-       "      <td>[  Pain is a complex and subjective experience...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>3</td>\n",
-       "      <td>6</td>\n",
-       "      <td>3_quantum_entanglement_wave_annealers</td>\n",
-       "      <td>[quantum, entanglement, wave, annealers, conva...</td>\n",
-       "      <td>[  Modern deep learning has enabled unpreceden...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   Topic  Count                                     Name  \\\n",
-       "0     -1   7624                         -1_the_of_to_and   \n",
-       "1      0      4                           topic modeling   \n",
-       "2      1     24  1_modulation_radio_channel_transmitters   \n",
-       "3      2      7      2_pain_discomfort_diagnostic_facial   \n",
-       "4      3      6    3_quantum_entanglement_wave_annealers   \n",
-       "\n",
-       "                                      Representation  \\\n",
-       "0  [the, of, to, and, in, we, that, is, for, lear...   \n",
-       "1  [topic, papers, svd, topics, allocation, conta...   \n",
-       "2  [modulation, radio, channel, transmitters, sig...   \n",
-       "3  [pain, discomfort, diagnostic, facial, intensi...   \n",
-       "4  [quantum, entanglement, wave, annealers, conva...   \n",
-       "\n",
-       "                                 Representative_Docs  \n",
-       "0  [  A crucial task in system identification pro...  \n",
-       "1  [  Topic models have emerged as fundamental to...  \n",
-       "2  [  We survey the latest advances in machine le...  \n",
-       "3  [  Pain is a complex and subjective experience...  \n",
-       "4  [  Modern deep learning has enabled unpreceden...  "
-      ]
-     },
-     "execution_count": 77,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "topic_model.get_topic_info()[:5]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 54,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# topic_model.merge_topics(docs, [1, 2])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# topic_model.delete_topics([0])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 78,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "len of mapping: 375\n",
-      "topics_from: [np.int64(0)] and topic_to: 0\n",
-      "zeroshot_labels: ['topic modeling']\n",
-      "cosine_similarities: [0.31793424]\n",
-      "best_cosine_similarity: 0.3179342448711395\n",
-      "new_topic_id_to_zeroshot_topic_idx: {}\n",
-      "after add_mappings: topic_model._topic_id_to_zeroshot_topic_idx: {}\n"
-     ]
-    }
-   ],
-   "source": [
-    "topic_model.delete_topics([1])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 88,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{0: 0}"
-      ]
-     },
-     "execution_count": 88,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# updated to self.topic_mapper_.add_mappings(mapping, topic_model=deepcopy(self)) to avoid unecessary updates in add_mappings\n",
-    "topic_model._topic_id_to_zeroshot_topic_idx"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 79,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Topic</th>\n",
-       "      <th>Count</th>\n",
-       "      <th>Name</th>\n",
-       "      <th>Representation</th>\n",
-       "      <th>Representative_Docs</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>-1</td>\n",
-       "      <td>7648</td>\n",
-       "      <td>-1_the_of_to_and</td>\n",
-       "      <td>[the, of, to, and, in, we, that, is, for, lear...</td>\n",
-       "      <td>[  A crucial task in system identification pro...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>0</td>\n",
-       "      <td>4</td>\n",
-       "      <td>topic modeling</td>\n",
-       "      <td>[topic, papers, svd, topics, allocation, conta...</td>\n",
-       "      <td>[  Topic models have emerged as fundamental to...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>2</td>\n",
-       "      <td>7</td>\n",
-       "      <td>2_pain_discomfort_diagnostic_facial</td>\n",
-       "      <td>[pain, discomfort, diagnostic, facial, intensi...</td>\n",
-       "      <td>[  Pain is a complex and subjective experience...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>3</td>\n",
-       "      <td>6</td>\n",
-       "      <td>3_quantum_entanglement_wave_annealers</td>\n",
-       "      <td>[quantum, entanglement, wave, annealers, conva...</td>\n",
-       "      <td>[  Modern deep learning has enabled unpreceden...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>4</td>\n",
-       "      <td>74</td>\n",
-       "      <td>4_quantum_classical_qubits_states</td>\n",
-       "      <td>[quantum, classical, qubits, states, circuit, ...</td>\n",
-       "      <td>[  Quantum machine learning witnesses an incre...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>369</th>\n",
-       "      <td>369</td>\n",
-       "      <td>8</td>\n",
-       "      <td>369_chaos_initialization_jacobian_isometry</td>\n",
-       "      <td>[chaos, initialization, jacobian, isometry, de...</td>\n",
-       "      <td>[  It is well known that the initialization of...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>370</th>\n",
-       "      <td>370</td>\n",
-       "      <td>37</td>\n",
-       "      <td>370_relu_depth_activation_mathbb</td>\n",
-       "      <td>[relu, depth, activation, mathbb, functions, w...</td>\n",
-       "      <td>[  We study the necessary and sufficient compl...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>371</th>\n",
-       "      <td>371</td>\n",
-       "      <td>16</td>\n",
-       "      <td>371_generalization_nonvacuous_explain_sensitivity</td>\n",
-       "      <td>[generalization, nonvacuous, explain, sensitiv...</td>\n",
-       "      <td>[  Neural networks exhibit good generalization...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>372</th>\n",
-       "      <td>372</td>\n",
-       "      <td>71</td>\n",
-       "      <td>372_minima_mathbf_loss_relu</td>\n",
-       "      <td>[minima, mathbf, loss, relu, activation, layer...</td>\n",
-       "      <td>[  Deep learning models are often successfully...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>373</th>\n",
-       "      <td>373</td>\n",
-       "      <td>5</td>\n",
-       "      <td>373_sgd_minima_saddles_band</td>\n",
-       "      <td>[sgd, minima, saddles, band, descent, degenera...</td>\n",
-       "      <td>[  Recent years have seen a growing interest i...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>374 rows × 5 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "     Topic  Count                                               Name  \\\n",
-       "0       -1   7648                                   -1_the_of_to_and   \n",
-       "1        0      4                                     topic modeling   \n",
-       "2        2      7                2_pain_discomfort_diagnostic_facial   \n",
-       "3        3      6              3_quantum_entanglement_wave_annealers   \n",
-       "4        4     74                  4_quantum_classical_qubits_states   \n",
-       "..     ...    ...                                                ...   \n",
-       "369    369      8         369_chaos_initialization_jacobian_isometry   \n",
-       "370    370     37                   370_relu_depth_activation_mathbb   \n",
-       "371    371     16  371_generalization_nonvacuous_explain_sensitivity   \n",
-       "372    372     71                        372_minima_mathbf_loss_relu   \n",
-       "373    373      5                        373_sgd_minima_saddles_band   \n",
-       "\n",
-       "                                        Representation  \\\n",
-       "0    [the, of, to, and, in, we, that, is, for, lear...   \n",
-       "1    [topic, papers, svd, topics, allocation, conta...   \n",
-       "2    [pain, discomfort, diagnostic, facial, intensi...   \n",
-       "3    [quantum, entanglement, wave, annealers, conva...   \n",
-       "4    [quantum, classical, qubits, states, circuit, ...   \n",
-       "..                                                 ...   \n",
-       "369  [chaos, initialization, jacobian, isometry, de...   \n",
-       "370  [relu, depth, activation, mathbb, functions, w...   \n",
-       "371  [generalization, nonvacuous, explain, sensitiv...   \n",
-       "372  [minima, mathbf, loss, relu, activation, layer...   \n",
-       "373  [sgd, minima, saddles, band, descent, degenera...   \n",
-       "\n",
-       "                                   Representative_Docs  \n",
-       "0    [  A crucial task in system identification pro...  \n",
-       "1    [  Topic models have emerged as fundamental to...  \n",
-       "2    [  Pain is a complex and subjective experience...  \n",
-       "3    [  Modern deep learning has enabled unpreceden...  \n",
-       "4    [  Quantum machine learning witnesses an incre...  \n",
-       "..                                                 ...  \n",
-       "369  [  It is well known that the initialization of...  \n",
-       "370  [  We study the necessary and sufficient compl...  \n",
-       "371  [  Neural networks exhibit good generalization...  \n",
-       "372  [  Deep learning models are often successfully...  \n",
-       "373  [  Recent years have seen a growing interest i...  \n",
-       "\n",
-       "[374 rows x 5 columns]"
-      ]
-     },
-     "execution_count": 79,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "after_delete_one_topic_info = topic_model.get_topic_info()\n",
-    "after_delete_one_topic_info"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 80,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Topic</th>\n",
-       "      <th>Count</th>\n",
-       "      <th>Name</th>\n",
-       "      <th>Representation</th>\n",
-       "      <th>Representative_Docs</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>0</td>\n",
-       "      <td>4</td>\n",
-       "      <td>topic modeling</td>\n",
-       "      <td>[topic, papers, svd, topics, allocation, conta...</td>\n",
-       "      <td>[  Topic models have emerged as fundamental to...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   Topic  Count            Name  \\\n",
-       "1      0      4  topic modeling   \n",
-       "\n",
-       "                                      Representation  \\\n",
-       "1  [topic, papers, svd, topics, allocation, conta...   \n",
-       "\n",
-       "                                 Representative_Docs  \n",
-       "1  [  Topic models have emerged as fundamental to...  "
-      ]
-     },
-     "execution_count": 80,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "after_delete_one_topic_info[after_delete_one_topic_info['Name'].str.contains('topic modeling')]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 81,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Topic</th>\n",
-       "      <th>Count</th>\n",
-       "      <th>Name</th>\n",
-       "      <th>Representation</th>\n",
-       "      <th>Representative_Docs</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "Empty DataFrame\n",
-       "Columns: [Topic, Count, Name, Representation, Representative_Docs]\n",
-       "Index: []"
-      ]
-     },
-     "execution_count": 81,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "after_delete_one_topic_info[after_delete_one_topic_info['Name'].str.contains('modulation_radio_channel_transmitters')]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 82,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Missing Name suffixes after delete/merge:\n",
-      "modulation_radio_channel_transmitters\n",
-      "\n",
-      "Summary:\n",
-      "Original unique suffix count: 375\n",
-      "After delete/merge unique suffix count: 374\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Extract just the suffix (after the first “_”) from each Name\n",
-    "orig_suffixes = original_topic_info['Name'].str.split('_', n=1).str[1].where(original_topic_info['Name'].str.contains('_'), original_topic_info['Name'])\n",
-    "after_suffixes = after_delete_one_topic_info['Name'].str.split('_', n=1).str[1].where(after_delete_one_topic_info['Name'].str.contains('_'), after_delete_one_topic_info['Name'])\n",
-    "\n",
-    "\n",
-    "# Build unique sets\n",
-    "orig_set = set(orig_suffixes)\n",
-    "after_set = set(after_suffixes)\n",
-    "\n",
-    "# Find any suffixes that were in the original but not after deletion\n",
-    "missing = orig_set - after_set\n",
-    "\n",
-    "# Report\n",
-    "if missing:\n",
-    "    print(\"Missing Name suffixes after delete/merge:\")\n",
-    "    for name in sorted(missing):\n",
-    "        print(name)\n",
-    "else:\n",
-    "    print(\"All Name suffixes are preserved\")\n",
-    "\n",
-    "print(\"\\nSummary:\")\n",
-    "print(f\"Original unique suffix count: {len(orig_set)}\")\n",
-    "print(f\"After delete/merge unique suffix count: {len(after_set)}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 86,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Topic</th>\n",
-       "      <th>Count</th>\n",
-       "      <th>Name</th>\n",
-       "      <th>Representation</th>\n",
-       "      <th>Representative_Docs</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>-1</td>\n",
-       "      <td>7624</td>\n",
-       "      <td>-1_the_of_to_and</td>\n",
-       "      <td>[the, of, to, and, in, we, that, is, for, lear...</td>\n",
-       "      <td>[  A crucial task in system identification pro...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>264</th>\n",
-       "      <td>263</td>\n",
-       "      <td>510</td>\n",
-       "      <td>263_generative_gan_gans_generator</td>\n",
-       "      <td>[generative, gan, gans, generator, adversarial...</td>\n",
-       "      <td>[  Generative Adversarial Networks (GANs) are ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>324</th>\n",
-       "      <td>323</td>\n",
-       "      <td>403</td>\n",
-       "      <td>323_quantization_hardware_pruning_gpu</td>\n",
-       "      <td>[quantization, hardware, pruning, gpu, precisi...</td>\n",
-       "      <td>[  Deep neural networks (DNNs) are used by dif...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>99</th>\n",
-       "      <td>98</td>\n",
-       "      <td>269</td>\n",
-       "      <td>98_recommendation_user_items_item</td>\n",
-       "      <td>[recommendation, user, items, item, recommende...</td>\n",
-       "      <td>[  Matrix factorization techniques have been w...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>29</th>\n",
-       "      <td>28</td>\n",
-       "      <td>196</td>\n",
-       "      <td>28_privacy_private_differential_differentially</td>\n",
-       "      <td>[privacy, private, differential, differentiall...</td>\n",
-       "      <td>[  The process of data mining with differentia...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "     Topic  Count                                            Name  \\\n",
-       "0       -1   7624                                -1_the_of_to_and   \n",
-       "264    263    510               263_generative_gan_gans_generator   \n",
-       "324    323    403           323_quantization_hardware_pruning_gpu   \n",
-       "99      98    269               98_recommendation_user_items_item   \n",
-       "29      28    196  28_privacy_private_differential_differentially   \n",
-       "\n",
-       "                                        Representation  \\\n",
-       "0    [the, of, to, and, in, we, that, is, for, lear...   \n",
-       "264  [generative, gan, gans, generator, adversarial...   \n",
-       "324  [quantization, hardware, pruning, gpu, precisi...   \n",
-       "99   [recommendation, user, items, item, recommende...   \n",
-       "29   [privacy, private, differential, differentiall...   \n",
-       "\n",
-       "                                   Representative_Docs  \n",
-       "0    [  A crucial task in system identification pro...  \n",
-       "264  [  Generative Adversarial Networks (GANs) are ...  \n",
-       "324  [  Deep neural networks (DNNs) are used by dif...  \n",
-       "99   [  Matrix factorization techniques have been w...  \n",
-       "29   [  The process of data mining with differentia...  "
-      ]
-     },
-     "execution_count": 86,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "original_topic_info.sort_values('Count', ascending=False)[:5]\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 85,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Topic</th>\n",
-       "      <th>Count</th>\n",
-       "      <th>Name</th>\n",
-       "      <th>Representation</th>\n",
-       "      <th>Representative_Docs</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>-1</td>\n",
-       "      <td>7648</td>\n",
-       "      <td>-1_the_of_to_and</td>\n",
-       "      <td>[the, of, to, and, in, we, that, is, for, lear...</td>\n",
-       "      <td>[  A crucial task in system identification pro...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>263</th>\n",
-       "      <td>263</td>\n",
-       "      <td>510</td>\n",
-       "      <td>263_generative_gan_gans_generator</td>\n",
-       "      <td>[generative, gan, gans, generator, adversarial...</td>\n",
-       "      <td>[  Generative Adversarial Networks (GANs) are ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>323</th>\n",
-       "      <td>323</td>\n",
-       "      <td>403</td>\n",
-       "      <td>323_quantization_hardware_pruning_gpu</td>\n",
-       "      <td>[quantization, hardware, pruning, gpu, precisi...</td>\n",
-       "      <td>[  Deep neural networks (DNNs) are used by dif...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>98</th>\n",
-       "      <td>98</td>\n",
-       "      <td>269</td>\n",
-       "      <td>98_recommendation_user_items_item</td>\n",
-       "      <td>[recommendation, user, items, item, recommende...</td>\n",
-       "      <td>[  Matrix factorization techniques have been w...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>28</th>\n",
-       "      <td>28</td>\n",
-       "      <td>196</td>\n",
-       "      <td>28_privacy_private_differential_differentially</td>\n",
-       "      <td>[privacy, private, differential, differentiall...</td>\n",
-       "      <td>[  The process of data mining with differentia...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "     Topic  Count                                            Name  \\\n",
-       "0       -1   7648                                -1_the_of_to_and   \n",
-       "263    263    510               263_generative_gan_gans_generator   \n",
-       "323    323    403           323_quantization_hardware_pruning_gpu   \n",
-       "98      98    269               98_recommendation_user_items_item   \n",
-       "28      28    196  28_privacy_private_differential_differentially   \n",
-       "\n",
-       "                                        Representation  \\\n",
-       "0    [the, of, to, and, in, we, that, is, for, lear...   \n",
-       "263  [generative, gan, gans, generator, adversarial...   \n",
-       "323  [quantization, hardware, pruning, gpu, precisi...   \n",
-       "98   [recommendation, user, items, item, recommende...   \n",
-       "28   [privacy, private, differential, differentiall...   \n",
-       "\n",
-       "                                   Representative_Docs  \n",
-       "0    [  A crucial task in system identification pro...  \n",
-       "263  [  Generative Adversarial Networks (GANs) are ...  \n",
-       "323  [  Deep neural networks (DNNs) are used by dif...  \n",
-       "98   [  Matrix factorization techniques have been w...  \n",
-       "28   [  The process of data mining with differentia...  "
-      ]
-     },
-     "execution_count": 85,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "after_delete_one_topic_info.sort_values('Count', ascending=False)[:5]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "bertopic-sc",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.22"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}