From ecae7fe87a49fa6eb78d1b0e69692c4538164e2f Mon Sep 17 00:00:00 2001 From: Shuang Chen Date: Mon, 31 Mar 2025 23:42:03 -0400 Subject: [PATCH 1/7] inital_try_merge_to_outlier --- bertopic/_bertopic.py | 79 ++++++++ dev_test_sc.ipynb | 267 ++++++++++++++++++++++++++++ tests/test_reduction/test_delete.py | 84 +++++++++ 3 files changed, 430 insertions(+) create mode 100644 dev_test_sc.ipynb create mode 100644 tests/test_reduction/test_delete.py diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index 6a75171b..99e33240 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -2168,6 +2168,85 @@ def merge_topics( self._save_representative_docs(documents) self.probabilities_ = self._map_probabilities(self.probabilities_) + def delete_topics( + self, + topics_to_delete: List[int], + ) -> None: + """Delete specified topics from the topic model. + + This method allows you to remove topics from the model by mapping them to a special + label (-1) and updating the internal topic representation accordingly. It also + updates the topic sizes and any relevant attributes to reflect the changes. + + Arguments: + topics_to_delete: A list of topic IDs to be deleted from the model. + + Examples: + To delete topics 1 and 2 from the model: + + ```python + topic_model.delete_topics([1, 2]) + ``` + """ + check_is_fitted(self) + + # First map deleted topics to -1 + initial_mapping = {topic: -1 if topic in topics_to_delete else topic for topic in set(self.topics_)} + initial_mapping[-1] = -1 + + # Update topics to mark deletions + self.topics_ = [initial_mapping[topic] for topic in self.topics_] + self._update_topic_size(pd.DataFrame({"Topic": self.topics_})) + + # Create size-based mapping for remaining topics + df = pd.DataFrame(self.topic_sizes_.items(), columns=["Old_Topic", "Size"]).sort_values("Size", ascending=False) + df = df[df.Old_Topic != -1] # Exclude outliers + final_mapping = {**{-1: -1}, **dict(zip(df.Old_Topic, range(len(df))))} + + # Update topics with final mapping + self.topics_ = [final_mapping[topic] for topic in self.topics_] + self.topic_mapper_.add_mappings(final_mapping, topic_model=self) + self._update_topic_size(pd.DataFrame({"Topic": self.topics_})) + + # Update probabilities if they exist + if self.probabilities_ is not None: + self.probabilities_ = self._map_probabilities(self.probabilities_) + + # Update dictionary-based attributes + for attr in ["topic_representations_", "topic_aspects_"]: + if hasattr(self, attr) and getattr(self, attr) is not None: + old_dict = getattr(self, attr) + if attr == "topic_aspects_": + # Handle nested dictionary for aspects + new_dict = { + aspect: { + final_mapping[old_topic]: content + for old_topic, content in topics.items() + if old_topic not in topics_to_delete + } + for aspect, topics in old_dict.items() + } + else: + # Handle flat dictionary + new_dict = { + final_mapping[old_topic]: content + for old_topic, content in old_dict.items() + if old_topic not in topics_to_delete + } + setattr(self, attr, new_dict) + + # Update array-based attributes using masks + for attr in ["topic_embeddings_", "c_tf_idf_"]: + if hasattr(self, attr) and getattr(self, attr) is not None: + matrix = getattr(self, attr) + mask = np.array([topic not in topics_to_delete for topic in range(matrix.shape[0])]) + setattr(self, attr, matrix[mask]) + + # Update ctfidf model + if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None: + mask = np.array([topic not in topics_to_delete for topic in range(self.ctfidf_model._idf_diag.shape[0])]) + self.ctfidf_model._idf_diag = self.ctfidf_model._idf_diag[mask] + def reduce_topics( self, docs: List[str], diff --git a/dev_test_sc.ipynb b/dev_test_sc.ipynb new file mode 100644 index 00000000..8307bb19 --- /dev/null +++ b/dev_test_sc.ipynb @@ -0,0 +1,267 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "%autoreload 2\n", + "from bertopic import BERTopic" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== BERTopic Delete Topics Test Results ===\n", + "\n", + "Initializing and fitting BERTopic model...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-03-31 21:49:21,445 - BERTopic - Embedding - Transforming documents to embeddings.\n", + "Batches: 100%|██████████| 16/16 [00:12<00:00, 1.29it/s]\n", + "2025-03-31 21:49:34,425 - BERTopic - Embedding - Completed ✓\n", + "2025-03-31 21:49:34,426 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n", + "2025-03-31 21:49:35,290 - BERTopic - Dimensionality - Completed ✓\n", + "2025-03-31 21:49:35,291 - BERTopic - Cluster - Start clustering the reduced embeddings\n", + "2025-03-31 21:49:35,307 - BERTopic - Cluster - Completed ✓\n", + "2025-03-31 21:49:35,309 - BERTopic - Representation - Fine-tuning topics using representation models.\n", + "2025-03-31 21:49:35,493 - BERTopic - Representation - Completed ✓\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Initial State:\n", + "Total topics: 73\n", + "Topic sizes (top 5): {0: 11, 1: 11, 2: 10, 3: 10, 4: 9}\n", + "Matrix shapes - c_tf_idf: (74, 19877), embeddings: (74, 384)\n", + "\n", + "Deleting topics: [3, 4, 6]\n", + "Original sizes of topics to delete: [10, 9, 9]\n", + "\n", + "✓ Topic deletion completed\n", + "\n", + "=== Validation Results ===\n", + "\n", + "1. Topic Counts:\n", + " Before: 73 topics\n", + " After: 70 topics\n", + " Expected: 70 topics\n", + "\n", + "2. Size-based Ordering:\n", + " Topic IDs by size: [0, 1, 2, 4, 3, 5, 8, 6, 7, 13, 10, 9, 12, 11, 17, 15, 16, 14, 20, 21, 22, 18, 19, 23, 26, 28, 24, 27, 25, 29, 35, 36, 41, 37, 31, 30, 40, 38, 34, 33, 32, 42, 39, 43, 45, 50, 51, 49, 52, 53, 44, 48, 57, 47, 54, 55, 56, 46, 58, 61, 60, 59, 62, 63, 64, 65, 66, 67, 68, 69]\n", + " Sizes: [11, 11, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]\n", + " Correctly ordered by size: False\n", + " WARNING: Topics not properly ordered by size!\n", + "\n", + "3. Matrix Shapes:\n", + " c_tf_idf - Before: (74, 19877), After: (71, 19877)\n", + " embeddings - Before: (74, 384), After: (71, 384)\n", + "\n", + "4. Topic Representations:\n", + " Before: 74 representations\n", + " After: 71 representations\n", + "\n", + "5. Outlier Topic (-1):\n", + " Present in topics_: True\n", + " Present in sizes: True\n", + " Present in representations: True\n", + "\n", + "6. Topic Deletion and Reordering:\n", + " Expected topic count: 70\n", + " Actual topic count: 70\n", + " Sequential topic numbering: True\n", + " Sizes match: True\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.datasets import fetch_20newsgroups\n", + "from umap import UMAP\n", + "from hdbscan import HDBSCAN\n", + "\n", + "def create_topic_model():\n", + " \"\"\"Create and fit a BERTopic model\"\"\"\n", + " # Create sample data\n", + " docs = fetch_20newsgroups(subset='all')['data'][:500]\n", + " \n", + " # Initialize BERTopic with specific models\n", + " umap = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=True, random_state=42)\n", + " hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='leaf', prediction_data=True)\n", + " topic_model = BERTopic(umap_model=umap, hdbscan_model=hdbscan_model, embedding_model='all-MiniLM-L6-v2', verbose=True)\n", + " \n", + " # Fit the model\n", + " topics, probs = topic_model.fit_transform(docs)\n", + " return topic_model\n", + "\n", + "def run_deletion_tests():\n", + " \"\"\"Run comprehensive tests for topic deletion and print detailed comparisons\"\"\"\n", + " print(\"\\n=== BERTopic Delete Topics Test Results ===\\n\")\n", + " \n", + " # Setup model\n", + " print(\"Initializing and fitting BERTopic model...\")\n", + " topic_model = create_topic_model()\n", + " \n", + " # Record initial state\n", + " initial_state = {\n", + " 'topic_sizes': topic_model.topic_sizes_.copy(),\n", + " 'topics_set': set(topic_model.topics_),\n", + " 'c_tf_idf_shape': topic_model.c_tf_idf_.shape,\n", + " 'embeddings_shape': topic_model.topic_embeddings_.shape,\n", + " 'representations_count': len(topic_model.topic_representations_),\n", + " }\n", + " \n", + " # Get topics sorted by size (excluding -1)\n", + " sorted_topics = sorted(\n", + " [(topic, size) for topic, size in initial_state['topic_sizes'].items() if topic != -1],\n", + " key=lambda x: x[1],\n", + " reverse=True\n", + " )\n", + " \n", + " print(\"\\nInitial State:\")\n", + " print(f\"Total topics: {len(sorted_topics)}\")\n", + " print(f\"Topic sizes (top 5): {dict(sorted_topics[:5])}\")\n", + " print(f\"Matrix shapes - c_tf_idf: {initial_state['c_tf_idf_shape']}, embeddings: {initial_state['embeddings_shape']}\")\n", + " \n", + " # Select topics to delete (4th, 5th, 6th largest)\n", + " topics_to_delete = [item[0] for item in sorted_topics[3:6]]\n", + " print(f\"\\nDeleting topics: {topics_to_delete}\")\n", + " print(f\"Original sizes of topics to delete: {[initial_state['topic_sizes'][t] for t in topics_to_delete]}\")\n", + " \n", + " # Perform deletion\n", + " try:\n", + " topic_model.delete_topics(topics_to_delete)\n", + " print(\"\\n✓ Topic deletion completed\")\n", + " except Exception as e:\n", + " print(f\"\\n❌ Error during topic deletion: {str(e)}\")\n", + " return\n", + " \n", + " # Analyze results\n", + " print(\"\\n=== Validation Results ===\\n\")\n", + " \n", + " # 1. Check topic counts\n", + " new_topics = set(topic_model.topics_) - {-1}\n", + " print(f\"1. Topic Counts:\")\n", + " print(f\" Before: {len(sorted_topics)} topics\")\n", + " print(f\" After: {len(new_topics)} topics\")\n", + " print(f\" Expected: {len(sorted_topics) - len(topics_to_delete)} topics\")\n", + " \n", + " # 2. Check size ordering\n", + " new_sorted_topics = sorted(\n", + " [(topic, size) for topic, size in topic_model.topic_sizes_.items() if topic != -1],\n", + " key=lambda x: x[1],\n", + " reverse=True\n", + " )\n", + " \n", + " print(\"\\n2. Size-based Ordering:\")\n", + " print(f\" Topic IDs by size: {[t[0] for t in new_sorted_topics]}\")\n", + " print(f\" Sizes: {[t[1] for t in new_sorted_topics]}\")\n", + " is_ordered = all(i == t[0] for i, t in enumerate(new_sorted_topics))\n", + " print(f\" Correctly ordered by size: {is_ordered}\")\n", + " if not is_ordered:\n", + " print(\" WARNING: Topics not properly ordered by size!\")\n", + " \n", + " # 3. Check matrix shapes\n", + " print(\"\\n3. Matrix Shapes:\")\n", + " print(f\" c_tf_idf - Before: {initial_state['c_tf_idf_shape']}, After: {topic_model.c_tf_idf_.shape}\")\n", + " print(f\" embeddings - Before: {initial_state['embeddings_shape']}, After: {topic_model.topic_embeddings_.shape}\")\n", + " \n", + " # 4. Check representations\n", + " print(\"\\n4. Topic Representations:\")\n", + " print(f\" Before: {initial_state['representations_count']} representations\")\n", + " print(f\" After: {len(topic_model.topic_representations_)} representations\")\n", + " \n", + " # 5. Check outlier topic\n", + " print(\"\\n5. Outlier Topic (-1):\")\n", + " print(f\" Present in topics_: {-1 in topic_model.topics_}\")\n", + " print(f\" Present in sizes: {-1 in topic_model.topic_sizes_}\")\n", + " print(f\" Present in representations: {-1 in topic_model.topic_representations_}\")\n", + " \n", + " # 6. Verify topic deletion and reordering\n", + " print(\"\\n6. Topic Deletion and Reordering:\")\n", + " expected_topic_count = len(sorted_topics) - len(topics_to_delete)\n", + " actual_topic_count = len([t for t in topic_model.topic_sizes_.keys() if t != -1])\n", + " \n", + " print(f\" Expected topic count: {expected_topic_count}\")\n", + " print(f\" Actual topic count: {actual_topic_count}\")\n", + " \n", + " # Check sequential numbering\n", + " expected_topic_numbers = set(range(expected_topic_count))\n", + " actual_topic_numbers = set(t for t in topic_model.topic_sizes_.keys() if t != -1)\n", + " sequential_numbering = expected_topic_numbers == actual_topic_numbers\n", + " \n", + " print(f\" Sequential topic numbering: {sequential_numbering}\")\n", + " if not sequential_numbering:\n", + " print(f\" Expected topics: {sorted(expected_topic_numbers)}\")\n", + " print(f\" Actual topics: {sorted(actual_topic_numbers)}\")\n", + " \n", + " # Check sizes match (excluding deleted topics)\n", + " expected_sizes = sorted([size for topic, size in sorted_topics if topic not in topics_to_delete], reverse=True)\n", + " actual_sizes = sorted([size for topic, size in topic_model.topic_sizes_.items() if topic != -1], reverse=True)\n", + " sizes_match = expected_sizes == actual_sizes\n", + " \n", + " print(f\" Sizes match: {sizes_match}\")\n", + " if not sizes_match:\n", + " print(f\" Expected sizes: {expected_sizes}\")\n", + " print(f\" Actual sizes: {actual_sizes}\")\n", + "\n", + " # Update the validations dictionary\n", + " validations = {\n", + " \"Topic count correct\": actual_topic_count == expected_topic_count,\n", + " \"Size ordering correct\": is_ordered,\n", + " \"Matrix shapes consistent\": topic_model.c_tf_idf_.shape[0] == topic_model.topic_embeddings_.shape[0],\n", + " \"Sequential topic numbering\": sequential_numbering,\n", + " \"Topic sizes preserved\": sizes_match,\n", + " \"Outlier preserved\": all([-1 in topic_model.topics_, -1 in topic_model.topic_sizes_, -1 in topic_model.topic_representations_])\n", + " }\n", + "\n", + "if __name__ == \"__main__\":\n", + " run_deletion_tests()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bertopic-dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/test_reduction/test_delete.py b/tests/test_reduction/test_delete.py new file mode 100644 index 00000000..b5ae7661 --- /dev/null +++ b/tests/test_reduction/test_delete.py @@ -0,0 +1,84 @@ +import copy +import pytest + + +@pytest.mark.parametrize( + "model", + [ + ("kmeans_pca_topic_model"), + ("base_topic_model"), + ("custom_topic_model"), + ("merged_topic_model"), + ("reduced_topic_model"), + ("online_topic_model"), + ], +) +def test_delete(model, request): + topic_model = copy.deepcopy(request.getfixturevalue(model)) + nr_topics = len(set(topic_model.topics_)) + length_documents = len(topic_model.topics_) + + print("\n" + "="*50) + print(f"Testing model: {model}") + print(f"Initial number of topics: {nr_topics}") + print(f"Initial topics: {sorted(list(set(topic_model.topics_)))}") + print(f"Number of documents: {length_documents}") + print("="*50) + + # First deletion + topics_to_delete = [1, 2] + print(f"\nFirst deletion - attempting to delete topics: {topics_to_delete}") + topic_model.delete_topics(topics_to_delete) + + print(f"Topics after first deletion: {sorted(list(set(topic_model.topics_)))}") + print(f"Number of topics after first deletion: {len(set(topic_model.topics_))}") + + mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_)) + print(f"Topic mappings after first deletion: {mappings}") + + mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_] + print(f"First 10 mapped labels: {mapped_labels[:10]}") + print(f"First 10 model topics: {topic_model.topics_[:10]}") + + print("\nFirst deletion - Assertions:") + print(f"Expected topics: {nr_topics - 2}, Actual topics: {len(set(topic_model.topics_))}") + print(f"Expected documents: {length_documents}, Actual documents: {topic_model.get_topic_info().Count.sum()}") + + assert nr_topics == len(set(topic_model.topics_)) + 2 + assert topic_model.get_topic_info().Count.sum() == length_documents + if model == "online_topic_model": + assert mapped_labels == topic_model.topics_[950:] + else: + assert mapped_labels == topic_model.topics_ + + # Find two existing topics for second deletion + remaining_topics = sorted(list(set(topic_model.topics_))) + remaining_topics = [t for t in remaining_topics if t != -1] # Exclude outlier topic + topics_to_delete = remaining_topics[:2] # Take first two remaining topics + + print(f"\nSecond deletion - attempting to delete topics: {topics_to_delete}") + print(f"All remaining topics before second deletion: {remaining_topics}") + + # Second deletion + topic_model.delete_topics(topics_to_delete) + + print(f"Topics after second deletion: {sorted(list(set(topic_model.topics_)))}") + print(f"Number of topics after second deletion: {len(set(topic_model.topics_))}") + + mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_)) + print(f"Topic mappings after second deletion: {mappings}") + + mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_] + print(f"First 10 mapped labels: {mapped_labels[:10]}") + print(f"First 10 model topics: {topic_model.topics_[:10]}") + + print("\nSecond deletion - Assertions:") + print(f"Expected topics: {nr_topics - 4}, Actual topics: {len(set(topic_model.topics_))}") + print(f"Expected documents: {length_documents}, Actual documents: {topic_model.get_topic_info().Count.sum()}") + + assert nr_topics == len(set(topic_model.topics_)) + 4 + assert topic_model.get_topic_info().Count.sum() == length_documents + if model == "online_topic_model": + assert mapped_labels == topic_model.topics_[950:] + else: + assert mapped_labels == topic_model.topics_ From b8e01c5928497c3af3d78e8be5f8337a07b090b1 Mon Sep 17 00:00:00 2001 From: Shuang Chen Date: Tue, 1 Apr 2025 22:46:52 -0400 Subject: [PATCH 2/7] Updated delete_topics and test_delete.py to account for models without -1 topics --- bertopic/_bertopic.py | 108 +++++++---- dev_test_sc.ipynb | 267 ---------------------------- tests/test_reduction/test_delete.py | 55 ++---- 3 files changed, 84 insertions(+), 346 deletions(-) delete mode 100644 dev_test_sc.ipynb diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index 99e33240..eab008ae 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -827,7 +827,7 @@ def topics_over_time( nr_bins: The number of bins you want to create for the timestamps. The left interval will be chosen as the timestamp. An additional column will be created with the entire interval. - datetime_format: The datetime format of the timestamps if they are strings, eg “%d/%m/%Y”. + datetime_format: The datetime format of the timestamps if they are strings, eg "%d/%m/%Y". Set this to None if you want to have it automatically detect the format. See strftime documentation for more information on choices: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. @@ -1778,8 +1778,7 @@ def get_document_info( # the topic distributions document_info = topic_model.get_document_info(docs, df=df, metadata={"Topic_distribution": distributions}) - ``` - """ + """ check_documents_type(docs) if df is not None: document_info = df.copy() @@ -2172,47 +2171,78 @@ def delete_topics( self, topics_to_delete: List[int], ) -> None: - """Delete specified topics from the topic model. - - This method allows you to remove topics from the model by mapping them to a special - label (-1) and updating the internal topic representation accordingly. It also - updates the topic sizes and any relevant attributes to reflect the changes. + check_is_fitted(self) - Arguments: - topics_to_delete: A list of topic IDs to be deleted from the model. + topics_df = pd.DataFrame( + { + "Topic": self.topics_ + } + ) - Examples: - To delete topics 1 and 2 from the model: - - ```python - topic_model.delete_topics([1, 2]) - ``` - """ - check_is_fitted(self) + # Check if -1 exists in the current topics + had_outliers = -1 in set(self.topics_) # First map deleted topics to -1 - initial_mapping = {topic: -1 if topic in topics_to_delete else topic for topic in set(self.topics_)} - initial_mapping[-1] = -1 + mapping = {topic: -1 if topic in topics_to_delete else topic for topic in set(self.topics_)} + mapping[-1] = -1 + + # Track mappings and sizes of topics for merging topic embeddings + mappings = defaultdict(list) + for key, val in sorted(mapping.items()): + mappings[val].append(key) + mappings = { + topic_to: { + "topics_from": topics_from, + "topic_sizes": [self.topic_sizes_[topic] for topic in topics_from], + } + for topic_to, topics_from in mappings.items() + } - # Update topics to mark deletions - self.topics_ = [initial_mapping[topic] for topic in self.topics_] - self._update_topic_size(pd.DataFrame({"Topic": self.topics_})) + # If adding -1 for the first time, initialize its attributes + if not had_outliers and any(topic in topics_to_delete for topic in self.topics_): + # Initialize c_tf_idf for -1 topic (zeros) + if hasattr(self, "c_tf_idf_") and self.c_tf_idf_ is not None: + outlier_row = np.zeros((1, self.c_tf_idf_.shape[1])) + if isinstance(self.c_tf_idf_, sp.csr_matrix): + outlier_row = sp.csr_matrix(outlier_row) + self.c_tf_idf_ = sp.vstack([outlier_row, self.c_tf_idf_]) + else: + self.c_tf_idf_ = np.vstack([outlier_row, self.c_tf_idf_]) + + # Initialize topic embeddings for -1 topic (zeros) + if hasattr(self, "topic_embeddings_") and self.topic_embeddings_ is not None: + outlier_embedding = np.zeros((1, self.topic_embeddings_.shape[1])) + self.topic_embeddings_ = np.vstack([outlier_embedding, self.topic_embeddings_]) + + # Initialize topic representations for -1 topic: ('N/A', 1e-05)] + if hasattr(self, "topic_representations_") and self.topic_representations_ is not None: + self.topic_representations_[-1] = [('N/A', 1e-05)] + + # Initialize ctfidf model diagonal for -1 topic (ones) + if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None and hasattr(self.ctfidf_model, "_idf_diag"): + if isinstance(self.ctfidf_model._idf_diag, sp.csr_matrix): + n_features = self.ctfidf_model._idf_diag.shape[1] + outlier_diag = sp.csr_matrix(([1.0], ([0], [0])), shape=(1, n_features)) + self.ctfidf_model._idf_diag = sp.vstack([outlier_diag, self.ctfidf_model._idf_diag]) + else: + outlier_diag = np.ones(1) + self.ctfidf_model._idf_diag = np.concatenate([outlier_diag, self.ctfidf_model._idf_diag]) - # Create size-based mapping for remaining topics - df = pd.DataFrame(self.topic_sizes_.items(), columns=["Old_Topic", "Size"]).sort_values("Size", ascending=False) - df = df[df.Old_Topic != -1] # Exclude outliers - final_mapping = {**{-1: -1}, **dict(zip(df.Old_Topic, range(len(df))))} + # Initialize topic aspects for -1 topic (empty dict for each aspect) + if hasattr(self, "topic_aspects_") and self.topic_aspects_ is not None: + for aspect in self.topic_aspects_: + self.topic_aspects_[aspect][-1] = {} - # Update topics with final mapping - self.topics_ = [final_mapping[topic] for topic in self.topics_] - self.topic_mapper_.add_mappings(final_mapping, topic_model=self) - self._update_topic_size(pd.DataFrame({"Topic": self.topics_})) + # Continue with the rest of the delete_topics logic + topics_df.Topic = topics_df.Topic.map(mapping) + self.topic_mapper_.add_mappings(mapping, topic_model=self) + topics_df = self._sort_mappings_by_frequency(topics_df) + self._update_topic_size(topics_df) + self.probabilities_ = self._map_probabilities(self.probabilities_) - # Update probabilities if they exist - if self.probabilities_ is not None: - self.probabilities_ = self._map_probabilities(self.probabilities_) + final_mapping = self.topic_mapper_.get_mappings() - # Update dictionary-based attributes + # Update dictionary-based attributes to remove deleted topics for attr in ["topic_representations_", "topic_aspects_"]: if hasattr(self, attr) and getattr(self, attr) is not None: old_dict = getattr(self, attr) @@ -2220,7 +2250,7 @@ def delete_topics( # Handle nested dictionary for aspects new_dict = { aspect: { - final_mapping[old_topic]: content + (final_mapping[old_topic] if old_topic != -1 else -1): content for old_topic, content in topics.items() if old_topic not in topics_to_delete } @@ -2229,20 +2259,20 @@ def delete_topics( else: # Handle flat dictionary new_dict = { - final_mapping[old_topic]: content + (final_mapping[old_topic] if old_topic != -1 else -1): content for old_topic, content in old_dict.items() if old_topic not in topics_to_delete } setattr(self, attr, new_dict) - # Update array-based attributes using masks + # Update array-based attributes using masks to remove deleted topics for attr in ["topic_embeddings_", "c_tf_idf_"]: if hasattr(self, attr) and getattr(self, attr) is not None: matrix = getattr(self, attr) mask = np.array([topic not in topics_to_delete for topic in range(matrix.shape[0])]) setattr(self, attr, matrix[mask]) - # Update ctfidf model + # Update ctfidf model to remove deleted topics if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None: mask = np.array([topic not in topics_to_delete for topic in range(self.ctfidf_model._idf_diag.shape[0])]) self.ctfidf_model._idf_diag = self.ctfidf_model._idf_diag[mask] diff --git a/dev_test_sc.ipynb b/dev_test_sc.ipynb deleted file mode 100644 index 8307bb19..00000000 --- a/dev_test_sc.ipynb +++ /dev/null @@ -1,267 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "%autoreload 2\n", - "from bertopic import BERTopic" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "=== BERTopic Delete Topics Test Results ===\n", - "\n", - "Initializing and fitting BERTopic model...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-03-31 21:49:21,445 - BERTopic - Embedding - Transforming documents to embeddings.\n", - "Batches: 100%|██████████| 16/16 [00:12<00:00, 1.29it/s]\n", - "2025-03-31 21:49:34,425 - BERTopic - Embedding - Completed ✓\n", - "2025-03-31 21:49:34,426 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n", - "2025-03-31 21:49:35,290 - BERTopic - Dimensionality - Completed ✓\n", - "2025-03-31 21:49:35,291 - BERTopic - Cluster - Start clustering the reduced embeddings\n", - "2025-03-31 21:49:35,307 - BERTopic - Cluster - Completed ✓\n", - "2025-03-31 21:49:35,309 - BERTopic - Representation - Fine-tuning topics using representation models.\n", - "2025-03-31 21:49:35,493 - BERTopic - Representation - Completed ✓\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Initial State:\n", - "Total topics: 73\n", - "Topic sizes (top 5): {0: 11, 1: 11, 2: 10, 3: 10, 4: 9}\n", - "Matrix shapes - c_tf_idf: (74, 19877), embeddings: (74, 384)\n", - "\n", - "Deleting topics: [3, 4, 6]\n", - "Original sizes of topics to delete: [10, 9, 9]\n", - "\n", - "✓ Topic deletion completed\n", - "\n", - "=== Validation Results ===\n", - "\n", - "1. Topic Counts:\n", - " Before: 73 topics\n", - " After: 70 topics\n", - " Expected: 70 topics\n", - "\n", - "2. Size-based Ordering:\n", - " Topic IDs by size: [0, 1, 2, 4, 3, 5, 8, 6, 7, 13, 10, 9, 12, 11, 17, 15, 16, 14, 20, 21, 22, 18, 19, 23, 26, 28, 24, 27, 25, 29, 35, 36, 41, 37, 31, 30, 40, 38, 34, 33, 32, 42, 39, 43, 45, 50, 51, 49, 52, 53, 44, 48, 57, 47, 54, 55, 56, 46, 58, 61, 60, 59, 62, 63, 64, 65, 66, 67, 68, 69]\n", - " Sizes: [11, 11, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]\n", - " Correctly ordered by size: False\n", - " WARNING: Topics not properly ordered by size!\n", - "\n", - "3. Matrix Shapes:\n", - " c_tf_idf - Before: (74, 19877), After: (71, 19877)\n", - " embeddings - Before: (74, 384), After: (71, 384)\n", - "\n", - "4. Topic Representations:\n", - " Before: 74 representations\n", - " After: 71 representations\n", - "\n", - "5. Outlier Topic (-1):\n", - " Present in topics_: True\n", - " Present in sizes: True\n", - " Present in representations: True\n", - "\n", - "6. Topic Deletion and Reordering:\n", - " Expected topic count: 70\n", - " Actual topic count: 70\n", - " Sequential topic numbering: True\n", - " Sizes match: True\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from sklearn.datasets import fetch_20newsgroups\n", - "from umap import UMAP\n", - "from hdbscan import HDBSCAN\n", - "\n", - "def create_topic_model():\n", - " \"\"\"Create and fit a BERTopic model\"\"\"\n", - " # Create sample data\n", - " docs = fetch_20newsgroups(subset='all')['data'][:500]\n", - " \n", - " # Initialize BERTopic with specific models\n", - " umap = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=True, random_state=42)\n", - " hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='leaf', prediction_data=True)\n", - " topic_model = BERTopic(umap_model=umap, hdbscan_model=hdbscan_model, embedding_model='all-MiniLM-L6-v2', verbose=True)\n", - " \n", - " # Fit the model\n", - " topics, probs = topic_model.fit_transform(docs)\n", - " return topic_model\n", - "\n", - "def run_deletion_tests():\n", - " \"\"\"Run comprehensive tests for topic deletion and print detailed comparisons\"\"\"\n", - " print(\"\\n=== BERTopic Delete Topics Test Results ===\\n\")\n", - " \n", - " # Setup model\n", - " print(\"Initializing and fitting BERTopic model...\")\n", - " topic_model = create_topic_model()\n", - " \n", - " # Record initial state\n", - " initial_state = {\n", - " 'topic_sizes': topic_model.topic_sizes_.copy(),\n", - " 'topics_set': set(topic_model.topics_),\n", - " 'c_tf_idf_shape': topic_model.c_tf_idf_.shape,\n", - " 'embeddings_shape': topic_model.topic_embeddings_.shape,\n", - " 'representations_count': len(topic_model.topic_representations_),\n", - " }\n", - " \n", - " # Get topics sorted by size (excluding -1)\n", - " sorted_topics = sorted(\n", - " [(topic, size) for topic, size in initial_state['topic_sizes'].items() if topic != -1],\n", - " key=lambda x: x[1],\n", - " reverse=True\n", - " )\n", - " \n", - " print(\"\\nInitial State:\")\n", - " print(f\"Total topics: {len(sorted_topics)}\")\n", - " print(f\"Topic sizes (top 5): {dict(sorted_topics[:5])}\")\n", - " print(f\"Matrix shapes - c_tf_idf: {initial_state['c_tf_idf_shape']}, embeddings: {initial_state['embeddings_shape']}\")\n", - " \n", - " # Select topics to delete (4th, 5th, 6th largest)\n", - " topics_to_delete = [item[0] for item in sorted_topics[3:6]]\n", - " print(f\"\\nDeleting topics: {topics_to_delete}\")\n", - " print(f\"Original sizes of topics to delete: {[initial_state['topic_sizes'][t] for t in topics_to_delete]}\")\n", - " \n", - " # Perform deletion\n", - " try:\n", - " topic_model.delete_topics(topics_to_delete)\n", - " print(\"\\n✓ Topic deletion completed\")\n", - " except Exception as e:\n", - " print(f\"\\n❌ Error during topic deletion: {str(e)}\")\n", - " return\n", - " \n", - " # Analyze results\n", - " print(\"\\n=== Validation Results ===\\n\")\n", - " \n", - " # 1. Check topic counts\n", - " new_topics = set(topic_model.topics_) - {-1}\n", - " print(f\"1. Topic Counts:\")\n", - " print(f\" Before: {len(sorted_topics)} topics\")\n", - " print(f\" After: {len(new_topics)} topics\")\n", - " print(f\" Expected: {len(sorted_topics) - len(topics_to_delete)} topics\")\n", - " \n", - " # 2. Check size ordering\n", - " new_sorted_topics = sorted(\n", - " [(topic, size) for topic, size in topic_model.topic_sizes_.items() if topic != -1],\n", - " key=lambda x: x[1],\n", - " reverse=True\n", - " )\n", - " \n", - " print(\"\\n2. Size-based Ordering:\")\n", - " print(f\" Topic IDs by size: {[t[0] for t in new_sorted_topics]}\")\n", - " print(f\" Sizes: {[t[1] for t in new_sorted_topics]}\")\n", - " is_ordered = all(i == t[0] for i, t in enumerate(new_sorted_topics))\n", - " print(f\" Correctly ordered by size: {is_ordered}\")\n", - " if not is_ordered:\n", - " print(\" WARNING: Topics not properly ordered by size!\")\n", - " \n", - " # 3. Check matrix shapes\n", - " print(\"\\n3. Matrix Shapes:\")\n", - " print(f\" c_tf_idf - Before: {initial_state['c_tf_idf_shape']}, After: {topic_model.c_tf_idf_.shape}\")\n", - " print(f\" embeddings - Before: {initial_state['embeddings_shape']}, After: {topic_model.topic_embeddings_.shape}\")\n", - " \n", - " # 4. Check representations\n", - " print(\"\\n4. Topic Representations:\")\n", - " print(f\" Before: {initial_state['representations_count']} representations\")\n", - " print(f\" After: {len(topic_model.topic_representations_)} representations\")\n", - " \n", - " # 5. Check outlier topic\n", - " print(\"\\n5. Outlier Topic (-1):\")\n", - " print(f\" Present in topics_: {-1 in topic_model.topics_}\")\n", - " print(f\" Present in sizes: {-1 in topic_model.topic_sizes_}\")\n", - " print(f\" Present in representations: {-1 in topic_model.topic_representations_}\")\n", - " \n", - " # 6. Verify topic deletion and reordering\n", - " print(\"\\n6. Topic Deletion and Reordering:\")\n", - " expected_topic_count = len(sorted_topics) - len(topics_to_delete)\n", - " actual_topic_count = len([t for t in topic_model.topic_sizes_.keys() if t != -1])\n", - " \n", - " print(f\" Expected topic count: {expected_topic_count}\")\n", - " print(f\" Actual topic count: {actual_topic_count}\")\n", - " \n", - " # Check sequential numbering\n", - " expected_topic_numbers = set(range(expected_topic_count))\n", - " actual_topic_numbers = set(t for t in topic_model.topic_sizes_.keys() if t != -1)\n", - " sequential_numbering = expected_topic_numbers == actual_topic_numbers\n", - " \n", - " print(f\" Sequential topic numbering: {sequential_numbering}\")\n", - " if not sequential_numbering:\n", - " print(f\" Expected topics: {sorted(expected_topic_numbers)}\")\n", - " print(f\" Actual topics: {sorted(actual_topic_numbers)}\")\n", - " \n", - " # Check sizes match (excluding deleted topics)\n", - " expected_sizes = sorted([size for topic, size in sorted_topics if topic not in topics_to_delete], reverse=True)\n", - " actual_sizes = sorted([size for topic, size in topic_model.topic_sizes_.items() if topic != -1], reverse=True)\n", - " sizes_match = expected_sizes == actual_sizes\n", - " \n", - " print(f\" Sizes match: {sizes_match}\")\n", - " if not sizes_match:\n", - " print(f\" Expected sizes: {expected_sizes}\")\n", - " print(f\" Actual sizes: {actual_sizes}\")\n", - "\n", - " # Update the validations dictionary\n", - " validations = {\n", - " \"Topic count correct\": actual_topic_count == expected_topic_count,\n", - " \"Size ordering correct\": is_ordered,\n", - " \"Matrix shapes consistent\": topic_model.c_tf_idf_.shape[0] == topic_model.topic_embeddings_.shape[0],\n", - " \"Sequential topic numbering\": sequential_numbering,\n", - " \"Topic sizes preserved\": sizes_match,\n", - " \"Outlier preserved\": all([-1 in topic_model.topics_, -1 in topic_model.topic_sizes_, -1 in topic_model.topic_representations_])\n", - " }\n", - "\n", - "if __name__ == \"__main__\":\n", - " run_deletion_tests()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "bertopic-dev", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tests/test_reduction/test_delete.py b/tests/test_reduction/test_delete.py index b5ae7661..8c5514e2 100644 --- a/tests/test_reduction/test_delete.py +++ b/tests/test_reduction/test_delete.py @@ -17,35 +17,20 @@ def test_delete(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) nr_topics = len(set(topic_model.topics_)) length_documents = len(topic_model.topics_) - - print("\n" + "="*50) - print(f"Testing model: {model}") - print(f"Initial number of topics: {nr_topics}") - print(f"Initial topics: {sorted(list(set(topic_model.topics_)))}") - print(f"Number of documents: {length_documents}") - print("="*50) # First deletion topics_to_delete = [1, 2] - print(f"\nFirst deletion - attempting to delete topics: {topics_to_delete}") topic_model.delete_topics(topics_to_delete) - - print(f"Topics after first deletion: {sorted(list(set(topic_model.topics_)))}") - print(f"Number of topics after first deletion: {len(set(topic_model.topics_))}") - mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_)) - print(f"Topic mappings after first deletion: {mappings}") - mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_] - print(f"First 10 mapped labels: {mapped_labels[:10]}") - print(f"First 10 model topics: {topic_model.topics_[:10]}") - - print("\nFirst deletion - Assertions:") - print(f"Expected topics: {nr_topics - 2}, Actual topics: {len(set(topic_model.topics_))}") - print(f"Expected documents: {length_documents}, Actual documents: {topic_model.get_topic_info().Count.sum()}") - assert nr_topics == len(set(topic_model.topics_)) + 2 - assert topic_model.get_topic_info().Count.sum() == length_documents + if model == "online_topic_model" or model == "kmeans_pca_topic_model": + assert nr_topics == len(set(topic_model.topics_)) + 1 + assert topic_model.get_topic_info().Count.sum() == length_documents + else: + assert nr_topics == len(set(topic_model.topics_)) + 2 + assert topic_model.get_topic_info().Count.sum() == length_documents + if model == "online_topic_model": assert mapped_labels == topic_model.topics_[950:] else: @@ -55,29 +40,19 @@ def test_delete(model, request): remaining_topics = sorted(list(set(topic_model.topics_))) remaining_topics = [t for t in remaining_topics if t != -1] # Exclude outlier topic topics_to_delete = remaining_topics[:2] # Take first two remaining topics - - print(f"\nSecond deletion - attempting to delete topics: {topics_to_delete}") - print(f"All remaining topics before second deletion: {remaining_topics}") - + # Second deletion topic_model.delete_topics(topics_to_delete) - - print(f"Topics after second deletion: {sorted(list(set(topic_model.topics_)))}") - print(f"Number of topics after second deletion: {len(set(topic_model.topics_))}") - mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_)) - print(f"Topic mappings after second deletion: {mappings}") - mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_] - print(f"First 10 mapped labels: {mapped_labels[:10]}") - print(f"First 10 model topics: {topic_model.topics_[:10]}") - print("\nSecond deletion - Assertions:") - print(f"Expected topics: {nr_topics - 4}, Actual topics: {len(set(topic_model.topics_))}") - print(f"Expected documents: {length_documents}, Actual documents: {topic_model.get_topic_info().Count.sum()}") - - assert nr_topics == len(set(topic_model.topics_)) + 4 - assert topic_model.get_topic_info().Count.sum() == length_documents + if model == "online_topic_model" or model == "kmeans_pca_topic_model": + assert nr_topics == len(set(topic_model.topics_)) + 3 + assert topic_model.get_topic_info().Count.sum() == length_documents + else: + assert nr_topics == len(set(topic_model.topics_)) + 4 + assert topic_model.get_topic_info().Count.sum() == length_documents + if model == "online_topic_model": assert mapped_labels == topic_model.topics_[950:] else: From d85959e2b3f1e3190319a16fc99094cd7180223f Mon Sep 17 00:00:00 2001 From: Shuang Chen Date: Thu, 3 Apr 2025 10:23:50 -0400 Subject: [PATCH 3/7] minor refactor/format updates --- bertopic/_bertopic.py | 56 ++++++++++++++--------------- tests/test_reduction/test_delete.py | 2 +- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index eab008ae..d672155e 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -1778,7 +1778,7 @@ def get_document_info( # the topic distributions document_info = topic_model.get_document_info(docs, df=df, metadata={"Topic_distribution": distributions}) - """ + """ check_documents_type(docs) if df is not None: document_info = df.copy() @@ -2173,31 +2173,11 @@ def delete_topics( ) -> None: check_is_fitted(self) - topics_df = pd.DataFrame( - { - "Topic": self.topics_ - } - ) + topics_df = pd.DataFrame({"Topic": self.topics_}) # Check if -1 exists in the current topics had_outliers = -1 in set(self.topics_) - # First map deleted topics to -1 - mapping = {topic: -1 if topic in topics_to_delete else topic for topic in set(self.topics_)} - mapping[-1] = -1 - - # Track mappings and sizes of topics for merging topic embeddings - mappings = defaultdict(list) - for key, val in sorted(mapping.items()): - mappings[val].append(key) - mappings = { - topic_to: { - "topics_from": topics_from, - "topic_sizes": [self.topic_sizes_[topic] for topic in topics_from], - } - for topic_to, topics_from in mappings.items() - } - # If adding -1 for the first time, initialize its attributes if not had_outliers and any(topic in topics_to_delete for topic in self.topics_): # Initialize c_tf_idf for -1 topic (zeros) @@ -2205,21 +2185,23 @@ def delete_topics( outlier_row = np.zeros((1, self.c_tf_idf_.shape[1])) if isinstance(self.c_tf_idf_, sp.csr_matrix): outlier_row = sp.csr_matrix(outlier_row) - self.c_tf_idf_ = sp.vstack([outlier_row, self.c_tf_idf_]) - else: - self.c_tf_idf_ = np.vstack([outlier_row, self.c_tf_idf_]) + self.c_tf_idf_ = sp.vstack([outlier_row, self.c_tf_idf_]) # Initialize topic embeddings for -1 topic (zeros) if hasattr(self, "topic_embeddings_") and self.topic_embeddings_ is not None: outlier_embedding = np.zeros((1, self.topic_embeddings_.shape[1])) self.topic_embeddings_ = np.vstack([outlier_embedding, self.topic_embeddings_]) - # Initialize topic representations for -1 topic: ('N/A', 1e-05)] + # Initialize topic representations for -1 topic: ("N/A - OUTLIER TOPIC", 1e-05) if hasattr(self, "topic_representations_") and self.topic_representations_ is not None: - self.topic_representations_[-1] = [('N/A', 1e-05)] + self.topic_representations_[-1] = [("N/A - OUTLIER TOPIC", 1e-05)] # Initialize ctfidf model diagonal for -1 topic (ones) - if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None and hasattr(self.ctfidf_model, "_idf_diag"): + if ( + hasattr(self, "ctfidf_model") + and self.ctfidf_model is not None + and hasattr(self.ctfidf_model, "_idf_diag") + ): if isinstance(self.ctfidf_model._idf_diag, sp.csr_matrix): n_features = self.ctfidf_model._idf_diag.shape[1] outlier_diag = sp.csr_matrix(([1.0], ([0], [0])), shape=(1, n_features)) @@ -2233,7 +2215,23 @@ def delete_topics( for aspect in self.topic_aspects_: self.topic_aspects_[aspect][-1] = {} - # Continue with the rest of the delete_topics logic + # First map deleted topics to -1 + mapping = {topic: -1 if topic in topics_to_delete else topic for topic in set(self.topics_)} + mapping[-1] = -1 + + # Track mappings and sizes of topics for merging topic embeddings + mappings = defaultdict(list) + for key, val in sorted(mapping.items()): + mappings[val].append(key) + mappings = { + topic_to: { + "topics_from": topics_from, + "topic_sizes": [self.topic_sizes_[topic] for topic in topics_from], + } + for topic_to, topics_from in mappings.items() + } + + # remove deleted topics and update attributes topics_df.Topic = topics_df.Topic.map(mapping) self.topic_mapper_.add_mappings(mapping, topic_model=self) topics_df = self._sort_mappings_by_frequency(topics_df) diff --git a/tests/test_reduction/test_delete.py b/tests/test_reduction/test_delete.py index 8c5514e2..188e1ffb 100644 --- a/tests/test_reduction/test_delete.py +++ b/tests/test_reduction/test_delete.py @@ -23,7 +23,7 @@ def test_delete(model, request): topic_model.delete_topics(topics_to_delete) mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_)) mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_] - + if model == "online_topic_model" or model == "kmeans_pca_topic_model": assert nr_topics == len(set(topic_model.topics_)) + 1 assert topic_model.get_topic_info().Count.sum() == length_documents From 15e9cfeefe5c5b2b4526bb7e8ed474c22bc26883 Mon Sep 17 00:00:00 2001 From: Shuang Chen Date: Fri, 18 Apr 2025 15:06:13 -0400 Subject: [PATCH 4/7] refactor delete_topics and adjust for custom_labels_ --- bertopic/_bertopic.py | 106 +++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index d672155e..8d7712c9 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -2171,6 +2171,14 @@ def delete_topics( self, topics_to_delete: List[int], ) -> None: + """Delete topics from the topic model. + + The deleted topics will be mapped to -1 (outlier topic). Core topic attributes + like topic embeddings and c-TF-IDF will be automatically updated. + + Arguments: + topics_to_delete: List of topics to delete + """ check_is_fitted(self) topics_df = pd.DataFrame({"Topic": self.topics_}) @@ -2180,37 +2188,25 @@ def delete_topics( # If adding -1 for the first time, initialize its attributes if not had_outliers and any(topic in topics_to_delete for topic in self.topics_): - # Initialize c_tf_idf for -1 topic (zeros) - if hasattr(self, "c_tf_idf_") and self.c_tf_idf_ is not None: - outlier_row = np.zeros((1, self.c_tf_idf_.shape[1])) - if isinstance(self.c_tf_idf_, sp.csr_matrix): - outlier_row = sp.csr_matrix(outlier_row) - self.c_tf_idf_ = sp.vstack([outlier_row, self.c_tf_idf_]) + # Initialize c-TF-IDF for -1 topic (zeros) + outlier_row = np.zeros((1, self.c_tf_idf_.shape[1])) + outlier_row = sp.csr_matrix(outlier_row) + self.c_tf_idf_ = sp.vstack([outlier_row, self.c_tf_idf_]) # Initialize topic embeddings for -1 topic (zeros) - if hasattr(self, "topic_embeddings_") and self.topic_embeddings_ is not None: - outlier_embedding = np.zeros((1, self.topic_embeddings_.shape[1])) - self.topic_embeddings_ = np.vstack([outlier_embedding, self.topic_embeddings_]) + outlier_embedding = np.zeros((1, self.topic_embeddings_.shape[1])) + self.topic_embeddings_ = np.vstack([outlier_embedding, self.topic_embeddings_]) - # Initialize topic representations for -1 topic: ("N/A - OUTLIER TOPIC", 1e-05) - if hasattr(self, "topic_representations_") and self.topic_representations_ is not None: - self.topic_representations_[-1] = [("N/A - OUTLIER TOPIC", 1e-05)] + # Initialize topic representations for -1 topic: ("", 1e-05) + self.topic_representations_[-1] = [("", 1e-05)] - # Initialize ctfidf model diagonal for -1 topic (ones) - if ( - hasattr(self, "ctfidf_model") - and self.ctfidf_model is not None - and hasattr(self.ctfidf_model, "_idf_diag") - ): - if isinstance(self.ctfidf_model._idf_diag, sp.csr_matrix): - n_features = self.ctfidf_model._idf_diag.shape[1] - outlier_diag = sp.csr_matrix(([1.0], ([0], [0])), shape=(1, n_features)) - self.ctfidf_model._idf_diag = sp.vstack([outlier_diag, self.ctfidf_model._idf_diag]) - else: - outlier_diag = np.ones(1) - self.ctfidf_model._idf_diag = np.concatenate([outlier_diag, self.ctfidf_model._idf_diag]) + # Initialize ctfidf model diagonal for -1 topic (ones) if it exists + if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None: + n_features = self.ctfidf_model._idf_diag.shape[1] + outlier_diag = sp.csr_matrix(([1.0], ([0], [0])), shape=(1, n_features)) + self.ctfidf_model._idf_diag = sp.vstack([outlier_diag, self.ctfidf_model._idf_diag]) - # Initialize topic aspects for -1 topic (empty dict for each aspect) + # Initialize topic aspects for -1 topic (empty dict for each aspect) if they exist if hasattr(self, "topic_aspects_") and self.topic_aspects_ is not None: for aspect in self.topic_aspects_: self.topic_aspects_[aspect][-1] = {} @@ -2241,36 +2237,42 @@ def delete_topics( final_mapping = self.topic_mapper_.get_mappings() # Update dictionary-based attributes to remove deleted topics - for attr in ["topic_representations_", "topic_aspects_"]: - if hasattr(self, attr) and getattr(self, attr) is not None: - old_dict = getattr(self, attr) - if attr == "topic_aspects_": - # Handle nested dictionary for aspects - new_dict = { - aspect: { - (final_mapping[old_topic] if old_topic != -1 else -1): content - for old_topic, content in topics.items() - if old_topic not in topics_to_delete - } - for aspect, topics in old_dict.items() - } - else: - # Handle flat dictionary - new_dict = { - (final_mapping[old_topic] if old_topic != -1 else -1): content - for old_topic, content in old_dict.items() - if old_topic not in topics_to_delete - } - setattr(self, attr, new_dict) + # Handle topic_aspects_ if it exists + if hasattr(self, "topic_aspects_") and self.topic_aspects_ is not None: + new_aspects = { + aspect: { + (final_mapping[old_topic] if old_topic != -1 else -1): content + for old_topic, content in topics.items() + if old_topic not in topics_to_delete + } + for aspect, topics in self.topic_aspects_.items() + } + self.topic_aspects_ = new_aspects + + # Update custom labels if they exist + if hasattr(self, "custom_labels_") and self.custom_labels_ is not None: + new_labels = { + (final_mapping[old_topic] if old_topic != -1 else -1): label + for old_topic, label in self.custom_labels_.items() + if old_topic not in topics_to_delete + } + self.custom_labels_ = new_labels + + # Update topic representations + new_representations = { + (final_mapping[old_topic] if old_topic != -1 else -1): content + for old_topic, content in self.topic_representations_.items() + if old_topic not in topics_to_delete + } + self.topic_representations_ = new_representations # Update array-based attributes using masks to remove deleted topics for attr in ["topic_embeddings_", "c_tf_idf_"]: - if hasattr(self, attr) and getattr(self, attr) is not None: - matrix = getattr(self, attr) - mask = np.array([topic not in topics_to_delete for topic in range(matrix.shape[0])]) - setattr(self, attr, matrix[mask]) + matrix = getattr(self, attr) + mask = np.array([topic not in topics_to_delete for topic in range(matrix.shape[0])]) + setattr(self, attr, matrix[mask]) - # Update ctfidf model to remove deleted topics + # Update ctfidf model to remove deleted topics if it exists if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None: mask = np.array([topic not in topics_to_delete for topic in range(self.ctfidf_model._idf_diag.shape[0])]) self.ctfidf_model._idf_diag = self.ctfidf_model._idf_diag[mask] From 31fd95ee3ff7529fab2e84f95488adc7bbe02183 Mon Sep 17 00:00:00 2001 From: Shuang Chen Date: Fri, 25 Apr 2025 20:42:42 -0400 Subject: [PATCH 5/7] debug and update delete_topics --- bertopic/_bertopic.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index 8d7712c9..ec04c23d 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -2200,6 +2200,18 @@ def delete_topics( # Initialize topic representations for -1 topic: ("", 1e-05) self.topic_representations_[-1] = [("", 1e-05)] + # Initialize representative docs for -1 topic (empty list) + self.representative_docs_[-1] = [] + + # Initialize representative images for -1 topic if images are being used + if self.representative_images_ is not None: + outlier_image = np.zeros((1, self.representative_images_.shape[1])) + self.representative_images_ = np.vstack([outlier_image, self.representative_images_]) + + # Initialize custom labels for -1 topic if they exist + if hasattr(self, "custom_labels_") and self.custom_labels_ is not None: + self.custom_labels_[-1] = "" + # Initialize ctfidf model diagonal for -1 topic (ones) if it exists if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None: n_features = self.ctfidf_model._idf_diag.shape[1] @@ -2234,7 +2246,7 @@ def delete_topics( self._update_topic_size(topics_df) self.probabilities_ = self._map_probabilities(self.probabilities_) - final_mapping = self.topic_mapper_.get_mappings() + final_mapping = self.topic_mapper_.get_mappings(original_topics=False) # Update dictionary-based attributes to remove deleted topics # Handle topic_aspects_ if it exists @@ -2266,6 +2278,20 @@ def delete_topics( } self.topic_representations_ = new_representations + # Update representative docs if they exist + new_representative_docs = { + (final_mapping[old_topic] if old_topic != -1 else -1): docs + for old_topic, docs in self.representative_docs_.items() + if old_topic not in topics_to_delete + } + self.representative_docs_ = new_representative_docs + + # Update representative images if they exist + if self.representative_images_ is not None: + # Create a mask for non-deleted topics + mask = np.array([topic not in topics_to_delete for topic in range(len(self.representative_images_))]) + self.representative_images_ = self.representative_images_[mask] if mask.any() else None + # Update array-based attributes using masks to remove deleted topics for attr in ["topic_embeddings_", "c_tf_idf_"]: matrix = getattr(self, attr) From 1bc6593ded1acb3fd6b046a08aedfcff6ca912f1 Mon Sep 17 00:00:00 2001 From: Shuang Chen Date: Tue, 27 May 2025 17:05:10 -0400 Subject: [PATCH 6/7] draft delete adjustment for zero shot --- .gitignore | 1 + bertopic/_bertopic.py | 13 +- delete_topics_test_sc.ipynb | 1277 +++++++++++++++++++++++++++++++++++ 3 files changed, 1288 insertions(+), 3 deletions(-) create mode 100644 delete_topics_test_sc.ipynb diff --git a/.gitignore b/.gitignore index 77c026df..e7058c9e 100644 --- a/.gitignore +++ b/.gitignore @@ -59,6 +59,7 @@ docs/_build/ # Jupyter Notebook .ipynb_checkpoints +notebooks/ # IPython profile_default/ diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index ec04c23d..499fd584 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -18,6 +18,7 @@ import numpy as np import pandas as pd import scipy.sparse as sp +from copy import deepcopy from tqdm import tqdm from pathlib import Path @@ -2241,7 +2242,7 @@ def delete_topics( # remove deleted topics and update attributes topics_df.Topic = topics_df.Topic.map(mapping) - self.topic_mapper_.add_mappings(mapping, topic_model=self) + self.topic_mapper_.add_mappings(mapping, topic_model=deepcopy(self)) topics_df = self._sort_mappings_by_frequency(topics_df) self._update_topic_size(topics_df) self.probabilities_ = self._map_probabilities(self.probabilities_) @@ -4921,6 +4922,7 @@ def add_mappings(self, mappings: Mapping[int, int], topic_model: BERTopic): for key, value in topics_to_map.items(): mapping[value].append(key) + print(f'len of mapping: {len(mapping)}') for topic_to, topics_from in mapping.items(): # which of the original topics are zero-shot zeroshot_topic_ids = [ @@ -4935,20 +4937,25 @@ def add_mappings(self, mappings: Mapping[int, int], topic_model: BERTopic): topic_model.zeroshot_topic_list[topic_model._topic_id_to_zeroshot_topic_idx[topic_id]] for topic_id in zeroshot_topic_ids ] + print(f'topics_from: {topics_from} and topic_to: {topic_to}') + print(f'zeroshot_labels: {zeroshot_labels}') zeroshot_embeddings = topic_model._extract_embeddings(zeroshot_labels) cosine_similarities = cosine_similarity( zeroshot_embeddings, [topic_model.topic_embeddings_[topic_to]] ).flatten() + print(f'cosine_similarities: {cosine_similarities}') best_zeroshot_topic_idx = np.argmax(cosine_similarities) best_cosine_similarity = cosine_similarities[best_zeroshot_topic_idx] - + print(f'best_cosine_similarity: {best_cosine_similarity}') if best_cosine_similarity >= topic_model.zeroshot_min_similarity: # Using the topic ID from before mapping, get the idx into the zeroshot topic list new_topic_id_to_zeroshot_topic_idx[topic_to] = topic_model._topic_id_to_zeroshot_topic_idx[ zeroshot_topic_ids[best_zeroshot_topic_idx] ] - + print(f'new_topic_id_to_zeroshot_topic_idx: {new_topic_id_to_zeroshot_topic_idx}') + # print('running without updating topic_model._topic_id_to_zeroshot_topic_idx!') topic_model._topic_id_to_zeroshot_topic_idx = new_topic_id_to_zeroshot_topic_idx + print(f'after add_mappings: topic_model._topic_id_to_zeroshot_topic_idx: {topic_model._topic_id_to_zeroshot_topic_idx}') def add_new_topics(self, mappings: Mapping[int, int]): """Add new row(s) of topic mappings. diff --git a/delete_topics_test_sc.ipynb b/delete_topics_test_sc.ipynb new file mode 100644 index 00000000..dae2157a --- /dev/null +++ b/delete_topics_test_sc.ipynb @@ -0,0 +1,1277 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# !python -m pip install -e \".[dev]\"" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\sc305\\miniforge3\\envs\\bertopic-sc\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "from bertopic import BERTopic" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test Case from MaartenGr No. 2 - Zero Shot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Before delete:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-05-27 16:51:28,462 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n" + ] + } + ], + "source": [ + "from datasets import load_dataset\n", + "from sentence_transformers import SentenceTransformer\n", + "from hdbscan import HDBSCAN\n", + "from umap import UMAP\n", + "\n", + "from bertopic import BERTopic\n", + "from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance\n", + "\n", + "docs = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"][\"abstract\"][:20_000]\n", + "\n", + "# Pre-calculate embeddings\n", + "embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n", + "# embeddings = embedding_model.encode(docs, show_progress_bar=True)\n", + "\n", + "# # # Save the embeddings to a file\n", + "# np.save(\"test_ArXiv_embeddings_zeroshot_example.npy\", embeddings)\n", + "loaded_embeddings = np.load(\"test_ArXiv_embeddings_zeroshot_example.npy\")\n", + "\n", + "# Use sub-models\n", + "umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0, random_state=42)\n", + "hdbscan_model = HDBSCAN(min_samples=5, gen_min_span_tree=True, prediction_data=True)\n", + "\n", + "# Representation models\n", + "keybert_model = KeyBERTInspired()\n", + "mmr_model = MaximalMarginalRelevance(diversity=0.3)\n", + "representation_model = {\n", + " \"KeyBERT\": keybert_model,\n", + " \"MMR\": mmr_model,\n", + "}\n", + "\n", + "# BERTopic\n", + "topic_model = BERTopic(\n", + " embedding_model=embedding_model,\n", + " umap_model=umap_model,\n", + " hdbscan_model=hdbscan_model,\n", + " zeroshot_topic_list=[\"topic modeling\", \"large language models\"],\n", + " verbose=True,\n", + ").fit(docs, loaded_embeddings)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicCountNameRepresentationRepresentative_Docs
0-17624-1_the_of_to_and[the, of, to, and, in, we, that, is, for, lear...[ A crucial task in system identification pro...
104topic modeling[topic, papers, svd, topics, allocation, conta...[ Topic models have emerged as fundamental to...
21241_modulation_radio_channel_transmitters[modulation, radio, channel, transmitters, sig...[ We survey the latest advances in machine le...
3272_pain_discomfort_diagnostic_facial[pain, discomfort, diagnostic, facial, intensi...[ Pain is a complex and subjective experience...
4363_quantum_entanglement_wave_annealers[quantum, entanglement, wave, annealers, conva...[ Modern deep learning has enabled unpreceden...
\n", + "
" + ], + "text/plain": [ + " Topic Count Name \\\n", + "0 -1 7624 -1_the_of_to_and \n", + "1 0 4 topic modeling \n", + "2 1 24 1_modulation_radio_channel_transmitters \n", + "3 2 7 2_pain_discomfort_diagnostic_facial \n", + "4 3 6 3_quantum_entanglement_wave_annealers \n", + "\n", + " Representation \\\n", + "0 [the, of, to, and, in, we, that, is, for, lear... \n", + "1 [topic, papers, svd, topics, allocation, conta... \n", + "2 [modulation, radio, channel, transmitters, sig... \n", + "3 [pain, discomfort, diagnostic, facial, intensi... \n", + "4 [quantum, entanglement, wave, annealers, conva... \n", + "\n", + " Representative_Docs \n", + "0 [ A crucial task in system identification pro... \n", + "1 [ Topic models have emerged as fundamental to... \n", + "2 [ We survey the latest advances in machine le... \n", + "3 [ Pain is a complex and subjective experience... \n", + "4 [ Modern deep learning has enabled unpreceden... " + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "topic_model.get_topic_info()[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicCountNameRepresentationRepresentative_Docs
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [Topic, Count, Name, Representation, Representative_Docs]\n", + "Index: []" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "original_topic_info = topic_model.get_topic_info()\n", + "original_topic_info[original_topic_info['Name']=='large language models']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicCountNameRepresentationRepresentative_Docs
0-17624-1_the_of_to_and[the, of, to, and, in, we, that, is, for, lear...[ A crucial task in system identification pro...
264263510263_generative_gan_gans_generator[generative, gan, gans, generator, adversarial...[ Generative Adversarial Networks (GANs) are ...
324323403323_quantization_hardware_pruning_gpu[quantization, hardware, pruning, gpu, precisi...[ Deep neural networks (DNNs) are used by dif...
999826998_recommendation_user_items_item[recommendation, user, items, item, recommende...[ Matrix factorization techniques have been w...
292819628_privacy_private_differential_differentially[privacy, private, differential, differentiall...[ The process of data mining with differentia...
\n", + "
" + ], + "text/plain": [ + " Topic Count Name \\\n", + "0 -1 7624 -1_the_of_to_and \n", + "264 263 510 263_generative_gan_gans_generator \n", + "324 323 403 323_quantization_hardware_pruning_gpu \n", + "99 98 269 98_recommendation_user_items_item \n", + "29 28 196 28_privacy_private_differential_differentially \n", + "\n", + " Representation \\\n", + "0 [the, of, to, and, in, we, that, is, for, lear... \n", + "264 [generative, gan, gans, generator, adversarial... \n", + "324 [quantization, hardware, pruning, gpu, precisi... \n", + "99 [recommendation, user, items, item, recommende... \n", + "29 [privacy, private, differential, differentiall... \n", + "\n", + " Representative_Docs \n", + "0 [ A crucial task in system identification pro... \n", + "264 [ Generative Adversarial Networks (GANs) are ... \n", + "324 [ Deep neural networks (DNNs) are used by dif... \n", + "99 [ Matrix factorization techniques have been w... \n", + "29 [ The process of data mining with differentia... " + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "original_topic_info.sort_values('Count', ascending=False)[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicCountNameRepresentationRepresentative_Docs
104topic modeling[topic, papers, svd, topics, allocation, conta...[ Topic models have emerged as fundamental to...
\n", + "
" + ], + "text/plain": [ + " Topic Count Name \\\n", + "1 0 4 topic modeling \n", + "\n", + " Representation \\\n", + "1 [topic, papers, svd, topics, allocation, conta... \n", + "\n", + " Representative_Docs \n", + "1 [ Topic models have emerged as fundamental to... " + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "original_topic_info[original_topic_info['Name']=='topic modeling']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicCountNameRepresentationRepresentative_Docs
21241_modulation_radio_channel_transmitters[modulation, radio, channel, transmitters, sig...[ We survey the latest advances in machine le...
\n", + "
" + ], + "text/plain": [ + " Topic Count Name \\\n", + "2 1 24 1_modulation_radio_channel_transmitters \n", + "\n", + " Representation \\\n", + "2 [modulation, radio, channel, transmitters, sig... \n", + "\n", + " Representative_Docs \n", + "2 [ We survey the latest advances in machine le... " + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "original_topic_info[original_topic_info['Name'].str.contains('modulation_radio_channel_transmitters')]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run delete:" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicCountNameRepresentationRepresentative_Docs
0-17624-1_the_of_to_and[the, of, to, and, in, we, that, is, for, lear...[ A crucial task in system identification pro...
104topic modeling[topic, papers, svd, topics, allocation, conta...[ Topic models have emerged as fundamental to...
21241_modulation_radio_channel_transmitters[modulation, radio, channel, transmitters, sig...[ We survey the latest advances in machine le...
3272_pain_discomfort_diagnostic_facial[pain, discomfort, diagnostic, facial, intensi...[ Pain is a complex and subjective experience...
4363_quantum_entanglement_wave_annealers[quantum, entanglement, wave, annealers, conva...[ Modern deep learning has enabled unpreceden...
\n", + "
" + ], + "text/plain": [ + " Topic Count Name \\\n", + "0 -1 7624 -1_the_of_to_and \n", + "1 0 4 topic modeling \n", + "2 1 24 1_modulation_radio_channel_transmitters \n", + "3 2 7 2_pain_discomfort_diagnostic_facial \n", + "4 3 6 3_quantum_entanglement_wave_annealers \n", + "\n", + " Representation \\\n", + "0 [the, of, to, and, in, we, that, is, for, lear... \n", + "1 [topic, papers, svd, topics, allocation, conta... \n", + "2 [modulation, radio, channel, transmitters, sig... \n", + "3 [pain, discomfort, diagnostic, facial, intensi... \n", + "4 [quantum, entanglement, wave, annealers, conva... \n", + "\n", + " Representative_Docs \n", + "0 [ A crucial task in system identification pro... \n", + "1 [ Topic models have emerged as fundamental to... \n", + "2 [ We survey the latest advances in machine le... \n", + "3 [ Pain is a complex and subjective experience... \n", + "4 [ Modern deep learning has enabled unpreceden... " + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "topic_model.get_topic_info()[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "# topic_model.merge_topics(docs, [1, 2])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# topic_model.delete_topics([0])" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len of mapping: 375\n", + "topics_from: [np.int64(0)] and topic_to: 0\n", + "zeroshot_labels: ['topic modeling']\n", + "cosine_similarities: [0.31793424]\n", + "best_cosine_similarity: 0.3179342448711395\n", + "new_topic_id_to_zeroshot_topic_idx: {}\n", + "after add_mappings: topic_model._topic_id_to_zeroshot_topic_idx: {}\n" + ] + } + ], + "source": [ + "topic_model.delete_topics([1])" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0}" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# updated to self.topic_mapper_.add_mappings(mapping, topic_model=deepcopy(self)) to avoid unecessary updates in add_mappings\n", + "topic_model._topic_id_to_zeroshot_topic_idx" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicCountNameRepresentationRepresentative_Docs
0-17648-1_the_of_to_and[the, of, to, and, in, we, that, is, for, lear...[ A crucial task in system identification pro...
104topic modeling[topic, papers, svd, topics, allocation, conta...[ Topic models have emerged as fundamental to...
2272_pain_discomfort_diagnostic_facial[pain, discomfort, diagnostic, facial, intensi...[ Pain is a complex and subjective experience...
3363_quantum_entanglement_wave_annealers[quantum, entanglement, wave, annealers, conva...[ Modern deep learning has enabled unpreceden...
44744_quantum_classical_qubits_states[quantum, classical, qubits, states, circuit, ...[ Quantum machine learning witnesses an incre...
..................
3693698369_chaos_initialization_jacobian_isometry[chaos, initialization, jacobian, isometry, de...[ It is well known that the initialization of...
37037037370_relu_depth_activation_mathbb[relu, depth, activation, mathbb, functions, w...[ We study the necessary and sufficient compl...
37137116371_generalization_nonvacuous_explain_sensitivity[generalization, nonvacuous, explain, sensitiv...[ Neural networks exhibit good generalization...
37237271372_minima_mathbf_loss_relu[minima, mathbf, loss, relu, activation, layer...[ Deep learning models are often successfully...
3733735373_sgd_minima_saddles_band[sgd, minima, saddles, band, descent, degenera...[ Recent years have seen a growing interest i...
\n", + "

374 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " Topic Count Name \\\n", + "0 -1 7648 -1_the_of_to_and \n", + "1 0 4 topic modeling \n", + "2 2 7 2_pain_discomfort_diagnostic_facial \n", + "3 3 6 3_quantum_entanglement_wave_annealers \n", + "4 4 74 4_quantum_classical_qubits_states \n", + ".. ... ... ... \n", + "369 369 8 369_chaos_initialization_jacobian_isometry \n", + "370 370 37 370_relu_depth_activation_mathbb \n", + "371 371 16 371_generalization_nonvacuous_explain_sensitivity \n", + "372 372 71 372_minima_mathbf_loss_relu \n", + "373 373 5 373_sgd_minima_saddles_band \n", + "\n", + " Representation \\\n", + "0 [the, of, to, and, in, we, that, is, for, lear... \n", + "1 [topic, papers, svd, topics, allocation, conta... \n", + "2 [pain, discomfort, diagnostic, facial, intensi... \n", + "3 [quantum, entanglement, wave, annealers, conva... \n", + "4 [quantum, classical, qubits, states, circuit, ... \n", + ".. ... \n", + "369 [chaos, initialization, jacobian, isometry, de... \n", + "370 [relu, depth, activation, mathbb, functions, w... \n", + "371 [generalization, nonvacuous, explain, sensitiv... \n", + "372 [minima, mathbf, loss, relu, activation, layer... \n", + "373 [sgd, minima, saddles, band, descent, degenera... \n", + "\n", + " Representative_Docs \n", + "0 [ A crucial task in system identification pro... \n", + "1 [ Topic models have emerged as fundamental to... \n", + "2 [ Pain is a complex and subjective experience... \n", + "3 [ Modern deep learning has enabled unpreceden... \n", + "4 [ Quantum machine learning witnesses an incre... \n", + ".. ... \n", + "369 [ It is well known that the initialization of... \n", + "370 [ We study the necessary and sufficient compl... \n", + "371 [ Neural networks exhibit good generalization... \n", + "372 [ Deep learning models are often successfully... \n", + "373 [ Recent years have seen a growing interest i... \n", + "\n", + "[374 rows x 5 columns]" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "after_delete_one_topic_info = topic_model.get_topic_info()\n", + "after_delete_one_topic_info" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicCountNameRepresentationRepresentative_Docs
104topic modeling[topic, papers, svd, topics, allocation, conta...[ Topic models have emerged as fundamental to...
\n", + "
" + ], + "text/plain": [ + " Topic Count Name \\\n", + "1 0 4 topic modeling \n", + "\n", + " Representation \\\n", + "1 [topic, papers, svd, topics, allocation, conta... \n", + "\n", + " Representative_Docs \n", + "1 [ Topic models have emerged as fundamental to... " + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "after_delete_one_topic_info[after_delete_one_topic_info['Name'].str.contains('topic modeling')]" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicCountNameRepresentationRepresentative_Docs
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [Topic, Count, Name, Representation, Representative_Docs]\n", + "Index: []" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "after_delete_one_topic_info[after_delete_one_topic_info['Name'].str.contains('modulation_radio_channel_transmitters')]" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Missing Name suffixes after delete/merge:\n", + "modulation_radio_channel_transmitters\n", + "\n", + "Summary:\n", + "Original unique suffix count: 375\n", + "After delete/merge unique suffix count: 374\n" + ] + } + ], + "source": [ + "# Extract just the suffix (after the first “_”) from each Name\n", + "orig_suffixes = original_topic_info['Name'].str.split('_', n=1).str[1].where(original_topic_info['Name'].str.contains('_'), original_topic_info['Name'])\n", + "after_suffixes = after_delete_one_topic_info['Name'].str.split('_', n=1).str[1].where(after_delete_one_topic_info['Name'].str.contains('_'), after_delete_one_topic_info['Name'])\n", + "\n", + "\n", + "# Build unique sets\n", + "orig_set = set(orig_suffixes)\n", + "after_set = set(after_suffixes)\n", + "\n", + "# Find any suffixes that were in the original but not after deletion\n", + "missing = orig_set - after_set\n", + "\n", + "# Report\n", + "if missing:\n", + " print(\"Missing Name suffixes after delete/merge:\")\n", + " for name in sorted(missing):\n", + " print(name)\n", + "else:\n", + " print(\"All Name suffixes are preserved\")\n", + "\n", + "print(\"\\nSummary:\")\n", + "print(f\"Original unique suffix count: {len(orig_set)}\")\n", + "print(f\"After delete/merge unique suffix count: {len(after_set)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicCountNameRepresentationRepresentative_Docs
0-17624-1_the_of_to_and[the, of, to, and, in, we, that, is, for, lear...[ A crucial task in system identification pro...
264263510263_generative_gan_gans_generator[generative, gan, gans, generator, adversarial...[ Generative Adversarial Networks (GANs) are ...
324323403323_quantization_hardware_pruning_gpu[quantization, hardware, pruning, gpu, precisi...[ Deep neural networks (DNNs) are used by dif...
999826998_recommendation_user_items_item[recommendation, user, items, item, recommende...[ Matrix factorization techniques have been w...
292819628_privacy_private_differential_differentially[privacy, private, differential, differentiall...[ The process of data mining with differentia...
\n", + "
" + ], + "text/plain": [ + " Topic Count Name \\\n", + "0 -1 7624 -1_the_of_to_and \n", + "264 263 510 263_generative_gan_gans_generator \n", + "324 323 403 323_quantization_hardware_pruning_gpu \n", + "99 98 269 98_recommendation_user_items_item \n", + "29 28 196 28_privacy_private_differential_differentially \n", + "\n", + " Representation \\\n", + "0 [the, of, to, and, in, we, that, is, for, lear... \n", + "264 [generative, gan, gans, generator, adversarial... \n", + "324 [quantization, hardware, pruning, gpu, precisi... \n", + "99 [recommendation, user, items, item, recommende... \n", + "29 [privacy, private, differential, differentiall... \n", + "\n", + " Representative_Docs \n", + "0 [ A crucial task in system identification pro... \n", + "264 [ Generative Adversarial Networks (GANs) are ... \n", + "324 [ Deep neural networks (DNNs) are used by dif... \n", + "99 [ Matrix factorization techniques have been w... \n", + "29 [ The process of data mining with differentia... " + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "original_topic_info.sort_values('Count', ascending=False)[:5]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TopicCountNameRepresentationRepresentative_Docs
0-17648-1_the_of_to_and[the, of, to, and, in, we, that, is, for, lear...[ A crucial task in system identification pro...
263263510263_generative_gan_gans_generator[generative, gan, gans, generator, adversarial...[ Generative Adversarial Networks (GANs) are ...
323323403323_quantization_hardware_pruning_gpu[quantization, hardware, pruning, gpu, precisi...[ Deep neural networks (DNNs) are used by dif...
989826998_recommendation_user_items_item[recommendation, user, items, item, recommende...[ Matrix factorization techniques have been w...
282819628_privacy_private_differential_differentially[privacy, private, differential, differentiall...[ The process of data mining with differentia...
\n", + "
" + ], + "text/plain": [ + " Topic Count Name \\\n", + "0 -1 7648 -1_the_of_to_and \n", + "263 263 510 263_generative_gan_gans_generator \n", + "323 323 403 323_quantization_hardware_pruning_gpu \n", + "98 98 269 98_recommendation_user_items_item \n", + "28 28 196 28_privacy_private_differential_differentially \n", + "\n", + " Representation \\\n", + "0 [the, of, to, and, in, we, that, is, for, lear... \n", + "263 [generative, gan, gans, generator, adversarial... \n", + "323 [quantization, hardware, pruning, gpu, precisi... \n", + "98 [recommendation, user, items, item, recommende... \n", + "28 [privacy, private, differential, differentiall... \n", + "\n", + " Representative_Docs \n", + "0 [ A crucial task in system identification pro... \n", + "263 [ Generative Adversarial Networks (GANs) are ... \n", + "323 [ Deep neural networks (DNNs) are used by dif... \n", + "98 [ Matrix factorization techniques have been w... \n", + "28 [ The process of data mining with differentia... " + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "after_delete_one_topic_info.sort_values('Count', ascending=False)[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bertopic-sc", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.22" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 17f0a72ee7580c399eda9d4b120552af75dd0a2f Mon Sep 17 00:00:00 2001 From: Shuang Chen Date: Sun, 27 Jul 2025 12:27:22 -0400 Subject: [PATCH 7/7] finalize delete topics --- bertopic/_bertopic.py | 8 - delete_topics_test_sc.ipynb | 1277 ----------------------------------- 2 files changed, 1285 deletions(-) delete mode 100644 delete_topics_test_sc.ipynb diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index 499fd584..bd2fad43 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -4922,7 +4922,6 @@ def add_mappings(self, mappings: Mapping[int, int], topic_model: BERTopic): for key, value in topics_to_map.items(): mapping[value].append(key) - print(f'len of mapping: {len(mapping)}') for topic_to, topics_from in mapping.items(): # which of the original topics are zero-shot zeroshot_topic_ids = [ @@ -4937,25 +4936,18 @@ def add_mappings(self, mappings: Mapping[int, int], topic_model: BERTopic): topic_model.zeroshot_topic_list[topic_model._topic_id_to_zeroshot_topic_idx[topic_id]] for topic_id in zeroshot_topic_ids ] - print(f'topics_from: {topics_from} and topic_to: {topic_to}') - print(f'zeroshot_labels: {zeroshot_labels}') zeroshot_embeddings = topic_model._extract_embeddings(zeroshot_labels) cosine_similarities = cosine_similarity( zeroshot_embeddings, [topic_model.topic_embeddings_[topic_to]] ).flatten() - print(f'cosine_similarities: {cosine_similarities}') best_zeroshot_topic_idx = np.argmax(cosine_similarities) best_cosine_similarity = cosine_similarities[best_zeroshot_topic_idx] - print(f'best_cosine_similarity: {best_cosine_similarity}') if best_cosine_similarity >= topic_model.zeroshot_min_similarity: # Using the topic ID from before mapping, get the idx into the zeroshot topic list new_topic_id_to_zeroshot_topic_idx[topic_to] = topic_model._topic_id_to_zeroshot_topic_idx[ zeroshot_topic_ids[best_zeroshot_topic_idx] ] - print(f'new_topic_id_to_zeroshot_topic_idx: {new_topic_id_to_zeroshot_topic_idx}') - # print('running without updating topic_model._topic_id_to_zeroshot_topic_idx!') topic_model._topic_id_to_zeroshot_topic_idx = new_topic_id_to_zeroshot_topic_idx - print(f'after add_mappings: topic_model._topic_id_to_zeroshot_topic_idx: {topic_model._topic_id_to_zeroshot_topic_idx}') def add_new_topics(self, mappings: Mapping[int, int]): """Add new row(s) of topic mappings. diff --git a/delete_topics_test_sc.ipynb b/delete_topics_test_sc.ipynb deleted file mode 100644 index dae2157a..00000000 --- a/delete_topics_test_sc.ipynb +++ /dev/null @@ -1,1277 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "# !python -m pip install -e \".[dev]\"" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\sc305\\miniforge3\\envs\\bertopic-sc\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "%reload_ext autoreload\n", - "%autoreload 2\n", - "from bertopic import BERTopic" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test Case from MaartenGr No. 2 - Zero Shot" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Before delete:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-05-27 16:51:28,462 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n" - ] - } - ], - "source": [ - "from datasets import load_dataset\n", - "from sentence_transformers import SentenceTransformer\n", - "from hdbscan import HDBSCAN\n", - "from umap import UMAP\n", - "\n", - "from bertopic import BERTopic\n", - "from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance\n", - "\n", - "docs = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"][\"abstract\"][:20_000]\n", - "\n", - "# Pre-calculate embeddings\n", - "embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n", - "# embeddings = embedding_model.encode(docs, show_progress_bar=True)\n", - "\n", - "# # # Save the embeddings to a file\n", - "# np.save(\"test_ArXiv_embeddings_zeroshot_example.npy\", embeddings)\n", - "loaded_embeddings = np.load(\"test_ArXiv_embeddings_zeroshot_example.npy\")\n", - "\n", - "# Use sub-models\n", - "umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0, random_state=42)\n", - "hdbscan_model = HDBSCAN(min_samples=5, gen_min_span_tree=True, prediction_data=True)\n", - "\n", - "# Representation models\n", - "keybert_model = KeyBERTInspired()\n", - "mmr_model = MaximalMarginalRelevance(diversity=0.3)\n", - "representation_model = {\n", - " \"KeyBERT\": keybert_model,\n", - " \"MMR\": mmr_model,\n", - "}\n", - "\n", - "# BERTopic\n", - "topic_model = BERTopic(\n", - " embedding_model=embedding_model,\n", - " umap_model=umap_model,\n", - " hdbscan_model=hdbscan_model,\n", - " zeroshot_topic_list=[\"topic modeling\", \"large language models\"],\n", - " verbose=True,\n", - ").fit(docs, loaded_embeddings)" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TopicCountNameRepresentationRepresentative_Docs
0-17624-1_the_of_to_and[the, of, to, and, in, we, that, is, for, lear...[ A crucial task in system identification pro...
104topic modeling[topic, papers, svd, topics, allocation, conta...[ Topic models have emerged as fundamental to...
21241_modulation_radio_channel_transmitters[modulation, radio, channel, transmitters, sig...[ We survey the latest advances in machine le...
3272_pain_discomfort_diagnostic_facial[pain, discomfort, diagnostic, facial, intensi...[ Pain is a complex and subjective experience...
4363_quantum_entanglement_wave_annealers[quantum, entanglement, wave, annealers, conva...[ Modern deep learning has enabled unpreceden...
\n", - "
" - ], - "text/plain": [ - " Topic Count Name \\\n", - "0 -1 7624 -1_the_of_to_and \n", - "1 0 4 topic modeling \n", - "2 1 24 1_modulation_radio_channel_transmitters \n", - "3 2 7 2_pain_discomfort_diagnostic_facial \n", - "4 3 6 3_quantum_entanglement_wave_annealers \n", - "\n", - " Representation \\\n", - "0 [the, of, to, and, in, we, that, is, for, lear... \n", - "1 [topic, papers, svd, topics, allocation, conta... \n", - "2 [modulation, radio, channel, transmitters, sig... \n", - "3 [pain, discomfort, diagnostic, facial, intensi... \n", - "4 [quantum, entanglement, wave, annealers, conva... \n", - "\n", - " Representative_Docs \n", - "0 [ A crucial task in system identification pro... \n", - "1 [ Topic models have emerged as fundamental to... \n", - "2 [ We survey the latest advances in machine le... \n", - "3 [ Pain is a complex and subjective experience... \n", - "4 [ Modern deep learning has enabled unpreceden... " - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "topic_model.get_topic_info()[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TopicCountNameRepresentationRepresentative_Docs
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [Topic, Count, Name, Representation, Representative_Docs]\n", - "Index: []" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "original_topic_info = topic_model.get_topic_info()\n", - "original_topic_info[original_topic_info['Name']=='large language models']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TopicCountNameRepresentationRepresentative_Docs
0-17624-1_the_of_to_and[the, of, to, and, in, we, that, is, for, lear...[ A crucial task in system identification pro...
264263510263_generative_gan_gans_generator[generative, gan, gans, generator, adversarial...[ Generative Adversarial Networks (GANs) are ...
324323403323_quantization_hardware_pruning_gpu[quantization, hardware, pruning, gpu, precisi...[ Deep neural networks (DNNs) are used by dif...
999826998_recommendation_user_items_item[recommendation, user, items, item, recommende...[ Matrix factorization techniques have been w...
292819628_privacy_private_differential_differentially[privacy, private, differential, differentiall...[ The process of data mining with differentia...
\n", - "
" - ], - "text/plain": [ - " Topic Count Name \\\n", - "0 -1 7624 -1_the_of_to_and \n", - "264 263 510 263_generative_gan_gans_generator \n", - "324 323 403 323_quantization_hardware_pruning_gpu \n", - "99 98 269 98_recommendation_user_items_item \n", - "29 28 196 28_privacy_private_differential_differentially \n", - "\n", - " Representation \\\n", - "0 [the, of, to, and, in, we, that, is, for, lear... \n", - "264 [generative, gan, gans, generator, adversarial... \n", - "324 [quantization, hardware, pruning, gpu, precisi... \n", - "99 [recommendation, user, items, item, recommende... \n", - "29 [privacy, private, differential, differentiall... \n", - "\n", - " Representative_Docs \n", - "0 [ A crucial task in system identification pro... \n", - "264 [ Generative Adversarial Networks (GANs) are ... \n", - "324 [ Deep neural networks (DNNs) are used by dif... \n", - "99 [ Matrix factorization techniques have been w... \n", - "29 [ The process of data mining with differentia... " - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "original_topic_info.sort_values('Count', ascending=False)[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TopicCountNameRepresentationRepresentative_Docs
104topic modeling[topic, papers, svd, topics, allocation, conta...[ Topic models have emerged as fundamental to...
\n", - "
" - ], - "text/plain": [ - " Topic Count Name \\\n", - "1 0 4 topic modeling \n", - "\n", - " Representation \\\n", - "1 [topic, papers, svd, topics, allocation, conta... \n", - "\n", - " Representative_Docs \n", - "1 [ Topic models have emerged as fundamental to... " - ] - }, - "execution_count": 74, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "original_topic_info[original_topic_info['Name']=='topic modeling']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TopicCountNameRepresentationRepresentative_Docs
21241_modulation_radio_channel_transmitters[modulation, radio, channel, transmitters, sig...[ We survey the latest advances in machine le...
\n", - "
" - ], - "text/plain": [ - " Topic Count Name \\\n", - "2 1 24 1_modulation_radio_channel_transmitters \n", - "\n", - " Representation \\\n", - "2 [modulation, radio, channel, transmitters, sig... \n", - "\n", - " Representative_Docs \n", - "2 [ We survey the latest advances in machine le... " - ] - }, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "original_topic_info[original_topic_info['Name'].str.contains('modulation_radio_channel_transmitters')]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run delete:" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TopicCountNameRepresentationRepresentative_Docs
0-17624-1_the_of_to_and[the, of, to, and, in, we, that, is, for, lear...[ A crucial task in system identification pro...
104topic modeling[topic, papers, svd, topics, allocation, conta...[ Topic models have emerged as fundamental to...
21241_modulation_radio_channel_transmitters[modulation, radio, channel, transmitters, sig...[ We survey the latest advances in machine le...
3272_pain_discomfort_diagnostic_facial[pain, discomfort, diagnostic, facial, intensi...[ Pain is a complex and subjective experience...
4363_quantum_entanglement_wave_annealers[quantum, entanglement, wave, annealers, conva...[ Modern deep learning has enabled unpreceden...
\n", - "
" - ], - "text/plain": [ - " Topic Count Name \\\n", - "0 -1 7624 -1_the_of_to_and \n", - "1 0 4 topic modeling \n", - "2 1 24 1_modulation_radio_channel_transmitters \n", - "3 2 7 2_pain_discomfort_diagnostic_facial \n", - "4 3 6 3_quantum_entanglement_wave_annealers \n", - "\n", - " Representation \\\n", - "0 [the, of, to, and, in, we, that, is, for, lear... \n", - "1 [topic, papers, svd, topics, allocation, conta... \n", - "2 [modulation, radio, channel, transmitters, sig... \n", - "3 [pain, discomfort, diagnostic, facial, intensi... \n", - "4 [quantum, entanglement, wave, annealers, conva... \n", - "\n", - " Representative_Docs \n", - "0 [ A crucial task in system identification pro... \n", - "1 [ Topic models have emerged as fundamental to... \n", - "2 [ We survey the latest advances in machine le... \n", - "3 [ Pain is a complex and subjective experience... \n", - "4 [ Modern deep learning has enabled unpreceden... " - ] - }, - "execution_count": 77, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "topic_model.get_topic_info()[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [], - "source": [ - "# topic_model.merge_topics(docs, [1, 2])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# topic_model.delete_topics([0])" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "len of mapping: 375\n", - "topics_from: [np.int64(0)] and topic_to: 0\n", - "zeroshot_labels: ['topic modeling']\n", - "cosine_similarities: [0.31793424]\n", - "best_cosine_similarity: 0.3179342448711395\n", - "new_topic_id_to_zeroshot_topic_idx: {}\n", - "after add_mappings: topic_model._topic_id_to_zeroshot_topic_idx: {}\n" - ] - } - ], - "source": [ - "topic_model.delete_topics([1])" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{0: 0}" - ] - }, - "execution_count": 88, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# updated to self.topic_mapper_.add_mappings(mapping, topic_model=deepcopy(self)) to avoid unecessary updates in add_mappings\n", - "topic_model._topic_id_to_zeroshot_topic_idx" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TopicCountNameRepresentationRepresentative_Docs
0-17648-1_the_of_to_and[the, of, to, and, in, we, that, is, for, lear...[ A crucial task in system identification pro...
104topic modeling[topic, papers, svd, topics, allocation, conta...[ Topic models have emerged as fundamental to...
2272_pain_discomfort_diagnostic_facial[pain, discomfort, diagnostic, facial, intensi...[ Pain is a complex and subjective experience...
3363_quantum_entanglement_wave_annealers[quantum, entanglement, wave, annealers, conva...[ Modern deep learning has enabled unpreceden...
44744_quantum_classical_qubits_states[quantum, classical, qubits, states, circuit, ...[ Quantum machine learning witnesses an incre...
..................
3693698369_chaos_initialization_jacobian_isometry[chaos, initialization, jacobian, isometry, de...[ It is well known that the initialization of...
37037037370_relu_depth_activation_mathbb[relu, depth, activation, mathbb, functions, w...[ We study the necessary and sufficient compl...
37137116371_generalization_nonvacuous_explain_sensitivity[generalization, nonvacuous, explain, sensitiv...[ Neural networks exhibit good generalization...
37237271372_minima_mathbf_loss_relu[minima, mathbf, loss, relu, activation, layer...[ Deep learning models are often successfully...
3733735373_sgd_minima_saddles_band[sgd, minima, saddles, band, descent, degenera...[ Recent years have seen a growing interest i...
\n", - "

374 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " Topic Count Name \\\n", - "0 -1 7648 -1_the_of_to_and \n", - "1 0 4 topic modeling \n", - "2 2 7 2_pain_discomfort_diagnostic_facial \n", - "3 3 6 3_quantum_entanglement_wave_annealers \n", - "4 4 74 4_quantum_classical_qubits_states \n", - ".. ... ... ... \n", - "369 369 8 369_chaos_initialization_jacobian_isometry \n", - "370 370 37 370_relu_depth_activation_mathbb \n", - "371 371 16 371_generalization_nonvacuous_explain_sensitivity \n", - "372 372 71 372_minima_mathbf_loss_relu \n", - "373 373 5 373_sgd_minima_saddles_band \n", - "\n", - " Representation \\\n", - "0 [the, of, to, and, in, we, that, is, for, lear... \n", - "1 [topic, papers, svd, topics, allocation, conta... \n", - "2 [pain, discomfort, diagnostic, facial, intensi... \n", - "3 [quantum, entanglement, wave, annealers, conva... \n", - "4 [quantum, classical, qubits, states, circuit, ... \n", - ".. ... \n", - "369 [chaos, initialization, jacobian, isometry, de... \n", - "370 [relu, depth, activation, mathbb, functions, w... \n", - "371 [generalization, nonvacuous, explain, sensitiv... \n", - "372 [minima, mathbf, loss, relu, activation, layer... \n", - "373 [sgd, minima, saddles, band, descent, degenera... \n", - "\n", - " Representative_Docs \n", - "0 [ A crucial task in system identification pro... \n", - "1 [ Topic models have emerged as fundamental to... \n", - "2 [ Pain is a complex and subjective experience... \n", - "3 [ Modern deep learning has enabled unpreceden... \n", - "4 [ Quantum machine learning witnesses an incre... \n", - ".. ... \n", - "369 [ It is well known that the initialization of... \n", - "370 [ We study the necessary and sufficient compl... \n", - "371 [ Neural networks exhibit good generalization... \n", - "372 [ Deep learning models are often successfully... \n", - "373 [ Recent years have seen a growing interest i... \n", - "\n", - "[374 rows x 5 columns]" - ] - }, - "execution_count": 79, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "after_delete_one_topic_info = topic_model.get_topic_info()\n", - "after_delete_one_topic_info" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TopicCountNameRepresentationRepresentative_Docs
104topic modeling[topic, papers, svd, topics, allocation, conta...[ Topic models have emerged as fundamental to...
\n", - "
" - ], - "text/plain": [ - " Topic Count Name \\\n", - "1 0 4 topic modeling \n", - "\n", - " Representation \\\n", - "1 [topic, papers, svd, topics, allocation, conta... \n", - "\n", - " Representative_Docs \n", - "1 [ Topic models have emerged as fundamental to... " - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "after_delete_one_topic_info[after_delete_one_topic_info['Name'].str.contains('topic modeling')]" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TopicCountNameRepresentationRepresentative_Docs
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [Topic, Count, Name, Representation, Representative_Docs]\n", - "Index: []" - ] - }, - "execution_count": 81, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "after_delete_one_topic_info[after_delete_one_topic_info['Name'].str.contains('modulation_radio_channel_transmitters')]" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Missing Name suffixes after delete/merge:\n", - "modulation_radio_channel_transmitters\n", - "\n", - "Summary:\n", - "Original unique suffix count: 375\n", - "After delete/merge unique suffix count: 374\n" - ] - } - ], - "source": [ - "# Extract just the suffix (after the first “_”) from each Name\n", - "orig_suffixes = original_topic_info['Name'].str.split('_', n=1).str[1].where(original_topic_info['Name'].str.contains('_'), original_topic_info['Name'])\n", - "after_suffixes = after_delete_one_topic_info['Name'].str.split('_', n=1).str[1].where(after_delete_one_topic_info['Name'].str.contains('_'), after_delete_one_topic_info['Name'])\n", - "\n", - "\n", - "# Build unique sets\n", - "orig_set = set(orig_suffixes)\n", - "after_set = set(after_suffixes)\n", - "\n", - "# Find any suffixes that were in the original but not after deletion\n", - "missing = orig_set - after_set\n", - "\n", - "# Report\n", - "if missing:\n", - " print(\"Missing Name suffixes after delete/merge:\")\n", - " for name in sorted(missing):\n", - " print(name)\n", - "else:\n", - " print(\"All Name suffixes are preserved\")\n", - "\n", - "print(\"\\nSummary:\")\n", - "print(f\"Original unique suffix count: {len(orig_set)}\")\n", - "print(f\"After delete/merge unique suffix count: {len(after_set)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TopicCountNameRepresentationRepresentative_Docs
0-17624-1_the_of_to_and[the, of, to, and, in, we, that, is, for, lear...[ A crucial task in system identification pro...
264263510263_generative_gan_gans_generator[generative, gan, gans, generator, adversarial...[ Generative Adversarial Networks (GANs) are ...
324323403323_quantization_hardware_pruning_gpu[quantization, hardware, pruning, gpu, precisi...[ Deep neural networks (DNNs) are used by dif...
999826998_recommendation_user_items_item[recommendation, user, items, item, recommende...[ Matrix factorization techniques have been w...
292819628_privacy_private_differential_differentially[privacy, private, differential, differentiall...[ The process of data mining with differentia...
\n", - "
" - ], - "text/plain": [ - " Topic Count Name \\\n", - "0 -1 7624 -1_the_of_to_and \n", - "264 263 510 263_generative_gan_gans_generator \n", - "324 323 403 323_quantization_hardware_pruning_gpu \n", - "99 98 269 98_recommendation_user_items_item \n", - "29 28 196 28_privacy_private_differential_differentially \n", - "\n", - " Representation \\\n", - "0 [the, of, to, and, in, we, that, is, for, lear... \n", - "264 [generative, gan, gans, generator, adversarial... \n", - "324 [quantization, hardware, pruning, gpu, precisi... \n", - "99 [recommendation, user, items, item, recommende... \n", - "29 [privacy, private, differential, differentiall... \n", - "\n", - " Representative_Docs \n", - "0 [ A crucial task in system identification pro... \n", - "264 [ Generative Adversarial Networks (GANs) are ... \n", - "324 [ Deep neural networks (DNNs) are used by dif... \n", - "99 [ Matrix factorization techniques have been w... \n", - "29 [ The process of data mining with differentia... " - ] - }, - "execution_count": 86, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "original_topic_info.sort_values('Count', ascending=False)[:5]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TopicCountNameRepresentationRepresentative_Docs
0-17648-1_the_of_to_and[the, of, to, and, in, we, that, is, for, lear...[ A crucial task in system identification pro...
263263510263_generative_gan_gans_generator[generative, gan, gans, generator, adversarial...[ Generative Adversarial Networks (GANs) are ...
323323403323_quantization_hardware_pruning_gpu[quantization, hardware, pruning, gpu, precisi...[ Deep neural networks (DNNs) are used by dif...
989826998_recommendation_user_items_item[recommendation, user, items, item, recommende...[ Matrix factorization techniques have been w...
282819628_privacy_private_differential_differentially[privacy, private, differential, differentiall...[ The process of data mining with differentia...
\n", - "
" - ], - "text/plain": [ - " Topic Count Name \\\n", - "0 -1 7648 -1_the_of_to_and \n", - "263 263 510 263_generative_gan_gans_generator \n", - "323 323 403 323_quantization_hardware_pruning_gpu \n", - "98 98 269 98_recommendation_user_items_item \n", - "28 28 196 28_privacy_private_differential_differentially \n", - "\n", - " Representation \\\n", - "0 [the, of, to, and, in, we, that, is, for, lear... \n", - "263 [generative, gan, gans, generator, adversarial... \n", - "323 [quantization, hardware, pruning, gpu, precisi... \n", - "98 [recommendation, user, items, item, recommende... \n", - "28 [privacy, private, differential, differentiall... \n", - "\n", - " Representative_Docs \n", - "0 [ A crucial task in system identification pro... \n", - "263 [ Generative Adversarial Networks (GANs) are ... \n", - "323 [ Deep neural networks (DNNs) are used by dif... \n", - "98 [ Matrix factorization techniques have been w... \n", - "28 [ The process of data mining with differentia... " - ] - }, - "execution_count": 85, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "after_delete_one_topic_info.sort_values('Count', ascending=False)[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "bertopic-sc", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.22" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}