diff --git a/.gitignore b/.gitignore
index 77c026df..e7058c9e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,6 +59,7 @@ docs/_build/
 
 # Jupyter Notebook
 .ipynb_checkpoints
+notebooks/
 
 # IPython
 profile_default/
diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py
index 6a75171b..bd2fad43 100644
--- a/bertopic/_bertopic.py
+++ b/bertopic/_bertopic.py
@@ -18,6 +18,7 @@
 import numpy as np
 import pandas as pd
 import scipy.sparse as sp
+from copy import deepcopy
 
 from tqdm import tqdm
 from pathlib import Path
@@ -827,7 +828,7 @@ def topics_over_time(
             nr_bins: The number of bins you want to create for the timestamps. The left interval will
                      be chosen as the timestamp. An additional column will be created with the
                      entire interval.
-            datetime_format: The datetime format of the timestamps if they are strings, eg “%d/%m/%Y”.
+            datetime_format: The datetime format of the timestamps if they are strings, eg "%d/%m/%Y".
                              Set this to None if you want to have it automatically detect the format.
                              See strftime documentation for more information on choices:
                              https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.
@@ -1778,7 +1779,6 @@ def get_document_info(
 
         # the topic distributions
         document_info = topic_model.get_document_info(docs, df=df, metadata={"Topic_distribution": distributions})
-        ```
         """
         check_documents_type(docs)
         if df is not None:
@@ -2168,6 +2168,142 @@ def merge_topics(
             self._save_representative_docs(documents)
         self.probabilities_ = self._map_probabilities(self.probabilities_)
 
+    def delete_topics(
+        self,
+        topics_to_delete: List[int],
+    ) -> None:
+        """Delete topics from the topic model.
+
+        The deleted topics will be mapped to -1 (outlier topic). Core topic attributes
+        like topic embeddings and c-TF-IDF will be automatically updated.
+
+        Arguments:
+            topics_to_delete: List of topics to delete
+        """
+        check_is_fitted(self)
+
+        topics_df = pd.DataFrame({"Topic": self.topics_})
+
+        # Check if -1 exists in the current topics
+        had_outliers = -1 in set(self.topics_)
+
+        # If adding -1 for the first time, initialize its attributes
+        if not had_outliers and any(topic in topics_to_delete for topic in self.topics_):
+            # Initialize c-TF-IDF for -1 topic (zeros)
+            outlier_row = np.zeros((1, self.c_tf_idf_.shape[1]))
+            outlier_row = sp.csr_matrix(outlier_row)
+            self.c_tf_idf_ = sp.vstack([outlier_row, self.c_tf_idf_])
+
+            # Initialize topic embeddings for -1 topic (zeros)
+            outlier_embedding = np.zeros((1, self.topic_embeddings_.shape[1]))
+            self.topic_embeddings_ = np.vstack([outlier_embedding, self.topic_embeddings_])
+
+            # Initialize topic representations for -1 topic: ("", 1e-05)
+            self.topic_representations_[-1] = [("", 1e-05)]
+
+            # Initialize representative docs for -1 topic (empty list)
+            self.representative_docs_[-1] = []
+
+            # Initialize representative images for -1 topic if images are being used
+            if self.representative_images_ is not None:
+                outlier_image = np.zeros((1, self.representative_images_.shape[1]))
+                self.representative_images_ = np.vstack([outlier_image, self.representative_images_])
+
+            # Initialize custom labels for -1 topic if they exist
+            if hasattr(self, "custom_labels_") and self.custom_labels_ is not None:
+                self.custom_labels_[-1] = ""
+
+            # Initialize ctfidf model diagonal for -1 topic (ones) if it exists
+            if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None:
+                n_features = self.ctfidf_model._idf_diag.shape[1]
+                outlier_diag = sp.csr_matrix(([1.0], ([0], [0])), shape=(1, n_features))
+                self.ctfidf_model._idf_diag = sp.vstack([outlier_diag, self.ctfidf_model._idf_diag])
+
+            # Initialize topic aspects for -1 topic (empty dict for each aspect) if they exist
+            if hasattr(self, "topic_aspects_") and self.topic_aspects_ is not None:
+                for aspect in self.topic_aspects_:
+                    self.topic_aspects_[aspect][-1] = {}
+
+        # First map deleted topics to -1
+        mapping = {topic: -1 if topic in topics_to_delete else topic for topic in set(self.topics_)}
+        mapping[-1] = -1
+
+        # Track mappings and sizes of topics for merging topic embeddings
+        mappings = defaultdict(list)
+        for key, val in sorted(mapping.items()):
+            mappings[val].append(key)
+        mappings = {
+            topic_to: {
+                "topics_from": topics_from,
+                "topic_sizes": [self.topic_sizes_[topic] for topic in topics_from],
+            }
+            for topic_to, topics_from in mappings.items()
+        }
+
+        # Remove deleted topics and update attributes
+        topics_df.Topic = topics_df.Topic.map(mapping)
+        self.topic_mapper_.add_mappings(mapping, topic_model=deepcopy(self))
+        topics_df = self._sort_mappings_by_frequency(topics_df)
+        self._update_topic_size(topics_df)
+        self.probabilities_ = self._map_probabilities(self.probabilities_)
+
+        final_mapping = self.topic_mapper_.get_mappings(original_topics=False)
+
+        # Update dictionary-based attributes to remove deleted topics
+        # Handle topic_aspects_ if it exists
+        if hasattr(self, "topic_aspects_") and self.topic_aspects_ is not None:
+            new_aspects = {
+                aspect: {
+                    (final_mapping[old_topic] if old_topic != -1 else -1): content
+                    for old_topic, content in topics.items()
+                    if old_topic not in topics_to_delete
+                }
+                for aspect, topics in self.topic_aspects_.items()
+            }
+            self.topic_aspects_ = new_aspects
+
+        # Update custom labels if they exist
+        if hasattr(self, "custom_labels_") and self.custom_labels_ is not None:
+            new_labels = {
+                (final_mapping[old_topic] if old_topic != -1 else -1): label
+                for old_topic, label in self.custom_labels_.items()
+                if old_topic not in topics_to_delete
+            }
+            self.custom_labels_ = new_labels
+
+        # Update topic representations
+        new_representations = {
+            (final_mapping[old_topic] if old_topic != -1 else -1): content
+            for old_topic, content in self.topic_representations_.items()
+            if old_topic not in topics_to_delete
+        }
+        self.topic_representations_ = new_representations
+
+        # Update representative docs if they exist
+        new_representative_docs = {
+            (final_mapping[old_topic] if old_topic != -1 else -1): docs
+            for old_topic, docs in self.representative_docs_.items()
+            if old_topic not in topics_to_delete
+        }
+        self.representative_docs_ = new_representative_docs
+
+        # Update representative images if they exist
+        if self.representative_images_ is not None:
+            # Create a mask for non-deleted topics
+            mask = np.array([topic not in topics_to_delete for topic in range(len(self.representative_images_))])
+            self.representative_images_ = self.representative_images_[mask] if mask.any() else None
+
+        # Update array-based attributes using masks to remove deleted topics
+        for attr in ["topic_embeddings_", "c_tf_idf_"]:
+            matrix = getattr(self, attr)
+            mask = np.array([topic not in topics_to_delete for topic in range(matrix.shape[0])])
+            setattr(self, attr, matrix[mask])
+
+        # Update ctfidf model to remove deleted topics if it exists
+        if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None:
+            mask = np.array([topic not in topics_to_delete for topic in range(self.ctfidf_model._idf_diag.shape[0])])
+            self.ctfidf_model._idf_diag = self.ctfidf_model._idf_diag[mask]
+
     def reduce_topics(
         self,
         docs: List[str],
@@ -4806,13 +4942,11 @@ def add_mappings(self, mappings: Mapping[int, int], topic_model: BERTopic):
                 ).flatten()
                 best_zeroshot_topic_idx = np.argmax(cosine_similarities)
                 best_cosine_similarity = cosine_similarities[best_zeroshot_topic_idx]
-
                 if best_cosine_similarity >= topic_model.zeroshot_min_similarity:
                     # Using the topic ID from before mapping, get the idx into the zeroshot topic list
                     new_topic_id_to_zeroshot_topic_idx[topic_to] = topic_model._topic_id_to_zeroshot_topic_idx[
                         zeroshot_topic_ids[best_zeroshot_topic_idx]
                     ]
-
         topic_model._topic_id_to_zeroshot_topic_idx = new_topic_id_to_zeroshot_topic_idx
 
     def add_new_topics(self, mappings: Mapping[int, int]):
diff --git a/tests/test_reduction/test_delete.py b/tests/test_reduction/test_delete.py
new file mode 100644
index 00000000..188e1ffb
--- /dev/null
+++ b/tests/test_reduction/test_delete.py
@@ -0,0 +1,59 @@
+import copy
+import pytest
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_delete(model, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    nr_topics = len(set(topic_model.topics_))
+    length_documents = len(topic_model.topics_)
+
+    # First deletion
+    topics_to_delete = [1, 2]
+    topic_model.delete_topics(topics_to_delete)
+    mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_))
+    mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_]
+
+    if model == "online_topic_model" or model == "kmeans_pca_topic_model":
+        assert nr_topics == len(set(topic_model.topics_)) + 1
+        assert topic_model.get_topic_info().Count.sum() == length_documents
+    else:
+        assert nr_topics == len(set(topic_model.topics_)) + 2
+        assert topic_model.get_topic_info().Count.sum() == length_documents
+
+    if model == "online_topic_model":
+        assert mapped_labels == topic_model.topics_[950:]
+    else:
+        assert mapped_labels == topic_model.topics_
+
+    # Find two existing topics for second deletion
+    remaining_topics = sorted(list(set(topic_model.topics_)))
+    remaining_topics = [t for t in remaining_topics if t != -1]  # Exclude outlier topic
+    topics_to_delete = remaining_topics[:2]  # Take first two remaining topics
+
+    # Second deletion
+    topic_model.delete_topics(topics_to_delete)
+    mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_))
+    mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_]
+
+    if model == "online_topic_model" or model == "kmeans_pca_topic_model":
+        assert nr_topics == len(set(topic_model.topics_)) + 3
+        assert topic_model.get_topic_info().Count.sum() == length_documents
+    else:
+        assert nr_topics == len(set(topic_model.topics_)) + 4
+        assert topic_model.get_topic_info().Count.sum() == length_documents
+
+    if model == "online_topic_model":
+        assert mapped_labels == topic_model.topics_[950:]
+    else:
+        assert mapped_labels == topic_model.topics_
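For reviewers trying the change locally, a minimal usage sketch of the new method follows. It is illustrative only and not part of the diff; the 20 Newsgroups dataset and the default BERTopic settings are assumptions made for the example.

# Illustrative sketch only: exercises the delete_topics() method added in this diff.
# The dataset and the fitted model below are assumptions for the example, not part of the PR.
from sklearn.datasets import fetch_20newsgroups
from bertopic import BERTopic

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]
topic_model = BERTopic().fit(docs)

n_before = len(set(topic_model.topics_))

# Delete topics 1 and 2: their documents are reassigned to the -1 outlier topic, and
# c-TF-IDF, topic embeddings, topic sizes, and probabilities are updated in place.
# Remaining topics are re-sorted by frequency, so topic IDs may be renumbered afterwards.
topic_model.delete_topics([1, 2])

assert len(set(topic_model.topics_)) < n_before
print(topic_model.get_topic_info().head())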