|
37 | 37 | from typing import List, Tuple, Union, Mapping, Any, Callable, Iterable |
38 | 38 |
|
39 | 39 | # Models |
40 | | -import hdbscan |
41 | | -from umap import UMAP |
| 40 | +try: |
| 41 | + from hdbscan import HDBSCAN |
| 42 | + |
| 43 | + HAS_HDBSCAN = True |
| 44 | +except (ImportError, ModuleNotFoundError): |
| 45 | + HAS_HDBSCAN = False |
| 46 | + from sklearn.cluster import HDBSCAN as SK_HDBSCAN |
| 47 | + |
42 | 48 | from sklearn.preprocessing import normalize |
43 | 49 | from sklearn import __version__ as sklearn_version |
44 | 50 | from sklearn.cluster import AgglomerativeClustering |
| 51 | +from sklearn.decomposition import PCA |
45 | 52 | from sklearn.metrics.pairwise import cosine_similarity |
46 | 53 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer |
47 | 54 |
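
The standalone `hdbscan` package becomes an optional dependency here: when its import fails, the scikit-learn port (`sklearn.cluster.HDBSCAN`, available since scikit-learn 1.3) is used instead. A minimal sketch of the same pattern follows; `make_cluster_model` is a hypothetical helper for illustration, and note the two backends are not argument-compatible:

```python
# Sketch of the optional-dependency pattern above; `make_cluster_model`
# is a hypothetical helper, not part of the actual change.
try:
    from hdbscan import HDBSCAN

    HAS_HDBSCAN = True
except (ImportError, ModuleNotFoundError):
    HAS_HDBSCAN = False
    from sklearn.cluster import HDBSCAN as SK_HDBSCAN  # scikit-learn >= 1.3


def make_cluster_model(min_cluster_size: int = 10):
    """Return whichever HDBSCAN backend is importable."""
    if HAS_HDBSCAN:
        # The reference implementation can cache prediction data so new
        # points can later be assigned with approximate_predict.
        return HDBSCAN(min_cluster_size=min_cluster_size, prediction_data=True)
    # The scikit-learn port has no `prediction_data` argument but accepts n_jobs.
    return SK_HDBSCAN(min_cluster_size=min_cluster_size, n_jobs=-1)
```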
|
@@ -143,8 +150,8 @@ def __init__( |
143 | 150 | zeroshot_topic_list: List[str] = None, |
144 | 151 | zeroshot_min_similarity: float = 0.7, |
145 | 152 | embedding_model=None, |
146 | | - umap_model: UMAP = None, |
147 | | - hdbscan_model: hdbscan.HDBSCAN = None, |
| 153 | + umap_model=None, |
| 154 | + hdbscan_model=None, |
148 | 155 | vectorizer_model: CountVectorizer = None, |
149 | 156 | ctfidf_model: TfidfTransformer = None, |
150 | 157 | representation_model: BaseRepresentation = None, |
@@ -247,22 +254,38 @@ def __init__( |
247 | 254 | self.representation_model = representation_model |
248 | 255 |
|
249 | 256 | # UMAP or another algorithm that has .fit and .transform functions |
250 | | - self.umap_model = umap_model or UMAP( |
251 | | - n_neighbors=15, |
252 | | - n_components=5, |
253 | | - min_dist=0.0, |
254 | | - metric="cosine", |
255 | | - low_memory=self.low_memory, |
256 | | - ) |
| 257 | + if umap_model is not None: |
| 258 | + self.umap_model = umap_model |
| 259 | + else: |
| 260 | + try: |
| 261 | + from umap import UMAP |
| 262 | + |
| 263 | + self.umap_model = UMAP( |
| 264 | + n_neighbors=15, |
| 265 | + n_components=5, |
| 266 | + min_dist=0.0, |
| 267 | + metric="cosine", |
| 268 | + low_memory=self.low_memory, |
| 269 | + ) |
| 270 | + except (ImportError, ModuleNotFoundError): |
| 271 | + self.umap_model = PCA(n_components=5) |
257 | 272 |
|
258 | 273 | # HDBSCAN or another clustering algorithm that has .fit and .predict functions and |
259 | 274 | # the .labels_ variable to extract the labels |
260 | | - self.hdbscan_model = hdbscan_model or hdbscan.HDBSCAN( |
261 | | - min_cluster_size=self.min_topic_size, |
262 | | - metric="euclidean", |
263 | | - cluster_selection_method="eom", |
264 | | - prediction_data=True, |
265 | | - ) |
| 275 | + |
| 276 | + if hdbscan_model is not None: |
| 277 | + self.hdbscan_model = hdbscan_model |
| 278 | + elif HAS_HDBSCAN: |
| 279 | + self.hdbscan_model = HDBSCAN( |
| 280 | + min_cluster_size=self.min_topic_size, |
| 281 | + metric="euclidean", |
| 282 | + cluster_selection_method="eom", |
| 283 | + prediction_data=True, |
| 284 | + ) |
| 285 | + else: |
| 286 | + self.hdbscan_model = SK_HDBSCAN( |
| 287 | + min_cluster_size=self.min_topic_size, metric="euclidean", cluster_selection_method="eom", n_jobs=-1 |
| 288 | + ) |
266 | 289 |
|
267 | 290 | # Public attributes |
268 | 291 | self.topics_ = None |
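
With these fallbacks, an environment missing `umap-learn` silently gets `PCA(n_components=5)` and one missing `hdbscan` gets the scikit-learn port, which does not support `prediction_data`. A hedged sketch of pinning the sub-models explicitly, so results do not depend on which optional packages happen to be installed:

```python
from bertopic import BERTopic
from sklearn.cluster import HDBSCAN  # scikit-learn >= 1.3
from sklearn.decomposition import PCA

topic_model = BERTopic(
    # Any object with .fit/.transform works as the dimensionality reducer.
    umap_model=PCA(n_components=5),
    # Any clusterer exposing .fit/.predict and .labels_ works here.
    hdbscan_model=HDBSCAN(min_cluster_size=10),
)
```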
@@ -326,7 +349,7 @@ def fit( |
326 | 349 | images: List[str] = None, |
327 | 350 | y: Union[List[int], np.ndarray] = None, |
328 | 351 | ): |
329 | | - """Fit the models (Bert, UMAP, and, HDBSCAN) on a collection of documents and generate topics. |
| 352 | + """Fit the models on a collection of documents and generate topics. |
330 | 353 |
|
331 | 354 | Arguments: |
332 | 355 | documents: A list of documents to fit on |
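
A usage sketch of `fit`, assuming `docs` is a `List[str]` and `embeddings` is a precomputed `np.ndarray` with one row per document; passing embeddings skips the encoding step:

```python
from bertopic import BERTopic

# docs: List[str], assumed to be in scope.
topic_model = BERTopic().fit(docs)

# With precomputed embeddings (one row per document), the embedding
# model is not invoked during fitting.
topic_model = BERTopic().fit(docs, embeddings=embeddings)
```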
@@ -684,9 +707,7 @@ def partial_fit( |
684 | 707 | # Checks |
685 | 708 | check_embeddings_shape(embeddings, documents) |
686 | 709 | if not hasattr(self.hdbscan_model, "partial_fit"): |
687 | | - raise ValueError( |
688 | | - "In order to use `.partial_fit`, the cluster model should have " "a `.partial_fit` function." |
689 | | - ) |
| 710 | + raise ValueError("In order to use `.partial_fit`, the cluster model should have a `.partial_fit` function.") |
690 | 711 |
|
691 | 712 | # Prepare documents |
692 | 713 | if isinstance(documents, str): |
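
The check above means online learning needs sub-models that themselves implement `partial_fit`. A sketch under that assumption, using scikit-learn's `IncrementalPCA` and `MiniBatchKMeans` (both expose `partial_fit`, and `MiniBatchKMeans` sets `.labels_`); `doc_batches` is a hypothetical iterable of document lists:

```python
from bertopic import BERTopic
from bertopic.vectorizers import OnlineCountVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA

topic_model = BERTopic(
    umap_model=IncrementalPCA(n_components=5),
    hdbscan_model=MiniBatchKMeans(n_clusters=50, random_state=0),
    vectorizer_model=OnlineCountVectorizer(stop_words="english"),
)

for batch in doc_batches:  # doc_batches: iterable of List[str] (assumed)
    topic_model.partial_fit(batch)
```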
@@ -1524,7 +1545,7 @@ def update_topics( |
1524 | 1545 |
|
1525 | 1546 | if top_n_words > 100: |
1526 | 1547 | logger.warning( |
1527 | | - "Note that extracting more than 100 words from a sparse " "can slow down computation quite a bit." |
| 1548 | +                "Note that extracting more than 100 words from a sparse matrix can slow down computation quite a bit."
1528 | 1549 | ) |
1529 | 1550 | self.top_n_words = top_n_words |
1530 | 1551 | self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range) |
@@ -2007,7 +2028,7 @@ def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) -> |
2007 | 2028 | custom_labels = topic_labels |
2008 | 2029 | else: |
2009 | 2030 | raise ValueError( |
2010 | | - "Make sure that `topic_labels` contains the same number " "of labels as there are topics." |
| 2031 | + "Make sure that `topic_labels` contains the same number of labels as there are topics." |
2011 | 2032 | ) |
2012 | 2033 |
|
2013 | 2034 | self.custom_labels_ = custom_labels |
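
For reference, a short usage sketch of the two accepted shapes: a dict relabels only the listed topic ids, while a list must contain exactly one label per topic (here derived from `topic_labels_`, assumed to reflect the current topics):

```python
# Relabel selected topics by id; unlisted topics keep their labels.
topic_model.set_topic_labels({0: "Sports", 1: "Politics"})

# Or supply one label per topic; the length must match the number of topics.
labels = [f"Topic {i}" for i in range(len(topic_model.topic_labels_))]
topic_model.set_topic_labels(labels)
```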
@@ -2124,9 +2145,7 @@ def merge_topics( |
2124 | 2145 | for topic in topic_group: |
2125 | 2146 | mapping[topic] = topic_group[0] |
2126 | 2147 | else: |
2127 | | - raise ValueError( |
2128 | | - "Make sure that `topics_to_merge` is either" "a list of topics or a list of list of topics." |
2129 | | - ) |
| 2148 | +            raise ValueError("Make sure that `topics_to_merge` is either a list of topics or a list of lists of topics.")
2130 | 2149 |
|
2131 | 2150 | # Track mappings and sizes of topics for merging topic embeddings |
2132 | 2151 | mappings = defaultdict(list) |
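
A usage sketch of the two accepted shapes for `topics_to_merge`, assuming `docs` is the original list of documents:

```python
# A flat list merges all listed topics into a single topic.
topic_model.merge_topics(docs, topics_to_merge=[1, 2, 3])

# A list of lists performs several independent merges; within each group,
# the first topic id is kept.
topic_model.merge_topics(docs, topics_to_merge=[[1, 2], [3, 4]])
```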
@@ -3769,7 +3788,7 @@ def _cluster_embeddings( |
3769 | 3788 | partial_fit: bool = False, |
3770 | 3789 | y: np.ndarray = None, |
3771 | 3790 | ) -> Tuple[pd.DataFrame, np.ndarray]: |
3772 | | - """Cluster UMAP embeddings with HDBSCAN. |
| 3791 | +        """Cluster UMAP-reduced embeddings with HDBSCAN.
3773 | 3792 |
|
3774 | 3793 | Arguments: |
3775 | 3794 | umap_embeddings: The reduced sentence embeddings with UMAP |
@@ -4473,12 +4492,18 @@ def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) |
4473 | 4492 | self.c_tf_idf_, self.topic_embeddings_, use_ctfidf, output_ndarray=True |
4474 | 4493 | )[0] |
4475 | 4494 | norm_data = normalize(embeddings, norm="l2") |
4476 | | - predictions = hdbscan.HDBSCAN( |
4477 | | - min_cluster_size=2, |
4478 | | - metric="euclidean", |
4479 | | - cluster_selection_method="eom", |
4480 | | - prediction_data=True, |
4481 | | - ).fit_predict(norm_data[self._outliers :]) |
| 4495 | + |
| 4496 | + if HAS_HDBSCAN: |
| 4497 | + predictions = HDBSCAN( |
| 4498 | + min_cluster_size=2, |
| 4499 | + metric="euclidean", |
| 4500 | + cluster_selection_method="eom", |
| 4501 | + prediction_data=True, |
| 4502 | + ).fit_predict(norm_data[self._outliers :]) |
| 4503 | + else: |
| 4504 | + predictions = SK_HDBSCAN( |
| 4505 | + min_cluster_size=2, metric="euclidean", cluster_selection_method="eom", n_jobs=-1 |
| 4506 | + ).fit_predict(norm_data[self._outliers :]) |
4482 | 4507 |
|
4483 | 4508 | # Map similar topics |
4484 | 4509 | mapped_topics = { |
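
Both branches cluster the L2-normalized topic embeddings with `metric="euclidean"`. That is sound because on unit vectors squared euclidean distance is a monotone transform of cosine similarity, ||u − v||² = 2(1 − cos(u, v)); a quick numeric check of the identity:

```python
import numpy as np
from sklearn.preprocessing import normalize

rng = np.random.default_rng(0)
u, v = normalize(rng.normal(size=(2, 5)), norm="l2")  # two unit vectors
cos = float(u @ v)

# On the unit sphere: ||u - v||^2 == 2 * (1 - cos(u, v))
assert np.isclose(np.sum((u - v) ** 2), 2 * (1 - cos))
```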
|