@@ -78,6 +78,7 @@ def __init__(self,
7878 nr_topics : Union [int , str ] = None ,
7979 low_memory : bool = False ,
8080 calculate_probabilities : bool = False ,
81+ diversity : float = None ,
8182 seed_topic_list : List [List [str ]] = None ,
8283 embedding_model = None ,
8384 umap_model : UMAP = None ,
@@ -105,8 +106,7 @@ def __init__(self,
105106 number of topics to the value specified. This reduction can take
106107 a while as each reduction in topics (-1) activates a c-TF-IDF
107108 calculation. If this is set to None, no reduction is applied. Use
108- "auto" to automatically reduce topics that have a similarity of at
109- least 0.9, do not maps all others.
109+ "auto" to automatically reduce topics using HDBSCAN.
110110 low_memory: Sets UMAP low memory to True to make sure less memory is used.
111111 calculate_probabilities: Whether to calculate the probabilities of all topics
112112 per document instead of the probability of the assigned
@@ -116,6 +116,9 @@ def __init__(self,
116116 you do not mind more computation time.
117117 NOTE: If false you cannot use the corresponding
118118 visualization method `visualize_probabilities`.
119+ diversity: Whether to use MMR to diversify the resulting topic representations.
120+ If set to None, MMR will not be used. Accepted values lie between
121+ 0 and 1 with 0 being not at all diverse and 1 being very diverse.
119122 seed_topic_list: A list of seed words per topic to converge around
120123 verbose: Changes the verbosity of the model. Set to True if you want
121124 to track the stages of the model.
@@ -141,6 +144,7 @@ def __init__(self,
141144 self .nr_topics = nr_topics
142145 self .low_memory = low_memory
143146 self .calculate_probabilities = calculate_probabilities
147+ self .diversity = diversity
144148 self .verbose = verbose
145149 self .seed_topic_list = seed_topic_list
146150
@@ -370,10 +374,14 @@ def transform(self,
370374 verbose = self .verbose )
371375
372376 umap_embeddings = self .umap_model .transform (embeddings )
377+ logger .info ("Reduced dimensionality with UMAP" )
378+
373379 predictions , probabilities = hdbscan .approximate_predict (self .hdbscan_model , umap_embeddings )
380+ logger .info ("Predicted clusters with HDBSCAN" )
374381
375382 if self .calculate_probabilities :
376383 probabilities = hdbscan .membership_vector (self .hdbscan_model , umap_embeddings )
384+ logger .info ("Calculated probabilities with HDBSCAN" )
377385 else :
378386 probabilities = None
379387
@@ -476,7 +484,7 @@ def topics_over_time(self,
476484 selection = documents .loc [documents .Timestamps == timestamp , :]
477485 documents_per_topic = selection .groupby (['Topic' ], as_index = False ).agg ({'Document' : ' ' .join ,
478486 "Timestamps" : "count" })
479- c_tf_idf , words = self ._c_tf_idf (documents_per_topic , m = len ( selection ), fit = False )
487+ c_tf_idf , words = self ._c_tf_idf (documents_per_topic , fit = False )
480488
481489 if global_tuning or evolution_tuning :
482490 c_tf_idf = normalize (c_tf_idf , axis = 1 , norm = 'l1' , copy = False )
@@ -569,7 +577,7 @@ def topics_per_class(self,
569577 selection = documents .loc [documents .Class == class_ , :]
570578 documents_per_topic = selection .groupby (['Topic' ], as_index = False ).agg ({'Document' : ' ' .join ,
571579 "Class" : "count" })
572- c_tf_idf , words = self ._c_tf_idf (documents_per_topic , m = len ( selection ), fit = False )
580+ c_tf_idf , words = self ._c_tf_idf (documents_per_topic , fit = False )
573581
574582 # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation
575583 # by simply taking the average of the two
@@ -1107,8 +1115,8 @@ def visualize_hierarchy(self,
11071115 Either 'left' or 'bottom'
11081116 topics: A selection of topics to visualize
11091117 top_n_topics: Only select the top n most frequent topics
1110- width: The width of the figure.
1111- height: The height of the figure.
1118+ width: The width of the figure. Only works if orientation is set to 'left'.
1119+ height: The height of the figure. Only works if orientation is set to 'bottom'.
11121120
11131121 Returns:
11141122 fig: A plotly figure
@@ -1185,18 +1193,18 @@ def visualize_heatmap(self,
11851193
11861194 def visualize_barchart (self ,
11871195 topics : List [int ] = None ,
1188- top_n_topics : int = 6 ,
1196+ top_n_topics : int = 8 ,
11891197 n_words : int = 5 ,
1190- width : int = 800 ,
1191- height : int = 600 ) -> go .Figure :
1198+ width : int = 250 ,
1199+ height : int = 250 ) -> go .Figure :
11921200 """ Visualize a barchart of selected topics
11931201
11941202 Arguments:
11951203 topics: A selection of topics to visualize.
11961204 top_n_topics: Only select the top n most frequent topics.
11971205 n_words: Number of words to show in a topic
1198- width: The width of the figure.
1199- height: The height of the figure.
1206+ width: The width of each figure.
1207+ height: The height of each figure.
12001208
12011209 Returns:
12021210 fig: A plotly figure
@@ -1447,7 +1455,7 @@ def _extract_topics(self, documents: pd.DataFrame):
14471455 c_tf_idf: The resulting matrix giving a value (importance score) for each word per topic
14481456 """
14491457 documents_per_topic = documents .groupby (['Topic' ], as_index = False ).agg ({'Document' : ' ' .join })
1450- self .c_tf_idf , words = self ._c_tf_idf (documents_per_topic , m = len ( documents ) )
1458+ self .c_tf_idf , words = self ._c_tf_idf (documents_per_topic )
14511459 self .topics = self ._extract_words_per_topic (words )
14521460 self ._create_topic_vectors ()
14531461 self .topic_names = {key : f"{ key } _" + "_" .join ([word [0 ] for word in values [:4 ]])
@@ -1553,7 +1561,7 @@ def _create_topic_vectors(self):
15531561
15541562 self .topic_embeddings = topic_embeddings
15551563
1556- def _c_tf_idf (self , documents_per_topic : pd .DataFrame , m : int , fit : bool = True ) -> Tuple [csr_matrix , List [str ]]:
1564+ def _c_tf_idf (self , documents_per_topic : pd .DataFrame , fit : bool = True ) -> Tuple [csr_matrix , List [str ]]:
15571565 """ Calculate a class-based TF-IDF.
15581566
15591567 Arguments:
@@ -1581,7 +1589,7 @@ def _c_tf_idf(self, documents_per_topic: pd.DataFrame, m: int, fit: bool = True)
15811589 multiplier = None
15821590
15831591 if fit :
1584- self .transformer = ClassTFIDF ().fit (X , n_samples = m , multiplier = multiplier )
1592+ self .transformer = ClassTFIDF ().fit (X , multiplier = multiplier )
15851593
15861594 c_tf_idf = self .transformer .transform (X )
15871595
@@ -1641,19 +1649,20 @@ def _extract_words_per_topic(self,
16411649
16421650 # Extract word embeddings for the top 30 words per topic and compare it
16431651 # with the topic embedding to keep only the words most similar to the topic embedding
1644- if self .embedding_model is not None :
1652+ if self .diversity is not None :
1653+ if self .embedding_model is not None :
16451654
1646- for topic , topic_words in topics .items ():
1647- words = [word [0 ] for word in topic_words ]
1648- word_embeddings = self ._extract_embeddings (words ,
1649- method = "word" ,
1650- verbose = False )
1651- topic_embedding = self ._extract_embeddings (" " .join (words ),
1652- method = "word" ,
1653- verbose = False ).reshape (1 , - 1 )
1654- topic_words = mmr (topic_embedding , word_embeddings , words ,
1655- top_n = self .top_n_words , diversity = 0 )
1656- topics [topic ] = [(word , value ) for word , value in topics [topic ] if word in topic_words ]
1655+ for topic , topic_words in topics .items ():
1656+ words = [word [0 ] for word in topic_words ]
1657+ word_embeddings = self ._extract_embeddings (words ,
1658+ method = "word" ,
1659+ verbose = False )
1660+ topic_embedding = self ._extract_embeddings (" " .join (words ),
1661+ method = "word" ,
1662+ verbose = False ).reshape (1 , - 1 )
1663+ topic_words = mmr (topic_embedding , word_embeddings , words ,
1664+ top_n = self .top_n_words , diversity = self . diversity )
1665+ topics [topic ] = [(word , value ) for word , value in topics [topic ] if word in topic_words ]
16571666 topics = {label : values [:self .top_n_words ] for label , values in topics .items ()}
16581667
16591668 return topics
@@ -1694,10 +1703,7 @@ def _reduce_to_n_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
16941703 self .merged_topics = []
16951704
16961705 # Create topic similarity matrix
1697- if self .topic_embeddings is not None :
1698- similarities = cosine_similarity (np .array (self .topic_embeddings ))
1699- else :
1700- similarities = cosine_similarity (self .c_tf_idf )
1706+ similarities = cosine_similarity (self .c_tf_idf )
17011707 np .fill_diagonal (similarities , 0 )
17021708
17031709 # Find most similar topic to least common topic
0 commit comments