@@ -856,8 +856,9 @@ def reduce_topics(self,
         documents = pd.DataFrame({"Document": docs, "Topic": topics})

         # Reduce number of topics
-        self._extract_topics(documents)
         documents = self._reduce_topics(documents)
+        self.merged_topics = None
+        self._map_representative_docs()

         # Extract topics and map probabilities
         new_topics = documents.Topic.to_list()
@@ -960,6 +961,7 @@ def visualize_topics_over_time(self,
                                    topics_over_time: pd.DataFrame,
                                    top_n_topics: int = None,
                                    topics: List[int] = None,
+                                   normalize_frequency: bool = False,
                                    width: int = 1250,
                                    height: int = 450) -> go.Figure:
         """ Visualize topics over time
965967 """ Visualize topics over time
@@ -969,6 +971,7 @@ def visualize_topics_over_time(self,
                 corresponding topic representation
             top_n_topics: To visualize the most frequent topics instead of all
             topics: Select which topics you would like to be visualized
+            normalize_frequency: Whether to normalize each topic's frequency individually
             width: The width of the figure.
             height: The height of the figure.

@@ -996,13 +999,15 @@ def visualize_topics_over_time(self,
                                                     topics_over_time=topics_over_time,
                                                     top_n_topics=top_n_topics,
                                                     topics=topics,
+                                                    normalize_frequency=normalize_frequency,
                                                     width=width,
                                                     height=height)

     def visualize_topics_per_class(self,
                                    topics_per_class: pd.DataFrame,
                                    top_n_topics: int = 10,
                                    topics: List[int] = None,
+                                   normalize_frequency: bool = False,
                                    width: int = 1250,
                                    height: int = 900) -> go.Figure:
         """ Visualize topics per class
@@ -1012,6 +1017,7 @@ def visualize_topics_per_class(self,
                 corresponding topic representation
             top_n_topics: To visualize the most frequent topics instead of all
             topics: Select which topics you would like to be visualized
+            normalize_frequency: Whether to normalize each topic's frequency individually
             width: The width of the figure.
             height: The height of the figure.

@@ -1039,6 +1045,7 @@ def visualize_topics_per_class(self,
                                                     topics_per_class=topics_per_class,
                                                     top_n_topics=top_n_topics,
                                                     topics=topics,
+                                                    normalize_frequency=normalize_frequency,
                                                     width=width,
                                                     height=height)

@@ -1491,7 +1498,7 @@ def _map_representative_docs(self):
         representative_docs = self.representative_docs.copy()

         # Remove topics that were merged as the most frequent
-        # topic or the topics they were merged into contain as they contain
+        # topic or the topics they were merged into as they contain
         # better representative documents
         if self.merged_topics:
             for topic_to_remove in self.merged_topics:
@@ -1742,7 +1749,7 @@ def _auto_reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
         if self.topic_embeddings is not None:
             embeddings = np.array(self.topic_embeddings)
         else:
-            embeddings = self.c_tf_idf
+            embeddings = self.c_tf_idf.toarray()
         norm_data = normalize(embeddings, norm='l2')
         predictions = hdbscan.HDBSCAN(min_cluster_size=2,
                                       metric='euclidean',
@@ -1828,13 +1835,14 @@ def _map_probabilities(self, probabilities: Union[np.ndarray, None]) -> Union[np
             mapped_probabilities: Updated probabilities
         """
         # Map array of probabilities (probability for assigned topic per document)
-        if len(probabilities.shape) == 2 and self.get_topic(-1):
-            mapped_probabilities = np.zeros((probabilities.shape[0],
-                                             len(set(self.mapped_topics.values())) - 1))
-            for from_topic, to_topic in self.mapped_topics.items():
-                if to_topic != -1 and from_topic != -1:
-                    mapped_probabilities[:, to_topic] += probabilities[:, from_topic]
-            return mapped_probabilities
+        if probabilities is not None:
+            if len(probabilities.shape) == 2 and self.get_topic(-1):
+                mapped_probabilities = np.zeros((probabilities.shape[0],
+                                                 len(set(self.mapped_topics.values())) - 1))
+                for from_topic, to_topic in self.mapped_topics.items():
+                    if to_topic != -1 and from_topic != -1:
+                        mapped_probabilities[:, to_topic] += probabilities[:, from_topic]
+                return mapped_probabilities

         return probabilities
0 commit comments