1- __all__ = [
2- 'get_topics_dist' , 'get_topics_scatter' , 'get_top_topic_words' ]
3- from typing import Union , List
1+ __all__ = ["get_topics_dist" , "get_topics_scatter" , "get_top_topic_words" ]
2+ from typing import Optional , Union , List
43from itertools import combinations
5- from pandas import DataFrame
4+ from pandas import DataFrame , Index
65import numpy as np
76from scipy .special import kl_div
87from scipy .spatial import distance
98from sklearn .manifold import (
10- TSNE , Isomap , LocallyLinearEmbedding , MDS , SpectralEmbedding )
9+ TSNE ,
10+ Isomap ,
11+ LocallyLinearEmbedding ,
12+ MDS ,
13+ SpectralEmbedding ,
14+ )
1115from ._helpers import calc_topics_marg_probs
1216
1317
@@ -28,15 +32,14 @@ def _dist_jsd(a1: np.ndarray, a2: np.ndarray):
2832
2933def _dist_jef (a1 : np .ndarray , a2 : np .ndarray ):
3034 vals = (a1 - a2 ) * (np .log (a1 ) - np .log (a2 ))
31- vals [(vals <= 0 ) | ~ np .isfinite (vals )] = 0.
35+ vals [(vals <= 0 ) | ~ np .isfinite (vals )] = 0.0
3236 return vals .sum ()
3337
3438
3539def _dist_hel (a1 : np .ndarray , a2 : np .ndarray ):
3640 a1 [(a1 <= 0 ) | ~ np .isfinite (a1 )] = 1e-64
3741 a2 [(a2 <= 0 ) | ~ np .isfinite (a2 )] = 1e-64
38- hel_val = distance .euclidean (
39- np .sqrt (a1 ), np .sqrt (a2 )) / np .sqrt (2 )
42+ hel_val = distance .euclidean (np .sqrt (a1 ), np .sqrt (a2 )) / np .sqrt (2 )
4043 return hel_val
4144
4245
@@ -52,19 +55,18 @@ def _dist_tv(a1: np.ndarray, a2: np.ndarray):
5255 return dist
5356
5457
55- def _dist_jac (a1 : np .ndarray , a2 : np .ndarray , top_words = 100 ):
56- a = np .argsort (a1 )[:- top_words - 1 : - 1 ]
57- b = np .argsort (a2 )[:- top_words - 1 : - 1 ]
58+ def _dist_jac (a1 : np .ndarray , a2 : np .ndarray , top_words = 100 ):
59+ a = np .argsort (a1 )[: - top_words - 1 : - 1 ]
60+ b = np .argsort (a2 )[: - top_words - 1 : - 1 ]
5861 j_num = np .intersect1d (a , b , assume_unique = False ).size
5962 j_den = np .union1d (a , b ).size
6063 jac_val = 1 - j_num / j_den
6164 return jac_val
6265
6366
6467def get_topics_dist (
65- phi : Union [np .ndarray , DataFrame ],
66- method : str = "sklb" ,
67- ** kwargs ) -> np .ndarray :
68+ phi : Union [np .ndarray , DataFrame ], method : str = "sklb" , ** kwargs
69+ ) -> np .ndarray :
6870 """Finding closest topics in models.
6971
7072 Parameters
@@ -110,16 +112,18 @@ def get_topics_dist(
110112 for i , j in topics_pairs :
111113 _dist_func = dist_funcs .get (method , "sklb" )
112114 topics_dists [((i , j ), (j , i ))] = _dist_func (
113- phi_copy [:, i ], phi_copy [:, j ], ** kwargs )
115+ phi_copy [:, i ], phi_copy [:, j ], ** kwargs
116+ )
114117
115118 return topics_dists
116119
117120
118121def get_topics_scatter (
119- topic_dists : np .ndarray ,
120- theta : np .ndarray ,
121- method : str = 'tsne' ,
122- method_kws : dict = None ) -> DataFrame :
122+ topic_dists : np .ndarray ,
123+ theta : np .ndarray ,
124+ method : str = "tsne" ,
125+ method_kws : Optional [dict ] = None ,
126+ ) -> DataFrame :
123127 """Calculate topics coordinates for a scatter plot.
124128
125129 Parameters
@@ -146,52 +150,52 @@ def get_topics_scatter(
146150 Topics scatter coordinates.
147151 """
148152 if not method_kws :
149- method_kws = {' n_components' : 2 }
153+ method_kws = {" n_components" : 2 }
150154
151- if method == 'tsne' :
152- method_kws .setdefault ('init' , 'pca' )
153- method_kws .setdefault ('learning_rate' , 'auto' )
154- method_kws .setdefault (
155- 'perplexity' , min (50 , max (topic_dists .shape [0 ] // 2 , 1 )))
155+ if method == "tsne" :
156+ method_kws .setdefault ("init" , "pca" )
157+ method_kws .setdefault ("learning_rate" , "auto" )
158+ method_kws .setdefault ("perplexity" , min (50 , max (topic_dists .shape [0 ] // 2 , 1 )))
156159 transformer = TSNE (** method_kws )
157160
158- elif method == ' sem' :
159- method_kws .setdefault (' affinity' , ' precomputed' )
161+ elif method == " sem" :
162+ method_kws .setdefault (" affinity" , " precomputed" )
160163 transformer = SpectralEmbedding (** method_kws )
161164
162- elif method == ' mds' :
163- method_kws .setdefault (' dissimilarity' , ' precomputed' )
164- method_kws .setdefault (' normalized_stress' , ' auto' )
165+ elif method == " mds" :
166+ method_kws .setdefault (" dissimilarity" , " precomputed" )
167+ method_kws .setdefault (" normalized_stress" , " auto" )
165168 transformer = MDS (** method_kws )
166169
167- elif method == ' lle' :
168- method_kws [' method' ] = ' standard'
170+ elif method == " lle" :
171+ method_kws [" method" ] = " standard"
169172 transformer = LocallyLinearEmbedding (** method_kws )
170173
171- elif method == ' ltsa' :
172- method_kws [' method' ] = ' ltsa'
174+ elif method == " ltsa" :
175+ method_kws [" method" ] = " ltsa"
173176 transformer = LocallyLinearEmbedding (** method_kws )
174177
175- elif method == ' isomap' :
178+ elif method == " isomap" :
176179 transformer = Isomap (** method_kws )
177180
178181 coords = transformer .fit_transform (topic_dists )
179182
180- topics_xy = DataFrame (coords , columns = [ 'x' , 'y' ] )
181- topics_xy [' topic' ] = topics_xy .index .astype (int )
182- topics_xy [' size' ] = calc_topics_marg_probs (theta )
183- size_sum = topics_xy [' size' ].sum ()
183+ topics_xy = DataFrame (coords , columns = Index ([ "x" , "y" ]) )
184+ topics_xy [" topic" ] = topics_xy .index .astype (int )
185+ topics_xy [" size" ] = calc_topics_marg_probs (theta )
186+ size_sum = topics_xy [" size" ].sum ()
184187 if size_sum > 0 :
185- topics_xy [' size' ] *= ( 100 / topics_xy [' size' ].sum () )
188+ topics_xy [" size" ] *= 100 / topics_xy [" size" ].sum ()
186189 else :
187- topics_xy [' size' ] = np .nan
190+ topics_xy [" size" ] = np .nan
188191 return topics_xy
189192
190193
def get_top_topic_words(
    phi: DataFrame,
    words_num: int = 20,
    topics_idx: Optional[Union[List[int], np.ndarray]] = None,
) -> DataFrame:
    """Select top topic words from a fitted model.

    Parameters
    ----------
    phi : DataFrame
        Words vs topics matrix: the index holds the words and every column
        holds the values of one topic.
    words_num : int, optional
        Number of top words to select for each topic.
    topics_idx : Union[List[int], np.ndarray], optional
        Column labels of the topics to process. If None or empty, all
        columns of ``phi`` are used.

    Returns
    -------
    DataFrame
        Words with highest probabilities in all (or selected) topics.
    """
    # Explicit None/empty check: `topics_idx or phi.columns` raises for a
    # multi-element ndarray (ambiguous truth value) and wrongly falls back
    # to all columns for a one-element array such as np.array([0]).
    if topics_idx is None or len(topics_idx) == 0:
        columns = phi.columns
    else:
        columns = topics_idx

    return phi.loc[:, columns].apply(
        lambda x: x.sort_values(ascending=False).head(words_num).index, axis=0
    )
0 commit comments