@@ -13,9 +13,11 @@ def _label_percentage(data_frame):
     :return: label_percentage_dict: dictionary maps label : % of labels
     """
     total_examples = len(data_frame)
-    label_frequency_dict = dict(Counter(data_frame['intent']).most_common())
-    percentage_list = np.array(list(label_frequency_dict.values()))/total_examples
-    label_percentage_dict = dict(zip(list(label_frequency_dict.keys()), percentage_list))
+    label_frequency_dict = dict(Counter(data_frame["intent"]).most_common())
+    percentage_list = np.array(list(label_frequency_dict.values())) / total_examples
+    label_percentage_dict = dict(
+        zip(list(label_frequency_dict.keys()), percentage_list)
+    )
     return label_percentage_dict


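For readers skimming the diff, a minimal sketch of what `_label_percentage` computes; the toy `intent` values are invented for illustration:

```python
from collections import Counter

import numpy as np
import pandas as pd

# Toy data frame shaped like the ones this module expects.
df = pd.DataFrame({"intent": ["greet", "greet", "greet", "bye"]})

freq = dict(Counter(df["intent"]).most_common())  # {'greet': 3, 'bye': 1}
pct = np.array(list(freq.values())) / len(df)     # [0.75, 0.25]
print(dict(zip(freq.keys(), pct)))                # {'greet': 0.75, 'bye': 0.25}
```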
@@ -26,15 +28,17 @@ def _train_test_coloring(val):
     :return:
     """
     if val > 25:
-        color = 'red'
+        color = "red"
     elif val > 10:
-        color = 'DarkBlue'
+        color = "DarkBlue"
     else:
-        color = 'green'
-    return 'color: %s' % color
+        color = "green"
+    return "color: %s" % color


-def _train_test_label_difference(workspace_label_percentage_dict, test_label_percentage_dict):
+def _train_test_label_difference(
+    workspace_label_percentage_dict, test_label_percentage_dict
+):
     """
     analyze the difference between training set and test set
     :param workspace_label_percentage_dict:
@@ -66,9 +70,11 @@ def _train_test_label_difference(workspace_label_percentage_dict, test_label_per
         current_difference = np.abs(test_percentage - workspace_percentage)

         if key in test_label_percentage_dict:
-            difference_dict[key] = [workspace_percentage * 100,
-                                    test_percentage * 100,
-                                    current_difference * 100]
+            difference_dict[key] = [
+                workspace_percentage * 100,
+                test_percentage * 100,
+                current_difference * 100,
+            ]

     js_distance = distance.jensenshannon(distribution1, distribution2, 2.0)

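The `js_distance` context line above calls SciPy's Jensen-Shannon distance with base 2, which bounds the result to [0, 1]; a quick sketch with made-up distributions:

```python
import numpy as np
from scipy.spatial import distance

# Two label distributions over the same three intents; each sums to 1.
train_dist = np.array([0.5, 0.3, 0.2])
test_dist = np.array([0.2, 0.3, 0.5])

# 0.0 for identical distributions, 1.0 for maximally divergent ones.
print(distance.jensenshannon(train_dist, test_dist, 2.0))
```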
@@ -86,8 +92,8 @@ def _train_test_vocab_difference(train_set_pd, test_set_pd):
     """
     train_vocab = set()
     test_vocab = set()
-    train_set_tokens = train_set_pd['utterance'].apply(word_tokenize)
-    test_set_tokens = test_set_pd['utterance'].apply(word_tokenize)
+    train_set_tokens = train_set_pd["utterance"].apply(word_tokenize)
+    test_set_tokens = test_set_pd["utterance"].apply(word_tokenize)

     for tokens in train_set_tokens.tolist():
         train_vocab.update(tokens)
@@ -107,24 +113,26 @@ def _train_test_utterance_length_difference(train_set_pd, test_set_pd):
         train_test_length_comparison: pandas dataframe [Intent, Absolute Difference]
     """
     train_pd_temp = train_set_pd.copy()
-    train_pd_temp['tokens'] = train_set_pd['utterance'].apply(word_tokenize)
-    train_pd_temp['Train'] = train_pd_temp['tokens'].apply(len)
-    train_avg_len_by_label = train_pd_temp[['intent', 'Train']].groupby('intent').mean()
+    train_pd_temp["tokens"] = train_set_pd["utterance"].apply(word_tokenize)
+    train_pd_temp["Train"] = train_pd_temp["tokens"].apply(len)
+    train_avg_len_by_label = train_pd_temp[["intent", "Train"]].groupby("intent").mean()

     test_pd_temp = test_set_pd.copy()
-    test_pd_temp['tokens'] = test_set_pd['utterance'].apply(word_tokenize)
-    test_pd_temp['Test'] = test_pd_temp['tokens'].apply(len)
-    test_avg_len_by_label = test_pd_temp[['intent', 'Test']].groupby('intent').mean()
-
-    train_test_length_comparison = pd.merge(train_avg_len_by_label,
-                                            test_avg_len_by_label, on='intent')
-    train_test_length_comparison['Absolute Difference'] = \
-        np.abs(train_test_length_comparison['Train'] - train_test_length_comparison['Test'])
+    test_pd_temp["tokens"] = test_set_pd["utterance"].apply(word_tokenize)
+    test_pd_temp["Test"] = test_pd_temp["tokens"].apply(len)
+    test_avg_len_by_label = test_pd_temp[["intent", "Test"]].groupby("intent").mean()
+
+    train_test_length_comparison = pd.merge(
+        train_avg_len_by_label, test_avg_len_by_label, on="intent"
+    )
+    train_test_length_comparison["Absolute Difference"] = np.abs(
+        train_test_length_comparison["Train"] - train_test_length_comparison["Test"]
+    )
     train_test_length_comparison = train_test_length_comparison.sort_values(
-        by=["Absolute Difference"], ascending=False)
+        by=["Absolute Difference"], ascending=False
+    )
     train_test_length_comparison = train_test_length_comparison.reset_index()
-    train_test_length_comparison.rename(columns={'intent': 'Intent'
-                                                 }, inplace=True)
+    train_test_length_comparison.rename(columns={"intent": "Intent"}, inplace=True)
     return train_test_length_comparison


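A condensed sketch of the per-intent length comparison this hunk reformats, with invented utterances (NLTK's `word_tokenize` needs the `punkt` data downloaded):

```python
import numpy as np
import pandas as pd
from nltk import word_tokenize  # requires nltk.download("punkt")

train = pd.DataFrame({"intent": ["greet", "greet"], "utterance": ["hello there", "hi"]})
test = pd.DataFrame({"intent": ["greet"], "utterance": ["good morning to you"]})

train["Train"] = train["utterance"].apply(word_tokenize).apply(len)
test["Test"] = test["utterance"].apply(word_tokenize).apply(len)

# Mean token count per intent, aligned train vs test on the intent key.
comparison = pd.merge(
    train.groupby("intent").mean(numeric_only=True),
    test.groupby("intent").mean(numeric_only=True),
    on="intent",
)
comparison["Absolute Difference"] = np.abs(comparison["Train"] - comparison["Test"])
print(comparison)  # Train 1.5, Test 4.0, Absolute Difference 2.5
```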
@@ -137,8 +145,8 @@ def _get_metrics(results):
         recall_dict: maps the {intent: recall}
         f1_dict: maps the {intent:f1}
     """
-    groundtruth = results['correct_intent'].values.tolist()
-    top_intent = results['top_intent'].values.tolist()
+    groundtruth = results["correct_intent"].values.tolist()
+    top_intent = results["top_intent"].values.tolist()
     gt_cnt_dict = dict()
     pred_cnt_dict = dict()
     true_positive_dict = dict()
@@ -152,13 +160,22 @@ def _get_metrics(results):
     f1_dict = dict()
     for lb in true_positive_dict:

-        recall_dict[lb] = true_positive_dict[lb] / gt_cnt_dict[lb] if lb in gt_cnt_dict else 0
-
-        precision_dict[lb] = true_positive_dict[lb] / pred_cnt_dict[lb] if lb in pred_cnt_dict \
-            else 0
-
-        f1_dict[lb] = 0.0 if recall_dict[lb] == 0 and precision_dict[lb] == 0 \
-            else 2.0 * recall_dict[lb] * precision_dict[lb] / (recall_dict[lb] + precision_dict[lb])
+        recall_dict[lb] = (
+            true_positive_dict[lb] / gt_cnt_dict[lb] if lb in gt_cnt_dict else 0
+        )
+
+        precision_dict[lb] = (
+            true_positive_dict[lb] / pred_cnt_dict[lb] if lb in pred_cnt_dict else 0
+        )
+
+        f1_dict[lb] = (
+            0.0
+            if recall_dict[lb] == 0 and precision_dict[lb] == 0
+            else 2.0
+            * recall_dict[lb]
+            * precision_dict[lb]
+            / (recall_dict[lb] + precision_dict[lb])
+        )
     return precision_dict, recall_dict, f1_dict


164181
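The `_get_metrics` rewrite above is pure formatting; the underlying arithmetic is standard per-intent precision, recall, and F1. A self-contained sketch with invented counts:

```python
# Hypothetical counts: ground-truth occurrences, predictions, and matches.
gt_cnt = {"greet": 10, "bye": 5}
pred_cnt = {"greet": 8, "bye": 7}
true_positive = {"greet": 6, "bye": 5}

for lb in true_positive:
    recall = true_positive[lb] / gt_cnt[lb] if lb in gt_cnt else 0
    precision = true_positive[lb] / pred_cnt[lb] if lb in pred_cnt else 0
    # Harmonic mean of precision and recall, guarded against 0/0.
    f1 = (
        0.0
        if recall == 0 and precision == 0
        else 2.0 * recall * precision / (recall + precision)
    )
    print(lb, round(precision, 2), round(recall, 2), round(f1, 2))
```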
@@ -172,12 +189,14 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results):
     workspace_label_percentage_dict = _label_percentage(train_set_pd)
     test_label_percentage_dict = _label_percentage(test_set_pd)

-    missing_label, difference_dict, js = \
-        _train_test_label_difference(workspace_label_percentage_dict, test_label_percentage_dict)
+    missing_label, difference_dict, js = _train_test_label_difference(
+        workspace_label_percentage_dict, test_label_percentage_dict
+    )
     train_vocab, test_vocab = _train_test_vocab_difference(train_set_pd, test_set_pd)

-    train_test_length_comparison_pd = \
-        _train_test_utterance_length_difference(train_set_pd, test_set_pd)
+    train_test_length_comparison_pd = _train_test_utterance_length_difference(
+        train_set_pd, test_set_pd
+    )

     display(Markdown("## Test Data Evaluation"))

@@ -186,35 +205,43 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results):
     label = list(difference_dict.keys())
     diff = np.round(list(difference_dict.values()), 2)
     precision_dict, recall_dict, f1_dict = _get_metrics(results)
-    precision = np.round([precision_dict[l]*100.0 if l in precision_dict else 0.0
-                          for l in label], 2)
+    precision = np.round(
+        [precision_dict[l] * 100.0 if l in precision_dict else 0.0 for l in label],
+        2,
+    )

-    recall = np.round([recall_dict[l]*100.0 if l in recall_dict else 0.0 for l in label], 2)
+    recall = np.round(
+        [recall_dict[l] * 100.0 if l in recall_dict else 0.0 for l in label], 2
+    )

-    f1 = np.round([f1_dict[l]*100.0 if l in f1_dict else 0.0 for l in label], 2)
+    f1 = np.round([f1_dict[l] * 100.0 if l in f1_dict else 0.0 for l in label], 2)

-    train_count_dict = dict(Counter(train_set_pd['intent']))
-    test_count_dict = dict(Counter(test_set_pd['intent']))
+    train_count_dict = dict(Counter(train_set_pd["intent"]))
+    test_count_dict = dict(Counter(test_set_pd["intent"]))
     tr_cnt = [train_count_dict[l] if l in train_count_dict else 0.0 for l in label]
     te_cnt = [test_count_dict[l] if l in test_count_dict else 0.0 for l in label]

-    difference_pd = pd.DataFrame({"Intent": label,
-                                  "% of Train": diff[:, 0],
-                                  "% of Test": diff[:, 1],
-                                  "Absolute Difference %": diff[:, 2],
-                                  "Train Examples": tr_cnt,
-                                  "Test Examples": te_cnt,
-                                  "Test Precision %": precision,
-                                  "Test Recall %": recall,
-                                  "Test F1 %": f1})
-
-    if not difference_pd[difference_pd["Absolute Difference %"] > .001].empty:
-        table_for_display = difference_pd[difference_pd["Absolute Difference %"]
-                                          > .001].sort_values(by=["Absolute Difference %"],
-                                                              ascending=False)
-        table_for_display = \
-            table_for_display.style.applymap(_train_test_coloring,
-                                             subset=pd.IndexSlice[:, ["Absolute Difference %"]])
+    difference_pd = pd.DataFrame(
+        {
+            "Intent": label,
+            "% of Train": diff[:, 0],
+            "% of Test": diff[:, 1],
+            "Absolute Difference %": diff[:, 2],
+            "Train Examples": tr_cnt,
+            "Test Examples": te_cnt,
+            "Test Precision %": precision,
+            "Test Recall %": recall,
+            "Test F1 %": f1,
+        }
+    )
+
+    if not difference_pd[difference_pd["Absolute Difference %"] > 0.001].empty:
+        table_for_display = difference_pd[
+            difference_pd["Absolute Difference %"] > 0.001
+        ].sort_values(by=["Absolute Difference %"], ascending=False)
+        table_for_display = table_for_display.style.applymap(
+            _train_test_coloring, subset=pd.IndexSlice[:, ["Absolute Difference %"]]
+        )
         display(table_for_display)
         display(Markdown("\n"))
         display(Markdown("Distribution Mismatch Color Code"))
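The table styling in this hunk uses pandas' `Styler.applymap` with an `IndexSlice` subset (renamed `Styler.map` in pandas 2.1+); a minimal sketch of the same pattern with an invented frame:

```python
import pandas as pd

def color_by_gap(val):
    # Same thresholds as _train_test_coloring above.
    if val > 25:
        return "color: red"
    if val > 10:
        return "color: DarkBlue"
    return "color: green"

df = pd.DataFrame({"Intent": ["greet", "bye"], "Absolute Difference %": [30.0, 5.0]})
styled = df.style.applymap(
    color_by_gap, subset=pd.IndexSlice[:, ["Absolute Difference %"]]
)
# In a notebook, `styled` renders colored cells; elsewhere use styled.to_html().
```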
@@ -223,42 +250,61 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results):
         display(Markdown("<font color = 'green'> Green - Good </font>"))

     if js >= 0:
-        js = np.round(js, 2)*100
-        display(Markdown("### Data Distribution Divergence Test vs Train \
-        <font color='blue'>{}%</font>".format(js)))
+        js = np.round(js, 2) * 100
+        display(
+            Markdown(
+                "### Data Distribution Divergence Test vs Train \
+                <font color='blue'>{}%</font>".format(
+                    js
+                )
+            )
+        )
         display(Markdown("**Note** Metric used is Jensen Shannon Distance"))

     if missing_label:
         display(Markdown("### Missing Intents in Test Data"))
-        missing_label_pd = pd.DataFrame(missing_label,
-                                        columns=["Missing Intents in Test Set "])
-        missing_label_pd.index = np.arange(1, len(missing_label_pd)+1)
+        missing_label_pd = pd.DataFrame(
+            missing_label, columns=["Missing Intents in Test Set "]
+        )
+        missing_label_pd.index = np.arange(1, len(missing_label_pd) + 1)
         display(missing_label_pd)

     display(Markdown("### Test Data Example Length"))
-    condition1 = (train_test_length_comparison_pd["Absolute Difference"] /
-                  train_test_length_comparison_pd["Train"] > .3)
-    condition2 = (train_test_length_comparison_pd["Absolute Difference"] > 3)
+    condition1 = (
+        train_test_length_comparison_pd["Absolute Difference"]
+        / train_test_length_comparison_pd["Train"]
+        > 0.3
+    )
+    condition2 = train_test_length_comparison_pd["Absolute Difference"] > 3

     length_comparison_pd = train_test_length_comparison_pd[condition1 & condition2]

     if not length_comparison_pd.empty:
-        display(Markdown(
-            "Divergence found in average length of user examples in test vs training data"))
-        length_comparison_pd.index = np.arange(1, len(length_comparison_pd)+1)
+        display(
+            Markdown(
+                "Divergence found in average length of user examples in test vs training data"
+            )
+        )
+        length_comparison_pd.index = np.arange(1, len(length_comparison_pd) + 1)
         display(length_comparison_pd.round(2))
     else:
         display(Markdown("Average length of user examples is comparable"))

     if train_vocab and test_vocab:
         display(Markdown("### Vocabulary Size Test vs Train"))
-        oov_vocab_percentage = (len(test_vocab) - len(train_vocab.intersection(test_vocab))) \
-            / len(test_vocab)*100
-
-        vocab_df = pd.DataFrame(data={
-            'Train Vocabulary Size': [len(train_vocab)],
-            'Test Vocabulary Size': [len(test_vocab)],
-            '% Test Set Vocabulary not found in Train': [oov_vocab_percentage]})
+        oov_vocab_percentage = (
+            (len(test_vocab) - len(train_vocab.intersection(test_vocab)))
+            / len(test_vocab)
+            * 100
+        )
+
+        vocab_df = pd.DataFrame(
+            data={
+                "Train Vocabulary Size": [len(train_vocab)],
+                "Test Vocabulary Size": [len(test_vocab)],
+                "% Test Set Vocabulary not found in Train": [oov_vocab_percentage],
+            }
+        )
         vocab_df.index = np.arange(1, len(vocab_df) + 1)
         display(vocab_df.round(2))

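The vocabulary comparison closing this function reduces to set arithmetic; a toy sketch with invented vocabularies:

```python
train_vocab = {"hello", "hi", "bye", "thanks"}
test_vocab = {"hello", "goodbye", "thanks"}

# Share of test vocabulary never seen in training ("goodbye" here).
oov_pct = (len(test_vocab) - len(train_vocab & test_vocab)) / len(test_vocab) * 100
print(round(oov_pct, 2))  # 33.33
```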