@@ -148,35 +148,35 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
     :return: metrics at character- and word-level and indicators of correctly tokenized words
     :rtype: dict[str, Union[float, str]]
     """
-    ref_sample = _binary_representation(ref_sample)
-    sample = _binary_representation(raw_sample)
+    ref_sample_arr = _binary_representation(ref_sample)
+    sample_arr = _binary_representation(raw_sample)

     # Compute character-level statistics
-    c_pos_pred, c_neg_pred = np.argwhere(sample == 1), np.argwhere(sample == 0)
+    c_pos_pred, c_neg_pred = np.argwhere(sample_arr == 1), np.argwhere(sample_arr == 0)

-    c_pos_pred = c_pos_pred[c_pos_pred < ref_sample.shape[0]]
-    c_neg_pred = c_neg_pred[c_neg_pred < ref_sample.shape[0]]
+    c_pos_pred = c_pos_pred[c_pos_pred < ref_sample_arr.shape[0]]
+    c_neg_pred = c_neg_pred[c_neg_pred < ref_sample_arr.shape[0]]

-    c_tp = np.sum(ref_sample[c_pos_pred] == 1)
-    c_fp = np.sum(ref_sample[c_pos_pred] == 0)
+    c_tp = np.sum(ref_sample_arr[c_pos_pred] == 1)
+    c_fp = np.sum(ref_sample_arr[c_pos_pred] == 0)

-    c_tn = np.sum(ref_sample[c_neg_pred] == 0)
-    c_fn = np.sum(ref_sample[c_neg_pred] == 1)
+    c_tn = np.sum(ref_sample_arr[c_neg_pred] == 0)
+    c_fn = np.sum(ref_sample_arr[c_neg_pred] == 1)

     # Compute word-level statistics

     # Find correctly tokenized words in the reference sample
-    word_boundaries = _find_word_boundaries(ref_sample)
+    word_boundaries = _find_word_boundaries(ref_sample_arr)

     # Find correctly tokenized words in the sample
-    ss_boundaries = _find_word_boundaries(sample)
+    ss_boundaries = _find_word_boundaries(sample_arr)
     tokenization_indicators = _find_words_correctly_tokenised(
         word_boundaries, ss_boundaries
     )

     correctly_tokenised_words = np.sum(tokenization_indicators)

-    tokenization_indicators = list(map(str, tokenization_indicators))
+    tokenization_indicators_str = list(map(str, tokenization_indicators))

     return {
         "char_level": {
@@ -187,11 +187,11 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
         },
         "word_level": {
             "correctly_tokenised_words": correctly_tokenised_words,
-            "total_words_in_sample": np.sum(sample),
-            "total_words_in_ref_sample": np.sum(ref_sample),
+            "total_words_in_sample": np.sum(sample_arr),
+            "total_words_in_ref_sample": np.sum(ref_sample_arr),
         },
         "global": {
-            "tokenisation_indicators": "".join(tokenization_indicators)
+            "tokenisation_indicators": "".join(tokenization_indicators_str)
         },
     }

@@ -246,14 +246,14 @@ def _find_word_boundaries(bin_reps) -> list:
 def _find_words_correctly_tokenised(
     ref_boundaries: list[tuple[int, int]],
     predicted_boundaries: list[tuple[int, int]],
-) -> tuple[int]:
+) -> tuple[int, ...]:
     """Find whether each word is correctly tokenized.

     :param list[tuple(int, int)] ref_boundaries: word boundaries of reference tokenization
     :param list[tuple(int, int)] predicted_boundaries: word boundaries of predicted tokenization

     :return: binary sequence where 1 indicates the corresponding word is tokenized correctly
-    :rtype: tuple[int]
+    :rtype: tuple[int, ...]
     """
     ref_b = dict(zip(ref_boundaries, [1] * len(ref_boundaries)))

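For readers skimming the diff, here is a minimal, self-contained sketch of the metrics this code computes. The `_binary_representation` stand-in and the boundary tuples below are assumptions made for illustration only (the real helpers live elsewhere in the module and may use a different encoding); the confusion-count lines mirror the renamed `*_arr` logic above.

```python
import numpy as np

# Hypothetical stand-in: mark the first character of each whitespace-separated
# word with 1 and every other character with 0. The project's real
# _binary_representation may encode word boundaries differently.
def _binary_representation(text: str) -> np.ndarray:
    arr = np.zeros(len(text), dtype=int)
    at_word_start = True
    for i, ch in enumerate(text):
        if ch.isspace():
            at_word_start = True
            continue
        if at_word_start:
            arr[i] = 1
        at_word_start = False
    return arr

ref_sample_arr = _binary_representation("la maison bleue")  # reference tokenization
sample_arr = _binary_representation("lamaison bleue")       # predicted tokenization

# Character-level confusion counts, mirroring compute_stats
c_pos_pred, c_neg_pred = np.argwhere(sample_arr == 1), np.argwhere(sample_arr == 0)
c_pos_pred = c_pos_pred[c_pos_pred < ref_sample_arr.shape[0]]
c_neg_pred = c_neg_pred[c_neg_pred < ref_sample_arr.shape[0]]
c_tp = np.sum(ref_sample_arr[c_pos_pred] == 1)  # predicted word starts that are real
c_fp = np.sum(ref_sample_arr[c_pos_pred] == 0)  # predicted word starts that are not
c_tn = np.sum(ref_sample_arr[c_neg_pred] == 0)
c_fn = np.sum(ref_sample_arr[c_neg_pred] == 1)  # missed word starts
print(c_tp, c_fp, c_tn, c_fn)

# Word-level check in the spirit of _find_words_correctly_tokenised: a predicted
# word counts as correct when its (start, end) boundary also occurs in the
# reference boundaries. The boundary tuples here are made up for the example.
ref_boundaries = [(0, 1), (3, 8), (10, 14)]
predicted_boundaries = [(0, 7), (10, 14)]
ref_b = dict(zip(ref_boundaries, [1] * len(ref_boundaries)))
tokenization_indicators = tuple(int(b in ref_b) for b in predicted_boundaries)
print(tokenization_indicators)  # -> (0, 1)
```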