diff --git a/pythainlp/tokenize/han_solo.py b/pythainlp/tokenize/han_solo.py index 06491947a..686c81ee2 100644 --- a/pythainlp/tokenize/han_solo.py +++ b/pythainlp/tokenize/han_solo.py @@ -10,7 +10,7 @@ import threading from importlib.resources import as_file, files -from typing import Optional, Union, cast +from typing import Optional try: import pycrfsuite @@ -61,8 +61,8 @@ def featurize( ) -> dict[str, list]: if padding: sentence = self.pad(sentence) - all_features = [] - all_labels = [] + all_features_list: list[list[str]] = [] + all_labels_int: list[int] = [] skip_next = False for current_position in range( self.radius, len(sentence) - self.radius + 1 @@ -70,11 +70,7 @@ def featurize( if skip_next: skip_next = False continue - features: Union[dict[str, int], list[str]] - if return_type == "list": - features = [] - else: - features = {} + features: list[str] = [] cut = 0 char = sentence[current_position] if char == self.delimiter: @@ -83,7 +79,6 @@ def featurize( counter = 0 chars_left = "" chars_right = "" - chars = "" abs_index_left = current_position # left start at -1 abs_index_right = current_position - 1 # right start at 0 while counter < self.radius: @@ -100,10 +95,7 @@ def featurize( # ใส่ลง feature if indiv_char: left_key = "|".join([str(relative_index_left), char_left]) - if return_type == "dict": - cast(dict[str, int], features)[left_key] = 1 - else: - cast(list[str], features).append(left_key) + features.append(left_key) abs_index_right += ( 1 # สมมุติคือตำแหน่งที่ 0 จะได้ 0, 1, 2, 3, 4 (radius = 5) @@ -118,10 +110,7 @@ def featurize( right_key = "|".join( [str(relative_index_right), char_right] ) - if return_type == "dict": - cast(dict[str, int], features)[right_key] = 1 - else: - cast(list[str], features).append(right_key) + features.append(right_key) counter += 1 @@ -129,17 +118,24 @@ def featurize( for i in range(0, len(chars) - self.N + 1): ngram = chars[i : i + self.N] ngram_key = "|".join([str(i - self.radius), ngram]) - if return_type == "dict": - cast(dict[str, int], features)[ngram_key] = 1 - else: - cast(list[str], features).append(ngram_key) - all_features.append(features) - if return_type == "list": - all_labels.append(str(cut)) - else: - all_labels.append(cut) - - return {"X": all_features, "Y": all_labels} + features.append(ngram_key) + all_features_list.append(features) + all_labels_int.append(cut) + + # Convert to the requested return type + if return_type == "list": + return { + "X": all_features_list, + "Y": [str(label) for label in all_labels_int] + } + else: + return { + "X": [ + {key: 1 for key in feature_list} + for feature_list in all_features_list + ], + "Y": all_labels_int + } _to_feature = Featurizer()