@@ -36,117 +36,124 @@ class CardinalFst(GraphFst):
3636
3737 def __init__ (self ):
3838 super ().__init__ (name = "cardinal" , kind = "classify" )
39- graph_zero = pynini .string_file (get_abs_path ("data/numbers/zero.tsv" ))
40- graph_digit = pynini .string_file (get_abs_path ("data/numbers/digit.tsv" ))
39+ self . graph_zero = pynini .string_file (get_abs_path ("data/numbers/zero.tsv" ))
40+ self . graph_digit = pynini .string_file (get_abs_path ("data/numbers/digit.tsv" ))
4141 graph_ties = pynini .string_file (get_abs_path ("data/numbers/ties.tsv" ))
4242 graph_teen = pynini .string_file (get_abs_path ("data/numbers/teen.tsv" ))
4343
44- graph_one = pynini .cross ("mốt" , "1" )
45- graph_four = pynini .cross ("tư" , "4" )
46- graph_five = pynini .cross ("lăm" , "5" )
47- graph_half = pynini .cross ("rưỡi" , "5" )
44+ self .graph_one = pynini .cross ("mốt" , "1" )
45+ self .graph_four = pynini .cross ("tư" , "4" )
46+ self .graph_five = pynini .cross ("lăm" , "5" )
47+ self .graph_half = pynini .cross ("rưỡi" , "5" )
48+
49+ self .magnitude_words = pynini .union ("triệu" , "tỉ" , "tỷ" , "vạn" )
50+ self .thousand_words = pynini .union ("ngàn" , "nghìn" )
51+ self .negative_words = pynini .union ("âm" , "trừ" )
52+
4853 graph_hundred = pynini .cross ("trăm" , "" )
4954 graph_ten = pynini .cross ("mươi" , "" )
5055 zero = pynini .cross (pynini .union ("linh" , "lẻ" ), "0" )
56+
57+ graph_zero = self .graph_zero
58+ graph_digit = self .graph_digit
59+ graph_one = self .graph_one
60+ graph_four = self .graph_four
61+ graph_five = self .graph_five
62+ graph_half = self .graph_half
5163
5264 optional_ten = pynini .closure (delete_space + graph_ten , 0 , 1 )
5365 last_digit_exception = pynini .project (pynini .cross ("năm" , "5" ), "input" )
54- last_digit = pynini .union (
66+ self . last_digit = pynini .union (
5567 (pynini .project (graph_digit , "input" ) - last_digit_exception .arcsort ()) @ graph_digit ,
5668 graph_one ,
5769 graph_four ,
5870 graph_five ,
5971 )
60-
61- graph_hundred_ties_component = (graph_digit | graph_zero ) + delete_space + graph_hundred
62- graph_hundred_ties_component += delete_space
63- graph_hundred_ties_component += pynini .union (
72+ last_digit = self .last_digit
73+ # Build hundreds component (e.g., "một trăm", "hai trăm")
74+ graph_hundreds_component = (graph_digit | graph_zero ) + delete_space + graph_hundred
75+ graph_hundreds_component += delete_space
76+ graph_hundreds_component += pynini .union (
6477 graph_teen ,
65- (graph_half | graph_four | graph_one ) + pynutil .insert ("0" ),
66- graph_ties + optional_ten + ((delete_space + last_digit ) | pynutil .insert ("0" )),
67- zero + delete_space + (graph_digit | graph_four ),
68- pynutil .insert ("00" ),
69- )
70- graph_hundred_ties_component |= (
78+ (graph_half | graph_four | graph_one ) + pynutil .insert ("0" , weight = 0.1 ),
79+ graph_ties + optional_ten + ((delete_space + last_digit ) | pynutil .insert ("0" , weight = 0.1 )),
80+ zero + delete_space + (graph_digit | graph_four | graph_five ),
81+ pynutil .insert ("00" , weight = 0.1 ),
82+ ). optimize ()
83+ graph_hundreds_component |= (
7184 pynutil .insert ("0" )
7285 + delete_space
7386 + pynini .union (
7487 graph_teen ,
7588 graph_ties + optional_ten + delete_space + last_digit ,
76- graph_ties + delete_space + graph_ten + pynutil .insert ("0" ),
77- zero + delete_space + (graph_digit | graph_four ),
78- )
89+ graph_ties + delete_space + graph_ten + pynutil .insert ("0" , weight = 0.1 ),
90+ zero + delete_space + (graph_digit | graph_four | graph_five ),
91+ ). optimize ()
7992 )
80- graph_hundred_component = graph_hundred_ties_component | (pynutil .insert ("00" ) + delete_space + graph_digit )
93+ graph_hundred_component = graph_hundreds_component | (pynutil .insert ("00" , weight = 0.1 ) + delete_space + graph_digit )
8194
8295 graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
8396 pynini .closure (NEMO_DIGIT ) + (NEMO_DIGIT - "0" ) + pynini .closure (NEMO_DIGIT )
8497 )
8598 self .graph_hundred_component_at_least_one_none_zero_digit = (
86- graph_hundred_component_at_least_one_none_zero_digit
99+ graph_hundred_component_at_least_one_none_zero_digit . optimize ()
87100 )
88- graph_hundred_ties_zero = graph_hundred_ties_component | pynutil .insert ("000" )
101+ graph_hundreds_zero = graph_hundreds_component | pynutil .insert ("000" , weight = 0.1 )
89102
90103 graph_thousands = pynini .union (
91104 graph_hundred_component_at_least_one_none_zero_digit
92105 + delete_space
93- + pynutil .delete (pynini . union ( "nghìn" , "ngàn" ) ),
106+ + pynutil .delete (self . thousand_words ),
94107 pynutil .insert ("000" , weight = 0.1 ),
95- )
96-
97- graph_ten_thousand = pynini .union (
98- graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil .delete ("vạn" ),
99- pynutil .insert ("0000" , weight = 0.1 ),
100- )
108+ ).optimize ()
101109
102- graph_ten_thousand_suffix = pynini .union (
103- graph_digit + delete_space + pynutil .delete (pynini .union ("nghìn" , "ngàn" )),
104- pynutil .insert ("0" , weight = 0.1 ),
105- )
106110
107111 graph_million = pynini .union (
108112 graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil .delete ("triệu" ),
109113 pynutil .insert ("000" , weight = 0.1 ),
110- )
114+ ). optimize ()
111115 graph_billion = pynini .union (
112116 graph_hundred_component_at_least_one_none_zero_digit
113117 + delete_space
114118 + pynutil .delete (pynini .union ("tỉ" , "tỷ" )),
115119 pynutil .insert ("000" , weight = 0.1 ),
116- )
120+ ). optimize ()
117121
122+ # Main graph combining all magnitude levels
118123 graph = pynini .union (
124+ # Full format: billion + million + thousand + hundred
119125 graph_billion
120126 + delete_space
121127 + graph_million
122128 + delete_space
123129 + graph_thousands
124130 + delete_space
125- + graph_hundred_ties_zero ,
126- graph_ten_thousand + delete_space + graph_ten_thousand_suffix + delete_space + graph_hundred_ties_zero ,
131+ + graph_hundreds_zero ,
132+ # Special thousand format with last digit or "rưỡi" (half)
127133 graph_hundred_component_at_least_one_none_zero_digit
128134 + delete_space
129- + pynutil .delete (pynini . union ( "nghìn" , "ngàn" ) )
135+ + pynutil .delete (self . thousand_words )
130136 + delete_space
131- + (((last_digit | graph_half ) + pynutil .insert ("00" )) | graph_hundred_ties_zero ),
137+ + (((last_digit | graph_half ) + pynutil .insert ("00" , weight = 0.1 )) | graph_hundreds_zero ),
138+ # Single digits and zero (kept in graph_no_exception; filtered out of self.graph below)
132139 graph_digit ,
133140 graph_zero ,
134141 )
135142
136143 graph = graph @ pynini .union (
137144 pynutil .delete (pynini .closure ("0" )) + pynini .difference (NEMO_DIGIT , "0" ) + pynini .closure (NEMO_DIGIT ),
138145 "0" ,
139- )
146+ ). optimize ()
140147
141148 # don't convert cardinals from zero to nine inclusive
142- graph_exception = pynini .project (pynini .union (graph_digit , graph_zero ), "input" )
149+ single_digits = pynini .project (pynini .union (graph_digit , graph_zero ), "input" ). optimize ( )
143150
144151 self .graph_no_exception = graph
145152
146- self .graph = (pynini .project (graph , "input" ) - graph_exception . arcsort () ) @ graph
153+ self .graph = pynini . difference (pynini .project (graph , "input" ), single_digits ) @ graph
147154
148155 optional_minus_graph = pynini .closure (
149- pynutil .insert ("negative: " ) + pynini .cross (pynini . union ( "âm" , "trừ" ) , '"-"' ) + NEMO_SPACE ,
156+ pynutil .insert ("negative: " ) + pynini .cross (self . negative_words , '"-"' ) + NEMO_SPACE ,
150157 0 ,
151158 1 ,
152159 )
0 commit comments