Skip to content

Commit 90d16bf

Browse files
committed
improve numeric semiotic classes
Signed-off-by: folivoramanh <palasek182@gmail.com>
1 parent b63d929 commit 90d16bf

File tree

12 files changed

+238
-156
lines changed

12 files changed

+238
-156
lines changed

nemo_text_processing/inverse_text_normalization/vi/graph_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,10 @@
3535
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
3636
NEMO_NON_BREAKING_SPACE = "\u00a0"
3737
NEMO_SPACE = " "
38+
NEMO_QUOTE = r'"'
3839
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize()
3940
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
40-
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
41+
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, NEMO_QUOTE).optimize()
4142

4243
NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
4344
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()

nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py

Lines changed: 51 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -36,117 +36,124 @@ class CardinalFst(GraphFst):
3636

3737
def __init__(self):
3838
super().__init__(name="cardinal", kind="classify")
39-
graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
40-
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
39+
self.graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
40+
self.graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
4141
graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
4242
graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
4343

44-
graph_one = pynini.cross("mốt", "1")
45-
graph_four = pynini.cross("tư", "4")
46-
graph_five = pynini.cross("lăm", "5")
47-
graph_half = pynini.cross("rưỡi", "5")
44+
self.graph_one = pynini.cross("mốt", "1")
45+
self.graph_four = pynini.cross("tư", "4")
46+
self.graph_five = pynini.cross("lăm", "5")
47+
self.graph_half = pynini.cross("rưỡi", "5")
48+
49+
self.magnitude_words = pynini.union("triệu", "tỉ", "tỷ", "vạn")
50+
self.thousand_words = pynini.union("ngàn", "nghìn")
51+
self.negative_words = pynini.union("âm", "trừ")
52+
4853
graph_hundred = pynini.cross("trăm", "")
4954
graph_ten = pynini.cross("mươi", "")
5055
zero = pynini.cross(pynini.union("linh", "lẻ"), "0")
56+
57+
graph_zero = self.graph_zero
58+
graph_digit = self.graph_digit
59+
graph_one = self.graph_one
60+
graph_four = self.graph_four
61+
graph_five = self.graph_five
62+
graph_half = self.graph_half
5163

5264
optional_ten = pynini.closure(delete_space + graph_ten, 0, 1)
5365
last_digit_exception = pynini.project(pynini.cross("năm", "5"), "input")
54-
last_digit = pynini.union(
66+
self.last_digit = pynini.union(
5567
(pynini.project(graph_digit, "input") - last_digit_exception.arcsort()) @ graph_digit,
5668
graph_one,
5769
graph_four,
5870
graph_five,
5971
)
60-
61-
graph_hundred_ties_component = (graph_digit | graph_zero) + delete_space + graph_hundred
62-
graph_hundred_ties_component += delete_space
63-
graph_hundred_ties_component += pynini.union(
72+
last_digit = self.last_digit
73+
# Build hundreds component (e.g., "một trăm", "hai trăm")
74+
graph_hundreds_component = (graph_digit | graph_zero) + delete_space + graph_hundred
75+
graph_hundreds_component += delete_space
76+
graph_hundreds_component += pynini.union(
6477
graph_teen,
65-
(graph_half | graph_four | graph_one) + pynutil.insert("0"),
66-
graph_ties + optional_ten + ((delete_space + last_digit) | pynutil.insert("0")),
67-
zero + delete_space + (graph_digit | graph_four),
68-
pynutil.insert("00"),
69-
)
70-
graph_hundred_ties_component |= (
78+
(graph_half | graph_four | graph_one) + pynutil.insert("0", weight=0.1),
79+
graph_ties + optional_ten + ((delete_space + last_digit) | pynutil.insert("0", weight=0.1)),
80+
zero + delete_space + (graph_digit | graph_four | graph_five),
81+
pynutil.insert("00", weight=0.1),
82+
).optimize()
83+
graph_hundreds_component |= (
7184
pynutil.insert("0")
7285
+ delete_space
7386
+ pynini.union(
7487
graph_teen,
7588
graph_ties + optional_ten + delete_space + last_digit,
76-
graph_ties + delete_space + graph_ten + pynutil.insert("0"),
77-
zero + delete_space + (graph_digit | graph_four),
78-
)
89+
graph_ties + delete_space + graph_ten + pynutil.insert("0", weight=0.1),
90+
zero + delete_space + (graph_digit | graph_four | graph_five),
91+
).optimize()
7992
)
80-
graph_hundred_component = graph_hundred_ties_component | (pynutil.insert("00") + delete_space + graph_digit)
93+
graph_hundred_component = graph_hundreds_component | (pynutil.insert("00", weight=0.1) + delete_space + graph_digit)
8194

8295
graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
8396
pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
8497
)
8598
self.graph_hundred_component_at_least_one_none_zero_digit = (
86-
graph_hundred_component_at_least_one_none_zero_digit
99+
graph_hundred_component_at_least_one_none_zero_digit.optimize()
87100
)
88-
graph_hundred_ties_zero = graph_hundred_ties_component | pynutil.insert("000")
101+
graph_hundreds_zero = graph_hundreds_component | pynutil.insert("000", weight=0.1)
89102

90103
graph_thousands = pynini.union(
91104
graph_hundred_component_at_least_one_none_zero_digit
92105
+ delete_space
93-
+ pynutil.delete(pynini.union("nghìn", "ngàn")),
106+
+ pynutil.delete(self.thousand_words),
94107
pynutil.insert("000", weight=0.1),
95-
)
96-
97-
graph_ten_thousand = pynini.union(
98-
graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("vạn"),
99-
pynutil.insert("0000", weight=0.1),
100-
)
108+
).optimize()
101109

102-
graph_ten_thousand_suffix = pynini.union(
103-
graph_digit + delete_space + pynutil.delete(pynini.union("nghìn", "ngàn")),
104-
pynutil.insert("0", weight=0.1),
105-
)
106110

107111
graph_million = pynini.union(
108112
graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("triệu"),
109113
pynutil.insert("000", weight=0.1),
110-
)
114+
).optimize()
111115
graph_billion = pynini.union(
112116
graph_hundred_component_at_least_one_none_zero_digit
113117
+ delete_space
114118
+ pynutil.delete(pynini.union("tỉ", "tỷ")),
115119
pynutil.insert("000", weight=0.1),
116-
)
120+
).optimize()
117121

122+
# Main graph combining all magnitude levels
118123
graph = pynini.union(
124+
# Full format: billion + million + thousand + hundred
119125
graph_billion
120126
+ delete_space
121127
+ graph_million
122128
+ delete_space
123129
+ graph_thousands
124130
+ delete_space
125-
+ graph_hundred_ties_zero,
126-
graph_ten_thousand + delete_space + graph_ten_thousand_suffix + delete_space + graph_hundred_ties_zero,
131+
+ graph_hundreds_zero,
132+
# Thousands followed by either a bare last digit / "rưỡi" (half), padded with "00", or a regular hundreds component
127133
graph_hundred_component_at_least_one_none_zero_digit
128134
+ delete_space
129-
+ pynutil.delete(pynini.union("nghìn", "ngàn"))
135+
+ pynutil.delete(self.thousand_words)
130136
+ delete_space
131-
+ (((last_digit | graph_half) + pynutil.insert("00")) | graph_hundred_ties_zero),
137+
+ (((last_digit | graph_half) + pynutil.insert("00", weight=0.1)) | graph_hundreds_zero),
138+
# Single digits and zero (kept in graph_no_exception; excluded from self.graph below)
132139
graph_digit,
133140
graph_zero,
134141
)
135142

136143
graph = graph @ pynini.union(
137144
pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT),
138145
"0",
139-
)
146+
).optimize()
140147

141148
# don't convert cardinals from zero to nine inclusive
142-
graph_exception = pynini.project(pynini.union(graph_digit, graph_zero), "input")
149+
single_digits = pynini.project(pynini.union(graph_digit, graph_zero), "input").optimize()
143150

144151
self.graph_no_exception = graph
145152

146-
self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph
153+
self.graph = pynini.difference(pynini.project(graph, "input"), single_digits) @ graph
147154

148155
optional_minus_graph = pynini.closure(
149-
pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), '"-"') + NEMO_SPACE,
156+
pynutil.insert("negative: ") + pynini.cross(self.negative_words, '"-"') + NEMO_SPACE,
150157
0,
151158
1,
152159
)

nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py

Lines changed: 49 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -21,64 +21,9 @@
2121
GraphFst,
2222
delete_extra_space,
2323
delete_space,
24+
NEMO_QUOTE,
25+
insert_space
2426
)
25-
from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path
26-
27-
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
28-
29-
30-
def get_quantity(decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike":
31-
"""
32-
Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral,
33-
e.g. một triệu -> integer_part: "1" quantity: "triệu"
34-
e.g. một tỷ rưỡi -> integer_part: "1" fractional_part: "5" quantity: "tỷ"
35-
36-
Args:
37-
decimal: decimal FST
38-
cardinal_up_to_hundred: cardinal FST
39-
"""
40-
numbers = cardinal_up_to_hundred @ (
41-
pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
42-
)
43-
suffix = pynini.union("triệu", "tỉ", "tỷ", "vạn")
44-
graph_four = pynini.cross("tư", "4")
45-
graph_one = pynini.cross("mốt", "1")
46-
graph_half = pynini.cross("rưỡi", "5")
47-
last_digit_exception = pynini.project(pynini.cross("năm", "5"), "input")
48-
last_digit = pynini.union(
49-
(pynini.project(graph_digit, "input") - last_digit_exception.arcsort()) @ graph_digit,
50-
graph_one,
51-
graph_four,
52-
graph_half,
53-
)
54-
optional_fraction_graph = pynini.closure(
55-
delete_extra_space
56-
+ pynutil.insert('fractional_part: "')
57-
+ (last_digit | graph_half | graph_one | graph_four)
58-
+ pynutil.insert('"'),
59-
0,
60-
1,
61-
)
62-
63-
res = (
64-
pynutil.insert('integer_part: "')
65-
+ numbers
66-
+ pynutil.insert('"')
67-
+ delete_extra_space
68-
+ pynutil.insert('quantity: "')
69-
+ suffix
70-
+ pynutil.insert('"')
71-
+ optional_fraction_graph
72-
)
73-
res |= (
74-
decimal
75-
+ delete_extra_space
76-
+ pynutil.insert('quantity: "')
77-
+ (suffix | "ngàn" | "nghìn")
78-
+ pynutil.insert('"')
79-
)
80-
return res
81-
8227

8328
class DecimalFst(GraphFst):
8429
"""
@@ -95,40 +40,67 @@ def __init__(self, cardinal: GraphFst):
9540

9641
cardinal_graph = cardinal.graph_no_exception
9742

98-
graph_decimal = graph_digit | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
99-
graph_one = pynini.cross("mốt", "1")
100-
graph_four = pynini.cross("tư", "4")
101-
graph_five = pynini.cross("lăm", "5")
102-
43+
base_decimal = cardinal.graph_digit | cardinal.graph_zero
10344
graph_decimal = pynini.union(
104-
graph_decimal,
105-
graph_four,
106-
pynini.closure(graph_decimal + delete_space, 1) + (graph_decimal | graph_four | graph_five | graph_one),
107-
)
45+
base_decimal,
46+
cardinal.graph_four,
47+
pynini.closure(base_decimal + delete_space, 1) + (base_decimal | cardinal.graph_four | cardinal.graph_five | cardinal.graph_one),
48+
).optimize()
10849
self.graph = graph_decimal
10950

11051
point = pynutil.delete("chấm") | pynutil.delete("phẩy")
111-
11252
optional_graph_negative = pynini.closure(
113-
pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), '"true"') + delete_extra_space,
53+
pynutil.insert("negative:") + insert_space + pynini.cross(cardinal.negative_words, '"true"') + delete_extra_space,
11454
0,
11555
1,
11656
)
11757

118-
graph_fractional = pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"')
119-
graph_integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')
58+
graph_fractional = pynutil.insert('fractional_part:') + insert_space + pynutil.insert(NEMO_QUOTE) + graph_decimal + pynutil.insert(NEMO_QUOTE)
59+
graph_integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert(NEMO_QUOTE)
12060
final_graph_wo_sign = (
12161
pynini.closure(graph_integer + delete_extra_space, 0, 1) + point + delete_extra_space + graph_fractional
12262
)
123-
final_graph = optional_graph_negative + final_graph_wo_sign
63+
# Build quantity handling, reusing the magnitude and thousand words defined on the cardinal tagger
64+
# e.g. một triệu -> integer_part: "1" quantity: "triệu"
65+
# e.g. một tỷ rưỡi -> integer_part: "1" fractional_part: "5" quantity: "tỷ"
66+
numbers = cardinal.graph_hundred_component_at_least_one_none_zero_digit @ (
67+
pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
68+
)
69+
70+
magnitude_words = cardinal.magnitude_words
71+
thousand_words = cardinal.thousand_words
72+
73+
last_digit = cardinal.last_digit
74+
optional_fraction_graph = pynini.closure(
75+
delete_extra_space
76+
+ pynutil.insert('fractional_part:') + insert_space + pynutil.insert(NEMO_QUOTE)
77+
+ (last_digit | cardinal.graph_half | cardinal.graph_one | cardinal.graph_four)
78+
+ pynutil.insert(NEMO_QUOTE),
79+
0,
80+
1,
81+
)
12482

125-
self.final_graph_wo_negative = final_graph_wo_sign | get_quantity(
126-
final_graph_wo_sign,
127-
cardinal.graph_hundred_component_at_least_one_none_zero_digit,
83+
quantity_graph = (
84+
pynutil.insert('integer_part:') + insert_space + pynutil.insert(NEMO_QUOTE)
85+
+ numbers
86+
+ pynutil.insert(NEMO_QUOTE)
87+
+ delete_extra_space
88+
+ pynutil.insert('quantity:') + insert_space + pynutil.insert(NEMO_QUOTE)
89+
+ magnitude_words
90+
+ pynutil.insert(NEMO_QUOTE)
91+
+ optional_fraction_graph
12892
)
129-
final_graph |= optional_graph_negative + get_quantity(
130-
final_graph_wo_sign,
131-
cardinal.graph_hundred_component_at_least_one_none_zero_digit,
93+
quantity_graph |= (
94+
final_graph_wo_sign
95+
+ delete_extra_space
96+
+ pynutil.insert('quantity:') + insert_space + pynutil.insert(NEMO_QUOTE)
97+
+ (magnitude_words | thousand_words)
98+
+ pynutil.insert(NEMO_QUOTE)
13299
)
100+
101+
final_graph = optional_graph_negative + final_graph_wo_sign
102+
103+
self.final_graph_wo_negative = final_graph_wo_sign | quantity_graph
104+
final_graph |= optional_graph_negative + quantity_graph
133105
final_graph = self.add_tokens(final_graph)
134106
self.fst = final_graph.optimize()

nemo_text_processing/inverse_text_normalization/vi/taggers/fraction.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@
1616
import pynini
1717
from pynini.lib import pynutil
1818

19-
from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst, delete_extra_space, delete_space
19+
from nemo_text_processing.inverse_text_normalization.vi.graph_utils import (
20+
GraphFst,
21+
delete_extra_space, delete_space, NEMO_QUOTE, insert_space
22+
)
2023

2124

2225
class FractionFst(GraphFst):
@@ -32,14 +35,13 @@ class FractionFst(GraphFst):
3235

3336
def __init__(self, cardinal: GraphFst):
3437
super().__init__(name="fraction", kind="classify")
35-
# integer_part # numerator # denominator
3638

3739
graph_cardinal = cardinal.graph_no_exception
38-
graph_four = pynini.cross("tư", "4")
40+
graph_four = cardinal.graph_four
3941

40-
numerator = pynutil.insert('numerator: "') + graph_cardinal + pynutil.insert('"')
42+
numerator = pynutil.insert('numerator:') + insert_space + pynutil.insert(NEMO_QUOTE) + graph_cardinal + pynutil.insert(NEMO_QUOTE)
4143
fraction_component = pynutil.delete(pynini.union("phần", "trên", "chia"))
42-
denominator = pynutil.insert('denominator: "') + (graph_cardinal | graph_four) + pynutil.insert('"')
44+
denominator = pynutil.insert('denominator:') + insert_space + pynutil.insert(NEMO_QUOTE) + (graph_cardinal | graph_four) + pynutil.insert(NEMO_QUOTE)
4345

4446
graph_fraction_component = numerator + delete_space + fraction_component + delete_extra_space + denominator
4547
self.graph_fraction_component = graph_fraction_component
@@ -49,7 +51,7 @@ def __init__(self, cardinal: GraphFst):
4951
self.final_graph_wo_negative = graph
5052

5153
optional_graph_negative = pynini.closure(
52-
pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), '"true"') + delete_extra_space,
54+
pynutil.insert("negative:") + insert_space + pynini.cross(pynini.union("âm", "trừ"), '"true"') + delete_extra_space,
5355
0,
5456
1,
5557
)

nemo_text_processing/inverse_text_normalization/vi/taggers/ordinal.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
import pynini
1717
from pynini.lib import pynutil
1818

19-
from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst, delete_space
19+
from nemo_text_processing.inverse_text_normalization.vi.graph_utils import (
20+
GraphFst,delete_space, NEMO_QUOTE, insert_space
21+
)
2022
from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path
2123

2224

@@ -34,6 +36,6 @@ def __init__(self):
3436
graph = graph_digit
3537

3638
self.graph = graph
37-
final_graph = pynutil.insert('integer: "') + graph_ordinal + delete_space + self.graph + pynutil.insert('"')
39+
final_graph = pynutil.insert('integer:') + insert_space + pynutil.insert(NEMO_QUOTE) + graph_ordinal + delete_space + self.graph + pynutil.insert(NEMO_QUOTE)
3840
final_graph = self.add_tokens(final_graph)
3941
self.fst = final_graph.optimize()

0 commit comments

Comments
 (0)