Skip to content

Commit 10cb69d

Browse files
folivoramanh, anand-nv, and pre-commit-ci[bot]
authored and committed
Date time itn (#333)
* improve numeric semiotic classes Signed-off-by: folivoramanh <[email protected]> * Fix Jenkinsfile for CI (#325) * Fix Jenkinsfile for CI Signed-off-by: Anand Joseph <[email protected]> * Fix requirements for test Signed-off-by: Anand Joseph <[email protected]> * Update paths and docker Signed-off-by: Anand Joseph <[email protected]> * Fix docker name Signed-off-by: Anand Joseph <[email protected]> * Fix click version Signed-off-by: Anand Joseph <[email protected]> * Change path of grammars for sparrowhawk tests Signed-off-by: Anand Joseph <[email protected]> * Update paths in sh_test.sh Signed-off-by: Anand Joseph <[email protected]> * Update paths Signed-off-by: Anand Joseph <[email protected]> * Revert paths Signed-off-by: Anand Joseph <[email protected]> --------- Signed-off-by: Anand Joseph <[email protected]> Signed-off-by: folivoramanh <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: folivoramanh <[email protected]> * revert old codes Signed-off-by: folivoramanh <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert not inherit Signed-off-by: folivoramanh <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * improve date time Signed-off-by: folivoramanh <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix pynini union instead of union operator Signed-off-by: folivoramanh <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * improve measure, telephone, electronic Signed-off-by: folivoramanh <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * change union operator to pynini union Signed-off-by: folivoramanh 
<[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: folivoramanh <[email protected]> Signed-off-by: Anand Joseph <[email protected]> Co-authored-by: anand-nv <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor <[email protected]>
1 parent 1252387 commit 10cb69d

29 files changed

+737
-318
lines changed

nemo_text_processing/inverse_text_normalization/vi/data/currency.tsv

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,4 +8,4 @@ $ đô la mỹ
88
won
99
uôn
1010
RM ringgit
11-
đồng
11+
£ bảng anh

nemo_text_processing/inverse_text_normalization/vi/data/electronic/symbols.tsv

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
- gạch
33
_ gạch dưới
44
_ shift gạch
5+
_ shift trừ
56
_ síp gạch
67
! chấm than
78
# thăng
Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,8 @@
1+
. chấm
2+
- gạch
3+
- gạch ngang
4+
_ gạch dưới
5+
_ shift gạch
6+
_ shift trừ
7+
_ síp gạch
8+
/ sẹc

nemo_text_processing/inverse_text_normalization/vi/graph_utils.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@
3737
NEMO_SPACE = " "
3838
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize()
3939
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
40-
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
40+
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, '"').optimize()
4141

4242
NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
4343
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()

nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py

Lines changed: 49 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -36,117 +36,118 @@ class CardinalFst(GraphFst):
3636

3737
def __init__(self):
3838
super().__init__(name="cardinal", kind="classify")
39-
graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
40-
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
4139
graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
4240
graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
4341

42+
thousand_words = pynini.union("ngàn", "nghìn")
43+
negative_words = pynini.union("âm", "trừ")
44+
45+
graph_hundred = pynini.cross("trăm", "")
46+
graph_ten = pynini.cross("mươi", "")
47+
zero = pynini.cross(pynini.union("linh", "lẻ"), "0")
48+
49+
graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
50+
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
4451
graph_one = pynini.cross("mốt", "1")
4552
graph_four = pynini.cross("tư", "4")
4653
graph_five = pynini.cross("lăm", "5")
4754
graph_half = pynini.cross("rưỡi", "5")
48-
graph_hundred = pynini.cross("trăm", "")
49-
graph_ten = pynini.cross("mươi", "")
50-
zero = pynini.cross(pynini.union("linh", "lẻ"), "0")
5155

5256
optional_ten = pynini.closure(delete_space + graph_ten, 0, 1)
5357
last_digit_exception = pynini.project(pynini.cross("năm", "5"), "input")
54-
last_digit = pynini.union(
58+
self.last_digit = pynini.union(
5559
(pynini.project(graph_digit, "input") - last_digit_exception.arcsort()) @ graph_digit,
5660
graph_one,
5761
graph_four,
5862
graph_five,
5963
)
60-
61-
graph_hundred_ties_component = (graph_digit | graph_zero) + delete_space + graph_hundred
62-
graph_hundred_ties_component += delete_space
63-
graph_hundred_ties_component += pynini.union(
64+
last_digit = self.last_digit
65+
# Build hundreds component (e.g., "một trăm", "hai trăm")
66+
graph_hundreds_component = (graph_digit | graph_zero) + delete_space + graph_hundred
67+
graph_hundreds_component += delete_space
68+
graph_hundreds_component += pynini.union(
6469
graph_teen,
65-
(graph_half | graph_four | graph_one) + pynutil.insert("0"),
66-
graph_ties + optional_ten + ((delete_space + last_digit) | pynutil.insert("0")),
67-
zero + delete_space + (graph_digit | graph_four),
68-
pynutil.insert("00"),
69-
)
70-
graph_hundred_ties_component |= (
70+
(graph_half | graph_four | graph_one) + pynutil.insert("0", weight=0.1),
71+
graph_ties + optional_ten + ((delete_space + last_digit) | pynutil.insert("0", weight=0.1)),
72+
zero + delete_space + (graph_digit | graph_four | graph_five),
73+
pynutil.insert("00", weight=0.1),
74+
).optimize()
75+
graph_hundreds_component |= (
7176
pynutil.insert("0")
7277
+ delete_space
7378
+ pynini.union(
7479
graph_teen,
7580
graph_ties + optional_ten + delete_space + last_digit,
76-
graph_ties + delete_space + graph_ten + pynutil.insert("0"),
77-
zero + delete_space + (graph_digit | graph_four),
78-
)
81+
graph_ties + delete_space + graph_ten + pynutil.insert("0", weight=0.1),
82+
zero + delete_space + (graph_digit | graph_four | graph_five),
83+
).optimize()
84+
)
85+
graph_hundred_component = graph_hundreds_component | (
86+
pynutil.insert("00", weight=0.1) + delete_space + graph_digit
7987
)
80-
graph_hundred_component = graph_hundred_ties_component | (pynutil.insert("00") + delete_space + graph_digit)
8188

8289
graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
8390
pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
8491
)
8592
self.graph_hundred_component_at_least_one_none_zero_digit = (
86-
graph_hundred_component_at_least_one_none_zero_digit
93+
graph_hundred_component_at_least_one_none_zero_digit.optimize()
8794
)
88-
graph_hundred_ties_zero = graph_hundred_ties_component | pynutil.insert("000")
95+
graph_hundreds_zero = graph_hundreds_component | pynutil.insert("000", weight=0.1)
8996

9097
graph_thousands = pynini.union(
91-
graph_hundred_component_at_least_one_none_zero_digit
92-
+ delete_space
93-
+ pynutil.delete(pynini.union("nghìn", "ngàn")),
98+
graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete(thousand_words),
9499
pynutil.insert("000", weight=0.1),
95-
)
96-
97-
graph_ten_thousand = pynini.union(
98-
graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("vạn"),
99-
pynutil.insert("0000", weight=0.1),
100-
)
101-
102-
graph_ten_thousand_suffix = pynini.union(
103-
graph_digit + delete_space + pynutil.delete(pynini.union("nghìn", "ngàn")),
104-
pynutil.insert("0", weight=0.1),
105-
)
100+
).optimize()
106101

107102
graph_million = pynini.union(
108103
graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("triệu"),
109104
pynutil.insert("000", weight=0.1),
110-
)
105+
).optimize()
111106
graph_billion = pynini.union(
112107
graph_hundred_component_at_least_one_none_zero_digit
113108
+ delete_space
114109
+ pynutil.delete(pynini.union("tỉ", "tỷ")),
115110
pynutil.insert("000", weight=0.1),
116-
)
111+
).optimize()
117112

113+
# Main graph combining all magnitude levels
118114
graph = pynini.union(
115+
# Full format: billion + million + thousand + hundred
119116
graph_billion
120117
+ delete_space
121118
+ graph_million
122119
+ delete_space
123120
+ graph_thousands
124121
+ delete_space
125-
+ graph_hundred_ties_zero,
126-
graph_ten_thousand + delete_space + graph_ten_thousand_suffix + delete_space + graph_hundred_ties_zero,
122+
+ graph_hundreds_zero,
123+
# Special thousand format with last digit or "rưỡi" (half)
127124
graph_hundred_component_at_least_one_none_zero_digit
128125
+ delete_space
129-
+ pynutil.delete(pynini.union("nghìn", "ngàn"))
126+
+ pynutil.delete(thousand_words)
130127
+ delete_space
131-
+ (((last_digit | graph_half) + pynutil.insert("00")) | graph_hundred_ties_zero),
128+
+ (((last_digit | graph_half) + pynutil.insert("00", weight=0.1)) | graph_hundreds_zero),
129+
# Single digits (for non-exception cases)
132130
graph_digit,
133131
graph_zero,
134132
)
135133

136-
graph = graph @ pynini.union(
137-
pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT),
138-
"0",
134+
graph = (
135+
graph
136+
@ pynini.union(
137+
pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT),
138+
"0",
139+
).optimize()
139140
)
140141

141142
# don't convert cardinals from zero to nine inclusive
142-
graph_exception = pynini.project(pynini.union(graph_digit, graph_zero), "input")
143+
single_digits = pynini.project(pynini.union(graph_digit, graph_zero), "input").optimize()
143144

144145
self.graph_no_exception = graph
145146

146-
self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph
147+
self.graph = pynini.difference(pynini.project(graph, "input"), single_digits) @ graph
147148

148149
optional_minus_graph = pynini.closure(
149-
pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), '"-"') + NEMO_SPACE,
150+
pynutil.insert("negative: ") + pynini.cross(negative_words, '"-"') + NEMO_SPACE,
150151
0,
151152
1,
152153
)

0 commit comments

Comments
 (0)