Skip to content

Commit b8506ad

Browse files
folivoramanh and pre-commit-ci[bot]
authored and committed
Vietnamese MRC 1.0 fix case (#312)
* fix and add cases

Signed-off-by: folivoramanh <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: folivoramanh <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Mariana Graterol Fuenmayor <[email protected]>
1 parent bfa8eef commit b8506ad

File tree

11 files changed

+136
-37
lines changed

11 files changed

+136
-37
lines changed

nemo_text_processing/text_normalization/data_loader_utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@
4545
"FRACTION",
4646
"TIME",
4747
"ADDRESS",
48+
"ROMAN",
49+
"RANGE",
4850
]
4951

5052

nemo_text_processing/text_normalization/vi/data/measure/measurements_minimal.tsv

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
°f độ f
2+
°F độ F
23
°c độ c
4+
°C độ C
35
°k độ k
6+
°K độ K
7+
° độ
8+
°E độ đông
9+
°N độ bắc
10+
°S độ nam
11+
°W độ tây
412
ha héc ta
513
mi mile
614
ft foot

nemo_text_processing/text_normalization/vi/data/roman/key_word.tsv

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,8 @@ năm
99
khoản
1010
phụ lục
1111
khóa
12-
số
12+
số
13+
điều
14+
tiểu mục
15+
bài
16+
khối

nemo_text_processing/text_normalization/vi/taggers/date.py

Lines changed: 49 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import pynini
1616
from pynini.lib import pynutil
1717

18-
from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_DIGIT, GraphFst
18+
from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_DIGIT, NEMO_SPACE, GraphFst
1919
from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels
2020

2121

@@ -32,6 +32,23 @@ class DateFst(GraphFst):
3232
def __init__(self, cardinal, deterministic: bool = True):
3333
super().__init__(name="date", kind="classify", deterministic=deterministic)
3434

35+
# Vietnamese date keywords
36+
DAY_WORD = "ngày"
37+
MONTH_WORD = "tháng"
38+
YEAR_WORD = "năm"
39+
ORDINAL_YEAR_WORD = "năm thứ"
40+
41+
# Prebuilt patterns for common usage
42+
day_prefix = pynini.accep(DAY_WORD + NEMO_SPACE)
43+
month_prefix = pynini.accep(MONTH_WORD + NEMO_SPACE)
44+
year_prefix = pynini.accep(YEAR_WORD + NEMO_SPACE)
45+
ordinal_year_prefix = pynini.accep(ORDINAL_YEAR_WORD + NEMO_SPACE)
46+
47+
delete_day_prefix = pynutil.delete(DAY_WORD + NEMO_SPACE)
48+
delete_month_prefix = pynutil.delete(MONTH_WORD + NEMO_SPACE)
49+
delete_year_prefix = pynutil.delete(YEAR_WORD + NEMO_SPACE)
50+
delete_ordinal_year_prefix = pynutil.delete(ORDINAL_YEAR_WORD + NEMO_SPACE)
51+
3552
day_mappings = load_labels(get_abs_path("data/date/days.tsv"))
3653
month_mappings = load_labels(get_abs_path("data/date/months.tsv"))
3754
era_mappings = load_labels(get_abs_path("data/date/year_suffix.tsv"))
@@ -60,73 +77,82 @@ def __init__(self, cardinal, deterministic: bool = True):
6077

6178
patterns = []
6279

80+
# DD/MM/YYYY format (Vietnamese standard)
6381
date_sep = day_part + pynutil.delete(separator) + month_part + pynutil.delete(separator) + year_part
6482
patterns.append(pynini.compose(day_digit + separator + month_digit + separator + year_digit, date_sep))
6583
patterns.append(
6684
pynini.compose(
67-
pynini.accep("ngày ") + day_digit + separator + month_digit + separator + year_digit,
68-
pynutil.delete("ngày ") + date_sep,
85+
day_prefix + day_digit + separator + month_digit + separator + year_digit,
86+
delete_day_prefix + date_sep,
6987
)
7088
)
7189

72-
for sep in [separator, pynini.accep(" ")]:
90+
# YYYY/MM/DD format (ISO standard) - output in Vietnamese order
91+
iso_year_part = pynutil.insert("year: \"") + year_convert + pynutil.insert("\" ")
92+
iso_month_part = pynutil.insert("month: \"") + month_convert + pynutil.insert("\" ")
93+
iso_day_part = pynutil.insert("day: \"") + day_convert + pynutil.insert("\"")
94+
95+
iso_date_sep = (
96+
iso_year_part + pynutil.delete(separator) + iso_month_part + pynutil.delete(separator) + iso_day_part
97+
)
98+
patterns.append(pynini.compose(year_digit + separator + month_digit + separator + day_digit, iso_date_sep))
99+
100+
for sep in [separator, pynini.accep(NEMO_SPACE)]:
73101
patterns.append(
74102
pynini.compose(
75-
pynini.accep("tháng ") + month_digit + sep + year_digit,
76-
pynutil.delete("tháng ") + month_part + pynutil.delete(sep) + year_part,
103+
month_prefix + month_digit + sep + year_digit,
104+
delete_month_prefix + month_part + pynutil.delete(sep) + year_part,
77105
)
78106
)
79107

80108
day_month_sep = day_part + pynutil.delete(separator) + month_final
81109
patterns.append(
82-
pynini.compose(
83-
pynini.accep("ngày ") + day_digit + separator + month_digit, pynutil.delete("ngày ") + day_month_sep
84-
)
110+
pynini.compose(day_prefix + day_digit + separator + month_digit, delete_day_prefix + day_month_sep)
85111
)
86112

87113
patterns.append(
88114
pynini.compose(
89-
pynini.accep("ngày ") + day_digit + pynini.accep(" tháng ") + month_digit,
90-
pynutil.delete("ngày ") + day_part + pynutil.delete(" tháng ") + month_final,
115+
day_prefix + day_digit + pynini.accep(NEMO_SPACE + MONTH_WORD + NEMO_SPACE) + month_digit,
116+
delete_day_prefix + day_part + pynutil.delete(NEMO_SPACE + MONTH_WORD + NEMO_SPACE) + month_final,
91117
)
92118
)
93119

94120
patterns.append(
95121
pynini.compose(
96-
pynini.accep("ngày ")
122+
day_prefix
97123
+ day_digit
98-
+ pynini.accep(" tháng ")
124+
+ pynini.accep(NEMO_SPACE + MONTH_WORD + NEMO_SPACE)
99125
+ month_digit
100-
+ pynini.accep(" năm ")
126+
+ pynini.accep(NEMO_SPACE + YEAR_WORD + NEMO_SPACE)
101127
+ year_digit,
102-
pynutil.delete("ngày ")
128+
delete_day_prefix
103129
+ day_part
104-
+ pynutil.delete(" tháng ")
130+
+ pynutil.delete(NEMO_SPACE + MONTH_WORD + NEMO_SPACE)
105131
+ month_part
106-
+ pynutil.delete(" năm ")
132+
+ pynutil.delete(NEMO_SPACE + YEAR_WORD + NEMO_SPACE)
107133
+ year_part,
108134
)
109135
)
110136

111-
patterns.append(pynini.compose(pynini.accep("năm ") + year_digit, pynutil.delete("năm ") + year_part))
137+
patterns.append(pynini.compose(year_prefix + year_digit, delete_year_prefix + year_part))
112138

113139
era_abbrs = list(era_to_full.keys())
114140
for era_abbr in era_abbrs:
115141
patterns.append(
116142
pynini.compose(
117-
pynini.accep("năm ") + year_digit + pynini.accep(" ") + pynini.accep(era_abbr),
118-
pynutil.delete("năm ") + year_part + pynutil.delete(" ") + era_part,
143+
year_prefix + year_digit + pynini.accep(NEMO_SPACE) + pynini.accep(era_abbr),
144+
delete_year_prefix + year_part + pynutil.delete(NEMO_SPACE) + era_part,
119145
)
120146
)
121147

122148
patterns.append(
123149
pynini.compose(
124-
pynini.accep("năm thứ ") + year_digit + pynini.accep(" ") + pynini.accep(era_abbr),
125-
pynutil.delete("năm thứ ")
150+
ordinal_year_prefix + year_digit + pynini.accep(NEMO_SPACE) + pynini.accep(era_abbr),
151+
delete_ordinal_year_prefix
126152
+ pynutil.insert("ordinal: \"")
127153
+ year_convert
128154
+ pynutil.insert("\" ")
129-
+ pynutil.delete(" ")
155+
+ pynutil.delete(NEMO_SPACE)
130156
+ era_part,
131157
)
132158
)

nemo_text_processing/text_normalization/vi/taggers/decimal.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,27 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
113113
)
114114
patterns.append(abbr_pattern)
115115

116-
# 5. Compound abbreviations: 1tr2 -> một triệu hai trăm nghìn, 2t3 -> hai tỷ ba trăm triệu
116+
# 5. Decimal with abbreviations: 2,5tr, but avoid measure conflicts
117+
measure_prefix_labels = load_labels(get_abs_path("data/measure/prefixes.tsv"))
118+
measure_prefixes = {prefix.lower() for prefix, _ in measure_prefix_labels}
119+
120+
# Filter quantity abbreviations to avoid measure conflicts
121+
safe_quantity_abbrs = [
122+
(abbr, full) for abbr, full in quantity_abbr_labels if abbr.lower() not in measure_prefixes
123+
]
124+
125+
for abbr, full_name in safe_quantity_abbrs:
126+
decimal_abbr_pattern = (
127+
(integer_part + pynutil.insert(NEMO_SPACE)).ques
128+
+ pynutil.delete(NEMO_COMMA)
129+
+ pynutil.insert(NEMO_SPACE)
130+
+ fractional_part
131+
+ pynutil.insert(f" quantity: \"{full_name}\"")
132+
+ pynutil.delete(abbr)
133+
)
134+
patterns.append(decimal_abbr_pattern)
135+
136+
# 6. Compound abbreviations: 1tr2 -> một triệu hai trăm nghìn, 2t3 -> hai tỷ ba trăm triệu
117137
compound_expansions = {
118138
"tr": ("triệu", "trăm nghìn"), # 1tr2 -> một triệu hai trăm nghìn
119139
"t": ("tỷ", "trăm triệu"), # 2t3 -> hai tỷ ba trăm triệu

nemo_text_processing/text_normalization/vi/taggers/measure.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,17 +115,19 @@ def __init__(
115115

116116
# Optional negative sign handling for Vietnamese
117117
optional_graph_negative = pynini.closure(
118-
pynini.cross(pynini.union("âm", "trừ"), "negative: \"true\" "),
118+
pynini.cross("-", "negative: \"true\" "),
119119
0,
120120
1,
121121
)
122122

123123
# Domain restriction patterns - only match core number+unit patterns
124124
# Remove punctuation handling to let punctuation tagger handle it separately
125125
optional_space = pynini.closure(NEMO_SPACE, 0, 1)
126-
integer_measure_domain = number + optional_space + unit_pattern
127-
decimal_measure_domain = decimal_number + optional_space + unit_pattern
128-
fraction_measure_domain = number + "/" + number + optional_space + unit_pattern
126+
optional_negative_sign = pynini.closure("-" + optional_space, 0, 1)
127+
128+
integer_measure_domain = optional_negative_sign + number + optional_space + unit_pattern
129+
decimal_measure_domain = optional_negative_sign + decimal_number + optional_space + unit_pattern
130+
fraction_measure_domain = optional_negative_sign + number + "/" + number + optional_space + unit_pattern
129131

130132
cardinal_number_graph = pynutil.insert('integer: "') + (number @ cardinal_graph) + pynutil.insert('"')
131133

nemo_text_processing/text_normalization/vi/taggers/punctuation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ class PunctuationFst(GraphFst):
2626
def __init__(self, deterministic: bool = True):
2727
super().__init__(name="punctuation", kind="classify", deterministic=deterministic)
2828

29-
s = "!#%&'()*+,-./:;<=>?@^_`{|}~"
29+
s = "!#%&'()*+,-./:;<=>?@^_`{|}~′″°"
3030

3131
punct = pynini.union(*s)
3232
self.punct_marks = punct

nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,14 +168,14 @@ def __init__(
168168
| pynutil.add_weight(money_graph, 1.1)
169169
| pynutil.add_weight(range_graph, 1.1)
170170
| pynutil.add_weight(decimal_graph, 1.1)
171-
| pynutil.add_weight(roman_graph, 1.1)
172171
| pynutil.add_weight(date_graph, 1.1)
173172
| pynutil.add_weight(cardinal_graph, 1.1)
174173
| pynutil.add_weight(ordinal_graph, 1.1)
175174
| pynutil.add_weight(fraction_graph, 1.1)
176175
| pynutil.add_weight(time_graph, 1.1)
177176
| pynutil.add_weight(measure_graph, 1.1)
178177
| pynutil.add_weight(word_graph, 100)
178+
| pynutil.add_weight(roman_graph, 101)
179179
)
180180
punct = (
181181
pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, 2.1) + pynutil.insert(" }")

nemo_text_processing/text_normalization/vi/taggers/word.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import pynini
1616
from pynini.lib import pynutil
1717

18-
from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_SPACE, GraphFst
18+
from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_ALPHA, NEMO_DIGIT, NEMO_NOT_SPACE, GraphFst
1919

2020

2121
class WordFst(GraphFst):
@@ -30,5 +30,17 @@ class WordFst(GraphFst):
3030

3131
def __init__(self, deterministic: bool = True):
3232
super().__init__(name="word", kind="classify", deterministic=deterministic)
33-
word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"")
33+
34+
# Symbols that should cause token breaks
35+
# Include measure symbols, currency symbols, and digits
36+
symbols_to_exclude = pynini.union("°", "′", "″", "$", "€", "₩", "£", "¥", "#", "%", "₫", NEMO_DIGIT).optimize()
37+
38+
word_chars = pynini.closure(pynini.difference(NEMO_NOT_SPACE, symbols_to_exclude), 1)
39+
default_word_graph = word_chars
40+
41+
alpha_word_graph = pynini.closure(NEMO_ALPHA, 1)
42+
43+
graph = pynutil.add_weight(alpha_word_graph, -1.0) | default_word_graph
44+
45+
word = pynutil.insert("name: \"") + graph + pynutil.insert("\"")
3446
self.fst = word.optimize()

nemo_text_processing/text_normalization/vi/verbalizers/measure.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import pynini
16+
1517
from nemo_text_processing.text_normalization.vi.graph_utils import (
1618
GraphFst,
1719
delete_preserve_order,
@@ -43,17 +45,23 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, de
4345
# Extract components
4446
unit = extract_field("units")
4547

48+
# Handle negative sign - Vietnamese uses "âm" for negative numbers
49+
optional_negative = pynini.closure(pynini.cross("negative: \"true\"", "âm ") + delete_space, 0, 1)
50+
if not deterministic:
51+
# Alternative ways to say negative in Vietnamese
52+
optional_negative |= pynini.closure(pynini.cross("negative: \"true\"", "trừ ") + delete_space, 0, 1)
53+
4654
# Combine all number types into single graph
4755
number_graph = (
4856
extract_wrapper_content("decimal", decimal.numbers)
4957
| extract_wrapper_content("cardinal", cardinal.numbers)
5058
| extract_wrapper_content("fraction", fraction.numbers)
5159
)
5260

53-
# Main pattern: number + space + unit (most common case)
54-
graph = number_graph + delete_space + insert_space + unit
61+
# Main pattern: [negative] number + space + unit (most common case)
62+
graph = optional_negative + number_graph + delete_space + insert_space + unit
5563

56-
# Handle preserve_order: unit + space + number
57-
graph |= unit + delete_space + insert_space + number_graph + delete_preserve_order
64+
# Handle preserve_order: [negative] unit + space + number
65+
graph |= optional_negative + unit + delete_space + insert_space + number_graph + delete_preserve_order
5866

5967
self.fst = self.delete_tokens(graph).optimize()

0 commit comments

Comments
 (0)