Skip to content

Commit df5b3dc

Browse files
HI TN: Staging branch cleanup for main merge (#355)
* Review changes - cleanup
  Signed-off-by: shreeshd-tn <[email protected]>
* Missed cleanup
  Signed-off-by: shreeshd-tn <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci

---------

Signed-off-by: shreeshd-tn <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent aa22d29 commit df5b3dc

File tree

17 files changed

+285
-146
lines changed

17 files changed

+285
-146
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
हफ़्ते
2+
सप्ताह
3+
सदियां
4+
सदियों
5+

nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv renamed to nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_map.tsv

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,4 @@ hp हॉर्सपॉवर
88
d दिन
99
month महीना
1010
months महीने
11-
हफ़्ते
12-
सप्ताह
13-
सदियां
14-
सदियों
11+
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
१ला पहला
2+
१ली पहली
3+
२रा दूसरा
4+
२री दूसरी
5+
३रा तीसरा
6+
३री तीसरी
7+
४था चौथा
8+
४थी चौथी
9+
५वां पाँचवां
10+
५वीं पाँचवीं
11+
६ठा छठा
12+
६ठी छठी
Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
11
वां
22
वीं
33
वें
4-
वे वें
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
वे वें
2+

nemo_text_processing/text_normalization/hi/graph_utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,13 @@
3030
NEMO_HI_DIGIT = pynini.union("०", "१", "२", "३", "४", "५", "६", "७", "८", "९").optimize()
3131
NEMO_HI_NON_ZERO = pynini.union("१", "२", "३", "४", "५", "६", "७", "८", "९").optimize()
3232
NEMO_HI_ZERO = "०"
33+
34+
HI_DEDH = "डेढ़" # 1.5
35+
HI_DHAI = "ढाई" # 2.5
36+
HI_SAVVA = "सवा" # quarter more (X.25, e.g. 1.25)
37+
HI_SADHE = "साढ़े" # half more (X.5)
38+
HI_PAUNE = "पौने" # quarter less (prefix used for X.75 values)
39+
3340
NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
3441
NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
3542
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()

nemo_text_processing/text_normalization/hi/taggers/date.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,11 @@ def __init__(self, cardinal: GraphFst):
6565
(NEMO_HI_DIGIT + NEMO_HI_NON_ZERO + NEMO_HI_DIGIT + NEMO_HI_DIGIT), cardinal.graph_hundreds_as_thousand
6666
)
6767

68-
cardinal_graph = (
69-
digit | teens_and_ties | cardinal.graph_hundreds | graph_year_thousands | graph_year_hundreds_as_thousands
68+
cardinal_graph = pynini.union(
69+
digit, teens_and_ties, cardinal.graph_hundreds, graph_year_thousands, graph_year_hundreds_as_thousands
7070
)
7171

72-
graph_year = graph_year_thousands | graph_year_hundreds_as_thousands
72+
graph_year = pynini.union(graph_year_thousands, graph_year_hundreds_as_thousands)
7373

7474
delete_dash = pynutil.delete("-")
7575
delete_slash = pynutil.delete("/")
@@ -102,13 +102,10 @@ def __init__(self, cardinal: GraphFst):
102102
# Updated logic to use prefix_union
103103
year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"")
104104

105-
graph_dd_mm_yyyy = (
106-
days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph
107-
)
105+
delete_separator = pynini.union(delete_dash, delete_slash)
106+
graph_dd_mm_yyyy = days_graph + delete_separator + months_graph + delete_separator + years_graph
108107

109-
graph_mm_dd_yyyy = (
110-
months_graph + (delete_dash | delete_slash) + days_graph + (delete_dash | delete_slash) + years_graph
111-
)
108+
graph_mm_dd_yyyy = months_graph + delete_separator + days_graph + delete_separator + years_graph
112109

113110
graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ")
114111

nemo_text_processing/text_normalization/hi/taggers/fraction.py

Lines changed: 53 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,21 @@
1515
import pynini
1616
from pynini.lib import pynutil
1717

18-
from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst
18+
from nemo_text_processing.text_normalization.hi.graph_utils import (
19+
HI_DEDH,
20+
HI_DHAI,
21+
HI_PAUNE,
22+
HI_SADHE,
23+
HI_SAVVA,
24+
NEMO_SPACE,
25+
GraphFst,
26+
)
1927
from nemo_text_processing.text_normalization.hi.utils import get_abs_path
2028

29+
HI_ONE_HALF = "१/२" # 1/2
30+
HI_ONE_QUARTER = "१/४" # 1/4
31+
HI_THREE_QUARTERS = "३/४" # 3/4
32+
2133

2234
class FractionFst(GraphFst):
2335
"""
@@ -40,37 +52,62 @@ def __init__(self, cardinal, deterministic: bool = True):
4052
cardinal_graph = cardinal.final_graph
4153

4254
self.optional_graph_negative = pynini.closure(
43-
pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
55+
pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + pynutil.insert(NEMO_SPACE), 0, 1
4456
)
4557
self.integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
4658
self.numerator = (
47-
pynutil.insert("numerator: \"") + cardinal_graph + pynini.cross(pynini.union("/", " / "), "\" ")
59+
pynutil.insert("numerator: \"")
60+
+ cardinal_graph
61+
+ pynini.cross(pynini.union("/", NEMO_SPACE + "/" + NEMO_SPACE), "\"")
62+
+ pynutil.insert(NEMO_SPACE)
4863
)
4964
self.denominator = pynutil.insert("denominator: \"") + cardinal_graph + pynutil.insert("\"")
5065

51-
dedh_dhai_graph = pynini.string_map([("१ १/२", "डेढ़"), ("२ १/२", "ढाई")])
66+
dedh_dhai_graph = pynini.string_map(
67+
[("१" + NEMO_SPACE + HI_ONE_HALF, HI_DEDH), ("२" + NEMO_SPACE + HI_ONE_HALF, HI_DHAI)]
68+
)
5269

53-
savva_numbers = cardinal_graph + pynini.cross(" १/४", "")
54-
savva_graph = pynutil.insert("सवा ") + savva_numbers
70+
savva_numbers = cardinal_graph + pynini.cross(NEMO_SPACE + HI_ONE_QUARTER, "")
71+
savva_graph = pynutil.insert(HI_SAVVA) + pynutil.insert(NEMO_SPACE) + savva_numbers
5572

56-
sadhe_numbers = cardinal_graph + pynini.cross(" १/२", "")
57-
sadhe_graph = pynutil.insert("साढ़े ") + sadhe_numbers
73+
sadhe_numbers = cardinal_graph + pynini.cross(NEMO_SPACE + HI_ONE_HALF, "")
74+
sadhe_graph = pynutil.insert(HI_SADHE) + pynutil.insert(NEMO_SPACE) + sadhe_numbers
5875

5976
paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv"))
60-
paune_numbers = paune + pynini.cross(" ३/४", "")
61-
paune_graph = pynutil.insert("पौने ") + paune_numbers
62-
63-
graph_dedh_dhai = pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph + pynutil.insert("\" ")
77+
paune_numbers = paune + pynini.cross(NEMO_SPACE + HI_THREE_QUARTERS, "")
78+
paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers
79+
80+
graph_dedh_dhai = (
81+
pynutil.insert("morphosyntactic_features: \"")
82+
+ dedh_dhai_graph
83+
+ pynutil.insert("\"")
84+
+ pynutil.insert(NEMO_SPACE)
85+
)
6486

65-
graph_savva = pynutil.insert("morphosyntactic_features: \"") + savva_graph + pynutil.insert("\" ")
87+
graph_savva = (
88+
pynutil.insert("morphosyntactic_features: \"")
89+
+ savva_graph
90+
+ pynutil.insert("\"")
91+
+ pynutil.insert(NEMO_SPACE)
92+
)
6693

67-
graph_sadhe = pynutil.insert("morphosyntactic_features: \"") + sadhe_graph + pynutil.insert("\" ")
94+
graph_sadhe = (
95+
pynutil.insert("morphosyntactic_features: \"")
96+
+ sadhe_graph
97+
+ pynutil.insert("\"")
98+
+ pynutil.insert(NEMO_SPACE)
99+
)
68100

69-
graph_paune = pynutil.insert("morphosyntactic_features: \"") + paune_graph + pynutil.insert("\" ")
101+
graph_paune = (
102+
pynutil.insert("morphosyntactic_features: \"")
103+
+ paune_graph
104+
+ pynutil.insert("\"")
105+
+ pynutil.insert(NEMO_SPACE)
106+
)
70107

71108
final_graph = (
72109
self.optional_graph_negative
73-
+ pynini.closure(self.integer + pynini.accep(" "), 0, 1)
110+
+ pynini.closure(self.integer + pynini.accep(NEMO_SPACE), 0, 1)
74111
+ self.numerator
75112
+ self.denominator
76113
)

nemo_text_processing/text_normalization/hi/taggers/measure.py

Lines changed: 82 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,24 @@
1515
import pynini
1616
from pynini.lib import pynutil
1717

18-
from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, delete_space, insert_space
18+
from nemo_text_processing.text_normalization.hi.graph_utils import (
19+
HI_DEDH,
20+
HI_DHAI,
21+
HI_PAUNE,
22+
HI_SADHE,
23+
HI_SAVVA,
24+
NEMO_SPACE,
25+
GraphFst,
26+
delete_space,
27+
insert_space,
28+
)
1929
from nemo_text_processing.text_normalization.hi.utils import get_abs_path
2030

31+
HI_POINT_FIVE = ".५" # .5
32+
HI_ONE_POINT_FIVE = "१.५" # 1.5
33+
HI_TWO_POINT_FIVE = "२.५" # 2.5
34+
HI_DECIMAL_25 = ".२५" # .25
35+
HI_DECIMAL_75 = ".७५" # .75
2136

2237
digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
2338
teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv"))
@@ -54,7 +69,11 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
5469
decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
5570
decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional
5671
unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv"))
57-
quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv"))
72+
73+
# Load quarterly units from separate files: map (FST) and list (FSA)
74+
quarterly_units_map = pynini.string_file(get_abs_path("data/measure/quarterly_units_map.tsv"))
75+
quarterly_units_list = pynini.string_file(get_abs_path("data/measure/quarterly_units_list.tsv"))
76+
quarterly_units_graph = pynini.union(quarterly_units_map, quarterly_units_list)
5877

5978
optional_graph_negative = pynini.closure(
6079
pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space,
@@ -65,16 +84,28 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
6584
# Define the quarterly measurements
6685
quarter = pynini.string_map(
6786
[
68-
(".५", "साढ़े"),
69-
("१.५", "डेढ़"),
70-
("२.५", "ढाई"),
87+
(HI_POINT_FIVE, HI_SADHE),
88+
(HI_ONE_POINT_FIVE, HI_DEDH),
89+
(HI_TWO_POINT_FIVE, HI_DHAI),
7190
]
7291
)
7392
quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\"")
7493

7594
# Define the unit handling
76-
unit = pynutil.insert(" units: \"") + unit_graph + pynutil.insert("\" ")
77-
units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ")
95+
unit = (
96+
pynutil.insert(NEMO_SPACE)
97+
+ pynutil.insert("units: \"")
98+
+ unit_graph
99+
+ pynutil.insert("\"")
100+
+ pynutil.insert(NEMO_SPACE)
101+
)
102+
units = (
103+
pynutil.insert(NEMO_SPACE)
104+
+ pynutil.insert("units: \"")
105+
+ quarterly_units_graph
106+
+ pynutil.insert("\"")
107+
+ pynutil.insert(NEMO_SPACE)
108+
)
78109

79110
# Handling symbols like x, X, *
80111
symbol_graph = pynini.string_map(
@@ -94,24 +125,43 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
94125
+ unit
95126
)
96127

97-
dedh_dhai = pynini.string_map([("१.५", "डेढ़"), ("२.५", "ढाई")])
128+
dedh_dhai = pynini.string_map([(HI_ONE_POINT_FIVE, HI_DEDH), (HI_TWO_POINT_FIVE, HI_DHAI)])
98129
dedh_dhai_graph = pynutil.insert("integer: \"") + dedh_dhai + pynutil.insert("\"")
99130

100-
savva_numbers = cardinal_graph + pynini.cross(".२५", "")
101-
savva_graph = pynutil.insert("integer: \"सवा ") + savva_numbers + pynutil.insert("\"")
131+
savva_numbers = cardinal_graph + pynini.cross(HI_DECIMAL_25, "")
132+
savva_graph = (
133+
pynutil.insert("integer: \"")
134+
+ pynutil.insert(HI_SAVVA)
135+
+ pynutil.insert(NEMO_SPACE)
136+
+ savva_numbers
137+
+ pynutil.insert("\"")
138+
)
102139

103-
sadhe_numbers = cardinal_graph + pynini.cross(".५", "")
104-
sadhe_graph = pynutil.insert("integer: \"साढ़े ") + sadhe_numbers + pynutil.insert("\"")
140+
sadhe_numbers = cardinal_graph + pynini.cross(HI_POINT_FIVE, "")
141+
sadhe_graph = (
142+
pynutil.insert("integer: \"")
143+
+ pynutil.insert(HI_SADHE)
144+
+ pynutil.insert(NEMO_SPACE)
145+
+ sadhe_numbers
146+
+ pynutil.insert("\"")
147+
)
105148

106149
paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv"))
107-
paune_numbers = paune + pynini.cross(".७५", "")
108-
paune_graph = pynutil.insert("integer: \"पौने ") + paune_numbers + pynutil.insert("\"")
150+
paune_numbers = paune + pynini.cross(HI_DECIMAL_75, "")
151+
paune_graph = (
152+
pynutil.insert("integer: \"")
153+
+ pynutil.insert(HI_PAUNE)
154+
+ pynutil.insert(NEMO_SPACE)
155+
+ paune_numbers
156+
+ pynutil.insert("\"")
157+
)
109158

110159
graph_dedh_dhai = (
111160
pynutil.insert("cardinal { ")
112161
+ optional_graph_negative
113162
+ dedh_dhai_graph
114-
+ pynutil.insert(" }")
163+
+ pynutil.insert(NEMO_SPACE)
164+
+ pynutil.insert("}")
115165
+ delete_space
116166
+ units
117167
)
@@ -120,7 +170,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
120170
pynutil.insert("cardinal { ")
121171
+ optional_graph_negative
122172
+ savva_graph
123-
+ pynutil.insert(" }")
173+
+ pynutil.insert(NEMO_SPACE)
174+
+ pynutil.insert("}")
124175
+ delete_space
125176
+ units
126177
)
@@ -129,7 +180,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
129180
pynutil.insert("cardinal { ")
130181
+ optional_graph_negative
131182
+ sadhe_graph
132-
+ pynutil.insert(" }")
183+
+ pynutil.insert(NEMO_SPACE)
184+
+ pynutil.insert("}")
133185
+ delete_space
134186
+ units
135187
)
@@ -149,7 +201,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
149201
+ pynutil.insert("integer: \"")
150202
+ cardinal_graph
151203
+ pynutil.insert("\"")
152-
+ pynutil.insert(" }")
204+
+ pynutil.insert(NEMO_SPACE)
205+
+ pynutil.insert("}")
153206
+ delete_space
154207
+ unit
155208
)
@@ -162,9 +215,11 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
162215
+ cardinal_graph
163216
+ pynutil.insert("\"")
164217
+ pynutil.insert(" }")
165-
+ pynutil.insert(" units: \"")
218+
+ pynutil.insert(NEMO_SPACE)
219+
+ pynutil.insert("units: \"")
166220
+ symbol_graph
167-
+ pynutil.insert("\" ")
221+
+ pynutil.insert("\"")
222+
+ pynutil.insert(NEMO_SPACE)
168223
+ pynutil.insert("} }")
169224
+ insert_space
170225
+ pynutil.insert("tokens { cardinal { ")
@@ -175,13 +230,13 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
175230
)
176231

177232
graph = (
178-
pynutil.add_weight(graph_decimal, 0.01)
179-
| pynutil.add_weight(graph_cardinal, 0.01)
180-
| pynutil.add_weight(graph_exceptions, 0.01)
181-
| pynutil.add_weight(graph_dedh_dhai, 0.001)
182-
| pynutil.add_weight(graph_savva, 0.005)
183-
| pynutil.add_weight(graph_sadhe, 0.005)
184-
| pynutil.add_weight(graph_paune, -0.2)
233+
pynutil.add_weight(graph_decimal, 0.1)
234+
| pynutil.add_weight(graph_cardinal, 0.1)
235+
| pynutil.add_weight(graph_exceptions, 0.1)
236+
| pynutil.add_weight(graph_dedh_dhai, -0.2)
237+
| pynutil.add_weight(graph_savva, -0.1)
238+
| pynutil.add_weight(graph_sadhe, -0.1)
239+
| pynutil.add_weight(graph_paune, -0.5)
185240
)
186241
self.graph = graph.optimize()
187242

nemo_text_processing/text_normalization/hi/taggers/ordinal.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,14 @@ class OrdinalFst(GraphFst):
3434
def __init__(self, cardinal: CardinalFst, deterministic: bool = True):
3535
super().__init__(name="ordinal", kind="classify", deterministic=deterministic)
3636

37-
suffixes_fst = pynini.string_file(get_abs_path("data/ordinal/suffixes.tsv"))
37+
suffixes_list = pynini.string_file(get_abs_path("data/ordinal/suffixes.tsv"))
38+
suffixes_map = pynini.string_file(get_abs_path("data/ordinal/suffixes_map.tsv"))
39+
suffixes_fst = pynini.union(suffixes_list, suffixes_map)
40+
exceptions = pynini.string_file(get_abs_path("data/ordinal/exceptions.tsv"))
3841

3942
graph = cardinal.final_graph + suffixes_fst
43+
exceptions = pynutil.add_weight(exceptions, -0.1)
44+
graph = pynini.union(exceptions, graph)
4045

4146
final_graph = pynutil.insert("integer: \"") + graph + pynutil.insert("\"")
4247
final_graph = self.add_tokens(final_graph)

0 commit comments

Comments
 (0)