Skip to content

Commit dd0b8b7

Browse files
Hindi TN: Ordinal Implementation (#343)
* Adding ordinals into staging_hi_tn
  Signed-off-by: shreeshd-tn <[email protected]>
* Ordinal Cleanup
  Signed-off-by: shreeshd-tn <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Review changes
  Signed-off-by: shreeshd-tn <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
---------
Signed-off-by: shreeshd-tn <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent eb7b3e6 commit dd0b8b7

File tree

11 files changed

+166
-7
lines changed

11 files changed

+166
-7
lines changed

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ pipeline {
2727
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
2828
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
2929
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
30-
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-29-25-0'
30+
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-25-0'
3131
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
3232
}
3333
stages {

nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ months महीने
1111
हफ़्ते
1212
सप्ताह
1313
सदियां
14-
सदियों
14+
सदियों
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
वां
2+
वीं
3+
वें
4+
वे वें

nemo_text_processing/text_normalization/hi/taggers/measure.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
5353
point = pynutil.delete(".")
5454
decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
5555
decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional
56-
5756
unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv"))
5857
quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv"))
5958

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst
19+
from nemo_text_processing.text_normalization.hi.taggers.cardinal import CardinalFst
20+
from nemo_text_processing.text_normalization.hi.utils import get_abs_path
21+
22+
23+
class OrdinalFst(GraphFst):
24+
"""
25+
Finite state transducer for classifying Hindi ordinals, e.g.
26+
१०वां -> ordinal { integer: "दसवां" }
27+
२१वीं -> ordinal { integer: "इक्कीसवीं" }
28+
29+
Args:
30+
deterministic: if True will provide a single transduction option,
31+
for False, multiple transductions are generated (used for audio-based normalization)
32+
"""
33+
34+
def __init__(self, cardinal: CardinalFst, deterministic: bool = True):
35+
super().__init__(name="ordinal", kind="classify", deterministic=deterministic)
36+
37+
suffixes_fst = pynini.string_file(get_abs_path("data/ordinal/suffixes.tsv"))
38+
39+
graph = cardinal.final_graph + suffixes_fst
40+
41+
final_graph = pynutil.insert("integer: \"") + graph + pynutil.insert("\"")
42+
final_graph = self.add_tokens(final_graph)
43+
44+
self.fst = final_graph.optimize()

nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from nemo_text_processing.text_normalization.hi.taggers.fraction import FractionFst
3434
from nemo_text_processing.text_normalization.hi.taggers.measure import MeasureFst
3535
from nemo_text_processing.text_normalization.hi.taggers.money import MoneyFst
36+
from nemo_text_processing.text_normalization.hi.taggers.ordinal import OrdinalFst
3637
from nemo_text_processing.text_normalization.hi.taggers.punctuation import PunctuationFst
3738
from nemo_text_processing.text_normalization.hi.taggers.telephone import TelephoneFst
3839
from nemo_text_processing.text_normalization.hi.taggers.time import TimeFst
@@ -114,6 +115,11 @@ def __init__(
114115
money_graph = money.fst
115116
logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes")
116117

118+
start_time = time.time()
119+
ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
120+
ordinal_graph = ordinal.fst
121+
logging.debug(f"ordinal: {time.time() - start_time: .2f}s -- {ordinal_graph.num_states()} nodes")
122+
117123
start_time = time.time()
118124
whitelist_graph = WhiteListFst(
119125
input_case=input_case, deterministic=deterministic, input_file=whitelist
@@ -140,6 +146,7 @@ def __init__(
140146
| pynutil.add_weight(measure_graph, 1.1)
141147
| pynutil.add_weight(money_graph, 1.1)
142148
| pynutil.add_weight(telephone_graph, 1.1)
149+
| pynutil.add_weight(ordinal_graph, 1.1)
143150
)
144151

145152
start_time = time.time()
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
19+
20+
21+
class OrdinalFst(GraphFst):
22+
"""
23+
Finite state transducer for verbalizing Hindi ordinals, e.g.
24+
ordinal { integer: "दसवां" } -> दसवां
25+
ordinal { integer: "इक्कीसवीं" } -> इक्कीसवीं
26+
27+
Args:
28+
deterministic: if True will provide a single transduction option,
29+
for False, multiple options are provided (used for audio-based normalization)
30+
"""
31+
32+
def __init__(self, deterministic: bool = True):
33+
super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)
34+
35+
integer_value = delete_space + pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
36+
graph = pynutil.delete("integer:") + integer_value
37+
delete_tokens = self.delete_tokens(graph)
38+
self.fst = delete_tokens.optimize()

nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from nemo_text_processing.text_normalization.hi.verbalizers.fraction import FractionFst
2020
from nemo_text_processing.text_normalization.hi.verbalizers.measure import MeasureFst
2121
from nemo_text_processing.text_normalization.hi.verbalizers.money import MoneyFst
22+
from nemo_text_processing.text_normalization.hi.verbalizers.ordinal import OrdinalFst
2223
from nemo_text_processing.text_normalization.hi.verbalizers.telephone import TelephoneFst
2324
from nemo_text_processing.text_normalization.hi.verbalizers.time import TimeFst
2425
from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst
@@ -61,6 +62,8 @@ def __init__(self, deterministic: bool = True):
6162

6263
telephone = TelephoneFst()
6364
telephone_graph = telephone.fst
65+
ordinal = OrdinalFst(deterministic=deterministic)
66+
ordinal_graph = ordinal.fst
6467

6568
whitelist_graph = WhiteListFst(deterministic=deterministic).fst
6669

@@ -72,6 +75,7 @@ def __init__(self, deterministic: bool = True):
7275
| time_graph
7376
| measure_graph
7477
| money_graph
78+
| ordinal_graph
7579
| whitelist_graph
7680
| telephone_graph
7781
)
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
५वां~पाँचवां
2+
५वीं~पाँचवीं
3+
७वां~सातवां
4+
७वीं~सातवीं
5+
८वां~आठवां
6+
८वीं~आठवीं
7+
९वां~नौवां
8+
९वीं~नौवीं
9+
११वां~ग्यारहवां
10+
१२वीं~बारहवीं
11+
१४वां~चौदहवां
12+
१६वीं~सोलहवीं
13+
१७वां~सत्रहवां
14+
१८वीं~अठारहवीं
15+
१९वां~उन्नीसवां
16+
२०वां~बीसवां
17+
२१वां~इक्कीसवां
18+
२५वीं~पच्चीसवीं
19+
२७वें~सत्ताईसवें
20+
३०वीं~तीसवीं
21+
३३वां~तैंतीसवां
22+
४०वीं~चालीसवीं
23+
४५वां~पैंतालीसवां
24+
५०वां~पचासवां
25+
५६वें~छप्पनवें
26+
६०वां~साठवां
27+
६७वीं~सड़सठवीं
28+
७५वीं~पचहत्तरवीं
29+
८०वें~अस्सीवें
30+
८८वां~अट्ठासीवां
31+
९१वीं~इक्यानबेवीं
32+
९९वां~निन्यानबेवां
33+
१००वां~एक सौवां
34+
१०१वां~एक सौ एकवां
35+
१११वीं~एक सौ ग्यारहवीं
36+
१२५वें~एक सौ पच्चीसवें
37+
१५३वीं~एक सौ तिरेपनवीं
38+
२००वीं~दो सौवीं
39+
२१९वीं~दो सौ उन्नीसवीं
40+
२४०वां~दो सौ चालीसवां
41+
३२९वां~तीन सौ उनतीसवां
42+
३६५वां~तीन सौ पैंसठवां
43+
४५५वां~चार सौ पचपनवां
44+
५५५वीं~पाँच सौ पचपनवीं
45+
६४०वीं~छह सौ चालीसवीं
46+
८९०वां~आठ सौ नब्बेवां
47+
१००१वीं~एक हज़ार एकवीं
48+
१०९१वें~एक हज़ार इक्यानबेवें
49+
१७८२वीं~सत्रह सौ बयासीवीं
50+
१८९०वां~एक हज़ार आठ सौ नब्बेवां
51+
१९८१वीं~उन्नीस सौ इक्यासीवीं
52+
९८२६वीं~अट्ठानबे सौ छब्बीसवीं

tests/nemo_text_processing/hi/test_ordinal.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,24 @@
1717
from parameterized import parameterized
1818

1919
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
20+
from nemo_text_processing.text_normalization.normalize import Normalizer
2021

2122
from ..utils import CACHE_DIR, parse_test_case_file
2223

2324

2425
class TestOrdinal:
26+
normalizer = Normalizer(
27+
input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False
28+
)
2529
inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)
2630

31+
@parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_ordinal.txt'))
32+
@pytest.mark.run_only_on('CPU')
33+
@pytest.mark.unit
34+
def test_norm(self, test_input, expected):
35+
pred = self.normalizer.normalize(test_input, verbose=False)
36+
assert pred.strip() == expected.strip()
37+
2738
@parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_ordinal.txt'))
2839
@pytest.mark.run_only_on('CPU')
2940
@pytest.mark.unit

0 commit comments

Comments
 (0)