Hindi TN: Ordinal Implementation (#343)

shreeshd-tn · pre-commit-ci[bot] · web-flow · commit dd0b8b7ccf54 · 2025-10-17T11:50:02.000-04:00
* Adding ordinals into staging_hi_tn Signed-off-by: shreeshd-tn <shreeshd@nvidia.com> * Ordinal Cleanup Signed-off-by: shreeshd-tn <shreeshd@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Review changes Signed-off-by: shreeshd-tn <shreeshd@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: shreeshd-tn <shreeshd@nvidia.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -27,7 +27,7 @@ pipeline {
     HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
     MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
     JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
-    HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-29-25-0'
+    HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-25-0'
     DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
   }
   stages {
diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv
@@ -11,4 +11,4 @@ months	महीने
 हफ़्ते
 सप्ताह
 सदियां
-सदियों
+सदियों
diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv
@@ -0,0 +1,4 @@
+वां
+वीं
+वें
+वे	वें
diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py
@@ -53,7 +53,6 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
         point = pynutil.delete(".")
         decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
         decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional
-
         unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv"))
         quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv"))
 
diff --git a/nemo_text_processing/text_normalization/hi/taggers/ordinal.py b/nemo_text_processing/text_normalization/hi/taggers/ordinal.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst
+from nemo_text_processing.text_normalization.hi.taggers.cardinal import CardinalFst
+from nemo_text_processing.text_normalization.hi.utils import get_abs_path
+
+
+class OrdinalFst(GraphFst):
+    """
+    Tagger FST that classifies Hindi ordinals written as digits plus an ordinal suffix, e.g.
+        १०वां -> ordinal { integer: "दसवां" }
+        २१वीं -> ordinal { integer: "इक्कीसवीं" }
+
+    Args:
+        cardinal: cardinal tagger whose ``final_graph`` handles the numeric part before the suffix
+        deterministic: if True, a single transduction option is provided;
+            if False, multiple transductions are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, cardinal: CardinalFst, deterministic: bool = True):
+        super().__init__(name="ordinal", kind="classify", deterministic=deterministic)
+
+        # Suffix table: one-column rows are accepted unchanged, two-column rows are rewritten.
+        suffix_map = pynini.string_file(get_abs_path("data/ordinal/suffixes.tsv"))
+        spoken_ordinal = cardinal.final_graph + suffix_map
+
+        # Emit the result as the quoted `integer` field before adding the class token wrapper.
+        quoted_field = pynutil.insert("integer: \"") + spoken_ordinal + pynutil.insert("\"")
+        self.fst = self.add_tokens(quoted_field).optimize()
diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py
@@ -33,6 +33,7 @@
 from nemo_text_processing.text_normalization.hi.taggers.fraction import FractionFst
 from nemo_text_processing.text_normalization.hi.taggers.measure import MeasureFst
 from nemo_text_processing.text_normalization.hi.taggers.money import MoneyFst
+from nemo_text_processing.text_normalization.hi.taggers.ordinal import OrdinalFst
 from nemo_text_processing.text_normalization.hi.taggers.punctuation import PunctuationFst
 from nemo_text_processing.text_normalization.hi.taggers.telephone import TelephoneFst
 from nemo_text_processing.text_normalization.hi.taggers.time import TimeFst
@@ -114,6 +115,11 @@ def __init__(
             money_graph = money.fst
             logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes")
 
+            start_time = time.time()
+            ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
+            ordinal_graph = ordinal.fst
+            logging.debug(f"ordinal: {time.time() - start_time: .2f}s -- {ordinal_graph.num_states()} nodes")
+
             start_time = time.time()
             whitelist_graph = WhiteListFst(
                 input_case=input_case, deterministic=deterministic, input_file=whitelist
@@ -140,6 +146,7 @@ def __init__(
                 | pynutil.add_weight(measure_graph, 1.1)
                 | pynutil.add_weight(money_graph, 1.1)
                 | pynutil.add_weight(telephone_graph, 1.1)
+                | pynutil.add_weight(ordinal_graph, 1.1)
             )
 
             start_time = time.time()
diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/hi/verbalizers/ordinal.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
+
+
+class OrdinalFst(GraphFst):
+    """
+    Verbalizer FST for Hindi ordinals: removes the token markup and keeps the quoted value, e.g.
+        ordinal { integer: "दसवां" } -> दसवां
+        ordinal { integer: "इक्कीसवीं" } -> इक्कीसवीं
+
+    Args:
+        deterministic: if True, a single transduction option is provided;
+            if False, multiple options are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)
+
+        # Delete both quotes; keep the run of one or more NEMO_NOT_QUOTE symbols between them.
+        quoted_text = pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        field = pynutil.delete("integer:") + delete_space + quoted_text
+        self.fst = self.delete_tokens(field).optimize()
diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py
@@ -19,6 +19,7 @@
 from nemo_text_processing.text_normalization.hi.verbalizers.fraction import FractionFst
 from nemo_text_processing.text_normalization.hi.verbalizers.measure import MeasureFst
 from nemo_text_processing.text_normalization.hi.verbalizers.money import MoneyFst
+from nemo_text_processing.text_normalization.hi.verbalizers.ordinal import OrdinalFst
 from nemo_text_processing.text_normalization.hi.verbalizers.telephone import TelephoneFst
 from nemo_text_processing.text_normalization.hi.verbalizers.time import TimeFst
 from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst
@@ -61,6 +62,8 @@ def __init__(self, deterministic: bool = True):
 
         telephone = TelephoneFst()
         telephone_graph = telephone.fst
+        ordinal = OrdinalFst(deterministic=deterministic)
+        ordinal_graph = ordinal.fst
 
         whitelist_graph = WhiteListFst(deterministic=deterministic).fst
 
@@ -72,6 +75,7 @@ def __init__(self, deterministic: bool = True):
             | time_graph
             | measure_graph
             | money_graph
+            | ordinal_graph
             | whitelist_graph
             | telephone_graph
         )
diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt
@@ -0,0 +1,52 @@
+५वां~पाँचवां
+५वीं~पाँचवीं
+७वां~सातवां
+७वीं~सातवीं
+८वां~आठवां
+८वीं~आठवीं
+९वां~नौवां
+९वीं~नौवीं
+११वां~ग्यारहवां
+१२वीं~बारहवीं
+१४वां~चौदहवां
+१६वीं~सोलहवीं
+१७वां~सत्रहवां
+१८वीं~अठारहवीं
+१९वां~उन्नीसवां
+२०वां~बीसवां
+२१वां~इक्कीसवां
+२५वीं~पच्चीसवीं
+२७वें~सत्ताईसवें
+३०वीं~तीसवीं
+३३वां~तैंतीसवां
+४०वीं~चालीसवीं
+४५वां~पैंतालीसवां
+५०वां~पचासवां
+५६वें~छप्पनवें
+६०वां~साठवां
+६७वीं~सड़सठवीं
+७५वीं~पचहत्तरवीं
+८०वें~अस्सीवें
+८८वां~अट्ठासीवां
+९१वीं~इक्यानबेवीं
+९९वां~निन्यानबेवां
+१००वां~एक सौवां
+१०१वां~एक सौ एकवां
+१११वीं~एक सौ ग्यारहवीं
+१२५वें~एक सौ पच्चीसवें
+१५३वीं~एक सौ तिरेपनवीं
+२००वीं~दो सौवीं
+२१९वीं~दो सौ उन्नीसवीं
+२४०वां~दो सौ चालीसवां
+३२९वां~तीन सौ उनतीसवां
+३६५वां~तीन सौ पैंसठवां
+४५५वां~चार सौ पचपनवां
+५५५वीं~पाँच सौ पचपनवीं
+६४०वीं~छह सौ चालीसवीं
+८९०वां~आठ सौ नब्बेवां
+१००१वीं~एक हज़ार एकवीं
+१०९१वें~एक हज़ार इक्यानबेवें
+१७८२वीं~सत्रह सौ बयासीवीं
+१८९०वां~एक हज़ार आठ सौ नब्बेवां
+१९८१वीं~उन्नीस सौ इक्यासीवीं
+९८२६वीं~अट्ठानबे सौ छब्बीसवीं
diff --git a/tests/nemo_text_processing/hi/test_ordinal.py b/tests/nemo_text_processing/hi/test_ordinal.py
@@ -17,13 +17,24 @@
 from parameterized import parameterized
 
 from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+from nemo_text_processing.text_normalization.normalize import Normalizer
 
 from ..utils import CACHE_DIR, parse_test_case_file
 
 
 class TestOrdinal:
+    normalizer = Normalizer(
+        input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False
+    )
     inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)
 
+    @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_ordinal.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_norm(self, test_input, expected):
+        """Check that normalizing ``test_input`` yields ``expected``, ignoring surrounding whitespace."""
+        assert self.normalizer.normalize(test_input, verbose=False).strip() == expected.strip()
+
     @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_ordinal.txt'))
     @pytest.mark.run_only_on('CPU')
     @pytest.mark.unit
diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh
@@ -76,10 +76,10 @@ testTNMoney() {
   runtest $input
 }
 
-#testTNOrdinal() {
-#  input=$PROJECT_DIR/hi/data_text_normalization/test_cases_ordinal.txt
-#  runtest $input
-#}
+testTNOrdinal() {  # Pass the Hindi text-normalization ordinal test-case file to runtest
+ input=$PROJECT_DIR/hi/data_text_normalization/test_cases_ordinal.txt
+ runtest $input
+}
 
 testTNTelephone() {
  input=$PROJECT_DIR/hi/data_text_normalization/test_cases_telephone.txt

Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,7 @@ pipeline {`
`27`	`27`	`HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'`
`28`	`28`	`MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'`
`29`	`29`	`JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'`
`30`		`- HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-29-25-0'`
	`30`	`+ HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-25-0'`
`31`	`31`	`DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'`
`32`	`32`	`}`
`33`	`33`	`stages {`