NVIDIA · mgrafu · Feb 28, 2025 · Apr 2, 2025
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -27,7 +27,7 @@ pipeline {
     HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
     MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
     JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
-    HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-1'
+    HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/01-30-25-1'
     DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
   }
   stages {

diff --git a/nemo_text_processing/inverse_text_normalization/en/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/en/data/whitelist.tsv
@@ -39,6 +39,7 @@ ASR	a s r
 ASUS	a sus
 ASUS	asus
 AT&T	a t and t
+AT&T	at and t
 ATM	a t m
 AV1	a v one
 AV1	av one

diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv
@@ -0,0 +1,3 @@
+ई.पू.	ईसा पूर्व
+ई.	ईस्वी
+ई.	ईसवी
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv
@@ -0,0 +1,10 @@
+०	zero
+१	one
+२	two
+३	three
+४	four
+५	five
+६	six
+७	seven
+८	eight
+९	nine
diff --git a/...ext_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv b/...ext_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv
@@ -0,0 +1,90 @@
+१०	ten
+११	eleven
+१२	twelve
+१३	thirteen
+१४	fourteen
+१५	fifteen
+१६	sixteen
+१७	seventeen
+१८	eighteen
+१९	nineteen
+२०	twenty
+२१	twenty one
+२२	twenty two
+२३	twenty three
+२४	twenty four
+२५	twenty five
+२६	twenty six
+२७	twenty seven
+२८	twenty eight
+२९	twenty nine
+३०	thirty
+३१	thirty one
+३२	thirty two
+३३	thirty three
+३४	thirty four
+३५	thirty five
+३६	thirty six
+३७	thirty seven
+३८	thirty eight
+३९	thirty nine
+४०	forty
+४१	forty one
+४२	forty two
+४३	forty three
+४४	forty four
+४५	forty five
+४६	forty six
+४७	forty seven
+४८	forty eight
+४९	forty nine
+५०	fifty
+५१	fifty one
+५२	fifty two
+५३	fifty three
+५४	fifty four
+५५	fifty five
+५६	fifty six
+५७	fifty seven
+५८	fifty eight
+५९	fifty nine
+६०	sixty
+६१	sixty one
+६२	sixty two
+६३	sixty three
+६४	sixty four
+६५	sixty five
+६६	sixty six
+६७	sixty seven
+६८	sixty eight
+६९	sixty nine
+७०	seventy
+७१	seventy one
+७२	seventy two
+७३	seventy three
+७४	seventy four
+७५	seventy five
+७६	seventy six
+७७	seventy seven
+७८	seventy eight
+७९	seventy nine
+८०	eighty
+८१	eighty one
+८२	eighty two
+८३	eighty three
+८४	eighty four
+८५	eighty five
+८६	eighty six
+८७	eighty seven
+८८	eighty eight
+८९	eighty nine
+९०	ninety
+९१	ninety one
+९२	ninety two
+९३	ninety three
+९४	ninety four
+९५	ninety five
+९६	ninety six
+९७	ninety seven
+९८	ninety eight
+९९	ninety nine
diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py
@@ -44,10 +44,22 @@ def __init__(self, cardinal: GraphFst):
 
         month_graph = pynini.string_file(get_abs_path("data/date/months.tsv"))
         graph_date_days = pynini.string_file(get_abs_path("data/date/date_days.tsv")).invert()
+        graph_century = pynini.string_file(get_abs_path("data/date/century.tsv")).invert()
 
         self.day = pynutil.insert("day: \"") + graph_date_days + pynutil.insert("\" ")
         self.month = pynutil.insert("month: \"") + month_graph + pynutil.insert("\" ")
         self.year = pynutil.insert("year: \"") + graph_year + pynutil.insert("\" ")
+        self.year_range = (
+            pynutil.insert("year: \"")
+            + graph_year
+            + delete_space
+            + pynini.cross("से", "-")
+            + delete_space
+            + graph_year
+            + delete_space
+            + pynutil.insert("\" ")
+        )
+        self.century = pynutil.insert("text: \"") + graph_century + pynutil.insert("\" ")
         insert_comma = pynutil.insert(", ")
 
         graph_day_month = self.day + delete_space + self.month
@@ -58,9 +70,28 @@ def __init__(self, cardinal: GraphFst):
         graph_month_day_year += pynutil.insert(" preserve_order: true")
         graph_month_year = self.month + delete_space + self.year
         graph_saal = self.year
+        graph_AD_BC = self.year + delete_space + self.century
+        graph_day_month_year_century = (
+            self.day + delete_space + self.month + delete_space + self.year + delete_space + self.century
+        )
+        graph_month_year_century = self.month + delete_space + self.year + delete_space + self.century
+        graph_year_range = self.year_range
 
-        graph = graph_day_month | graph_month_day | graph_day_month_year | graph_month_day_year | graph_month_year
-        self.graph = graph.optimize()
+        graph_date_exceptions = self.month + delete_space + pynutil.delete("की") + delete_space + self.day
+        graph_date_exceptions += pynutil.insert("preserve_order: true")
 
+        graph = (
+            graph_day_month
+            | graph_month_day
+            | graph_day_month_year
+            | graph_month_day_year
+            | graph_month_year
+            | graph_saal
+            | graph_AD_BC
+            | graph_day_month_year_century
+            | graph_month_year_century
+            | graph_year_range
+            | graph_date_exceptions
+        )
         final_graph = self.add_tokens(graph)
         self.fst = final_graph
diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py
@@ -0,0 +1,158 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space
+from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path
+
+
+class TelephoneFst(GraphFst):
+    """
+    Finite state transducer for classifying spoken telephone-style digit sequences.
+
+    Three input shapes are accepted:
+        * "plus" + optional country code + 1 to 10 digit words -> country_code, number_part
+        * "zero" + 2 to 5 digit city code + 7 digit words -> extension, number_part
+        * a bare run of 1 to 6 digit words (pincode / card suffix) -> number_part
+
+    Digit words are read from the data/numbers files and from the English data/telephone files.
+
+    e.g. plus nine one nine eight seven six five four three two one zero
+        -> country_code: "९१" number_part: "९८७६५४३२१०"
+    e.g. zero one one two three four five six seven eight
+        -> extension: "११" number_part: "२३४५६७८"
+    (assuming ``delete_space`` strips the separators; the result then goes through ``add_tokens``)
+
+    Graphs kept on the instance:
+        hindi_digit / english_digit: 1 to 10 digit words -> number_part
+        country_code: union of the single-digit and double-digit country code graphs
+        city_extension: union of city_code_hindi and city_code_english -> extension
+        landline: union of landline_hindi and landline_english -> number_part
+        pincode_in_hindi / pincode_in_english: 1 to 6 digit words -> number_part
+        credit_card_last_digits_hindi / _english: 1 to 4 digit words -> number_part
+        fst: union of all accepted shapes, passed through ``add_tokens``
+
+    Args:
+        cardinal: CardinalFst instance; not referenced by this tagger at present
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="telephone", kind="classify")
+
+        # Local helpers; they only shorten the insert/closure patterns repeated below.
+        def load_spoken_to_written(relative_path):
+            # string_file() reads tab-separated pairs; invert() swaps the two sides, so the second
+            # column (the spoken form in the telephone TSVs) becomes the input side.
+            return pynini.string_file(get_abs_path(relative_path)).invert()
+
+        def digit_run(digit, low, high):
+            # `low` to `high` repetitions of one digit word followed by `delete_space`.
+            return pynini.closure(digit + delete_space, low, high)
+
+        def quoted_field(name, body):
+            # Emits `name: "` before and `" ` (quote plus one space) after whatever `body` outputs.
+            return pynutil.insert(name + ": \"") + body + pynutil.insert("\" ")
+
+        # Digit words from data/numbers (digits plus zero) and the English digit words for telephone input.
+        hindi_digit_graph = load_spoken_to_written("data/numbers/digit.tsv")
+        hindi_digit_graph |= load_spoken_to_written("data/numbers/zero.tsv")
+        english_digit_graph = load_spoken_to_written("data/telephone/eng_to_hindi_digit.tsv")
+
+        # A country code is spoken either digit by digit (words from either digit graph) ...
+        country_code_graph_single_digits = hindi_digit_graph | english_digit_graph
+        # ... or as one number word from the teens-and-ties tables (the English one covers ten to ninety nine).
+        country_code_graph_double_digits = load_spoken_to_written("data/numbers/teens_and_ties.tsv")
+        country_code_graph_double_digits |= load_spoken_to_written("data/telephone/teens_and_ties_eng_to_hin.tsv")
+
+        # Number after "plus": 1 to 10 digit words (up to 9 separated digits, then a final digit).
+        self.hindi_digit = quoted_field("number_part", digit_run(hindi_digit_graph, 0, 9) + hindi_digit_graph)
+        # The English variant additionally applies `delete_space` after the final digit.
+        self.english_digit = quoted_field(
+            "number_part", digit_run(english_digit_graph, 0, 9) + english_digit_graph + delete_space
+        )
+
+        # NOTE(review): both closures start at 0 repetitions, so an empty country code is accepted
+        # and several splits between country code and number can match the same input; confirm
+        # this is intended.
+        self.country_code_with_single_digits = quoted_field(
+            "country_code", digit_run(country_code_graph_single_digits, 0, 2)
+        )
+        self.country_code_with_double_digits = quoted_field(
+            "country_code", digit_run(country_code_graph_double_digits, 0, 1)
+        )
+        self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits
+
+        # City code spoken after the leading "zero": 2 to 5 digit words, tagged as `extension`.
+        self.city_code_hindi = quoted_field("extension", digit_run(hindi_digit_graph, 2, 5))
+        self.city_code_english = quoted_field("extension", digit_run(english_digit_graph, 2, 5))
+        self.city_extension = self.city_code_hindi | self.city_code_english
+
+        # Landline number: exactly 7 digit words, all from one of the two digit graphs.
+        self.landline_hindi = quoted_field("number_part", digit_run(hindi_digit_graph, 7, 7))
+        self.landline_english = quoted_field("number_part", digit_run(english_digit_graph, 7, 7))
+        self.landline = self.landline_hindi | self.landline_english
+
+        # Pincode: 1 to 6 digit words.
+        # NOTE(review): a single digit word on its own is accepted here, so lone digits can be
+        # tagged as telephone; check the interplay with the other taggers' weights.
+        self.pincode_in_hindi = quoted_field("number_part", digit_run(hindi_digit_graph, 0, 5) + hindi_digit_graph)
+        self.pincode_in_english = quoted_field(
+            "number_part", digit_run(english_digit_graph, 0, 5) + english_digit_graph
+        )
+
+        # Last digits of a credit card: 1 to 4 digit words.
+        # NOTE(review): every input accepted here is also accepted by the pincode graphs above
+        # with the same output, so this branch adds no new matches to the final union.
+        self.credit_card_last_digits_hindi = quoted_field(
+            "number_part", digit_run(hindi_digit_graph, 0, 3) + hindi_digit_graph
+        )
+        self.credit_card_last_digits_english = quoted_field(
+            "number_part", digit_run(english_digit_graph, 0, 3) + english_digit_graph
+        )
+
+        # Keywords that introduce an international number / a city code; both are deleted.
+        plus_words = ("प्लस", "plus", "Plus", "PLUS")
+        zero_words = ("शून्य", "zero", "Zero", "ZERO")
+        delete_plus = pynini.union(*(pynutil.delete(word) for word in plus_words))
+        delete_zero = pynini.union(*(pynutil.delete(word) for word in zero_words))
+
+        # "plus" <country code> <1 to 10 digits>
+        graph_number_with_hindi_digit = (
+            delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit
+        )
+        # No explicit delete_space before the number here: every country-code element already
+        # ends with delete_space.
+        graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit
+
+        # "zero" <city code> <7-digit landline>
+        graph_landline_with_extension = delete_zero + delete_space + self.city_extension + delete_space + self.landline
+
+        graph_pincode = self.pincode_in_hindi | self.pincode_in_english
+
+        graph_credit_card_last_digits = self.credit_card_last_digits_hindi | self.credit_card_last_digits_english
+
+        # Final classifier: any one of the accepted shapes.
+        graph = (
+            graph_number_with_hindi_digit
+            | graph_number_with_english_digit
+            | graph_landline_with_extension
+            | graph_pincode
+            | graph_credit_card_last_digits
+        )
+
+        # add_tokens() comes from GraphFst; how it wraps the graph is not defined in this file.
+        final_graph = self.add_tokens(graph)
+        self.fst = final_graph
diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py
@@ -33,6 +33,7 @@
 from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst
@@ -82,6 +83,8 @@ def __init__(
             measure_graph = measure.fst
             money = MoneyFst(cardinal, decimal)
             money_graph = money.fst
+            telephone = TelephoneFst(cardinal)
+            telephone_graph = telephone.fst
             punct_graph = PunctuationFst().fst
             whitelist_graph = WhiteListFst().fst
             word_graph = WordFst().fst
@@ -95,6 +98,7 @@ def __init__(
                 | pynutil.add_weight(time_graph, 1.1)
                 | pynutil.add_weight(measure_graph, 1.1)
                 | pynutil.add_weight(money_graph, 1.1)
+                | pynutil.add_weight(telephone_graph, 1.1)
                 | pynutil.add_weight(word_graph, 100)
                 | pynutil.add_weight(whitelist_graph, 1.01)
             )