diff --git a/Jenkinsfile b/Jenkinsfile
index 6edad14a2..ba381f535 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -27,7 +27,7 @@ pipeline {
     HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
     MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
     JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
-    HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-1'
+    HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/01-30-25-1'
     DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
   }
   stages {
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv
new file mode 100644
index 000000000..da69e23eb
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv
@@ -0,0 +1,3 @@
+ई.पू. ईसा पूर्व
+ई. ईस्वी
+ई. ईसवी
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv
new file mode 100644
index 000000000..53c5e36cb
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv
@@ -0,0 +1,10 @@
+० zero
+१ one
+२ two
+३ three
+४ four
+५ five
+६ six
+७ seven
+८ eight
+९ nine
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv
new file mode 100644
index 000000000..ac37b55f2
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv
@@ -0,0 +1,90 @@
+१० ten
+११ eleven
+१२ twelve
+१३ thirteen
+१४ fourteen
+१५ fifteen
+१६ sixteen
+१७ seventeen
+१८ eighteen
+१९ nineteen
+२० twenty
+२१ twenty one
+२२ twenty two
+२३ twenty three
+२४ twenty four
+२५ twenty five
+२६ twenty six
+२७ twenty seven
+२८ twenty eight
+२९ twenty nine
+३० thirty
+३१ thirty one
+३२ thirty two
+३३ thirty three
+३४ thirty four
+३५ thirty five
+३६ thirty six
+३७ thirty seven
+३८ thirty eight
+३९ thirty nine
+४० forty
+४१ forty one
+४२ forty two
+४३ forty three
+४४ forty four
+४५ forty five
+४६ forty six
+४७ forty seven
+४८ forty eight
+४९ forty nine
+५० fifty
+५१ fifty one
+५२ fifty two
+५३ fifty three
+५४ fifty four
+५५ fifty five
+५६ fifty six
+५७ fifty seven
+५८ fifty eight
+५९ fifty nine
+६० sixty
+६१ sixty one
+६२ sixty two
+६३ sixty three
+६४ sixty four
+६५ sixty five
+६६ sixty six
+६७ sixty seven
+६८ sixty eight
+६९ sixty nine
+७० seventy
+७१ seventy one
+७२ seventy two
+७३ seventy three
+७४ seventy four
+७५ seventy five
+७६ seventy six
+७७ seventy seven
+७८ seventy eight
+७९ seventy nine
+८० eighty
+८१ eighty one
+८२ eighty two
+८३ eighty three
+८४ eighty four
+८५ eighty five
+८६ eighty six
+८७ eighty seven
+८८ eighty eight
+८९ eighty nine
+९० ninety
+९१ ninety one
+९२ ninety two
+९३ ninety three
+९४ ninety four
+९५ ninety five
+९६ ninety six
+९७ ninety seven
+९८ ninety eight
+९९ ninety nine
diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py
index 61183ae72..6859f0834 100644
--- a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py
+++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py
@@ -44,10 +44,22 @@ def __init__(self, cardinal: GraphFst):
         month_graph = pynini.string_file(get_abs_path("data/date/months.tsv"))
         graph_date_days = pynini.string_file(get_abs_path("data/date/date_days.tsv")).invert()
+        graph_century = pynini.string_file(get_abs_path("data/date/century.tsv")).invert()
 
         self.day = pynutil.insert("day: \"") + graph_date_days + pynutil.insert("\" ")
         self.month = pynutil.insert("month: \"") + month_graph + pynutil.insert("\" ")
         self.year = pynutil.insert("year: \"") + graph_year + pynutil.insert("\" ")
+        self.year_range = (
+            pynutil.insert("year: \"")
+            + graph_year
+            + delete_space
+            + pynini.cross("से", "-")
+            + delete_space
+            + graph_year
+            + delete_space
+            + pynutil.insert("\" ")
+        )
+        self.century = pynutil.insert("text: \"") + graph_century + pynutil.insert("\" ")
 
         insert_comma = pynutil.insert(", ")
 
         graph_day_month = self.day + delete_space + self.month
@@ -58,9 +70,28 @@ def __init__(self, cardinal: GraphFst):
         graph_month_day_year += pynutil.insert(" preserve_order: true")
         graph_month_year = self.month + delete_space + self.year
         graph_saal = self.year
+        graph_AD_BC = self.year + delete_space + self.century
+        graph_day_month_year_century = (
+            self.day + delete_space + self.month + delete_space + self.year + delete_space + self.century
+        )
+        graph_month_year_century = self.month + delete_space + self.year + delete_space + self.century
+        graph_year_range = self.year_range
 
-        graph = graph_day_month | graph_month_day | graph_day_month_year | graph_month_day_year | graph_month_year
-        self.graph = graph.optimize()
+        graph_date_exceptions = self.month + delete_space + pynutil.delete("की") + delete_space + self.day
+        graph_date_exceptions += pynutil.insert("preserve_order: true")
+        graph = (
+            graph_day_month
+            | graph_month_day
+            | graph_day_month_year
+            | graph_month_day_year
+            | graph_month_year
+            | graph_saal
+            | graph_AD_BC
+            | graph_day_month_year_century
+            | graph_month_year_century
+            | graph_year_range
+            | graph_date_exceptions
+        )
 
         final_graph = self.add_tokens(graph)
         self.fst = final_graph
diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py
new file mode 100644
index 000000000..1d1d3c875
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py
@@ -0,0 +1,158 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space
+from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path
+
+
+class TelephoneFst(GraphFst):
+    """
+    Finite state transducer for classifying telephone numbers, e.g.
+    प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य => tokens { name: "+९१ ९८७६५४३२१०" }
+
+    Args:
+        cardinal: CardinalFst
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="telephone", kind="classify")
+
+        hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert()
+        hindi_digit_graph |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert()
+
+        english_digit_graph = pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert()
+
+        country_code_graph_single_digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert()
+        country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert()
+        country_code_graph_single_digits |= pynini.string_file(
+            get_abs_path("data/telephone/eng_to_hindi_digit.tsv")
+        ).invert()
+
+        country_code_graph_double_digits = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert()
+        country_code_graph_double_digits |= pynini.string_file(
+            get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv")
+        ).invert()
+
+        self.hindi_digit = (
+            pynutil.insert("number_part: \"")
+            + pynini.closure(hindi_digit_graph + delete_space, 0, 9)
+            + hindi_digit_graph
+            + pynutil.insert("\" ")
+        )
+        self.english_digit = (
+            pynutil.insert("number_part: \"")
+            + pynini.closure(english_digit_graph + delete_space, 0, 9)
+            + english_digit_graph
+            + delete_space
+            + pynutil.insert("\" ")
+        )
+
+        self.country_code_with_single_digits = (
+            pynutil.insert("country_code: \"")
+            + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2)
+            + pynutil.insert("\" ")
+        )
+        self.country_code_with_double_digits = (
+            pynutil.insert("country_code: \"")
+            + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1)
+            + pynutil.insert("\" ")
+        )
+        self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits
+
+        # two, three, four-digit extension code with zero
+        self.city_code_hindi = (
+            pynutil.insert("extension: \"")
+            + pynini.closure(hindi_digit_graph + delete_space, 2, 5)
+            + pynutil.insert("\" ")
+        )
+        self.city_code_english = (
+            pynutil.insert("extension: \"")
+            + pynini.closure(english_digit_graph + delete_space, 2, 5)
+            + pynutil.insert("\" ")
+        )
+
+        self.city_extension = self.city_code_hindi | self.city_code_english
+
+        # 7-digit landline graph in hindi and english digits
+        self.landline_hindi = (
+            pynutil.insert("number_part: \"")
+            + pynini.closure(hindi_digit_graph + delete_space, 7, 7)
+            + pynutil.insert("\" ")
+        )
+        self.landline_english = (
+            pynutil.insert("number_part: \"")
+            + pynini.closure(english_digit_graph + delete_space, 7, 7)
+            + pynutil.insert("\" ")
+        )
+
+        self.landline = self.landline_hindi | self.landline_english
+
+        self.pincode_in_hindi = (
+            pynutil.insert("number_part: \"")
+            + pynini.closure(hindi_digit_graph + delete_space, 0, 5)
+            + hindi_digit_graph
+            + pynutil.insert("\" ")
+        )
+        self.pincode_in_english = (
+            pynutil.insert("number_part: \"")
+            + pynini.closure(english_digit_graph + delete_space, 0, 5)
+            + english_digit_graph
+            + pynutil.insert("\" ")
+        )
+
+        self.credit_card_last_digits_hindi = (
+            pynutil.insert("number_part: \"")
+            + pynini.closure(hindi_digit_graph + delete_space, 0, 3)
+            + hindi_digit_graph
+            + pynutil.insert("\" ")
+        )
+        self.credit_card_last_digits_english = (
+            pynutil.insert("number_part: \"")
+            + pynini.closure(english_digit_graph + delete_space, 0, 3)
+            + english_digit_graph
+            + pynutil.insert("\" ")
+        )
+
+        delete_plus = pynini.union(
+            pynutil.delete("प्लस") | pynutil.delete("plus") | pynutil.delete("Plus") | pynutil.delete("PLUS")
+        )
+
+        delete_zero = pynini.union(
+            pynutil.delete("शून्य") | pynutil.delete("zero") | pynutil.delete("Zero") | pynutil.delete("ZERO")
+        )
+
+        graph_number_with_hindi_digit = (
+            delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit
+        )
+        graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit
+
+        graph_landline_with_extension = delete_zero + delete_space + self.city_extension + delete_space + self.landline
+
+        graph_pincode = self.pincode_in_hindi | self.pincode_in_english
+
+        graph_credit_card_last_digits = self.credit_card_last_digits_hindi | self.credit_card_last_digits_english
+
+        graph = (
+            graph_number_with_hindi_digit
+            | graph_number_with_english_digit
+            | graph_landline_with_extension
+            | graph_pincode
+            | graph_credit_card_last_digits
+        )
+
+        final_graph = self.add_tokens(graph)
+        self.fst = final_graph
diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py
index a5a371d90..62554bd14 100644
--- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py
+++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py
@@ -33,6 +33,7 @@
 from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst
@@ -82,6 +83,8 @@ def __init__(
         measure_graph = measure.fst
         money = MoneyFst(cardinal, decimal)
         money_graph = money.fst
+        telephone = TelephoneFst(cardinal)
+        telephone_graph = telephone.fst
         punct_graph = PunctuationFst().fst
         whitelist_graph = WhiteListFst().fst
         word_graph = WordFst().fst
@@ -95,6 +98,7 @@ def __init__(
             | pynutil.add_weight(time_graph, 1.1)
             | pynutil.add_weight(measure_graph, 1.1)
             | pynutil.add_weight(money_graph, 1.1)
+            | pynutil.add_weight(telephone_graph, 1.1)
             | pynutil.add_weight(word_graph, 100)
             | pynutil.add_weight(whitelist_graph, 1.01)
         )
diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py
index 5442777da..eacfb5765 100644
--- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py
+++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py
@@ -61,22 +61,45 @@ def __init__(self):
             + pynini.closure(NEMO_NOT_QUOTE, 1)
             + pynutil.delete("\"")
         )
-        graph_fy = period + delete_space + year
+        graph_fy = year
+        graph_fy |= period + delete_space + year
+
+        # century
+        graph_century = year + delete_extra_space + period
+
         # month (day) year
         graph_mdy = month + delete_extra_space + day + pynutil.insert(",") + delete_extra_space + year
 
         # (day) month year
         graph_dmy = day + delete_extra_space + month + pynutil.insert(",") + delete_extra_space + year
+        # day month year century
+        graph_dmyc = (
+            day
+            + delete_extra_space
+            + month
+            + pynutil.insert(",")
+            + delete_extra_space
+            + year
+            + delete_extra_space
+            + period
+        )
+
         # month year
         graph_my = month + pynini.closure(delete_extra_space + year, 0, 1)
+        # month year century
+        graph_myc = month + pynutil.insert(",") + delete_extra_space + year + delete_extra_space + period
+
         # month day
         graph_md = month + pynini.closure(delete_extra_space + day, 0, 1)
 
         # day month
         graph_dm = day + pynini.closure(delete_extra_space + month, 0, 1)
+        # year range
+        graph_year_range = year
+
         optional_preserve_order = pynini.closure(
             pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
             | pynutil.delete("field_order:")
@@ -88,7 +111,18 @@ def __init__(self):
         )
 
         final_graph = (
-            (graph_fy | graph_mdy | graph_dmy | graph_my | graph_md | graph_dm)
+            (
+                graph_fy
+                | graph_mdy
+                | graph_dmy
+                | graph_my
+                | graph_md
+                | graph_dm
+                | graph_century
+                | graph_dmyc
+                | graph_myc
+                | graph_year_range
+            )
             + delete_space
             + optional_preserve_order
         )
diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py
new file mode 100644
index 000000000..3f4b4de1f
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2025 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
+
+
+class TelephoneFst(GraphFst):
+    """
+    Finite state transducer for verbalizing telephone, e.g.
+    telephone { country_code: "९१" number_part: "९८७६५४३२१०" }
+    -> +९१ ९८७६५४३२१०
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="telephone", kind="verbalize")
+
+        number_part = pynutil.delete("number_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        optional_country_code = pynini.closure(
+            pynutil.delete("country_code: \"")
+            + pynutil.insert("+")
+            + delete_space
+            + pynini.closure(NEMO_NOT_QUOTE, 1)
+            + pynutil.delete("\"")
+            + pynini.accep(" "),
+            0,
+            1,
+        )
+        optional_city_code = pynini.closure(
+            pynutil.delete("extension: \"")
+            + pynutil.insert("०")
+            + delete_space
+            + pynini.closure(NEMO_NOT_QUOTE, 1)
+            + pynutil.delete("\"")
+            + pynini.accep(" "),
+            0,
+            1,
+        )
+        delete_tokens = self.delete_tokens(optional_country_code + number_part)
+        delete_tokens |= self.delete_tokens(optional_city_code + number_part)
+        self.fst = delete_tokens.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py
index d88bd25d9..165fe7a7e 100644
--- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py
+++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py
@@ -21,6 +21,7 @@
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.measure import MeasureFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst
+from nemo_text_processing.inverse_text_normalization.hi.verbalizers.telephone import TelephoneFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst
@@ -45,6 +46,7 @@ def __init__(self):
         time_graph = TimeFst().fst
         measure_graph = MeasureFst(cardinal, decimal).fst
         money_graph = MoneyFst(cardinal, decimal).fst
+        telephone_graph = TelephoneFst(cardinal).fst
         word_graph = WordFst().fst
         whitelist_graph = WhiteListFst().fst
 
@@ -59,5 +61,6 @@ def __init__(self):
             | time_graph
             | measure_graph
             | money_graph
+            | telephone_graph
         )
         self.fst = graph
diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt
index bdc450fdd..6d570a9c5 100644
--- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt
+++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt
@@ -22,4 +22,14 @@
 सत्ताईस जुलाई दो हज़ार ग्यारह~२७ जुलाई, २०११
 जुलाई सत्ताईस~जुलाई २७
 वर्ष दो हज़ार उन्नीस~वर्ष २०१९
-सन उन्नीस सौ नब्बे~सन १९९०
\ No newline at end of file
+सन उन्नीस सौ नब्बे~सन १९९०
+उन्नीस सौ नब्बे से उन्नीस सौ इक्यानबे~१९९०-१९९१
+दो हज़ार पाँच से दो हज़ार उन्नीस~२००५-२०१९
+दो हज़ार पाँच से उन्नीस~२००५-१९
+चौंतीस सौ ईसा पूर्व~३४०० ई.पू.
+उन्नीस सौ बीस ईस्वी~१९२० ई.
+पच्चीस जनवरी अठारह सौ तिरेपन ईसवी~२५ जनवरी, १८५३ ई.
+इकत्तीस मई उन्नीस सौ नब्बे ईसवी~३१ मई, १९९० ई.
+पच्चीस ईसा पूर्व~२५ ई.पू.
+मार्च की दो~मार्च २
+फ़रवरी की बीस~फ़रवरी २०
diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt
new file mode 100644
index 000000000..0c51d8df0
--- /dev/null
+++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt
@@ -0,0 +1,37 @@
+प्लस इक्यानवे nine four one one one two three four one two~+९१ ९४१११२३४१२
+प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य~+९१ ९८७६५४३२१०
+plus nine eight nine four one one one two three four zero one~+९८ ९४१११२३४०१
+प्लस नौ एक नौ आठ सात छह पांच चार तीन दो एक शून्य~+९१ ९८७६५४३२१०
+plus eleven nine four one one one two three~+११ ९४१११२३
+zero eight zero two nine four one one one two~०८० २९४१११२
+शून्य आठ शून्य दो नौ चार एक एक एक दो~०८० २९४१११२
+zero four zero two seven eight one eight three nine~०४० २७८१८३९
+शून्य चार शून्य दो सात आठ एक आठ तीन नौ~०४० २७८१८३९
+शून्य सात नौ एक नौ आठ सात छह पांच चार~०७९ १९८७६५४
+प्लस नौ एक नौ तीन आठ दो सात एक चार छह पांच शून्य~+९१ ९३८२७१४६५०
+प्लस नौ एक नौ शून्य पांच एक तीन चार आठ दो सात छह~+९१ ९०५१३४८२७६
+प्लस नौ एक नौ चार तीन सात दो शून्य पांच छह एक आठ~+९१ ९४३७२०५६१८
+PLUS ninety one nine three eight two seven one four six five zero~+९१ ९३८२७१४६५०
+plus nine one nine zero five one three four eight two seven six~+९१ ९०५१३४८२७६
+plus ninety one nine four three seven two zero five six one eight~+९१ ९४३७२०५६१८
+ZERO seven three चार पाँच छह सात आठ नौ शून्य~०७३ ४५६७८९०
+शून्य चार शून्य पाँच चार एक दो सात तीन आठ~०४० ५४१२७३८
+ZERO seven three four five six seven eight nine zero~०७३ ४५६७८९०
+zero two eight seven six five four three two seven~०२८ ७६५४३२७
+PLUS eighty one nine seven four seven two zero zero one one eight~+८१ ९७४७२००११८
+zero eight zero two two nine four one one one~०८० २२९४१११
+शून्य सात नौ एक नौ आठ सात छह पांच चार~०७९ १९८७६५४
+zero eight zero nine two two nine four one one one~०८०९ २२९४१११
+शून्य सात नौ नौ एक नौ आठ सात छह पांच चार~०७९९ १९८७६५४
+zero three one nine two two two nine four one one one~०३१९२ २२९४१११
+शून्य सात नौ एक एक एक नौ आठ सात छह पांच चार~०७९११ १९८७६५४
+एक एक शून्य शून्य सात शून्य दिल्ली के वसंत कुंज का पिनकोड है~११००७० दिल्ली के वसंत कुंज का पिनकोड है
+बंगलौर के बैयापानहली का पिनकोड पाँच छह शून्य शून्य तीन आठ है~बंगलौर के बैयापानहली का पिनकोड ५६००३८ है
+दिल्ली के वसंत कुंज का पिनकोड one one zero zero seven zero है~दिल्ली के वसंत कुंज का पिनकोड ११००७० है
+five six zero zero three eight बंगलौर के बैयापानहली का पिनकोड है~५६००३८ बंगलौर के बैयापानहली का पिनकोड है
+मेरे क्रेडिट कार्ड के आखिरी डिजिट शून्य शून्य तीन आठ हैं~मेरे क्रेडिट कार्ड के आखिरी डिजिट ००३८ हैं
+क्रेडिट कार्ड के आखिरी डिजिट four three seven two हैं~क्रेडिट कार्ड के आखिरी डिजिट ४३७२ हैं
+दिल्ली के वसंत कुंज का पिनकोड one one zero zero seven zero है~दिल्ली के वसंत कुंज का पिनकोड ११००७० है
+five six zero zero three eight बंगलौर के बैयापानहली का पिनकोड है~५६००३८ बंगलौर के बैयापानहली का पिनकोड है
+मेरे क्रेडिट कार्ड के आखिरी डिजिट शून्य शून्य तीन आठ हैं~मेरे क्रेडिट कार्ड के आखिरी डिजिट ००३८ हैं
+क्रेडिट कार्ड के आखिरी डिजिट four three seven two हैं~क्रेडिट कार्ड के आखिरी डिजिट ४३७२ हैं
diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh
index aec7299d5..a365a834d 100644
--- a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh
+++ b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh
@@ -63,6 +63,11 @@ testITNMoney() {
   runtest $input
 }
+testITNTelephone() {
+  input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_telephone.txt
+  runtest $input
+}
+
 testITNWord() {
   input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_word.txt
   runtest $input
 }
diff --git a/tests/nemo_text_processing/hi/test_telephone.py b/tests/nemo_text_processing/hi/test_telephone.py
new file mode 100644
index 000000000..7e43f7e82
--- /dev/null
+++ b/tests/nemo_text_processing/hi/test_telephone.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestTelephone:
+    inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_telephone.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred.strip() == expected.strip()
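
Usage note (not part of the patch): a minimal sketch of how the new Hindi telephone grammar is exercised end to end, mirroring tests/nemo_text_processing/hi/test_telephone.py. The input and expected output are taken from test_cases_telephone.txt; cache_dir is optional, and building the grammars from scratch can take a while.

# Minimal sketch, assuming nemo_text_processing and pynini are installed locally.
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer

# Same entry point the new unit test uses; pass cache_dir to reuse prebuilt grammars.
normalizer = InverseNormalizer(lang='hi')

spoken = "प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य"
written = normalizer.inverse_normalize(spoken, verbose=False)
print(written)  # expected per test_cases_telephone.txt: +९१ ९८७६५४३२१०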