Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ pipeline {
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/01-30-25-1'
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
}
stages {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ ASR a s r
ASUS a sus
ASUS asus
AT&T a t and t
AT&T at and t
ATM a t m
AV1 a v one
AV1 av one
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ई.पू. ईसा पूर्व
ई. ईस्वी
ई. ईसवी
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
० zero
१ one
२ two
३ three
४ four
५ five
६ six
७ seven
८ eight
९ nine
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
१० ten
११ eleven
१२ twelve
१३ thirteen
१४ fourteen
१५ fifteen
१६ sixteen
१७ seventeen
१८ eighteen
१९ nineteen
२० twenty
२१ twenty one
२२ twenty two
२३ twenty three
२४ twenty four
२५ twenty five
२६ twenty six
२७ twenty seven
२८ twenty eight
२९ twenty nine
३० thirty
३१ thirty one
३२ thirty two
३३ thirty three
३४ thirty four
३५ thirty five
३६ thirty six
३७ thirty seven
३८ thirty eight
३९ thirty nine
४० forty
४१ forty one
४२ forty two
४३ forty three
४४ forty four
४५ forty five
४६ forty six
४७ forty seven
४८ forty eight
४९ forty nine
५० fifty
५१ fifty one
५२ fifty two
५३ fifty three
५४ fifty four
५५ fifty five
५६ fifty six
५७ fifty seven
५८ fifty eight
५९ fifty nine
६० sixty
६१ sixty one
६२ sixty two
६३ sixty three
६४ sixty four
६५ sixty five
६६ sixty six
६७ sixty seven
६८ sixty eight
६९ sixty nine
७० seventy
७१ seventy one
७२ seventy two
७३ seventy three
७४ seventy four
७५ seventy five
७६ seventy six
७७ seventy seven
७८ seventy eight
७९ seventy nine
८० eighty
८१ eighty one
८२ eighty two
८३ eighty three
८४ eighty four
८५ eighty five
८६ eighty six
८७ eighty seven
८८ eighty eight
८९ eighty nine
९० ninety
९१ ninety one
९२ ninety two
९३ ninety three
९४ ninety four
९५ ninety five
९६ ninety six
९७ ninety seven
९८ ninety eight
९९ ninety nine
35 changes: 33 additions & 2 deletions nemo_text_processing/inverse_text_normalization/hi/taggers/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,22 @@ def __init__(self, cardinal: GraphFst):

month_graph = pynini.string_file(get_abs_path("data/date/months.tsv"))
graph_date_days = pynini.string_file(get_abs_path("data/date/date_days.tsv")).invert()
graph_century = pynini.string_file(get_abs_path("data/date/century.tsv")).invert()

self.day = pynutil.insert("day: \"") + graph_date_days + pynutil.insert("\" ")
self.month = pynutil.insert("month: \"") + month_graph + pynutil.insert("\" ")
self.year = pynutil.insert("year: \"") + graph_year + pynutil.insert("\" ")
self.year_range = (
pynutil.insert("year: \"")
+ graph_year
+ delete_space
+ pynini.cross("से", "-")
+ delete_space
+ graph_year
+ delete_space
+ pynutil.insert("\" ")
)
self.century = pynutil.insert("text: \"") + graph_century + pynutil.insert("\" ")
insert_comma = pynutil.insert(", ")

graph_day_month = self.day + delete_space + self.month
Expand All @@ -58,9 +70,28 @@ def __init__(self, cardinal: GraphFst):
graph_month_day_year += pynutil.insert(" preserve_order: true")
graph_month_year = self.month + delete_space + self.year
graph_saal = self.year
graph_AD_BC = self.year + delete_space + self.century
graph_day_month_year_century = (
self.day + delete_space + self.month + delete_space + self.year + delete_space + self.century
)
graph_month_year_century = self.month + delete_space + self.year + delete_space + self.century
graph_year_range = self.year_range

graph = graph_day_month | graph_month_day | graph_day_month_year | graph_month_day_year | graph_month_year
self.graph = graph.optimize()
graph_date_exceptions = self.month + delete_space + pynutil.delete("की") + delete_space + self.day
graph_date_exceptions += pynutil.insert("preserve_order: true")

graph = (
graph_day_month
| graph_month_day
| graph_day_month_year
| graph_month_day_year
| graph_month_year
| graph_saal
| graph_AD_BC
| graph_day_month_year_century
| graph_month_year_century
| graph_year_range
| graph_date_exceptions
)
final_graph = self.add_tokens(graph)
self.fst = final_graph
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space
from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path


class TelephoneFst(GraphFst):
    """
    Finite state transducer for classifying spoken digit sequences as telephone-style numbers.

    The tagger accepts five input shapes and emits quoted fields for each:

    * "plus" + country code + up to ten digits (Hindi digit words)
      -> ``country_code: "..." number_part: "..."``
    * "plus" + country code + up to ten digits (English digit words)
      -> ``country_code: "..." number_part: "..."``
    * "zero" + two to five city-code digits + exactly seven landline digits
      -> ``extension: "..." number_part: "..."``
    * one to six digits (pincode) -> ``number_part: "..."``
    * one to four digits (last digits of a card) -> ``number_part: "..."``

    Example input (Hindi):
        प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य
    NOTE(review): the exact output string depends on the TSV data files and on
    ``GraphFst.add_tokens``, neither of which is visible here - confirm against the
    test cases before quoting an expected output.

    Args:
        cardinal: CardinalFst (accepted for interface parity; not used by this tagger)
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="telephone", kind="classify")

        def load_inverted(relative_path):
            """Load a TSV mapping from the data directory and invert it."""
            return pynini.string_file(get_abs_path(relative_path)).invert()

        def spaced_run(unit, low, high):
            """Accept `low`..`high` repetitions of `unit`, each followed by `delete_space`."""
            return pynini.closure(unit + delete_space, low, high)

        def quoted(opening, body):
            """Wrap `body` between an inserted opening tag and an inserted closing quote."""
            return pynutil.insert(opening) + body + pynutil.insert("\" ")

        # Single-digit vocabularies: Hindi digit words (including zero) and English digit words.
        digit_hi = load_inverted("data/numbers/digit.tsv") | load_inverted("data/numbers/zero.tsv")
        digit_en = load_inverted("data/telephone/eng_to_hindi_digit.tsv")

        # Country codes may be spoken digit by digit (either language) ...
        cc_single = (
            load_inverted("data/numbers/digit.tsv")
            | load_inverted("data/numbers/zero.tsv")
            | load_inverted("data/telephone/eng_to_hindi_digit.tsv")
        )
        # ... or as one two-digit number word (either language).
        cc_double = load_inverted("data/numbers/teens_and_ties.tsv") | load_inverted(
            "data/telephone/teens_and_ties_eng_to_hin.tsv"
        )

        # Main number after a country code: one to ten digits.
        self.hindi_digit = quoted("number_part: \"", spaced_run(digit_hi, 0, 9) + digit_hi)
        # The English variant additionally consumes whitespace after the last digit.
        self.english_digit = quoted("number_part: \"", spaced_run(digit_en, 0, 9) + digit_en + delete_space)

        # Country code: zero to two single digits, or zero to one two-digit word.
        self.country_code_with_single_digits = quoted("country_code: \"", spaced_run(cc_single, 0, 2))
        self.country_code_with_double_digits = quoted("country_code: \"", spaced_run(cc_double, 0, 1))
        self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits

        # City (STD) code that follows a leading zero: two to five digits.
        self.city_code_hindi = quoted("extension: \"", spaced_run(digit_hi, 2, 5))
        self.city_code_english = quoted("extension: \"", spaced_run(digit_en, 2, 5))
        self.city_extension = self.city_code_hindi | self.city_code_english

        # Landline subscriber number: exactly seven digits.
        self.landline_hindi = quoted("number_part: \"", spaced_run(digit_hi, 7, 7))
        self.landline_english = quoted("number_part: \"", spaced_run(digit_en, 7, 7))
        self.landline = self.landline_hindi | self.landline_english

        # Pincode: one to six digits.
        self.pincode_in_hindi = quoted("number_part: \"", spaced_run(digit_hi, 0, 5) + digit_hi)
        self.pincode_in_english = quoted("number_part: \"", spaced_run(digit_en, 0, 5) + digit_en)

        # Last digits of a credit card: one to four digits.
        self.credit_card_last_digits_hindi = quoted("number_part: \"", spaced_run(digit_hi, 0, 3) + digit_hi)
        self.credit_card_last_digits_english = quoted("number_part: \"", spaced_run(digit_en, 0, 3) + digit_en)

        # Spoken markers that are consumed and produce no output.
        delete_plus = pynini.union(*(pynutil.delete(word) for word in ("प्लस", "plus", "Plus", "PLUS")))
        delete_zero = pynini.union(*(pynutil.delete(word) for word in ("शून्य", "zero", "Zero", "ZERO")))

        plus_and_country_code = delete_plus + delete_space + self.country_code
        international_hindi = plus_and_country_code + delete_space + self.hindi_digit
        international_english = plus_and_country_code + self.english_digit

        landline_with_city_code = delete_zero + delete_space + self.city_extension + delete_space + self.landline

        pincode = self.pincode_in_hindi | self.pincode_in_english
        card_last_digits = self.credit_card_last_digits_hindi | self.credit_card_last_digits_english

        all_patterns = (
            international_hindi
            | international_english
            | landline_with_city_code
            | pincode
            | card_last_digits
        )

        self.fst = self.add_tokens(all_patterns)
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst
Expand Down Expand Up @@ -82,6 +83,8 @@ def __init__(
measure_graph = measure.fst
money = MoneyFst(cardinal, decimal)
money_graph = money.fst
telephone = TelephoneFst(cardinal)
telephone_graph = telephone.fst
punct_graph = PunctuationFst().fst
whitelist_graph = WhiteListFst().fst
word_graph = WordFst().fst
Expand All @@ -95,6 +98,7 @@ def __init__(
| pynutil.add_weight(time_graph, 1.1)
| pynutil.add_weight(measure_graph, 1.1)
| pynutil.add_weight(money_graph, 1.1)
| pynutil.add_weight(telephone_graph, 1.1)
| pynutil.add_weight(word_graph, 100)
| pynutil.add_weight(whitelist_graph, 1.01)
)
Expand Down
Loading
Loading