Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
513fff5
Addition of whitelist and word classes
tarushi2k2 Nov 27, 2024
535af69
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 27, 2024
d4e380f
Updation of Jenkins date
tarushi2k2 Nov 27, 2024
60f8757
Cleanup
tarushi2k2 Nov 28, 2024
9aa85c0
Updation
tarushi2k2 Nov 29, 2024
bf6ebe3
Updation
tarushi2k2 Nov 29, 2024
28c2cd7
Merge branch 'NVIDIA:main' into main
tarushi2k2 Dec 12, 2024
ba19d36
Future implementations for date
tarushi2k2 Dec 12, 2024
6452e61
pushing rough date code for ref
tarushi2k2 Dec 16, 2024
3821339
Future implementations date.py
tarushi2k2 Jan 9, 2025
6ece14b
Cleanup
tarushi2k2 Jan 9, 2025
6ec714c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 9, 2025
73e75b2
Merge branch 'NVIDIA:main' into hi_itn
tarushi2k2 Jan 15, 2025
2adeee4
Updation of Jenkinsfile
tarushi2k2 Jan 15, 2025
b5ede2f
Telephone.py-hindi itn
tarushi2k2 Jan 20, 2025
461962b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 20, 2025
3a011b9
Telephone.py - Hindi ITN
tarushi2k2 Jan 20, 2025
a5cf050
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 21, 2025
1c506e2
Telephone modified tagger and verbalizer
tarushi2k2 Jan 21, 2025
4503378
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 21, 2025
eb269ef
telephone tagger with 3,4,5 digit std codes
tarushi2k2 Jan 24, 2025
26e9d7f
Further additions - telephone.py
tarushi2k2 Jan 24, 2025
b743170
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 24, 2025
ab5d886
Jenkins update
tarushi2k2 Jan 27, 2025
a0ff72e
Telephone.py
tarushi2k2 Jan 28, 2025
cac9be6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 28, 2025
e238871
Updated tagger-telephone.py
tarushi2k2 Jan 29, 2025
d4d27da
Telephone and Jenkinsfile cleanup
tarushi2k2 Jan 30, 2025
e072a01
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ pipeline {
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/01-30-25-1'
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
}
stages {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ई.पू. ईसा पूर्व
ई. ईस्वी
ई. ईसवी
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
० zero
१ one
२ two
३ three
४ four
५ five
६ six
७ seven
८ eight
९ nine
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
१० ten
११ eleven
१२ twelve
१३ thirteen
१४ fourteen
१५ fifteen
१६ sixteen
१७ seventeen
१८ eighteen
१९ nineteen
२० twenty
२१ twenty one
२२ twenty two
२३ twenty three
२४ twenty four
२५ twenty five
२६ twenty six
२७ twenty seven
२८ twenty eight
२९ twenty nine
३० thirty
३१ thirty one
३२ thirty two
३३ thirty three
३४ thirty four
३५ thirty five
३६ thirty six
३७ thirty seven
३८ thirty eight
३९ thirty nine
४० forty
४१ forty one
४२ forty two
४३ forty three
४४ forty four
४५ forty five
४६ forty six
४७ forty seven
४८ forty eight
४९ forty nine
५० fifty
५१ fifty one
५२ fifty two
५३ fifty three
५४ fifty four
५५ fifty five
५६ fifty six
५७ fifty seven
५८ fifty eight
५९ fifty nine
६० sixty
६१ sixty one
६२ sixty two
६३ sixty three
६४ sixty four
६५ sixty five
६६ sixty six
६७ sixty seven
६८ sixty eight
६९ sixty nine
७० seventy
७१ seventy one
७२ seventy two
७३ seventy three
७४ seventy four
७५ seventy five
७६ seventy six
७७ seventy seven
७८ seventy eight
७९ seventy nine
८० eighty
८१ eighty one
८२ eighty two
८३ eighty three
८४ eighty four
८५ eighty five
८६ eighty six
८७ eighty seven
८८ eighty eight
८९ eighty nine
९० ninety
९१ ninety one
९२ ninety two
९३ ninety three
९४ ninety four
९५ ninety five
९६ ninety six
९७ ninety seven
९८ ninety eight
९९ ninety nine
35 changes: 33 additions & 2 deletions nemo_text_processing/inverse_text_normalization/hi/taggers/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,22 @@ def __init__(self, cardinal: GraphFst):

month_graph = pynini.string_file(get_abs_path("data/date/months.tsv"))
graph_date_days = pynini.string_file(get_abs_path("data/date/date_days.tsv")).invert()
graph_century = pynini.string_file(get_abs_path("data/date/century.tsv")).invert()

self.day = pynutil.insert("day: \"") + graph_date_days + pynutil.insert("\" ")
self.month = pynutil.insert("month: \"") + month_graph + pynutil.insert("\" ")
self.year = pynutil.insert("year: \"") + graph_year + pynutil.insert("\" ")
self.year_range = (
pynutil.insert("year: \"")
+ graph_year
+ delete_space
+ pynini.cross("से", "-")
+ delete_space
+ graph_year
+ delete_space
+ pynutil.insert("\" ")
)
self.century = pynutil.insert("text: \"") + graph_century + pynutil.insert("\" ")
insert_comma = pynutil.insert(", ")

graph_day_month = self.day + delete_space + self.month
Expand All @@ -58,9 +70,28 @@ def __init__(self, cardinal: GraphFst):
graph_month_day_year += pynutil.insert(" preserve_order: true")
graph_month_year = self.month + delete_space + self.year
graph_saal = self.year
graph_AD_BC = self.year + delete_space + self.century
graph_day_month_year_century = (
self.day + delete_space + self.month + delete_space + self.year + delete_space + self.century
)
graph_month_year_century = self.month + delete_space + self.year + delete_space + self.century
graph_year_range = self.year_range

graph = graph_day_month | graph_month_day | graph_day_month_year | graph_month_day_year | graph_month_year
self.graph = graph.optimize()
graph_date_exceptions = self.month + delete_space + pynutil.delete("की") + delete_space + self.day
graph_date_exceptions += pynutil.insert("preserve_order: true")

graph = (
graph_day_month
| graph_month_day
| graph_day_month_year
| graph_month_day_year
| graph_month_year
| graph_saal
| graph_AD_BC
| graph_day_month_year_century
| graph_month_year_century
| graph_year_range
| graph_date_exceptions
)
final_graph = self.add_tokens(graph)
self.fst = final_graph
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space
from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path


class TelephoneFst(GraphFst):
    """
    Finite state transducer for classifying spoken digit sequences such as
    telephone numbers, landline numbers with a city code, pincodes and the
    last digits of a credit card.

    Example input (spoken form):
        प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य
    The leading "plus" is deleted, the following number words are tagged as
    ``country_code: "..."`` and the remaining digits as ``number_part: "..."``.
    Landline numbers additionally carry an ``extension: "..."`` field.
    The result is wrapped by ``GraphFst.add_tokens`` before being stored in
    ``self.fst``.

    Args:
        cardinal: CardinalFst. Currently unused by this tagger; kept so the
            constructor signature stays stable for existing callers.
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="telephone", kind="classify")

        def tagged(field_name, body):
            # Emits `field_name: "<body>" ` -- the trailing space separates
            # this field from whatever field follows it.
            return pynutil.insert(f'{field_name}: "') + body + pynutil.insert('" ')

        def digit_run(digit_graph, low, high):
            # Between `low` and `high` digits, each one followed by
            # `delete_space` (so the run also consumes its trailing separator).
            return pynini.closure(digit_graph + delete_space, low, high)

        # Spoken Hindi digit words (including zero) -> written digit.
        hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert()
        hindi_digit_graph |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert()

        # Spoken English digit words -> written digit.
        english_digit_graph = pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert()

        # A single country-code digit may be spoken in either language. This is
        # the union of exactly the three files loaded above, so the compiled
        # graphs are reused instead of reading the files a second time.
        country_code_graph_single_digits = hindi_digit_graph | english_digit_graph

        # Two-digit country codes spoken as one number word, in either language.
        country_code_graph_double_digits = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert()
        country_code_graph_double_digits |= pynini.string_file(
            get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv")
        ).invert()

        # Subscriber number: one to ten digits.
        self.hindi_digit = tagged("number_part", digit_run(hindi_digit_graph, 0, 9) + hindi_digit_graph)
        # NOTE(review): unlike the Hindi variant, the English variant also
        # consumes separators after the final digit. Kept as-is to preserve
        # the accepted language -- confirm whether the asymmetry is intended.
        self.english_digit = tagged(
            "number_part", digit_run(english_digit_graph, 0, 9) + english_digit_graph + delete_space
        )

        # Country code: up to two single-digit words, or at most one
        # two-digit word. Both closures start at 0, so an empty code matches.
        self.country_code_with_single_digits = tagged("country_code", digit_run(country_code_graph_single_digits, 0, 2))
        self.country_code_with_double_digits = tagged("country_code", digit_run(country_code_graph_double_digits, 0, 1))
        self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits

        # City (STD) code: two to five digits, spoken after a leading zero
        # that the landline graph below deletes.
        self.city_code_hindi = tagged("extension", digit_run(hindi_digit_graph, 2, 5))
        self.city_code_english = tagged("extension", digit_run(english_digit_graph, 2, 5))
        self.city_extension = self.city_code_hindi | self.city_code_english

        # Landline number: exactly seven digits, in Hindi or English words.
        self.landline_hindi = tagged("number_part", digit_run(hindi_digit_graph, 7, 7))
        self.landline_english = tagged("number_part", digit_run(english_digit_graph, 7, 7))
        self.landline = self.landline_hindi | self.landline_english

        # Pincode: one to six digits.
        self.pincode_in_hindi = tagged("number_part", digit_run(hindi_digit_graph, 0, 5) + hindi_digit_graph)
        self.pincode_in_english = tagged("number_part", digit_run(english_digit_graph, 0, 5) + english_digit_graph)

        # Last digits of a credit card: one to four digits.
        self.credit_card_last_digits_hindi = tagged(
            "number_part", digit_run(hindi_digit_graph, 0, 3) + hindi_digit_graph
        )
        self.credit_card_last_digits_english = tagged(
            "number_part", digit_run(english_digit_graph, 0, 3) + english_digit_graph
        )

        # Spoken markers that are dropped from the output.
        delete_plus = pynutil.delete("प्लस") | pynutil.delete("plus") | pynutil.delete("Plus") | pynutil.delete("PLUS")
        delete_zero = pynutil.delete("शून्य") | pynutil.delete("zero") | pynutil.delete("Zero") | pynutil.delete("ZERO")

        # "plus" + country code + subscriber number.
        graph_number_with_hindi_digit = (
            delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit
        )
        # No explicit delete_space before the number here: every country-code
        # digit already ends with delete_space inside its closure.
        graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit

        # Leading zero + city code + seven-digit landline number.
        graph_landline_with_extension = delete_zero + delete_space + self.city_extension + delete_space + self.landline

        graph_pincode = self.pincode_in_hindi | self.pincode_in_english

        graph_credit_card_last_digits = self.credit_card_last_digits_hindi | self.credit_card_last_digits_english

        graph = (
            graph_number_with_hindi_digit
            | graph_number_with_english_digit
            | graph_landline_with_extension
            | graph_pincode
            | graph_credit_card_last_digits
        )

        final_graph = self.add_tokens(graph)
        self.fst = final_graph
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst
Expand Down Expand Up @@ -82,6 +83,8 @@ def __init__(
measure_graph = measure.fst
money = MoneyFst(cardinal, decimal)
money_graph = money.fst
telephone = TelephoneFst(cardinal)
telephone_graph = telephone.fst
punct_graph = PunctuationFst().fst
whitelist_graph = WhiteListFst().fst
word_graph = WordFst().fst
Expand All @@ -95,6 +98,7 @@ def __init__(
| pynutil.add_weight(time_graph, 1.1)
| pynutil.add_weight(measure_graph, 1.1)
| pynutil.add_weight(money_graph, 1.1)
| pynutil.add_weight(telephone_graph, 1.1)
| pynutil.add_weight(word_graph, 100)
| pynutil.add_weight(whitelist_graph, 1.01)
)
Expand Down
Loading