diff --git a/Jenkinsfile b/Jenkinsfile index c94c107c6..53c784920 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-03-25-1' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-12-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv new file mode 100644 index 000000000..d4c1ca0b1 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv @@ -0,0 +1,3 @@ +सन् +सन +साल \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv b/nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv new file mode 100644 index 000000000..6806d3f12 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv @@ -0,0 +1,10 @@ + में + का + की + के + से + तक + ईस्वी + शताब्दी + दशक + सदी \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv b/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv new file mode 100644 index 000000000..acb37d534 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv @@ -0,0 +1,2 @@ +ई. पू. ईसा पूर्व +ई. ईसवी \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv new file mode 100644 index 000000000..eaddf930a --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv @@ -0,0 +1,12 @@ +s सेकंड +hr घंटा +h घंटे +min मिनट +doz दर्जन +yr साल +yr वर्ष +hp हॉर्सपॉवर +d दिन +month महीना +months महीने +हफ़्ते हफ़्ते \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv index 0bf561379..189512687 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv @@ -141,14 +141,16 @@ month महीना months महीने ct कैरेट pH पीएच +km/h किलोमीटर प्रति घंटा km/hr किलोमीटर प्रति घंटा km/min किलोमीटर प्रति मिनट +m/h मीटर प्रति घंटा m/hr मीटर प्रति घंटा mi/s मील प्रति सेकंड +mi/h मील प्रति घंटा mi/hr मील प्रति घंटा mi/min मील प्रति मिनट ₹/ac रुपए प्रति एकड़ x बाई X बाई * बाई -- से diff --git a/nemo_text_processing/text_normalization/hi/data/money/currency.tsv b/nemo_text_processing/text_normalization/hi/data/money/currency.tsv index 88633ec7c..8f4a955cc 100644 --- a/nemo_text_processing/text_normalization/hi/data/money/currency.tsv +++ b/nemo_text_processing/text_normalization/hi/data/money/currency.tsv @@ -1,5 +1,4 @@ ₹ रुपए -P पैसे £ पाउंड ₩ वॉन $ डॉलर @@ -7,4 +6,4 @@ $ डॉलर ৳ टका ¥ येन ₦ नाइरा -€ यूरो +€ यूरो \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv b/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv new file mode 100644 index 000000000..cf62891d1 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv @@ -0,0 +1,9 @@ +रुपए पैसे +पाउंड पेंस +वॉन जिओन +डॉलर सेंट +लीरा कुरस +टका पैसे +येन सेन +नाइरा कोबो +यूरो सेंट diff --git a/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv b/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv index 1d61c77b7..fbf248266 100644 --- a/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv +++ b/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv @@ -79,12 +79,12 @@ ८८ अट्ठासी ८९ नवासी ९० नब्बे -९१ इक्यानबे -९२ बानबे -९३ तिरानबे -९४ चौरानबे -९५ पंचानबे -९६ छियानबे -९७ सत्तानबे -९८ अट्ठानबे +९१ इक्यानबे +९२ बानबे +९३ तिरानबे +९४ चौरानबे +९५ पंचानबे +९६ छियानबे +९७ सत्तानबे +९८ अट्ठानबे ९९ निन्यानबे diff --git a/nemo_text_processing/text_normalization/hi/data/time/hours.tsv b/nemo_text_processing/text_normalization/hi/data/time/hours.tsv index d5e85a784..dd8623284 100644 --- a/nemo_text_processing/text_normalization/hi/data/time/hours.tsv +++ b/nemo_text_processing/text_normalization/hi/data/time/hours.tsv @@ -1,3 +1,4 @@ +० शून्य १ एक २ दो ३ तीन diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py index fe3ad9a1d..05d7a4ee4 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -80,6 +80,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 1, teens_ties) graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 0, graph_hundreds) graph_ten_thousands.optimize() + self.graph_ten_thousands = graph_ten_thousands # Lakhs graph and ten lakhs graph suffix_lakhs = pynutil.insert(" लाख") @@ -90,6 +91,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 1, graph_thousands) graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 0, graph_ten_thousands) graph_lakhs.optimize() + self.graph_lakhs = graph_lakhs graph_ten_lakhs = create_graph_suffix(teens_and_ties, suffix_lakhs, 5) graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 4, digit) @@ -98,6 +100,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 1, graph_thousands) graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 0, graph_ten_thousands) graph_ten_lakhs.optimize() + self.graph_ten_lakhs = graph_ten_lakhs # Crores graph ten crores graph suffix_crores = pynutil.insert(" करोड़") diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 19aaf3139..468753e23 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -26,6 +26,20 @@ days = pynini.string_file(get_abs_path("data/date/days.tsv")) months = pynini.string_file(get_abs_path("data/date/months.tsv")) +year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv")) +digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) +teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) +teens_and_ties = pynutil.add_weight(teens_ties, -0.1) + +# Read suffixes from file into a list +with open(get_abs_path("data/date/suffixes.tsv"), "r", encoding="utf-8") as f: + suffixes_list = f.read().splitlines() +with open(get_abs_path("data/date/prefixes.tsv"), "r", encoding="utf-8") as f: + prefixes_list = f.read().splitlines() + +# Create union of suffixes and prefixes +suffix_union = pynini.union(*suffixes_list) +prefix_union = pynini.union(*prefixes_list) class DateFst(GraphFst): @@ -51,10 +65,15 @@ def __init__(self, cardinal: GraphFst): (NEMO_HI_DIGIT + NEMO_HI_NON_ZERO + NEMO_HI_DIGIT + NEMO_HI_DIGIT), cardinal.graph_hundreds_as_thousand ) + cardinal_graph = ( + digit | teens_and_ties | cardinal.graph_hundreds | graph_year_thousands | graph_year_hundreds_as_thousands + ) + graph_year = graph_year_thousands | graph_year_hundreds_as_thousands delete_dash = pynutil.delete("-") delete_slash = pynutil.delete("/") + delete_comma = pynutil.delete(",") days_graph = pynutil.insert("day: \"") + days + pynutil.insert("\"") + insert_space @@ -68,6 +87,22 @@ def __init__(self, cardinal: GraphFst): graph_mm_dd += pynutil.insert(" preserve_order: true ") + # Graph for era + era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space + + range_graph = pynini.cross("-", "से") + + # Graph for year + century_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.accep("वीं") + century_text = pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space + + # Updated logic to use suffix_union + year_number = graph_year + suffix_union + year_text = pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space + + # Updated logic to use prefix_union + year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"") + graph_dd_mm_yyyy = ( days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph ) @@ -78,7 +113,20 @@ def __init__(self, cardinal: GraphFst): graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") - graph_mm_yyyy = months_graph + delete_dash + years_graph + graph_mm_yyyy = months_graph + delete_dash + insert_space + years_graph + + graph_year_suffix = era_graph + + graph_range = ( + pynutil.insert("era: \"") + + cardinal_graph + + insert_space + + range_graph + + insert_space + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" preserve_order: true ") + ) # default assume dd_mm_yyyy @@ -87,7 +135,12 @@ def __init__(self, cardinal: GraphFst): | graph_mm_dd | pynutil.add_weight(graph_dd_mm_yyyy, -0.001) | graph_mm_dd_yyyy - | graph_mm_yyyy + | pynutil.add_weight(graph_mm_yyyy, -0.2) + | pynutil.add_weight(graph_year_suffix, -0.001) + | pynutil.add_weight(graph_range, -0.005) + | pynutil.add_weight(century_text, -0.001) + | pynutil.add_weight(year_text, -0.001) + | pynutil.add_weight(year_prefix, -0.009) ) self.final_graph = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 7434fd70f..954215771 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -19,6 +19,11 @@ from nemo_text_processing.text_normalization.hi.utils import get_abs_path +digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) +teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) +teens_and_ties = pynutil.add_weight(teens_ties, -0.1) + + class MeasureFst(GraphFst): """ Finite state transducer for classifying measure, suppletive aware, e.g. @@ -35,26 +40,55 @@ class MeasureFst(GraphFst): def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="measure", kind="classify") - cardinal_graph = cardinal.final_graph - decimal_graph = decimal.final_graph_wo_negative + cardinal_graph = ( + digit + | teens_and_ties + | cardinal.graph_hundreds + | cardinal.graph_thousands + | cardinal.graph_ten_thousands + | cardinal.graph_lakhs + | cardinal.graph_ten_lakhs + ) + point = pynutil.delete(".") + decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) + quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv")) optional_graph_negative = pynini.closure( pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, ) + # Define the quarterly measurements + quarter = pynini.string_map([(".५", "साढ़े"), ("१.५", "डेढ़"), ("२.५", "ढाई"),]) + quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\"") + # Define the unit handling - self.unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ") + unit = pynutil.insert(" units: \"") + unit_graph + pynutil.insert("\" ") + units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ") - graph_measurements = ( + # Handling symbols like x, X, * + symbol_graph = pynini.string_map([("x", "बाई"), ("X", "बाई"), ("*", "बाई"),]) + + graph_decimal = ( pynutil.insert("decimal { ") + optional_graph_negative + decimal_graph + pynutil.insert(" }") + delete_space - + self.unit + + unit ) - graph_measurements |= ( + + graph_quarter = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + quarter_graph + + pynutil.insert(" }") + + delete_space + + units + ) + + graph_cardinal = ( pynutil.insert("cardinal { ") + optional_graph_negative + pynutil.insert("integer: \"") @@ -62,10 +96,35 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + pynutil.insert("\"") + pynutil.insert(" }") + delete_space - + self.unit + + unit ) - graph = graph_measurements + # Handling cardinal clubbed with symbol as single token + graph_exceptions = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" }") + + pynutil.insert(" units: \"") + + symbol_graph + + pynutil.insert("\" ") + + pynutil.insert("} }") + + insert_space + + pynutil.insert("tokens { cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("\"") + ) + + graph = ( + pynutil.add_weight(graph_decimal, 0.01) + | pynutil.add_weight(graph_quarter, 0.005) + | pynutil.add_weight(graph_cardinal, 0.01) + | pynutil.add_weight(graph_exceptions, 0.01) + ) self.graph = graph.optimize() final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/text_normalization/hi/taggers/money.py b/nemo_text_processing/text_normalization/hi/taggers/money.py index c44d6d346..6d9ac6dcc 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/money.py +++ b/nemo_text_processing/text_normalization/hi/taggers/money.py @@ -24,9 +24,11 @@ class MoneyFst(GraphFst): """ Finite state transducer for classifying money, suppletive aware, e.g. - ₹1 -> money { currency: "रुपए" integer_part: "एक" } - ₹1.2 -> money { currency: "रुपए" integer_part: "एक" fractional_part: "दो" } - + ₹५० -> money { money { currency_maj: "रुपए" integer_part: "पचास" } + ₹५०.५० -> money { currency_maj: "रुपए" integer_part: "पचास" fractional_part: "पचास" currency_min: "centiles" } + ₹०.५० -> money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" } + Note that the 'centiles' string is a placeholder to handle by the verbalizer by applying the corresponding minor currency denomination + Args: cardinal: CardinalFst decimal: DecimalFst @@ -34,7 +36,7 @@ class MoneyFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst): + def __init__(self, cardinal: GraphFst): super().__init__(name="money", kind="classify") cardinal_graph = cardinal.final_graph @@ -42,21 +44,25 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): optional_graph_negative = pynini.closure( pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, ) - self.currency = pynutil.insert("currency: \"") + currency_graph + pynutil.insert("\" ") - self.interger = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\" ") - self.fraction = pynutil.insert("fractional_part: \"") + cardinal_graph + pynutil.insert("\" ") + currency_major = pynutil.insert('currency_maj: "') + currency_graph + pynutil.insert('"') + integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') + fraction = pynutil.insert('fractional_part: "') + cardinal_graph + pynutil.insert('"') + currency_minor = pynutil.insert('currency_min: "') + pynutil.insert("centiles") + pynutil.insert('"') - graph_currencies = optional_graph_negative + self.currency + insert_space + self.interger - graph_currencies |= ( + graph_major_only = optional_graph_negative + currency_major + insert_space + integer + graph_major_and_minor = ( optional_graph_negative - + self.currency + + currency_major + insert_space - + self.interger - + pynutil.delete(".") + + integer + + pynini.cross(".", " ") + + fraction + insert_space - + self.fraction + + currency_minor ) - graph = graph_currencies - self.graph = graph.optimize() + + graph_currencies = graph_major_only | graph_major_and_minor + + graph = graph_currencies.optimize() final_graph = self.add_tokens(graph) self.fst = final_graph diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index 48ee97ef3..bdec90c06 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -43,7 +43,7 @@ class ClassifyFst(GraphFst): Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. deterministic: if True will provide a single transduction option, @@ -68,11 +68,11 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far" + cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] - logging.info(f'ClassifyFst.fst was restored from {far_file}.') + logging.info(f"ClassifyFst.fst was restored from {far_file}.") else: logging.info(f"Creating ClassifyFst grammars.") @@ -107,7 +107,7 @@ def __init__( logging.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes") start_time = time.time() - money = MoneyFst(cardinal=cardinal, decimal=decimal) + money = MoneyFst(cardinal=cardinal) money_graph = money.fst logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes") diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/date.py b/nemo_text_processing/text_normalization/hi/verbalizers/date.py index 1265fcec6..187acf7d6 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/date.py @@ -39,6 +39,8 @@ def __init__(self): year = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + graph_era = pynutil.delete("era: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + graph_dd_mm = day + NEMO_SPACE + month graph_mm_dd = month + NEMO_SPACE + day @@ -60,7 +62,7 @@ def __init__(self): ) self.graph = ( - (graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy) + (graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy | graph_era) + delete_space + optional_preserve_order ) diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py index e4cfae302..cba534e61 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -39,10 +39,15 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): numerator = pynutil.delete("numerator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" ") denominator = pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") insert_bata = pynutil.insert(" बटा ") + insert_aur = pynutil.insert(" और ") fraction_default = numerator + insert_bata + denominator - self.graph = optional_sign + pynini.closure(pynini.closure(integer, 0, 1) + insert_space) + fraction_default + self.graph = ( + optional_sign + + pynini.closure(pynini.closure(integer, 0, 1) + insert_space + insert_aur) + + fraction_default + ) graph = self.graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/money.py b/nemo_text_processing/text_normalization/hi/verbalizers/money.py index d5cab33d8..048140295 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/money.py @@ -15,14 +15,26 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space +major_minor_currencies = { + "रुपए": "पैसे", + "पाउंड": "पेंस", + "वॉन": "जिओन", + "डॉलर": "सेंट", + "लीरा": "कुरस", + "टका": "पैसे", + "येन": "सेन", + "नाइरा": "कोबो", + "यूरो": "सेंट", +} +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst class MoneyFst(GraphFst): """ Finite state transducer for verbalizing money, e.g. - money { integer_part: "बारह" currency: "रुपए" } -> बारह रुपए - money { integer_part: "बारह" currency: "रुपए" fractional_part: "पचास" currency: "पैसे" } -> बारह रुपए पचास पैसे + money { integer_part: "बारह" currency_maj: "रुपए" } -> बारह रुपए + money { integer_part: "बारह" currency_maj: "रुपए" fractional_part: "पचास" currency_min: "centiles" } -> बारह रुपए पचास पैसे + money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" } -> पचास पैसे Args: cardinal: CardinalFst @@ -31,33 +43,58 @@ class MoneyFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst): + def __init__(self): super().__init__(name="money", kind="verbalize") - insert_paise = pynutil.insert("पैसे") + currency_major = pynutil.delete('currency_maj: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') - currency = ( - pynutil.delete('currency: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('" ') + insert_space - ) - - integer_part = ( - pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('" ') + insert_space - ) + integer_part = pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') fractional_part = ( - pynutil.delete('fractional_part: "') - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete('" ') - + insert_space + pynutil.delete('fractional_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') ) - graph_integer = integer_part + delete_space + currency + # Handles major denominations only + graph_major_only = integer_part + pynini.accep(NEMO_SPACE) + currency_major - graph_interger_fraction = ( - integer_part + delete_space + currency + delete_space + fractional_part + delete_space + insert_paise - ) + # Handles both major and minor denominations + major_minor_graphs = [] + + # Handles minor denominations only + minor_graphs = [] + + # Logic for handling minor denominations + for major, minor in major_minor_currencies.items(): + graph_major = pynutil.delete('currency_maj: "') + pynini.accep(major) + pynutil.delete('"') + graph_minor = pynutil.delete('currency_min: "') + pynini.cross("centiles", minor) + pynutil.delete('"') + graph_major_minor_partial = ( + integer_part + + pynini.accep(NEMO_SPACE) + + graph_major + + pynini.accep(NEMO_SPACE) + + fractional_part + + pynini.accep(NEMO_SPACE) + + graph_minor + ) + major_minor_graphs.append(graph_major_minor_partial) + + graph_minor_partial = ( + pynutil.delete('integer_part: "शून्य"') + + pynutil.delete(NEMO_SPACE) + + pynutil.delete('currency_maj: "') + + pynutil.delete(major) + + pynutil.delete('"') + + pynutil.delete(NEMO_SPACE) + + fractional_part + + pynini.accep(NEMO_SPACE) + + graph_minor + ) + minor_graphs.append(graph_minor_partial) + + graph_major_minor = pynini.union(*major_minor_graphs) + graph_minor_only = pynini.union(*minor_graphs) - graph = graph_integer | graph_interger_fraction + graph = graph_major_only | graph_major_minor | pynutil.add_weight(graph_minor_only, -0.1) delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py index ca06fc9c3..e91f0d9f6 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py @@ -20,8 +20,7 @@ from nemo_text_processing.text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.text_normalization.hi.verbalizers.time import TimeFst - -# from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst class VerbalizeFst(GraphFst): @@ -56,11 +55,20 @@ def __init__(self, deterministic: bool = True): measure = MeasureFst(cardinal=cardinal, decimal=decimal) measure_graph = measure.fst - money = MoneyFst(cardinal=cardinal, decimal=decimal) + money = MoneyFst() money_graph = money.fst - # whitelist_graph = WhiteListFst(deterministic=deterministic).fst - - graph = cardinal_graph | decimal_graph | fraction_graph | date_graph | time_graph | measure_graph | money_graph + whitelist_graph = WhiteListFst(deterministic=deterministic).fst + + graph = ( + cardinal_graph + | decimal_graph + | fraction_graph + | date_graph + | time_graph + | measure_graph + | money_graph + | whitelist_graph + ) self.fst = graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py index 3f478a2d2..ed419f2f7 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + + import pynini from pynini.lib import pynutil diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index d92a53852..a4b3caf07 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -17,3 +17,18 @@ ११-२०२४~नवंबर दो हज़ार चौबीस २०७०~दो हज़ार सत्तर २०२४~दो हज़ार चौबीस +१२० ई. पू.~एक सौ बीस ईसा पूर्व +२९७-२७२ ई. पू.~दो सौ सत्तानबे से दो सौ बहत्तर ईसा पूर्व +३२७वीं सदी~तीन सौ सत्ताईसवीं सदी +१८वीं शताब्दी~अठारहवीं शताब्दी +१९वीं दशक~उन्नीसवीं दशक +१९९९ में~उन्नीस सौ निन्यानबे में +१९९० का~उन्नीस सौ नब्बे का +१९९२ की~उन्नीस सौ बानबे की +१९६० के अभिनेता है~उन्नीस सौ साठ के अभिनेता है +१७८८ से~सत्रह सौ अट्ठासी से +१९५४ तक~उन्नीस सौ चौवन तक +सन १९९९~सन उन्नीस सौ निन्यानबे +सन् १९२०~सन् उन्नीस सौ बीस +साल १९७१~साल उन्नीस सौ इकहत्तर +१९२०-२६ तक~उन्नीस सौ बीस से छब्बीस तक \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt index 25c18b777..d1473412e 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt @@ -1,5 +1,5 @@ ९९/९९~निन्यानबे बटा निन्यानबे -२२ ३१/१७~बाईस इकतीस बटा सत्रह +२२ ३१/१७~बाईस और इकतीस बटा सत्रह ९७/०~सत्तानबे बटा शून्य २५६३/४१२~दो हज़ार पाँच सौ तिरेसठ बटा चार सौ बारह ७२८६०/७०~बहत्तर हज़ार आठ सौ साठ बटा सत्तर @@ -19,3 +19,5 @@ १०००००००००००००/३~एक नील बटा तीन १०००००००००००००००/८~एक पद्म बटा आठ १०००००००००००००००००/४१२~एक शंख बटा चार सौ बारह +२ २/७~दो और दो बटा सात +१२० ७५/९०~एक सौ बीस और पचहत्तर बटा नब्बे \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt index 453369f82..86a824f72 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt @@ -60,3 +60,7 @@ ९९.५ oz~निन्यानबे दशमलव पाँच आउन्स ८५ q~पचासी क्विंटल ८५.९९ q~पचासी दशमलव नौ नौ क्विंटल +२००x१० के गद्दे~दो सौ बाई दस के गद्दे +५x५ का सोफ़ा~पाँच बाई पाँच का सोफ़ा +२x२ रुबिक्स क्यूब~दो बाई दो रुबिक्स क्यूब +१३x१३ का घर~तेरह बाई तेरह का घर diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt index c7b32628b..b576dac38 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt @@ -97,4 +97,22 @@ $२८२१~दो हज़ार आठ सौ इक्कीस डॉल ₹५४५~पाँच सौ पैंतालीस रुपए ₹१८४५~एक हज़ार आठ सौ पैंतालीस रुपए ₹३७२~तीन सौ बहत्तर रुपए -$९८~अट्ठानबे डॉलर \ No newline at end of file +$९८~अट्ठानबे डॉलर +₹१२३.५७~एक सौ तेईस रुपए सत्तावन पैसे +₹९९९.५०~नौ सौ निन्यानबे रुपए पचास पैसे +£१५०.२९~एक सौ पचास पाउंड उनतीस पेंस +£८०.३१~अस्सी पाउंड इकतीस पेंस +₩२३४५.१०~दो हज़ार तीन सौ पैंतालीस वॉन दस जिओन +₩१००.२५~एक सौ वॉन पच्चीस जिओन +$१२५.७०~एक सौ पच्चीस डॉलर सत्तर सेंट +$९.९९~नौ डॉलर निन्यानबे सेंट +₺८०.३६~अस्सी लीरा छत्तीस कुरस +₺१२३४.७८~एक हज़ार दो सौ चौंतीस लीरा अठहत्तर कुरस +৳१००.४२~एक सौ टका बयालीस पैसे +৳३०२५.८७~तीन हज़ार पच्चीस टका सत्तासी पैसे +¥१००.४८~एक सौ येन अड़तालीस सेन +¥७७७.२३~सात सौ सतहत्तर येन तेईस सेन +₦८७६.५३~आठ सौ छिहत्तर नाइरा तिरेपन कोबो +₦१०.२७~दस नाइरा सत्ताईस कोबो +€२००.९०~दो सौ यूरो नब्बे सेंट +€१२३४.७५~एक हज़ार दो सौ चौंतीस यूरो पचहत्तर सेंट