diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e8728287d..698a94e89 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ ci: repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 + rev: v5.0.0 hooks: - id: check-yaml - id: check-case-conflict @@ -37,7 +37,7 @@ repos: - --select=W605 - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort name: Format imports @@ -45,7 +45,7 @@ repos: exclude: docs/ - repo: https://github.com/psf/black - rev: 19.10b0 + rev: 24.10.0 hooks: - id: black name: Format code diff --git a/nemo_text_processing/hybrid/utils.py b/nemo_text_processing/hybrid/utils.py index d634f5a09..82c96aa6f 100644 --- a/nemo_text_processing/hybrid/utils.py +++ b/nemo_text_processing/hybrid/utils.py @@ -515,7 +515,11 @@ def _relax_diff(text): return acceptable -def get_labels(targets: List[str], norm_texts_weights: List[Tuple[str, str]], lang="en",) -> List[List[str]]: +def get_labels( + targets: List[str], + norm_texts_weights: List[Tuple[str, str]], + lang="en", +) -> List[List[str]]: """ Assign labels to generated normalization options (1 - for ground truth, 0 - other options) Args: @@ -605,7 +609,14 @@ def print_df(df): prints data frame """ with pd.option_context( - "display.max_rows", None, "display.max_columns", None, "display.width", 1000, "display.max_colwidth", 400, + "display.max_rows", + None, + "display.max_columns", + None, + "display.width", + 1000, + "display.max_colwidth", + 400, ): print(df) diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py index 47febc4ac..2c58df6a9 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py @@ -33,7 +33,9 @@ def __init__(self, tn_cardinal): self.graph = pynini.invert(tn_cardinal.cardinal_numbers).optimize() optional_minus_graph = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("سالب", '"-"') + NEMO_SPACE, 0, 1, + pynutil.insert("negative: ") + pynini.cross("سالب", '"-"') + NEMO_SPACE, + 0, + 1, ) final_graph = optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"') diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py index f0d641d14..3b22ece05 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py @@ -36,7 +36,9 @@ def __init__(self, tn_decimal): super().__init__(name="decimal", kind="classify") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("سالب", '"true"') + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("سالب", '"true"') + delete_extra_space, + 0, + 1, ) graph_fractional_part = pynini.invert(tn_decimal.graph_fractional).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py index fa5df3367..5eea89af1 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py @@ -207,7 +207,10 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): + graph_in_thousands ) - graph = pynini.union((graph_int | graph_ind) + delete_space + graph_hundreds, graph_zero,) + graph = pynini.union( + (graph_int | graph_ind) + delete_space + graph_hundreds, + graph_zero, + ) graph = graph @ pynini.union( pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0" diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py index 5be9240d7..b1ace40ce 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py @@ -165,7 +165,11 @@ def __init__(self, ordinal: GraphFst, input_case: str): + pynutil.add_weight(year_graph, -YEAR_WEIGHT) + pynutil.insert("\"") ) - optional_graph_year = pynini.closure(graph_year, 0, 1,) + optional_graph_year = pynini.closure( + graph_year, + 0, + 1, + ) graph_mdy = month_graph + ( (delete_extra_space + day_graph) | graph_year | (delete_extra_space + day_graph + graph_year) ) diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py index 1d730ec30..6e5de2418 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py @@ -97,7 +97,9 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): point = pynutil.delete("point") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, + 0, + 1, ) graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py index a2373d9d7..0a41b4702 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py @@ -106,7 +106,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): delete_extra_space + url_symbols + delete_extra_space - + (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username) + + ( + domain + | pynini.closure( + accepted_username + delete_extra_space, + ) + + accepted_username + ) ) protocol_default = ( diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py index 2d9d5e02c..69eeaa56e 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py @@ -58,7 +58,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU graph_unit_plural = pynini.compose(casing_graph, graph_unit_plural).optimize() optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, + 0, + 1, ) unit_singular = convert_space(graph_unit_singular) diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/money.py b/nemo_text_processing/inverse_text_normalization/en/taggers/money.py index 2a1e32a49..2c5d5ad78 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/money.py @@ -53,7 +53,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU # "one fifty" -> "one hundred fifty" with_hundred = pynini.compose( pynini.closure(NEMO_NOT_SPACE) + pynini.accep(" ") + pynutil.insert("hundred ") + NEMO_SIGMA, - pynini.compose(cardinal_graph, NEMO_DIGIT ** 3), + pynini.compose(cardinal_graph, NEMO_DIGIT**3), ) cardinal_graph |= with_hundred graph_decimal_final = decimal.final_graph_wo_negative diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py index 06d749e39..9a106ca78 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py @@ -40,7 +40,7 @@ def get_serial_number(cardinal): """ digit = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT) - two_digit = pynutil.add_weight(pynini.compose(cardinal.graph_two_digit, NEMO_DIGIT ** 2), 0.002) + two_digit = pynutil.add_weight(pynini.compose(cardinal.graph_two_digit, NEMO_DIGIT**2), 0.002) character = digit | two_digit | NEMO_ALPHA sequence = (NEMO_LOWER_NOT_A | digit) + pynini.closure(pynutil.delete(" ") + character, 2) sequence |= character + pynini.closure(pynutil.delete(" ") + (digit | NEMO_ALPHA), 2) @@ -116,7 +116,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): triple_digit.invert() # to handle cases like "one twenty three" - two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT ** 2) + two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT**2) double_digit_to_digit = ( pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit) | two_digit_cardinal ) @@ -139,7 +139,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): number_part = pynini.compose( single_double_or_triple_digit, - NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 4, + NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**4, ).optimize() number_part = pynutil.insert("number_part: \"") + number_part.optimize() + pynutil.insert("\"") @@ -156,16 +156,16 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): graph = optional_country_code + number_part # credit card number - space_four_digits = insert_space + NEMO_DIGIT ** 4 + space_four_digits = insert_space + NEMO_DIGIT**4 space_five_digits = space_four_digits + NEMO_DIGIT space_six_digits = space_five_digits + NEMO_DIGIT credit_card_graph = pynini.compose( single_double_or_triple_digit, - NEMO_DIGIT ** 4 + (space_six_digits | (space_four_digits ** 2)) + space_four_digits, + NEMO_DIGIT**4 + (space_six_digits | (space_four_digits**2)) + space_four_digits, ).optimize() credit_card_graph |= pynini.compose( - single_double_or_triple_digit, NEMO_DIGIT ** 4 + space_six_digits + space_five_digits + single_double_or_triple_digit, NEMO_DIGIT**4 + space_six_digits + space_five_digits ).optimize() graph |= pynutil.insert("number_part: \"") + credit_card_graph.optimize() + pynutil.insert("\"") @@ -173,7 +173,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): # SSN ssn_graph = pynini.compose( single_double_or_triple_digit, - NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 2 + pynutil.insert("-") + NEMO_DIGIT ** 4, + NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**2 + pynutil.insert("-") + NEMO_DIGIT**4, ).optimize() graph |= pynutil.insert("number_part: \"") + ssn_graph.optimize() + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/time.py b/nemo_text_processing/inverse_text_normalization/en/taggers/time.py index 53d3dd931..46dc71bc8 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/time.py @@ -71,14 +71,32 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): graph_minute_double = pynini.union(*labels_minute_double) @ cardinal graph_minute_verbose = pynini.cross("half", "30") | pynini.cross("quarter", "15") - oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock", "hundred hours",), "",) + oclock = pynini.cross( + pynini.union( + "o' clock", + "o clock", + "o'clock", + "oclock", + "hundred hours", + ), + "", + ) if input_case == INPUT_CASED: minute_to_graph = capitalized_input_graph(minute_to_graph) graph_minute_single = capitalized_input_graph(graph_minute_single) graph_minute_double = capitalized_input_graph(graph_minute_double) graph_minute_verbose |= pynini.cross("Half", "30") | pynini.cross("Quarter", "15") - oclock |= pynini.cross(pynini.union("O' clock", "O clock", "O'clock", "Oclock", "Hundred hours",), "",) + oclock |= pynini.cross( + pynini.union( + "O' clock", + "O clock", + "O'clock", + "Oclock", + "Hundred hours", + ), + "", + ) final_graph_hour = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"") graph_minute = ( diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py index 3e164bcc9..d3082509a 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py @@ -160,18 +160,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): self.graph_no_exception = graph.optimize() # save self.numbers_up_to_thousand for use in DecimalFst - digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3) + digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) numbers_up_to_thousand = pynini.compose(self.graph_no_exception, digits_up_to_thousand).optimize() self.numbers_up_to_thousand = numbers_up_to_thousand.optimize() # save self.numbers_up_to_million for use in DecimalFst digits_up_to_million = ( - NEMO_DIGIT - | (NEMO_DIGIT ** 2) - | (NEMO_DIGIT ** 3) - | (NEMO_DIGIT ** 4) - | (NEMO_DIGIT ** 5) - | (NEMO_DIGIT ** 6) + NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6) ) numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize() self.numbers_up_to_million = numbers_up_to_million.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py index 50a5e07f7..a7d767119 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py @@ -136,7 +136,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): delete_extra_space + symbols + delete_extra_space - + (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username) + + ( + domain + | pynini.closure( + accepted_username + delete_extra_space, + ) + + accepted_username + ) ) protocol_default = ( diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py index d97cc752a..7cdcfacc7 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py @@ -62,7 +62,13 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): full_graph_ties = graph_ties | (graph_ties + pynini.cross(" ", "y") + graph_digit) - ordinal_graph_union = pynini.union(graph_digit, graph_teens, graph_twenties, full_graph_ties, graph_hundreds,) + ordinal_graph_union = pynini.union( + graph_digit, + graph_teens, + graph_twenties, + full_graph_ties, + graph_hundreds, + ) accept_o_endings = NEMO_SIGMA + pynini.accep("o") accept_a_endings = NEMO_SIGMA + pynini.accep("a") diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py index 2086d643c..8c73ca434 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py @@ -110,7 +110,10 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): # Denormalized phone numbers are grouped in sets of 3 or 4 digits group_of_two = pynini.union(doubled_digit, digit_twice, double_digits) - group_of_three = pynini.union(tripled_digit, single_digits + pynutil.delete(" ") + group_of_two,) + group_of_three = pynini.union( + tripled_digit, + single_digits + pynutil.delete(" ") + group_of_two, + ) group_of_four = pynini.union( group_of_two + pynutil.delete(" ") + group_of_two, diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py index d827a63e2..ea1fcf8ea 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py @@ -248,18 +248,13 @@ def __init__(self): self.graph_no_exception = graph.optimize() # save self.numbers_up_to_thousand for use in DecimalFst - digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3) + digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize() self.numbers_up_to_thousand = numbers_up_to_thousand # save self.numbers_up_to_million for use in DecimalFst digits_up_to_million = ( - NEMO_DIGIT - | (NEMO_DIGIT ** 2) - | (NEMO_DIGIT ** 3) - | (NEMO_DIGIT ** 4) - | (NEMO_DIGIT ** 5) - | (NEMO_DIGIT ** 6) + NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6) ) numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize() self.numbers_up_to_million = numbers_up_to_million diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py index 06807f6a3..68d35741c 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py @@ -46,7 +46,9 @@ def __init__(self, cardinal: GraphFst): day_graph = self.cardinal | pynini.cross("premier", "1") # Premier is only ordinal used for dates day_graph = pynutil.insert("day: \"") + day_graph + pynutil.insert("\"") optional_graph_year = pynini.closure( - delete_extra_space + pynutil.insert("year: \"") + year_graph + pynutil.insert("\""), 0, 1, + delete_extra_space + pynutil.insert("year: \"") + year_graph + pynutil.insert("\""), + 0, + 1, ) graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py index ce0bdf8c4..3e654b859 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py @@ -46,7 +46,7 @@ def __init__(self): super().__init__(name="decimal", kind="verbalize") # Need parser to group digits by threes - exactly_three_digits = NEMO_DIGIT ** 3 + exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) space_every_three_integer = ( diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py index 77dd6323f..3179af643 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py @@ -61,12 +61,12 @@ def __init__(self): graph_roman_hundreds = pynini.string_file(get_abs_path("data/roman/hundreds_large.tsv")).invert() graph_roman_zero_digit = pynutil.delete("0") - graph_roman_hundreds = NEMO_DIGIT ** 3 @ ( + graph_roman_hundreds = NEMO_DIGIT**3 @ ( graph_roman_hundreds + pynini.union(graph_roman_ties, graph_roman_zero_digit) + pynini.union(graph_roman_digits, graph_roman_zero_digit) ) - graph_roman_ties = NEMO_DIGIT ** 2 @ ( + graph_roman_ties = NEMO_DIGIT**2 @ ( graph_roman_ties + pynini.union(graph_roman_digits, graph_roman_zero_digit) ) graph_roman_digits = NEMO_DIGIT @ graph_roman_digits diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py index f1e4da381..63b055bef 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py @@ -79,12 +79,14 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): # This only covers "standard format". # Conventional format like thousand crores/lakh crores is yet to be implemented graph_in_thousands = pynini.union( - self.graph_two_digit + delete_space + delete_thousand, pynutil.insert("००", weight=0.1), + self.graph_two_digit + delete_space + delete_thousand, + pynutil.insert("००", weight=0.1), ) self.graph_thousands = graph_in_thousands graph_in_lakhs = pynini.union( - self.graph_two_digit + delete_space + pynutil.delete("लाख"), pynutil.insert("००", weight=0.1), + self.graph_two_digit + delete_space + pynutil.delete("लाख"), + pynutil.insert("००", weight=0.1), ) graph_in_crores = pynini.union( @@ -93,23 +95,28 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): ) graph_in_arabs = pynini.union( - self.graph_two_digit + delete_space + pynutil.delete("अरब"), pynutil.insert("००", weight=0.1), + self.graph_two_digit + delete_space + pynutil.delete("अरब"), + pynutil.insert("००", weight=0.1), ) graph_in_kharabs = pynini.union( - self.graph_two_digit + delete_space + pynutil.delete("खरब"), pynutil.insert("००", weight=0.1), + self.graph_two_digit + delete_space + pynutil.delete("खरब"), + pynutil.insert("००", weight=0.1), ) graph_in_nils = pynini.union( - self.graph_two_digit + delete_space + pynutil.delete("नील"), pynutil.insert("००", weight=0.1), + self.graph_two_digit + delete_space + pynutil.delete("नील"), + pynutil.insert("००", weight=0.1), ) graph_in_padmas = pynini.union( - self.graph_two_digit + delete_space + pynutil.delete("पद्म"), pynutil.insert("००", weight=0.1), + self.graph_two_digit + delete_space + pynutil.delete("पद्म"), + pynutil.insert("००", weight=0.1), ) graph_in_shankhs = pynini.union( - self.graph_two_digit + delete_space + pynutil.delete("शंख"), pynutil.insert("००", weight=0.1), + self.graph_two_digit + delete_space + pynutil.delete("शंख"), + pynutil.insert("००", weight=0.1), ) graph_ind = ( diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py index 61183ae72..0d980a712 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py @@ -27,9 +27,9 @@ class DateFst(GraphFst): """ - Finite state transducer for classifying date, + Finite state transducer for classifying date, e.g. पांच जनवरी दो हज़ार बारह -> date { month: "जनवरी" day: "५" year: "२०१२" preserve_order: true } - e.g. दो हज़ार बारह -> date { year: "२०१२" preserve_order: true } + e.g. दो हज़ार बारह -> date { year: "२०१२" preserve_order: true } Args: cardinal: CardinalFst date: DateFst diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/decimal.py index 215c34e5c..ddbf32c9b 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/decimal.py @@ -40,8 +40,8 @@ def get_quantity( Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, e.g. दस लाख -> integer_part: "१॰" quantity: "लाख" e.g. एक दशमलव पाँच लाख -> integer_part: "१" fractional_part: "५" quantity: "लाख" - - Args: + + Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST input_case: accepting either "lower_cased" or "cased" input. @@ -70,17 +70,17 @@ class DecimalFst(GraphFst): Decimal point "." is determined by "दशमलव" e.g. ऋण एक दशमलव दो छह -> decimal { negative: "true" integer_part: "१" morphosyntactic_features: "." fractional_part: "२६" } - + This decimal rule assumes that decimals can be pronounced as: (a cardinal) + ('दशमलव') plus (any sequence of cardinals <१०००, including 'शून्य') - - Also writes large numbers in shortened form, e.g. + + Also writes large numbers in shortened form, e.g. e.g. एक दशमलव दो छह लाख -> decimal { negative: "false" integer_part: "१" morphosyntactic_features: "." fractional_part: "२६" quantity: "लाख" } e.g. दो लाख -> decimal { negative: "false" integer_part: "२" quantity: "लाख" } e.g. एक अरब आठ सौ चौबीस लाख -> decimal { negative: "false" integer_part: "१८२४" quantity: "लाख" } Args: cardinal: CardinalFst - + """ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): @@ -97,7 +97,9 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): point = pynutil.delete("दशमलव") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("ऋण", "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("ऋण", "\"true\"") + delete_extra_space, + 0, + 1, ) graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py index 56b2c63e3..1e44f59e8 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py @@ -40,7 +40,7 @@ class FractionFst(GraphFst): e.g. ऋण एक बटा छब्बीस -> fraction { negative: "true" numerator: "१" denominator: "२६" } e.g. छह सौ साठ बटा पाँच सौ तैंतालीस -> fraction { negative: "false" numerator: "६६०" denominator: "५४३" } - + The fractional rule assumes that fractions can be pronounced as: (a cardinal) + ('बटा') plus (a cardinal, excluding 'शून्य') Args: @@ -65,7 +65,9 @@ def __init__(self, cardinal: GraphFst): self.graph = graph.optimize() self.final_graph_wo_negative = graph optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("ऋण", "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("ऋण", "\"true\"") + delete_extra_space, + 0, + 1, ) graph = optional_graph_negative + graph final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py index d7e9ba562..15d8e4eb8 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py @@ -45,7 +45,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): decimal_graph = decimal.final_graph_wo_negative optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("ऋण", "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("ऋण", "\"true\"") + delete_extra_space, + 0, + 1, ) measurements_graph = pynini.string_file(get_abs_path("data/measure/measurements.tsv")).invert() diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py index ac539966d..6bfc51af7 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py @@ -21,9 +21,9 @@ class TimeFst(GraphFst): """ - Finite state transducer for classifying time, + Finite state transducer for classifying time, e.g. एक बजके सात मिनट -> time { hours: "१" minutes: "७" } - e.g. चार बजे चवालीस मिनट -> time { hours: "४" minutes: "४४" } + e.g. चार बजे चवालीस मिनट -> time { hours: "४" minutes: "४४" } Args: cardinal: CardinalFst time: TimeFst diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index a5a371d90..22d665b18 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -52,7 +52,11 @@ class ClassifyFst(GraphFst): """ def __init__( - self, cache_dir: str = None, overwrite_cache: bool = False, whitelist: str = None, input_case: str = None, + self, + cache_dir: str = None, + overwrite_cache: bool = False, + whitelist: str = None, + input_case: str = None, ): super().__init__(name="tokenize_and_classify", kind="classify") diff --git a/nemo_text_processing/inverse_text_normalization/hi/utils.py b/nemo_text_processing/inverse_text_normalization/hi/utils.py index 5e387b6e8..8e3f62c3c 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/utils.py +++ b/nemo_text_processing/inverse_text_normalization/hi/utils.py @@ -24,7 +24,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ abs_path = os.path.dirname(os.path.abspath(__file__)) + os.sep + rel_path @@ -52,10 +52,10 @@ def load_labels(abs_path): def apply_fst(text, fst): - """ Given a string input, returns the output string - produced by traversing the path with lowest weight. - If no valid path accepts input string, returns an - error. + """Given a string input, returns the output string + produced by traversing the path with lowest weight. + If no valid path accepts input string, returns an + error. """ try: print(pynini.shortestpath(text @ fst).string()) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize_final.py index d8d61f2f8..17dfebf64 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize_final.py @@ -23,7 +23,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "अब" } tokens { time { hours: "१२" minutes: "३०" } } tokens { name: "बज" } tokens { name: "गए" } tokens { name: "हैं" } -> अब १२:३० बज गए हैं """ diff --git a/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py index e912ff60b..b0d4e52cc 100644 --- a/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py @@ -37,7 +37,12 @@ def __init__(self): convert_one = pynini.cross("[BOS]1", "[BOS]1-ին") convert_rest = pynutil.insert("-րդ", weight=0.01) - suffix = pynini.cdrewrite(convert_rest | convert_one, "", "[EOS]", NEMO_SIGMA,) + suffix = pynini.cdrewrite( + convert_rest | convert_one, + "", + "[EOS]", + NEMO_SIGMA, + ) graph = graph @ suffix delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py index fa6bebd87..15d17f81d 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py @@ -39,7 +39,10 @@ def __init__(self): hundred = pynutil.delete("百") | pynutil.delete("ひゃく") | pynutil.delete("びゃく") | pynutil.delete("ぴゃく") hundred_alt = ( - pynini.cross("百", "1") | pynini.cross("ひゃく", "1") | pynini.cross("びゃく", "1") | pynini.cross("ぴゃく", "1") + pynini.cross("百", "1") + | pynini.cross("ひゃく", "1") + | pynini.cross("びゃく", "1") + | pynini.cross("ぴゃく", "1") ) graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0")) graph_hundred_component += pynini.union( diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py index 908b02d95..0ced0c679 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py @@ -36,7 +36,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): cardinal = cardinal.just_cardinals decimal = decimal.just_decimal - fraction_word = pynutil.delete("分の") | pynutil.delete(" 分 の ") | pynutil.delete("分 の ") | pynutil.delete("分 の") + fraction_word = ( + pynutil.delete("分の") | pynutil.delete(" 分 の ") | pynutil.delete("分 の ") | pynutil.delete("分 の") + ) integer_word = pynutil.delete("と") | pynutil.delete("荷") root_word = pynini.accep("√") | pynini.cross("ルート", "√") diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py index 8fca40fdd..26e053334 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py @@ -32,7 +32,9 @@ class PreProcessorFst(GraphFst): ''' def __init__( - self, remove_interjections: bool = True, fullwidth_to_halfwidth: bool = True, + self, + remove_interjections: bool = True, + fullwidth_to_halfwidth: bool = True, ): super().__init__(name="PreProcessor", kind="processor") diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py index 8477dfaa5..20ff3f34a 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py @@ -38,14 +38,18 @@ def __init__(self): minutes_seconds = pynini.string_file(get_abs_path("data/time_minutes_seconds.tsv")) hour_component = ( - pynutil.insert("hours: \"") + ((hours + pynutil.delete("時")) | pynini.accep("正午")) + pynutil.insert("\"") + pynutil.insert("hours: \"") + + ((hours + pynutil.delete("時")) | pynini.accep("正午")) + + pynutil.insert("\"") ) minute_component = ( pynutil.insert("minutes: \"") + ((minutes_seconds + pynutil.delete("分")) | pynini.accep("半")) + pynutil.insert("\"") ) - second_component = pynutil.insert("seconds: \"") + minutes_seconds + pynutil.delete("秒") + pynutil.insert("\"") + second_component = ( + pynutil.insert("seconds: \"") + minutes_seconds + pynutil.delete("秒") + pynutil.insert("\"") + ) graph_regular = ( pynini.closure(hour_component + insert_space + minute_component + insert_space + second_component) diff --git a/nemo_text_processing/inverse_text_normalization/ja/utils.py b/nemo_text_processing/inverse_text_normalization/ja/utils.py index 95555308b..fd3017d28 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/utils.py +++ b/nemo_text_processing/inverse_text_normalization/ja/utils.py @@ -17,19 +17,19 @@ def get_abs_path(rel_path): """ - Get absolute path + Get absolute path - Args: - rel_path: relative path to this file -<<<<<<< HEAD -<<<<<<< HEAD + Args: + rel_path: relative path to this file + <<<<<<< HEAD + <<<<<<< HEAD -======= - ->>>>>>> 0a4a21c (Jp itn 20240221 (#141)) -======= + ======= ->>>>>>> 59f46198ab4c8880c6a5fb88f3cbee9530156498 - Returns absolute path + >>>>>>> 0a4a21c (Jp itn 20240221 (#141)) + ======= + + >>>>>>> 59f46198ab4c8880c6a5fb88f3cbee9530156498 + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py index 60bdff8a1..62d41cb65 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py @@ -52,7 +52,7 @@ def __init__(self): + pynutil.delete("\"") ) - exactly_three_digits = NEMO_DIGIT ** 3 + exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py index 7bbc16516..103cfb7a8 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py @@ -36,7 +36,11 @@ class PostProcessor(GraphFst): ''' def __init__( - self, remove_puncts: bool = False, to_upper: bool = False, to_lower: bool = False, tag_oov: bool = False, + self, + remove_puncts: bool = False, + to_upper: bool = False, + to_lower: bool = False, + tag_oov: bool = False, ): super().__init__(name="PostProcessor", kind="processor") diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py index 798cd001d..8e95e14cf 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py @@ -40,12 +40,18 @@ def __init__(self): hours_component |= hours_component_alt minutes_component = ( - pynutil.delete("minutes: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("分") + pynutil.delete("\"") + pynutil.delete("minutes: \"") + + pynini.closure(NEMO_NOT_QUOTE) + + pynutil.insert("分") + + pynutil.delete("\"") ) minutes_component_alt = pynutil.delete("minutes: \"") + pynini.accep("半") + pynutil.delete("\"") minutes_component |= minutes_component_alt second_component = ( - pynutil.delete("seconds: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("秒") + pynutil.delete("\"") + pynutil.delete("seconds: \"") + + pynini.closure(NEMO_NOT_QUOTE) + + pynutil.insert("秒") + + pynutil.delete("\"") ) suffix_component = pynutil.delete("suffix: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py index 980e41816..7624d5f1b 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py @@ -47,7 +47,12 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ ) verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) - postprocessor = PostProcessor(remove_puncts=False, to_upper=False, to_lower=False, tag_oov=False,) + postprocessor = PostProcessor( + remove_puncts=False, + to_upper=False, + to_lower=False, + tag_oov=False, + ) self.fst = (verbalizer @ postprocessor.fst).optimize() if far_file: diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py index 27d0a35c5..8aa218a9a 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py @@ -91,7 +91,11 @@ def __init__(self): graph_arabs + delete_space + graph_crores + delete_space + graph_lakhs + delete_space + graph_thousands ) - graph = pynini.union(graph_higher_powers + delete_space + graph_hundreds, graph_hundred_unique, graph_zero,) + graph = pynini.union( + graph_higher_powers + delete_space + graph_hundreds, + graph_hundred_unique, + graph_zero, + ) graph = graph @ pynini.union( pynutil.delete(pynini.closure("०")) + pynini.difference(NEMO_DIGIT, "०") + pynini.closure(NEMO_DIGIT), "०" diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py index 96e8fb08d..15a75affc 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py @@ -46,7 +46,11 @@ def __init__(self, cardinal: GraphFst): + pynutil.add_weight(year_graph, -YEAR_WEIGHT) + pynutil.insert("\"") ) - optional_graph_year = pynini.closure(graph_year, 0, 1,) + optional_graph_year = pynini.closure( + graph_year, + 0, + 1, + ) graph_ad_bc = pynutil.insert("text: \"") + prefixes + delete_space + pynutil.insert("\"") graph_mdy = month_graph + ( diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py index 8882b860c..92af8c7c3 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py @@ -65,7 +65,9 @@ def __init__(self, cardinal: GraphFst): graph_digits = pynini.string_file(get_abs_path("data/numbers/digits.tsv")).invert() decimal_word = pynini.cross("पूर्णांक", "") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, + 0, + 1, ) graph_integer = ( pynutil.insert("integer_part: \"") diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py index 8eeea3876..59b30ae9e 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py @@ -171,9 +171,9 @@ def __init__(self, use_strict_e=False): ) @ (pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)) graph_hundred_component_prefix_e = graph_hundred_component_prefix_e.optimize() - graph_hundred_component_no_prefix = pynini.union(graph_hundreds + graph_e + graph_ties_component,) @ ( - pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT) - ) + graph_hundred_component_no_prefix = pynini.union( + graph_hundreds + graph_e + graph_ties_component, + ) @ (pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)) graph_hundred_component_no_prefix = graph_hundred_component_no_prefix.optimize() graph_mil_prefix_e = pynini.union( @@ -350,18 +350,13 @@ def __init__(self, use_strict_e=False): self.graph_no_exception = graph # save self.numbers_up_to_thousand for use in DecimalFst - digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3) + digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize() self.numbers_up_to_thousand = numbers_up_to_thousand # save self.numbers_up_to_million for use in DecimalFst digits_up_to_million = ( - NEMO_DIGIT - | (NEMO_DIGIT ** 2) - | (NEMO_DIGIT ** 3) - | (NEMO_DIGIT ** 4) - | (NEMO_DIGIT ** 5) - | (NEMO_DIGIT ** 6) + NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6) ) numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize() self.numbers_up_to_million = numbers_up_to_million diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py index 5bb6c63bc..5d9308958 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py @@ -31,7 +31,8 @@ class DateFst(GraphFst): """ def __init__( - self, tn_date_tagger: GraphFst, + self, + tn_date_tagger: GraphFst, ): super().__init__(name="date", kind="classify") diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py index e39a9017a..97bd36582 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py @@ -49,7 +49,15 @@ def __init__(self, itn_cardinal_tagger: GraphFst, tn_decimal_tagger: GraphFst): self.final_graph_wo_sign = final_graph_wo_sign self.final_graph_wo_negative = ( - final_graph_wo_sign | get_quantity(final_graph_wo_sign, None, hundreds_no_one, None, False, True,) + final_graph_wo_sign + | get_quantity( + final_graph_wo_sign, + None, + hundreds_no_one, + None, + False, + True, + ) ).optimize() optional_minus_graph = pynini.closure(pynini.cross("minus ", "negative: \"true\" "), 0, 1) diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py index 016df4f1d..155513937 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py @@ -134,7 +134,8 @@ def __init__(self): ) graph = graph @ pynini.union( - pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0", + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), + "0", ) # don't convert cardinals from zero to nine inclusive @@ -145,7 +146,9 @@ def __init__(self): self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph optional_minus_graph = pynini.closure( - pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), '"-"') + NEMO_SPACE, 0, 1, + pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), '"-"') + NEMO_SPACE, + 0, + 1, ) final_graph = optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"') diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py index b0cd8561a..21576efd5 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py @@ -59,7 +59,10 @@ def _get_year_graph(): def _get_digits_graph(): zero = pynini.cross((pynini.union("linh", "lẻ")), "0") four = pynini.cross("tư", "4") - graph = pynini.union(zero + delete_space + (graph_digit | four), graph_zero + delete_space + graph_digit,) + graph = pynini.union( + zero + delete_space + (graph_digit | four), + graph_zero + delete_space + graph_digit, + ) graph.optimize() return graph diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py index 033f3d86e..60c550228 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py @@ -123,10 +123,12 @@ def __init__(self, cardinal: GraphFst): final_graph = optional_graph_negative + final_graph_wo_sign self.final_graph_wo_negative = final_graph_wo_sign | get_quantity( - final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit, + final_graph_wo_sign, + cardinal.graph_hundred_component_at_least_one_none_zero_digit, ) final_graph |= optional_graph_negative + get_quantity( - final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit, + final_graph_wo_sign, + cardinal.graph_hundred_component_at_least_one_none_zero_digit, ) final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py index 30d262722..2ad4d5bbf 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py @@ -70,7 +70,9 @@ def __init__(self): ) optional_zone = pynini.closure(zone, 0, 1) optional_second = pynini.closure( - delete_space + pynutil.insert(":") + (second @ add_leading_zero_to_double_digit), 0, 1, + delete_space + pynutil.insert(":") + (second @ add_leading_zero_to_double_digit), + 0, + 1, ) graph_h = hour + pynutil.insert("h") diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index de1a7a28c..9c0199b13 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -86,7 +86,10 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): written_capitalized = written[0].upper() + written[1:] additional_labels.extend( [ - [written_capitalized, spoken.capitalize(),], # first letter capitalized + [ + written_capitalized, + spoken.capitalize(), + ], # first letter capitalized [ written_capitalized, spoken.upper().replace(" AND ", " and "), @@ -100,7 +103,10 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): print(f"This is weight {weight}") if len(weight) == 0: additional_labels.extend( - [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],] + [ + [written, spoken_no_space], + [written_capitalized, spoken_no_space.upper()], + ] ) else: additional_labels.extend( diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index f3b30238c..0715a3988 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -110,7 +110,12 @@ def __init__(self): + graph_hundreds_complex ) | (graph_hundreds_complex + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all) - | (graph_hundreds_complex + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits) + | ( + graph_hundreds_complex + + delete_ten_thousands + + pynini.cross(pynini.closure("零"), "000") + + graph_digits + ) ) graph_millions = ( pynutil.add_weight(graph_millions_simple, -1.0) | graph_millions_complex | pynutil.insert("0000000") diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py index 331f0b7ff..108c222fd 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py @@ -61,7 +61,9 @@ def __init__(self): # graph_date = graph_year | graph_month | graph_day # grammar for optional prefix ad or bc - graph_bc_prefix = pynini.closure("紀元前", 0, 1) | pynini.closure("公元前", 0, 1) | pynini.closure("纪元前", 0, 1) + graph_bc_prefix = ( + pynini.closure("紀元前", 0, 1) | pynini.closure("公元前", 0, 1) | pynini.closure("纪元前", 0, 1) + ) graph_bc = pynutil.delete(graph_bc_prefix) graph_ad_prefix = ( diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py index e660b6015..477a82f5d 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py @@ -57,7 +57,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): # yuan major plus minor major_symbol = pynini.accep("块") | pynini.cross("塊", "块") - tencent = pynini.accep("毛") | pynini.accep("角",) + tencent = pynini.accep("毛") | pynini.accep( + "角", + ) cent = pynini.accep("分") graph_kuai = ( graph_integer_component diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py index 2877d4160..3364ed4b2 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py @@ -49,7 +49,11 @@ class ClassifyFst(GraphFst): """ def __init__( - self, input_case: str, cache_dir: str = None, whitelist: str = None, overwrite_cache: bool = False, + self, + input_case: str, + cache_dir: str = None, + whitelist: str = None, + overwrite_cache: bool = False, ): super().__init__(name="tokenize_and_classify", kind="classify") diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py index 31d5880dc..f33987173 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py @@ -28,7 +28,7 @@ def __init__(self): super().__init__(name="cardinal", kind="verbalize") # group numbers by three - exactly_three_digits = NEMO_DIGIT ** 3 + exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) suffix = pynini.union( diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py index 28e2d5ff1..b36e44dfa 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py @@ -28,7 +28,7 @@ def __init__(self): super().__init__(name="decimal", kind="verbalize") # group numbers by three - exactly_three_digits = NEMO_DIGIT ** 3 + exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # insert a "," for every three numbers before decimal point diff --git a/nemo_text_processing/text_normalization/ar/taggers/measure.py b/nemo_text_processing/text_normalization/ar/taggers/measure.py index 707b40998..ce22f3d76 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/measure.py +++ b/nemo_text_processing/text_normalization/ar/taggers/measure.py @@ -55,7 +55,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) unit_plural = ( @@ -76,15 +78,14 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) subgraph_cardinal = ( - (optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1")) @ cardinal.fst - + insert_space - + pynini.closure(pynutil.delete(" "), 0, 1) - + unit_plural - | unit_plural - + pynini.closure(pynutil.delete(" "), 0, 1) - + insert_space - + (optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1")) @ cardinal.fst - ) + optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1") + ) @ cardinal.fst + insert_space + pynini.closure( + pynutil.delete(" "), 0, 1 + ) + unit_plural | unit_plural + pynini.closure( + pynutil.delete(" "), 0, 1 + ) + insert_space + ( + optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1") + ) @ cardinal.fst subgraph_cardinal |= ( (optional_graph_negative + pynini.accep("1")) @ cardinal.fst diff --git a/nemo_text_processing/text_normalization/ar/taggers/money.py b/nemo_text_processing/text_normalization/ar/taggers/money.py index 5098989c6..925fa348e 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/money.py +++ b/nemo_text_processing/text_normalization/ar/taggers/money.py @@ -142,7 +142,10 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) graph_with_no_minor_curr = integer_plus_maj - graph_with_no_minor_curr |= pynutil.add_weight(integer_plus_maj, weight=0.0001,) + graph_with_no_minor_curr |= pynutil.add_weight( + integer_plus_maj, + weight=0.0001, + ) graph_with_no_minor_curr = pynutil.delete(curr_symbol) + graph_with_no_minor_curr + preserve_order diff --git a/nemo_text_processing/text_normalization/de/taggers/cardinal.py b/nemo_text_processing/text_normalization/de/taggers/cardinal.py index a8ef5af17..902a62b3f 100644 --- a/nemo_text_processing/text_normalization/de/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/de/taggers/cardinal.py @@ -166,7 +166,7 @@ def thousand(): self.graph = ( ((NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT, 0)) - "0" - "1") @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) @@ -181,7 +181,7 @@ def thousand(): self.graph_hundred_component_at_least_one_none_zero_digit = ( ((NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT, 0)) - "0" - "1") @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 3 + @ NEMO_DIGIT**3 @ hundred_non_zero() ) | pynini.cross("1", "eins") diff --git a/nemo_text_processing/text_normalization/de/taggers/date.py b/nemo_text_processing/text_normalization/de/taggers/date.py index 21b32eb2b..8c13882d2 100644 --- a/nemo_text_processing/text_normalization/de/taggers/date.py +++ b/nemo_text_processing/text_normalization/de/taggers/date.py @@ -42,7 +42,7 @@ def get_year_graph(cardinal: GraphFst) -> 'pynini.FstLike': cardinal: cardinal GraphFst """ - year_gt_2000 = (pynini.union("21", "20") + NEMO_DIGIT ** 2) @ cardinal.graph + year_gt_2000 = (pynini.union("21", "20") + NEMO_DIGIT**2) @ cardinal.graph graph_two_digit = delete_leading_zero @ cardinal.two_digit_non_zero hundred = pynutil.insert("hundert") diff --git a/nemo_text_processing/text_normalization/de/taggers/measure.py b/nemo_text_processing/text_normalization/de/taggers/measure.py index 122ff8a67..a46822a0f 100644 --- a/nemo_text_processing/text_normalization/de/taggers/measure.py +++ b/nemo_text_processing/text_normalization/de/taggers/measure.py @@ -82,7 +82,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) unit_plural = ( diff --git a/nemo_text_processing/text_normalization/de/taggers/telephone.py b/nemo_text_processing/text_normalization/de/taggers/telephone.py index 90af2f07e..97482a236 100644 --- a/nemo_text_processing/text_normalization/de/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/de/taggers/telephone.py @@ -45,7 +45,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): numbers_with_single_digits = pynini.closure(graph_digit + insert_space) + graph_digit - two_digit_and_zero = (NEMO_DIGIT ** 2 @ cardinal.two_digit_non_zero) | graph_zero + two_digit_and_zero = (NEMO_DIGIT**2 @ cardinal.two_digit_non_zero) | graph_zero # def add_space_after_two_digit(): # return pynini.closure(two_digit_and_zero + insert_space) + ( # two_digit_and_zero diff --git a/nemo_text_processing/text_normalization/de/taggers/time.py b/nemo_text_processing/text_normalization/de/taggers/time.py index 371ad16ac..2fe74f5ba 100644 --- a/nemo_text_processing/text_normalization/de/taggers/time.py +++ b/nemo_text_processing/text_normalization/de/taggers/time.py @@ -65,7 +65,9 @@ def __init__(self, deterministic: bool = True): + pynutil.insert('"') ) final_time_zone_optional = pynini.closure( - pynini.accep(" ") + pynutil.insert('zone: "') + convert_space(time_zone_graph) + pynutil.insert('"'), 0, 1, + pynini.accep(" ") + pynutil.insert('zone: "') + convert_space(time_zone_graph) + pynutil.insert('"'), + 0, + 1, ) # Accepts the following formats: 02:30 Uhr, 02.30 Uhr, 2:30 Uhr, 2.30 Uhr diff --git a/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py index e6590536f..646d7a6b7 100644 --- a/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py @@ -70,7 +70,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_de_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_de_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -92,7 +93,10 @@ def __init__( self.fraction = FractionFst(cardinal=self.cardinal, deterministic=deterministic) fraction_graph = self.fraction.fst self.measure = MeasureFst( - cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic, + cardinal=self.cardinal, + decimal=self.decimal, + fraction=self.fraction, + deterministic=deterministic, ) measure_graph = self.measure.fst self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic) @@ -104,7 +108,11 @@ def __init__( telephone_graph = self.telephone.fst self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.money = MoneyFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) money_graph = self.money.fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) whitelist_graph = self.whitelist.fst diff --git a/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py index f8d5f6967..d4ea8eb09 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py @@ -43,7 +43,10 @@ def __init__(self, deterministic: bool = True): self.ordinal_stem = graph_digit | graph_ties | graph_thousands suffix = pynini.cdrewrite( - pynini.closure(self.ordinal_stem, 0, 1) + convert_rest, "", "[EOS]", NEMO_SIGMA, + pynini.closure(self.ordinal_stem, 0, 1) + convert_rest, + "", + "[EOS]", + NEMO_SIGMA, ).optimize() self.graph = pynini.compose(graph, suffix) self.suffix = suffix diff --git a/nemo_text_processing/text_normalization/en/graph_utils.py b/nemo_text_processing/text_normalization/en/graph_utils.py index 161e5d97e..668e1fb7c 100644 --- a/nemo_text_processing/text_normalization/en/graph_utils.py +++ b/nemo_text_processing/text_normalization/en/graph_utils.py @@ -103,14 +103,36 @@ suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) # _v = pynini.union("a", "e", "i", "o", "u") _c = pynini.union( - "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z", + "b", + "c", + "d", + "f", + "g", + "h", + "j", + "k", + "l", + "m", + "n", + "p", + "q", + "r", + "s", + "t", + "v", + "w", + "x", + "y", + "z", ) _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") _s = NEMO_SIGMA + pynutil.insert("s") graph_plural = plurals._priority_union( - suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA, + suppletive, + plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), + NEMO_SIGMA, ).optimize() SINGULAR_TO_PLURAL = graph_plural @@ -125,7 +147,9 @@ def capitalized_input_graph( - graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None, + graph: "pynini.FstLike", + original_graph_weight: float = None, + capitalized_graph_weight: float = None, ) -> "pynini.FstLike": """ Allow graph input to be capitalized, e.g. for ITN) @@ -209,7 +233,10 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): written_capitalized = written[0].upper() + written[1:] additional_labels.extend( [ - [written_capitalized, spoken.capitalize(),], # first letter capitalized + [ + written_capitalized, + spoken.capitalize(), + ], # first letter capitalized [ written_capitalized, spoken.upper().replace(" AND ", " and "), @@ -223,7 +250,10 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): logger.debug(f"This is weight {weight}") if len(weight) == 0: additional_labels.extend( - [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],] + [ + [written, spoken_no_space], + [written_capitalized, spoken_no_space.upper()], + ] ) else: additional_labels.extend( diff --git a/nemo_text_processing/text_normalization/en/taggers/cardinal.py b/nemo_text_processing/text_normalization/en/taggers/cardinal.py index 6ec0ac9dd..5e2a8535c 100644 --- a/nemo_text_processing/text_normalization/en/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/en/taggers/cardinal.py @@ -83,7 +83,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False): graph = ( pynini.closure(NEMO_DIGIT, 1, 3) - + (pynini.closure(pynutil.delete(",") + NEMO_DIGIT ** 3) | pynini.closure(NEMO_DIGIT ** 3)) + + (pynini.closure(pynutil.delete(",") + NEMO_DIGIT**3) | pynini.closure(NEMO_DIGIT**3)) ) @ graph self.graph = graph @@ -118,7 +118,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False): ) final_graph |= pynini.compose(final_graph, one_to_a_replacement_graph.optimize() + NEMO_SIGMA).optimize() # remove commas for 4 digits numbers - four_digit_comma_graph = (NEMO_DIGIT - "0") + pynutil.delete(",") + NEMO_DIGIT ** 3 + four_digit_comma_graph = (NEMO_DIGIT - "0") + pynutil.delete(",") + NEMO_DIGIT**3 final_graph |= pynini.compose(four_digit_comma_graph.optimize(), final_graph).optimize() self.final_graph = final_graph diff --git a/nemo_text_processing/text_normalization/en/taggers/date.py b/nemo_text_processing/text_normalization/en/taggers/date.py index 869716ef9..52225f0ba 100644 --- a/nemo_text_processing/text_normalization/en/taggers/date.py +++ b/nemo_text_processing/text_normalization/en/taggers/date.py @@ -126,11 +126,11 @@ def _get_year_graph(cardinal_graph, deterministic: bool = True): 123 A.D., 4200 B.C """ graph = get_four_digit_year_graph(deterministic) - graph = (pynini.union("1", "2") + (NEMO_DIGIT ** 3) + pynini.closure(pynini.cross(" s", "s") | "s", 0, 1)) @ graph + graph = (pynini.union("1", "2") + (NEMO_DIGIT**3) + pynini.closure(pynini.cross(" s", "s") | "s", 0, 1)) @ graph graph |= _get_two_digit_year_with_s_graph() - three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT ** 2) @ cardinal_graph + three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT**2) @ cardinal_graph year_with_suffix = ( (get_four_digit_year_graph(deterministic=True) | three_digit_year) + delete_space + insert_space + year_suffix ) @@ -270,7 +270,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False): ) graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year - day_ex_month = (NEMO_DIGIT ** 2 - pynini.project(month_numbers_graph, "input")) @ day_graph + day_ex_month = (NEMO_DIGIT**2 - pynini.project(month_numbers_graph, "input")) @ day_graph for x in ["-", "/", "."]: delete_sep = pynutil.delete(x) graph_dmy |= ( diff --git a/nemo_text_processing/text_normalization/en/taggers/electronic.py b/nemo_text_processing/text_normalization/en/taggers/electronic.py index 3262c7485..874d2e437 100644 --- a/nemo_text_processing/text_normalization/en/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/en/taggers/electronic.py @@ -49,9 +49,15 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): else: numbers = pynutil.insert(" ") + cardinal.long_numbers + pynutil.insert(" ") - cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT,) + cc_cues = pynutil.add_weight( + pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), + MIN_NEG_WEIGHT, + ) - cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT,) + cc_cues = pynutil.add_weight( + pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), + MIN_NEG_WEIGHT, + ) accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input") @@ -59,10 +65,14 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input" ) - dict_words = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/words.tsv")), MIN_NEG_WEIGHT,) + dict_words = pynutil.add_weight( + pynini.string_file(get_abs_path("data/electronic/words.tsv")), + MIN_NEG_WEIGHT, + ) dict_words_without_delimiter = dict_words + pynini.closure( - pynutil.add_weight(pynutil.insert(" ") + dict_words, MIN_NEG_WEIGHT), 1, + pynutil.add_weight(pynutil.insert(" ") + dict_words, MIN_NEG_WEIGHT), + 1, ) dict_words_graph = dict_words_without_delimiter | dict_words diff --git a/nemo_text_processing/text_normalization/en/taggers/measure.py b/nemo_text_processing/text_normalization/en/taggers/measure.py index fc61620ce..e8d92e1da 100644 --- a/nemo_text_processing/text_normalization/en/taggers/measure.py +++ b/nemo_text_processing/text_normalization/en/taggers/measure.py @@ -53,7 +53,11 @@ class MeasureFst(GraphFst): """ def __init__( - self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True, + self, + cardinal: GraphFst, + decimal: GraphFst, + fraction: GraphFst, + deterministic: bool = True, ): super().__init__(name="measure", kind="classify", deterministic=deterministic) cardinal_graph = cardinal.graph_with_and | self.get_range(cardinal.graph_with_and) @@ -63,7 +67,8 @@ def __init__( graph_unit |= pynini.string_file(get_abs_path("data/measure/unit_alternatives.tsv")) graph_unit |= pynini.compose( - pynini.closure(TO_LOWER, 1) + (NEMO_ALPHA | TO_LOWER) + pynini.closure(NEMO_ALPHA | TO_LOWER), graph_unit, + pynini.closure(TO_LOWER, 1) + (NEMO_ALPHA | TO_LOWER) + pynini.closure(NEMO_ALPHA | TO_LOWER), + graph_unit, ).optimize() graph_unit_plural = convert_space(graph_unit @ SINGULAR_TO_PLURAL) @@ -76,7 +81,9 @@ def __init__( ) optional_graph_unit2 = pynini.closure( - delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, 0, 1, + delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, + 0, + 1, ) unit_plural = ( @@ -250,11 +257,12 @@ def get_address_graph(self, cardinal): ordinal_verbalizer = OrdinalVerbalizer().graph ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph ordinal_num = pynini.compose( - pynutil.insert('integer: "') + ordinal_tagger + pynutil.insert('"'), ordinal_verbalizer, + pynutil.insert('integer: "') + ordinal_tagger + pynutil.insert('"'), + ordinal_verbalizer, ) address_num = NEMO_DIGIT ** (1, 2) @ cardinal.graph_hundred_component_at_least_one_none_zero_digit - address_num += insert_space + NEMO_DIGIT ** 2 @ ( + address_num += insert_space + NEMO_DIGIT**2 @ ( pynini.closure(pynini.cross("0", "zero "), 0, 1) + cardinal.graph_hundred_component_at_least_one_none_zero_digit ) @@ -292,8 +300,12 @@ def get_address_graph(self, cardinal): state = pynini.invert(state_graph) state = pynini.closure(pynini.accep(",") + pynini.accep(NEMO_SPACE) + state, 0, 1) - zip_code = pynini.compose(NEMO_DIGIT ** 5, cardinal.single_digits_graph) - zip_code = pynini.closure(pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + zip_code, 0, 1,) + zip_code = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph) + zip_code = pynini.closure( + pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + zip_code, + 0, + 1, + ) address = address_num + direction + address_words + pynini.closure(city + state + zip_code, 0, 1) diff --git a/nemo_text_processing/text_normalization/en/taggers/money.py b/nemo_text_processing/text_normalization/en/taggers/money.py index ef38c56b5..0687b0c1a 100644 --- a/nemo_text_processing/text_normalization/en/taggers/money.py +++ b/nemo_text_processing/text_normalization/en/taggers/money.py @@ -112,7 +112,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular integer_plus_maj_with_comma = pynini.compose( - NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")), integer_plus_maj, + NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")), + integer_plus_maj, ) integer_plus_maj = pynini.compose(pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj) integer_plus_maj |= integer_plus_maj_with_comma @@ -189,7 +190,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = final_graph |= integer_graph_reordered | decimal_default_reordered # to handle "$2.00" cases final_graph |= pynini.compose( - NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), integer_graph_reordered, + NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), + integer_graph_reordered, ) final_graph += graph_per_units.ques diff --git a/nemo_text_processing/text_normalization/en/taggers/range.py b/nemo_text_processing/text_normalization/en/taggers/range.py index 9d57a9fb9..c989e99f5 100644 --- a/nemo_text_processing/text_normalization/en/taggers/range.py +++ b/nemo_text_processing/text_normalization/en/taggers/range.py @@ -33,7 +33,12 @@ class RangeFst(GraphFst): """ def __init__( - self, time: GraphFst, date: GraphFst, cardinal: GraphFst, deterministic: bool = True, lm: bool = False, + self, + time: GraphFst, + date: GraphFst, + cardinal: GraphFst, + deterministic: bool = True, + lm: bool = False, ): super().__init__(name="range", kind="classify", deterministic=deterministic) @@ -47,14 +52,14 @@ def __init__( cardinal = cardinal.graph_with_and # YEAR - date_year_four_digit = (NEMO_DIGIT ** 4 + pynini.closure(pynini.accep("s"), 0, 1)) @ date - date_year_two_digit = (NEMO_DIGIT ** 2 + pynini.closure(pynini.accep("s"), 0, 1)) @ date + date_year_four_digit = (NEMO_DIGIT**4 + pynini.closure(pynini.accep("s"), 0, 1)) @ date + date_year_two_digit = (NEMO_DIGIT**2 + pynini.closure(pynini.accep("s"), 0, 1)) @ date year_to_year_graph = ( date_year_four_digit + delete_space + pynini.cross("-", " to ") + delete_space - + (date_year_four_digit | date_year_two_digit | (NEMO_DIGIT ** 2 @ cardinal)) + + (date_year_four_digit | date_year_two_digit | (NEMO_DIGIT**2 @ cardinal)) ) mid_year_graph = pynini.accep("mid") + pynini.cross("-", " ") + (date_year_four_digit | date_year_two_digit) diff --git a/nemo_text_processing/text_normalization/en/taggers/serial.py b/nemo_text_processing/text_normalization/en/taggers/serial.py index 913c09285..f650c8ff3 100644 --- a/nemo_text_processing/text_normalization/en/taggers/serial.py +++ b/nemo_text_processing/text_normalization/en/taggers/serial.py @@ -71,7 +71,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = num_graph |= pynini.compose(num_graph, NEMO_SIGMA + pynutil.delete("hundred ") + NEMO_SIGMA) # also allow double digits to be pronounced as integer in serial number num_graph |= pynutil.add_weight( - NEMO_DIGIT ** 2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001 + NEMO_DIGIT**2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001 ) # add space between letter and digit/symbol diff --git a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py index 28614fad1..7a253cccc 100644 --- a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py @@ -78,7 +78,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"en_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far", + cache_dir, + f"en_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -107,7 +108,12 @@ def __init__( logger.debug(f"fraction: {time.time() - start_time: .2f}s -- {fraction_graph.num_states()} nodes") start_time = time.time() - measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic,) + measure = MeasureFst( + cardinal=cardinal, + decimal=decimal, + fraction=fraction, + deterministic=deterministic, + ) measure_graph = measure.fst logger.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes") @@ -157,7 +163,10 @@ def __init__( time_final = pynini.compose(time_graph, v_time_graph) date_final = pynini.compose(date_graph, v_date_graph) range_graph = RangeFst( - time=time_final, date=date_final, cardinal=cardinal, deterministic=deterministic, + time=time_final, + date=date_final, + cardinal=cardinal, + deterministic=deterministic, ).fst logger.debug(f"range: {time.time() - start_time: .2f}s -- {range_graph.num_states()} nodes") diff --git a/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py index 4ad7d1c85..dff205f8e 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py @@ -46,7 +46,10 @@ def __init__(self, deterministic: bool = True): convert_rest = pynutil.insert("th") suffix = pynini.cdrewrite( - graph_digit | graph_teens | pynini.cross("ty", "tieth") | convert_rest, "", "[EOS]", NEMO_SIGMA, + graph_digit | graph_teens | pynini.cross("ty", "tieth") | convert_rest, + "", + "[EOS]", + NEMO_SIGMA, ).optimize() self.graph = pynini.compose(graph, suffix) self.suffix = suffix diff --git a/nemo_text_processing/text_normalization/es/graph_utils.py b/nemo_text_processing/text_normalization/es/graph_utils.py index 101185a90..946f4234e 100644 --- a/nemo_text_processing/text_normalization/es/graph_utils.py +++ b/nemo_text_processing/text_normalization/es/graph_utils.py @@ -107,7 +107,10 @@ def shift_number_gender(fst: "pynini.FstLike") -> "pynini.FstLike": """ fem_allign = pynini.cdrewrite(fem_hundreds, "", "", NEMO_SIGMA) fem_allign @= pynini.cdrewrite( - fem_ones, "", pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), NEMO_SIGMA, + fem_ones, + "", + pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), + NEMO_SIGMA, ) # If before a quote or EOS, we know it's the end of a string return fst @ fem_allign diff --git a/nemo_text_processing/text_normalization/es/taggers/cardinal.py b/nemo_text_processing/text_normalization/es/taggers/cardinal.py index 1b8f0a440..85402089f 100644 --- a/nemo_text_processing/text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/es/taggers/cardinal.py @@ -47,7 +47,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': Returns: fst: A pynini.FstLike object """ - exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three + exactly_three_digits = NEMO_DIGIT**3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string cardinal_string = pynini.closure( @@ -157,7 +157,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/es/taggers/date.py b/nemo_text_processing/text_normalization/es/taggers/date.py index ea7f15292..dd5cd7f0e 100644 --- a/nemo_text_processing/text_normalization/es/taggers/date.py +++ b/nemo_text_processing/text_normalization/es/taggers/date.py @@ -116,7 +116,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool): dash = "-" day_optional = pynini.closure(pynini.cross(dash, NEMO_SPACE) + day, 0, 1) - graph_ymd = NEMO_DIGIT ** 4 @ year_only + pynini.cross(dash, NEMO_SPACE) + month_number + day_optional + graph_ymd = NEMO_DIGIT**4 @ year_only + pynini.cross(dash, NEMO_SPACE) + month_number + day_optional final_graph = graph_dmy + pynutil.insert(" preserve_order: true") final_graph |= graph_ymd diff --git a/nemo_text_processing/text_normalization/es/taggers/fraction.py b/nemo_text_processing/text_normalization/es/taggers/fraction.py index 1fb5b8118..7bbe86402 100644 --- a/nemo_text_processing/text_normalization/es/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/es/taggers/fraction.py @@ -47,15 +47,50 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = ordinal_graph = ordinal.graph # 2-10 are all ordinals - three_to_ten = pynini.string_map(["2", "3", "4", "5", "6", "7", "8", "9", "10",]) + three_to_ten = pynini.string_map( + [ + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + ] + ) block_three_to_ten = pynutil.delete(three_to_ten) # To block cardinal productions if not deterministic: # Multiples of tens are sometimes rendered as ordinals - three_to_ten |= pynini.string_map(["20", "30", "40", "50", "60", "70", "80", "90",]) + three_to_ten |= pynini.string_map( + [ + "20", + "30", + "40", + "50", + "60", + "70", + "80", + "90", + ] + ) graph_three_to_ten = three_to_ten @ ordinal_graph graph_three_to_ten @= pynini.cdrewrite(ordinal_exceptions, "", "", NEMO_SIGMA) # Higher powers of tens (and multiples) are converted to ordinals. - hundreds = pynini.string_map(["100", "200", "300", "400", "500", "600", "700", "800", "900",]) + hundreds = pynini.string_map( + [ + "100", + "200", + "300", + "400", + "500", + "600", + "700", + "800", + "900", + ] + ) graph_hundreds = hundreds @ ordinal_graph multiples_of_thousand = ordinal.multiples_of_thousand # So we can have X milésimos @@ -68,7 +103,10 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = graph_higher_powers_of_ten += higher_powers_of_ten graph_higher_powers_of_ten = cardinal_graph @ graph_higher_powers_of_ten graph_higher_powers_of_ten @= pynini.cdrewrite( - pynutil.delete("un "), pynini.accep("[BOS]"), pynini.project(higher_powers_of_ten, "output"), NEMO_SIGMA, + pynutil.delete("un "), + pynini.accep("[BOS]"), + pynini.project(higher_powers_of_ten, "output"), + NEMO_SIGMA, ) # we drop 'un' from these ordinals (millionths, not one-millionths) graph_higher_powers_of_ten = multiples_of_thousand | graph_hundreds | graph_higher_powers_of_ten @@ -83,10 +121,16 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = # Blocking the digits and hundreds from Cardinal graph graph_fractions_cardinals = pynini.cdrewrite( - block_three_to_ten | block_higher_powers_of_ten, pynini.accep("[BOS]"), pynini.accep("[EOS]"), NEMO_SIGMA, + block_three_to_ten | block_higher_powers_of_ten, + pynini.accep("[BOS]"), + pynini.accep("[EOS]"), + NEMO_SIGMA, ) graph_fractions_cardinals @= NEMO_CHAR.plus @ pynini.cdrewrite( - pynutil.delete("0"), pynini.accep("[BOS]"), pynini.accep("[EOS]"), NEMO_SIGMA, + pynutil.delete("0"), + pynini.accep("[BOS]"), + pynini.accep("[EOS]"), + NEMO_SIGMA, ) # Empty characters become '0' for NEMO_CHAR fst, so need to block graph_fractions_cardinals @= cardinal_graph graph_fractions_cardinals += pynutil.insert( diff --git a/nemo_text_processing/text_normalization/es/taggers/measure.py b/nemo_text_processing/text_normalization/es/taggers/measure.py index a1933dbed..a63677c47 100644 --- a/nemo_text_processing/text_normalization/es/taggers/measure.py +++ b/nemo_text_processing/text_normalization/es/taggers/measure.py @@ -79,7 +79,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) complex_unit_singular_graph = ( diff --git a/nemo_text_processing/text_normalization/es/taggers/time.py b/nemo_text_processing/text_normalization/es/taggers/time.py index 4a947dd31..de2752657 100644 --- a/nemo_text_processing/text_normalization/es/taggers/time.py +++ b/nemo_text_processing/text_normalization/es/taggers/time.py @@ -115,7 +115,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): time_zone_graph = time_zones + pynini.closure(utc_or_gmt_diff, 0, 1) final_time_zone_optional = pynini.closure( - delete_space + insert_space + pynutil.insert("zone: \"") + time_zone_graph + pynutil.insert("\""), 0, 1, + delete_space + insert_space + pynutil.insert("zone: \"") + time_zone_graph + pynutil.insert("\""), + 0, + 1, ) # 02.30 h diff --git a/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py index 5aa66031a..165f5eeca 100644 --- a/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py @@ -69,7 +69,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_es_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_es_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -86,10 +87,17 @@ def __init__( self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic) decimal_graph = self.decimal.fst - self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,) + self.fraction = FractionFst( + cardinal=self.cardinal, + ordinal=self.ordinal, + deterministic=deterministic, + ) fraction_graph = self.fraction.fst self.measure = MeasureFst( - cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic, + cardinal=self.cardinal, + decimal=self.decimal, + fraction=self.fraction, + deterministic=deterministic, ) measure_graph = self.measure.fst self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic) @@ -101,7 +109,11 @@ def __init__( telephone_graph = self.telephone.fst self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.money = MoneyFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) money_graph = self.money.fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) whitelist_graph = self.whitelist.fst diff --git a/nemo_text_processing/text_normalization/es/verbalizers/fraction.py b/nemo_text_processing/text_normalization/es/verbalizers/fraction.py index 3758c1bd5..5d7afc1b7 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/fraction.py @@ -141,7 +141,8 @@ def __init__(self, deterministic: bool = True): fraction_with_one_fem = numerator_one_fem + delete_space + insert_space fraction_with_one_fem += pynini.union( - denominator_singular_fem @ merge_stem, denominator_singular_fem @ merge_into_single_word, + denominator_singular_fem @ merge_stem, + denominator_singular_fem @ merge_into_single_word, ) # Both forms exists fraction_with_one_fem += pynutil.insert(" parte") fraction_with_one_fem @= pynini.cdrewrite( @@ -150,7 +151,8 @@ def __init__(self, deterministic: bool = True): fraction_default_fem = numerator_fem + delete_space + insert_space fraction_default_fem += pynini.union( - denominator_plural_fem @ merge_stem, denominator_plural_fem @ merge_into_single_word, + denominator_plural_fem @ merge_stem, + denominator_plural_fem @ merge_into_single_word, ) fraction_default_fem += pynutil.insert(" partes") diff --git a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py index de9a0b047..0b38aeebb 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py @@ -62,7 +62,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_fr_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_fr_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -79,7 +80,11 @@ def __init__( self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic) decimal_graph = self.decimal.fst - self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,) + self.fraction = FractionFst( + cardinal=self.cardinal, + ordinal=self.ordinal, + deterministic=deterministic, + ) fraction_graph = self.fraction.fst word_graph = WordFst(deterministic=deterministic).fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py index fe3ad9a1d..f6a8bdd65 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -21,12 +21,12 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. - -२३ -> cardinal { negative: "true" integer: "तेइस" } } - s - Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) + Finite state transducer for classifying cardinals, e.g. + -२३ -> cardinal { negative: "true" integer: "तेइस" } } + s + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) """ def __init__(self, deterministic: bool = True, lm: bool = False): @@ -42,7 +42,7 @@ def create_graph_suffix(digit_graph, suffix, zeros_counts): if zeros_counts == 0: return digit_graph + suffix - return digit_graph + (zero ** zeros_counts) + suffix + return digit_graph + (zero**zeros_counts) + suffix def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): insert_space = pynutil.insert(" ") @@ -50,7 +50,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): if zeros_counts == 0: return digit_graph + suffix + insert_space + sub_graph - return digit_graph + suffix + (zero ** zeros_counts) + insert_space + sub_graph + return digit_graph + suffix + (zero**zeros_counts) + insert_space + sub_graph # Hundred graph suffix_hundreds = pynutil.insert(" सौ") diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 19aaf3139..42135add7 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -33,7 +33,7 @@ class DateFst(GraphFst): Finite state transducer for classifying date, e.g. "०१-०४-२०२४" -> date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } "०४-०१-२०२४" -> date { month: "अप्रैल" day: "एक" year: "दो हज़ार चौबीस" } - + Args: cardinal: cardinal GraphFst diff --git a/nemo_text_processing/text_normalization/hi/taggers/decimal.py b/nemo_text_processing/text_normalization/hi/taggers/decimal.py index d0bef9373..955e8c0d3 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/decimal.py @@ -22,13 +22,12 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstLike') -> 'pynini.FstLike': - """ Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, e.g. १ लाख -> integer_part: "एक" quantity: "लाख" e.g. १.५ लाख -> integer_part: "एक" fractional_part: "पाँच" quantity: "लाख" - Args: + Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST """ @@ -49,7 +48,7 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstL class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. + Finite state transducer for classifying decimal, e.g. -१२.५००६ अरब -> decimal { negative: "true" integer_part: "बारह" fractional_part: "पाँच शून्य शून्य छह" quantity: "अरब" } १ अरब -> decimal { integer_part: "एक" quantity: "अरब" } @@ -69,7 +68,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): point = pynutil.delete(".") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, + 0, + 1, ) self.graph_fractional = pynutil.insert("fractional_part: \"") + self.graph + pynutil.insert("\"") diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py index a29a72666..8971cd3dd 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py @@ -25,7 +25,7 @@ class FractionFst(GraphFst): fraction { integer: "तेईस" numerator: "चार" denominator: "छः"} ४/६" -> fraction { numerator: "चार" denominator: "छः"} - + Args: cardinal: cardinal GraphFst diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 7434fd70f..55279f4da 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -21,7 +21,7 @@ class MeasureFst(GraphFst): """ - Finite state transducer for classifying measure, suppletive aware, e.g. + Finite state transducer for classifying measure, suppletive aware, e.g. -१२kg -> measure { negative: "true" cardinal { integer: "बारह" } units: "किलोग्राम" } -१२.२kg -> measure { decimal { negative: "true" integer_part: "बारह" fractional_part: "दो"} units: "किलोग्राम" } @@ -40,7 +40,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, + 0, + 1, ) # Define the unit handling diff --git a/nemo_text_processing/text_normalization/hi/taggers/money.py b/nemo_text_processing/text_normalization/hi/taggers/money.py index c44d6d346..7446b77e5 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/money.py +++ b/nemo_text_processing/text_normalization/hi/taggers/money.py @@ -26,7 +26,7 @@ class MoneyFst(GraphFst): Finite state transducer for classifying money, suppletive aware, e.g. ₹1 -> money { currency: "रुपए" integer_part: "एक" } ₹1.2 -> money { currency: "रुपए" integer_part: "एक" fractional_part: "दो" } - + Args: cardinal: CardinalFst decimal: DecimalFst @@ -40,7 +40,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): cardinal_graph = cardinal.final_graph optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, + 0, + 1, ) self.currency = pynutil.insert("currency: \"") + currency_graph + pynutil.insert("\" ") self.interger = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\" ") diff --git a/nemo_text_processing/text_normalization/hi/taggers/time.py b/nemo_text_processing/text_normalization/hi/taggers/time.py index 622d4d5cb..6c87c9aad 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/time.py +++ b/nemo_text_processing/text_normalization/hi/taggers/time.py @@ -29,7 +29,7 @@ class TimeFst(GraphFst): १२:३०:३० -> time { hours: "बारह" minutes: "तीस" seconds: "तीस" } १:४० -> time { hours: "एक" minutes: "चालीस" } १:०० -> time { hours: "एक" } - + Args: time: GraphFst deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index 48ee97ef3..cc22a99f5 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -43,7 +43,7 @@ class ClassifyFst(GraphFst): Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/hi/utils.py b/nemo_text_processing/text_normalization/hi/utils.py index 102212183..5d314506e 100644 --- a/nemo_text_processing/text_normalization/hi/utils.py +++ b/nemo_text_processing/text_normalization/hi/utils.py @@ -23,7 +23,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -46,7 +46,7 @@ def load_labels(abs_path): def augment_labels_with_punct_at_end(labels): """ - augments labels: if key ends on a punctuation that value does not have, add a new label + augments labels: if key ends on a punctuation that value does not have, add a new label where the value maintains the punctuation Args: @@ -63,10 +63,10 @@ def augment_labels_with_punct_at_end(labels): def apply_fst(text, fst): - """ Given a string input, returns the output string - produced by traversing the path with lowest weight. - If no valid path accepts input string, returns an - error. + """Given a string input, returns the output string + produced by traversing the path with lowest weight. + If no valid path accepts input string, returns an + error. """ try: print(pynini.shortestpath(text @ fst).string()) diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/date.py b/nemo_text_processing/text_normalization/hi/verbalizers/date.py index 1265fcec6..f0af1a2d4 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/date.py @@ -23,7 +23,7 @@ class DateFst(GraphFst): Finite state transducer for verbalizing date, e.g. date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } -> "एक अप्रैल दो हज़ार चौबीस" date { month: "अप्रैल" day: "एक" year: "दो हज़ार चौबीस" } -> "अप्रैल एक दो हज़ार चौबीस" - + Args: deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/decimal.py b/nemo_text_processing/text_normalization/hi/verbalizers/decimal.py index 57ec38003..ca4636897 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/decimal.py @@ -21,8 +21,8 @@ class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. - decimal { negative: "true" integer_part: "बारह" fractional_part: "पाँच शून्य शून्य छह" quantity: "अरब" } -> ऋणात्मक बारह दशमलव पाँच शून्य शून्य छह + Finite state transducer for classifying decimal, e.g. + decimal { negative: "true" integer_part: "बारह" fractional_part: "पाँच शून्य शून्य छह" quantity: "अरब" } -> ऋणात्मक बारह दशमलव पाँच शून्य शून्य छह decimal { integer_part: "बारह" quantity: "billion" } -> बारह अरब """ diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py index e4cfae302..39b16b423 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -23,7 +23,7 @@ class FractionFst(GraphFst): Finite state transducer for verbalizing fraction e.g. fraction { integer: "तेईस" numerator: "चार" denominator: "छः" }-> तेईस चार बटा छः e.g. fraction { numerator: "चार" denominator: "छः" } -> चार बटा छः - + Args: deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/measure.py b/nemo_text_processing/text_normalization/hi/verbalizers/measure.py index 6cc6f8879..d6d17ac37 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/measure.py @@ -23,8 +23,8 @@ class MeasureFst(GraphFst): Finite state transducer for verbalizing measure, e.g. measure { negative: "true" cardinal { integer: "बारह" } units: "किलोग्राम" } -> ऋणात्मक बारह किलोग्राम measure { decimal { integer_part: "बारह" fractional_part: "दो" } units: "किलोग्राम" } -> बारह दशमलव दो किलोग्राम - - + + Args: decimal: DecimalFst cardinal: CardinalFs @@ -36,7 +36,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="measure", kind="verbalize") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, + 0, + 1, ) unit = pynutil.delete("units: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + delete_space diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/hi/verbalizers/post_processing.py index 87ec8e389..d838ca6ff 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/post_processing.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/post_processing.py @@ -96,10 +96,10 @@ def set_punct_dict(self): def get_punct_postprocess_graph(self): """ - Returns graph to post process punctuation marks. + Returns graph to post process punctuation marks. - {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. - By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. + {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. + By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. """ remove_space_around_single_quote = pynini.cdrewrite( diff --git a/nemo_text_processing/text_normalization/hu/taggers/cardinal.py b/nemo_text_processing/text_normalization/hu/taggers/cardinal.py index c20a3d27b..c9c5c3063 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hu/taggers/cardinal.py @@ -62,7 +62,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': fst: A pynini.FstLike object """ cardinal_separator = pynini.string_map([".", NEMO_SPACE]) - exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three + exactly_three_digits = NEMO_DIGIT**3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string up_to_three_digits = up_to_three_digits - "000" - "00" - "0" @@ -246,7 +246,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ clean_output ) @@ -257,12 +257,12 @@ def __init__(self, deterministic: bool = True): zero_space + digit, ).optimize() self.three_digits_read = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit, - zero_space + ((NEMO_DIGIT ** 2) @ graph_tens), + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit, + zero_space + ((NEMO_DIGIT**2) @ graph_tens), zero_space + zero_space + digit, ).optimize() self.four_digits_read = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 3)) @ self.graph, zero_space + self.three_digits_read + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**3)) @ self.graph, zero_space + self.three_digits_read ).optimize() self.graph |= graph_zero diff --git a/nemo_text_processing/text_normalization/hu/taggers/decimal.py b/nemo_text_processing/text_normalization/hu/taggers/decimal.py index 5026caec3..10ae4a8fe 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/hu/taggers/decimal.py @@ -101,7 +101,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ]: for modifier in ["", "tíz", "száz"]: decimal_number |= ( - (NEMO_DIGIT ** order + (NEMO_DIGIT - "0")) + (NEMO_DIGIT**order + (NEMO_DIGIT - "0")) @ pynini.cdrewrite(pynini.cross("0", ""), "[BOS]", "", NEMO_SIGMA) @ cardinal_graph + final_zero diff --git a/nemo_text_processing/text_normalization/hu/taggers/measure.py b/nemo_text_processing/text_normalization/hu/taggers/measure.py index 9e5f328fb..f2c3a2368 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hu/taggers/measure.py @@ -61,7 +61,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) unit_singular_graph = ( diff --git a/nemo_text_processing/text_normalization/hu/taggers/time.py b/nemo_text_processing/text_normalization/hu/taggers/time.py index ae1592f74..43e067fef 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/time.py +++ b/nemo_text_processing/text_normalization/hu/taggers/time.py @@ -180,7 +180,11 @@ def hours_to_pairs(): final_time_zone = ( pynini.accep(" ") + pynutil.insert("zone: \"") + convert_space(time_zone_graph) + pynutil.insert("\"") ) - final_time_zone_optional = pynini.closure(final_time_zone, 0, 1,) + final_time_zone_optional = pynini.closure( + final_time_zone, + 0, + 1, + ) # This might be better as just the inflected forms hour_only_delimited = ( diff --git a/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py index 60ed0ddc9..8c269bb00 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py @@ -69,7 +69,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_hu_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_hu_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -86,10 +87,17 @@ def __init__( self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic) decimal_graph = self.decimal.fst - self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,) + self.fraction = FractionFst( + cardinal=self.cardinal, + ordinal=self.ordinal, + deterministic=deterministic, + ) fraction_graph = self.fraction.fst self.measure = MeasureFst( - cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic, + cardinal=self.cardinal, + decimal=self.decimal, + fraction=self.fraction, + deterministic=deterministic, ) measure_graph = self.measure.fst self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic) @@ -101,7 +109,11 @@ def __init__( telephone_graph = self.telephone.fst self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.money = MoneyFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) money_graph = self.money.fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) whitelist_graph = self.whitelist.fst diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py b/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py index f17f7c36a..b52e6efb7 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py @@ -34,7 +34,11 @@ def __init__(self, deterministic: bool = True): country_code = pynutil.delete("country_code: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - optional_country_code = pynini.closure(country_code + delete_space + insert_space, 0, 1,) + optional_country_code = pynini.closure( + country_code + delete_space + insert_space, + 0, + 1, + ) number_part = ( pynutil.delete("number_part: \"") @@ -53,6 +57,8 @@ def __init__(self, deterministic: bool = True): 1, ) - graph = pynini.union(optional_country_code + number_part + optional_extension,) + graph = pynini.union( + optional_country_code + number_part + optional_extension, + ) delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/it/taggers/cardinal.py b/nemo_text_processing/text_normalization/it/taggers/cardinal.py index ecb003775..1e16d6e36 100644 --- a/nemo_text_processing/text_normalization/it/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/it/taggers/cardinal.py @@ -48,7 +48,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': Returns: fst: A pynini.FstLike object """ - exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three + exactly_three_digits = NEMO_DIGIT**3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string cardinal_string = pynini.closure( @@ -162,7 +162,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/it/taggers/measure.py b/nemo_text_processing/text_normalization/it/taggers/measure.py index 40144cd61..880be0aa7 100644 --- a/nemo_text_processing/text_normalization/it/taggers/measure.py +++ b/nemo_text_processing/text_normalization/it/taggers/measure.py @@ -68,7 +68,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) unit_plural = ( diff --git a/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py index 3aebcca91..603d520b5 100644 --- a/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py @@ -66,7 +66,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_it_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_it_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -88,10 +89,18 @@ def __init__( self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.measure = MeasureFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.measure = MeasureFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) measure_graph = self.measure.fst - self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.money = MoneyFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) money_graph = self.money.fst self.time = TimeFst(deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/ja/taggers/cardinal.py b/nemo_text_processing/text_normalization/ja/taggers/cardinal.py index b17abbbbb..ff80f6a3b 100644 --- a/nemo_text_processing/text_normalization/ja/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/ja/taggers/cardinal.py @@ -23,7 +23,7 @@ class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals - e.g. 23 -> cardinal { integer: "二十三" } + e.g. 23 -> cardinal { integer: "二十三" } """ def __init__(self, deterministic: bool = True): @@ -41,13 +41,13 @@ def __init__(self, deterministic: bool = True): graph_all = (graph_ties + (graph_digit | pynutil.delete('0'))) | graph_teen | graph_digit - hundreds = NEMO_DIGIT ** 3 + hundreds = NEMO_DIGIT**3 graph_hundred_component = (pynini.cross('1', '百') | (graph_digit_alt + pynutil.insert('百'))) + pynini.union( pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_all) ) graph_hundred = hundreds @ graph_hundred_component - thousands = NEMO_DIGIT ** 4 + thousands = NEMO_DIGIT**4 graph_thousand_component = (pynini.cross('1', '千') | (graph_digit_alt + pynutil.insert('千'))) + pynini.union( pynini.closure(pynutil.delete('0')), graph_hundred_component, @@ -61,7 +61,7 @@ def __init__(self, deterministic: bool = True): # this grammar is for larger number in later gramamr graph_thousand = thousands @ graph_thousand_component - ten_thousands = NEMO_DIGIT ** 5 + ten_thousands = NEMO_DIGIT**5 graph_ten_thousand_component = (graph_digit + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, @@ -71,8 +71,8 @@ def __init__(self, deterministic: bool = True): graph_ten_thousand = ten_thousands @ graph_ten_thousand_component self.man = graph_ten_thousand.optimize() - hundred_thousands = NEMO_DIGIT ** 6 - hundred_thousands_position = NEMO_DIGIT ** 2 + hundred_thousands = NEMO_DIGIT**6 + hundred_thousands_position = NEMO_DIGIT**2 hundred_thousands_position = hundred_thousands_position @ graph_all graph_hundred_thousand_component = (hundred_thousands_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -82,8 +82,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component - millions = NEMO_DIGIT ** 7 - million_position = NEMO_DIGIT ** 3 + millions = NEMO_DIGIT**7 + million_position = NEMO_DIGIT**3 million_position = million_position @ graph_hundred_component graph_million_component = (million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -93,8 +93,8 @@ def __init__(self, deterministic: bool = True): ) graph_million = millions @ graph_million_component - ten_millions = NEMO_DIGIT ** 8 - ten_million_position = NEMO_DIGIT ** 4 + ten_millions = NEMO_DIGIT**8 + ten_million_position = NEMO_DIGIT**4 ten_million_position = ten_million_position @ graph_thousand_component_alt graph_ten_million_component = (ten_million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -104,7 +104,7 @@ def __init__(self, deterministic: bool = True): ) graph_ten_million = ten_millions @ graph_ten_million_component - hundred_millions = NEMO_DIGIT ** 9 + hundred_millions = NEMO_DIGIT**9 graph_hundred_million_component = (graph_digit + pynutil.insert('億')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_ten_million_component, @@ -117,8 +117,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_million = hundred_millions @ graph_hundred_million_component - thousand_millions = NEMO_DIGIT ** 10 - thousand_millions_position = NEMO_DIGIT ** 2 + thousand_millions = NEMO_DIGIT**10 + thousand_millions_position = NEMO_DIGIT**2 thousand_millions_position = thousand_millions_position @ graph_all graph_thousand_million_component = (thousand_millions_position + pynutil.insert('億')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -132,8 +132,8 @@ def __init__(self, deterministic: bool = True): ) graph_thousand_million = thousand_millions @ graph_thousand_million_component - ten_billions = NEMO_DIGIT ** 11 - ten_billions_position = NEMO_DIGIT ** 3 + ten_billions = NEMO_DIGIT**11 + ten_billions_position = NEMO_DIGIT**3 ten_billions_position = ten_billions_position @ graph_hundred_component graph_ten_billions_component = (ten_billions_position + pynutil.insert('億')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -147,8 +147,8 @@ def __init__(self, deterministic: bool = True): ) graph_ten_billions = ten_billions @ graph_ten_billions_component - hundred_billions = NEMO_DIGIT ** 12 - hundred_billions_position = NEMO_DIGIT ** 4 + hundred_billions = NEMO_DIGIT**12 + hundred_billions_position = NEMO_DIGIT**4 hundred_billions_position = hundred_billions_position @ graph_thousand_component_alt graph_hundred_billions_component = (hundred_billions_position + pynutil.insert('億')) + pynini.union( pynini.closure(pynutil.delete('0')), diff --git a/nemo_text_processing/text_normalization/ja/taggers/date.py b/nemo_text_processing/text_normalization/ja/taggers/date.py index 25dbd71de..a8a469252 100644 --- a/nemo_text_processing/text_normalization/ja/taggers/date.py +++ b/nemo_text_processing/text_normalization/ja/taggers/date.py @@ -38,24 +38,24 @@ class DateFst(GraphFst): M.5 -> date { era: "明治" "year: "五年" } 21日月曜日 -> tokens { date { day: "二十一日" weekday: "月曜日" } } 70年代 -> date { year: "七十年代" } - 西暦794年 -> tokens { date { era: "西暦" year: "七百九十四年" } } - 1月1日(月)〜3日(水) - -> tokens { date { month: "一月" day: "一日" weekday: "月曜日" } } tokens { name: "から" } tokens { date { day: "三日" weekday: "水曜日" } } + 西暦794年 -> tokens { date { era: "西暦" year: "七百九十四年" } } + 1月1日(月)〜3日(水) + -> tokens { date { month: "一月" day: "一日" weekday: "月曜日" } } tokens { name: "から" } tokens { date { day: "三日" weekday: "水曜日" } } 70〜80年代 -> tokens { cardinal { integer: "七十" } } tokens { name: "から" } tokens { date { year: "八十年代" } } 7月5〜9日(月〜金) - -> tokens { date { month: "七月" } } tokens { cardinal { integer: "五" } } tokens { name: "から" } tokens { date { day: "九日" weekday: "月曜日" } } tokens { name: "から" } tokens { date { weekday: "金曜日" } } + -> tokens { date { month: "七月" } } tokens { cardinal { integer: "五" } } tokens { name: "から" } tokens { date { day: "九日" weekday: "月曜日" } } tokens { name: "から" } tokens { date { weekday: "金曜日" } } 7月初旬〜9月中旬 - -> tokens { date { month: "七月" } } tokens { name: "初" } tokens { name: "旬" } tokens { name: "から" } tokens { date { month: "九月" } } tokens { name: "中" } tokens { name: "旬" } + -> tokens { date { month: "七月" } } tokens { name: "初" } tokens { name: "旬" } tokens { name: "から" } tokens { date { month: "九月" } } tokens { name: "中" } tokens { name: "旬" } 3〜4月 -> tokens { cardinal { integer: "三" } } tokens { name: "から" } tokens { date { month: "四月" } } - 2023年3月1日(水)〜6月12日(火) - -> tokens { date { year: "二千二十三年" month: "三月" day: "一日" weekday: "水曜日" } } tokens { name: "から" } tokens { date { month: "六月" day: "十二日" weekday: "火曜日" } } + 2023年3月1日(水)〜6月12日(火) + -> tokens { date { year: "二千二十三年" month: "三月" day: "一日" weekday: "水曜日" } } tokens { name: "から" } tokens { date { month: "六月" day: "十二日" weekday: "火曜日" } } 10月中旬〜11月上旬 -> tokens { date { month: "十月" } } tokens { date { month: "中旬" } } tokens { name: "から" } tokens { date { month: "十一月" } } tokens { date { month: "上旬" } } - 1976年7月17日〜8月1日 - -> tokens { date { year: "千九百七十六年" month: "七月" day: "十七日" } } tokens { name: "から" } tokens { date { month: "八月" day: "一日" } } - + 1976年7月17日〜8月1日 + -> tokens { date { year: "千九百七十六年" month: "七月" day: "十七日" } } tokens { name: "から" } tokens { date { month: "八月" day: "一日" } } + Args: cardinal: CardinalFst """ diff --git a/nemo_text_processing/text_normalization/ja/taggers/decimal.py b/nemo_text_processing/text_normalization/ja/taggers/decimal.py index 4ccd06d57..8fdea4c87 100644 --- a/nemo_text_processing/text_normalization/ja/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/ja/taggers/decimal.py @@ -25,7 +25,7 @@ class DecimalFst(GraphFst): Finite state transducer for classifying decimal, e.g. 0.5 -> decimal { integer_part: "零" fractional_part: "五" } -0.5万 -> decimal { negative: "マイナス" integer_part: "零" fractional_part: "五" quantity: "万"} - + Args: cardinal: CardinalFst """ @@ -46,7 +46,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): graph_decimal_no_sign = graph_integer + pynutil.delete('.') + pynutil.insert(" ") + graph_fraction graph_optional_sign = ( - pynutil.insert("negative: \"") + (pynini.cross("-", "マイナス") | pynini.accep("マイナス")) + pynutil.insert("\"") + pynutil.insert("negative: \"") + + (pynini.cross("-", "マイナス") | pynini.accep("マイナス")) + + pynutil.insert("\"") ) graph_decimal = graph_decimal_no_sign | (graph_optional_sign + pynutil.insert(" ") + graph_decimal_no_sign) diff --git a/nemo_text_processing/text_normalization/ja/taggers/fraction.py b/nemo_text_processing/text_normalization/ja/taggers/fraction.py index 0dd488f4f..94fb4af68 100644 --- a/nemo_text_processing/text_normalization/ja/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/ja/taggers/fraction.py @@ -110,7 +110,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) optional_sign = ( - pynutil.insert("negative: \"") + (pynini.accep("マイナス") | pynini.cross("-", "マイナス")) + pynutil.insert("\"") + pynutil.insert("negative: \"") + + (pynini.accep("マイナス") | pynini.cross("-", "マイナス")) + + pynutil.insert("\"") ) graph_fraction_slash_sigh = pynini.closure(optional_sign + pynutil.insert(NEMO_SPACE), 0, 1) + ( diff --git a/nemo_text_processing/text_normalization/ja/taggers/punctuation.py b/nemo_text_processing/text_normalization/ja/taggers/punctuation.py index 24ee2f15f..c5df8388c 100644 --- a/nemo_text_processing/text_normalization/ja/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/ja/taggers/punctuation.py @@ -62,7 +62,9 @@ def __init__(self, deterministic: bool = True): + pynini.accep(">") ) punct = plurals._priority_union(emphasis, punct, NEMO_SIGMA) - range_component = pynini.cross("〜", "から") | pynini.accep("から") # forcing this conversion for special tilde + range_component = pynini.cross("〜", "から") | pynini.accep( + "から" + ) # forcing this conversion for special tilde self.graph = punct | pynutil.add_weight(range_component, -1.0) self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize() diff --git a/nemo_text_processing/text_normalization/ja/taggers/time.py b/nemo_text_processing/text_normalization/ja/taggers/time.py index 6b8a308a2..7c74bc53e 100644 --- a/nemo_text_processing/text_normalization/ja/taggers/time.py +++ b/nemo_text_processing/text_normalization/ja/taggers/time.py @@ -25,7 +25,7 @@ class TimeFst(GraphFst): Finite state transducer for classifying time, e.g. 1時30分 -> time { hours: "一" minutes: "三十" } 今夜0時 -> time { suffix: "今夜" hours: "零" } - + Args: cardinal: CardinalFst """ diff --git a/nemo_text_processing/text_normalization/ja/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ja/taggers/tokenize_and_classify.py index c28c444ed..f992e9b70 100644 --- a/nemo_text_processing/text_normalization/ja/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/ja/taggers/tokenize_and_classify.py @@ -33,9 +33,9 @@ class ClassifyFst(GraphFst): """ Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/ja/utils.py b/nemo_text_processing/text_normalization/ja/utils.py index 2a5455b2b..65523afed 100644 --- a/nemo_text_processing/text_normalization/ja/utils.py +++ b/nemo_text_processing/text_normalization/ja/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -35,6 +35,7 @@ def get_abs_path(rel_path): # Args: # abs_path: absolute path + # Returns dictionary of mappings # """ # #label_tsv = open(abs_path, encoding="utf-8") diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/date.py b/nemo_text_processing/text_normalization/ja/verbalizers/date.py index 209c3c34f..8292c622a 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/date.py @@ -23,7 +23,7 @@ class DateFst(GraphFst): """ Finite state transducer for verbalizing date e.g. date { year: "二千二十四" month: "三" day: "四" } -> 二千二十四年三月四日 - + """ def __init__(self, deterministic: bool = True): diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/ja/verbalizers/post_processing.py index 4bafef0bd..8b196dcaf 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/post_processing.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/post_processing.py @@ -96,10 +96,10 @@ def set_punct_dict(self): def get_punct_postprocess_graph(self): """ - Returns graph to post process punctuation marks. + Returns graph to post process punctuation marks. - {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. - By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. + {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. + By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. """ remove_space_around_single_quote = pynini.cdrewrite( diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/postprocessor.py b/nemo_text_processing/text_normalization/ja/verbalizers/postprocessor.py index 89b56f8dc..3ff05fa57 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/postprocessor.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/postprocessor.py @@ -29,14 +29,18 @@ class PostProcessor(GraphFst): ''' - Postprocessing of TN, now contains: - 1. punctuation removal - 2. letter case conversion - 3. oov tagger + Postprocessing of TN, now contains: + 1. punctuation removal + 2. letter case conversion + 3. oov tagger ''' def __init__( - self, remove_puncts: bool = False, to_upper: bool = False, to_lower: bool = False, tag_oov: bool = False, + self, + remove_puncts: bool = False, + to_upper: bool = False, + to_lower: bool = False, + tag_oov: bool = False, ): super().__init__(name="PostProcessor", kind="processor") diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/time.py b/nemo_text_processing/text_normalization/ja/verbalizers/time.py index 73029ae6c..7058f437b 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/time.py @@ -22,8 +22,8 @@ class TimeFst(GraphFst): """ Finite state transducer for verbalizing time e.g. - - + + """ def __init__(self, deterministic: bool = True): diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/ja/verbalizers/verbalize.py index dbf59c446..6a16f96d9 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/verbalize.py @@ -29,7 +29,7 @@ class VerbalizeFst(GraphFst): """ Composes other verbalizer grammars. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. Args: deterministic: if True will provide a single transduction option, @@ -52,7 +52,14 @@ def __init__(self, deterministic: bool = True): whitelist = WhiteListFst(deterministic=deterministic) graph = pynini.union( - date.fst, cardinal.fst, ordinal.fst, decimal.fst, fraction.fst, word.fst, time.fst, whitelist.fst, + date.fst, + cardinal.fst, + ordinal.fst, + decimal.fst, + fraction.fst, + word.fst, + time.fst, + whitelist.fst, ) graph = pynini.closure(delete_space) + graph + pynini.closure(delete_space) diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ja/verbalizers/verbalize_final.py index c0327a876..750598649 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/verbalize_final.py @@ -26,9 +26,7 @@ class VerbalizeFinalFst(GraphFst): - """ - - """ + """ """ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) @@ -46,6 +44,11 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ ) verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) - postprocessor = PostProcessor(remove_puncts=False, to_upper=False, to_lower=False, tag_oov=False,) + postprocessor = PostProcessor( + remove_puncts=False, + to_upper=False, + to_lower=False, + tag_oov=False, + ) self.fst = (verbalizer @ postprocessor.fst).optimize() diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/word.py b/nemo_text_processing/text_normalization/ja/verbalizers/word.py index 6ee724f93..afd5d1037 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/word.py @@ -20,7 +20,7 @@ class WordFst(GraphFst): ''' - tokens { char: "文字" } -> 文字 + tokens { char: "文字" } -> 文字 ''' def __init__(self, deterministic: bool = True): diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index 6a61efd4e..8a60516cc 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -164,11 +164,16 @@ def normalize( text_with_span_tags_list[masked_idx_list[sem_tag_idx]] = "" else: non_deter_options = self.normalize_non_deterministic( - text=cur_semiotic_span, n_tagged=n_tagged, punct_post_process=punct_post_process, verbose=verbose, + text=cur_semiotic_span, + n_tagged=n_tagged, + punct_post_process=punct_post_process, + verbose=verbose, ) try: best_option, cer, _ = self.select_best_match( - normalized_texts=non_deter_options, pred_text=cur_pred_text, verbose=verbose, + normalized_texts=non_deter_options, + pred_text=cur_pred_text, + verbose=verbose, ) if cer_threshold > 0 and cer > cer_threshold: best_option = cur_deter_norm @@ -366,7 +371,11 @@ def get_verbalized_text(tagged_text): continue def select_best_match( - self, normalized_texts: List[str], pred_text: str, verbose: bool = False, remove_punct: bool = False, + self, + normalized_texts: List[str], + pred_text: str, + verbose: bool = False, + remove_punct: bool = False, ): """ Selects the best normalization option based on the lowest CER diff --git a/nemo_text_processing/text_normalization/ru/taggers/date.py b/nemo_text_processing/text_normalization/ru/taggers/date.py index 2dc87ee06..3ad16f999 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/date.py +++ b/nemo_text_processing/text_normalization/ru/taggers/date.py @@ -78,7 +78,7 @@ def __init__(self, number_names: dict, deterministic: bool): month = ( pynutil.insert("month: \"") + (month_name | pynutil.add_weight(digit_month, 0.1)) + pynutil.insert("\"") ).optimize() - year = pynini.compose(((NEMO_DIGIT ** 4) | (NEMO_DIGIT ** 2)), numbers).optimize() + year = pynini.compose(((NEMO_DIGIT**4) | (NEMO_DIGIT**2)), numbers).optimize() year |= zero_digit # reduce year options diff --git a/nemo_text_processing/text_normalization/ru/taggers/telephone.py b/nemo_text_processing/text_normalization/ru/taggers/telephone.py index 4fbfbf06a..456bd6f1a 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/ru/taggers/telephone.py @@ -48,13 +48,13 @@ def __init__(self, number_names: dict, deterministic: bool = True): optional_country_code = pynini.closure(country_code + insert_space, 0, 1) number_part = ( - NEMO_DIGIT ** 3 @ number + NEMO_DIGIT**3 @ number + separator - + NEMO_DIGIT ** 3 @ number + + NEMO_DIGIT**3 @ number + separator - + NEMO_DIGIT ** 2 @ number + + NEMO_DIGIT**2 @ number + separator - + NEMO_DIGIT ** 2 @ (pynini.closure(pynini.cross("0", "ноль ")) + number) + + NEMO_DIGIT**2 @ (pynini.closure(pynini.cross("0", "ноль ")) + number) ) number_part = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"") tagger_graph = (optional_country_code + number_part).optimize() diff --git a/nemo_text_processing/text_normalization/rw/graph_utils.py b/nemo_text_processing/text_normalization/rw/graph_utils.py index ce75cd17e..585db5ba4 100644 --- a/nemo_text_processing/text_normalization/rw/graph_utils.py +++ b/nemo_text_processing/text_normalization/rw/graph_utils.py @@ -107,14 +107,36 @@ suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) # _v = pynini.union("a", "e", "i", "o", "u") _c = pynini.union( - "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z", + "b", + "c", + "d", + "f", + "g", + "h", + "j", + "k", + "l", + "m", + "n", + "p", + "q", + "r", + "s", + "t", + "v", + "w", + "x", + "y", + "z", ) _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") _s = NEMO_SIGMA + pynutil.insert("s") graph_plural = plurals._priority_union( - suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA, + suppletive, + plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), + NEMO_SIGMA, ).optimize() SINGULAR_TO_PLURAL = graph_plural @@ -129,7 +151,9 @@ def capitalized_input_graph( - graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None, + graph: "pynini.FstLike", + original_graph_weight: float = None, + capitalized_graph_weight: float = None, ) -> "pynini.FstLike": """ Allow graph input to be capitalized, e.g. for ITN) diff --git a/nemo_text_processing/text_normalization/sv/taggers/cardinal.py b/nemo_text_processing/text_normalization/sv/taggers/cardinal.py index 021e652bd..750ff867b 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/sv/taggers/cardinal.py @@ -69,7 +69,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': Returns: fst: A pynini.FstLike object """ - exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three + exactly_three_digits = NEMO_DIGIT**3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string cardinal_separator = NEMO_SPACE @@ -249,7 +249,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) @@ -276,30 +276,27 @@ def __init__(self, deterministic: bool = True): zero_space = zero + insert_space self.zero_space = zero_space self.three_digits_read = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) - @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, - zero_space + ((NEMO_DIGIT ** 2) @ graph_tens), + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + zero_space + ((NEMO_DIGIT**2) @ graph_tens), zero_space + zero_space + digit, ) self.three_digits_read_en = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, - zero_space + ((NEMO_DIGIT ** 2) @ graph_tens), + zero_space + ((NEMO_DIGIT**2) @ graph_tens), zero_space + zero_space + digit, ) self.three_digits_read_frac = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) - @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, zero_space + digit + insert_space + digit, ) self.three_digits_read_frac_en = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, zero_space + digit + insert_space + digit, ) self.two_or_three_digits_read_frac = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) - @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ graph_tens, zero_space + single_digits_graph + pynini.closure(insert_space + digit, 0, 1), single_digits_graph + pynini.closure(insert_space + single_digits_graph, 3), @@ -307,7 +304,7 @@ def __init__(self, deterministic: bool = True): single_digits_graph, ) self.two_or_three_digits_read_frac_en = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ (graph_tens @ pynini.cdrewrite(ett_to_en, "", "[EOS]", NEMO_SIGMA)), zero_space + single_digits_graph + pynini.closure(insert_space + single_digits_graph, 0, 1), @@ -316,9 +313,8 @@ def __init__(self, deterministic: bool = True): single_digits_graph, ) self.two_or_three_digits_read_frac_both = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) - @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ graph_tens, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ (graph_tens @ pynini.cdrewrite(ett_to_en, "", "[EOS]", NEMO_SIGMA)), diff --git a/nemo_text_processing/text_normalization/sv/taggers/measure.py b/nemo_text_processing/text_normalization/sv/taggers/measure.py index e114e9e6d..4da3f81c2 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/measure.py +++ b/nemo_text_processing/text_normalization/sv/taggers/measure.py @@ -81,7 +81,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_graph_unit2 = pynini.closure( - delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, 0, 1, + delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, + 0, + 1, ) unit_plural = ( diff --git a/nemo_text_processing/text_normalization/sv/taggers/ordinal.py b/nemo_text_processing/text_normalization/sv/taggers/ordinal.py index 25dfb6e9b..0877ca08f 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/sv/taggers/ordinal.py @@ -95,7 +95,10 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): hundreds |= pynini.cross("1", "ett hundra") hundreds |= digit + pynutil.insert(NEMO_SPACE) + pynutil.insert("hundra") - graph_hundreds = hundreds + pynini.union(graph_tens, (pynutil.delete("0") + graph_digit),) + graph_hundreds = hundreds + pynini.union( + graph_tens, + (pynutil.delete("0") + graph_digit), + ) if not deterministic: graph_hundreds |= hundreds + pynini.union( (graph_teens | pynutil.insert(NEMO_SPACE) + graph_teens), (pynini.cross("0", NEMO_SPACE) + graph_digit) @@ -179,7 +182,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/sv/taggers/time.py b/nemo_text_processing/text_normalization/sv/taggers/time.py index 676e78592..cb5067058 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/time.py +++ b/nemo_text_processing/text_normalization/sv/taggers/time.py @@ -106,7 +106,11 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): final_suffix = pynutil.insert("suffix: \"") + convert_space(suffix_graph) + pynutil.insert("\"") final_suffix_optional = pynini.closure(ensure_space + final_suffix, 0, 1) final_time_zone = pynutil.insert("zone: \"") + convert_space(time_zone_graph) + pynutil.insert("\"") - final_time_zone_optional = pynini.closure(NEMO_SPACE + final_time_zone, 0, 1,) + final_time_zone_optional = pynini.closure( + NEMO_SPACE + final_time_zone, + 0, + 1, + ) # 2:30 pm, 02:30, 2:00 graph_hm_kl = ( diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py b/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py index af17c6d48..6656e3445 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py @@ -40,7 +40,11 @@ def __init__(self, deterministic: bool = True): country_code = pynutil.delete("country_code: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - optional_country_code = pynini.closure(country_code + delete_space + insert_space, 0, 1,) + optional_country_code = pynini.closure( + country_code + delete_space + insert_space, + 0, + 1, + ) number_part = ( pynutil.delete("number_part: \"") diff --git a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py index 21437e82f..a0c3b587d 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py @@ -35,7 +35,7 @@ def __init__(self, deterministic: bool = True): graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv")) graph_teen_alt = pynini.string_file(get_abs_path("data/number/teen_alt.tsv")) - alls = NEMO_DIGIT ** 2 | NEMO_DIGIT ** 1 + alls = NEMO_DIGIT**2 | NEMO_DIGIT**1 graph_all = ( (graph_ties + (graph_digit | pynutil.delete('0'))) | graph_teen_alt | graph_digit ) # graph_all when within a larger number e.g., 316-> 三百一十六 instead of 三百十六 @@ -46,7 +46,7 @@ def __init__(self, deterministic: bool = True): ) # graph_all when at the head of the larger numbere.g., 13万 -> 十三万 instead of 一十三万 graph_all_alt = alls @ graph_all_alt - hundreds = NEMO_DIGIT ** 3 + hundreds = NEMO_DIGIT**3 graph_hundred_component = (graph_digit + pynutil.insert('百')) + pynini.union( pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0') + pynutil.insert('零')) + graph_all), @@ -56,7 +56,7 @@ def __init__(self, deterministic: bool = True): self.digit = graph_digit.optimize() self.all = graph_all.optimize() - thousands = NEMO_DIGIT ** 4 + thousands = NEMO_DIGIT**4 graph_thousand_component = (graph_digit_alt + pynutil.insert('千')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_hundred_component, @@ -64,7 +64,7 @@ def __init__(self, deterministic: bool = True): ) graph_thousand = thousands @ graph_thousand_component - ten_thousands = NEMO_DIGIT ** 5 + ten_thousands = NEMO_DIGIT**5 graph_ten_thousand_component = (graph_digit_alt + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, @@ -73,8 +73,8 @@ def __init__(self, deterministic: bool = True): ) graph_ten_thousand = ten_thousands @ graph_ten_thousand_component - hundred_thousands = NEMO_DIGIT ** 6 - hundred_thousands_position = NEMO_DIGIT ** 2 + hundred_thousands = NEMO_DIGIT**6 + hundred_thousands_position = NEMO_DIGIT**2 hundred_thousands_position = hundred_thousands_position @ graph_all_alt graph_hundred_thousand_component = (hundred_thousands_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -84,8 +84,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component - millions = NEMO_DIGIT ** 7 - million_position = NEMO_DIGIT ** 3 + millions = NEMO_DIGIT**7 + million_position = NEMO_DIGIT**3 million_position = million_position @ graph_hundred_component graph_million_component = (million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -95,8 +95,8 @@ def __init__(self, deterministic: bool = True): ) graph_million = millions @ graph_million_component - ten_millions = NEMO_DIGIT ** 8 - ten_million_position = NEMO_DIGIT ** 4 + ten_millions = NEMO_DIGIT**8 + ten_million_position = NEMO_DIGIT**4 ten_million_position = ten_million_position @ graph_thousand_component graph_ten_million_component = (ten_million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -106,7 +106,7 @@ def __init__(self, deterministic: bool = True): ) graph_ten_million = ten_millions @ graph_ten_million_component - hundred_millions = NEMO_DIGIT ** 9 + hundred_millions = NEMO_DIGIT**9 graph_hundred_million_component = (graph_digit_alt + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_ten_million_component, @@ -119,8 +119,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_million = hundred_millions @ graph_hundred_million_component - thousand_millions = NEMO_DIGIT ** 10 - thousand_millions_position = NEMO_DIGIT ** 2 + thousand_millions = NEMO_DIGIT**10 + thousand_millions_position = NEMO_DIGIT**2 thousand_millions_position = thousand_millions_position @ graph_all_alt graph_thousand_million_component = (thousand_millions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -134,8 +134,8 @@ def __init__(self, deterministic: bool = True): ) graph_thousand_million = thousand_millions @ graph_thousand_million_component - ten_billions = NEMO_DIGIT ** 11 - ten_billions_position = NEMO_DIGIT ** 3 + ten_billions = NEMO_DIGIT**11 + ten_billions_position = NEMO_DIGIT**3 ten_billions_position = ten_billions_position @ graph_hundred_component graph_ten_billions_component = (ten_billions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -149,8 +149,8 @@ def __init__(self, deterministic: bool = True): ) graph_ten_billions = ten_billions @ graph_ten_billions_component - hundred_billions = NEMO_DIGIT ** 12 - hundred_billions_position = NEMO_DIGIT ** 4 + hundred_billions = NEMO_DIGIT**12 + hundred_billions_position = NEMO_DIGIT**4 hundred_billions_position = hundred_billions_position @ graph_thousand_component graph_hundred_billions_component = (hundred_billions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), diff --git a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py index 5cd95e58c..b283f3444 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py +++ b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py @@ -32,7 +32,9 @@ class PreProcessorFst(GraphFst): ''' def __init__( - self, remove_interjections: bool = True, fullwidth_to_halfwidth: bool = True, + self, + remove_interjections: bool = True, + fullwidth_to_halfwidth: bool = True, ): super().__init__(name="PreProcessor", kind="processor") diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py index dab0cea0f..dcdd73622 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py @@ -36,7 +36,11 @@ class PostProcessor(GraphFst): ''' def __init__( - self, remove_puncts: bool = False, to_upper: bool = False, to_lower: bool = False, tag_oov: bool = False, + self, + remove_puncts: bool = False, + to_upper: bool = False, + to_lower: bool = False, + tag_oov: bool = False, ): super().__init__(name="PostProcessor", kind="processor") diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py index 4592d7841..846254938 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py @@ -42,6 +42,11 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ ) verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) - postprocessor = PostProcessor(remove_puncts=False, to_upper=False, to_lower=False, tag_oov=False,) + postprocessor = PostProcessor( + remove_puncts=False, + to_upper=False, + to_lower=False, + tag_oov=False, + ) self.fst = (verbalizer @ postprocessor.fst).optimize() diff --git a/setup.py b/setup.py index 4667b49e8..e22afbab3 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,9 @@ elif os.path.exists('README.rst'): # codec is used for consistent encoding long_description = codecs.open( - os.path.join(os.path.abspath(os.path.dirname(__file__)), 'README.rst'), 'r', encoding='utf-8', + os.path.join(os.path.abspath(os.path.dirname(__file__)), 'README.rst'), + 'r', + encoding='utf-8', ).read() long_description_content_type = "text/x-rst" @@ -125,7 +127,8 @@ def __call_checker(self, base_command, scope, check): command.extend(['--check', '--diff']) self.announce( - msg='Running command: %s' % str(' '.join(command)), level=distutils_log.INFO, + msg='Running command: %s' % str(' '.join(command)), + level=distutils_log.INFO, ) return_code = subprocess.call(command) @@ -133,10 +136,18 @@ def __call_checker(self, base_command, scope, check): return return_code def _isort(self, scope, check): - return self.__call_checker(base_command=self.__ISORT_BASE.split(), scope=scope, check=check,) + return self.__call_checker( + base_command=self.__ISORT_BASE.split(), + scope=scope, + check=check, + ) def _black(self, scope, check): - return self.__call_checker(base_command=self.__BLACK_BASE.split(), scope=scope, check=check,) + return self.__call_checker( + base_command=self.__BLACK_BASE.split(), + scope=scope, + check=check, + ) def _pass(self): self.announce(msg='\033[32mPASS\x1b[0m', level=distutils_log.INFO) diff --git a/tests/conftest.py b/tests/conftest.py index 8db3b106c..a26dab531 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -56,7 +56,9 @@ def pytest_addoption(parser): help="path to a directory with .far grammars for CPU TN/ITN tests, (DEFAULT: None, i.e. no cache)", ) parser.addoption( - '--run_audio_based', action='store_true', help="pass this argument to run audio-based TN tests", + '--run_audio_based', + action='store_true', + help="pass this argument to run audio-based TN tests", ) @@ -148,10 +150,12 @@ def pytest_configure(config): If file absent or sizes not equal, function downloads the archive from github and unpacks it. """ config.addinivalue_line( - "markers", "run_only_on(device): runs the test only on a given device [CPU | GPU]", + "markers", + "run_only_on(device): runs the test only on a given device [CPU | GPU]", ) config.addinivalue_line( - "markers", "with_downloads: runs the test using data present in tests/.data", + "markers", + "with_downloads: runs the test using data present in tests/.data", ) # Test dir and archive filepath. test_dir = join(dirname(__file__), __TEST_DATA_SUBDIR) diff --git a/tests/nemo_text_processing/ar/test_money.py b/tests/nemo_text_processing/ar/test_money.py index 6fe36ba35..2aa49ba9a 100644 --- a/tests/nemo_text_processing/ar/test_money.py +++ b/tests/nemo_text_processing/ar/test_money.py @@ -49,6 +49,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio: pred_non_deterministic = self.normalizer_with_audio.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_address.py b/tests/nemo_text_processing/en/test_address.py index c7a3523a0..ea8328d10 100644 --- a/tests/nemo_text_processing/en/test_address.py +++ b/tests/nemo_text_processing/en/test_address.py @@ -42,6 +42,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_cardinal.py b/tests/nemo_text_processing/en/test_cardinal.py index 1ee3a2a5b..f40e0d1f6 100644 --- a/tests/nemo_text_processing/en/test_cardinal.py +++ b/tests/nemo_text_processing/en/test_cardinal.py @@ -63,6 +63,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic, f"input: {test_input}" diff --git a/tests/nemo_text_processing/en/test_decimal.py b/tests/nemo_text_processing/en/test_decimal.py index ff021f72a..ea20f18d6 100644 --- a/tests/nemo_text_processing/en/test_decimal.py +++ b/tests/nemo_text_processing/en/test_decimal.py @@ -61,6 +61,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_electronic.py b/tests/nemo_text_processing/en/test_electronic.py index e8640062c..4dfec585e 100644 --- a/tests/nemo_text_processing/en/test_electronic.py +++ b/tests/nemo_text_processing/en/test_electronic.py @@ -60,6 +60,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=100, punct_post_process=False, + test_input, + n_tagged=100, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_fraction.py b/tests/nemo_text_processing/en/test_fraction.py index 764205591..a6186aabb 100644 --- a/tests/nemo_text_processing/en/test_fraction.py +++ b/tests/nemo_text_processing/en/test_fraction.py @@ -39,6 +39,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_math.py b/tests/nemo_text_processing/en/test_math.py index e2ecdebb8..22859f596 100644 --- a/tests/nemo_text_processing/en/test_math.py +++ b/tests/nemo_text_processing/en/test_math.py @@ -39,6 +39,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_measure.py b/tests/nemo_text_processing/en/test_measure.py index b03b3ff53..6ea9a0eda 100644 --- a/tests/nemo_text_processing/en/test_measure.py +++ b/tests/nemo_text_processing/en/test_measure.py @@ -61,6 +61,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_money.py b/tests/nemo_text_processing/en/test_money.py index c81945ecd..103223d5e 100644 --- a/tests/nemo_text_processing/en/test_money.py +++ b/tests/nemo_text_processing/en/test_money.py @@ -63,6 +63,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_ordinal.py b/tests/nemo_text_processing/en/test_ordinal.py index 6f87a832d..dac56bf38 100644 --- a/tests/nemo_text_processing/en/test_ordinal.py +++ b/tests/nemo_text_processing/en/test_ordinal.py @@ -61,6 +61,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_punctuation.py b/tests/nemo_text_processing/en/test_punctuation.py index 75ff2e73c..761b3c9f4 100644 --- a/tests/nemo_text_processing/en/test_punctuation.py +++ b/tests/nemo_text_processing/en/test_punctuation.py @@ -22,7 +22,11 @@ class TestPunctuation: normalizer_en = Normalizer( - input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True, + input_case='cased', + lang='en', + cache_dir=CACHE_DIR, + overwrite_cache=False, + post_process=True, ) # address is tagged by the measure class diff --git a/tests/nemo_text_processing/en/test_range.py b/tests/nemo_text_processing/en/test_range.py index ac93613be..64b47d898 100644 --- a/tests/nemo_text_processing/en/test_range.py +++ b/tests/nemo_text_processing/en/test_range.py @@ -39,6 +39,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_roman.py b/tests/nemo_text_processing/en/test_roman.py index dc9468fb3..3ef655c65 100644 --- a/tests/nemo_text_processing/en/test_roman.py +++ b/tests/nemo_text_processing/en/test_roman.py @@ -40,6 +40,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_serial.py b/tests/nemo_text_processing/en/test_serial.py index aab870abf..2a27b1f54 100644 --- a/tests/nemo_text_processing/en/test_serial.py +++ b/tests/nemo_text_processing/en/test_serial.py @@ -38,6 +38,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=-1, punct_post_process=False, + test_input, + n_tagged=-1, + punct_post_process=False, ) assert expected in pred_non_deterministic, f"input: {test_input}" diff --git a/tests/nemo_text_processing/en/test_special_text.py b/tests/nemo_text_processing/en/test_special_text.py index a461fe703..73be5d382 100644 --- a/tests/nemo_text_processing/en/test_special_text.py +++ b/tests/nemo_text_processing/en/test_special_text.py @@ -41,6 +41,8 @@ def test_norm(self, test_input, expected): # Audio-based normalization will output only options without digits if self.normalizer_with_audio_en and sum([1 for ch in expected if ch.isdigit()]) == 0: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=True, + test_input, + n_tagged=30, + punct_post_process=True, ) assert expected in pred_non_deterministic, f"input: {test_input}" diff --git a/tests/nemo_text_processing/es/test_ordinal.py b/tests/nemo_text_processing/es/test_ordinal.py index e2cd7d4a2..1a48d6da8 100644 --- a/tests/nemo_text_processing/es/test_ordinal.py +++ b/tests/nemo_text_processing/es/test_ordinal.py @@ -62,6 +62,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio: pred_non_deterministic = self.normalizer_with_audio.normalize( - test_input, n_tagged=500, punct_post_process=False, + test_input, + n_tagged=500, + punct_post_process=False, ) assert expected in pred_non_deterministic