Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ ci:

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0
+    rev: v5.0.0
hooks:
- id: check-yaml
- id: check-case-conflict
Expand All @@ -37,15 +37,15 @@ repos:
- --select=W605

- repo: https://github.com/PyCQA/isort
-    rev: 5.12.0
+    rev: 5.13.2
hooks:
- id: isort
name: Format imports
args: [ --multi-line=3, --trailing-comma, --force-grid-wrap=0, --use-parentheses, --line-width=119, -rc, -ws ]
exclude: docs/

- repo: https://github.com/psf/black
-    rev: 19.10b0
+    rev: 24.10.0
hooks:
- id: black
name: Format code
Expand Down
15 changes: 13 additions & 2 deletions nemo_text_processing/hybrid/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,11 @@ def _relax_diff(text):
return acceptable


- def get_labels(targets: List[str], norm_texts_weights: List[Tuple[str, str]], lang="en",) -> List[List[str]]:
+ def get_labels(
+     targets: List[str],
+     norm_texts_weights: List[Tuple[str, str]],
+     lang="en",
+ ) -> List[List[str]]:
"""
Assign labels to generated normalization options (1 - for ground truth, 0 - other options)
Args:
Expand Down Expand Up @@ -605,7 +609,14 @@ def print_df(df):
prints data frame
"""
with pd.option_context(
- "display.max_rows", None, "display.max_columns", None, "display.width", 1000, "display.max_colwidth", 400,
+ "display.max_rows",
+ None,
+ "display.max_columns",
+ None,
+ "display.width",
+ 1000,
+ "display.max_colwidth",
+ 400,
):
print(df)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ def __init__(self, tn_cardinal):
self.graph = pynini.invert(tn_cardinal.cardinal_numbers).optimize()

optional_minus_graph = pynini.closure(
- pynutil.insert("negative: ") + pynini.cross("سالب", '"-"') + NEMO_SPACE, 0, 1,
+ pynutil.insert("negative: ") + pynini.cross("سالب", '"-"') + NEMO_SPACE,
+ 0,
+ 1,
)

final_graph = optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ def __init__(self, tn_decimal):
super().__init__(name="decimal", kind="classify")

optional_graph_negative = pynini.closure(
- pynutil.insert("negative: ") + pynini.cross("سالب", '"true"') + delete_extra_space, 0, 1,
+ pynutil.insert("negative: ") + pynini.cross("سالب", '"true"') + delete_extra_space,
+ 0,
+ 1,
)

graph_fractional_part = pynini.invert(tn_decimal.graph_fractional).optimize()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,10 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
+ graph_in_thousands
)

- graph = pynini.union((graph_int | graph_ind) + delete_space + graph_hundreds, graph_zero,)
+ graph = pynini.union(
+     (graph_int | graph_ind) + delete_space + graph_hundreds,
+     graph_zero,
+ )

graph = graph @ pynini.union(
pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,11 @@ def __init__(self, ordinal: GraphFst, input_case: str):
+ pynutil.add_weight(year_graph, -YEAR_WEIGHT)
+ pynutil.insert("\"")
)
- optional_graph_year = pynini.closure(graph_year, 0, 1,)
+ optional_graph_year = pynini.closure(
+     graph_year,
+     0,
+     1,
+ )
graph_mdy = month_graph + (
(delete_extra_space + day_graph) | graph_year | (delete_extra_space + day_graph + graph_year)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):
point = pynutil.delete("point")

optional_graph_negative = pynini.closure(
- pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1,
+ pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space,
+ 0,
+ 1,
)

graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
delete_extra_space
+ url_symbols
+ delete_extra_space
- + (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username)
+ + (
+     domain
+     | pynini.closure(
+         accepted_username + delete_extra_space,
+     )
+     + accepted_username
+ )
)

protocol_default = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU
graph_unit_plural = pynini.compose(casing_graph, graph_unit_plural).optimize()

optional_graph_negative = pynini.closure(
- pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1,
+ pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space,
+ 0,
+ 1,
)

unit_singular = convert_space(graph_unit_singular)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU
# "one fifty" -> "one hundred fifty"
with_hundred = pynini.compose(
pynini.closure(NEMO_NOT_SPACE) + pynini.accep(" ") + pynutil.insert("hundred ") + NEMO_SIGMA,
- pynini.compose(cardinal_graph, NEMO_DIGIT ** 3),
+ pynini.compose(cardinal_graph, NEMO_DIGIT**3),
)
cardinal_graph |= with_hundred
graph_decimal_final = decimal.final_graph_wo_negative
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def get_serial_number(cardinal):
"""

digit = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT)
- two_digit = pynutil.add_weight(pynini.compose(cardinal.graph_two_digit, NEMO_DIGIT ** 2), 0.002)
+ two_digit = pynutil.add_weight(pynini.compose(cardinal.graph_two_digit, NEMO_DIGIT**2), 0.002)
character = digit | two_digit | NEMO_ALPHA
sequence = (NEMO_LOWER_NOT_A | digit) + pynini.closure(pynutil.delete(" ") + character, 2)
sequence |= character + pynini.closure(pynutil.delete(" ") + (digit | NEMO_ALPHA), 2)
Expand Down Expand Up @@ -116,7 +116,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):
triple_digit.invert()

# to handle cases like "one twenty three"
- two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT ** 2)
+ two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT**2)
double_digit_to_digit = (
pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit) | two_digit_cardinal
)
Expand All @@ -139,7 +139,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):

number_part = pynini.compose(
single_double_or_triple_digit,
- NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 4,
+ NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**4,
).optimize()
number_part = pynutil.insert("number_part: \"") + number_part.optimize() + pynutil.insert("\"")

Expand All @@ -156,24 +156,24 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):
graph = optional_country_code + number_part

# credit card number
- space_four_digits = insert_space + NEMO_DIGIT ** 4
+ space_four_digits = insert_space + NEMO_DIGIT**4
space_five_digits = space_four_digits + NEMO_DIGIT
space_six_digits = space_five_digits + NEMO_DIGIT
credit_card_graph = pynini.compose(
single_double_or_triple_digit,
- NEMO_DIGIT ** 4 + (space_six_digits | (space_four_digits ** 2)) + space_four_digits,
+ NEMO_DIGIT**4 + (space_six_digits | (space_four_digits**2)) + space_four_digits,
).optimize()

credit_card_graph |= pynini.compose(
- single_double_or_triple_digit, NEMO_DIGIT ** 4 + space_six_digits + space_five_digits
+ single_double_or_triple_digit, NEMO_DIGIT**4 + space_six_digits + space_five_digits
).optimize()

graph |= pynutil.insert("number_part: \"") + credit_card_graph.optimize() + pynutil.insert("\"")

# SSN
ssn_graph = pynini.compose(
single_double_or_triple_digit,
- NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 2 + pynutil.insert("-") + NEMO_DIGIT ** 4,
+ NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**2 + pynutil.insert("-") + NEMO_DIGIT**4,
).optimize()
graph |= pynutil.insert("number_part: \"") + ssn_graph.optimize() + pynutil.insert("\"")

Expand Down
22 changes: 20 additions & 2 deletions nemo_text_processing/inverse_text_normalization/en/taggers/time.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,32 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
graph_minute_double = pynini.union(*labels_minute_double) @ cardinal

graph_minute_verbose = pynini.cross("half", "30") | pynini.cross("quarter", "15")
- oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock", "hundred hours",), "",)
+ oclock = pynini.cross(
+     pynini.union(
+         "o' clock",
+         "o clock",
+         "o'clock",
+         "oclock",
+         "hundred hours",
+     ),
+     "",
+ )

if input_case == INPUT_CASED:
minute_to_graph = capitalized_input_graph(minute_to_graph)
graph_minute_single = capitalized_input_graph(graph_minute_single)
graph_minute_double = capitalized_input_graph(graph_minute_double)
graph_minute_verbose |= pynini.cross("Half", "30") | pynini.cross("Quarter", "15")
- oclock |= pynini.cross(pynini.union("O' clock", "O clock", "O'clock", "Oclock", "Hundred hours",), "",)
+ oclock |= pynini.cross(
+     pynini.union(
+         "O' clock",
+         "O clock",
+         "O'clock",
+         "Oclock",
+         "Hundred hours",
+     ),
+     "",
+ )

final_graph_hour = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"")
graph_minute = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,18 +160,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
self.graph_no_exception = graph.optimize()

# save self.numbers_up_to_thousand for use in DecimalFst
- digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3)
+ digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3)
numbers_up_to_thousand = pynini.compose(self.graph_no_exception, digits_up_to_thousand).optimize()
self.numbers_up_to_thousand = numbers_up_to_thousand.optimize()

# save self.numbers_up_to_million for use in DecimalFst
digits_up_to_million = (
- NEMO_DIGIT
- | (NEMO_DIGIT ** 2)
- | (NEMO_DIGIT ** 3)
- | (NEMO_DIGIT ** 4)
- | (NEMO_DIGIT ** 5)
- | (NEMO_DIGIT ** 6)
+ NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6)
)
numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize()
self.numbers_up_to_million = numbers_up_to_million.optimize()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
delete_extra_space
+ symbols
+ delete_extra_space
- + (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username)
+ + (
+     domain
+     | pynini.closure(
+         accepted_username + delete_extra_space,
+     )
+     + accepted_username
+ )
)

protocol_default = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,13 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):

full_graph_ties = graph_ties | (graph_ties + pynini.cross(" ", "y") + graph_digit)

- ordinal_graph_union = pynini.union(graph_digit, graph_teens, graph_twenties, full_graph_ties, graph_hundreds,)
+ ordinal_graph_union = pynini.union(
+     graph_digit,
+     graph_teens,
+     graph_twenties,
+     full_graph_ties,
+     graph_hundreds,
+ )

accept_o_endings = NEMO_SIGMA + pynini.accep("o")
accept_a_endings = NEMO_SIGMA + pynini.accep("a")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,10 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
# Denormalized phone numbers are grouped in sets of 3 or 4 digits
group_of_two = pynini.union(doubled_digit, digit_twice, double_digits)

- group_of_three = pynini.union(tripled_digit, single_digits + pynutil.delete(" ") + group_of_two,)
+ group_of_three = pynini.union(
+     tripled_digit,
+     single_digits + pynutil.delete(" ") + group_of_two,
+ )

group_of_four = pynini.union(
group_of_two + pynutil.delete(" ") + group_of_two,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,18 +248,13 @@ def __init__(self):
self.graph_no_exception = graph.optimize()

# save self.numbers_up_to_thousand for use in DecimalFst
- digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3)
+ digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3)
numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize()
self.numbers_up_to_thousand = numbers_up_to_thousand

# save self.numbers_up_to_million for use in DecimalFst
digits_up_to_million = (
- NEMO_DIGIT
- | (NEMO_DIGIT ** 2)
- | (NEMO_DIGIT ** 3)
- | (NEMO_DIGIT ** 4)
- | (NEMO_DIGIT ** 5)
- | (NEMO_DIGIT ** 6)
+ NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6)
)
numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize()
self.numbers_up_to_million = numbers_up_to_million
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ def __init__(self, cardinal: GraphFst):
day_graph = self.cardinal | pynini.cross("premier", "1") # Premier is only ordinal used for dates
day_graph = pynutil.insert("day: \"") + day_graph + pynutil.insert("\"")
optional_graph_year = pynini.closure(
- delete_extra_space + pynutil.insert("year: \"") + year_graph + pynutil.insert("\""), 0, 1,
+ delete_extra_space + pynutil.insert("year: \"") + year_graph + pynutil.insert("\""),
+ 0,
+ 1,
)
graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(self):
super().__init__(name="decimal", kind="verbalize")

# Need parser to group digits by threes
- exactly_three_digits = NEMO_DIGIT ** 3
+ exactly_three_digits = NEMO_DIGIT**3
at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3)

space_every_three_integer = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,12 @@ def __init__(self):
graph_roman_hundreds = pynini.string_file(get_abs_path("data/roman/hundreds_large.tsv")).invert()
graph_roman_zero_digit = pynutil.delete("0")

- graph_roman_hundreds = NEMO_DIGIT ** 3 @ (
+ graph_roman_hundreds = NEMO_DIGIT**3 @ (
graph_roman_hundreds
+ pynini.union(graph_roman_ties, graph_roman_zero_digit)
+ pynini.union(graph_roman_digits, graph_roman_zero_digit)
)
- graph_roman_ties = NEMO_DIGIT ** 2 @ (
+ graph_roman_ties = NEMO_DIGIT**2 @ (
graph_roman_ties + pynini.union(graph_roman_digits, graph_roman_zero_digit)
)
graph_roman_digits = NEMO_DIGIT @ graph_roman_digits
Expand Down
Loading
Loading