Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
हफ़्ते
सप्ताह
सदियां
सदियों

Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,4 @@ hp हॉर्सपॉवर
d दिन
month महीना
months महीने
हफ़्ते
सप्ताह
सदियां
सदियों

Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
१ला पहला
१ली पहली
२रा दूसरा
२री दूसरी
३रा तीसरा
३री तीसरी
४था चौथा
४थी चौथी
५वां पाँचवां
५वीं पाँचवीं
६ठा छठा
६ठी छठी
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
वां
वीं
वें
वे वें
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
वे वें

7 changes: 7 additions & 0 deletions nemo_text_processing/text_normalization/hi/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@
# Devanagari digit inventories: all ten digits, the nine non-zero digits, and zero.
NEMO_HI_DIGIT = pynini.union("०", "१", "२", "३", "४", "५", "६", "७", "८", "९").optimize()
NEMO_HI_NON_ZERO = pynini.union("१", "२", "३", "४", "५", "६", "७", "८", "९").optimize()
NEMO_HI_ZERO = "०"

# Hindi quantity words for halves and quarters. They are defined once here so
# that every grammar importing them verbalizes these quantities identically.
HI_DEDH = "डेढ़" # stands alone for one and a half (1.5)
HI_DHAI = "ढाई" # stands alone for two and a half (2.5)
HI_SAVVA = "सवा" # prefix: a quarter more than the number that follows (N + 0.25)
HI_SADHE = "साढ़े" # prefix: a half more than the number that follows (N + 0.5)
HI_PAUNE = "पौने" # prefix: a quarter less than the number that follows; used for N.75 inputs

# ASCII letter classes: lowercase, uppercase, and their union.
NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
Expand Down
15 changes: 6 additions & 9 deletions nemo_text_processing/text_normalization/hi/taggers/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,11 @@ def __init__(self, cardinal: GraphFst):
(NEMO_HI_DIGIT + NEMO_HI_NON_ZERO + NEMO_HI_DIGIT + NEMO_HI_DIGIT), cardinal.graph_hundreds_as_thousand
)

cardinal_graph = (
digit | teens_and_ties | cardinal.graph_hundreds | graph_year_thousands | graph_year_hundreds_as_thousands
cardinal_graph = pynini.union(
digit, teens_and_ties, cardinal.graph_hundreds, graph_year_thousands, graph_year_hundreds_as_thousands
)

graph_year = graph_year_thousands | graph_year_hundreds_as_thousands
graph_year = pynini.union(graph_year_thousands, graph_year_hundreds_as_thousands)

delete_dash = pynutil.delete("-")
delete_slash = pynutil.delete("/")
Expand Down Expand Up @@ -102,13 +102,10 @@ def __init__(self, cardinal: GraphFst):
# Updated logic to use prefix_union
year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"")

graph_dd_mm_yyyy = (
days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph
)
delete_separator = pynini.union(delete_dash, delete_slash)
graph_dd_mm_yyyy = days_graph + delete_separator + months_graph + delete_separator + years_graph

graph_mm_dd_yyyy = (
months_graph + (delete_dash | delete_slash) + days_graph + (delete_dash | delete_slash) + years_graph
)
graph_mm_dd_yyyy = months_graph + delete_separator + days_graph + delete_separator + years_graph

graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ")

Expand Down
69 changes: 53 additions & 16 deletions nemo_text_processing/text_normalization/hi/taggers/fraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,21 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst
from nemo_text_processing.text_normalization.hi.graph_utils import (
HI_DEDH,
HI_DHAI,
HI_PAUNE,
HI_SADHE,
HI_SAVVA,
NEMO_SPACE,
GraphFst,
)
from nemo_text_processing.text_normalization.hi.utils import get_abs_path

# Fraction strings written with Devanagari digits. They are matched on the
# input side of the grammar to detect the half / quarter special cases.
HI_ONE_HALF = "१/२" # 1/2
HI_ONE_QUARTER = "१/४" # 1/4
HI_THREE_QUARTERS = "३/४" # 3/4


class FractionFst(GraphFst):
"""
Expand All @@ -40,37 +52,62 @@ def __init__(self, cardinal, deterministic: bool = True):
cardinal_graph = cardinal.final_graph

self.optional_graph_negative = pynini.closure(
pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + pynutil.insert(NEMO_SPACE), 0, 1
)
self.integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
self.numerator = (
pynutil.insert("numerator: \"") + cardinal_graph + pynini.cross(pynini.union("/", " / "), "\" ")
pynutil.insert("numerator: \"")
+ cardinal_graph
+ pynini.cross(pynini.union("/", NEMO_SPACE + "/" + NEMO_SPACE), "\"")
+ pynutil.insert(NEMO_SPACE)
)
self.denominator = pynutil.insert("denominator: \"") + cardinal_graph + pynutil.insert("\"")

dedh_dhai_graph = pynini.string_map([("१ १/२", "डेढ़"), ("२ १/२", "ढाई")])
dedh_dhai_graph = pynini.string_map(
[("१" + NEMO_SPACE + HI_ONE_HALF, HI_DEDH), ("२" + NEMO_SPACE + HI_ONE_HALF, HI_DHAI)]
)

savva_numbers = cardinal_graph + pynini.cross(" १/४", "")
savva_graph = pynutil.insert("सवा ") + savva_numbers
savva_numbers = cardinal_graph + pynini.cross(NEMO_SPACE + HI_ONE_QUARTER, "")
savva_graph = pynutil.insert(HI_SAVVA) + pynutil.insert(NEMO_SPACE) + savva_numbers

sadhe_numbers = cardinal_graph + pynini.cross(" १/२", "")
sadhe_graph = pynutil.insert("साढ़े ") + sadhe_numbers
sadhe_numbers = cardinal_graph + pynini.cross(NEMO_SPACE + HI_ONE_HALF, "")
sadhe_graph = pynutil.insert(HI_SADHE) + pynutil.insert(NEMO_SPACE) + sadhe_numbers

paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv"))
paune_numbers = paune + pynini.cross(" ३/४", "")
paune_graph = pynutil.insert("पौने ") + paune_numbers

graph_dedh_dhai = pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph + pynutil.insert("\" ")
paune_numbers = paune + pynini.cross(NEMO_SPACE + HI_THREE_QUARTERS, "")
paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers

graph_dedh_dhai = (
pynutil.insert("morphosyntactic_features: \"")
+ dedh_dhai_graph
+ pynutil.insert("\"")
+ pynutil.insert(NEMO_SPACE)
)

graph_savva = pynutil.insert("morphosyntactic_features: \"") + savva_graph + pynutil.insert("\" ")
graph_savva = (
pynutil.insert("morphosyntactic_features: \"")
+ savva_graph
+ pynutil.insert("\"")
+ pynutil.insert(NEMO_SPACE)
)

graph_sadhe = pynutil.insert("morphosyntactic_features: \"") + sadhe_graph + pynutil.insert("\" ")
graph_sadhe = (
pynutil.insert("morphosyntactic_features: \"")
+ sadhe_graph
+ pynutil.insert("\"")
+ pynutil.insert(NEMO_SPACE)
)

graph_paune = pynutil.insert("morphosyntactic_features: \"") + paune_graph + pynutil.insert("\" ")
graph_paune = (
pynutil.insert("morphosyntactic_features: \"")
+ paune_graph
+ pynutil.insert("\"")
+ pynutil.insert(NEMO_SPACE)
)

final_graph = (
self.optional_graph_negative
+ pynini.closure(self.integer + pynini.accep(" "), 0, 1)
+ pynini.closure(self.integer + pynini.accep(NEMO_SPACE), 0, 1)
+ self.numerator
+ self.denominator
)
Expand Down
109 changes: 82 additions & 27 deletions nemo_text_processing/text_normalization/hi/taggers/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,24 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, delete_space, insert_space
from nemo_text_processing.text_normalization.hi.graph_utils import (
HI_DEDH,
HI_DHAI,
HI_PAUNE,
HI_SADHE,
HI_SAVVA,
NEMO_SPACE,
GraphFst,
delete_space,
insert_space,
)
from nemo_text_processing.text_normalization.hi.utils import get_abs_path

# Decimal strings written with Devanagari digits. They are matched on the
# input side of the grammar to detect the half / quarter special cases.
HI_POINT_FIVE = ".५" # .5
HI_ONE_POINT_FIVE = "१.५" # 1.5
HI_TWO_POINT_FIVE = "२.५" # 2.5
HI_DECIMAL_25 = ".२५" # .25
HI_DECIMAL_75 = ".७५" # .75

digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv"))
Expand Down Expand Up @@ -54,7 +69,11 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional
unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv"))
quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv"))

# Load quarterly units from separate files: map (FST) and list (FSA)
quarterly_units_map = pynini.string_file(get_abs_path("data/measure/quarterly_units_map.tsv"))
quarterly_units_list = pynini.string_file(get_abs_path("data/measure/quarterly_units_list.tsv"))
quarterly_units_graph = pynini.union(quarterly_units_map, quarterly_units_list)

optional_graph_negative = pynini.closure(
pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space,
Expand All @@ -65,16 +84,28 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
# Define the quarterly measurements
quarter = pynini.string_map(
[
(".५", "साढ़े"),
("१.५", "डेढ़"),
("२.५", "ढाई"),
(HI_POINT_FIVE, HI_SADHE),
(HI_ONE_POINT_FIVE, HI_DEDH),
(HI_TWO_POINT_FIVE, HI_DHAI),
]
)
quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\"")

# Define the unit handling
unit = pynutil.insert(" units: \"") + unit_graph + pynutil.insert("\" ")
units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ")
unit = (
pynutil.insert(NEMO_SPACE)
+ pynutil.insert("units: \"")
+ unit_graph
+ pynutil.insert("\"")
+ pynutil.insert(NEMO_SPACE)
)
units = (
pynutil.insert(NEMO_SPACE)
+ pynutil.insert("units: \"")
+ quarterly_units_graph
+ pynutil.insert("\"")
+ pynutil.insert(NEMO_SPACE)
)

# Handling symbols like x, X, *
symbol_graph = pynini.string_map(
Expand All @@ -94,24 +125,43 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
+ unit
)

dedh_dhai = pynini.string_map([("१.५", "डेढ़"), ("२.५", "ढाई")])
dedh_dhai = pynini.string_map([(HI_ONE_POINT_FIVE, HI_DEDH), (HI_TWO_POINT_FIVE, HI_DHAI)])
dedh_dhai_graph = pynutil.insert("integer: \"") + dedh_dhai + pynutil.insert("\"")

savva_numbers = cardinal_graph + pynini.cross(".२५", "")
savva_graph = pynutil.insert("integer: \"सवा ") + savva_numbers + pynutil.insert("\"")
savva_numbers = cardinal_graph + pynini.cross(HI_DECIMAL_25, "")
savva_graph = (
pynutil.insert("integer: \"")
+ pynutil.insert(HI_SAVVA)
+ pynutil.insert(NEMO_SPACE)
+ savva_numbers
+ pynutil.insert("\"")
)

sadhe_numbers = cardinal_graph + pynini.cross(".५", "")
sadhe_graph = pynutil.insert("integer: \"साढ़े ") + sadhe_numbers + pynutil.insert("\"")
sadhe_numbers = cardinal_graph + pynini.cross(HI_POINT_FIVE, "")
sadhe_graph = (
pynutil.insert("integer: \"")
+ pynutil.insert(HI_SADHE)
+ pynutil.insert(NEMO_SPACE)
+ sadhe_numbers
+ pynutil.insert("\"")
)

paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv"))
paune_numbers = paune + pynini.cross(".७५", "")
paune_graph = pynutil.insert("integer: \"पौने ") + paune_numbers + pynutil.insert("\"")
paune_numbers = paune + pynini.cross(HI_DECIMAL_75, "")
paune_graph = (
pynutil.insert("integer: \"")
+ pynutil.insert(HI_PAUNE)
+ pynutil.insert(NEMO_SPACE)
+ paune_numbers
+ pynutil.insert("\"")
)

graph_dedh_dhai = (
pynutil.insert("cardinal { ")
+ optional_graph_negative
+ dedh_dhai_graph
+ pynutil.insert(" }")
+ pynutil.insert(NEMO_SPACE)
+ pynutil.insert("}")
+ delete_space
+ units
)
Expand All @@ -120,7 +170,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
pynutil.insert("cardinal { ")
+ optional_graph_negative
+ savva_graph
+ pynutil.insert(" }")
+ pynutil.insert(NEMO_SPACE)
+ pynutil.insert("}")
+ delete_space
+ units
)
Expand All @@ -129,7 +180,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
pynutil.insert("cardinal { ")
+ optional_graph_negative
+ sadhe_graph
+ pynutil.insert(" }")
+ pynutil.insert(NEMO_SPACE)
+ pynutil.insert("}")
+ delete_space
+ units
)
Expand All @@ -149,7 +201,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
+ pynutil.insert("integer: \"")
+ cardinal_graph
+ pynutil.insert("\"")
+ pynutil.insert(" }")
+ pynutil.insert(NEMO_SPACE)
+ pynutil.insert("}")
+ delete_space
+ unit
)
Expand All @@ -162,9 +215,11 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
+ cardinal_graph
+ pynutil.insert("\"")
+ pynutil.insert(" }")
+ pynutil.insert(" units: \"")
+ pynutil.insert(NEMO_SPACE)
+ pynutil.insert("units: \"")
+ symbol_graph
+ pynutil.insert("\" ")
+ pynutil.insert("\"")
+ pynutil.insert(NEMO_SPACE)
+ pynutil.insert("} }")
+ insert_space
+ pynutil.insert("tokens { cardinal { ")
Expand All @@ -175,13 +230,13 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
)

graph = (
pynutil.add_weight(graph_decimal, 0.01)
| pynutil.add_weight(graph_cardinal, 0.01)
| pynutil.add_weight(graph_exceptions, 0.01)
| pynutil.add_weight(graph_dedh_dhai, 0.001)
| pynutil.add_weight(graph_savva, 0.005)
| pynutil.add_weight(graph_sadhe, 0.005)
| pynutil.add_weight(graph_paune, -0.2)
pynutil.add_weight(graph_decimal, 0.1)
| pynutil.add_weight(graph_cardinal, 0.1)
| pynutil.add_weight(graph_exceptions, 0.1)
| pynutil.add_weight(graph_dedh_dhai, -0.2)
| pynutil.add_weight(graph_savva, -0.1)
| pynutil.add_weight(graph_sadhe, -0.1)
| pynutil.add_weight(graph_paune, -0.5)
)
self.graph = graph.optimize()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,14 @@ class OrdinalFst(GraphFst):
def __init__(self, cardinal: CardinalFst, deterministic: bool = True):
super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

suffixes_fst = pynini.string_file(get_abs_path("data/ordinal/suffixes.tsv"))
suffixes_list = pynini.string_file(get_abs_path("data/ordinal/suffixes.tsv"))
suffixes_map = pynini.string_file(get_abs_path("data/ordinal/suffixes_map.tsv"))
suffixes_fst = pynini.union(suffixes_list, suffixes_map)
exceptions = pynini.string_file(get_abs_path("data/ordinal/exceptions.tsv"))

graph = cardinal.final_graph + suffixes_fst
exceptions = pynutil.add_weight(exceptions, -0.1)
graph = pynini.union(exceptions, graph)

final_graph = pynutil.insert("integer: \"") + graph + pynutil.insert("\"")
final_graph = self.add_tokens(final_graph)
Expand Down
Loading