Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ pipeline {
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-03-25-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0'
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
}
stages {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
सन्
सन
साल
10 changes: 10 additions & 0 deletions nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
में
का
की
के
से
तक
ईस्वी
शताब्दी
दशक
सदी
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ई. पू. ईसा पूर्व
ई. ईसवी
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
s सेकंड
hr घंटा
h घंटे
min मिनट
doz दर्जन
yr साल
yr वर्ष
hp हॉर्सपॉवर
d दिन
month महीना
months महीने
हफ़्ते हफ़्ते
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,16 @@ month महीना
months महीने
ct कैरेट
pH पीएच
km/h किलोमीटर प्रति घंटा
km/hr किलोमीटर प्रति घंटा
km/min किलोमीटर प्रति मिनट
m/h मीटर प्रति घंटा
m/hr मीटर प्रति घंटा
mi/s मील प्रति सेकंड
mi/h मील प्रति घंटा
mi/hr मील प्रति घंटा
mi/min मील प्रति मिनट
₹/ac रुपए प्रति एकड़
x बाई
X बाई
* बाई
- से
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
₹ रुपए
P पैसे
£ पाउंड
₩ वॉन
$ डॉलर
₺ लीरा
৳ टका
¥ येन
₦ नाइरा
€ यूरो
€ यूरो
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
रुपए पैसे
पाउंड पेंस
वॉन जिओन
डॉलर सेंट
लीरा कुरस
टका पैसे
येन सेन
नाइरा कोबो
यूरो सेंट
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,12 @@
८८ अट्ठासी
८९ नवासी
९० नब्बे
९१ इक्यानबे
९२ बानबे
९३ तिरानबे
९४ चौरानबे
९५ पंचानबे
९६ छियानबे
९७ सत्तानबे
९८ अट्ठानबे
९१ इक्यानबे
९२ बानबे
९३ तिरानबे
९४ चौरानबे
९५ पंचानबे
९६ छियानबे
९७ सत्तानबे
९८ अट्ठानबे
९९ निन्यानबे
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
० शून्य
१ एक
२ दो
३ तीन
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph):
graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 1, teens_ties)
graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 0, graph_hundreds)
graph_ten_thousands.optimize()
self.graph_ten_thousands = graph_ten_thousands

# Lakhs graph and ten lakhs graph
suffix_lakhs = pynutil.insert(" लाख")
Expand All @@ -90,6 +91,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph):
graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 1, graph_thousands)
graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 0, graph_ten_thousands)
graph_lakhs.optimize()
self.graph_lakhs = graph_lakhs

graph_ten_lakhs = create_graph_suffix(teens_and_ties, suffix_lakhs, 5)
graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 4, digit)
Expand All @@ -98,6 +100,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph):
graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 1, graph_thousands)
graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 0, graph_ten_thousands)
graph_ten_lakhs.optimize()
self.graph_ten_lakhs = graph_ten_lakhs

# Crores graph ten crores graph
suffix_crores = pynutil.insert(" करोड़")
Expand Down
57 changes: 55 additions & 2 deletions nemo_text_processing/text_normalization/hi/taggers/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,20 @@

days = pynini.string_file(get_abs_path("data/date/days.tsv"))
months = pynini.string_file(get_abs_path("data/date/months.tsv"))
year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv"))
digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv"))
teens_and_ties = pynutil.add_weight(teens_ties, -0.1)

# Read suffixes from file into a list
with open(get_abs_path("data/date/suffixes.tsv"), "r", encoding="utf-8") as f:
suffixes_list = f.read().splitlines()
with open(get_abs_path("data/date/prefixes.tsv"), "r", encoding="utf-8") as f:
prefixes_list = f.read().splitlines()

# Create union of suffixes and prefixes
suffix_union = pynini.union(*suffixes_list)
prefix_union = pynini.union(*prefixes_list)


class DateFst(GraphFst):
Expand All @@ -51,10 +65,15 @@
(NEMO_HI_DIGIT + NEMO_HI_NON_ZERO + NEMO_HI_DIGIT + NEMO_HI_DIGIT), cardinal.graph_hundreds_as_thousand
)

cardinal_graph = (
digit | teens_and_ties | cardinal.graph_hundreds | graph_year_thousands | graph_year_hundreds_as_thousands
)

graph_year = graph_year_thousands | graph_year_hundreds_as_thousands

delete_dash = pynutil.delete("-")
delete_slash = pynutil.delete("/")
delete_comma = pynutil.delete(",")

days_graph = pynutil.insert("day: \"") + days + pynutil.insert("\"") + insert_space

Expand All @@ -68,6 +87,22 @@

graph_mm_dd += pynutil.insert(" preserve_order: true ")

# Graph for era
era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space

range_graph = pynini.cross("-", "से")

# Graph for year
century_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.accep("वीं")
century_text = pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space

# Updated logic to use suffix_union
year_number = graph_year + suffix_union
year_text = pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space

# Updated logic to use prefix_union
year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"")

graph_dd_mm_yyyy = (
days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph
)
Expand All @@ -78,7 +113,20 @@

graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ")

graph_mm_yyyy = months_graph + delete_dash + years_graph
graph_mm_yyyy = months_graph + delete_dash + insert_space + years_graph

graph_year_suffix = era_graph

graph_range = (
pynutil.insert("era: \"")
+ cardinal_graph
+ insert_space
+ range_graph
+ insert_space
+ cardinal_graph
+ pynutil.insert("\"")
+ pynutil.insert(" preserve_order: true ")
)

# default assume dd_mm_yyyy

Expand All @@ -87,7 +135,12 @@
| graph_mm_dd
| pynutil.add_weight(graph_dd_mm_yyyy, -0.001)
| graph_mm_dd_yyyy
| graph_mm_yyyy
| pynutil.add_weight(graph_mm_yyyy, -0.2)
| pynutil.add_weight(graph_year_suffix, -0.001)
| pynutil.add_weight(graph_range, -0.005)
| pynutil.add_weight(century_text, -0.001)
| pynutil.add_weight(year_text, -0.001)
| pynutil.add_weight(year_prefix, -0.009)
)

self.final_graph = final_graph.optimize()
Expand Down
87 changes: 79 additions & 8 deletions nemo_text_processing/text_normalization/hi/taggers/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
from nemo_text_processing.text_normalization.hi.utils import get_abs_path


digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv"))
teens_and_ties = pynutil.add_weight(teens_ties, -0.1)


class MeasureFst(GraphFst):
"""
Finite state transducer for classifying measure, suppletive aware, e.g.
Expand All @@ -35,39 +40,105 @@ class MeasureFst(GraphFst):
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
super().__init__(name="measure", kind="classify")

cardinal_graph = cardinal.final_graph
decimal_graph = decimal.final_graph_wo_negative
cardinal_graph = (
digit
| teens_and_ties
| cardinal.graph_hundreds
| cardinal.graph_thousands
| cardinal.graph_ten_thousands
| cardinal.graph_lakhs
| cardinal.graph_ten_lakhs
)
point = pynutil.delete(".")
decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional
unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv"))
quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv"))

optional_graph_negative = pynini.closure(
pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space,
0,
1,
)

# Define the quarterly measurements
quarter = pynini.string_map(
[
(".५", "साढ़े"),
("१.५", "डेढ़"),
("२.५", "ढाई"),
]
)
quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\"")

# Define the unit handling
self.unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ")
unit = pynutil.insert(" units: \"") + unit_graph + pynutil.insert("\" ")
units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ")

# Handling symbols like x, X, *
symbol_graph = pynini.string_map(
[
("x", "बाई"),
("X", "बाई"),
("*", "बाई"),
]
)

graph_measurements = (
graph_decimal = (
pynutil.insert("decimal { ")
+ optional_graph_negative
+ decimal_graph
+ pynutil.insert(" }")
+ delete_space
+ self.unit
+ unit
)
graph_measurements |= (

graph_quarter = (
pynutil.insert("cardinal { ")
+ optional_graph_negative
+ quarter_graph
+ pynutil.insert(" }")
+ delete_space
+ units
)

graph_cardinal = (
pynutil.insert("cardinal { ")
+ optional_graph_negative
+ pynutil.insert("integer: \"")
+ cardinal_graph
+ pynutil.insert("\"")
+ pynutil.insert(" }")
+ delete_space
+ self.unit
+ unit
)

graph = graph_measurements
# Handling cardinal clubbed with symbol as single token
graph_exceptions = (
pynutil.insert("cardinal { ")
+ optional_graph_negative
+ pynutil.insert("integer: \"")
+ cardinal_graph
+ pynutil.insert("\"")
+ pynutil.insert(" }")
+ pynutil.insert(" units: \"")
+ symbol_graph
+ pynutil.insert("\" ")
+ pynutil.insert("} }")
+ insert_space
+ pynutil.insert("tokens { cardinal { ")
+ optional_graph_negative
+ pynutil.insert("integer: \"")
+ cardinal_graph
+ pynutil.insert("\"")
)

graph = (
pynutil.add_weight(graph_decimal, 0.01)
| pynutil.add_weight(graph_quarter, 0.005)
| pynutil.add_weight(graph_cardinal, 0.01)
| pynutil.add_weight(graph_exceptions, 0.01)
)
self.graph = graph.optimize()

final_graph = self.add_tokens(graph)
Expand Down
Loading
Loading