Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
7ede1d1
Future Implementations for classes - Measure, Money, and Date
ngachchi Jan 24, 2025
d3ac9f0
Resolved the conflicts with mm_yyyy and date ranges and added the pre…
ngachchi Jan 27, 2025
5c67c52
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2025
767b56e
removed the unused empty string implementation
ngachchi Jan 28, 2025
a2be3f9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 28, 2025
a5a6800
minor fixes for the tagger files
ngachchi Jan 29, 2025
0b840db
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 29, 2025
dbd2cb3
reformatted decimal final graph
ngachchi Feb 12, 2025
2f9564d
incorporated the suggestion for decimal graph
ngachchi Feb 13, 2025
b3dc83a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 13, 2025
8cab492
Century implementations
ngachchi Mar 5, 2025
9143b0b
Working on the yyyy format for the date class
ngachchi Mar 10, 2025
d8ea246
reverted yyyy code
ngachchi Mar 12, 2025
751f7ff
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 12, 2025
a76a6ac
working on future implementations
ngachchi Mar 17, 2025
95c2237
working on improving the date class accuracy
ngachchi Mar 18, 2025
6736fe7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 18, 2025
3376e15
added year prefix for the date class
ngachchi Mar 20, 2025
8d3db99
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 20, 2025
89fcd6c
working on the comma cases for date class
ngachchi Mar 20, 2025
ea26dd3
minor fixes
ngachchi Mar 21, 2025
60d7fe3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 21, 2025
6a2b8c2
implemented mixed fractions
ngachchi Mar 25, 2025
15c411e
rectified the test case
ngachchi Mar 25, 2025
6d36734
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 25, 2025
6e724fe
working on quarterly measurements
ngachchi Apr 3, 2025
990c25a
reformatted the prefixes and suffixes for date tagger class
ngachchi Apr 17, 2025
4c3f426
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 17, 2025
7fe10cc
replaced text tag with era tag for the date class
ngachchi Apr 21, 2025
f0dcc0b
Removed the text tag reference from date class verbalizer
ngachchi Apr 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ pipeline {
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-03-25-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-12-25-0'
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
}
stages {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
सन्
सन
साल
10 changes: 10 additions & 0 deletions nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
में
का
की
के
से
तक
ईस्वी
शताब्दी
दशक
सदी
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ई. पू. ईसा पूर्व
ई. ईसवी
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
s सेकंड
hr घंटा
h घंटे
min मिनट
doz दर्जन
yr साल
yr वर्ष
hp हॉर्सपॉवर
d दिन
month महीना
months महीने
हफ़्ते हफ़्ते
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,16 @@ month महीना
months महीने
ct कैरेट
pH पीएच
km/h किलोमीटर प्रति घंटा
km/hr किलोमीटर प्रति घंटा
km/min किलोमीटर प्रति मिनट
m/h मीटर प्रति घंटा
m/hr मीटर प्रति घंटा
mi/s मील प्रति सेकंड
mi/h मील प्रति घंटा
mi/hr मील प्रति घंटा
mi/min मील प्रति मिनट
₹/ac रुपए प्रति एकड़
x बाई
X बाई
* बाई
- से
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
₹ रुपए
P पैसे
£ पाउंड
₩ वॉन
$ डॉलर
₺ लीरा
৳ टका
¥ येन
₦ नाइरा
€ यूरो
€ यूरो
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
रुपए पैसे
पाउंड पेंस
वॉन जिओन
डॉलर सेंट
लीरा कुरस
टका पैसे
येन सेन
नाइरा कोबो
यूरो सेंट
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,12 @@
८८ अट्ठासी
८९ नवासी
९० नब्बे
९१ इक्यानबे
९२ बानबे
९३ तिरानबे
९४ चौरानबे
९५ पंचानबे
९६ छियानबे
९७ सत्तानबे
९८ अट्ठानबे
९१ इक्यानबे
९२ बानबे
९३ तिरानबे
९४ चौरानबे
९५ पंचानबे
९६ छियानबे
९७ सत्तानबे
९८ अट्ठानबे
९९ निन्यानबे
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
० शून्य
१ एक
२ दो
३ तीन
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph):
graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 1, teens_ties)
graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 0, graph_hundreds)
graph_ten_thousands.optimize()
self.graph_ten_thousands = graph_ten_thousands

# Lakhs graph and ten lakhs graph
suffix_lakhs = pynutil.insert(" लाख")
Expand All @@ -90,6 +91,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph):
graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 1, graph_thousands)
graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 0, graph_ten_thousands)
graph_lakhs.optimize()
self.graph_lakhs = graph_lakhs

graph_ten_lakhs = create_graph_suffix(teens_and_ties, suffix_lakhs, 5)
graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 4, digit)
Expand All @@ -98,6 +100,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph):
graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 1, graph_thousands)
graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 0, graph_ten_thousands)
graph_ten_lakhs.optimize()
self.graph_ten_lakhs = graph_ten_lakhs

# Crores graph and ten crores graph
suffix_crores = pynutil.insert(" करोड़")
Expand Down
57 changes: 55 additions & 2 deletions nemo_text_processing/text_normalization/hi/taggers/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,20 @@

days = pynini.string_file(get_abs_path("data/date/days.tsv"))
months = pynini.string_file(get_abs_path("data/date/months.tsv"))
year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv"))
digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv"))
teens_and_ties = pynutil.add_weight(teens_ties, -0.1)

# Read suffixes and prefixes from their files into lists
with open(get_abs_path("data/date/suffixes.tsv"), "r", encoding="utf-8") as f:
suffixes_list = f.read().splitlines()
with open(get_abs_path("data/date/prefixes.tsv"), "r", encoding="utf-8") as f:
prefixes_list = f.read().splitlines()

# Create union of suffixes and prefixes
suffix_union = pynini.union(*suffixes_list)
prefix_union = pynini.union(*prefixes_list)


class DateFst(GraphFst):
Expand All @@ -51,10 +65,15 @@ def __init__(self, cardinal: GraphFst):
(NEMO_HI_DIGIT + NEMO_HI_NON_ZERO + NEMO_HI_DIGIT + NEMO_HI_DIGIT), cardinal.graph_hundreds_as_thousand
)

cardinal_graph = (
digit | teens_and_ties | cardinal.graph_hundreds | graph_year_thousands | graph_year_hundreds_as_thousands
)

graph_year = graph_year_thousands | graph_year_hundreds_as_thousands

delete_dash = pynutil.delete("-")
delete_slash = pynutil.delete("/")
delete_comma = pynutil.delete(",")

days_graph = pynutil.insert("day: \"") + days + pynutil.insert("\"") + insert_space

Expand All @@ -68,6 +87,22 @@ def __init__(self, cardinal: GraphFst):

graph_mm_dd += pynutil.insert(" preserve_order: true ")

# Graph for era
era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space

range_graph = pynini.cross("-", "से")

# Graph for year
century_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.accep("वीं")
century_text = pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space

# Updated logic to use suffix_union
year_number = graph_year + suffix_union
year_text = pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space

# Updated logic to use prefix_union
year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"")

graph_dd_mm_yyyy = (
days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph
)
Expand All @@ -78,7 +113,20 @@ def __init__(self, cardinal: GraphFst):

graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ")

graph_mm_yyyy = months_graph + delete_dash + years_graph
graph_mm_yyyy = months_graph + delete_dash + insert_space + years_graph

graph_year_suffix = era_graph

graph_range = (
pynutil.insert("era: \"")
+ cardinal_graph
+ insert_space
+ range_graph
+ insert_space
+ cardinal_graph
+ pynutil.insert("\"")
+ pynutil.insert(" preserve_order: true ")
)

# default assume dd_mm_yyyy

Expand All @@ -87,7 +135,12 @@ def __init__(self, cardinal: GraphFst):
| graph_mm_dd
| pynutil.add_weight(graph_dd_mm_yyyy, -0.001)
| graph_mm_dd_yyyy
| graph_mm_yyyy
| pynutil.add_weight(graph_mm_yyyy, -0.2)
| pynutil.add_weight(graph_year_suffix, -0.001)
| pynutil.add_weight(graph_range, -0.005)
| pynutil.add_weight(century_text, -0.001)
| pynutil.add_weight(year_text, -0.001)
| pynutil.add_weight(year_prefix, -0.009)
)

self.final_graph = final_graph.optimize()
Expand Down
75 changes: 67 additions & 8 deletions nemo_text_processing/text_normalization/hi/taggers/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
from nemo_text_processing.text_normalization.hi.utils import get_abs_path


digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv"))
teens_and_ties = pynutil.add_weight(teens_ties, -0.1)


class MeasureFst(GraphFst):
"""
Finite state transducer for classifying measure, suppletive aware, e.g.
Expand All @@ -35,37 +40,91 @@ class MeasureFst(GraphFst):
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
super().__init__(name="measure", kind="classify")

cardinal_graph = cardinal.final_graph
decimal_graph = decimal.final_graph_wo_negative
cardinal_graph = (
digit
| teens_and_ties
| cardinal.graph_hundreds
| cardinal.graph_thousands
| cardinal.graph_ten_thousands
| cardinal.graph_lakhs
| cardinal.graph_ten_lakhs
)
point = pynutil.delete(".")
decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional
unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv"))
quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv"))

optional_graph_negative = pynini.closure(
pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1,
)

# Define the quarterly measurements
quarter = pynini.string_map([(".५", "साढ़े"), ("१.५", "डेढ़"), ("२.५", "ढाई"),])
quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\"")

# Define the unit handling
self.unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ")
unit = pynutil.insert(" units: \"") + unit_graph + pynutil.insert("\" ")
units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ")

graph_measurements = (
# Handling symbols like x, X, *
symbol_graph = pynini.string_map([("x", "बाई"), ("X", "बाई"), ("*", "बाई"),])

graph_decimal = (
pynutil.insert("decimal { ")
+ optional_graph_negative
+ decimal_graph
+ pynutil.insert(" }")
+ delete_space
+ self.unit
+ unit
)
graph_measurements |= (

graph_quarter = (
pynutil.insert("cardinal { ")
+ optional_graph_negative
+ quarter_graph
+ pynutil.insert(" }")
+ delete_space
+ units
)

graph_cardinal = (
pynutil.insert("cardinal { ")
+ optional_graph_negative
+ pynutil.insert("integer: \"")
+ cardinal_graph
+ pynutil.insert("\"")
+ pynutil.insert(" }")
+ delete_space
+ self.unit
+ unit
)

graph = graph_measurements
# Handling cardinal clubbed with symbol as single token
graph_exceptions = (
pynutil.insert("cardinal { ")
+ optional_graph_negative
+ pynutil.insert("integer: \"")
+ cardinal_graph
+ pynutil.insert("\"")
+ pynutil.insert(" }")
+ pynutil.insert(" units: \"")
+ symbol_graph
+ pynutil.insert("\" ")
+ pynutil.insert("} }")
+ insert_space
+ pynutil.insert("tokens { cardinal { ")
+ optional_graph_negative
+ pynutil.insert("integer: \"")
+ cardinal_graph
+ pynutil.insert("\"")
)

graph = (
pynutil.add_weight(graph_decimal, 0.01)
| pynutil.add_weight(graph_quarter, 0.005)
| pynutil.add_weight(graph_cardinal, 0.01)
| pynutil.add_weight(graph_exceptions, 0.01)
)
self.graph = graph.optimize()

final_graph = self.add_tokens(graph)
Expand Down
Loading