Skip to content

Commit 1d803aa

Browse files
committed
Add date class for french
1 parent 48ca992 commit 1d803aa

File tree

18 files changed

+647
-1
lines changed

18 files changed

+647
-1
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
20s twenties
2+
30s thirties
3+
40s forties
4+
50s fifties
5+
60s sixties
6+
70s seventies
7+
80s eighties
8+
90s nineties
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
1 janvier
2+
2 février
3+
3 mars
4+
4 avril
5+
5 mai
6+
6 juin
7+
7 juillet
8+
8 août
9+
9 septembre
10+
10 octobre
11+
11 novembre
12+
12 décembre
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import pynini
2+
from pynini.lib import pynutil
3+
from nemo_text_processing.text_normalization.fr.utils import get_abs_path
4+
5+
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, NEMO_DIGIT
6+
7+
8+
# TODO: add articles? 'le...'
9+
10+
# Lookup tables compiled from the TSV files under data/dates/.
# months.tsv is composed with 1..12 month numbers and projected onto its output
# side for month names by DateFst; eras.tsv feeds the "decade" field.
month_numbers = pynini.string_file(get_abs_path("data/dates/months.tsv"))
eras = pynini.string_file(get_abs_path("data/dates/eras.tsv"))

# Exactly two digits, where a leading "0" (if any) is deleted and any other
# leading digit is kept as-is.
delete_leading_zero = pynini.union(pynutil.delete("0"), NEMO_DIGIT - "0") + NEMO_DIGIT
13+
14+
class DateFst(GraphFst):
    """Finite state transducer for classifying French dates, e.g.:
        '02.03.2003' -> date { day: "deux" month: "mars" year: "deux mille trois" preserve_order: true }
        'les 17/18/19 juin' -> date { day: "les dix-sept dix-huit dix-neuf" month: "juin" preserve_order: true }

    Accepted shapes:
        * numeric dates ``dd<sep>mm[<sep>yyyy]`` with ``<sep>`` one of ``/``, ``.``, ``-``
        * ``dd <month name> [yyyy]``
        * entries of the eras table (emitted as a ``decade`` field)
        * day ranges ``dd<sep>dd[<sep>dd...] <month name> [yyyy]`` with ``<sep>`` ``-`` or ``/``
    A leading "le " / "les " is accepted before any day number and kept in the day field.

    Args:
        cardinal: cardinal tagger; its ``all_nums_no_tokens`` graph spells out day and year numbers
        deterministic: forwarded to GraphFst
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        # Use the same grammar name as the emitted "date { ... }" token and the verbalizer.
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        cardinal_graph = cardinal.all_nums_no_tokens

        # 'le ' -> 'le ', 'les ' -> 'les '
        le_determiner = pynini.accep("le ") | pynini.accep("les ")
        self.optional_le = pynini.closure(le_determiner, 0, 1)

        # Day: strip an optional leading zero, keep only 1..31, then spell the number out.
        optional_leading_zero = delete_leading_zero | NEMO_DIGIT
        valid_day_number = pynini.union(*[str(x) for x in range(1, 32)])
        premier = pynini.string_map([("1", "premier")])
        # NOTE(review): "1" can be rewritten both as "premier" and by the cardinal graph;
        # which one shortestpath picks depends on the cardinal graph's weights - confirm.
        day_number_to_word = premier | cardinal_graph

        digit_to_day = self.optional_le + (optional_leading_zero @ valid_day_number @ day_number_to_word)
        self.day_graph = pynutil.insert('day: "') + digit_to_day + pynutil.insert('"')

        # Month number: '03' -> 'mars'
        normalize_month_number = optional_leading_zero @ pynini.union(*[str(x) for x in range(1, 13)])
        month_graph = normalize_month_number @ month_numbers
        self.month_graph = pynutil.insert('month: "') + month_graph + pynutil.insert('"')

        # Year: two to four digits, no leading zero, e.g. 2025 -> deux mille vingt cinq
        accept_year_digits = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 1, 3)
        digits_to_year = accept_year_digits @ cardinal_graph
        self.year_graph = pynutil.insert('year: "') + digits_to_year + pynutil.insert('"')

        # Month names ("janvier", "février", ...) are the output side of the months table.
        # pynini.project() is the constructive form: the Fst.project() method would
        # rewrite the shared module-level table in place and break any later DateFst.
        month_name_graph = (
            pynutil.insert('month: "') + pynini.project(month_numbers, "output") + pynutil.insert('"')
        )

        def as_date_token(fields):
            # Every branch is wrapped identically so the serialized token is uniform.
            return pynutil.insert("date { ") + fields + pynutil.insert(" preserve_order: true }")

        # Branches are collected and unioned once; there is deliberately no
        # empty-string seed, so the date tagger never matches empty input.
        date_graphs = []

        # dd<sep>mm[<sep>yyyy]
        for separator in ["/", ".", "-"]:
            optional_year = pynini.closure(
                pynutil.delete(separator) + pynutil.insert(" ") + self.year_graph, 0, 1
            )
            date_graphs.append(
                as_date_token(
                    self.day_graph
                    + pynutil.delete(separator)
                    + pynutil.insert(" ")
                    + self.month_graph
                    + optional_year
                )
            )

        optional_spaced_year = pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1)

        # dd <month name> [yyyy]
        date_graphs.append(
            as_date_token(self.day_graph + pynini.accep(" ") + month_name_graph + optional_spaced_year)
        )

        # Entries of the eras table, e.g. "70s", "80s"
        date_graphs.append(as_date_token(pynutil.insert('decade: "') + eras + pynutil.insert('"')))

        # Day ranges: all days are gathered into a single space-separated day field,
        # "17-18-19 juin" -> date { day: "<17> <18> <19>" month: "juin" preserve_order: true }
        for separator in ["-", "/"]:
            day_range_graph = (
                pynutil.insert('day: "')
                + pynini.closure(digit_to_day + pynutil.delete(separator) + pynutil.insert(" "), 1)
                + digit_to_day
                + pynutil.insert('"')
            )
            date_graphs.append(
                as_date_token(day_range_graph + pynini.accep(" ") + month_name_graph + optional_spaced_year)
            )

        self.fst = pynini.union(*date_graphs).optimize()
96+
97+
98+
99+
def apply_fst(text, fst):
    """Debug helper: print the shortest-path rewrite of ``text`` through ``fst``.

    Prints ``'<text>' --> '<output>'`` on success, or an error line when pynini
    raises ``FstOpError`` while composing or extracting the output string.
    """
    try:
        rewritten = pynini.shortestpath(text @ fst).string()
    except pynini.FstOpError:
        print(f"Error: No valid output with given input: '{text}'")
    else:
        print(f"'{text}' --> '{rewritten}'")
105+
106+
if __name__ == "__main__":
    from nemo_text_processing.text_normalization.fr.taggers.cardinal import CardinalFst

    fst = DateFst(CardinalFst())

    # Manual smoke run: (header to print, graph under test, sample inputs), in display order.
    demo_sections = [
        ('DETERMINER', fst.optional_le, ["le ", ""]),
        ("\nDAY GRAPH", fst.day_graph, ["01", "02", "3", "12", "le 01", "le 12"]),
        ("\nMONTH GRAPH", fst.month_graph, ["1", "3", "06"]),
        ("\nYEAR", fst.year_graph, ["2025"]),
        (
            "\nDATE",
            fst.fst,
            [
                "02.03.2003",
                "02/03/2003",
                "02-03-2003",
                "le 02.03.2003",
                "02.03",
                "17 janvier",
                "10 mars 2023",
                "le 10 mars 2023",
            ],
        ),
        ("\nERAS", fst.fst, ["80s"]),
        # The range graph gathers every day into one field, e.g.
        # date { day: "les dix-sept dix-huit dix-neuf" month: "juin" preserve_order: true }
        ("\nDATE RANGES", fst.fst, ["les 17/18/19 juin"]),
    ]

    for header, graph, samples in demo_sections:
        print(header)
        for sample in samples:
            apply_fst(sample, graph)

nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from nemo_text_processing.text_normalization.fr.taggers.ordinal import OrdinalFst
3232
from nemo_text_processing.text_normalization.fr.taggers.whitelist import WhiteListFst
3333
from nemo_text_processing.text_normalization.fr.taggers.word import WordFst
34+
from nemo_text_processing.text_normalization.fr.taggers.date import DateFst
3435
from nemo_text_processing.utils.logging import logger
3536

3637

@@ -85,9 +86,13 @@ def __init__(
8586
self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
8687
whitelist_graph = self.whitelist.fst
8788
punct_graph = PunctuationFst(deterministic=deterministic).fst
89+
90+
self.date = DateFst(self.cardinal, deterministic=deterministic)
91+
date_graph = self.date.fst
8892

8993
classify = (
9094
pynutil.add_weight(whitelist_graph, 1.01)
95+
| pynutil.add_weight(date_graph, 1.1)
9196
| pynutil.add_weight(cardinal_graph, 1.1)
9297
| pynutil.add_weight(fraction_graph, 1.09)
9398
| pynutil.add_weight(ordinal_graph, 1.1)
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.en.graph_utils import (
19+
NEMO_NOT_QUOTE,
20+
NEMO_SPACE,
21+
GraphFst,
22+
delete_preserve_order
23+
)
24+
25+
class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing date, e.g.
        date { day: "deux" month: "mars" year: "deux mille trois" preserve_order: true } -> deux mars deux mille trois

    Three field layouts are accepted: day + month with an optional year,
    month + year, and a lone decade.

    Args:
        deterministic: forwarded to GraphFst
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="date", kind="verbalize", deterministic=deterministic)

        def quoted_value(key):
            # Drop `<key>: "` and the closing quote, keeping the non-empty value between them.
            return pynutil.delete(f'{key}: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"')

        day = quoted_value("day")
        month = quoted_value("month")
        year = quoted_value("year")
        decade = quoted_value("decade")

        optional_year = pynini.closure(NEMO_SPACE + year, 0, 1)

        graph_dmy = day + NEMO_SPACE + month + optional_year + delete_preserve_order
        graph_my = month + NEMO_SPACE + year + delete_preserve_order
        graph_decade = decade + delete_preserve_order

        self.graph = graph_dmy | graph_my | graph_decade

        # Strip the surrounding token wrapper, then optimize the final graph.
        self.fst = self.delete_tokens(self.graph).optimize()
52+
53+
def apply_fst(text, fst):
    """Debug helper: run ``text`` through ``fst`` and print the shortest-path output.

    An error line is printed instead when pynini raises ``FstOpError``.
    """
    try:
        verbalized = pynini.shortestpath(text @ fst).string()
    except pynini.FstOpError:
        print(f"Error: No valid output with given input: '{text}'")
    else:
        print(f"'{text}' --> '{verbalized}'")
59+
60+
if __name__ == "__main__":
    fst = DateFst()

    # Manual smoke run on the tagger output for "les 17/18/19 juin".
    tagged_sample = 'date { day: "les dix-sept dix-huit dix-neuf" month: "juin" preserve_order: true }'
    apply_fst(tagged_sample, fst.fst)

nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from nemo_text_processing.text_normalization.fr.verbalizers.decimals import DecimalFst
1818
from nemo_text_processing.text_normalization.fr.verbalizers.fraction import FractionFst
1919
from nemo_text_processing.text_normalization.fr.verbalizers.ordinal import OrdinalFst
20+
from nemo_text_processing.text_normalization.fr.verbalizers.date import DateFst
2021

2122

2223
class VerbalizeFst(GraphFst):
@@ -40,6 +41,8 @@ def __init__(self, deterministic: bool = True):
4041
fraction = FractionFst(ordinal=ordinal, deterministic=deterministic)
4142
fraction_graph = fraction.fst
4243
whitelist_graph = WhiteListFst(deterministic=deterministic).fst
44+
date = DateFst(deterministic=deterministic)
45+
date_graph = date.fst
4346

44-
graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph
47+
graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph | date_graph
4548
self.fst = graph
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
zéro 0
2+
un 1
3+
une 1
4+
deux 2
5+
trois 3
6+
quatre 4
7+
cinq 5
8+
six 6
9+
sept 7
10+
huit 8
11+
neuf 9

0 commit comments

Comments
 (0)