Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions nemo_text_processing/text_normalization/fr/data/dates/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
20s twenties
30s thirties
40s forties
50s fifties
60s sixties
70s seventies
80s eighties
90s nineties
12 changes: 12 additions & 0 deletions nemo_text_processing/text_normalization/fr/data/dates/months.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
1 janvier
2 février
3 mars
4 avril
5 mai
6 juin
7 juillet
8 août
9 septembre
10 octobre
11 novembre
12 décembre
150 changes: 150 additions & 0 deletions nemo_text_processing/text_normalization/fr/taggers/date.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst
from nemo_text_processing.text_normalization.fr.utils import get_abs_path

# TODO: add articles? 'le...'
# NOTE(review): DateFst below already accepts an optional leading 'le ' / 'les '
# before the day, so this TODO may be stale -- confirm before removing it.

# Mapping loaded from data/dates/months.tsv; DateFst composes month numbers with it
# to obtain month names.
month_numbers = pynini.string_file(get_abs_path("data/dates/months.tsv"))
# Mapping loaded from data/dates/eras.tsv; DateFst uses it for inputs such as "80s".
eras = pynini.string_file(get_abs_path("data/dates/eras.tsv"))
# Matches exactly two characters: a leading "0" is deleted ("01" -> "1"), any other
# first digit is kept ("12" -> "12"); the second character is always kept as-is.
delete_leading_zero = (
pynutil.delete("0") | (NEMO_DIGIT - "0")
) + NEMO_DIGIT # NEMO_DIGIT is used here as "any single digit"


class DateFst(GraphFst):
    """
    Finite state transducer for classifying dates, e.g.
        '02.03.2003' -> date { day: "deux" month: "mars" year: "deux mille trois" preserve_order: true }

    Accepted inputs:
        * numeric dates ``day<sep>month[<sep>year]`` where ``<sep>`` is one of ``/``, ``.``, ``-``
          (the same separator is used throughout one date);
        * ``day month_name [year]`` separated by single spaces, e.g. '10 mars 2023';
        * eras listed in data/dates/eras.tsv, e.g. '80s';
        * day ranges followed by a month name, e.g. '17-18-19 juin'.
    The day may be preceded by 'le ' or 'les ', which is kept inside the ``day`` field.

    Args:
        cardinal: cardinal tagger; its ``all_nums_no_tokens`` graph is used to spell out numbers
        deterministic: if True, day "1" is only spelled "premier"; if False the cardinal
            spelling of "1" is kept as an additional option
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        # The name matches the "date { ... }" token emitted below and the verbalizer's name.
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        cardinal_graph = cardinal.all_nums_no_tokens

        # 'le ' -> 'le ', 'les ' -> 'les ' (copied to the output unchanged)
        le_determiner = pynini.accep("le ") | pynini.accep("les ")
        self.optional_le = pynini.closure(le_determiner, 0, 1)

        # '01' -> '1', '12' -> '12', '3' -> '3'
        optional_leading_zero = delete_leading_zero | NEMO_DIGIT
        valid_day_number = pynini.union(*[str(day) for day in range(1, 32)])

        # The first of the month is 'premier'. In deterministic mode "1" is kept away from
        # the cardinal graph so that it cannot also be read as a cardinal.
        premier = pynini.string_map([("1", "premier")])
        if deterministic:
            spelled_day = pynini.union(*[str(day) for day in range(2, 32)]) @ cardinal_graph
        else:
            spelled_day = cardinal_graph
        day_number_to_word = premier | spelled_day

        # `@` binds tighter than `+`: the determiner is concatenated in front of the composed day graph.
        digit_to_day = self.optional_le + (optional_leading_zero @ valid_day_number @ day_number_to_word)
        self.day_graph = pynutil.insert('day: "') + digit_to_day + pynutil.insert('"')

        # '03' -> 'mars'
        normalize_month_number = optional_leading_zero @ pynini.union(*[str(month) for month in range(1, 13)])
        # Fst.optimize() and Fst.project() work in place; operate on copies so the shared
        # module-level `month_numbers` stays intact for the next DateFst instance.
        number_to_month = month_numbers.copy().optimize()
        month_graph = normalize_month_number @ number_to_month
        self.month_graph = pynutil.insert('month: "') + month_graph + pynutil.insert('"')

        # 2025 -> deux mille vingt cinq (two to four digits, no leading zero)
        accept_year_digits = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 1, 3)
        digits_to_year = accept_year_digits @ cardinal_graph
        self.year_graph = pynutil.insert('year: "') + digits_to_year + pynutil.insert('"')

        open_token = pynutil.insert("date { ")
        close_token = pynutil.insert(" preserve_order: true }")

        # Every alternative is collected here and unioned once at the end, so the final
        # graph accepts dates only (and not the empty string).
        date_graphs = []

        # Numeric dates: the separator is dropped and replaced by a space between fields.
        for separator in ["/", ".", "-"]:
            field_break = pynutil.delete(separator) + pynutil.insert(" ")
            date_graphs.append(
                open_token
                + self.day_graph
                + field_break
                + self.month_graph
                + pynini.closure(field_break + self.year_graph, 0, 1)
                + close_token
            )

        # Accepts "janvier", "février", etc
        month_names = month_numbers.copy().project("output")
        month_name_graph = pynutil.insert('month: "') + month_names + pynutil.insert('"')
        named_month_tail = (
            pynini.accep(" ")
            + month_name_graph
            + pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1)
            + close_token
        )

        date_graphs.append(open_token + self.day_graph + named_month_tail)

        # Accepts "70s", "80s", etc
        date_graphs.append(pynutil.insert('date { year: "') + eras + pynutil.insert('" preserve_order: true }'))

        # Accepts day ranges: all the days end up space-separated inside a single day field,
        # "17-18-19 juin" -> date { day: "<17> <18> <19>" month: "juin" preserve_order: true }
        for separator in ["-", "/"]:
            day_range_graph = (
                pynutil.insert('day: "')
                + pynini.closure(digit_to_day + pynutil.delete(separator) + pynutil.insert(" "), 1)
                + digit_to_day
                + pynutil.insert('"')
            )
            date_graphs.append(open_token + day_range_graph + named_month_tail)

        self.fst = pynini.union(*date_graphs).optimize()


def apply_fst(text, fst):
    """Print the shortest-path output of ``fst`` for ``text``, or an error line if there is none.

    Args:
        text: input string to compose with the graph
        fst: graph to apply
    """
    try:
        best_path = pynini.shortestpath(text @ fst)
        rewritten = best_path.string()
    except pynini.FstOpError:
        print(f"Error: No valid output with given input: '{text}'")
    else:
        print(f"'{text}' --> '{rewritten}'")


if __name__ == "__main__":
    from nemo_text_processing.text_normalization.fr.taggers.cardinal import CardinalFst

    fst = DateFst(CardinalFst())

    # Manual smoke test: (header to print, graph to apply, inputs), run in this order.
    demo_sections = [
        ('DETERMINER', fst.optional_le, ["le ", ""]),
        ("\nDAY GRAPH", fst.day_graph, ["01", "02", "3", "12", "le 01", "le 12"]),
        ("\nMONTH GRAPH", fst.month_graph, ["1", "3", "06"]),
        ("\nYEAR", fst.year_graph, ["2025"]),
        (
            "\nDATE",
            fst.fst,
            [
                "02.03.2003",
                "02/03/2003",
                "02-03-2003",
                "le 02.03.2003",
                "02.03",
                "17 janvier",
                "10 mars 2023",
                "le 10 mars 2023",
            ],
        ),
        ("\nERAS", fst.fst, ["80s"]),
        # The range graph builds ONE day field holding the determiner and every day, i.e.
        # date { day: "les <17> <18> <19>" month: "juin" preserve_order: true }
        # (the spelling of the numbers comes from CardinalFst).
        ("\nDATE RANGES", fst.fst, ["les 17/18/19 juin"]),
    ]

    for header, graph, samples in demo_sections:
        print(header)
        for sample in samples:
            apply_fst(sample, graph)
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
)
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
from nemo_text_processing.text_normalization.fr.taggers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.fr.taggers.date import DateFst
from nemo_text_processing.text_normalization.fr.taggers.decimals import DecimalFst
from nemo_text_processing.text_normalization.fr.taggers.fraction import FractionFst
from nemo_text_processing.text_normalization.fr.taggers.ordinal import OrdinalFst
Expand Down Expand Up @@ -86,8 +87,12 @@ def __init__(
whitelist_graph = self.whitelist.fst
punct_graph = PunctuationFst(deterministic=deterministic).fst

self.date = DateFst(self.cardinal, deterministic=deterministic)
date_graph = self.date.fst

classify = (
pynutil.add_weight(whitelist_graph, 1.01)
| pynutil.add_weight(date_graph, 1.1)
| pynutil.add_weight(cardinal_graph, 1.1)
| pynutil.add_weight(fraction_graph, 1.09)
| pynutil.add_weight(ordinal_graph, 1.1)
Expand Down
67 changes: 67 additions & 0 deletions nemo_text_processing/text_normalization/fr/verbalizers/date.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_NOT_QUOTE,
NEMO_SPACE,
GraphFst,
delete_preserve_order,
)


class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing date, e.g.
        date { day: "deux" month: "mars" year: "deux mille trois" preserve_order: true } -> deux mars deux mille trois

    Three field layouts are accepted: day + month (+ optional year), month + year,
    and a lone year field (used for decades).

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="date", kind="verbalize", deterministic=deterministic)

        def quoted_value(field_name):
            # Deletes `<field_name>: "` and the closing quote, keeping the non-empty value between them.
            return (
                pynutil.delete(f'{field_name}: "')
                + pynini.closure(NEMO_NOT_QUOTE, 1)
                + pynutil.delete('"')
            )

        day = quoted_value("day")
        month = quoted_value("month")
        year = quoted_value("year")
        # A decade is carried in a lone `year` field.
        decade = quoted_value("year")

        optional_year = pynini.closure(NEMO_SPACE + year, 0, 1)

        graph_dmy = day + NEMO_SPACE + month + optional_year + delete_preserve_order
        graph_my = month + NEMO_SPACE + year + delete_preserve_order
        graph_decade = decade + delete_preserve_order

        self.graph = graph_dmy | graph_my | graph_decade

        self.fst = self.delete_tokens(self.graph).optimize()


def apply_fst(text, fst):
    """Print the shortest-path output of ``fst`` for ``text``, or an error line if there is none.

    Args:
        text: input string to compose with the graph
        fst: graph to apply
    """
    try:
        best_path = pynini.shortestpath(text @ fst)
        rewritten = best_path.string()
    except pynini.FstOpError:
        print(f"Error: No valid output with given input: '{text}'")
    else:
        print(f"'{text}' --> '{rewritten}'")


if __name__ == "__main__":
    # Manual smoke test: verbalize the tagger output for "eighties".
    fst = DateFst()
    tagged_decade = 'date { year: "eighties" preserve_order: true }'
    apply_fst(tagged_decade, fst.fst)
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
from nemo_text_processing.text_normalization.en.verbalizers.whitelist import WhiteListFst
from nemo_text_processing.text_normalization.fr.verbalizers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.fr.verbalizers.date import DateFst
from nemo_text_processing.text_normalization.fr.verbalizers.decimals import DecimalFst
from nemo_text_processing.text_normalization.fr.verbalizers.fraction import FractionFst
from nemo_text_processing.text_normalization.fr.verbalizers.ordinal import OrdinalFst
Expand All @@ -40,6 +41,8 @@ def __init__(self, deterministic: bool = True):
fraction = FractionFst(ordinal=ordinal, deterministic=deterministic)
fraction_graph = fraction.fst
whitelist_graph = WhiteListFst(deterministic=deterministic).fst
date = DateFst(deterministic=deterministic)
date_graph = date.fst

graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph
graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph | date_graph
self.fst = graph
13 changes: 13 additions & 0 deletions nemo_text_processing/text_normalization/fr_tutorial/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
zéro 0
un 1
une 1
deux 2
trois 3
quatre 4
cinq 5
six 6
sept 7
huit 8
neuf 9
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Mᵐᵉ madame
Mᵐᵉˢ mesdames
Mˡˡᵉ mademoiselle
Mˡˡᵉˢ mesdemoiselles
Dʳ docteur
Dʳˢ docteurs
Dʳᵉ docteure
Dʳᵉˢ docteures
apr. J.-C. après Jésus-Christ
av. J.-C. avant Jésus-Christ
le hon. l’honorable
le très hon. le très honorable
% pour cent
Loading
Loading