Skip to content

Commit 29ff3ee

Browse files
Fix word range (#334)
* fix range and quote
  Signed-off-by: folivoramanh <[email protected]>
* fix quote in post process
  Signed-off-by: folivoramanh <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* fix quote and range
  Signed-off-by: folivoramanh <[email protected]>
---------
Signed-off-by: folivoramanh <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 9c19009 commit 29ff3ee

File tree

3 files changed

+46
-1
lines changed

3 files changed

+46
-1
lines changed

nemo_text_processing/text_normalization/vi/verbalizers/post_processing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def get_vietnamese_punct_config(self) -> Dict[str, List[str]]:
5555
"""
5656
return {
5757
# Punctuation that should not have space before them
58-
'no_space_before': [",", ".", "!", "?", ":", ";", ")", r"\]", "}", "\""],
58+
'no_space_before': [",", ".", "!", "?", ":", ";", ")", r"\]", "}"],
5959
# Punctuation that should not have space after them
6060
'no_space_after': ["(", r"\[", "{"],
6161
# Punctuation that can have space before them (exceptions)
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space
19+
20+
21+
class RangeFst(GraphFst):
    """
    Finite state transducer for verbalizing Vietnamese ranges.

    The range text arrives already verbalized inside a ``name: "..."`` field, so
    this verbalizer only strips the field wrapper and returns the content.
        e.g. tokens { name: "mười nghìn đến hai mươi nghìn" } -> mười nghìn đến hai mươi nghìn

    The quoted content must be non-empty and may not contain a regular space
    character; every non-breaking space (U+00A0) in it is rewritten to a regular
    space on output. NOTE(review): presumably the tagger joins the words of a
    range with U+00A0 -- confirm against the range tagger.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="range", kind="verbalize", deterministic=deterministic)

        # One or more characters, none of which is a regular space.
        content = pynini.closure(NEMO_CHAR - " ", 1)

        # Drop the `name:` key, optional spacing and both surrounding quotes,
        # keeping only the content between the quotes.
        unwrap_name = (
            pynutil.delete("name:") + delete_space + pynutil.delete("\"") + content + pynutil.delete("\"")
        )

        # Replace every non-breaking space in the extracted content with a regular space.
        nbsp_to_space = pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA)

        range_graph = unwrap_name @ nbsp_to_space
        self.fst = range_graph.optimize()

nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from nemo_text_processing.text_normalization.vi.verbalizers.measure import MeasureFst
2121
from nemo_text_processing.text_normalization.vi.verbalizers.money import MoneyFst
2222
from nemo_text_processing.text_normalization.vi.verbalizers.ordinal import OrdinalFst
23+
from nemo_text_processing.text_normalization.vi.verbalizers.range import RangeFst
2324
from nemo_text_processing.text_normalization.vi.verbalizers.roman import RomanFst
2425
from nemo_text_processing.text_normalization.vi.verbalizers.time import TimeFst
2526
from nemo_text_processing.text_normalization.vi.verbalizers.whitelist import WhiteListFst
@@ -63,6 +64,9 @@ def __init__(self, deterministic: bool = True):
6364
measure = MeasureFst(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=deterministic)
6465
measure_graph = measure.fst
6566

67+
range_fst = RangeFst(deterministic=deterministic)
68+
range_graph = range_fst.fst
69+
6670
graph = (
6771
cardinal_graph
6872
| whitelist_graph
@@ -75,6 +79,7 @@ def __init__(self, deterministic: bool = True):
7579
| time_graph
7680
| money_graph
7781
| measure_graph
82+
| range_graph
7883
)
7984

8085
self.fst = graph

0 commit comments

Comments
 (0)