Skip to content

Commit eddebf3

Browse files
Second draft of Korean Cardinal ITN (#284)
* First draft of Korean Cardinal ITN
  Sparrowhawk testing is not done yet.
  Signed-off-by: hmlee245 <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* fixing all the feedbacks
  Signed-off-by: hmlee245 <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* This reverts commit f893d89, reversing changes made to 9f7e876.
  Signed-off-by: hmlee245 <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* third draft of korean ITN work. Mainly fixing minor issues and adding test cases
  Signed-off-by: hmlee245 <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
---------
Signed-off-by: hmlee245 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent ac07488 commit eddebf3

File tree

24 files changed

+914
-2
lines changed

24 files changed

+914
-2
lines changed

Jenkinsfile

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ pipeline {
2828
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
2929
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
3030
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0'
31+
KO_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/05-21-25-0'
32+
3133
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
3234
}
3335
stages {
@@ -318,6 +320,22 @@ pipeline {
318320
}
319321
}
320322
}
323+
stage('L0: Create KO ITN Grammars') {
324+
when {
325+
anyOf {
326+
branch 'main'
327+
changeRequest target: 'main'
328+
}
329+
}
330+
failFast true
331+
parallel {
332+
stage('L0: KO ITN grammars') {
333+
steps {
334+
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ko --text="100" --cache_dir ${KO_TN_CACHE}'
335+
}
336+
}
337+
}
338+
}
321339

322340

323341
// L1 Tests starts here
@@ -406,6 +424,11 @@ pipeline {
406424
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hy/ -m "not pleasefixme" --cpu --tn_cache_dir ${HY_TN_CACHE}'
407425
}
408426
}
427+
stage('L1: Run all KO TN/ITN tests (restore grammars from cache)') {
428+
steps {
429+
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ko/ -m "not pleasefixme" --cpu --tn_cache_dir ${KO_TN_CACHE}'
430+
}
431+
}
409432
}
410433
}
411434

nemo_text_processing/inverse_text_normalization/inverse_normalize.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,11 @@ def __init__(
131131
from nemo_text_processing.inverse_text_normalization.ja.verbalizers.verbalize_final import (
132132
VerbalizeFinalFst,
133133
)
134+
elif lang == 'ko': # Korean
135+
from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst
136+
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import (
137+
VerbalizeFinalFst,
138+
)
134139

135140
self.tagger = ClassifyFst(
136141
cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case
@@ -175,7 +180,7 @@ def parse_args():
175180
parser.add_argument(
176181
"--language",
177182
help="language",
178-
choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja'],
183+
choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja', 'ko'],
179184
default="en",
180185
type=str,
181186
)
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst
16+
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst
17+
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
1
2+
2
3+
3
4+
4
5+
5
6+
6
7+
7
8+
8
9+
9
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0

0 commit comments

Comments
 (0)