Skip to content

Commit 91e51ca

Browse files
authored
[itn] fix issue#237, digit + union("百", "千", "万") + digit + unit (#255)
1 parent 667cbe0 commit 91e51ca

File tree

5 files changed: +53 -5 lines changed
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
2+
3+
4+
5+
6+
7+
8+
9+
10+

itn/chinese/rules/measure.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from tn.processor import Processor
1717
from tn.utils import get_abs_path
1818

19-
from pynini import string_file, accep, cross
19+
from pynini import string_file, accep, cross, union
2020
from pynini.lib.pynutil import delete, insert, add_weight
2121

2222

@@ -36,6 +36,11 @@ def build_tagger(self):
3636
get_abs_path('../itn/chinese/data/measure/units_zh.tsv'))
3737
sign = string_file(
3838
get_abs_path('../itn/chinese/data/number/sign.tsv')) # + -
39+
digit = string_file(
40+
get_abs_path('../itn/chinese/data/number/digit.tsv')) # 1 ~ 9
41+
digit_zh = string_file(
42+
get_abs_path('../itn/chinese/data/number/digit_zh.tsv')) # 1 ~ 9
43+
addzero = insert('0')
3944
to = cross('到', '~') | cross('到百分之', '~')
4045

4146
units = add_weight(
@@ -55,8 +60,35 @@ def build_tagger(self):
5560

5661
# 十千米每小时 => 10km/h, 十一到一百千米每小时 => 11~100km/h
5762
measure = number + (to + number).ques + units
58-
tagger = insert('value: "') + (measure | percent) + insert('"')
5963

64+
# XXX: special-case handling; this rule is applied regardless of enable_standalone_number
65+
# digit + union("百", "千", "万") + digit + unit
66+
unit_sp_case1 = [
67+
'年',
68+
'月',
69+
'个月',
70+
'周',
71+
'天',
72+
'位',
73+
'次',
74+
'个',
75+
'顿',
76+
]
77+
if self.enable_0_to_9:
78+
measure_sp = add_weight(
79+
((digit + delete('百') + add_weight(addzero**2, 1.0)) |
80+
(digit + delete('千') + add_weight(addzero**3, 1.0)) |
81+
(digit + delete('万') + add_weight(addzero**4, 1.0))) +
82+
insert(' ') + digit + union(*unit_sp_case1), -0.5)
83+
else:
84+
measure_sp = add_weight(
85+
((digit + delete('百') + add_weight(addzero**2, 1.0)) |
86+
(digit + delete('千') + add_weight(addzero**3, 1.0)) |
87+
(digit + delete('万') + add_weight(addzero**4, 1.0))) +
88+
digit_zh + union(*unit_sp_case1), -0.5)
89+
90+
tagger = insert('value: "') + (measure | measure_sp
91+
| percent) + insert('"')
6092
# 每小时十千米 => 10km/h, 每小时三十到三百一十一千米 => 30~311km/h
6193
tagger |= (insert('denominator: "') + delete('每') + units +
6294
insert('" numerator: "') + measure + insert('"'))

itn/chinese/test/data/normalizer_disable_standalone_number_disable_0_to_9.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,6 @@
3636
这是九十九九千 => 这是九十九九千
3737
这是十二一千 => 这是十二一千
3838
这是零百 => 这是零百
39-
这是零千 => 这是零千
39+
这是零千 => 这是零千
40+
这是一百一个,一千两位,一万三天 => 这是100一个,1000两位,10000三天
41+
这是九百九周,九千九月,九万九年 => 这是900九周,9000九月,90000九年

itn/chinese/test/data/normalizer_disable_standalone_number_enable_0_to_9.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,6 @@
55
这是九十九九千 => 这是九十九九千
66
这是十二一千 => 这是十二一千
77
这是零百 => 这是零百
8-
这是零千 => 这是零千
8+
这是零千 => 这是零千
9+
这是一百一个,一千两位,一万三天 => 这是100 1个,1000 2位,10000 3天
10+
这是九百九周,九千九月,九万九年 => 这是900 9周,9000 9月,90000 9年

itn/chinese/test/data/normalizer_enable_standalone_number_disable_0_to_9.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,6 @@
3636
这是九十九九千 => 这是99 9000
3737
这是十二一千 => 这是12 1000
3838
这是零百 => 这是零百
39-
这是零千 => 这是零千
39+
这是零千 => 这是零千
40+
这是一百一个,一千两位,一万三天 => 这是100一个,1000两位,10000三天
41+
这是九百九周,九千九月,九万九年 => 这是900九周,9000九月,90000九年

0 commit comments

Comments
 (0)