Skip to content

Commit 4673bba

Browse files
authored
[itn] fix 一二三四 (#187)
1 parent 093a6f9 commit 4673bba

File tree

3 files changed

+15
-6
lines changed

3 files changed

+15
-6
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@ repos:
1616
rev: 'v17.0.6'
1717
hooks:
1818
- id: clang-format
19+
exclude: '.*\.(json|java|js|m|mm|proto)'
19 20
- repo: https://github.com/cpplint/cpplint
20 21
rev: '1.6.1'
21 22
hooks:

itn/chinese/rules/cardinal.py

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -86,7 +86,7 @@ def build_tagger(self):
8686
ten_thousand |= (thousand | hundred) + accep("万") + delete(
8787
"零").ques + (thousand | hundred | tens | teen | digits).ques
8888

89-
# 1. 利用基础数字所构建的包含0~9的完整数字
89+
# 1. 利用基础数字所构建的包含0~9的标准数字
9090
# 个/十/百/千/万
9191
number = digits | teen | tens | hundred | thousand | ten_thousand
9292
# 兆/亿
@@ -100,13 +100,13 @@ def build_tagger(self):
100100
_special_dash = cross('十', '1') + special_dash
101101
_special_dash |= digit + delete('十') + special_dash
102102
_special_dash |= digit + delete('百') + special_dash
103-
number |= add_weight(_special_dash, -100.0)
103+
number |= _special_dash
104104

105105
self.number = number.optimize()
106106
self.special_tilde = special_tilde.optimize()
107107
self.special_dash = _special_dash.optimize()
108108

109-
# 2. 利用基础数字所构建的不包含0~9的完整数字
109+
# 2. 利用基础数字所构建的不包含0~9的标准数字
110110
# 十/百/千/万
111111
number_exclude_0_to_9 = teen | tens | hundred | thousand | ten_thousand
112112
# 兆/亿
@@ -137,13 +137,15 @@ def build_tagger(self):
137137
cardinal |= (digits**3 | digits**4 | digits**5 | digits**11
138138
| digits**18)
139139

140-
# 4. 特殊格式的数字 + 包含或不包含0~9的完整数字
140+
# 4. 特殊格式的数字 + 标准数字
141141
# cardinal string like 23
142142
if self.enable_standalone_number:
143143
if self.enable_0_to_9:
144-
cardinal |= number
144+
# 特殊格式数字为第一优先级, 标准数字为第二优先级, 如 "一二三四"
145+
# 优先转译为 "1234" 而非 "1~2 3~4"
146+
cardinal |= add_weight(number, 0.1)
145 147
else:
146-
cardinal |= number_exclude_0_to_9
148+
cardinal |= add_weight(number_exclude_0_to_9, 0.1)
147 149
tagger = insert('value: "') + cardinal + (insert(" ") + cardinal).star \
148 150
+ insert('"')
149 151
self.tagger = self.add_tokens(tagger)

itn/chinese/test/data/cardinal.txt

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,3 +14,9 @@
1414
我的身份证号是三四零二零三一九三七零幺零幺零五幺七 => 我的身份证号是340203193701010517
1515
给一三三四五三一二二二一打电话 => 给13345312221打电话
1616
给一三三四五三一二二二一拨电话 => 给13345312221拨电话
17+
一二三四 => 1234
18+
二二三四 => 2234
19+
拨打幺二三零六 => 拨打12306
20+
九幺幺是报警电话 => 911是报警电话
21+
尾号幺七零二 => 尾号1702
22+
尾号一二三四 => 尾号1234

0 commit comments

Comments
 (0)