Skip to content

Commit 85581f4

Browse files
authored
feat(all): format all files (#174)
* feat(all): format all files
* feat(all): format all files
* feat(all): format all files
1 parent bd44df0 commit 85581f4

File tree

24 files changed

+286
-279
lines changed

24 files changed

+286
-279
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
exclude: '.*\.(txt|tsv)$'
12
repos:
23
- repo: https://github.com/pre-commit/pre-commit-hooks
34
rev: v4.5.0

itn/chinese/inverse_normalizer.py

Lines changed: 30 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,9 @@
3131

3232
class InverseNormalizer(Processor):
3333

34-
def __init__(self, cache_dir=None, overwrite_cache=False,
34+
def __init__(self,
35+
cache_dir=None,
36+
overwrite_cache=False,
3537
enable_standalone_number=True,
3638
enable_0_to_9=False,
3739
enable_million=False):
@@ -44,32 +46,39 @@ def __init__(self, cache_dir=None, overwrite_cache=False,
4446
self.build_fst('zh_itn', cache_dir, overwrite_cache)
4547

4648
def build_tagger(self):
47-
tagger = (add_weight(Date().tagger, 1.02)
48-
| add_weight(Whitelist().tagger, 1.01)
49-
| add_weight(Fraction().tagger, 1.05)
50-
| add_weight(Measure(enable_0_to_9=self.enable_0_to_9).tagger, 1.05) # noqa
51-
| add_weight(Money(enable_0_to_9=self.enable_0_to_9).tagger, 1.04) # noqa
52-
| add_weight(Time().tagger, 1.05)
53-
| add_weight(Cardinal(self.convert_number, self.enable_0_to_9, self.enable_million).tagger, 1.06) # noqa
54-
| add_weight(Math().tagger, 1.10)
55-
| add_weight(LicensePlate().tagger, 1.0)
56-
| add_weight(Char().tagger, 100)).optimize()
49+
tagger = (
50+
add_weight(Date().tagger, 1.02)
51+
| add_weight(Whitelist().tagger, 1.01)
52+
| add_weight(Fraction().tagger, 1.05)
53+
| add_weight(
54+
Measure(enable_0_to_9=self.enable_0_to_9).tagger, 1.05) # noqa
55+
| add_weight(Money(enable_0_to_9=self.enable_0_to_9).tagger,
56+
1.04) # noqa
57+
| add_weight(Time().tagger, 1.05)
58+
| add_weight(
59+
Cardinal(self.convert_number, self.enable_0_to_9,
60+
self.enable_million).tagger, 1.06) # noqa
61+
| add_weight(Math().tagger, 1.10)
62+
| add_weight(LicensePlate().tagger, 1.0)
63+
| add_weight(Char().tagger, 100)).optimize()
5764

5865
tagger = tagger.star
5966
# remove the last space
6067
self.tagger = tagger @ self.build_rule(delete(' '), '', '[EOS]')
6168

6269
def build_verbalizer(self):
63-
verbalizer = (Cardinal(self.convert_number, self.enable_0_to_9, self.enable_million).verbalizer # noqa
64-
| Char().verbalizer
65-
| Date().verbalizer
66-
| Fraction().verbalizer
67-
| Math().verbalizer
68-
| Measure(enable_0_to_9=self.enable_0_to_9).verbalizer
69-
| Money(enable_0_to_9=self.enable_0_to_9).verbalizer
70-
| Time().verbalizer
71-
| LicensePlate().verbalizer
72-
| Whitelist().verbalizer).optimize()
70+
verbalizer = (
71+
Cardinal(self.convert_number, self.enable_0_to_9,
72+
self.enable_million).verbalizer # noqa
73+
| Char().verbalizer
74+
| Date().verbalizer
75+
| Fraction().verbalizer
76+
| Math().verbalizer
77+
| Measure(enable_0_to_9=self.enable_0_to_9).verbalizer
78+
| Money(enable_0_to_9=self.enable_0_to_9).verbalizer
79+
| Time().verbalizer
80+
| LicensePlate().verbalizer
81+
| Whitelist().verbalizer).optimize()
7382
postprocessor = PostProcessor(remove_interjections=True).processor
7483

7584
self.verbalizer = (verbalizer @ postprocessor).star

itn/chinese/rules/cardinal.py

Lines changed: 41 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,9 @@
2020

2121
class Cardinal(Processor):
2222

23-
def __init__(self, enable_standalone_number=True, enable_0_to_9=True,
23+
def __init__(self,
24+
enable_standalone_number=True,
25+
enable_0_to_9=True,
2426
enable_million=False):
2527
super().__init__('cardinal')
2628
self.number = None
@@ -32,10 +34,10 @@ def __init__(self, enable_standalone_number=True, enable_0_to_9=True,
3234
self.build_verbalizer()
3335

3436
def build_tagger(self):
35-
zero = string_file('itn/chinese/data/number/zero.tsv') # 0
37+
zero = string_file('itn/chinese/data/number/zero.tsv') # 0
3638
digit = string_file('itn/chinese/data/number/digit.tsv') # 1 ~ 9
37-
sign = string_file('itn/chinese/data/number/sign.tsv') # + -
38-
dot = string_file('itn/chinese/data/number/dot.tsv') # .
39+
sign = string_file('itn/chinese/data/number/sign.tsv') # + -
40+
dot = string_file('itn/chinese/data/number/dot.tsv') # .
3941

4042
addzero = insert('0')
4143
digits = zero | digit # 0 ~ 9
@@ -52,33 +54,33 @@ def build_tagger(self):
5254
| add_weight(addzero**2, 1.0)))
5355
# 一千一百一十一 => 1111, 一千零一十一 => 1011, 一千零一 => 1001
5456
# 一千一 => 1100, 一千 => 1000
55-
thousand = ((hundred | teen | tens | digits) + delete('千') + (
56-
hundred
57-
| add_weight(zero + (tens | teen), 0.1)
58-
| add_weight(addzero + zero + digit, 0.5)
59-
| add_weight(digit + addzero**2, 0.8)
60-
| add_weight(addzero**3, 1.0)))
57+
thousand = ((hundred | teen | tens | digits) + delete('千') +
58+
(hundred
59+
| add_weight(zero + (tens | teen), 0.1)
60+
| add_weight(addzero + zero + digit, 0.5)
61+
| add_weight(digit + addzero**2, 0.8)
62+
| add_weight(addzero**3, 1.0)))
6163
# 10001111, 1001111, 101111, 11111, 10111, 10011, 10001, 10000
6264
if self.enable_million:
63-
ten_thousand = ((thousand | hundred | teen | tens | digits)
64-
+ delete('万')
65-
+ (thousand
66-
| add_weight(zero + hundred, 0.1)
67-
| add_weight(addzero + zero + (tens | teen), 0.5)
68-
| add_weight(addzero + addzero + zero + digit, 0.5)
69-
| add_weight(digit + addzero**3, 0.8)
70-
| add_weight(addzero**4, 1.0)))
65+
ten_thousand = (
66+
(thousand | hundred | teen | tens | digits) + delete('万') +
67+
(thousand
68+
| add_weight(zero + hundred, 0.1)
69+
| add_weight(addzero + zero + (tens | teen), 0.5)
70+
| add_weight(addzero + addzero + zero + digit, 0.5)
71+
| add_weight(digit + addzero**3, 0.8)
72+
| add_weight(addzero**4, 1.0)))
7173
else:
72-
ten_thousand = ((teen | tens | digits)
73-
+ delete('万')
74-
+ (thousand
75-
| add_weight(zero + hundred, 0.1)
76-
| add_weight(addzero + zero + (tens | teen), 0.5)
77-
| add_weight(addzero + addzero + zero + digit, 0.5)
78-
| add_weight(digit + addzero**3, 0.8)
79-
| add_weight(addzero**4, 1.0)))
80-
ten_thousand |= (thousand | hundred) + accep("万") + delete("零").ques + (
81-
thousand | hundred | tens | teen | digits).ques
74+
ten_thousand = (
75+
(teen | tens | digits) + delete('万') +
76+
(thousand
77+
| add_weight(zero + hundred, 0.1)
78+
| add_weight(addzero + zero + (tens | teen), 0.5)
79+
| add_weight(addzero + addzero + zero + digit, 0.5)
80+
| add_weight(digit + addzero**3, 0.8)
81+
| add_weight(addzero**4, 1.0)))
82+
ten_thousand |= (thousand | hundred) + accep("万") + delete(
83+
"零").ques + (thousand | hundred | tens | teen | digits).ques
8284
# 个/十/百/千/万
8385
number = digits | teen | tens | hundred | thousand | ten_thousand
8486
# 兆/亿
@@ -107,31 +109,31 @@ def build_tagger(self):
107109
# 十/百/千/万
108110
number_exclude_0_to_9 = teen | tens | hundred | thousand | ten_thousand
109111
# 兆/亿
110-
number_exclude_0_to_9 = (
111-
((number_exclude_0_to_9 | digits) + accep('兆') + delete('零').ques).ques +
112-
((number_exclude_0_to_9 | digits) + accep('亿') + delete('零').ques).ques +
113-
number_exclude_0_to_9
114-
)
112+
number_exclude_0_to_9 = (((number_exclude_0_to_9 | digits) +
113+
accep('兆') + delete('零').ques).ques +
114+
((number_exclude_0_to_9 | digits) +
115+
accep('亿') + delete('零').ques).ques +
116+
number_exclude_0_to_9)
115117
# 负的xxx 1.11, 1.01
116-
number_exclude_0_to_9 |= (
117-
(number_exclude_0_to_9 | digits) +
118-
(dot + digits.plus).plus
119-
)
118+
number_exclude_0_to_9 |= ((number_exclude_0_to_9 | digits) +
119+
(dot + digits.plus).plus)
120120
# 五六万,三五千,六七百,三四十
121121
# 十七八美元 => $17~18, 四十五六岁 => 45-6岁,
122122
# 三百七八公里 => 370-80km, 三百七八十千克 => 370-80kg
123123
number_exclude_0_to_9 |= special_2number
124124
number_exclude_0_to_9 |= add_weight(special_3number, -100.0)
125125

126-
self.number_exclude_0_to_9 = (sign.ques + number_exclude_0_to_9).optimize() # noqa
126+
self.number_exclude_0_to_9 = (sign.ques +
127+
number_exclude_0_to_9).optimize() # noqa
127128

128129
# cardinal string like 127.0.0.1, used in ID, IP, etc.
129130
cardinal = digits.plus + (dot + digits.plus).plus
130131
# float number like 1.11
131132
cardinal |= (number + dot + digits.plus)
132133
# cardinal string like 110 or 12306 or 13125617878, used in phone,
133134
# 340621199806051223, used in ID card
134-
cardinal |= (digits**3 | digits**4 | digits**5 | digits**11 | digits**18)
135+
cardinal |= (digits**3 | digits**4 | digits**5 | digits**11
136+
| digits**18)
135137
# cardinal string like 23
136138
if self.enable_standalone_number:
137139
if self.enable_0_to_9:

itn/chinese/rules/date.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,11 +27,11 @@ def __init__(self):
2727

2828
def build_tagger(self):
2929
digit = string_file('itn/chinese/data/number/digit.tsv') # 1 ~ 9
30-
zero = string_file('itn/chinese/data/number/zero.tsv') # 0
30+
zero = string_file('itn/chinese/data/number/zero.tsv') # 0
3131

3232
yyyy = digit + (digit | zero)**3 # 二零零八年
33-
yyy = digit + (digit | zero)**2 # 公元一六八年
34-
yy = (digit | zero)**2 # 零八年奥运会
33+
yyy = digit + (digit | zero)**2 # 公元一六八年
34+
yy = (digit | zero)**2 # 零八年奥运会
3535
mm = string_file('itn/chinese/data/date/mm.tsv')
3636
dd = string_file('itn/chinese/data/date/dd.tsv')
3737

itn/chinese/rules/fraction.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -28,16 +28,15 @@ def __init__(self):
2828

2929
def build_tagger(self):
3030
number = Cardinal().number
31-
sign = string_file('itn/chinese/data/number/sign.tsv') # + -
31+
sign = string_file('itn/chinese/data/number/sign.tsv') # + -
3232

3333
# NOTE(xcsong): default weight = 1.0, set to -1.0 means higher priority
3434
# For example,
3535
# 1.0, 负二分之三 -> { sign: "" denominator: "-2" numerator: "3" }
3636
# -1.0,负二分之三 -> { sign: "-" denominator: "2" numerator: "3" }
3737
tagger = (insert('sign: "') + add_weight(sign, -1.0).ques +
38-
insert('" denominator: "') + number +
39-
delete('分之') + insert('" numerator: "') +
40-
number + insert('"'))
38+
insert('" denominator: "') + number + delete('分之') +
39+
insert('" numerator: "') + number + insert('"'))
4140
self.tagger = self.add_tokens(tagger)
4241

4342
def build_verbalizer(self):

itn/chinese/rules/license_plate.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,8 @@ def __init__(self):
2727

2828
def build_tagger(self):
2929
digit = string_file('itn/chinese/data/number/digit.tsv') # 1 ~ 9
30-
province = string_file('itn/chinese/data/license_plate/province.tsv') # 皖
30+
province = string_file(
31+
'itn/chinese/data/license_plate/province.tsv') # 皖
3132
license_plate = province + self.ALPHA + (self.ALPHA | digit)**5
3233
tagger = insert('value: "') + license_plate + insert('"')
3334
self.tagger = self.add_tokens(tagger)

itn/chinese/rules/measure.py

Lines changed: 10 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -32,23 +32,23 @@ def build_tagger(self):
3232
units_en = string_file('itn/chinese/data/measure/units_en.tsv')
3333
units_zh = string_file('itn/chinese/data/measure/units_zh.tsv')
3434
digit = string_file('itn/chinese/data/number/digit.tsv') # 1 ~ 9
35-
sign = string_file('itn/chinese/data/number/sign.tsv') # + -
35+
sign = string_file('itn/chinese/data/number/sign.tsv') # + -
3636
to = cross('到', '~') | cross('到百分之', '~')
3737

38-
units = add_weight((accep('亿') | accep('兆') | accep('万')), -0.5).ques + units_zh
39-
units |= add_weight((cross('亿', '00M') | cross('兆', 'T') |
40-
cross('万', 'W')), -0.5).ques + (
41-
add_weight(units_en, -1.0)
42-
)
38+
units = add_weight(
39+
(accep('亿') | accep('兆') | accep('万')), -0.5).ques + units_zh
40+
units |= add_weight(
41+
(cross('亿', '00M') | cross('兆', 'T') | cross('万', 'W')),
42+
-0.5).ques + (add_weight(units_en, -1.0))
4343

4444
number = Cardinal().number if self.enable_0_to_9 else \
4545
Cardinal().number_exclude_0_to_9
4646
# 百分之三十, 百分三十, 百分之百,百分之三十到四十, 百分之三十到百分之五十五
4747
percent = ((sign + delete('的').ques).ques + delete('百分') +
4848
delete('之').ques +
4949
((Cardinal().number + (to + Cardinal().number).ques) |
50-
((Cardinal().number + to).ques + cross('百', '100')))
51-
+ insert('%'))
50+
((Cardinal().number + to).ques + cross('百', '100'))) +
51+
insert('%'))
5252

5353
# 十千米每小时 => 10km/h, 十一到一百千米每小时 => 11~100km/h
5454
measure = number + (to + number).ques + units
@@ -57,9 +57,8 @@ def build_tagger(self):
5757
tagger = insert('value: "') + (measure | percent) + insert('"')
5858

5959
# 每小时十千米 => 10km/h, 每小时三十到三百一十一千米 => 30~311km/h
60-
tagger |= (
61-
insert('denominator: "') + delete('每') + units +
62-
insert('" numerator: "') + measure + insert('"'))
60+
tagger |= (insert('denominator: "') + delete('每') + units +
61+
insert('" numerator: "') + measure + insert('"'))
6362

6463
self.tagger = self.add_tokens(tagger)
6564

itn/chinese/rules/money.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -39,9 +39,9 @@ def build_tagger(self):
3939
# 三千三百八十元五毛八分 => ¥3380.58
4040
tagger = (insert('value: "') + number + insert('"') +
4141
insert(' currency: "') + (code | symbol) + insert('"') +
42-
insert(' decimal: "') + (
43-
insert(".") + digit + (delete("毛") | delete("角")) + (digit + delete("分")).ques
44-
).ques + insert('"'))
42+
insert(' decimal: "') +
43+
(insert(".") + digit + (delete("毛") | delete("角")) +
44+
(digit + delete("分")).ques).ques + insert('"'))
4545
self.tagger = self.add_tokens(tagger)
4646

4747
def build_verbalizer(self):

itn/chinese/rules/time.py

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -31,11 +31,10 @@ def build_tagger(self):
3131
s = string_file('itn/chinese/data/time/second.tsv')
3232
noon = string_file('itn/chinese/data/time/noon.tsv')
3333

34-
tagger = (
35-
(insert('noon: "') + noon + insert('" ')).ques +
36-
insert('hour: "') + h + insert('"') +
37-
insert(' minute: "') + m + delete('分').ques + insert('"') +
38-
(insert(' second: "') + s + insert('"')).ques)
34+
tagger = ((insert('noon: "') + noon + insert('" ')).ques +
35+
insert('hour: "') + h + insert('"') + insert(' minute: "') +
36+
m + delete('分').ques + insert('"') +
37+
(insert(' second: "') + s + insert('"')).ques)
3938
self.tagger = self.add_tokens(tagger)
4039

4140
def build_verbalizer(self):
@@ -44,6 +43,6 @@ def build_verbalizer(self):
4443
minute = delete(' minute: "') + self.SIGMA + delete('"')
4544
second = delete(' second: "') + self.SIGMA + delete('"')
4645
noon = delete(' noon: "') + self.SIGMA + delete('"')
47-
verbalizer = (hour + addcolon + minute +
48-
(addcolon + second).ques + noon.ques)
46+
verbalizer = (hour + addcolon + minute + (addcolon + second).ques +
47+
noon.ques)
4948
self.verbalizer = self.delete_tokens(verbalizer)

0 commit comments

Comments
 (0)