Skip to content

Commit 3172999

Browse files
tsinggggg and Cheng Qian authored
feat: add zh-cn, zh-tw support (#69)
Co-authored-by: Cheng Qian <[email protected]>
1 parent e35c6e9 commit 3172999

File tree

10 files changed

+244
-7
lines changed

10 files changed

+244
-7
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Whether you are new to the process and are building your first AI assistant or y
1111
- Why is the assistant responding incorrectly to this question?
1212
- How do I improve my assistant’s ability to understand questions?
1313

14-
Currently Supported Languages: en, fr, cs, de, es, it, pt, nl
14+
Currently Supported Languages: en, fr, cs, de, es, it, pt, nl, zh-cn, zh-tw
1515

1616
## Usage
1717
If you clone the notebook from this repository locally, please use the steps below. For usage in Watson studio, please refer to the
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
<
2+
>
3+
|
4+
-
5+
,
6+
;
7+
:
8+
!
9+
?
10+
.
11+
''
12+
'
13+
"
14+
(
15+
)
16+
[
17+
]
18+
{
19+
}
20+
*
21+
%
22+
+
23+
24+
<SE>
25+
26+
一会儿
27+
一边
28+
一面
29+
30+
31+
32+
不但
33+
不光
34+
不可
35+
不如
36+
不是
37+
不管
38+
不论
39+
40+
与其
41+
42+
43+
44+
45+
之所以
46+
47+
也不
48+
也许
49+
也许是
50+
51+
52+
53+
54+
他们
55+
56+
57+
58+
你们
59+
便
60+
倘若
61+
62+
63+
64+
65+
66+
67+
68+
即使
69+
70+
71+
72+
73+
只有
74+
只要
75+
76+
可以
77+
可是
78+
可能
79+
80+
81+
82+
83+
哪怕
84+
因为
85+
因此
86+
87+
88+
89+
90+
她们
91+
如果
92+
宁可
93+
94+
它们
95+
96+
97+
98+
99+
尽管
100+
101+
已经
102+
103+
并且
104+
105+
106+
我们
107+
108+
109+
所以
110+
111+
112+
113+
无论
114+
115+
既然
116+
117+
118+
是因为
119+
120+
121+
122+
123+
124+
125+
126+
没有
127+
然后
128+
然而
129+
130+
131+
由于
132+
133+
134+
135+
136+
137+
而且
138+
而是
139+
140+
自己
141+
142+
虽然
143+
144+
145+
认为
146+
147+
148+
149+
还是
150+
151+
通过
152+
那么
153+
154+
155+
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
the
2+
of
3+
is
4+
and
5+
to
6+
in
7+
that
8+
we
9+
for
10+
an
11+
are
12+
by
13+
be
14+
as
15+
on
16+
with
17+
can
18+
if
19+
from
20+
which
21+
you
22+
it
23+
this
24+
then
25+
at
26+
have
27+
all
28+
not
29+
one
30+
has
31+
or
32+
that
33+
34+
35+
36+
37+
38+
39+
40+
41+
42+
43+
44+
一個
45+
沒有
46+
我們
47+
你們
48+
妳們
49+
他們
50+
她們
51+
是否

assistant_skill_analysis/utils/lang_utils.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,31 @@
11
import os
22
import re
3+
from types import SimpleNamespace
34
import sys
5+
import jieba
46
from nltk.stem.snowball import SnowballStemmer
57
from spacy.tokenizer import Tokenizer
68
import unicodedata
79
import assistant_skill_analysis
810

911

10-
SUPPORTED_LANGUAGE = ["en", "fr", "de", "cs", "es", "it", "pt", "nl"]
12+
SUPPORTED_LANGUAGE = ["en", "fr", "de", "cs", "es", "it", "pt", "nl", "zh-cn", "zh-tw"]
1113
# Backslash-prefixed form of every code point below sys.maxunicode whose
# Unicode general category starts with "P" (the punctuation categories).
# NOTE(review): the leading backslash presumably escapes each character for
# use inside a regular expression -- confirm against the code that consumes it.
PUNCTUATION = [
    "\\" + char
    for char in map(chr, range(sys.maxunicode))
    if unicodedata.category(char).startswith("P")
]
1618

1719

20+
class _JiebaTokenizerWrapper:
    """Callable adapter around ``jieba.tokenize``, used for zh-cn and zh-tw.

    Calling an instance lazily yields one ``SimpleNamespace`` per item
    produced by ``jieba.tokenize``; the item's first field is exposed as
    the ``text`` attribute.
    """

    def __call__(self, *args, **kwargs):
        # Only the first positional argument is read; any further positional
        # or keyword arguments are accepted and ignored.
        # NOTE(review): the ``.text`` attribute presumably mirrors the tokens
        # returned by the spaCy tokenizers used for the other languages --
        # confirm against the caller.
        sentence = args[0]
        yield from (
            SimpleNamespace(text=segment[0])
            for segment in jieba.tokenize(sentence)
        )
27+
28+
1829
class LanguageUtility:
1930
def __init__(self, language_code):
2031
if language_code not in SUPPORTED_LANGUAGE:
@@ -96,6 +107,11 @@ def init_resources(self):
96107
self.tokenizer = Tokenizer(Dutch().vocab)
97108
self.stemmer = SnowballStemmer(language="dutch")
98109
self.stop_words = self.load_stop_words(stopwords_path)
110+
111+
elif self.language_code in ["zh-cn", "zh-tw"]:
112+
self.tokenizer = _JiebaTokenizerWrapper()
113+
self.stop_words = self.load_stop_words(stopwords_path)
114+
99115
else:
100116
raise Exception("language code %s is not supported", self.language_code)
101117

classic_dialog_skill_analysis.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
"metadata": {},
7474
"source": [
7575
"Pick the language code correspond to your workspace data: \n",
76-
"*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl**"
76+
"*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**"
7777
]
7878
},
7979
{

classic_dialog_skill_analysis_cp4d.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
"metadata": {},
7474
"source": [
7575
"Pick the language code correspond to your workspace data: \n",
76-
"*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl**"
76+
"*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**"
7777
]
7878
},
7979
{

new_experience_skill_analysis.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@
8080
"### Assistant Settings\n",
8181
"Please set values for the variables in the cell below to configure this notebook.\n",
8282
"\n",
83-
"- **LANGUAGE_CODE:** language code correspond to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl**\n",
83+
"- **LANGUAGE_CODE:** language code correspond to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**\n",
8484
"\n",
8585
"- **ASSISTANT_ID:** id of the Watson Assistant service instance\n",
8686
"\n",

new_experience_skill_analysis_cp4d.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@
8080
"### Assistant Settings\n",
8181
"Please set values for the variables in the cell below to configure this notebook. The notebook uses CloudPakForDataAuthenticator to authenticate the APIs.\n",
8282
"\n",
83-
"- **LANGUAGE_CODE:** language code correspond to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl**\n",
83+
"- **LANGUAGE_CODE:** language code correspond to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**\n",
8484
"\n",
8585
"- **ASSISTANT_ID:** id of the Watson Assistant service instance\n",
8686
"\n",

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ scipy>=1.2.0
1111
jupyter
1212
spacy~=2.3.2
1313
ibm-cos-sdk>=2.11.0
14-
nbconvert>=7.7.1
14+
nbconvert>=7.7.1
15+
jieba

tests/utils/test_lang_utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,20 @@ def test_de(self):
6161
sent = util.tokenize(sent)
6262
self.assertEqual(sent, ["autobahn"])
6363

64+
def test_zh_cn(self):
    # zh-cn: preprocess must return the sentence unchanged, and tokenize
    # must split it into the expected word segments.
    raw = "不想当兼职"
    lang_util = LanguageUtility("zh-cn")

    cleaned = lang_util.preprocess(raw)
    self.assertEqual(cleaned, "不想当兼职")

    tokens = lang_util.tokenize(cleaned)
    self.assertEqual(tokens, ["不想", "当", "兼职"])
70+
71+
def test_zh_tw(self):
    # zh-tw: preprocess must return the sentence unchanged, and tokenize
    # must split it into the expected word segments.
    raw = "畀到機會我嘗試"
    lang_util = LanguageUtility("zh-tw")

    cleaned = lang_util.preprocess(raw)
    self.assertEqual(cleaned, "畀到機會我嘗試")

    tokens = lang_util.tokenize(cleaned)
    self.assertEqual(tokens, ["畀", "到", "機會", "我", "嘗試"])
77+
6478
def tearDown(self):
6579
unittest.TestCase.tearDown(self)
6680
self.skill_file.close()

0 commit comments

Comments
 (0)