|
1 | 1 | import os |
2 | 2 | import re |
| 3 | +from types import SimpleNamespace |
3 | 4 | import sys |
| 5 | +import jieba |
4 | 6 | from nltk.stem.snowball import SnowballStemmer |
5 | 7 | from spacy.tokenizer import Tokenizer |
6 | 8 | import unicodedata |
7 | 9 | import assistant_skill_analysis |
8 | 10 |
|
9 | 11 |
|
10 | | -SUPPORTED_LANGUAGE = ["en", "fr", "de", "cs", "es", "it", "pt", "nl"] |
| 12 | +SUPPORTED_LANGUAGE = ["en", "fr", "de", "cs", "es", "it", "pt", "nl", "zh-cn", "zh-tw"] |
11 | 13 | PUNCTUATION = [ |
12 | 14 | "\\" + chr(i) |
13 | 15 | for i in range(sys.maxunicode) |
14 | 16 | if unicodedata.category(chr(i)).startswith("P") |
15 | 17 | ] |
16 | 18 |
|
17 | 19 |
|
class _JiebaTokenizerWrapper:
    """Tokenizer adapter for Chinese ("zh-cn" and "zh-tw").

    Mimics the call interface of spaCy's ``Tokenizer`` used for the other
    supported languages: calling the instance with a text string yields
    token-like objects exposing a ``.text`` attribute, backed by
    ``jieba.tokenize``.
    """

    def __call__(self, text, *args, **kwargs):
        """Yield jieba tokens of *text* as objects with a ``.text`` attribute.

        :param text: the input string to segment.
        Extra positional/keyword arguments are accepted for call-signature
        compatibility with spaCy's tokenizer but are ignored.
        """
        # jieba.tokenize yields (word, start, end) triples; only the surface
        # form is needed to mirror spaCy's Token.text attribute.
        for word, _start, _end in jieba.tokenize(text):
            yield SimpleNamespace(text=word)
| 28 | + |
18 | 29 | class LanguageUtility: |
19 | 30 | def __init__(self, language_code): |
20 | 31 | if language_code not in SUPPORTED_LANGUAGE: |
@@ -96,6 +107,11 @@ def init_resources(self): |
96 | 107 | self.tokenizer = Tokenizer(Dutch().vocab) |
97 | 108 | self.stemmer = SnowballStemmer(language="dutch") |
98 | 109 | self.stop_words = self.load_stop_words(stopwords_path) |
| 110 | + |
| 111 | + elif self.language_code in ["zh-cn", "zh-tw"]: |
| 112 | + self.tokenizer = _JiebaTokenizerWrapper() |
| 113 | + self.stop_words = self.load_stop_words(stopwords_path) |
| 114 | + |
99 | 115 | else: |
100 | 116 | raise Exception("language code %s is not supported", self.language_code) |
101 | 117 |
|
|
0 commit comments