|
1 | 1 | import os |
2 | 2 | import re |
| 3 | +from types import SimpleNamespace |
3 | 4 | import sys |
| 5 | +import jieba |
4 | 6 | from nltk.stem.snowball import SnowballStemmer |
5 | 7 | from spacy.tokenizer import Tokenizer |
6 | 8 | import unicodedata |
7 | 9 | import assistant_skill_analysis |
8 | 10 |
|
9 | 11 |
|
10 | | -SUPPORTED_LANGUAGE = ["en", "fr", "de", "cs", "es", "it", "pt", "nl"] |
| 12 | +SUPPORTED_LANGUAGE = ["en", "fr", "de", "cs", "es", "it", "pt", "nl", "zh-cn", "zh-tw"] |
11 | 13 | PUNCTUATION = [ |
12 | 14 | "\\" + chr(i) |
13 | 15 | for i in range(sys.maxunicode) |
14 | 16 | if unicodedata.category(chr(i)).startswith("P") |
15 | 17 | ] |
16 | 18 |
|
17 | 19 |
|
class _JiebaTokenizerWrapper:
    """Tokenizer adapter for Chinese ("zh-cn" and "zh-tw").

    Mimics the call interface of spaCy's ``Tokenizer`` used for the other
    supported languages: calling the instance with a text string yields
    token-like objects exposing a ``.text`` attribute, backed by
    ``jieba.tokenize``.
    """

    def __call__(self, text, *args, **kwargs):
        """Yield jieba tokens of *text* as objects with a ``.text`` attribute.

        :param text: the input string to segment.
        Extra positional/keyword arguments are accepted for call-signature
        compatibility with spaCy's tokenizer but are ignored.
        """
        # jieba.tokenize yields (word, start, end) triples; only the surface
        # form is needed to mirror spaCy's Token.text attribute.
        for word, _start, _end in jieba.tokenize(text):
            yield SimpleNamespace(text=word)
| 28 | + |
18 | 29 | class LanguageUtility: |
19 | 30 | def __init__(self, language_code): |
20 | 31 | if language_code not in SUPPORTED_LANGUAGE: |
@@ -96,6 +107,11 @@ def init_resources(self): |
96 | 107 | self.tokenizer = Tokenizer(Dutch().vocab) |
97 | 108 | self.stemmer = SnowballStemmer(language="dutch") |
98 | 109 | self.stop_words = self.load_stop_words(stopwords_path) |
| 110 | + |
| 111 | + elif self.language_code in ["zh-cn", "zh-tw"]: |
| 112 | + self.tokenizer = _JiebaTokenizerWrapper() |
| 113 | + self.stop_words = self.load_stop_words(stopwords_path) |
| 114 | + |
99 | 115 | else: |
100 | 116 | raise Exception("language code %s is not supported", self.language_code) |
101 | 117 |
|
|
0 commit comments