
Commit 34b253e

Merge pull request #19 from dataiku/feature/japanese-support
Japanese support
2 parents 4686186 + 4b5e37c

7 files changed: +18 / -7 lines


README.md

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 
 ![Build status](https://github.com/dataiku/dss-plugin-nlp-visualization/actions/workflows/auto-make.yml/badge.svg) ![GitHub release (latest by date)](https://img.shields.io/github/v/release/dataiku/dss-plugin-nlp-visualization?logo=github) ![Support level](https://img.shields.io/badge/support-Unsupported-orange)
 
-This Dataiku DSS plugin provides a recipe to visualize text data in 58 languages using word clouds.
+This Dataiku DSS plugin provides a recipe to visualize text data in 59 languages using word clouds.
 
 Documentation: https://www.dataiku.com/product/plugins/nlp-visualization/
 

code-env/python/spec/requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@ pymorphy2==0.9.1
 jieba==0.42.1
 pyvi==0.1
 regex==2020.11.13
-spacy[lookups,th]==2.3.5
+spacy[ja,lookups,th]==2.3.5
 emoji==1.2.0
 tqdm==4.50.2
 matplotlib==3.3.1
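
The added "ja" extra is what makes the new language work: in spaCy 2.3 it pulls in SudachiPy and its dictionary, which back spaCy's Japanese tokenizer. A minimal sketch of what the rebuilt code environment enables (the sample sentence and printed split are illustrative, not taken from the plugin):

    import spacy

    # spaCy's "ja" extra installs SudachiPy and SudachiDict-core; without them,
    # creating the blank Japanese pipeline fails with an import error.
    nlp = spacy.blank("ja")
    doc = nlp("そうです。")
    print([token.text for token in doc])  # e.g. ['そう', 'です', '。']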

plugin.json

Lines changed: 2 additions & 2 deletions

@@ -4,7 +4,7 @@
     "meta": {
         "label": "Text visualization",
         "category": "Natural Language Processing",
-        "description": "Visualize text data in 58 languages using word clouds",
+        "description": "Visualize text data in 59 languages using word clouds",
         "author": "Dataiku (Alex LANDEAU, Alex COMBESSIE)",
         "icon": "icon-quote-left",
         "tags": [
@@ -14,4 +14,4 @@
         "licenseInfo": "Apache Software License",
         "supportLevel": "NOT_SUPPORTED"
     }
-}
+}

python-lib/language_dict.py

Lines changed: 2 additions & 1 deletion

@@ -28,6 +28,7 @@
     "id": "Indonesian",
     "is": "Icelandic",
     "it": "Italian",
+    "ja": "Japanese",
     "kn": "Kannada",
     "lb": "Luxembourgish",
     "lt": "Lithuanian",
@@ -64,7 +65,7 @@
 """dict: Languages supported by spaCy: https://spacy.io/usage/models#languages
 
 Dictionary with ISO 639-1 language code (key) and language name (value)
-Japanese and Korean were excluded for now because of system installation issues
+Korean is excluded for now because of system installation issues
 """
 
 SPACY_LANGUAGE_MODELS = {
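
For reference, the dictionary edited above maps ISO 639-1 codes to language names, so the new "ja" entry is what lets the recipe accept Japanese as a language choice. A minimal sketch of that kind of lookup, assuming the dictionary is exposed under a name such as SUPPORTED_LANGUAGES (the variable name is not visible in this hunk):

    # Hypothetical excerpt; only a few entries from the hunk above are reproduced here.
    SUPPORTED_LANGUAGES = {"it": "Italian", "ja": "Japanese", "kn": "Kannada"}

    def language_name(language_code: str) -> str:
        """Return the display name for an ISO 639-1 code, or raise if it is unsupported."""
        try:
            return SUPPORTED_LANGUAGES[language_code]
        except KeyError:
            raise ValueError(f"Unsupported language code: '{language_code}'") from None

    print(language_name("ja"))  # Japanese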

python-lib/wordcloud_visualizer.py

Lines changed: 1 addition & 0 deletions

@@ -65,6 +65,7 @@ class WordcloudVisualizer:
     """
     FONT_EXCEPTIONS_DICT = {
         "gu": "NotoSansMerged-Regular-2048upem.ttf",
+        "ja": "NotoSansCJKjp-Regular.otf",
         "kn": "NotoSansMerged-Regular-2048upem.ttf",
         "ml": "NotoSansMerged-Regular-2048upem.ttf",
         "te": "NotoSansMerged-Regular-2048upem.ttf",
Binary file (15.7 MB) not shown.
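
The new FONT_EXCEPTIONS_DICT entry points Japanese at a CJK-capable Noto font (presumably the 15.7 MB binary added in this commit), since a Latin-only font cannot render kana or kanji. A sketch of how such an exception table is typically consulted when rendering a word cloud; the fallback font name, fonts folder, and word frequencies are assumptions, not taken from the plugin:

    import os
    from wordcloud import WordCloud

    FONT_EXCEPTIONS_DICT = {"ja": "NotoSansCJKjp-Regular.otf"}  # excerpt from the hunk above
    DEFAULT_FONT = "NotoSansDisplay-Regular.ttf"  # hypothetical fallback, not shown in this diff

    def font_path_for(language: str, fonts_folder: str) -> str:
        """Use the language-specific font when one is listed, the default font otherwise."""
        return os.path.join(fonts_folder, FONT_EXCEPTIONS_DICT.get(language, DEFAULT_FONT))

    # Illustrative frequencies; generate_from_frequencies is standard wordcloud-package API.
    frequencies = {"そう": 3, "です": 2, "異体同心": 1}
    cloud = WordCloud(font_path=font_path_for("ja", "resource/fonts"), background_color="white")
    image = cloud.generate_from_frequencies(frequencies).to_image()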

tests/python/unit/test_spacy_tokenizer.py

Lines changed: 11 additions & 2 deletions

@@ -20,19 +20,28 @@ def test_tokenize_df_english():
     assert len(tokenized_document) == 15
 
 
+def test_tokenize_df_japanese():
+    input_df = pd.DataFrame({"input_text": ["期一会。 異体同心。 そうです。"]})
+    tokenizer = MultilingualTokenizer()
+    output_df = tokenizer.tokenize_df(df=input_df, text_column="input_text", language="ja")
+    tokenized_document = output_df[tokenizer.tokenized_column][0]
+    assert len(tokenized_document) == 9
+
+
 def test_tokenize_df_multilingual():
     input_df = pd.DataFrame(
         {
             "input_text": [
                 "I hope nothing. I fear nothing. I am free.",
                 " Les sanglots longs des violons d'automne",
                 "子曰:“學而不思則罔,思而不學則殆。”",
+                "期一会。 異体同心。 そうです。",
             ],
-            "language": ["en", "fr", "zh"],
+            "language": ["en", "fr", "zh", "ja"],
         }
     )
     tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
     output_df = tokenizer.tokenize_df(df=input_df, text_column="input_text", language_column="language")
     tokenized_documents = output_df[tokenizer.tokenized_column]
     tokenized_documents_length = [len(doc) for doc in tokenized_documents]
-    assert tokenized_documents_length == [12, 8, 13]
+    assert tokenized_documents_length == [12, 8, 13, 9]
