
Commit 16f7f56

fix(*): fix missing Japanese tokenization

1 parent: 1f5a24d

7 files changed: 415 additions & 396 deletions


.vscode/settings.json

Lines changed: 2 additions & 1 deletion

@@ -8,5 +8,6 @@
   "debug.javascript.defaultRuntimeExecutable": {
     "pwa-node": "/Users/tjnickerson/.local/share/mise/shims/node"
   },
-  "python.defaultInterpreterPath": "${workspaceFolder}/.venv"
+  "python.defaultInterpreterPath": "${workspaceFolder}/.venv",
+  "python.languageServer": "None"
 }

Cargo.lock

Lines changed: 24 additions & 3 deletions
(Generated file; diff not rendered by default.)

lib/Cargo.toml

Lines changed: 1 addition & 1 deletion

@@ -40,7 +40,7 @@ tokenize = [
 ]
 tokenize-latin = ["dep:charabia"]
 tokenize-chinese = ["tokenize-latin", "charabia/chinese-segmentation"]
-tokenize-japanese = ["tokenize-latin", "charabia/japanese-segmentation-unidic"]
+tokenize-japanese = ["tokenize-latin", "charabia/japanese"]
 tokenize-thai = ["tokenize-latin", "charabia/thai"]
 tokenize-korean = ["tokenize-latin", "charabia/korean"]
 tokenize-khmer = ["tokenize-latin", "charabia/khmer"]
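
The swap from `charabia/japanese-segmentation-unidic` to `charabia/japanese` is the whole fix. The likely cause (an assumption, not confirmed by this diff): charabia gates its Japanese segmenter behind the umbrella `japanese` feature, while the `japanese-segmentation-unidic` sub-feature only selects a dictionary backend, so depending on the sub-feature alone left the segmenter compiled out. A minimal Rust sketch of that failure mode; the functions are hypothetical and only illustrate the cfg gating, not charabia's or odict's actual code:

// Hypothetical illustration of feature-gated segmentation. If a caller
// enables only a sub-feature that does not also activate `japanese`,
// the first definition is never compiled and Japanese text passes
// through unsegmented: the "missing tokenization" the commit title names.
#[cfg(feature = "japanese")]
fn segment(input: &str) -> Vec<String> {
    // Dictionary-backed segmentation would run here.
    input.split_whitespace().map(str::to_string).collect()
}

#[cfg(not(feature = "japanese"))]
fn segment(input: &str) -> Vec<String> {
    vec![input.to_string()] // whole string falls through as one token
}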

lib/tests/resolve.rs

Lines changed: 0 additions & 1 deletion

@@ -2,7 +2,6 @@ mod helpers;
 
 #[cfg(test)]
 mod resolve_tests {
-    use indexmap::indexset;
 
     use odict::{
         entryset,

lib/tests/tokenize.rs

Lines changed: 16 additions & 0 deletions

@@ -67,6 +67,22 @@ mod tokenize_tests {
         }
     }
 
+    #[test]
+    fn test_tokenize_japanese() {
+        let dict = EXAMPLE_DICT_1.contents().unwrap();
+        let result = dict.tokenize("今日は良い天気です", TokenizeOptions::default());
+        let res = result.as_ref().unwrap();
+        let expected_lemmas = ["今日", "は", "良い", "天気", "です"];
+
+        assert!(result.is_ok());
+        assert_eq!(res.len(), expected_lemmas.len());
+
+        for (i, token) in res.iter().enumerate() {
+            assert_eq!(token.language, Some(Language::Jpn));
+            assert_eq!(token.lemma, expected_lemmas[i]);
+        }
+    }
+
     #[test]
     fn test_tokenize_case_sensitive() {
         let dict = EXAMPLE_DICT_1.contents().unwrap();
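
Outside the test harness, the same API reads as the sketch below. `odict::Dictionary` as the receiver type and the error handling are assumptions (EXAMPLE_DICT_1 is a test fixture); `tokenize`, `TokenizeOptions::default()`, and the `language`/`lemma` token fields come straight from the test above:

use odict::TokenizeOptions;

// A usage sketch, assuming a `Dictionary` value loaded elsewhere; only
// the calls exercised by test_tokenize_japanese above are relied on.
fn print_japanese_tokens(dict: &odict::Dictionary) {
    let tokens = dict
        .tokenize("今日は良い天気です", TokenizeOptions::default())
        .expect("tokenize requires the tokenize-japanese feature");
    for token in &tokens {
        // Prints e.g. `Some(Jpn): 今日` for each segmented lemma.
        println!("{:?}: {}", token.language, token.lemma);
    }
}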
