Skip to content

Commit 158de3c

Browse files
author
Craig Cornelius
authored
ICU4X Segmenter now working for 2.0 beta2 (#465)
* Starting Segmenter tests in ICU4X * ICU4X segmenter in beta2.02 and later * Remove option to break lines within words
1 parent d8476cc commit 158de3c

File tree

4 files changed

+101
-1
lines changed

4 files changed

+101
-1
lines changed

executors/rust/common/run_all_tests.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
// 9. DONE Modularize into separate files for each type of test
1313
// 10. Fix test_type and switch statement
1414
// 11. DONE Add language names --> locale names
15+
// 12. How to add a new component only in certain versions?
1516

1617
// References for ICU4X:
1718
// https://unicode-org.github.io/icu4x-docs/doc/icu_collator/index.html
@@ -34,6 +35,7 @@ pub struct ExecutorFns {
3435
pub run_numberformat_test: ExecutorFn,
3536
pub run_plural_rules_test: ExecutorFn,
3637
pub run_relativedatetimeformat_test: ExecutorFn,
38+
pub run_segmenter_test: ExecutorFn,
3739
}
3840

3941
pub fn main() -> io::Result<()> {
@@ -47,6 +49,10 @@ pub fn main() -> io::Result<()> {
4749
run_plural_rules_test: executors::pluralrules::run_plural_rules_test,
4850
run_relativedatetimeformat_test:
4951
executors::relativedatetime_fmt::run_relativedatetimeformat_test,
52+
#[cfg(not(any(ver = "1.3", ver = "1.4", ver = "1.5", ver = "2.0-beta1")))]
53+
run_segmenter_test: executors::segmenter::run_segmenter_test,
54+
#[cfg(any(ver = "1.3", ver = "1.4", ver = "1.5", ver = "2.0-beta1"))]
55+
run_segmenter_test: |_| Err("segmenter not supported".to_string()),
5056
};
5157
run_all_tests(executor_fns)
5258
}
@@ -126,6 +132,8 @@ pub fn run_all_tests(fns: ExecutorFns) -> io::Result<()> {
126132
(fns.run_plural_rules_test)(&json_info)
127133
} else if test_type == "rdt_fmt" {
128134
(fns.run_relativedatetimeformat_test)(&json_info)
135+
} else if test_type == "segmenter" {
136+
(fns.run_segmenter_test)(&json_info)
129137
} else {
130138
Err(test_type.to_string())
131139
};

executors/rust/src/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ pub mod localenames;
1010
pub mod numberfmt;
1111
pub mod pluralrules;
1212
pub mod relativedatetime_fmt;
13+
#[cfg(not(any(ver = "1.3", ver = "1.4", ver = "1.5", ver = "2.0-beta1")))]
14+
pub mod segmenter;
1315

1416
#[cfg(any(ver = "1.3", ver = "1.4", ver = "1.5"))]
1517
#[path = "datetime_1.rs"]

executors/rust/src/segmenter.rs

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
//! Executor provides tests for segmenter in locale-sensitive manner.
2+
3+
use serde::{Deserialize, Serialize};
4+
use serde_json::{json, Value};
5+
6+
#[cfg(not(any(ver = "2.0-beta1")))]
7+
use icu::segmenter::{options::*, *};
8+
9+
use super::compat::Locale;
10+
11+
/// Options deserialized from the `"options"` value of a segmenter test case.
#[derive(Deserialize, Serialize, Debug)]
12+
#[serde(rename_all = "snake_case")]
13+
struct SegmenterOptions {
14+
/// Kind of segmentation requested by the test. `run_segmenter_test` matches
/// this against "grapheme_cluster", "word", "sentence" and "line".
granularity: Option<String>,
15+
}
16+
17+
// Function runs segmenter tests
18+
pub fn run_segmenter_test(json_obj: &Value) -> Result<Value, String> {
19+
// To use the locale
20+
let label = &json_obj["label"].as_str().unwrap();
21+
22+
let locale_str: &str = json_obj["locale"].as_str().unwrap();
23+
let locale = locale_str.parse::<Locale>().unwrap();
24+
let lang_identifier = locale.id;
25+
26+
let options = &json_obj["options"]; // This will be an array.
27+
let option_struct: SegmenterOptions = serde_json::from_str(&options.to_string()).unwrap();
28+
29+
let granularity: &str = option_struct.granularity.as_ref().unwrap();
30+
31+
let input_string: &str = json_obj["input"].as_str().unwrap();
32+
33+
// Get desired segmenter, then comput the break points from the input.
34+
let breakpoints: Vec<usize> = match granularity {
35+
"grapheme_cluster" => {
36+
let segmenter = GraphemeClusterSegmenter::new();
37+
segmenter.segment_str(input_string).collect()
38+
}
39+
"word" => {
40+
// Get options
41+
let mut options = WordBreakOptions::default();
42+
options.content_locale = Some(&lang_identifier);
43+
44+
let segmenter = WordSegmenter::try_new_auto(options).unwrap();
45+
segmenter.segment_str(input_string).collect()
46+
}
47+
"sentence" => {
48+
// Get options
49+
let mut options = SentenceBreakOptions::default();
50+
options.content_locale = Some(&lang_identifier);
51+
let segmenter = SentenceSegmenter::try_new(options).unwrap();
52+
segmenter.segment_str(input_string).collect()
53+
}
54+
"line" => {
55+
// Get options
56+
let mut options = LineBreakOptions::default();
57+
options.strictness = Some(LineBreakStrictness::Strict);
58+
options.content_locale = Some(&lang_identifier);
59+
let segmenter = LineSegmenter::new_auto(options);
60+
segmenter.segment_str(input_string).collect()
61+
}
62+
_ => {
63+
// This is an error
64+
return Ok(json!({
65+
"label": label,
66+
"locale_label": locale_str,
67+
"error": "Unknown segmenter option",
68+
"error_type": "unsupported",
69+
"unsupported": granularity.to_string(),
70+
"error_detail": {"unsupported_locale": locale_str}
71+
}));
72+
}
73+
};
74+
75+
// Create output as list of strings broken ad the breakpoints.
76+
// For breakpoints, extract each part and push to result;
77+
let mut segments: Vec<String> = vec![];
78+
let mut start: usize = 0;
79+
for index in breakpoints.iter() {
80+
if *index > 0 {
81+
segments.push(input_string[start..*index].to_string());
82+
}
83+
start = *index;
84+
}
85+
Ok(json!({
86+
"label": label,
87+
"result": segments
88+
}))
89+
}

run_config.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -522,7 +522,8 @@
522522
"likely_subtags",
523523
"list_fmt",
524524
"plural_rules",
525-
"rdt_fmt"
525+
"rdt_fmt",
526+
"segmenter"
526527
],
527528
"per_execution": 10000
528529
}

0 commit comments

Comments
 (0)