|
| 1 | +//! Executor that runs segmenter tests in a locale-sensitive manner. |
| 2 | +
|
| 3 | +use serde::{Deserialize, Serialize}; |
| 4 | +use serde_json::{json, Value}; |
| 5 | + |
| 6 | +#[cfg(not(any(ver = "2.0-beta1")))] |
| 7 | +use icu::segmenter::{options::*, *}; |
| 8 | + |
| 9 | +use super::compat::Locale; |
| 10 | + |
| 11 | +#[derive(Deserialize, Serialize, Debug)] |
| 12 | +#[serde(rename_all = "snake_case")] |
| 13 | +struct SegmenterOptions { |
| 14 | + granularity: Option<String>, |
| 15 | +} |
| 16 | + |
| 17 | +// Function runs segmenter tests |
| 18 | +pub fn run_segmenter_test(json_obj: &Value) -> Result<Value, String> { |
| 19 | + // To use the locale |
| 20 | + let label = &json_obj["label"].as_str().unwrap(); |
| 21 | + |
| 22 | + let locale_str: &str = json_obj["locale"].as_str().unwrap(); |
| 23 | + let locale = locale_str.parse::<Locale>().unwrap(); |
| 24 | + let lang_identifier = locale.id; |
| 25 | + |
| 26 | + let options = &json_obj["options"]; // This will be an array. |
| 27 | + let option_struct: SegmenterOptions = serde_json::from_str(&options.to_string()).unwrap(); |
| 28 | + |
| 29 | + let granularity: &str = option_struct.granularity.as_ref().unwrap(); |
| 30 | + |
| 31 | + let input_string: &str = json_obj["input"].as_str().unwrap(); |
| 32 | + |
| 33 | + // Get desired segmenter, then comput the break points from the input. |
| 34 | + let breakpoints: Vec<usize> = match granularity { |
| 35 | + "grapheme_cluster" => { |
| 36 | + let segmenter = GraphemeClusterSegmenter::new(); |
| 37 | + segmenter.segment_str(input_string).collect() |
| 38 | + } |
| 39 | + "word" => { |
| 40 | + // Get options |
| 41 | + let mut options = WordBreakOptions::default(); |
| 42 | + options.content_locale = Some(&lang_identifier); |
| 43 | + |
| 44 | + let segmenter = WordSegmenter::try_new_auto(options).unwrap(); |
| 45 | + segmenter.segment_str(input_string).collect() |
| 46 | + } |
| 47 | + "sentence" => { |
| 48 | + // Get options |
| 49 | + let mut options = SentenceBreakOptions::default(); |
| 50 | + options.content_locale = Some(&lang_identifier); |
| 51 | + let segmenter = SentenceSegmenter::try_new(options).unwrap(); |
| 52 | + segmenter.segment_str(input_string).collect() |
| 53 | + } |
| 54 | + "line" => { |
| 55 | + // Get options |
| 56 | + let mut options = LineBreakOptions::default(); |
| 57 | + options.strictness = Some(LineBreakStrictness::Strict); |
| 58 | + options.content_locale = Some(&lang_identifier); |
| 59 | + let segmenter = LineSegmenter::new_auto(options); |
| 60 | + segmenter.segment_str(input_string).collect() |
| 61 | + } |
| 62 | + _ => { |
| 63 | + // This is an error |
| 64 | + return Ok(json!({ |
| 65 | + "label": label, |
| 66 | + "locale_label": locale_str, |
| 67 | + "error": "Unknown segmenter option", |
| 68 | + "error_type": "unsupported", |
| 69 | + "unsupported": granularity.to_string(), |
| 70 | + "error_detail": {"unsupported_locale": locale_str} |
| 71 | + })); |
| 72 | + } |
| 73 | + }; |
| 74 | + |
| 75 | + // Create output as list of strings broken ad the breakpoints. |
| 76 | + // For breakpoints, extract each part and push to result; |
| 77 | + let mut segments: Vec<String> = vec![]; |
| 78 | + let mut start: usize = 0; |
| 79 | + for index in breakpoints.iter() { |
| 80 | + if *index > 0 { |
| 81 | + segments.push(input_string[start..*index].to_string()); |
| 82 | + } |
| 83 | + start = *index; |
| 84 | + } |
| 85 | + Ok(json!({ |
| 86 | + "label": label, |
| 87 | + "result": segments |
| 88 | + })) |
| 89 | +} |
0 commit comments