Skip to content

Commit 7bfbd17

Browse files
author
Craig Cornelius
authored
Segmenter icu4j added (#462)
* Add segmenter as test type for nodejs with minimal data
* Update test and verify data
* Add segmenter test cases
* Updated generator to produce segmentation tests from NodeJS
* Updating data gen and characterizing differences in lists
* Remove temporary code
* Fix so segmenter data is recomputed
* Segmenter with ICU4J
* More fixes
* Fixing some compile problems
* ICU4J segmenter ready to run
* Update as per comments on this PR
* Removing unneeded .gitignore items
* Include segmenter in ICU4J executor
* Adding Segmenter in ICU4J
* Hardcoding line break results. Adding new locales in new scripts.
* Make segmenter type an enum
* Add segmenter for versions ICU75, ICU76, ICU77
1 parent 03f0ad1 commit 7bfbd17

File tree

8 files changed

+322
-47
lines changed

8 files changed

+322
-47
lines changed

executors/icu4j/74/executor-icu4j/src/main/java/org/unicode/conformance/Icu4jExecutor.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import org.unicode.conformance.testtype.numberformatter.NumberFormatterTester;
2020
import org.unicode.conformance.testtype.pluralrules.PluralRulesTester;
2121
import org.unicode.conformance.testtype.relativedatetimeformat.RelativeDateTimeFormatTester;
22+
import org.unicode.conformance.testtype.segmenter.SegmenterTester;
2223

2324
/**
2425
* Hello world!
@@ -140,6 +141,8 @@ public static String getTestCaseResponse(String inputLine) throws Exception {
140141
testType = PluralRulesTester.INSTANCE;
141142
} else if (testTypeStr.equals("rdt_fmt")) {
142143
testType = RelativeDateTimeFormatTester.INSTANCE;
144+
} else if (testTypeStr.equals("segmenter")) {
145+
testType = SegmenterTester.INSTANCE;
143146
} else {
144147
io.lacuna.bifurcan.IMap<String,Object> response =
145148
parsedInputPersistentMap
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package org.unicode.conformance.testtype.segmenter;
2+
3+
import java.util.Collection;
4+
import org.unicode.conformance.testtype.ITestTypeInputJson;
5+
6+
/**
 * Plain data holder for one parsed segmenter test case. Instances are filled in by
 * SegmenterTester.inputMapToJson and consumed by SegmenterTester.getSegmenterResult.
 */
public class SegmenterInputJson implements ITestTypeInputJson {
7+
8+
// NOTE(review): not assigned by SegmenterTester.inputMapToJson in this change; presumably the
// test type name ("segmenter") — confirm before relying on it.
public String testType;
9+
10+
// Test case label, read from the "label" key and copied onto the output.
public String label;
11+
12+
// Locale tag; handed to ULocale.forLanguageTag when the break iterator is created.
public String locale;
13+
14+
// Segmentation granularity, derived from the "granularity" entry of the "options" object.
public SegmenterType segmenterType;
15+
16+
// The text to be segmented, read from the "input" key.
public String inputString;
17+
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
package org.unicode.conformance.testtype.segmenter;
2+
3+
import java.util.List;
4+
import org.unicode.conformance.testtype.ITestTypeOutputJson;
5+
6+
import java.util.Arrays;
7+
8+
/**
 * Plain data holder for the outcome of one segmenter test case. SegmenterTester serializes the
 * whole object with Gson in formatOutputJson, and exposes only label and result in
 * convertOutputToMap.
 */
public class SegmenterOutputJson implements ITestTypeOutputJson {
9+
10+
// NOTE(review): not assigned by SegmenterTester in this change; presumably the test type name —
// confirm.
public String test_type;
11+
12+
// Label of the test case this output belongs to, copied from the input.
public String label;
13+
14+
// The segments of the input string, in order of appearance; left unset when execution failed.
public List<String> result;
15+
16+
// Set to the exception message when segmentation throws.
public String error;
17+
18+
// Set to the same exception message as error when segmentation throws.
public String error_message;
19+
}
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
package org.unicode.conformance.testtype.segmenter;
2+
3+
import static com.ibm.icu.text.BreakIterator.getCharacterInstance;
4+
import static com.ibm.icu.text.BreakIterator.getWordInstance;
5+
import static com.ibm.icu.text.BreakIterator.getLineInstance;
6+
import static com.ibm.icu.text.BreakIterator.getSentenceInstance;
7+
8+
import com.ibm.icu.text.BreakIterator;
9+
import com.ibm.icu.util.ULocale;
10+
import io.lacuna.bifurcan.IMap;
11+
import io.lacuna.bifurcan.Map;
12+
import java.util.ArrayList;
13+
import java.util.Collection;
14+
import java.util.List;
15+
import org.unicode.conformance.ExecutorUtils;
16+
import org.unicode.conformance.testtype.ITestType;
17+
import org.unicode.conformance.testtype.ITestTypeInputJson;
18+
import org.unicode.conformance.testtype.ITestTypeOutputJson;
19+
20+
21+
public class SegmenterTester implements ITestType {
22+
23+
public static SegmenterTester INSTANCE = new SegmenterTester();
24+
25+
@Override
26+
public ITestTypeInputJson inputMapToJson(Map<String, Object> inputMapData) {
27+
SegmenterInputJson result = new SegmenterInputJson();
28+
29+
result.label = (String) inputMapData.get("label", null);
30+
result.locale = (String) inputMapData.get("locale", null);
31+
// The string to be segmented
32+
result.locale = (String) inputMapData.get("input", null);
33+
34+
java.util.Map<String, Object> inputOptions =
35+
(java.util.Map<String, Object>) inputMapData.get("options", null);
36+
result.segmenterType = SegmenterType.getFromString((String) inputOptions.get("granularity"));
37+
38+
result.inputString = (String) inputMapData.get("input", null);
39+
40+
return result;
41+
}
42+
43+
@Override
44+
public ITestTypeOutputJson execute(ITestTypeInputJson inputJson) {
45+
SegmenterInputJson input = (SegmenterInputJson) inputJson;
46+
47+
// partially construct output
48+
SegmenterOutputJson output = (SegmenterOutputJson) getDefaultOutputJson();
49+
output.label = input.label;
50+
51+
try {
52+
output.result = getSegmenterResult(input);
53+
} catch (Exception e) {
54+
output.error = e.getMessage();
55+
output.error_message = e.getMessage();
56+
return output;
57+
}
58+
59+
// If we get here, it's a pass/fail result (supported options and no runtime errors/exceptions)
60+
return output;
61+
}
62+
63+
@Override
64+
public ITestTypeOutputJson getDefaultOutputJson() {
65+
return new SegmenterOutputJson();
66+
}
67+
68+
@Override
69+
public IMap<String, Object> convertOutputToMap(ITestTypeOutputJson outputJson) {
70+
SegmenterOutputJson output = (SegmenterOutputJson) outputJson;
71+
return new Map<String, Object>()
72+
.put("label", output.label)
73+
.put("result", output.result);
74+
}
75+
76+
@Override
77+
public String formatOutputJson(ITestTypeOutputJson outputJson) {
78+
return ExecutorUtils.GSON.toJson((SegmenterOutputJson) outputJson);
79+
}
80+
81+
public List<String> getSegmenterResult(SegmenterInputJson input) {
82+
ULocale locale = ULocale.forLanguageTag(input.locale);
83+
84+
BreakIterator segmenter;
85+
switch (input.segmenterType) {
86+
default:
87+
case GRAPHEME_CLUSTER:
88+
segmenter = getCharacterInstance(locale);
89+
break;
90+
case WORD:
91+
segmenter = getWordInstance(locale);
92+
break;
93+
case SENTENCE:
94+
segmenter = getSentenceInstance(locale);
95+
break;
96+
case LINE:
97+
segmenter = getLineInstance(locale);
98+
break;
99+
}
100+
segmenter.setText(input.inputString);
101+
// Segment the input, creating a list of strings as output.
102+
List<String> result = new ArrayList<>();
103+
int start_pos = segmenter.first();
104+
int end_pos = segmenter.next();
105+
106+
while (end_pos != BreakIterator.DONE) {
107+
String target = input.inputString.substring(start_pos, end_pos);
108+
start_pos = end_pos;
109+
end_pos = segmenter.next();
110+
result.add(target);
111+
}
112+
113+
return result;
114+
}
115+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
package org.unicode.conformance.testtype.segmenter;
2+
3+
/**
 * The segmentation granularities a segmenter test case can request.
 */
public enum SegmenterType {
  GRAPHEME_CLUSTER,
  WORD,
  SENTENCE,
  LINE;

  /** Fallback used when the requested granularity is missing or not recognized. */
  public static final SegmenterType DEFAULT = GRAPHEME_CLUSTER;

  /**
   * Maps a granularity name to its enum constant, ignoring case.
   *
   * @param s the granularity name, e.g. {@code "word"} or {@code "grapheme_cluster"}; may be null
   * @return the matching constant, or {@link #DEFAULT} when {@code s} is null or names no
   *     constant
   */
  public static SegmenterType getFromString(String s) {
    if (s == null) {
      return DEFAULT;
    }
    try {
      // Upper-case with Locale.ROOT: under a Turkish or Azeri default locale the plain
      // toUpperCase() turns "line" into "LİNE" (dotted capital I), which matches no constant
      // and would silently select the default granularity instead.
      return SegmenterType.valueOf(s.toUpperCase(java.util.Locale.ROOT));
    } catch (IllegalArgumentException e) {
      // Unknown granularity name: fall back rather than fail the test case.
      return DEFAULT;
    }
  }
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package org.unicode.conformance.segmenter.icu74;
2+
3+
import static org.hamcrest.CoreMatchers.is;
4+
import static org.junit.Assert.assertThat;
5+
6+
import java.util.List;
7+
import java.util.Arrays;
8+
import org.junit.Test;
9+
10+
import org.unicode.conformance.testtype.segmenter.SegmenterOutputJson;
11+
import org.unicode.conformance.testtype.segmenter.SegmenterTester;
12+
13+
public class SegmenterTest {
14+
15+
@Test
16+
public void testEnGraphemeCluster() {
17+
String testInput =
18+
"\t{\"locale\":\"en-US\",\"options\":{\"granularity\":\"grapheme_cluster\"},\"input\":\"The cat;\",\"hexhash\":\"123\",\"label\":\"000\"}";
19+
SegmenterOutputJson output =
20+
(SegmenterOutputJson) SegmenterTester.INSTANCE.getStructuredOutputFromInputStr(testInput);
21+
List<String> expected = Arrays.asList("T", "h", "e", " ", "c", "a", "t", ";");
22+
assertThat(expected, is(output.result));
23+
}
24+
25+
@Test
26+
public void testEnWord() {
27+
String testInput =
28+
"\t{\"locale\":\"en-US\",\"options\":{\"granularity\":\"word\"},\"input\":\"The cat;\",\"hexhash\":\"123\",\"label\":\"000\"}";
29+
SegmenterOutputJson output =
30+
(SegmenterOutputJson) SegmenterTester.INSTANCE.getStructuredOutputFromInputStr(testInput);
31+
List<String> expected = Arrays.asList("The", " ", "cat", ";");
32+
assertThat(expected, is(output.result));
33+
}
34+
35+
@Test
36+
public void testEnSentence() {
37+
String testInput =
38+
"\t{\"locale\":\"en-US\",\"options\":{\"granularity\":\"sentence\"},\"input\":\"The cat. A dog.\",\"hexhash\":\"123\",\"label\":\"000\"}";
39+
SegmenterOutputJson output =
40+
(SegmenterOutputJson) SegmenterTester.INSTANCE.getStructuredOutputFromInputStr(testInput);
41+
List<String> expected = Arrays.asList("The cat. ", "A dog.");
42+
assertThat(expected, is(output.result));
43+
}
44+
45+
@Test
46+
public void testEnLine() {
47+
String testInput =
48+
"\t{\"locale\":\"en-US\",\"options\":{\"granularity\":\"line\"},\"input\":\"The cat. A dog.\",\"hexhash\":\"123\",\"label\":\"000\"}";
49+
SegmenterOutputJson output =
50+
(SegmenterOutputJson) SegmenterTester.INSTANCE.getStructuredOutputFromInputStr(testInput);
51+
List<String> expected = Arrays.asList("The ", "cat. ", "A ", "dog.");
52+
assertThat(expected, is(output.result));
53+
}
54+
}

run_config.json

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,8 @@
196196
"lang_names",
197197
"likely_subtags",
198198
"rdt_fmt",
199-
"plural_rules"
199+
"plural_rules",
200+
"segmenter"
200201
],
201202
"per_execution": 10000
202203
}
@@ -217,8 +218,9 @@
217218
"number_fmt",
218219
"lang_names",
219220
"likely_subtags",
221+
"plural_rules",
220222
"rdt_fmt",
221-
"plural_rules"
223+
"segmenter"
222224
],
223225
"per_execution": 10000
224226
}
@@ -584,7 +586,8 @@
584586
"message_fmt2",
585587
"number_fmt",
586588
"plural_rules",
587-
"rdt_fmt"
589+
"rdt_fmt",
590+
"segmenter"
588591
],
589592
"per_execution": 10000
590593
}
@@ -607,7 +610,8 @@
607610
"message_fmt2",
608611
"number_fmt",
609612
"plural_rules",
610-
"rdt_fmt"
613+
"rdt_fmt",
614+
"segmenter"
611615
],
612616
"per_execution": 10000
613617
}
@@ -630,7 +634,8 @@
630634
"message_fmt2",
631635
"number_fmt",
632636
"plural_rules",
633-
"rdt_fmt"
637+
"rdt_fmt",
638+
"segmenter"
634639
],
635640
"per_execution": 10000
636641
}

0 commit comments

Comments
 (0)