Skip to content

Commit 03f0ad1

Browse files
author
Craig Cornelius
authored
Add segmenter as test type for nodejs with minimal data (#458)
* Add segmenter as test type for nodejs with minimal data * Update test and verify data * Add segmenter test cases * Updated generator to produce segmentation tests from NodeJS * Updating data gen and characterizing differences in lists * Remove temporary code * Fix so segmenter data is recomputed * Update as per comments on this PR * Removing unneeded .gitignore items * Add classification type
1 parent eb2b33a commit 03f0ad1

File tree

17 files changed

+656
-39
lines changed

17 files changed

+656
-39
lines changed

.gitignore

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
#*#
22
*~
33
*#
4+
*.#
45
.vscode
56

7+
*Minibuf-*
8+
69
.pylintrc
710
.idea
811
.devcontainer
@@ -50,6 +53,3 @@ logrotate.state
5053

5154
# Maven
5255
dependency-reduced-pom.xml
53-
54-
# Dart
55-
executors/dart/.dart_tool/

executors/node/executor.js

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ let plural_rules = require('./plural_rules.js');
3434

3535
let rdt_fmt = require('./relativedatetime_fmt.js');
3636

37+
let segmenter = require('./segmenter.js');
38+
3739
/**
3840
* TODOs:
3941
* 1. Handle other types of test cases.
@@ -64,7 +66,8 @@ const testTypes = {
6466
TestDisplayNames : Symbol("display_names"),
6567
TestListFmt : Symbol("list_fmt"),
6668
TestLocaleDisplayNames : Symbol("language_display_name"),
67-
TestRelativeDateTimeFormat : Symbol("rdt_fmt")
69+
TestRelativeDateTimeFormat : Symbol("rdt_fmt"),
70+
TestSegmenter : Symbol("segmenter")
6871
};
6972

7073
const supported_test_types = [
@@ -79,8 +82,9 @@ const supported_test_types = [
7982
Symbol("local_info"),
8083
Symbol("datetime_fmt"),
8184
Symbol("list_fmt"),
85+
Symbol("plural_rules"),
8286
Symbol("rdt_fmt"),
83-
Symbol("plural_rules")
87+
Symbol("segmenter")
8488
];
8589

8690
const supported_tests_json = {
@@ -93,8 +97,9 @@ const supported_tests_json = {
9397
"lang_names",
9498
"language_display_name",
9599
"list_fmt",
100+
"plural_rules",
96101
"rdt_fmt",
97-
"plural_rules"
102+
"segmenter"
98103
]};
99104

100105
// Test line-by-line input, with output as string.
@@ -166,7 +171,6 @@ function parseJsonForTestId(parsed) {
166171
let lineId = 0;
167172
rl.on('line', function(line) {
168173

169-
// if logging input.
170174
if (doLogInput > 0) {
171175
console.log("## NODE RECEIVED " + lineId + ' ' + line + ' !!!!!');
172176
}
@@ -247,6 +251,9 @@ rl.on('line', function(line) {
247251
} else
248252
if (test_type == "plural_rules") {
249253
outputLine = plural_rules.testPluralRules(parsedJson);
254+
} else
255+
if (test_type == "segmenter") {
256+
outputLine = segmenter.testSegmenter(parsedJson);
250257
} else {
251258
outputLine = {'error': 'unknown test type',
252259
'test_type': test_type,

executors/node/segmenter.js

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// Tests Intl segmenter
2+
3+
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter
4+
5+
const supported_options = ['grapheme', 'word', 'sentence'];
6+
7+
module.exports = {
8+
testSegmenter: function (json) {
9+
const label = json['label'];
10+
const locale = json['locale'];
11+
let options;
12+
let ecma_intl_test_option;
13+
if (json['options']) {
14+
options = json['options'];
15+
ecma_intl_test_option = options['granularity'];
16+
if (options['granularity'] == 'grapheme_cluster') {
17+
// Change to use ECMAScript's enum.
18+
ecma_intl_test_option = 'grapheme';
19+
}
20+
}
21+
22+
let return_json = {'label': label};
23+
if (!supported_options.includes(ecma_intl_test_option)) {
24+
// Not supported
25+
return_json['unsupported'] = 'granularity';
26+
return_json['error_detail'] = ecma_intl_test_option;
27+
return return_json;
28+
}
29+
let segmented_result = [];
30+
try {
31+
segmenter = new Intl.Segmenter(locale, {'granularity': ecma_intl_test_option});
32+
} catch (error) {
33+
/* Something is wrong with the constructor */
34+
return_json['error'] = 'CONSTRUCTOR: ' + error.message;
35+
return return_json;
36+
}
37+
38+
let input;
39+
try {
40+
input = json['input'];
41+
} catch (error) {
42+
return_json['error'] = 'INPUT ERROR: ' + error.message;
43+
return return_json;
44+
}
45+
46+
try {
47+
// Iterate through the results until error
48+
const iterator = segmenter.segment(input)[Symbol.iterator]();
49+
let seg_item = iterator.next();
50+
while (! seg_item.done) {
51+
segmented_result.push(seg_item.value.segment);
52+
seg_item = iterator.next();
53+
}
54+
} catch (error) {
55+
return_json['unsupported'] = 'SEGMENTER UNKNOWN ERROR: ' + error.message;
56+
}
57+
return_json['result'] = segmented_result;
58+
return return_json;
59+
60+
61+
}
62+
}

run_config.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,8 @@
173173
"lang_names",
174174
"likely_subtags",
175175
"rdt_fmt",
176-
"plural_rules"
176+
"plural_rules",
177+
"segmenter"
177178
],
178179
"per_execution": 10000
179180
}

schema/schema_files.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
'message_fmt2',
99
'number_format',
1010
'plural_rules',
11-
'rdt_fmt'
11+
'rdt_fmt',
12+
'segmenter'
1213
]
1314

1415
TEST_FILE_TO_TEST_TYPE_MAP = {
@@ -26,7 +27,8 @@
2627
'plural_rules_test_file': 'plural_rules',
2728
'plural_rules_test': 'plural_rules',
2829
'rdt_fmt_test_file': 'rdt_fmt',
29-
'rdt_fmt_test': 'rdt_fmt'
30+
'rdt_fmt_test': 'rdt_fmt',
31+
'segmenter_test': 'segmenter'
3032
}
3133

3234
SCHEMA_FILE_MAP = {
@@ -107,6 +109,7 @@
107109
"prod_file": "lang_name_test_file.json"
108110
}
109111
},
112+
110113
"likely_subtags": {
111114
"test_data": {
112115
"schema_file": "likely_subtags/test_schema.json",
@@ -121,6 +124,7 @@
121124
"prod_file": "likely_subtags_test.json"
122125
}
123126
},
127+
124128
"list_fmt": {
125129
"test_data": {
126130
"schema_file": "list_fmt/test_schema.json",
@@ -135,6 +139,7 @@
135139
"prod_file": "list_fmt_test.json"
136140
}
137141
},
142+
138143
"plural_rules": {
139144
"test_data": {
140145
"schema_file": "plural_rules/test_schema.json",
@@ -149,6 +154,7 @@
149154
"prod_file": "plural_rules_test.json"
150155
}
151156
},
157+
152158
"message_fmt2": {
153159
"test_data": {
154160
"schema_file": "message_fmt2/test_schema.json",
@@ -163,6 +169,7 @@
163169
'prod_file': 'message_fmt2_test.json'
164170
}
165171
},
172+
166173
"rdt_fmt": {
167174
"test_data": {
168175
"schema_file": "rdt_fmt/test_schema.json",
@@ -177,6 +184,21 @@
177184
"prod_file": "rdt_fmt_test.json"
178185
}
179186
},
187+
188+
"segmenter": {
189+
"test_data": {
190+
"schema_file": "segmenter/test_schema.json",
191+
'prod_file': 'segmenter_test.json'
192+
},
193+
"verify_data": {
194+
"schema_file": "segmenter/verify_schema.json",
195+
'prod_file': 'segmenter2_verify.json'
196+
},
197+
"result_data": {
198+
"schema_file": "segmenter/result_schema.json",
199+
"prod_file": "segmenter_test.json"
200+
}
201+
},
180202
# Additional tests
181203

182204
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
{"$schema": "https://json-schema.org/draft/2020-12/schema",
2+
"$id": "https://github.com/unicode/conformance/segmenter_verify_schema.json",
3+
"title": "ICU Segmenter verify data description",
4+
"description": "This documents the format of verify data for a segmenter",
5+
"type": "object",
6+
"properties": {
7+
"additionalProperties": false,
8+
"test_type": {
9+
"description": "The name of the test",
10+
"const": "segmenter"
11+
},
12+
"tests": {
13+
"type": "array",
14+
"items": {
15+
"type": "object",
16+
"additionalProperties": true,
17+
"properties": {
18+
"label": {
19+
"description": "A numeric ID, unique for the set of tests",
20+
"type": "string"
21+
},
22+
"verify": {
23+
"description": "The expected value for the test result",
24+
"type": "string"
25+
},
26+
"result": {
27+
"description": "The actual result found",
28+
"type": "array",
29+
"items": {
30+
"type": "string"
31+
}
32+
33+
},
34+
"line" : {
35+
"description": "line of the source of test data",
36+
"type": "integer"
37+
},
38+
"error": {
39+
"description": "What was unexpected",
40+
"type": "string"
41+
},
42+
"error_message": {
43+
"description": "More about the error",
44+
"type": "string"
45+
},
46+
"actual_options": {
47+
"description": "What was sent to the segmenter function",
48+
"type": "string"
49+
},
50+
"input_data": {
51+
"type": "string",
52+
"description": "Information provided to the executor"
53+
},
54+
"actual_locale": {
55+
"type": "string",
56+
"description": "If present, the substitute locale actually used in the test"
57+
}
58+
}
59+
},
60+
"required": [
61+
"label",
62+
"result"
63+
]
64+
}
65+
},
66+
"required": ["tests", "test_type"]
67+
}

0 commit comments

Comments
 (0)