Skip to content

Commit dfbc189

Browse files
author
Craig Cornelius
authored
Use workaround to fix many collation failures in ICU4C. (#475)
* Use workaround to fix many collation failures in ICU4C. * Fix schema types * Another fix to schema * Another fix
1 parent de0022c commit dfbc189

File tree

5 files changed

+113
-11
lines changed

5 files changed

+113
-11
lines changed

executors/cpp/coll.cpp

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,60 @@ using icu::RuleBasedCollator;
4242

4343
const char error_message[] = "error";
4444

45+
/**
 * get_char_from_hex_list -- build a UnicodeString from a JSON array of
 * hexadecimal code point strings, e.g. ["0041", "1F600"].
 *
 * Each entry is converted to an escape sequence: entries of up to 4 hex
 * digits use the 4-digit "\u" form, longer entries use the 8-digit "\U"
 * form. Entries are zero-padded on the left to the required width. The
 * concatenated escapes are then resolved with UnicodeString::unescape().
 *
 * Entries that are JSON null, or whose string value is empty, are skipped
 * so that one bad element does not corrupt the escapes around it.
 * The digits themselves are not validated here; entries longer than 8
 * characters are passed through unpadded.
 *
 * @param str_codes_obj  JSON array whose items are hex strings.
 * @param debug_level    When greater than 0, the assembled escape text and
 *                       the resulting string are printed to stdout.
 * @return               The string produced by unescaping the list.
 */
UnicodeString get_char_from_hex_list(json_object* str_codes_obj,
                                     int debug_level) {
  // Get the hex codes and assemble into a string with \u
  int input_length = json_object_array_length(str_codes_obj);

  // Construct the list of escaped code points
  string hex_list;
  for (int i = 0; i < input_length; i++) {
    // get the i-th object in the input list
    json_object* item = json_object_array_get_idx(str_codes_obj, i);

    // json_object_get_string() may hand back a null pointer (e.g. for a
    // JSON null element). Constructing a std::string from a null pointer
    // is undefined behaviour, so check before converting.
    const char* raw_hex = json_object_get_string(item);
    if (raw_hex == nullptr || *raw_hex == '\0') {
      continue;
    }
    const string hex_string = raw_hex;

    // "\u" takes exactly 4 hex digits, "\U" takes exactly 8.
    const bool fits_short_form = hex_string.size() <= 4;
    const string::size_type width = fits_short_form ? 4 : 8;

    hex_list += fits_short_form ? "\\u" : "\\U";
    if (hex_string.size() < width) {
      // Left-pad with zeros up to the width the escape form requires.
      hex_list.append(width - hex_string.size(), '0');
    }
    hex_list += hex_string;
  }

  // Finally, unescape this list.
  UnicodeString u_hex = UnicodeString::fromUTF8(hex_list);
  UnicodeString s_new = u_hex.unescape();
  if (debug_level > 0) {
    string target;
    s_new.toUTF8String(target);
    cout << "# hex_list: " << hex_list << " == >" << target << "<" << endl;
  }

  return s_new;
}
94+
4595
/**
4696
* TestCollator -- process JSON inputs, run comparator, return result
4797
*/
48-
auto TestCollator(json_object *json_in) -> string {
98+
auto TestCollator(json_object *json_in, int debug_level) -> string {
4999
UErrorCode status = U_ZERO_ERROR;
50100

51101
json_object *label_obj = json_object_object_get(json_in, "label");
@@ -62,6 +112,17 @@ auto TestCollator(json_object *json_in) -> string {
62112
UnicodeString us1 = UnicodeString::fromUTF8(string1);
63113
UnicodeString us2 = UnicodeString::fromUTF8(string2);
64114

115+
json_object *str1_codes_obj = json_object_object_get(json_in, "s1_codes");
116+
json_object *str2_codes_obj = json_object_object_get(json_in, "s2_codes");
117+
118+
// Use the hex codes if they are provided rather than s1 and s2.
119+
if (str1_codes_obj) {
120+
us1 = get_char_from_hex_list(str1_codes_obj, debug_level);
121+
}
122+
if (str2_codes_obj) {
123+
us2 = get_char_from_hex_list(str2_codes_obj, debug_level);
124+
}
125+
65126
string test_result;
66127
int uni_result_utf8;
67128

executors/cpp/main.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ using std::endl;
3939
using std::string;
4040

4141
// Test functions
42-
extern auto TestCollator(json_object *json_in) -> const string;
42+
extern auto TestCollator(json_object *json_in,
43+
int debug_level) -> const string;
4344
extern auto TestDatetimeFmt(json_object *json_in) -> const string;
4445
extern auto TestLocaleDisplayNames(json_object *json_in) -> const string;
4546
extern auto TestLikelySubtags(json_object *json_in) -> const string;
@@ -62,6 +63,8 @@ extern auto TestRelativeDateTimeFmt(json_object *json_in) -> const string;
6263
* test data is JSON format
6364
*/
6465
auto main(int argc, const char** argv) -> int {
66+
int debug_level = 0;
67+
6568
// All the currently supported test types.
6669
std::vector <string> supported_tests;
6770
supported_tests = {
@@ -76,6 +79,15 @@ auto main(int argc, const char** argv) -> int {
7679
"segmenter"
7780
};
7881

82+
if (argc > 1) {
83+
for (int i = 1; i < argc; i++) {
84+
string arg_string = argv[i];
85+
if (arg_string == "DEBUG") {
86+
debug_level += 1;
87+
}
88+
}
89+
}
90+
7991
for (std::string line; std::getline(cin, line);) {
8092
if (line == "#EXIT") {
8193
return 0;
@@ -115,7 +127,7 @@ auto main(int argc, const char** argv) -> int {
115127
std::string test_type = json_object_get_string(test_type_obj);
116128

117129
if (test_type == "collation") {
118-
outputLine = TestCollator(json_input);
130+
outputLine = TestCollator(json_input, debug_level);
119131
} else if (test_type == "datetime_fmt") {
120132
outputLine = TestDatetimeFmt(json_input);
121133
#if U_ICU_VERSION_MAJOR_NUM >= 75

schema/collation/test_schema.json

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,22 @@
4040
"description": "Second string for comparison",
4141
"type": "string"
4242
},
43+
"s1_codes": {
44+
"type": "array",
45+
"description": "List of hex values for the characters of the first string",
46+
"items": {
47+
"type": "string",
48+
"description": "hex values in string form of the Unicode value"
49+
}
50+
},
51+
"s2_codes": {
52+
"type": "array",
53+
"description": "List of hex values for the characters of the second string",
54+
"items": {
55+
"type": "string",
56+
"description": "hex values in string form of the Unicode value"
57+
}
58+
},
4359
"locale": {
4460
"description": "optional field indicating the locale tag for running the test",
4561
"type": "string"

testgen/generators/base.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import os
1111
import requests
1212

13+
1314
def remove_none(obj):
1415
# Recursively removes any parts with None as value
1516
if isinstance(obj, str):
@@ -28,6 +29,7 @@ def remove_none(obj):
2829
result[i] = remove_none(value)
2930
return result
3031

32+
3133
class DataGenerator(ABC):
3234
def __init__(self, icu_version, run_limit=None):
3335
self.icu_version = icu_version
@@ -40,15 +42,14 @@ def __init__(self, icu_version, run_limit=None):
4042
def process_test_data(self):
4143
pass
4244

43-
4445
def generateTestHashValues(self, testdata):
4546
# For each test item, copy it. Omit 'label' from that copy.
4647
# Create the string representation of that copy with json.dumps()
4748
# Then make a hex hash value for that string.
4849
# Add it to that item.
4950

5051
try:
51-
all_tests = testdata['tests']
52+
all_tests = testdata['tests']
5253
except BaseException as error:
5354
logging.error('# generateTestHashValues: %s does not have "tests": %s',
5455
error, testdata.keys())
@@ -58,7 +59,7 @@ def generateTestHashValues(self, testdata):
5859
try:
5960
test_no_label = test.copy()
6061
except BaseException as error:
61-
logging.error('error: %s, Item with no label found here: %s, %s' ,
62+
logging.error('error: %s, Item with no label found here: %s, %s',
6263
error, testdata['test_type'], test)
6364
continue
6465
del test_no_label['label']

testgen/generators/collation.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -437,26 +437,33 @@ def generateCollTestDataObjects(self, filename, icu_version, ignorePunctuation,
437437
# It's a data line. Include in testing.
438438
if not prev:
439439
# Just getting started.
440-
prev = self.parseCollTestData(item)
440+
prev, prev_codepoints = self.parseCollTestData(item)
441441
continue
442442

443443
# Get the code points for each test
444-
next = self.parseCollTestData(item)
444+
try:
445+
next, next_codepoints = self.parseCollTestData(item)
446+
except BaseException as e:
447+
pass
445448

446449
if not next:
447450
# This is a problem with the data input. D80[0-F] is the high surrogate
448451
data_errors.append([index, item])
449452
continue
450453

451454
label = str(count).rjust(max_digits, "0")
452-
new_test = {"label": label, "s1": prev, "s2": next, "strength": "identical", "line": line_number, "source_file": filename}
455+
new_test = {"label": label, "s1": prev, "s2": next, "strength": "identical", "line": line_number,
456+
"source_file": filename,
457+
"s1_codes": prev_codepoints,
458+
"s2_codes": next_codepoints}
453459
if ignorePunctuation:
454460
new_test["ignorePunctuation"] = True
455461
test_list.append(new_test)
456462

457463
verify_list.append({"label": label, "verify": True})
458464

459465
prev = next # set up for next pair
466+
prev_codepoints = next_codepoints
460467
count += 1
461468
index += 1
462469

@@ -476,16 +483,21 @@ def parseCollTestData(self, testdata):
476483
recodepoint = re.compile(r"[0-9a-fA-F]{4,6}")
477484

478485
return_list = []
486+
code_text = []
479487
codepoints = recodepoint.findall(testdata)
480488
for code in codepoints:
489+
code_text.append(code)
481490
num_code = int(code, 16)
482491
if num_code >= 0xD800 and num_code <= 0xDFFF:
483-
return None
492+
return None, None
484493
return_list.append(self.stringifyCode(num_code))
485-
return "".join(return_list)
494+
return "".join(return_list), code_text
486495

487496
def stringifyCode(self, cp):
488497
# Just include character and escaping will work in JSONification
498+
if cp >= 0x10000:
499+
# This is an SMP character. Do we handle it differently?
500+
pass
489501
try:
490502
teststring = chr(cp)
491503
except ValueError as err:

0 commit comments

Comments
 (0)