Skip to content

Commit 28fd834

Browse files
authored
Revert "Use workaround to fix many collation failures in ICU4C. (#475)" (#487)
This reverts commit dfbc189.
1 parent 98a088d commit 28fd834

File tree

5 files changed

Lines changed: 11 additions & 113 deletions

5 files changed

Lines changed: 11 additions & 113 deletions

executors/cpp/coll.cpp

Lines changed: 1 addition & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -42,60 +42,10 @@ using icu::RuleBasedCollator;
4242

4343
const char error_message[] = "error";
4444

45-
UnicodeString get_char_from_hex_list(json_object* str_codes_obj,
46-
int debug_level) {
47-
// Get the hex codes and assemble into a string with \u
48-
int input_length = json_object_array_length(str_codes_obj);
49-
50-
// Construct the list of Unicode Strings
51-
string hex_list = "";
52-
for (int i = 0; i < input_length; i++) {
53-
// get the i-th object in the input list
54-
json_object* item = json_object_array_get_idx(str_codes_obj, i);
55-
string hex_string = json_object_get_string(item);
56-
string escape_prefix;
57-
switch (hex_string.size()) {
58-
case 5:
59-
escape_prefix = "\\U000";
60-
break;
61-
case 6:
62-
escape_prefix = "\\U00";
63-
break;
64-
case 7:
65-
escape_prefix = "\\U0";
66-
break;
67-
case 4:
68-
default:
69-
escape_prefix = "\\u";
70-
break;
71-
case 3:
72-
escape_prefix = "\\u0";
73-
break;
74-
case 2:
75-
escape_prefix = "\\u00";
76-
break;
77-
case 1:
78-
escape_prefix = "\\u000";
79-
break;
80-
}
81-
hex_list += escape_prefix + hex_string;
82-
}
83-
// Finally, unescape this list.
84-
UnicodeString u_hex = UnicodeString::fromUTF8(hex_list);
85-
UnicodeString s_new = u_hex.unescape();
86-
if (debug_level > 0) {
87-
string target;
88-
s_new.toUTF8String(target);
89-
cout << "# hex_list: " << hex_list << " == >" << target << "<" << endl;
90-
}
91-
92-
return s_new;
93-
}
94-
9545
/**
9646
* TestCollator -- process JSON inputs, run comparator, return result
9747
*/
98-
auto TestCollator(json_object *json_in, int debug_level) -> string {
48+
auto TestCollator(json_object *json_in) -> string {
9949
UErrorCode status = U_ZERO_ERROR;
10050

10151
json_object *label_obj = json_object_object_get(json_in, "label");
@@ -112,17 +62,6 @@ auto TestCollator(json_object *json_in, int debug_level) -> string {
11262
UnicodeString us1 = UnicodeString::fromUTF8(string1);
11363
UnicodeString us2 = UnicodeString::fromUTF8(string2);
11464

115-
json_object *str1_codes_obj = json_object_object_get(json_in, "s1_codes");
116-
json_object *str2_codes_obj = json_object_object_get(json_in, "s2_codes");
117-
118-
// Use the hex codes if they are provided rather than s1 and s2.
119-
if (str1_codes_obj) {
120-
us1 = get_char_from_hex_list(str1_codes_obj, debug_level);
121-
}
122-
if (str2_codes_obj) {
123-
us2 = get_char_from_hex_list(str2_codes_obj, debug_level);
124-
}
125-
12665
string test_result;
12766
int uni_result_utf8;
12867

executors/cpp/main.cpp

Lines changed: 2 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -39,8 +39,7 @@ using std::endl;
3939
using std::string;
4040

4141
// Test functions
42-
extern auto TestCollator(json_object *json_in,
43-
int debug_level) -> const string;
42+
extern auto TestCollator(json_object *json_in) -> const string;
4443
extern auto TestDatetimeFmt(json_object *json_in) -> const string;
4544
extern auto TestLocaleDisplayNames(json_object *json_in) -> const string;
4645
extern auto TestLikelySubtags(json_object *json_in) -> const string;
@@ -63,8 +62,6 @@ extern auto TestRelativeDateTimeFmt(json_object *json_in) -> const string;
6362
* test data is JSON format
6463
*/
6564
auto main(int argc, const char** argv) -> int {
66-
int debug_level = 0;
67-
6865
// All the currently supported test types.
6966
std::vector <string> supported_tests;
7067
supported_tests = {
@@ -79,15 +76,6 @@ auto main(int argc, const char** argv) -> int {
7976
"segmenter"
8077
};
8178

82-
if (argc > 1) {
83-
for (int i = 1; i < argc; i++) {
84-
string arg_string = argv[i];
85-
if (arg_string == "DEBUG") {
86-
debug_level += 1;
87-
}
88-
}
89-
}
90-
9179
for (std::string line; std::getline(cin, line);) {
9280
if (line == "#EXIT") {
9381
return 0;
@@ -127,7 +115,7 @@ auto main(int argc, const char** argv) -> int {
127115
std::string test_type = json_object_get_string(test_type_obj);
128116

129117
if (test_type == "collation") {
130-
outputLine = TestCollator(json_input, debug_level);
118+
outputLine = TestCollator(json_input);
131119
} else if (test_type == "datetime_fmt") {
132120
outputLine = TestDatetimeFmt(json_input);
133121
#if U_ICU_VERSION_MAJOR_NUM >= 75

schema/collation/test_schema.json

Lines changed: 0 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -40,22 +40,6 @@
4040
"description": "Second string for comparison",
4141
"type": "string"
4242
},
43-
"s1_codes": {
44-
"type": "array",
45-
"description": "List hex values for characters of the string",
46-
"items": {
47-
"type": "string",
48-
"description": "hex values in string form of the Unicode value"
49-
}
50-
},
51-
"s2_codes": {
52-
"type": "array",
53-
"description": "List hex values for characters of the string",
54-
"items": {
55-
"type": "string",
56-
"description": "hex values in string form of the Unicode value"
57-
}
58-
},
5943
"locale": {
6044
"description": "optional field indication locale tag for running test",
6145
"type": "string"

testgen/generators/base.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,6 @@
1010
import os
1111
import requests
1212

13-
1413
def remove_none(obj):
1514
# Recursively removes any parts with None as value
1615
if isinstance(obj, str):
@@ -29,7 +28,6 @@ def remove_none(obj):
2928
result[i] = remove_none(value)
3029
return result
3130

32-
3331
class DataGenerator(ABC):
3432
def __init__(self, icu_version, run_limit=None):
3533
self.icu_version = icu_version
@@ -42,14 +40,15 @@ def __init__(self, icu_version, run_limit=None):
4240
def process_test_data(self):
4341
pass
4442

43+
4544
def generateTestHashValues(self, testdata):
4645
# For each test item, copy it. Omit 'label' from that copy.
4746
# Create the string representation of that copy with json.dumps()
4847
# Then make a hex hash value for that string.
4948
# Add it to that item.
5049

5150
try:
52-
all_tests = testdata['tests']
51+
all_tests = testdata['tests']
5352
except BaseException as error:
5453
logging.error('# generateTestHashValues: %s does not have "tests": %s',
5554
error, testdata.keys())
@@ -59,7 +58,7 @@ def generateTestHashValues(self, testdata):
5958
try:
6059
test_no_label = test.copy()
6160
except BaseException as error:
62-
logging.error('error: %s, Item with no label found here: %s, %s',
61+
logging.error('error: %s, Item with no label found here: %s, %s' ,
6362
error, testdata['test_type'], test)
6463
continue
6564
del test_no_label['label']

testgen/generators/collation.py

Lines changed: 5 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -437,33 +437,26 @@ def generateCollTestDataObjects(self, filename, icu_version, ignorePunctuation,
437437
# It's a data line. Include in testing.
438438
if not prev:
439439
# Just getting started.
440-
prev, prev_codepoints = self.parseCollTestData(item)
440+
prev = self.parseCollTestData(item)
441441
continue
442442

443443
# Get the code points for each test
444-
try:
445-
next, next_codepoints = self.parseCollTestData(item)
446-
except BaseException as e:
447-
pass
444+
next = self.parseCollTestData(item)
448445

449446
if not next:
450447
# This is a problem with the data input. D80[0-F] is the high surrogate
451448
data_errors.append([index, item])
452449
continue
453450

454451
label = str(count).rjust(max_digits, "0")
455-
new_test = {"label": label, "s1": prev, "s2": next, "strength": "identical", "line": line_number,
456-
"source_file": filename,
457-
"s1_codes": prev_codepoints,
458-
"s2_codes": next_codepoints}
452+
new_test = {"label": label, "s1": prev, "s2": next, "strength": "identical", "line": line_number, "source_file": filename}
459453
if ignorePunctuation:
460454
new_test["ignorePunctuation"] = True
461455
test_list.append(new_test)
462456

463457
verify_list.append({"label": label, "verify": True})
464458

465459
prev = next # set up for next pair
466-
prev_codepoints = next_codepoints
467460
count += 1
468461
index += 1
469462

@@ -483,21 +476,16 @@ def parseCollTestData(self, testdata):
483476
recodepoint = re.compile(r"[0-9a-fA-F]{4,6}")
484477

485478
return_list = []
486-
code_text = []
487479
codepoints = recodepoint.findall(testdata)
488480
for code in codepoints:
489-
code_text.append(code)
490481
num_code = int(code, 16)
491482
if num_code >= 0xD800 and num_code <= 0xDFFF:
492-
return None, None
483+
return None
493484
return_list.append(self.stringifyCode(num_code))
494-
return "".join(return_list), code_text
485+
return "".join(return_list)
495486

496487
def stringifyCode(self, cp):
497488
# Just include character and escaping will work in JSONification
498-
if cp >= 0x10000:
499-
# This is an SMP character. Do we handle it differently?
500-
pass
501489
try:
502490
teststring = chr(cp)
503491
except ValueError as err:

0 commit comments

Comments
 (0)