Revert "Use workaround to fix many collation failures in ICU4C. (#475)" (#487)

echeran · web-flow · commit 28fd8345bcd6 · 2025-07-21T16:01:37.000-07:00
This reverts commit dfbc189.
diff --git a/executors/cpp/coll.cpp b/executors/cpp/coll.cpp
@@ -42,60 +42,10 @@ using icu::RuleBasedCollator;
 
 const char error_message[] = "error";
 
-UnicodeString get_char_from_hex_list(json_object* str_codes_obj,
-                                     int debug_level) {
-    // Get the hex codes and assemble into a string with \u
-    int input_length = json_object_array_length(str_codes_obj);
-
-    // Construct the list of Unicode Strings
-    string hex_list = "";
-    for (int i = 0; i < input_length; i++) {
-      // get the i-th object in the input list
-      json_object* item = json_object_array_get_idx(str_codes_obj, i);
-      string hex_string = json_object_get_string(item);
-      string escape_prefix;
-      switch (hex_string.size()) {
-        case 5:
-          escape_prefix = "\\U000";
-          break;
-        case 6:
-          escape_prefix = "\\U00";
-          break;
-        case 7:
-          escape_prefix = "\\U0";
-          break;
-        case 4:
-        default:
-          escape_prefix = "\\u";
-          break;
-        case 3:
-          escape_prefix = "\\u0";
-          break;
-        case 2:
-          escape_prefix = "\\u00";
-          break;
-        case 1:
-          escape_prefix = "\\u000";
-          break;
-      }
-      hex_list += escape_prefix + hex_string;
-    }
-    // Finally, unescape this list.
-    UnicodeString u_hex = UnicodeString::fromUTF8(hex_list);
-    UnicodeString s_new = u_hex.unescape();
-    if (debug_level > 0) {
-      string target;
-      s_new.toUTF8String(target);
-      cout << "# hex_list: " << hex_list << " == >" << target << "<" << endl;
-    }
-
-    return s_new;
-}
-
 /**
  * TestCollator  --  process JSON inputs, run comparator, return result
  */
-auto TestCollator(json_object *json_in, int debug_level) -> string {
+auto TestCollator(json_object *json_in) -> string {
   UErrorCode status = U_ZERO_ERROR;
 
   json_object *label_obj = json_object_object_get(json_in, "label");
@@ -112,17 +62,6 @@ auto TestCollator(json_object *json_in, int debug_level) -> string {
   UnicodeString us1 = UnicodeString::fromUTF8(string1);
   UnicodeString us2 = UnicodeString::fromUTF8(string2);
 
-  json_object *str1_codes_obj = json_object_object_get(json_in, "s1_codes");
-  json_object *str2_codes_obj = json_object_object_get(json_in, "s2_codes");
-
-  // Use the hex codes if they are provided rather than s1 and s2.
-  if (str1_codes_obj) {
-    us1 = get_char_from_hex_list(str1_codes_obj, debug_level);
-  }
-  if (str2_codes_obj) {
-    us2 = get_char_from_hex_list(str2_codes_obj, debug_level);
-  }
-
   string test_result;
   int uni_result_utf8;
 
diff --git a/executors/cpp/main.cpp b/executors/cpp/main.cpp
@@ -39,8 +39,7 @@ using std::endl;
 using std::string;
 
 // Test functions
-extern auto TestCollator(json_object *json_in,
-                         int debug_level) -> const string;
+extern auto TestCollator(json_object *json_in) -> const string;
 extern auto TestDatetimeFmt(json_object *json_in) -> const string;
 extern auto TestLocaleDisplayNames(json_object *json_in) -> const string;
 extern auto TestLikelySubtags(json_object *json_in) -> const string;
@@ -63,8 +62,6 @@ extern auto TestRelativeDateTimeFmt(json_object *json_in) -> const string;
  *            test data is JSON format
  */
 auto main(int argc, const char** argv) -> int {
-  int debug_level = 0;
-
   // All the currently supported test types.
   std::vector <string> supported_tests;
   supported_tests = {
@@ -79,15 +76,6 @@ auto main(int argc, const char** argv) -> int {
     "segmenter"
   };
 
-  if (argc > 1) {
-    for (int i = 1; i < argc; i++) {
-      string arg_string = argv[i];
-      if (arg_string == "DEBUG") {
-        debug_level += 1;
-      }
-    }
-  }
-
   for (std::string line; std::getline(cin, line);) {
     if (line == "#EXIT") {
       return 0;
@@ -127,7 +115,7 @@ auto main(int argc, const char** argv) -> int {
       std::string test_type = json_object_get_string(test_type_obj);
 
       if (test_type == "collation") {
-        outputLine = TestCollator(json_input, debug_level);
+        outputLine = TestCollator(json_input);
       } else if (test_type == "datetime_fmt") {
          outputLine = TestDatetimeFmt(json_input);
 #if U_ICU_VERSION_MAJOR_NUM >= 75
diff --git a/schema/collation/test_schema.json b/schema/collation/test_schema.json
@@ -40,22 +40,6 @@
             "description": "Second string for comparison",
             "type": "string"
           },
-          "s1_codes": {
-            "type": "array",
-            "description": "List hex values for characters of the string",
-            "items": {
-              "type": "string",
-              "description": "hex values in string form of the Unicode value"
-            }
-          },
-          "s2_codes": {
-            "type": "array",
-            "description": "List hex values for characters of the string",
-            "items": {
-              "type": "string",
-              "description": "hex values in string form of the Unicode value"
-            }
-          },
           "locale": {
             "description": "optional field indication locale tag for running test",
             "type": "string"
diff --git a/testgen/generators/base.py b/testgen/generators/base.py
@@ -10,7 +10,6 @@
 import os
 import requests
 
-
 def remove_none(obj):
     # Recursively removes any parts with None as value
     if isinstance(obj, str):
@@ -29,7 +28,6 @@ def remove_none(obj):
             result[i] = remove_none(value)
     return result
 
-
 class DataGenerator(ABC):
     def __init__(self, icu_version, run_limit=None):
         self.icu_version = icu_version
@@ -42,14 +40,15 @@ def __init__(self, icu_version, run_limit=None):
     def process_test_data(self):
         pass
 
+
     def generateTestHashValues(self, testdata):
         # For each test item, copy it. Omit 'label' from that copy.
         # Create the string representation of that copy with json.dumps()
         # Then make  a hex hash value for that string.
         # Add it to that item.
 
         try:
-            all_tests = testdata['tests']
+            all_tests =  testdata['tests']
         except BaseException as error:
             logging.error('# generateTestHashValues: %s does not have "tests": %s',
                           error, testdata.keys())
@@ -59,7 +58,7 @@ def generateTestHashValues(self, testdata):
             try:
                 test_no_label = test.copy()
             except BaseException as error:
-                logging.error('error: %s, Item with no label found here: %s, %s',
+                logging.error('error: %s, Item with no label found here: %s, %s' ,
                               error, testdata['test_type'], test)
                 continue
             del test_no_label['label']
diff --git a/testgen/generators/collation.py b/testgen/generators/collation.py
@@ -437,33 +437,26 @@ def generateCollTestDataObjects(self, filename, icu_version, ignorePunctuation,
             # It's a data line. Include in testing.
             if not prev:
                 # Just getting started.
-                prev, prev_codepoints = self.parseCollTestData(item)
+                prev = self.parseCollTestData(item)
                 continue
 
             # Get the code points for each test
-            try:
-                next, next_codepoints = self.parseCollTestData(item)
-            except BaseException as e:
-                pass
+            next = self.parseCollTestData(item)
 
             if not next:
                 # This is a problem with the data input. D80[0-F] is the high surrogate
                 data_errors.append([index, item])
                 continue
 
             label = str(count).rjust(max_digits, "0")
-            new_test = {"label": label, "s1": prev, "s2": next, "strength": "identical", "line": line_number,
-                        "source_file": filename,
-                        "s1_codes": prev_codepoints,
-                        "s2_codes": next_codepoints}
+            new_test = {"label": label, "s1": prev, "s2": next, "strength": "identical", "line": line_number, "source_file": filename}
             if ignorePunctuation:
                 new_test["ignorePunctuation"] = True
             test_list.append(new_test)
 
             verify_list.append({"label": label, "verify": True})
 
             prev = next  # set up for next pair
-            prev_codepoints = next_codepoints
             count += 1
             index += 1
 
@@ -483,21 +476,16 @@ def parseCollTestData(self, testdata):
         recodepoint = re.compile(r"[0-9a-fA-F]{4,6}")
 
         return_list = []
-        code_text = []
         codepoints = recodepoint.findall(testdata)
         for code in codepoints:
-            code_text.append(code)
             num_code = int(code, 16)
             if num_code >= 0xD800 and num_code <= 0xDFFF:
-                return None, None
+                return None
             return_list.append(self.stringifyCode(num_code))
-        return "".join(return_list), code_text
+        return "".join(return_list)
 
     def stringifyCode(self, cp):
         # Just include character and escaping will work in JSONification
-        if cp >= 0x10000:
-            # This is an SMP character. Do we handle it differently?
-            pass
         try:
             teststring = chr(cp)
         except ValueError as err: