Use workaround to fix many collation failures in ICU4C. (#475)

Craig Cornelius · web-flow · commit dfbc189b0b8f · 2025-07-16T15:04:54.000-07:00
* Use workaround to fix many collation failures in ICU4C.

* Fix schema types

* Another fix to schema

* Another fix
diff --git a/executors/cpp/coll.cpp b/executors/cpp/coll.cpp
@@ -42,10 +42,60 @@ using icu::RuleBasedCollator;
 
 const char error_message[] = "error";
 
+UnicodeString get_char_from_hex_list(json_object* str_codes_obj,
+                                     int debug_level) {
+  // Builds a UnicodeString from a JSON array of hex code point strings,
+  // e.g. ["0061", "1F600"], by turning each entry into a 4-digit or an
+  // 8-digit Unicode escape and unescaping the concatenated escapes.
+  //
+  // str_codes_obj: JSON array whose items are hex strings of 1-8 digits.
+  // debug_level:   when > 0, the escaped and the resulting text are printed.
+  // Returns the assembled string. Null items and items that are empty or
+  // longer than 8 digits cannot form an escape and are skipped.
+  // NOTE(review): the array type of str_codes_obj is not checked here;
+  // presumably the schema guarantees it -- confirm against the caller.
+  // NOTE(review): ICU documents unescape() as returning an empty string
+  // for an ill-formed escape, so a non-hex entry empties the whole result.
+  const int input_length = json_object_array_length(str_codes_obj);
+
+  string hex_list;
+  for (int i = 0; i < input_length; i++) {
+    json_object* item = json_object_array_get_idx(str_codes_obj, i);
+    // json_object_get_string() yields NULL for a null item; building a
+    // std::string from NULL is undefined behavior, so check first.
+    const char* hex_chars = json_object_get_string(item);
+    const string hex_string = (hex_chars != nullptr) ? hex_chars : "";
+    const size_t digits = hex_string.size();
+    if (digits == 0 || digits > 8) {
+      if (debug_level > 0) {
+        cout << "# skipping bad hex code >" << hex_string << "<" << endl;
+      }
+      continue;
+    }
+    // The short escape takes exactly 4 hex digits and the long one exactly
+    // 8, so zero-pad the entry up to the nearest of those two widths.
+    const size_t width = (digits <= 4) ? 4 : 8;
+    hex_list += (width == 4) ? "\\u" : "\\U";
+    hex_list.append(width - digits, '0');
+    hex_list += hex_string;
+  }
+
+  // Finally, unescape this list.
+  UnicodeString u_hex = UnicodeString::fromUTF8(hex_list);
+  UnicodeString s_new = u_hex.unescape();
+  if (debug_level > 0) {
+    string target;
+    s_new.toUTF8String(target);
+    cout << "# hex_list: " << hex_list << " == >" << target << "<" << endl;
+  }
+
+  return s_new;
+}
+
 /**
  * TestCollator  --  process JSON inputs, run comparator, return result
  */
-auto TestCollator(json_object *json_in) -> string {
+auto TestCollator(json_object *json_in, int debug_level) -> string {
   UErrorCode status = U_ZERO_ERROR;
 
   json_object *label_obj = json_object_object_get(json_in, "label");
@@ -62,6 +112,17 @@ auto TestCollator(json_object *json_in) -> string {
   UnicodeString us1 = UnicodeString::fromUTF8(string1);
   UnicodeString us2 = UnicodeString::fromUTF8(string2);
 
+  json_object *str1_codes_obj = json_object_object_get(json_in, "s1_codes");
+  json_object *str2_codes_obj = json_object_object_get(json_in, "s2_codes");
+
+  // Use the hex codes if they are provided rather than s1 and s2.
+  if (str1_codes_obj) {
+    us1 = get_char_from_hex_list(str1_codes_obj, debug_level);
+  }
+  if (str2_codes_obj) {
+    us2 = get_char_from_hex_list(str2_codes_obj, debug_level);
+  }
+
   string test_result;
   int uni_result_utf8;
 
diff --git a/executors/cpp/main.cpp b/executors/cpp/main.cpp
@@ -39,7 +39,8 @@ using std::endl;
 using std::string;
 
 // Test functions
-extern auto TestCollator(json_object *json_in) -> const string;
+extern auto TestCollator(json_object *json_in,
+                         int debug_level) -> const string;
 extern auto TestDatetimeFmt(json_object *json_in) -> const string;
 extern auto TestLocaleDisplayNames(json_object *json_in) -> const string;
 extern auto TestLikelySubtags(json_object *json_in) -> const string;
@@ -62,6 +63,8 @@ extern auto TestRelativeDateTimeFmt(json_object *json_in) -> const string;
  *            test data is JSON format
  */
 auto main(int argc, const char** argv) -> int {
+  int debug_level = 0;
+
   // All the currently supported test types.
   std::vector <string> supported_tests;
   supported_tests = {
@@ -76,6 +79,15 @@ auto main(int argc, const char** argv) -> int {
     "segmenter"
   };
 
+  if (argc > 1) {
+    for (int i = 1; i < argc; i++) {
+      string arg_string = argv[i];
+      if (arg_string == "DEBUG") {
+        debug_level += 1;
+      }
+    }
+  }
+
   for (std::string line; std::getline(cin, line);) {
     if (line == "#EXIT") {
       return 0;
@@ -115,7 +127,7 @@ auto main(int argc, const char** argv) -> int {
       std::string test_type = json_object_get_string(test_type_obj);
 
       if (test_type == "collation") {
-        outputLine = TestCollator(json_input);
+        outputLine = TestCollator(json_input, debug_level);
       } else if (test_type == "datetime_fmt") {
          outputLine = TestDatetimeFmt(json_input);
 #if U_ICU_VERSION_MAJOR_NUM >= 75
diff --git a/schema/collation/test_schema.json b/schema/collation/test_schema.json
@@ -40,6 +40,22 @@
             "description": "Second string for comparison",
             "type": "string"
           },
+          "s1_codes": {
+            "type": "array",
+            "description": "List of hex code point values, one per character of the first string (s1)",
+            "items": {
+              "type": "string",
+              "description": "hex values in string form of the Unicode value"
+            }
+          },
+          "s2_codes": {
+            "type": "array",
+            "description": "List of hex code point values, one per character of the second string (s2)",
+            "items": {
+              "type": "string",
+              "description": "hex values in string form of the Unicode value"
+            }
+          },
           "locale": {
             "description": "optional field indication locale tag for running test",
             "type": "string"
diff --git a/testgen/generators/base.py b/testgen/generators/base.py
@@ -10,6 +10,7 @@
 import os
 import requests
 
+
 def remove_none(obj):
     # Recursively removes any parts with None as value
     if isinstance(obj, str):
@@ -28,6 +29,7 @@ def remove_none(obj):
             result[i] = remove_none(value)
     return result
 
+
 class DataGenerator(ABC):
     def __init__(self, icu_version, run_limit=None):
         self.icu_version = icu_version
@@ -40,15 +42,14 @@ def __init__(self, icu_version, run_limit=None):
     def process_test_data(self):
         pass
 
-
     def generateTestHashValues(self, testdata):
         # For each test item, copy it. Omit 'label' from that copy.
         # Create the string representation of that copy with json.dumps()
         # Then make  a hex hash value for that string.
         # Add it to that item.
 
         try:
-            all_tests =  testdata['tests']
+            all_tests = testdata['tests']
         except BaseException as error:
             logging.error('# generateTestHashValues: %s does not have "tests": %s',
                           error, testdata.keys())
@@ -58,7 +59,7 @@ def generateTestHashValues(self, testdata):
             try:
                 test_no_label = test.copy()
             except BaseException as error:
-                logging.error('error: %s, Item with no label found here: %s, %s' ,
+                logging.error('error: %s, Item with no label found here: %s, %s',
                               error, testdata['test_type'], test)
                 continue
             del test_no_label['label']
diff --git a/testgen/generators/collation.py b/testgen/generators/collation.py
@@ -437,26 +437,33 @@ def generateCollTestDataObjects(self, filename, icu_version, ignorePunctuation,
             # It's a data line. Include in testing.
             if not prev:
                 # Just getting started.
-                prev = self.parseCollTestData(item)
+                prev, prev_codepoints = self.parseCollTestData(item)
                 continue
 
             # Get the code points for each test
-            next = self.parseCollTestData(item)
+            try:
+                next, next_codepoints = self.parseCollTestData(item)
+            except BaseException as e:
+                pass
 
             if not next:
                 # This is a problem with the data input. D80[0-F] is the high surrogate
                 data_errors.append([index, item])
                 continue
 
             label = str(count).rjust(max_digits, "0")
-            new_test = {"label": label, "s1": prev, "s2": next, "strength": "identical", "line": line_number, "source_file": filename}
+            new_test = {"label": label, "s1": prev, "s2": next, "strength": "identical", "line": line_number,
+                        "source_file": filename,
+                        "s1_codes": prev_codepoints,
+                        "s2_codes": next_codepoints}
             if ignorePunctuation:
                 new_test["ignorePunctuation"] = True
             test_list.append(new_test)
 
             verify_list.append({"label": label, "verify": True})
 
             prev = next  # set up for next pair
+            prev_codepoints = next_codepoints
             count += 1
             index += 1
 
@@ -476,16 +483,21 @@ def parseCollTestData(self, testdata):
         recodepoint = re.compile(r"[0-9a-fA-F]{4,6}")
 
         return_list = []
+        code_text = []
         codepoints = recodepoint.findall(testdata)
         for code in codepoints:
+            code_text.append(code)
             num_code = int(code, 16)
             if num_code >= 0xD800 and num_code <= 0xDFFF:
-                return None
+                return None, None
             return_list.append(self.stringifyCode(num_code))
-        return "".join(return_list)
+        return "".join(return_list), code_text
 
     def stringifyCode(self, cp):
         # Just include character and escaping will work in JSONification
+        if cp >= 0x10000:
+            # This is a supplementary-plane character (above the BMP). Do we handle it differently?
+            pass
         try:
             teststring = chr(cp)
         except ValueError as err: