minor #414 Support escaped unicode character sequences in idna test files (TRowbotham)

nicolas-grekas · nicolas-grekas · commit 1b5d247d6dec · 2022-11-03T12:22:23.000+01:00
This PR was merged into the 1.26-dev branch. Discussion ---------- Support escaped unicode character sequences in idna test files The IDNA [test file](https://www.unicode.org/Public/idna/15.0.0/IdnaTestV2.txt) for Unicode 15.0.0 has started using escaped unicode character sequences. The [spec](https://www.unicode.org/reports/tr46/#Format) says that they can be in the form of either \uXXXX or \x{XXXX}. Additionally, remove a now irrelevant conditional that worked around a bug in previous test files. Commits ------- ca44f85 Support escaped unicode character sequences
diff --git a/tests/Intl/Idn/IdnTest.php b/tests/Intl/Idn/IdnTest.php
@@ -81,7 +81,13 @@ public function getData()
             }
 
             [$line] = explode('#', $line);
-            [$source, $toUnicode, $toUnicodeStatus, $toAsciiN, $toAsciiNStatus, $toAsciiT, $toAsciiTStatus] = array_map('trim', explode(';', $line));
+            [$source, $toUnicode, $toUnicodeStatus, $toAsciiN, $toAsciiNStatus, $toAsciiT, $toAsciiTStatus] = preg_replace_callback(
+                '/\\\\(?:u([[:xdigit:]]{4})|x{([[:xdigit:]]{4})})/u',
+                static function (array $matches): string {
+                    return mb_chr(hexdec('' !== $matches[1] ? $matches[1] : $matches[2]), 'utf-8'); // group 1 = \uXXXX digits; it is '' when the \x{XXXX} branch matched, whose digits are in group 2
+                },
+                array_map('trim', explode(';', $line))
+            );
 
             if ('' === $toUnicode) {
                 $toUnicode = $source;
@@ -182,16 +188,6 @@ public function testToAsciiTransitional($source, $toUnicode, $toUnicodeStatus, $
             $this->markTestSkipped('PHP Bug #72506.');
         }
 
-        // There is currently a bug in the test data, where it is expected that the following 2
-        // source strings result in an empty string. However, due to the way the test files are setup
-        // it currently isn't possible to represent an empty string as an expected value. So, we
-        // skip these 2 problem tests. I have notified the Unicode Consortium about this and they
-        // have passed the information along to the spec editors.
-        // U+200C or U+200D
-        if ("\xE2\x80\x8C" === $source || "\xE2\x80\x8D" === $source) {
-            $toAsciiT = '';
-        }
-
         if ($toAsciiTStatus === []) {
             $this->assertSame($toAsciiT, $info['result']);
             $this->assertSame(0, $info['errors'], sprintf('Expected no errors, but found %d.', $info['errors']));