Make Html5libTest handle double-escaped tests

sideshowbarker · sideshowbarker · commit 4b8f36cdd553 · 2020-09-04T11:18:00.000+09:00
This change makes Html5libTest correctly handle tests in the
html5lib-tests suite which have cases with so-called “double-escaped”
“input” and “output” values — for example, values that contain the
literals “\\u0000” and “\\uFFFD” rather than “\u0000” and “\uFFFD”.
diff --git a/test-src/nu/validator/htmlparser/test/Html5libTest.java b/test-src/nu/validator/htmlparser/test/Html5libTest.java
@@ -22,8 +22,10 @@
 
 package nu.validator.htmlparser.test;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.net.URISyntaxException;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.FileVisitResult;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -52,7 +54,7 @@ public void testEncoding() throws Exception {
     public void testTokenizer() throws Exception {
         Files.walkFileTree(testDir.resolve("tokenizer"),
                 new TestVisitor(true, ".test", file -> //
-                new TokenizerTester(Files.newInputStream(file)).runTests()));
+                new TokenizerTester(getDoubleEscapedInput(file)).runTests()));
         if (TokenizerTester.exitStatus != 0) {
             assert false : "Tokenizer test failed";
         }
@@ -67,6 +69,15 @@ public void testTree() throws Exception {
         }
     }
 
+    private ByteArrayInputStream getDoubleEscapedInput(Path file)
+            throws IOException {
+        byte[] fileBytes = Files.readAllBytes(file);
+        String fileContent = new String(fileBytes, StandardCharsets.UTF_8);
+        String unescapedContent = fileContent.replace("\\\\u", "\\u");
+        byte[] newBytes = unescapedContent.getBytes(StandardCharsets.UTF_8);
+        return new ByteArrayInputStream(newBytes);
+    }
+
     private interface TestConsumer extends Consumer<Path> {
 
         @Override