Added an instance method Parser#unescape, to enable error tracking (#2396)

jhy · web-flow · commit e8d5b97979a3 · 2025-09-08T11:26:48.000+10:00
diff --git a/CHANGES.md b/CHANGES.md
@@ -2,6 +2,10 @@
 
 ## 1.22.1 (PENDING)
 
+### Improvements
+* Added a non-static `Parser#unescape(String, boolean)` method to unescape entities with the ability to track errors. [#2396](https://github.com/jhy/jsoup/pull/2396)
+* Added an instance method `Parser#unescape(String, boolean)` that unescapes HTML entities using the parser’s configuration (e.g. to support error tracking), complementing the existing static utility `Parser.unescapeEntities(String, boolean)`. [#2396](https://github.com/jhy/jsoup/pull/2396)
+
 ### Bug Fixes
 * Previously cached child Elements of an Element were not correctly invalidated in `Node#replaceWith(Node)`, which could lead to incorrect results when subsequently calling `Element#children()`. [#2391](https://github.com/jhy/jsoup/issues/2391)
 * Attribute selector values are now compared literally without trimming. Previously, jsoup trimmed whitespace from selector values and from element attribute values, which could cause mismatches with browser behavior (e.g. `[attr=" foo "]`). Now matches align with the CSS specification and browser engines. [#2380](https://github.com/jhy/jsoup/issues/2380)
diff --git a/src/main/java/org/jsoup/parser/Parser.java b/src/main/java/org/jsoup/parser/Parser.java
@@ -300,17 +300,36 @@ public static Document parseBodyFragment(String bodyHtml, String baseUri) {
     }
 
     /**
-     * Utility method to unescape HTML entities from a string
-     * @param string HTML escaped string
-     * @param inAttribute if the string is to be escaped in strict mode (as attributes are)
-     * @return an unescaped string
+     Utility method to unescape HTML entities from a string, via a parser obtained from {@link #htmlParser()}.
+     <p>To track errors while unescaping, use
+     {@link #unescape(String, boolean)} with a Parser instance that has error tracking enabled.</p>
+
+     @param string HTML escaped string (validated as not null)
+     @param inAttribute if the string is to be unescaped in strict mode (as attributes are)
+     @return an unescaped string; the input itself is returned when it contains no {@code &}
+     @see #unescape(String, boolean)
      */
     public static String unescapeEntities(String string, boolean inAttribute) {
         Validate.notNull(string);
         if (string.indexOf('&') < 0) return string; // nothing to unescape
-        Parser parser = Parser.htmlParser();
-        parser.treeBuilder.initialiseParse(new StringReader(string), "", parser);
-        Tokeniser tokeniser = new Tokeniser(parser.treeBuilder);
+        return Parser.htmlParser().unescape(string, inAttribute);
+    }
+
+    /**
+     Unescapes HTML entities from a string, using this {@code Parser}'s configuration (for example, to
+     collect errors while unescaping).
+
+     @param string HTML escaped string (validated as not null)
+     @param inAttribute if the string is to be unescaped in strict mode (as attributes are)
+     @return an unescaped string; the input itself is returned when it contains no {@code &}
+     @see #setTrackErrors(int)
+     @see #unescapeEntities(String, boolean)
+     */
+    public String unescape(String string, boolean inAttribute) {
+        Validate.notNull(string);
+        if (string.indexOf('&') < 0) return string; // nothing to unescape. NOTE(review): returns before initialiseParse, so errors from a prior call may remain on a reused parser - confirm
+        this.treeBuilder.initialiseParse(new StringReader(string), "", this);
+        Tokeniser tokeniser = new Tokeniser(this.treeBuilder);
         return tokeniser.unescapeEntities(inAttribute);
     }
 
diff --git a/src/test/java/org/jsoup/parser/ParserTest.java b/src/test/java/org/jsoup/parser/ParserTest.java
@@ -30,6 +30,24 @@ public void unescapeEntitiesHandlesLargeInput() {
         assertEquals(body, Parser.unescapeEntities(body, false));
     }
 
+    @Test public void unescapeTracksErrors() {
+        Parser parser = Parser.htmlParser();
+        parser.setTrackErrors(10); // enable error tracking on this parser instance
+
+        String s = parser.unescape("One &bogus; &amp; &gt Two", false);
+        assertEquals("One &bogus; & > Two", s); // unknown &bogus; is left as-is; &amp; and the unterminated &gt are decoded
+        ParseErrorList errors = parser.getErrors();
+        assertEquals(2, errors.size()); // one error for &bogus; and one for the &gt without a semicolon
+        assertEquals("<1:6>: Invalid character reference: invalid named reference [bogus]", errors.get(0).toString());
+        assertEquals("<1:22>: Invalid character reference: missing semicolon on [&gt]", errors.get(1).toString());
+
+        // can reuse parser; errors will be reset, so only the second call's error is expected below
+        s = parser.unescape("One &amp; &bogus; Two", false);
+        assertEquals("One & &bogus; Two", s);
+        assertEquals(1, parser.getErrors().size());
+        assertEquals("<1:12>: Invalid character reference: invalid named reference [bogus]", parser.getErrors().get(0).toString());
+    }
+
     @Test
     public void testUtf8() throws IOException {
         // testcase for https://github.com/jhy/jsoup/issues/1557. no repro.