Skip to content

Commit e8d5b97

Browse files
authored
Added an instance method Parser#unescape, to enable error tracking (#2396)
1 parent c42f4c3 commit e8d5b97

File tree

3 files changed

+48
-7
lines changed

3 files changed

+48
-7
lines changed

CHANGES.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
## 1.22.1 (PENDING)
44

5+
### Improvements
6+
* Added a non-static `Parser#unescape(String, boolean)` method to unescape entities with the ability to track errors. [#2396](https://github.com/jhy/jsoup/pull/2396)
7+
* Added an instance method `Parser#unescape(String, boolean)` that unescapes HTML entities using the parser’s configuration (e.g. to support error tracking), complementing the existing static utility `Parser.unescapeEntities(String, boolean)`. [#2396](https://github.com/jhy/jsoup/pull/2396)
8+
59
### Bug Fixes
610
* Previously cached child Elements of an Element were not correctly invalidated in `Node#replaceWith(Node)`, which could lead to incorrect results when subsequently calling `Element#children()`. [#2391](https://github.com/jhy/jsoup/issues/2391)
711
* Attribute selector values are now compared literally without trimming. Previously, jsoup trimmed whitespace from selector values and from element attribute values, which could cause mismatches with browser behavior (e.g. `[attr=" foo "]`). Now matches align with the CSS specification and browser engines. [#2380](https://github.com/jhy/jsoup/issues/2380)

src/main/java/org/jsoup/parser/Parser.java

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -300,17 +300,36 @@ public static Document parseBodyFragment(String bodyHtml, String baseUri) {
300300
}
301301

302302
/**
303-
* Utility method to unescape HTML entities from a string
304-
* @param string HTML escaped string
305-
* @param inAttribute if the string is to be escaped in strict mode (as attributes are)
306-
* @return an unescaped string
303+
Utility method to unescape HTML entities from a string.
304+
<p>To track errors while unescaping, use
305+
{@link #unescape(String, boolean)} with a Parser instance that has error tracking enabled.</p>
306+
307+
@param string HTML escaped string
308+
@param inAttribute if the string is to be unescaped in strict mode (as attributes are)
309+
@return an unescaped string
310+
@see #unescape(String, boolean)
307311
*/
308312
public static String unescapeEntities(String string, boolean inAttribute) {
309313
Validate.notNull(string);
310314
if (string.indexOf('&') < 0) return string; // nothing to unescape
311-
Parser parser = Parser.htmlParser();
312-
parser.treeBuilder.initialiseParse(new StringReader(string), "", parser);
313-
Tokeniser tokeniser = new Tokeniser(parser.treeBuilder);
315+
return Parser.htmlParser().unescape(string, inAttribute);
316+
}
317+
318+
/**
319+
Utility method to unescape HTML entities from a string, using this {@code Parser}'s configuration (for example, to
320+
collect errors while unescaping).
321+
322+
@param string HTML escaped string
323+
@param inAttribute if the string is to be unescaped in strict mode (as attributes are)
324+
@return an unescaped string
325+
@see #setTrackErrors(int)
326+
@see #unescapeEntities(String, boolean)
327+
*/
328+
public String unescape(String string, boolean inAttribute) {
329+
Validate.notNull(string);
330+
if (string.indexOf('&') < 0) return string; // nothing to unescape
331+
this.treeBuilder.initialiseParse(new StringReader(string), "", this);
332+
Tokeniser tokeniser = new Tokeniser(this.treeBuilder);
314333
return tokeniser.unescapeEntities(inAttribute);
315334
}
316335

src/test/java/org/jsoup/parser/ParserTest.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,24 @@ public void unescapeEntitiesHandlesLargeInput() {
3030
assertEquals(body, Parser.unescapeEntities(body, false));
3131
}
3232

33+
@Test public void unescapeTracksErrors() {
34+
Parser parser = Parser.htmlParser();
35+
parser.setTrackErrors(10);
36+
37+
String s = parser.unescape("One &bogus; &amp; &gt Two", false);
38+
assertEquals("One &bogus; & > Two", s);
39+
ParseErrorList errors = parser.getErrors();
40+
assertEquals(2, errors.size());
41+
assertEquals("<1:6>: Invalid character reference: invalid named reference [bogus]", errors.get(0).toString());
42+
assertEquals("<1:22>: Invalid character reference: missing semicolon on [&gt]", errors.get(1).toString());
43+
44+
// can reuse parser; errors will be reset
45+
s = parser.unescape("One &amp; &bogus; Two", false);
46+
assertEquals("One & &bogus; Two", s);
47+
assertEquals(1, parser.getErrors().size());
48+
assertEquals("<1:12>: Invalid character reference: invalid named reference [bogus]", parser.getErrors().get(0).toString());
49+
}
50+
3351
@Test
3452
public void testUtf8() throws IOException {
3553
// testcase for https://github.com/jhy/jsoup/issues/1557. no repro.

0 commit comments

Comments
 (0)