Skip to content

Commit dedf6ab

Browse files
committed
Normalize (remove or replace) nulls on body
Previously, we only checked whether the character data was exactly a single null character, not whether the data contained a null anywhere within it.
1 parent d1e3106 commit dedf6ab

File tree

5 files changed

+73
-25
lines changed

5 files changed

+73
-25
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
* Attribute selector values are now compared literally without trimming. Previously, jsoup trimmed whitespace from selector values and from element attribute values, which could cause mismatches with browser behavior (e.g. `[attr=" foo "]`). Now matches align with the CSS specification and browser engines. [#2380](https://github.com/jhy/jsoup/issues/2380)
1111
* When using the JDK HttpClient, any system default proxy (`ProxySelector.getDefault()`) was ignored. Now, the system proxy is used if a per-request proxy is not set. [#2388](https://github.com/jhy/jsoup/issues/2388), [#2390](https://github.com/jhy/jsoup/pull/2390)
1212
* A ValidationException could be thrown in the adoption agency algorithm with particularly broken input. Now logged as a parse error. [#2393](https://github.com/jhy/jsoup/issues/2393)
13+
* Null characters in the HTML body were not consistently removed, and in foreign content they were not correctly replaced with the replacement character (U+FFFD). [#2395](https://github.com/jhy/jsoup/issues/2395)
1314

1415

1516
## 1.21.2 (2025-Aug-25)

src/main/java/org/jsoup/parser/HtmlTreeBuilder.java

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -407,8 +407,20 @@ void insertCommentNode(Token.Comment token) {
407407
onNodeInserted(node);
408408
}
409409

410-
/** Inserts the provided character token into the current element. */
410+
/** Inserts the provided character token into the current element. Any nulls in the data will be removed. */
411411
void insertCharacterNode(Token.Character characterToken) {
412+
insertCharacterNode(characterToken, false);
413+
}
414+
415+
/**
416+
Inserts the provided character token into the current element. The tokenizer will have already raised precise character errors.
417+
418+
@param characterToken the character token to insert
419+
@param replace if true, replaces any null chars in the data with the replacement char (U+FFFD). If false, removes
420+
null chars.
421+
*/
422+
void insertCharacterNode(Token.Character characterToken, boolean replace) {
423+
characterToken.normalizeNulls(replace);
412424
Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack)
413425
insertCharacterToElement(characterToken, el);
414426
}

src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java

Lines changed: 6 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -286,15 +286,12 @@ private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
286286
switch (t.type) {
287287
case Character: {
288288
Token.Character c = t.asCharacter();
289-
if (c.getData().equals(nullString)) {
290-
tb.error(this);
291-
return false;
292-
} else if (tb.framesetOk() && isWhitespace(c)) { // don't check if whitespace if frames already closed
289+
if (tb.framesetOk() && isWhitespace(c)) { // don't check if whitespace if frames already closed
293290
tb.reconstructFormattingElements();
294291
tb.insertCharacterNode(c);
295292
} else {
296293
tb.reconstructFormattingElements();
297-
tb.insertCharacterNode(c);
294+
tb.insertCharacterNode(c); // strips nulls
298295
tb.framesetOk(false);
299296
}
300297
break;
@@ -1115,13 +1112,7 @@ boolean anythingElse(Token t, HtmlTreeBuilder tb) {
11151112
InTableText {
11161113
@Override boolean process(Token t, HtmlTreeBuilder tb) {
11171114
if (t.type == Token.TokenType.Character) {
1118-
Token.Character c = t.asCharacter();
1119-
if (c.getData().equals(nullString)) {
1120-
tb.error(this);
1121-
return false;
1122-
} else {
1123-
tb.addPendingTableCharacters(c);
1124-
}
1115+
tb.addPendingTableCharacters(t.asCharacter()); // gets to insertCharacterNode, which strips nulls
11251116
} else {
11261117
// insert gathered table text into the correct element:
11271118
if (tb.getPendingTableCharacters().size() > 0) {
@@ -1454,13 +1445,7 @@ private void closeCell(HtmlTreeBuilder tb) {
14541445

14551446
switch (t.type) {
14561447
case Character:
1457-
Token.Character c = t.asCharacter();
1458-
if (c.getData().equals(nullString)) {
1459-
tb.error(this);
1460-
return false;
1461-
} else {
1462-
tb.insertCharacterNode(c);
1463-
}
1448+
tb.insertCharacterNode(t.asCharacter());
14641449
break;
14651450
case Comment:
14661451
tb.insertCommentNode(t.asComment());
@@ -1790,12 +1775,10 @@ else if (name.equals("col")) {
17901775
switch (t.type) {
17911776
case Character:
17921777
Token.Character c = t.asCharacter();
1793-
if (c.getData().equals(nullString))
1794-
tb.error(this);
1795-
else if (HtmlTreeBuilderState.isWhitespace(c))
1778+
if (HtmlTreeBuilderState.isWhitespace(c))
17961779
tb.insertCharacterNode(c);
17971780
else {
1798-
tb.insertCharacterNode(c);
1781+
tb.insertCharacterNode(c, true); // replace nulls
17991782
tb.framesetOk(false);
18001783
}
18011784
break;

src/main/java/org/jsoup/parser/Token.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,20 @@ public String toString() {
415415
return getData();
416416
}
417417

418+
/**
419+
Normalize null chars in the data. If replace is true, replaces with the replacement char; if false, removes.
420+
*/
421+
public void normalizeNulls(boolean replace) {
422+
String data = this.data.value();
423+
if (data.indexOf(TokeniserState.nullChar) == -1) return;
424+
425+
data = (replace ?
426+
data.replace(TokeniserState.nullChar, Tokeniser.replacementChar) :
427+
data.replace(nullString, ""));
428+
this.data.set(data);
429+
}
430+
431+
private static final String nullString = String.valueOf(TokeniserState.nullChar);
418432
}
419433

420434
final static class CData extends Character {

src/test/java/org/jsoup/parser/HtmlParserTest.java

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -856,7 +856,7 @@ private static Stream<Arguments> dupeAttributeData() {
856856

857857
@Test public void handlesNullInData() {
858858
Document doc = Jsoup.parse("<p id=\u0000>Blah \u0000</p>");
859-
assertEquals("<p id=\"\uFFFD\">Blah &#x0;</p>", doc.body().html()); // replaced in attr, NOT replaced in data (but is escaped as control char <0x20)
859+
assertEquals("<p id=\"\uFFFD\">Blah</p>", doc.body().html()); // replaced in attr, discarded in data
860860
}
861861

862862
@Test public void handlesNullInComments() {
@@ -2130,4 +2130,42 @@ static void assertErrorsDoNotContain(String msg, ParseErrorList errors) {
21302130
assertEquals("a < b", data.data());
21312131
assertEquals("<data>a < b</data>", data.outerHtml());
21322132
}
2133+
2134+
@Test void dropsNullsFromBody() {
2135+
// https://github.com/jhy/jsoup/issues/2395
2136+
String html = "<p>\u0000</p><p>\u0000\u0000</p><p>Hi\u0000</p>";
2137+
2138+
Parser parser = Parser.htmlParser();
2139+
parser.setTrackErrors(10);
2140+
2141+
Document doc = Jsoup.parse(html, parser);
2142+
assertEquals("<p></p>\n<p></p>\n<p>Hi</p>", doc.body().html());
2143+
assertEquals("Hi", doc.body().text());
2144+
2145+
ParseErrorList errors = parser.getErrors();
2146+
assertEquals(4, errors.size());
2147+
assertEquals("<1:4>: Unexpected character '\u0000' in input state [Data]", errors.get(0).toString());
2148+
assertEquals("<1:12>: Unexpected character '\u0000' in input state [Data]", errors.get(1).toString());
2149+
assertEquals("<1:13>: Unexpected character '\u0000' in input state [Data]", errors.get(2).toString());
2150+
assertEquals("<1:23>: Unexpected character '\u0000' in input state [Data]", errors.get(3).toString());
2151+
// todo should we replace that null, for convenience?
2152+
}
2153+
2154+
@Test void replacesNullsInForeign() {
2155+
String html = "<svg><text>\u0000</text><text>\u0000\u0000</text><text>Hi\u0000</text></svg>";
2156+
Parser parser = Parser.htmlParser();
2157+
parser.setTrackErrors(10);
2158+
2159+
Document doc = Jsoup.parse(html, parser);
2160+
assertEquals("<svg>\n <text>�</text><text>��</text><text>Hi�</text>\n</svg>", doc.body().html());
2161+
assertEquals("���Hi�", doc.body().text());
2162+
2163+
ParseErrorList errors = parser.getErrors();
2164+
assertEquals(4, errors.size());
2165+
assertEquals("<1:12>: Unexpected character '\u0000' in input state [Data]", errors.get(0).toString());
2166+
assertEquals("<1:26>: Unexpected character '\u0000' in input state [Data]", errors.get(1).toString());
2167+
assertEquals("<1:27>: Unexpected character '\u0000' in input state [Data]", errors.get(2).toString());
2168+
assertEquals("<1:43>: Unexpected character '\u0000' in input state [Data]", errors.get(3).toString());
2169+
2170+
}
21332171
}

0 commit comments

Comments
 (0)