Skip to content

Commit 2cc74b6

Browse files
nicolo-ribaudo and jhy authored
Align maximum HTML depth handling with browsers (#2421)
Co-authored-by: Jonathan Hedley <jonathan@hedley.net>
1 parent bcbac42 commit 2cc74b6

File tree

9 files changed

+241
-14
lines changed

9 files changed

+241
-14
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
(If you already have that dependency in your classpath, but you want to keep using the Java regex engine, you can disable re2j via `System.setProperty("jsoup.useRe2j", "false")`.) You can confirm that the re2j engine has been enabled correctly by calling `Regex.usingRe2j()`. [#2407](https://github.com/jhy/jsoup/pull/2407)
1515

1616
* Added an instance method `Parser#unescape(String, boolean)` that unescapes HTML entities using the parser's configuration (e.g. to support error tracking), complementing the existing static utility `Parser.unescapeEntities(String, boolean)`. [#2396](https://github.com/jhy/jsoup/pull/2396)
17+
* Added a configurable maximum parser depth (number of open elements on the stack) to both the HTML and XML parsers. The HTML parser now defaults to a depth of 512, to match browser behavior and to protect against unbounded stack growth. The XML parser keeps unlimited depth by default, but can opt into a limit via `Parser#setMaxDepth`. [#2421](https://github.com/jhy/jsoup/issues/2421)
1718
* Build: added CI coverage for JDK 25 [#2403](https://github.com/jhy/jsoup/pull/2403)
1819
* Build: added a CI fuzzer for contextual fragment parsing (in addition to existing full body HTML and XML fuzzers). [oss-fuzz #14041](https://github.com/google/oss-fuzz/pull/14041)
1920

src/main/java/org/jsoup/parser/HtmlTreeBuilder.java

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,9 @@ public class HtmlTreeBuilder extends TreeBuilder {
6060
"button", "fieldset", "input", "keygen", "object", "output", "select", "textarea"
6161
};
6262

63-
public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages
63+
/** @deprecated This is not used anymore. Will be removed in a future release. */
64+
@Deprecated
65+
public static final int MaxScopeSearchDepth = 100;
6466

6567
private HtmlTreeBuilderState state; // the current state
6668
private HtmlTreeBuilderState originalState; // original / marked state
@@ -392,6 +394,8 @@ FormElement insertFormElement(Token.StartTag startTag, boolean onStack, boolean
392394
* @param el the Element to insert and make the current element
393395
*/
394396
private void doInsertElement(Element el) {
397+
enforceStackDepthLimit();
398+
395399
if (formElement != null && el.tag().namespace.equals(NamespaceHtml) && StringUtil.inSorted(el.normalName(), TagFormListed))
396400
formElement.addElement(el); // connect form controls to their form element
397401

@@ -498,6 +502,20 @@ boolean removeFromStack(Element el) {
498502
return false;
499503
}
500504

505+
@Override
506+
void onStackPrunedForDepth(Element element) {
507+
// handle other effects of popping to keep state correct
508+
if (element == headElement) headElement = null;
509+
if (element == formElement) setFormElement(null);
510+
removeFromActiveFormattingElements(element);
511+
if (element.nameIs("template")) {
512+
clearFormattingElementsToLastMarker();
513+
if (templateModeSize() > 0)
514+
popTemplateMode();
515+
resetInsertionMode();
516+
}
517+
}
518+
501519
/** Pops the stack until the given HTML element is removed. */
502520
@Nullable
503521
Element popStackToClose(String elName) {
@@ -699,9 +717,8 @@ private boolean inSpecificScope(String targetName, String[] baseTypes, String[]
699717
private boolean inSpecificScope(String[] targetNames, String[] baseTypes, @Nullable String[] extraTypes) {
700718
// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope
701719
final int bottom = stack.size() -1;
702-
final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0;
703720
// walk the full stack, from the current element up to the root
704-
for (int pos = bottom; pos >= top; pos--) {
721+
for (int pos = bottom; pos >= 0; pos--) {
705722
Element el = stack.get(pos);
706723
String elName = el.normalName();
707724
// namespace checks - arguments provided are always in html ns, with this bolt-on for math and svg:
@@ -762,11 +779,7 @@ boolean inSelectScope(String targetName) {
762779

763780
/** Tests if there is some element on the stack that is not in the provided set. */
764781
boolean onStackNot(String[] allowedTags) {
765-
final int bottom = stack.size() -1;
766-
final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0;
767-
// don't walk too far up the tree
768-
769-
for (int pos = bottom; pos >= top; pos--) {
782+
for (int pos = stack.size() - 1; pos >= 0; pos--) {
770783
final String elName = stack.get(pos).normalName();
771784
if (!inSorted(elName, allowedTags))
772785
return true;

src/main/java/org/jsoup/parser/Parser.java

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ public class Parser implements Cloneable {
3030
private boolean trackPosition = false;
3131
private @Nullable TagSet tagSet;
3232
private final ReentrantLock lock = new ReentrantLock();
33+
private int maxDepth;
3334

3435
/**
3536
* Create a new Parser, using the specified TreeBuilder
@@ -39,6 +40,7 @@ public Parser(TreeBuilder treeBuilder) {
3940
this.treeBuilder = treeBuilder;
4041
settings = treeBuilder.defaultSettings();
4142
errors = ParseErrorList.noTracking();
43+
maxDepth = treeBuilder.defaultMaxDepth();
4244
}
4345

4446
/**
@@ -60,6 +62,7 @@ private Parser(Parser copy) {
6062
errors = new ParseErrorList(copy.errors); // only copies size, not contents
6163
settings = new ParseSettings(copy.settings);
6264
trackPosition = copy.trackPosition;
65+
maxDepth = copy.maxDepth;
6366
tagSet = new TagSet(copy.tagSet());
6467
}
6568

@@ -195,6 +198,28 @@ public ParseSettings settings() {
195198
return settings;
196199
}
197200

201+
/**
202+
Set the parser's maximum stack depth (maximum number of open elements). When the limit is reached, the deepest open
203+
elements are closed to make room for each new element, preventing excessive nesting. Defaults to 512 for the HTML parser, and unlimited for the XML
204+
parser.
205+
206+
@param maxDepth maximum parser depth; must be >= 1
207+
@return this Parser, for chaining
208+
*/
209+
public Parser setMaxDepth(int maxDepth) {
210+
Validate.isTrue(maxDepth >= 1, "maxDepth must be >= 1");
211+
this.maxDepth = maxDepth;
212+
return this;
213+
}
214+
215+
/**
216+
* Get the maximum parser depth (maximum number of open elements).
217+
* @return the current max parser depth
218+
*/
219+
public int getMaxDepth() {
220+
return maxDepth;
221+
}
222+
198223
/**
199224
Set a custom TagSet to use for this Parser. This allows you to define your own tags, and control how they are
200225
parsed. For example, you can set a tag to preserve whitespace, or to be treated as a block tag.
@@ -351,6 +376,6 @@ public static Parser htmlParser() {
351376
* @return a new simple XML parser.
352377
*/
353378
public static Parser xmlParser() {
354-
return new Parser(new XmlTreeBuilder());
379+
return new Parser(new XmlTreeBuilder()).setMaxDepth(Integer.MAX_VALUE);
355380
}
356381
}

src/main/java/org/jsoup/parser/TreeBuilder.java

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,8 @@
1111
import org.jspecify.annotations.Nullable;
1212

1313
import java.io.Reader;
14-
import java.io.StringReader;
1514
import java.util.ArrayList;
16-
import java.util.HashMap;
1715
import java.util.List;
18-
import java.util.Map;
1916

2017
import static org.jsoup.parser.Parser.NamespaceHtml;
2118

@@ -174,6 +171,33 @@ final void push(Element element) {
174171
onNodeInserted(element);
175172
}
176173

174+
/**
175+
Ensures the stack respects {@link Parser#getMaxDepth()} by closing the deepest open elements until there is room for
176+
a new insertion.
177+
*/
178+
final void enforceStackDepthLimit() {
179+
final int maxDepth = parser.getMaxDepth();
180+
if (maxDepth == Integer.MAX_VALUE) return;
181+
while (stack.size() >= maxDepth) {
182+
Element trimmed = pop();
183+
onStackPrunedForDepth(trimmed);
184+
}
185+
}
186+
187+
/**
188+
Hook called for each element popped from the stack by the depth limit; the HTML Tree Builder overrides it to clean up its state.
189+
*/
190+
void onStackPrunedForDepth(Element element) {
191+
// default no-op
192+
}
193+
194+
/**
195+
Default maximum depth for parsers using this tree builder.
196+
*/
197+
int defaultMaxDepth() {
198+
return 512;
199+
}
200+
177201
/**
178202
Get the current element (last on the stack). If all items have been removed, returns the document instead
179203
(which might not actually be on the stack; use stack.size() == 0 to test if required).

src/main/java/org/jsoup/parser/XmlTreeBuilder.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,11 @@ TagSet defaultTagSet() {
106106
return new TagSet(); // an empty tagset
107107
}
108108

109+
@Override
110+
int defaultMaxDepth() {
111+
return Integer.MAX_VALUE;
112+
}
113+
109114
@Override
110115
protected boolean process(Token token) {
111116
currentToken = token;
@@ -151,6 +156,8 @@ void insertElementFor(Token.StartTag startTag) {
151156
applyNamespacesToAttributes(attributes, namespaces);
152157
}
153158

159+
enforceStackDepthLimit();
160+
154161
String tagName = startTag.tagName.value();
155162
String ns = resolveNamespace(tagName, namespaces);
156163
Tag tag = tagFor(tagName, startTag.normalName, ns, settings);

src/test/java/org/jsoup/nodes/ElementIT.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package org.jsoup.nodes;
22

33
import org.jsoup.Jsoup;
4+
import org.jsoup.parser.Parser;
45
import org.jsoup.select.Elements;
56
import org.junit.jupiter.api.Test;
67

@@ -125,6 +126,7 @@ public void testFastReparentExistingContent() {
125126
@Test void wrapNoOverflow() {
126127
// deepChild was recursive, so could overflow if presented with a fairly insane wrap
127128
Document doc = new Document("https://example.com/");
129+
doc.parser().setMaxDepth(Integer.MAX_VALUE); // don't limit to 512
128130
Element el = doc.body().appendElement("p");
129131
int num = 50000;
130132
StringBuilder sb = new StringBuilder();
@@ -134,7 +136,7 @@ public void testFastReparentExistingContent() {
134136
el.wrap(sb.toString());
135137
String html = doc.body().html();
136138
assertTrue(html.startsWith("<div>"));
137-
assertEquals(num + 3, el.parents().size());
139+
assertEquals(num + 3, el.parents().size()); // + 3 is for body, html, document
138140
}
139141

140142
@Test

src/test/java/org/jsoup/parser/HtmlParserTest.java

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import org.jsoup.nodes.*;
88
import org.jsoup.safety.Safelist;
99
import org.jsoup.select.Elements;
10+
import org.junit.jupiter.api.Nested;
1011
import org.junit.jupiter.api.Test;
1112
import org.junit.jupiter.params.ParameterizedTest;
1213
import org.junit.jupiter.params.provider.Arguments;
@@ -2205,6 +2206,115 @@ static void assertErrorsDoNotContain(String msg, ParseErrorList errors) {
22052206
assertEquals("<1:26>: Unexpected character '\u0000' in input state [Data]", errors.get(1).toString());
22062207
assertEquals("<1:27>: Unexpected character '\u0000' in input state [Data]", errors.get(2).toString());
22072208
assertEquals("<1:43>: Unexpected character '\u0000' in input state [Data]", errors.get(3).toString());
2209+
}
2210+
2211+
@Nested class DeepHtmlTrees {
2212+
private int depth(Element el) {
2213+
int d = 0;
2214+
while ((el = el.parent()) != null) {
2215+
d++;
2216+
} while (el != null);
2217+
return d;
2218+
}
22082219

2220+
/**
2221+
* Parse the HTML code in `contents`, wrapped in enough divs to ensure that the root elements
2222+
* of contents are at depth `startingDepth`.
2223+
*/
2224+
private Element parseDeepHtml(int startingDepth, String contents) {
2225+
StringBuilder html = new StringBuilder();
2226+
html.append("<html><body>");
2227+
for (int i = 0; i < startingDepth - 4; i++) {
2228+
html.append("<div>");
2229+
}
2230+
html.append("<div id='container'>");
2231+
html.append(contents);
2232+
2233+
Parser parser = Parser.htmlParser();
2234+
Document doc = Jsoup.parse(html.toString(), parser);
2235+
Element container = doc.getElementById("container");
2236+
assertNotNull(container);
2237+
assertEquals(startingDepth - 1, depth(container));
2238+
2239+
return container;
2240+
}
2241+
2242+
@Test void nestedDivs() {
2243+
Element container = parseDeepHtml(511, "<div><div><div>");
2244+
2245+
assertEquals("<div>\n <div></div>\n <div></div>\n</div>", container.html());
2246+
}
2247+
2248+
@Test void closingTagOfTagClosedByDepthLimit() {
2249+
// The <a></a> tag would be nested too deep, so it first closes the innermost <span>.
2250+
// This means that the first </span> will close the outer <span>, as it's the only
2251+
// one that is currently open. The last </span> is then just ignored, as there is no
2252+
// open <span> left to close.
2253+
Element container = parseDeepHtml(511, "<span><span><a></a></span><b></b></span>");
2254+
2255+
assertEquals("<span><span></span><a></a></span><b></b>", container.html());
2256+
}
2257+
2258+
@Test void tableAtDepthLimitWithDirectTd() {
2259+
Element container = parseDeepHtml(512, "<table><td>");
2260+
2261+
assertEquals("<table></table>\n<tbody></tbody>\n<tr></tr>\n<td></td>", container.html());
2262+
}
2263+
2264+
@Test void tableRightBeforeDepthLimitWithDirectTd() {
2265+
Element container = parseDeepHtml(511, "<table><td>");
2266+
2267+
assertEquals("<table>\n <tbody></tbody>\n <tr></tr>\n <td></td>\n</table>", container.html());
2268+
}
2269+
2270+
@Test void customDepthLimit() {
2271+
Parser parser = Parser.htmlParser().setMaxDepth(5);
2272+
String input = "<html><body><div><div><div><div><div><div>";
2273+
2274+
Document doc = Jsoup.parse(input, parser);
2275+
String expected = new StringBuilder()
2276+
.append("<html>\n")
2277+
.append(" <head></head>\n")
2278+
.append(" <body>\n")
2279+
.append(" <div>\n")
2280+
.append(" <div>\n")
2281+
.append(" <div></div>\n")
2282+
.append(" <div></div>\n")
2283+
.append(" <div></div>\n")
2284+
.append(" <div></div>\n")
2285+
.append(" </div>\n")
2286+
.append(" </div>\n")
2287+
.append(" </body>\n")
2288+
.append("</html>")
2289+
.toString();
2290+
2291+
assertEquals(expected, doc.html());
2292+
}
2293+
2294+
@Test void formControlsDetachWhenFormTrimmed() {
2295+
Parser parser = Parser.htmlParser().setMaxDepth(3);
2296+
String input = "<form id='f'><div><input name='foo'></div></form>";
2297+
2298+
Document doc = Jsoup.parse(input, "", parser);
2299+
Element formEl = doc.getElementById("f");
2300+
assertNotNull(formEl);
2301+
assertTrue(formEl instanceof FormElement);
2302+
FormElement form = (FormElement) formEl;
2303+
assertEquals("", form.html());
2304+
assertEquals(0, form.elements().size());
2305+
}
2306+
2307+
@Test void templateModesClearedWhenTrimmed() {
2308+
Parser parser = Parser.htmlParser().setMaxDepth(3);
2309+
String input = "<template id='tmpl'><div><span>One</span></div></template><p>Two</p>";
2310+
2311+
Document doc = Jsoup.parse(input, "", parser);
2312+
Element template = doc.getElementById("tmpl");
2313+
assertNotNull(template);
2314+
assertEquals("", template.html());
2315+
Element paragraph = doc.selectFirst("p");
2316+
assertNotNull(paragraph);
2317+
assertEquals("Two", paragraph.text());
2318+
}
22092319
}
22102320
}

src/test/java/org/jsoup/parser/ParserIT.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,16 @@ public void handlesDeepStack() {
4949
long start = System.currentTimeMillis();
5050
Document doc = Parser.parseBodyFragment(longBody.toString(), "");
5151

52+
int depth = 1;
53+
Element el = doc.body();
54+
while (el.childrenSize() > 0) {
55+
el = el.child(0);
56+
depth++;
57+
}
58+
5259
// Assert
53-
assertEquals(2, doc.body().childNodeSize());
60+
assertEquals(1, doc.body().childrenSize());
61+
assertEquals(512, depth);
5462
assertEquals(25000, doc.select("dd").size());
5563
assertTrue(System.currentTimeMillis() - start < 20000); // I get ~ 1.5 seconds, but others have reported slower
5664
// was originally much longer, or stack overflow.

0 commit comments

Comments
 (0)