|
7 | 7 | import org.jsoup.nodes.*; |
8 | 8 | import org.jsoup.safety.Safelist; |
9 | 9 | import org.jsoup.select.Elements; |
| 10 | +import org.junit.jupiter.api.Nested; |
10 | 11 | import org.junit.jupiter.api.Test; |
11 | 12 | import org.junit.jupiter.params.ParameterizedTest; |
12 | 13 | import org.junit.jupiter.params.provider.Arguments; |
@@ -2205,6 +2206,115 @@ static void assertErrorsDoNotContain(String msg, ParseErrorList errors) { |
2205 | 2206 | assertEquals("<1:26>: Unexpected character '\u0000' in input state [Data]", errors.get(1).toString()); |
2206 | 2207 | assertEquals("<1:27>: Unexpected character '\u0000' in input state [Data]", errors.get(2).toString()); |
2207 | 2208 | assertEquals("<1:43>: Unexpected character '\u0000' in input state [Data]", errors.get(3).toString()); |
| 2209 | + } |
| 2210 | + |
| 2211 | + @Nested class DeepHtmlTrees { |
| 2212 | + private int depth(Element el) { |
| 2213 | + int d = 0; |
| 2214 | + while ((el = el.parent()) != null) { |
| 2215 | + d++; |
| 2216 | + } while (el != null); |
| 2217 | + return d; |
| 2218 | + } |
2208 | 2219 |
|
| 2220 | + /** |
| 2221 | + * Parse the HTML code in `contents`, wrapped in enough divs to ensure that the root elements |
| 2222 | + * of contents are at depth `startingDepth`. |
| 2223 | + */ |
| 2224 | + private Element parseDeepHtml(int startingDepth, String contents) { |
| 2225 | + StringBuilder html = new StringBuilder(); |
| 2226 | + html.append("<html><body>"); |
| 2227 | + for (int i = 0; i < startingDepth - 4; i++) { |
| 2228 | + html.append("<div>"); |
| 2229 | + } |
| 2230 | + html.append("<div id='container'>"); |
| 2231 | + html.append(contents); |
| 2232 | + |
| 2233 | + Parser parser = Parser.htmlParser(); |
| 2234 | + Document doc = Jsoup.parse(html.toString(), parser); |
| 2235 | + Element container = doc.getElementById("container"); |
| 2236 | + assertNotNull(container); |
| 2237 | + assertEquals(startingDepth - 1, depth(container)); |
| 2238 | + |
| 2239 | + return container; |
| 2240 | + } |
| 2241 | + |
| 2242 | + @Test void nestedDivs() { |
| 2243 | + Element container = parseDeepHtml(511, "<div><div><div>"); |
| 2244 | + |
| 2245 | + assertEquals("<div>\n <div></div>\n <div></div>\n</div>", container.html()); |
| 2246 | + } |
| 2247 | + |
| 2248 | + @Test void closingTagOfTagClosedByDepthLimit() { |
| 2249 | + // The <a></a> tag would be nested too deep, so it first closes the innermost <span>. |
| 2250 | + // This means that the first </span> will close the outer <span>, as it's the only |
| 2251 | + // one that is currently open. The last </span> is then just ignored, as there is no |
| 2252 | + // open <span> left to close. |
| 2253 | + Element container = parseDeepHtml(511, "<span><span><a></a></span><b></b></span>"); |
| 2254 | + |
| 2255 | + assertEquals("<span><span></span><a></a></span><b></b>", container.html()); |
| 2256 | + } |
| 2257 | + |
| 2258 | + @Test void tableAtDepthLimitWithDirectTd() { |
| 2259 | + Element container = parseDeepHtml(512, "<table><td>"); |
| 2260 | + |
| 2261 | + assertEquals("<table></table>\n<tbody></tbody>\n<tr></tr>\n<td></td>", container.html()); |
| 2262 | + } |
| 2263 | + |
| 2264 | + @Test void tableRightBeforeDepthLimitWithDirectTd() { |
| 2265 | + Element container = parseDeepHtml(511, "<table><td>"); |
| 2266 | + |
| 2267 | + assertEquals("<table>\n <tbody></tbody>\n <tr></tr>\n <td></td>\n</table>", container.html()); |
| 2268 | + } |
| 2269 | + |
| 2270 | + @Test void customDepthLimit() { |
| 2271 | + Parser parser = Parser.htmlParser().setMaxDepth(5); |
| 2272 | + String input = "<html><body><div><div><div><div><div><div>"; |
| 2273 | + |
| 2274 | + Document doc = Jsoup.parse(input, parser); |
| 2275 | + String expected = new StringBuilder() |
| 2276 | + .append("<html>\n") |
| 2277 | + .append(" <head></head>\n") |
| 2278 | + .append(" <body>\n") |
| 2279 | + .append(" <div>\n") |
| 2280 | + .append(" <div>\n") |
| 2281 | + .append(" <div></div>\n") |
| 2282 | + .append(" <div></div>\n") |
| 2283 | + .append(" <div></div>\n") |
| 2284 | + .append(" <div></div>\n") |
| 2285 | + .append(" </div>\n") |
| 2286 | + .append(" </div>\n") |
| 2287 | + .append(" </body>\n") |
| 2288 | + .append("</html>") |
| 2289 | + .toString(); |
| 2290 | + |
| 2291 | + assertEquals(expected, doc.html()); |
| 2292 | + } |
| 2293 | + |
| 2294 | + @Test void formControlsDetachWhenFormTrimmed() { |
| 2295 | + Parser parser = Parser.htmlParser().setMaxDepth(3); |
| 2296 | + String input = "<form id='f'><div><input name='foo'></div></form>"; |
| 2297 | + |
| 2298 | + Document doc = Jsoup.parse(input, "", parser); |
| 2299 | + Element formEl = doc.getElementById("f"); |
| 2300 | + assertNotNull(formEl); |
| 2301 | + assertTrue(formEl instanceof FormElement); |
| 2302 | + FormElement form = (FormElement) formEl; |
| 2303 | + assertEquals("", form.html()); |
| 2304 | + assertEquals(0, form.elements().size()); |
| 2305 | + } |
| 2306 | + |
| 2307 | + @Test void templateModesClearedWhenTrimmed() { |
| 2308 | + Parser parser = Parser.htmlParser().setMaxDepth(3); |
| 2309 | + String input = "<template id='tmpl'><div><span>One</span></div></template><p>Two</p>"; |
| 2310 | + |
| 2311 | + Document doc = Jsoup.parse(input, "", parser); |
| 2312 | + Element template = doc.getElementById("tmpl"); |
| 2313 | + assertNotNull(template); |
| 2314 | + assertEquals("", template.html()); |
| 2315 | + Element paragraph = doc.selectFirst("p"); |
| 2316 | + assertNotNull(paragraph); |
| 2317 | + assertEquals("Two", paragraph.text()); |
| 2318 | + } |
2209 | 2319 | } |
2210 | 2320 | } |
0 commit comments