Skip to content

Commit df34774

Browse files
committed
Don't fire fake onNodeClosed for /body or /html
These were fired to get those closing node source positions tracked, but the node isn't actually popped at that stage. So, it was causing the StreamParser to emit twice. Just call trackNodePosition directly. Fixes #2295
1 parent f9154f0 commit df34774

File tree

5 files changed

+43
-4
lines changed

5 files changed

+43
-4
lines changed

CHANGES.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# jsoup Changelog
22

3-
## 1.19.2 (PENDING)
3+
## 1.20.1 (PENDING)
44

55
### Changes
66

@@ -22,6 +22,7 @@
2222
* When converting a Document to the W3C DOM in `W3CDom`, an element with an attribute in an undeclared namespace now
2323
gets a declaration of `xmlns:prefix="undefined"`. This allows subsequent serializations to XML via `W3CDom.asString()`
2424
to succeed. [#2087](https://github.com/jhy/jsoup/issues/2087).
25+
* The `StreamParser` could emit the final elements of a document twice, due to how `onNodeCompleted` was fired when closing out the stack. [#2295](https://github.com/jhy/jsoup/issues/2295).
2526

2627
## 1.19.1 (2025-03-04)
2728

src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -687,7 +687,7 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) {
687687
} else {
688688
if (tb.onStackNot(InBodyEndOtherErrors))
689689
tb.error(this);
690-
tb.onNodeClosed(tb.getFromStack("body")); // track source position of close; everything is still on stack in case of trailers
690+
tb.trackNodePosition(tb.getFromStack("body"), false); // track source position of close; body is left on stack, in case of trailers
691691
tb.transition(AfterBody);
692692
}
693693
break;
@@ -1656,7 +1656,7 @@ else if (name.equals("col")) {
16561656
tb.error(this);
16571657
return false;
16581658
} else {
1659-
if (html != null) tb.onNodeClosed(html); // track source position of close; everything is still on stack in case of trailers
1659+
if (html != null) tb.trackNodePosition(html, false); // track source position of close; html is left on stack, in case of trailers
16601660
tb.transition(AfterAfterBody);
16611661
}
16621662
} else if (t.isEOF()) {

src/main/java/org/jsoup/parser/TreeBuilder.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ void onNodeClosed(Node node) {
279279
nodeListener.tail(node, stack.size());
280280
}
281281

282-
private void trackNodePosition(Node node, boolean isStart) {
282+
void trackNodePosition(Node node, boolean isStart) {
283283
if (!trackSourceRange) return;
284284

285285
final Token token = currentToken;

src/test/java/org/jsoup/parser/PositionTest.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,23 @@ private void printRange(Node node) {
220220
assertEquals("2,9:15-4,8:33", data.sourceRange().toString());
221221
}
222222

223+
@Test void tracksExplicitAndImplicitBodyHtml() {
224+
// Tests that </body></html> tokens are tracked when present
225+
String htmlSans = "<body><a>Link</a>";
226+
String htmlWith = "<html><head></head><body><a>Link</a></body></html>";
227+
228+
Document docSans = Jsoup.parse(htmlSans, TrackingHtmlParser);
229+
Document docWith = Jsoup.parse(htmlWith, TrackingHtmlParser);
230+
231+
StringBuilder trackSans = new StringBuilder();
232+
StringBuilder trackWith = new StringBuilder();
233+
docSans.forEachNode(node -> accumulatePositions(node, trackSans));
234+
docWith.forEachNode(node -> accumulatePositions(node, trackWith));
235+
236+
assertEquals("#document:0-0~17-17; html:0-0~17-17; head:0-0~0-0; body:0-6~17-17; a:6-9~13-17; #text:9-13; ", trackSans.toString());
237+
assertEquals("#document:0-0~50-50; html:0-6~43-50; head:6-12~12-19; body:19-25~36-43; a:25-28~32-36; #text:28-32; ", trackWith.toString());
238+
}
239+
223240
@Test void tracksXml() {
224241
String xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!doctype html>\n<rss url=foo>\nXML\n</rss>\n<!-- comment -->";
225242
Document doc = Jsoup.parse(xml, TrackingXmlParser);

src/test/java/org/jsoup/parser/StreamParserTest.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import org.jsoup.nodes.Node;
1111
import org.jsoup.select.Elements;
1212
import org.junit.jupiter.api.Test;
13+
import org.junit.jupiter.params.ParameterizedTest;
14+
import org.junit.jupiter.params.provider.ValueSource;
1315

1416
import java.io.BufferedReader;
1517
import java.io.File;
@@ -463,4 +465,23 @@ void canStreamFragmentXml() throws IOException {
463465
}
464466
}
465467

468+
@ParameterizedTest
469+
@ValueSource(strings = {
470+
"<html><body><a>Link</a></body></html>",
471+
"<html><body><a>Link</a>",
472+
"<a>Link</a></body></html>",
473+
"<a>Link</a>",
474+
"<a>Link",
475+
"<a>Link</body>",
476+
})
477+
void emitsOnlyOnce(String html) {
478+
try (StreamParser parser = new StreamParser(Parser.htmlParser()).parse(html, "")) {
479+
// https://github.com/jhy/jsoup/issues/2295
480+
// When there was a /body or /html, those were being emitted twice, due to firing a fake onNodeClosed to track their source positions
481+
StringBuilder seen = new StringBuilder();
482+
parser.stream().forEach(el -> trackSeen(el, seen));
483+
assertEquals("head+;a[Link];body;html;#root;", seen.toString());
484+
}
485+
}
486+
466487
}

0 commit comments

Comments
 (0)