Skip to content

Commit 7838399

Browse files
committed
Bring self-closing tags into HTML5 spec
For #2300. In HTML, only foreign elements (svg, math) can self-close. Allows users to override via custom tags. The XML parser and XML output syntax will still allow self-closing.
1 parent aabf0b0 commit 7838399

File tree

11 files changed

+169
-51
lines changed

11 files changed

+169
-51
lines changed

CHANGES.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,17 @@
44

55
### Changes
66

7+
* To better follow the HTML5 spec and current browsers, the HTML parser no longer allows self-closing tags (`<foo />`)
8+
to close HTML elements by default. Foreign content (SVG, MathML), and content parsed with the XML parser, still
9+
support self-closing tags. If you need specific HTML tags to support self-closing, you can register a custom tag via
10+
the `TagSet` configured in `Parser.tagSet()`, using `Tag#set(Tag.SelfClose)`. Void/empty tags (like `<img>`) are
11+
unaffected by this change. [#2300](https://github.com/jhy/jsoup/issues/2300).
12+
713
### Improvements
814

15+
* Added the ability to define custom tags, and to modify properties of known tags, via the `TagSet` tag collection.
16+
Their properties can impact both the parse and how content is
17+
serialized. [#2285](https://github.com/jhy/jsoup/issues/2285).
918
* `Element.cssSelector()` will prefer to return shorter selectors by using ancestor IDs when available and unique. E.g.
1019
`#id > div > p` instead of `html > body > div > div > p` [#2283](https://github.com/jhy/jsoup/pull/2283).
1120
* Added `Elements.deselect(int index)`, `Elements.deselect(Object o)`, and `Elements.deselectAll()` methods to remove

src/main/java/org/jsoup/nodes/Element.java

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1812,25 +1812,29 @@ public Range endSourceRange() {
18121812

18131813
@Override
18141814
void outerHtmlHead(final Appendable accum, Document.OutputSettings out) throws IOException {
1815-
accum.append('<').append(safeTagName(out.syntax()));
1815+
String tagName = safeTagName(out.syntax());
1816+
accum.append('<').append(tagName);
18161817
if (attributes != null) attributes.html(accum, out);
18171818

1818-
// selfclosing includes unknown tags, isEmpty defines tags that are always empty
1819-
if (childNodes.isEmpty() && tag.isSelfClosing()) {
1820-
if (out.syntax() == html && tag.isEmpty())
1819+
if (childNodes.isEmpty()) {
1820+
boolean xmlMode = out.syntax() == xml || !tag.namespace().equals(NamespaceHtml);
1821+
if (xmlMode && (tag.is(Tag.SeenSelfClose) || (tag.isKnownTag() && (tag.isEmpty() || tag.isSelfClosing())))) {
1822+
accum.append(" />");
1823+
} else if (!xmlMode && tag.isEmpty()) { // html void element
18211824
accum.append('>');
1822-
else
1823-
accum.append(" />"); // <img> in html, <img /> in xml
1824-
}
1825-
else
1825+
} else {
1826+
accum.append("></").append(tagName).append('>');
1827+
}
1828+
} else {
18261829
accum.append('>');
1830+
}
18271831
}
18281832

18291833
@Override
18301834
void outerHtmlTail(Appendable accum, Document.OutputSettings out) throws IOException {
1831-
if (!(childNodes.isEmpty() && tag.isSelfClosing())) {
1835+
if (!childNodes.isEmpty())
18321836
accum.append("</").append(safeTagName(out.syntax())).append('>');
1833-
}
1837+
// if empty, we have already closed in htmlHead
18341838
}
18351839

18361840
/* If XML syntax, normalizes < to _ in tag name. */

src/main/java/org/jsoup/parser/HtmlTreeBuilder.java

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -328,21 +328,18 @@ Element insertElementFor(final Token.StartTag startTag) {
328328
Element el = createElementFor(startTag, NamespaceHtml, false);
329329
doInsertElement(el);
330330

331-
// handle self-closing tags. when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag.
331+
// handle self-closing tags. when the spec expects an empty (void) tag, will directly hit insertEmpty, so won't generate this fake end tag.
332332
if (startTag.isSelfClosing()) {
333333
Tag tag = el.tag();
334-
if (tag.isKnownTag()) {
335-
if (!tag.isEmpty())
336-
tokeniser.error("Tag [%s] cannot be self closing; not a void tag", tag.normalName());
337-
// else: ok
334+
tag.setSeenSelfClose(); // can infer output if in xml syntax
335+
if (tag.isKnownTag() && (tag.isEmpty() || tag.isSelfClosing())) {
336+
// ok, allow it. effectively a pop, but fiddles with the state. handles empty style, title etc which would otherwise leave us in data state
337+
tokeniser.transition(TokeniserState.Data); // handles <script />, otherwise needs breakout steps from script data
338+
tokeniser.emit(emptyEnd.reset().name(el.tagName())); // ensure we get out of whatever state we are in. emitted for yielded processing
339+
} else {
340+
// error it, and leave the inserted element on
341+
tokeniser.error("Tag [%s] cannot be self-closing; not a void tag", tag.normalName());
338342
}
339-
else { // unknown tag: remember this is self-closing, for output
340-
tag.setSelfClosing();
341-
}
342-
343-
// effectively a pop, but fiddles with the state. handles empty style, title etc which would otherwise leave us in data state
344-
tokeniser.transition(TokeniserState.Data); // handles <script />, otherwise needs breakout steps from script data
345-
tokeniser.emit(emptyEnd.reset().name(el.tagName())); // ensure we get out of whatever state we are in. emitted for yielded processing
346343
}
347344

348345
return el;
@@ -355,8 +352,8 @@ Element insertForeignElementFor(final Token.StartTag startTag, String namespace)
355352
Element el = createElementFor(startTag, namespace, true);
356353
doInsertElement(el);
357354

358-
if (startTag.isSelfClosing()) {
359-
el.tag().setSelfClosing(); // remember this is self-closing for output
355+
if (startTag.isSelfClosing()) { // foreign els are OK to self-close
356+
el.tag().setSeenSelfClose(); // remember this is self-closing for output
360357
pop();
361358
}
362359

src/main/java/org/jsoup/parser/Tag.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -324,9 +324,8 @@ public boolean isFormSubmittable() {
324324
return (options &= FormSubmittable) != 0;
325325
}
326326

327-
Tag setSelfClosing() {
328-
options |= Tag.SelfClose; // does not change known status. // todo will refactor how self-closing is handled in TreeBuilder
329-
return this;
327+
void setSeenSelfClose() {
328+
options |= Tag.SeenSelfClose; // does not change known status
330329
}
331330

332331
/**

src/main/java/org/jsoup/parser/TreeBuilder.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ void initialiseParse(Reader input, String baseUri, Parser parser) {
5252
reader = new CharacterReader(input);
5353
trackSourceRange = parser.isTrackPosition();
5454
reader.trackNewlines(parser.isTrackErrors() || trackSourceRange); // when tracking errors or source ranges, enable newline tracking for better legibility
55+
if (parser.isTrackErrors()) parser.getErrors().clear();
5556
tokeniser = new Tokeniser(this);
5657
stack = new ArrayList<>(32);
5758
tagSet = parser.tagSet();

src/main/java/org/jsoup/parser/XmlTreeBuilder.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ void insertElementFor(Token.StartTag startTag) {
157157
push(el);
158158

159159
if (startTag.isSelfClosing()) {
160-
tag.setSelfClosing();
160+
tag.setSeenSelfClose();
161161
pop(); // push & pop ensures onNodeInserted & onNodeClosed
162162
} else {
163163
TokeniserState textState = tag.textState();

src/test/java/org/jsoup/nodes/DocumentTest.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import org.jsoup.nodes.Document.OutputSettings.Syntax;
88
import org.jsoup.parser.ParseSettings;
99
import org.jsoup.parser.Parser;
10+
import org.jsoup.parser.Tag;
1011
import org.jsoup.select.Elements;
1112
import org.junit.jupiter.api.Disabled;
1213
import org.junit.jupiter.api.Test;
@@ -158,14 +159,16 @@ public class DocumentTest {
158159

159160
@Test public void testHtmlAndXmlSyntax() {
160161
String h = "<!DOCTYPE html><body><img async checked='checked' src='&<>\"'>&lt;&gt;&amp;&quot;<foo />bar";
161-
Document doc = Jsoup.parse(h);
162+
Parser parser = Parser.htmlParser();
163+
parser.tagSet().valueOf("foo", Parser.NamespaceHtml).set(Tag.SelfClose); // customize foo to allow self close
164+
Document doc = Jsoup.parse(h, parser);
162165

163166
doc.outputSettings().syntax(Syntax.html);
164167
assertEquals("<!doctype html>\n" +
165168
"<html>\n" +
166169
" <head></head>\n" +
167170
" <body>\n" +
168-
" <img async checked src=\"&amp;<>&quot;\">&lt;&gt;&amp;\"<foo />bar\n" +
171+
" <img async checked src=\"&amp;<>&quot;\">&lt;&gt;&amp;\"<foo></foo>bar\n" + // html won't include self-closing
169172
" </body>\n" +
170173
"</html>", doc.html());
171174

@@ -174,7 +177,7 @@ public class DocumentTest {
174177
"<html>\n" +
175178
" <head></head>\n" +
176179
" <body>\n" +
177-
" <img async=\"\" checked=\"checked\" src=\"&amp;&lt;>&quot;\" />&lt;&gt;&amp;\"<foo />bar\n" +
180+
" <img async=\"\" checked=\"checked\" src=\"&amp;&lt;>&quot;\" />&lt;&gt;&amp;\"<foo />bar\n" + // xml will
178181
" </body>\n" +
179182
"</html>", doc.html());
180183
}

src/test/java/org/jsoup/nodes/ElementTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2555,9 +2555,9 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) {
25552555
Document doc = Jsoup.parse(html);
25562556
assertEquals("<bar>\n <p></p>\n</bar>", doc.body().html());
25572557

2558-
html = "<foo>\n <bar />\n</foo>";
2558+
html = "<foo>\n <bar /></foo>";
25592559
doc = Jsoup.parse(html);
2560-
assertEquals("<foo>\n <bar />\n</foo>", doc.body().html());
2560+
assertEquals("<foo>\n <bar></bar>\n</foo>", doc.body().html());
25612561
}
25622562

25632563
@Test void spanInBlockTrims() {

0 commit comments

Comments
 (0)