Skip to content

Commit 920e89e

Browse files
committed
Directly parse XML declarations in XmlTreeBuilder
Vs the old method of bouncing through the HTML parser's bogus comments. This simplifies the parse flow for declarations and can better handle dodgy inputs.
1 parent b875b92 commit 920e89e

File tree

8 files changed

+145
-15
lines changed

8 files changed

+145
-15
lines changed

CHANGES.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@
3838
longer in the spec.
3939
* Added `Elements.selectFirst(String cssQuery)` and `Elements.expectFirst(String cssQuery)`, to select the first
4040
matching element from an `Elements` list. [2263](https://github.com/jhy/jsoup/pull/2263/)
41+
* When parsing with the XML parser, XML Declarations and Processing Instructions are directly handled, vs bouncing
42+
through the HTML parser's bogus comment handler. Serialization for non-doctype declarations no longer end with a
43+
spurious `!`.
4144

4245
### Bug Fixes
4346

src/main/java/org/jsoup/nodes/Comment.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,11 @@ public Comment clone() {
5555
}
5656

5757
/**
58-
* Check if this comment looks like an XML Declaration.
58+
* Check if this comment looks like an XML Declaration. This is the case when the HTML parser sees an XML
59+
* declaration or processing instruction. Other than doctypes, those aren't part of HTML, and will be parsed as a
60+
* bogus comment.
5961
* @return true if it looks like, maybe, it's an XML Declaration.
62+
* @see #asXmlDeclaration()
6063
*/
6164
public boolean isXmlDeclaration() {
6265
String data = getData();
@@ -70,6 +73,7 @@ private static boolean isXmlDeclarationData(String data) {
7073
/**
7174
* Attempt to cast this comment to an XML Declaration node.
7275
* @return an XML declaration if it could be parsed as one, null otherwise.
76+
* @see #isXmlDeclaration()
7377
*/
7478
public @Nullable XmlDeclaration asXmlDeclaration() {
7579
String data = getData();

src/main/java/org/jsoup/nodes/XmlDeclaration.java

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,24 @@
66
import java.io.IOException;
77

88
/**
9-
* An XML Declaration.
9+
* An XML Declaration. Includes support for treating the declaration contents as pseudo attributes.
1010
*/
1111
public class XmlDeclaration extends LeafNode {
12-
// todo this impl isn't really right, the data shouldn't be attributes, just a run of text after the name
13-
private final boolean isProcessingInstruction; // <! if true, <? if false, declaration (and last data char should be ?)
12+
13+
/**
14+
First char is `!` if isDeclaration, like in {@code <!ENTITY ...>}.
15+
Otherwise, is `?`, a processing instruction, like {@code <?xml .... ?>} (and note trailing `?`).
16+
*/
17+
private final boolean isDeclaration;
1418

1519
/**
1620
* Create a new XML declaration
1721
* @param name of declaration
18-
* @param isProcessingInstruction is processing instruction
22+
* @param isDeclaration {@code true} if a declaration (first char is `!`), otherwise a processing instruction (first char is `?`).
1923
*/
20-
public XmlDeclaration(String name, boolean isProcessingInstruction) {
24+
public XmlDeclaration(String name, boolean isDeclaration) {
2125
super(name);
22-
this.isProcessingInstruction = isProcessingInstruction;
26+
this.isDeclaration = isDeclaration;
2327
}
2428

2529
@Override public String nodeName() {
@@ -69,11 +73,11 @@ private void getWholeDeclaration(Appendable accum, Document.OutputSettings out)
6973
void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
7074
accum
7175
.append("<")
72-
.append(isProcessingInstruction ? "!" : "?")
76+
.append(isDeclaration ? "!" : "?")
7377
.append(coreValue());
7478
getWholeDeclaration(accum, out);
7579
accum
76-
.append(isProcessingInstruction ? "!" : "?")
80+
.append(isDeclaration ? "" : "?")
7781
.append(">");
7882
}
7983

src/main/java/org/jsoup/parser/Token.java

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,34 @@ public String toString() {
481481

482482
}
483483

484+
/**
485+
XmlDeclaration - extends Tag for pseudo attribute support
486+
*/
487+
final static class XmlDecl extends Tag {
488+
boolean isDeclaration = true; // <!..>, or <?...?> if false (a processing instruction)
489+
490+
public XmlDecl(TreeBuilder treeBuilder) {
491+
super(TokenType.XmlDecl, treeBuilder);
492+
}
493+
494+
@Override
495+
XmlDecl reset() {
496+
super.reset();
497+
isDeclaration = true;
498+
return this;
499+
}
500+
501+
@Override
502+
public String toString() {
503+
String open = isDeclaration ? "<!" : "<?";
504+
String close = isDeclaration ? ">" : "?>";
505+
if (hasAttributes() && attributes.size() > 0)
506+
return open + toStringName() + " " + attributes.toString() + close;
507+
else
508+
return open + toStringName() + close;
509+
}
510+
}
511+
484512
final static class EOF extends Token {
485513
EOF() {
486514
super(Token.TokenType.EOF);
@@ -542,6 +570,10 @@ final Character asCharacter() {
542570
return (Character) this;
543571
}
544572

573+
final XmlDecl asXmlDecl() {
574+
return (XmlDecl) this;
575+
}
576+
545577
final boolean isEOF() {
546578
return type == TokenType.EOF;
547579
}
@@ -552,6 +584,7 @@ public enum TokenType {
552584
EndTag,
553585
Comment,
554586
Character, // note no CData - treated in builder as an extension of Character
587+
XmlDecl,
555588
EOF
556589
}
557590
}

src/main/java/org/jsoup/parser/Tokeniser.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import org.jsoup.helper.Validate;
44
import org.jsoup.internal.StringUtil;
5+
import org.jsoup.nodes.Document;
56
import org.jsoup.nodes.Entities;
67
import org.jspecify.annotations.Nullable;
78

@@ -40,20 +41,24 @@ final class Tokeniser {
4041
private final StringBuilder charsBuilder = new StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read
4142
final StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for </script>
4243

44+
final Document.OutputSettings.Syntax syntax; // html or xml syntax; affects processing of xml declarations vs as bogus comments
4345
final Token.StartTag startPending;
4446
final Token.EndTag endPending;
4547
Token.Tag tagPending; // tag we are building up: start or end pending
4648
final Token.Character charPending = new Token.Character();
4749
final Token.Doctype doctypePending = new Token.Doctype(); // doctype building up
4850
final Token.Comment commentPending = new Token.Comment(); // comment building up
51+
final Token.XmlDecl xmlDeclPending; // xml decl building up
4952
@Nullable private String lastStartTag; // the last start tag emitted, to test appropriate end tag
5053
@Nullable private String lastStartCloseSeq; // "</" + lastStartTag, so we can quickly check for that in RCData
5154

5255
private int markupStartPos, charStartPos = 0; // reader pos at the start of markup / characters. markup updated on state transition, char on token emit.
5356

5457
Tokeniser(TreeBuilder treeBuilder) {
58+
syntax = treeBuilder instanceof XmlTreeBuilder ? Document.OutputSettings.Syntax.xml : Document.OutputSettings.Syntax.html;
5559
tagPending = startPending = new Token.StartTag(treeBuilder);
5660
endPending = new Token.EndTag(treeBuilder);
61+
xmlDeclPending = new Token.XmlDecl(treeBuilder);
5762
this.reader = treeBuilder.reader;
5863
this.errors = treeBuilder.parser.getErrors();
5964
}
@@ -262,6 +267,13 @@ Token.Tag createTagPending(boolean start) {
262267
return tagPending;
263268
}
264269

270+
Token.XmlDecl createXmlDeclPending(boolean isDeclaration) {
271+
Token.XmlDecl decl = xmlDeclPending.reset();
272+
decl.isDeclaration = isDeclaration;
273+
tagPending = decl;
274+
return decl;
275+
}
276+
265277
void emitTagPending() {
266278
tagPending.finaliseTag();
267279
emit(tagPending);

src/main/java/org/jsoup/parser/TokeniserState.java

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import org.jsoup.nodes.DocumentType;
44

5+
import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml;
6+
57
/**
68
* States and transition activations for the Tokeniser.
79
*/
@@ -105,8 +107,12 @@ enum TokeniserState {
105107
t.advanceTransition(EndTagOpen);
106108
break;
107109
case '?':
108-
t.createBogusCommentPending();
109-
t.transition(BogusComment);
110+
if (t.syntax == xml) {
111+
t.advanceTransition(MarkupProcessingOpen);
112+
} else {
113+
t.createBogusCommentPending();
114+
t.transition(BogusComment);
115+
}
110116
break;
111117
default:
112118
if (r.matchesAsciiAlpha()) {
@@ -590,6 +596,10 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
590596
t.tagPending.appendAttributeName(c, r.pos()-1, r.pos());
591597
t.transition(AttributeName);
592598
break;
599+
case '?': // Handle trailing ? in <?xml...?>
600+
if (t.tagPending instanceof Token.XmlDecl)
601+
break;
602+
// otherwise fall through to default
593603
default: // A-Z, anything else
594604
t.tagPending.newAttribute();
595605
r.unconsume();
@@ -634,6 +644,11 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
634644
t.error(this);
635645
t.tagPending.appendAttributeName(c, pos, r.pos());
636646
break;
647+
case '?':
648+
if (t.syntax == xml && t.tagPending instanceof Token.XmlDecl) {
649+
t.transition(AfterAttributeName);
650+
break;
651+
} // otherwise default - take it
637652
default: // buffer underrun
638653
t.tagPending.appendAttributeName(c, pos, r.pos());
639654
}
@@ -917,7 +932,7 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
917932
}
918933
}
919934
},
920-
MarkupDeclarationOpen {
935+
MarkupDeclarationOpen { // from <!
921936
@Override void read(Tokeniser t, CharacterReader r) {
922937
if (r.matchConsume("--")) {
923938
t.createCommentPending();
@@ -930,9 +945,27 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
930945
//} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) {
931946
t.createTempBuffer();
932947
t.transition(CdataSection);
948+
} else {
949+
if (t.syntax == xml && r.matchesAsciiAlpha()) {
950+
t.createXmlDeclPending(true);
951+
t.transition(TagName); // treat <!ENTITY as XML Declaration, with tag-like handling
952+
} else {
953+
t.error(this);
954+
t.createBogusCommentPending();
955+
t.transition(BogusComment);
956+
}
957+
}
958+
}
959+
},
960+
MarkupProcessingOpen { // From <? in syntax XML
961+
@Override void read(Tokeniser t, CharacterReader r) {
962+
if (r.matchesAsciiAlpha()) {
963+
t.createXmlDeclPending(false);
964+
t.transition(TagName); // treat <?xml... as XML Declaration (processing instruction), with tag-like handling
933965
} else {
934966
t.error(this);
935967
t.createBogusCommentPending();
968+
t.commentPending.append('?'); // push the ? to the start of the comment
936969
t.transition(BogusComment);
937970
}
938971
}
@@ -1623,7 +1656,7 @@ else if (r.matches('>')) {
16231656

16241657
static final char nullChar = '\u0000';
16251658
// char searches. must be sorted, used in inSorted. MUST update TokenisetStateTest if more arrays are added.
1626-
static final char[] attributeNameCharsSorted = new char[]{'\t', '\n', '\f', '\r', ' ', '"', '\'', '/', '<', '=', '>'};
1659+
static final char[] attributeNameCharsSorted = new char[]{'\t', '\n', '\f', '\r', ' ', '"', '\'', '/', '<', '=', '>', '?'};
16271660
static final char[] attributeValueUnquoted = new char[]{nullChar, '\t', '\n', '\f', '\r', ' ', '"', '&', '\'', '<', '=', '>', '`'};
16281661

16291662
private static final char replacementChar = Tokeniser.replacementChar;

src/main/java/org/jsoup/parser/XmlTreeBuilder.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ XmlTreeBuilder newInstance() {
6464
protected boolean process(Token token) {
6565
currentToken = token;
6666

67-
// start tag, end tag, doctype, comment, character, eof
67+
// start tag, end tag, doctype, xmldecl, comment, character, eof
6868
switch (token.type) {
6969
case StartTag:
7070
insertElementFor(token.asStartTag());
@@ -81,6 +81,9 @@ protected boolean process(Token token) {
8181
case Doctype:
8282
insertDoctypeFor(token.asDoctype());
8383
break;
84+
case XmlDecl:
85+
insertXmlDeclarationFor(token.asXmlDecl());
86+
break;
8487
case EOF: // could put some normalisation here if desired
8588
break;
8689
default:
@@ -134,6 +137,12 @@ void insertDoctypeFor(Token.Doctype token) {
134137
insertLeafNode(doctypeNode);
135138
}
136139

140+
void insertXmlDeclarationFor(Token.XmlDecl token) {
141+
XmlDeclaration decl = new XmlDeclaration(token.name(), token.isDeclaration);
142+
if (token.attributes != null) decl.attributes().addAll(token.attributes);
143+
insertLeafNode(decl);
144+
}
145+
137146
/**
138147
* If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not
139148
* found, skips.

src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import org.jsoup.Jsoup;
44
import org.jsoup.TextUtil;
5-
import org.jsoup.internal.StringUtil;
65
import org.jsoup.nodes.*;
76
import org.jsoup.select.Elements;
87
import org.junit.jupiter.api.Disabled;
@@ -353,4 +352,37 @@ private static void assertXmlNamespace(Element el) {
353352
assertEquals(Parser.NamespaceXml, el.tag().namespace(), String.format("Element %s not in XML namespace", el.tagName()));
354353
}
355354

355+
@Test void declarations() {
356+
String xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><!DOCTYPE html\n" +
357+
" PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n" +
358+
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">" +
359+
"<!ELEMENT footnote (#PCDATA|a)*>";
360+
Document doc = Jsoup.parse(xml, Parser.xmlParser());
361+
362+
XmlDeclaration proc = (XmlDeclaration) doc.childNode(0);
363+
DocumentType doctype = (DocumentType) doc.childNode(1);
364+
XmlDeclaration decl = (XmlDeclaration) doc.childNode(2);
365+
366+
assertEquals("xml", proc.name());
367+
assertEquals("1.0", proc.attr("version"));
368+
assertEquals("utf-8", proc.attr("encoding"));
369+
assertEquals("version=\"1.0\" encoding=\"utf-8\"", proc.getWholeDeclaration());
370+
assertEquals("<?xml version=\"1.0\" encoding=\"utf-8\"?>", proc.outerHtml());
371+
372+
assertEquals("html", doctype.name());
373+
assertEquals("-//W3C//DTD XHTML 1.0 Transitional//EN", doctype.attr("publicId"));
374+
assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd", doctype.attr("systemId"));
375+
assertEquals("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">", doctype.outerHtml());
376+
377+
assertEquals("ELEMENT", decl.name());
378+
assertEquals("footnote (#PCDATA|a)*", decl.getWholeDeclaration());
379+
assertTrue(decl.hasAttr("footNote"));
380+
assertFalse(decl.hasAttr("ELEMENT"));
381+
assertEquals("<!ELEMENT footnote (#PCDATA|a)*>", decl.outerHtml());
382+
383+
assertEquals("<?xml version=\"1.0\" encoding=\"utf-8\"?>" +
384+
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">" +
385+
"<!ELEMENT footnote (#PCDATA|a)*>", doc.outerHtml());
386+
}
387+
356388
}

0 commit comments

Comments
 (0)