Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@
longer in the spec.
* Added `Elements.selectFirst(String cssQuery)` and `Elements.expectFirst(String cssQuery)`, to select the first
matching element from an `Elements` list. [2263](https://github.com/jhy/jsoup/pull/2263/)
* When parsing with the XML parser, XML Declarations and Processing Instructions are now handled directly, rather than
bouncing through the HTML parser's bogus comment handler. Serialization of non-doctype declarations no longer ends
with a spurious `!`. [2275](https://github.com/jhy/jsoup/pull/2275)

### Bug Fixes

Expand Down
6 changes: 5 additions & 1 deletion src/main/java/org/jsoup/nodes/Comment.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,11 @@ public Comment clone() {
}

/**
* Check if this comment looks like an XML Declaration.
* Check if this comment looks like an XML Declaration. This is the case when the HTML parser sees an XML
* declaration or processing instruction. Other than doctypes, those aren't part of HTML, and will be parsed as a
* bogus comment.
* @return true if it looks like, maybe, it's an XML Declaration.
* @see #asXmlDeclaration()
*/
public boolean isXmlDeclaration() {
String data = getData();
Expand All @@ -70,6 +73,7 @@ private static boolean isXmlDeclarationData(String data) {
/**
* Attempt to cast this comment to an XML Declaration node.
* @return an XML declaration if it could be parsed as one, null otherwise.
* @see #isXmlDeclaration()
*/
public @Nullable XmlDeclaration asXmlDeclaration() {
String data = getData();
Expand Down
20 changes: 12 additions & 8 deletions src/main/java/org/jsoup/nodes/XmlDeclaration.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,24 @@
import java.io.IOException;

/**
* An XML Declaration.
* An XML Declaration. Includes support for treating the declaration contents as pseudo attributes.
*/
public class XmlDeclaration extends LeafNode {
// todo this impl isn't really right, the data shouldn't be attributes, just a run of text after the name
private final boolean isProcessingInstruction; // <! if true, <? if false, declaration (and last data char should be ?)

/**
First char is `!` if isDeclaration, like in {@code <!ENTITY ...>}.
Otherwise, is `?`, a processing instruction, like {@code <?xml .... ?>} (and note trailing `?`).
*/
private final boolean isDeclaration;

/**
* Create a new XML declaration
* @param name of declaration
* @param isProcessingInstruction is processing instruction
* @param isDeclaration {@code true} if a declaration (first char is `!`), otherwise a processing instruction (first char is `?`).
*/
public XmlDeclaration(String name, boolean isProcessingInstruction) {
public XmlDeclaration(String name, boolean isDeclaration) {
super(name);
this.isProcessingInstruction = isProcessingInstruction;
this.isDeclaration = isDeclaration;
}

@Override public String nodeName() {
Expand Down Expand Up @@ -69,11 +73,11 @@ private void getWholeDeclaration(Appendable accum, Document.OutputSettings out)
void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
accum
.append("<")
.append(isProcessingInstruction ? "!" : "?")
.append(isDeclaration ? "!" : "?")
.append(coreValue());
getWholeDeclaration(accum, out);
accum
.append(isProcessingInstruction ? "!" : "?")
.append(isDeclaration ? "" : "?")
.append(">");
}

Expand Down
33 changes: 33 additions & 0 deletions src/main/java/org/jsoup/parser/Token.java
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,34 @@ public String toString() {

}

/**
 Token for an XML declaration ({@code <!NAME ...>}) or a processing instruction ({@code <?name ... ?>}). Extends Tag
 so that the contents can be collected as pseudo attributes.
 */
final static class XmlDecl extends Tag {
    /** {@code true} for the {@code <!...>} form; {@code false} for the {@code <?...?>} processing instruction form. */
    boolean isDeclaration = true;

    public XmlDecl(TreeBuilder treeBuilder) {
        super(TokenType.XmlDecl, treeBuilder);
    }

    /** Delegates to {@code super.reset()}, then restores the default ({@code <!...>}) declaration form. */
    @Override
    XmlDecl reset() {
        super.reset();
        isDeclaration = true;
        return this;
    }

    /** Renders the token with the delimiters matching its form, including the attributes when there are any. */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder(isDeclaration ? "<!" : "<?");
        sb.append(toStringName());
        if (hasAttributes() && attributes.size() > 0)
            sb.append(" ").append(attributes.toString());
        return sb.append(isDeclaration ? ">" : "?>").toString();
    }
}

final static class EOF extends Token {
EOF() {
super(Token.TokenType.EOF);
Expand Down Expand Up @@ -542,6 +570,10 @@ final Character asCharacter() {
return (Character) this;
}

/**
 Casts this token to an XmlDecl. The cast is unchecked by this method, so it throws a ClassCastException if this token
 is not an XmlDecl instance.
 */
final XmlDecl asXmlDecl() {
return (XmlDecl) this;
}

final boolean isEOF() {
return type == TokenType.EOF;
}
Expand All @@ -552,6 +584,7 @@ public enum TokenType {
EndTag,
Comment,
Character, // note no CData - treated in builder as an extension of Character
XmlDecl,
EOF
}
}
12 changes: 12 additions & 0 deletions src/main/java/org/jsoup/parser/Tokeniser.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.jsoup.helper.Validate;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Entities;
import org.jspecify.annotations.Nullable;

Expand Down Expand Up @@ -40,20 +41,24 @@ final class Tokeniser {
private final StringBuilder charsBuilder = new StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read
final StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for </script>

final Document.OutputSettings.Syntax syntax; // html or xml syntax; affects processing of xml declarations vs as bogus comments
final Token.StartTag startPending;
final Token.EndTag endPending;
Token.Tag tagPending; // tag we are building up: start or end pending
final Token.Character charPending = new Token.Character();
final Token.Doctype doctypePending = new Token.Doctype(); // doctype building up
final Token.Comment commentPending = new Token.Comment(); // comment building up
final Token.XmlDecl xmlDeclPending; // xml decl building up
@Nullable private String lastStartTag; // the last start tag emitted, to test appropriate end tag
@Nullable private String lastStartCloseSeq; // "</" + lastStartTag, so we can quickly check for that in RCData

private int markupStartPos, charStartPos = 0; // reader pos at the start of markup / characters. markup updated on state transition, char on token emit.

/**
 Creates a tokeniser that reads from the tree builder's reader and reports to its parser's error list. The syntax is
 xml when the builder is an XmlTreeBuilder, and html otherwise.
 */
Tokeniser(TreeBuilder treeBuilder) {
    if (treeBuilder instanceof XmlTreeBuilder)
        syntax = Document.OutputSettings.Syntax.xml;
    else
        syntax = Document.OutputSettings.Syntax.html;

    startPending = new Token.StartTag(treeBuilder);
    tagPending = startPending; // the pending tag initially aliases the reusable start tag
    endPending = new Token.EndTag(treeBuilder);
    xmlDeclPending = new Token.XmlDecl(treeBuilder);
    reader = treeBuilder.reader;
    errors = treeBuilder.parser.getErrors();
}
Expand Down Expand Up @@ -262,6 +267,13 @@ Token.Tag createTagPending(boolean start) {
return tagPending;
}

/**
 Resets the reusable XML declaration token, sets its form, and makes it the pending tag.
 @param isDeclaration {@code true} for a {@code <!...>} declaration, {@code false} for a {@code <?...?>} processing instruction
 @return the pending XML declaration token
 */
Token.XmlDecl createXmlDeclPending(boolean isDeclaration) {
    xmlDeclPending.reset();
    xmlDeclPending.isDeclaration = isDeclaration;
    tagPending = xmlDeclPending;
    return xmlDeclPending;
}

void emitTagPending() {
tagPending.finaliseTag();
emit(tagPending);
Expand Down
41 changes: 37 additions & 4 deletions src/main/java/org/jsoup/parser/TokeniserState.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import org.jsoup.nodes.DocumentType;

import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml;

/**
* States and transition activations for the Tokeniser.
*/
Expand Down Expand Up @@ -105,8 +107,12 @@ enum TokeniserState {
t.advanceTransition(EndTagOpen);
break;
case '?':
t.createBogusCommentPending();
t.transition(BogusComment);
if (t.syntax == xml) {
t.advanceTransition(MarkupProcessingOpen);
} else {
t.createBogusCommentPending();
t.transition(BogusComment);
}
break;
default:
if (r.matchesAsciiAlpha()) {
Expand Down Expand Up @@ -590,6 +596,10 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
t.tagPending.appendAttributeName(c, r.pos()-1, r.pos());
t.transition(AttributeName);
break;
case '?': // Handle trailing ? in <?xml...?>
if (t.tagPending instanceof Token.XmlDecl)
break;
// otherwise fall through to default
default: // A-Z, anything else
t.tagPending.newAttribute();
r.unconsume();
Expand Down Expand Up @@ -634,6 +644,11 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
t.error(this);
t.tagPending.appendAttributeName(c, pos, r.pos());
break;
case '?':
if (t.syntax == xml && t.tagPending instanceof Token.XmlDecl) {
t.transition(AfterAttributeName);
break;
} // otherwise default - take it
default: // buffer underrun
t.tagPending.appendAttributeName(c, pos, r.pos());
}
Expand Down Expand Up @@ -917,7 +932,7 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
}
}
},
MarkupDeclarationOpen {
MarkupDeclarationOpen { // from <!
@Override void read(Tokeniser t, CharacterReader r) {
if (r.matchConsume("--")) {
t.createCommentPending();
Expand All @@ -930,9 +945,27 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
//} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) {
t.createTempBuffer();
t.transition(CdataSection);
} else {
if (t.syntax == xml && r.matchesAsciiAlpha()) {
t.createXmlDeclPending(true);
t.transition(TagName); // treat <!ENTITY as XML Declaration, with tag-like handling
} else {
t.error(this);
t.createBogusCommentPending();
t.transition(BogusComment);
}
}
}
},
MarkupProcessingOpen { // From <? in syntax XML
@Override void read(Tokeniser t, CharacterReader r) {
    if (!r.matchesAsciiAlpha()) {
        // no name follows the <? - report it, and handle the content as a bogus comment instead
        t.error(this);
        t.createBogusCommentPending();
        t.commentPending.append('?'); // push the ? to the start of the comment
        t.transition(BogusComment);
        return;
    }

    // treat <?xml... as XML Declaration (processing instruction), with tag-like handling
    t.createXmlDeclPending(false);
    t.transition(TagName);
}
Expand Down Expand Up @@ -1623,7 +1656,7 @@ else if (r.matches('>')) {

static final char nullChar = '\u0000';
// char searches. must be sorted, used in inSorted. MUST update TokeniserStateTest if more arrays are added.
static final char[] attributeNameCharsSorted = new char[]{'\t', '\n', '\f', '\r', ' ', '"', '\'', '/', '<', '=', '>'};
static final char[] attributeNameCharsSorted = new char[]{'\t', '\n', '\f', '\r', ' ', '"', '\'', '/', '<', '=', '>', '?'};
static final char[] attributeValueUnquoted = new char[]{nullChar, '\t', '\n', '\f', '\r', ' ', '"', '&', '\'', '<', '=', '>', '`'};

private static final char replacementChar = Tokeniser.replacementChar;
Expand Down
11 changes: 10 additions & 1 deletion src/main/java/org/jsoup/parser/XmlTreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ XmlTreeBuilder newInstance() {
protected boolean process(Token token) {
currentToken = token;

// start tag, end tag, doctype, comment, character, eof
// start tag, end tag, doctype, xmldecl, comment, character, eof
switch (token.type) {
case StartTag:
insertElementFor(token.asStartTag());
Expand All @@ -81,6 +81,9 @@ protected boolean process(Token token) {
case Doctype:
insertDoctypeFor(token.asDoctype());
break;
case XmlDecl:
insertXmlDeclarationFor(token.asXmlDecl());
break;
case EOF: // could put some normalisation here if desired
break;
default:
Expand Down Expand Up @@ -134,6 +137,12 @@ void insertDoctypeFor(Token.Doctype token) {
insertLeafNode(doctypeNode);
}

/**
 Builds an XmlDeclaration node from the token's name and declaration flag, copies over the token's attributes when it
 has any, and inserts the node as a leaf.
 */
void insertXmlDeclarationFor(Token.XmlDecl token) {
    final XmlDeclaration node = new XmlDeclaration(token.name(), token.isDeclaration);
    if (token.attributes != null) {
        node.attributes().addAll(token.attributes);
    }
    insertLeafNode(node);
}

/**
* If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not
* found, skips.
Expand Down
34 changes: 33 additions & 1 deletion src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import org.jsoup.Jsoup;
import org.jsoup.TextUtil;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Disabled;
Expand Down Expand Up @@ -353,4 +352,37 @@ private static void assertXmlNamespace(Element el) {
assertEquals(Parser.NamespaceXml, el.tag().namespace(), String.format("Element %s not in XML namespace", el.tagName()));
}

// Parses a processing instruction, a doctype, and a <!ELEMENT declaration with the XML parser, and checks the
// resulting node types, names, pseudo attributes, and serialization of each.
@Test void declarations() {
String xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><!DOCTYPE html\n" +
" PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n" +
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">" +
"<!ELEMENT footnote (#PCDATA|a)*>";
Document doc = Jsoup.parse(xml, Parser.xmlParser());

// the casts double as type assertions: a wrong node type fails the test with a ClassCastException
XmlDeclaration proc = (XmlDeclaration) doc.childNode(0);
DocumentType doctype = (DocumentType) doc.childNode(1);
XmlDeclaration decl = (XmlDeclaration) doc.childNode(2);

// processing instruction: pseudo attributes are readable, and output keeps the leading and trailing ?
assertEquals("xml", proc.name());
assertEquals("1.0", proc.attr("version"));
assertEquals("utf-8", proc.attr("encoding"));
assertEquals("version=\"1.0\" encoding=\"utf-8\"", proc.getWholeDeclaration());
assertEquals("<?xml version=\"1.0\" encoding=\"utf-8\"?>", proc.outerHtml());

// doctype: the newlines and indentation of the input are expected to come out as single spaces
assertEquals("html", doctype.name());
assertEquals("-//W3C//DTD XHTML 1.0 Transitional//EN", doctype.attr("publicId"));
assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd", doctype.attr("systemId"));
assertEquals("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">", doctype.outerHtml());

// declaration: output starts with <! and ends with a plain > (no trailing !)
assertEquals("ELEMENT", decl.name());
assertEquals("footnote (#PCDATA|a)*", decl.getWholeDeclaration());
// NOTE(review): queried as "footNote" while the input is "footnote"; presumably hasAttr ignores case - confirm
assertTrue(decl.hasAttr("footNote"));
assertFalse(decl.hasAttr("ELEMENT"));
assertEquals("<!ELEMENT footnote (#PCDATA|a)*>", decl.outerHtml());

// whole document: the three nodes are expected to serialize back to back, with nothing between them
assertEquals("<?xml version=\"1.0\" encoding=\"utf-8\"?>" +
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">" +
"<!ELEMENT footnote (#PCDATA|a)*>", doc.outerHtml());
}

}
Loading