Directly parse XML declarations in XmlTreeBuilder

jhy · jhy · commit 920e89e58faa · 2025-02-25T14:23:47.000+11:00
This replaces the old method of bouncing them through the HTML parser's bogus comment handling.

This simplifies the parse flow for declarations and can better handle dodgy inputs.
diff --git a/CHANGES.md b/CHANGES.md
@@ -38,6 +38,9 @@
   longer in the spec.
 * Added `Elements.selectFirst(String cssQuery)` and `Elements.expectFirst(String cssQuery)`, to select the first
   matching element from an `Elements` list.  [2263](https://github.com/jhy/jsoup/pull/2263/)
+* When parsing with the XML parser, XML Declarations and Processing Instructions are directly handled, vs bouncing
+  through the HTML parser's bogus comment handler. Serialization of non-doctype declarations no longer ends with a
+  spurious `!`.
 
 ### Bug Fixes
 
diff --git a/src/main/java/org/jsoup/nodes/Comment.java b/src/main/java/org/jsoup/nodes/Comment.java
@@ -55,8 +55,11 @@ public Comment clone() {
     }
 
     /**
-     * Check if this comment looks like an XML Declaration.
+     * Check if this comment looks like an XML Declaration. This is the case when the HTML parser sees an XML
+     * declaration or processing instruction. Other than doctypes, those aren't part of HTML, and will be parsed as a
+     * bogus comment.
      * @return true if it looks like, maybe, it's an XML Declaration.
+     * @see #asXmlDeclaration()
      */
     public boolean isXmlDeclaration() {
         String data = getData();
@@ -70,6 +73,7 @@ private static boolean isXmlDeclarationData(String data) {
     /**
      * Attempt to cast this comment to an XML Declaration node.
      * @return an XML declaration if it could be parsed as one, null otherwise.
+     * @see #isXmlDeclaration()
      */
     public @Nullable XmlDeclaration asXmlDeclaration() {
         String data = getData();
diff --git a/src/main/java/org/jsoup/nodes/XmlDeclaration.java b/src/main/java/org/jsoup/nodes/XmlDeclaration.java
@@ -6,20 +6,24 @@
 import java.io.IOException;
 
 /**
- * An XML Declaration.
+ * An XML Declaration. Includes support for treating the declaration contents as pseudo attributes.
  */
 public class XmlDeclaration extends LeafNode {
-    // todo this impl isn't really right, the data shouldn't be attributes, just a run of text after the name
-    private final boolean isProcessingInstruction; // <! if true, <? if false, declaration (and last data char should be ?)
+
+    /**
+     If {@code true}, this is a declaration and the first char is `!`, like in {@code <!ENTITY ...>}.
+     Otherwise, it is a processing instruction and the first char is `?`, like {@code <?xml ... ?>} (note the trailing `?`).
+     */
+    private final boolean isDeclaration;
 
     /**
      * Create a new XML declaration
      * @param name of declaration
-     * @param isProcessingInstruction is processing instruction
+     * @param isDeclaration {@code true} if a declaration (first char is `!`), otherwise a processing instruction (first char is `?`).
      */
-    public XmlDeclaration(String name, boolean isProcessingInstruction) {
+    public XmlDeclaration(String name, boolean isDeclaration) {
         super(name);
-        this.isProcessingInstruction = isProcessingInstruction;
+        this.isDeclaration = isDeclaration;
     }
 
     @Override public String nodeName() {
@@ -69,11 +73,11 @@ private void getWholeDeclaration(Appendable accum, Document.OutputSettings out)
     void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
         accum
             .append("<")
-            .append(isProcessingInstruction ? "!" : "?")
+            .append(isDeclaration ? "!" : "?")
             .append(coreValue());
         getWholeDeclaration(accum, out);
         accum
-            .append(isProcessingInstruction ? "!" : "?")
+            .append(isDeclaration ? "" : "?")
             .append(">");
     }
 
diff --git a/src/main/java/org/jsoup/parser/Token.java b/src/main/java/org/jsoup/parser/Token.java
@@ -481,6 +481,34 @@ public String toString() {
 
     }
 
+    /**
+     An XML declaration ({@code <!...>}) or processing instruction ({@code <?...?>}) token. Extends Tag for pseudo attribute support.
+     */
+    final static class XmlDecl extends Tag {
+        boolean isDeclaration = true; // <!..>, or <?...?> if false (a processing instruction)
+
+        public XmlDecl(TreeBuilder treeBuilder) {
+            super(TokenType.XmlDecl, treeBuilder);
+        }
+
+        @Override
+        XmlDecl reset() {
+            super.reset();
+            isDeclaration = true;
+            return this;
+        }
+
+        @Override
+        public String toString() {
+            String open = isDeclaration ? "<!" : "<?";
+            String close = isDeclaration ? ">" : "?>";
+            if (hasAttributes() && attributes.size() > 0)
+                return open + toStringName() + " " + attributes.toString() + close;
+            else
+                return open + toStringName() + close;
+        }
+    }
+
     final static class EOF extends Token {
         EOF() {
             super(Token.TokenType.EOF);
@@ -542,6 +570,10 @@ final Character asCharacter() {
         return (Character) this;
     }
 
+    final XmlDecl asXmlDecl() {
+        return (XmlDecl) this;
+    }
+
     final boolean isEOF() {
         return type == TokenType.EOF;
     }
@@ -552,6 +584,7 @@ public enum TokenType {
         EndTag,
         Comment,
         Character, // note no CData - treated in builder as an extension of Character
+        XmlDecl,
         EOF
     }
 }
diff --git a/src/main/java/org/jsoup/parser/Tokeniser.java b/src/main/java/org/jsoup/parser/Tokeniser.java
@@ -2,6 +2,7 @@
 
 import org.jsoup.helper.Validate;
 import org.jsoup.internal.StringUtil;
+import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Entities;
 import org.jspecify.annotations.Nullable;
 
@@ -40,20 +41,24 @@ final class Tokeniser {
     private final StringBuilder charsBuilder = new StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read
     final StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for </script>
 
+    final Document.OutputSettings.Syntax syntax; // html or xml syntax; affects processing of xml declarations vs as bogus comments
     final Token.StartTag startPending;
     final Token.EndTag endPending;
     Token.Tag tagPending; // tag we are building up: start or end pending
     final Token.Character charPending = new Token.Character();
     final Token.Doctype doctypePending = new Token.Doctype(); // doctype building up
     final Token.Comment commentPending = new Token.Comment(); // comment building up
+    final Token.XmlDecl xmlDeclPending; // xml decl building up
     @Nullable private String lastStartTag; // the last start tag emitted, to test appropriate end tag
     @Nullable private String lastStartCloseSeq; // "</" + lastStartTag, so we can quickly check for that in RCData
 
     private int markupStartPos, charStartPos = 0; // reader pos at the start of markup / characters. markup updated on state transition, char on token emit.
 
     Tokeniser(TreeBuilder treeBuilder) {
+        syntax = treeBuilder instanceof XmlTreeBuilder ? Document.OutputSettings.Syntax.xml : Document.OutputSettings.Syntax.html;
         tagPending = startPending  = new Token.StartTag(treeBuilder);
         endPending = new Token.EndTag(treeBuilder);
+        xmlDeclPending = new Token.XmlDecl(treeBuilder);
         this.reader = treeBuilder.reader;
         this.errors = treeBuilder.parser.getErrors();
     }
@@ -262,6 +267,13 @@ Token.Tag createTagPending(boolean start) {
         return tagPending;
     }
 
+    Token.XmlDecl createXmlDeclPending(boolean isDeclaration) {
+        Token.XmlDecl decl = xmlDeclPending.reset();
+        decl.isDeclaration = isDeclaration;
+        tagPending = decl;
+        return decl;
+    }
+
     void emitTagPending() {
         tagPending.finaliseTag();
         emit(tagPending);
diff --git a/src/main/java/org/jsoup/parser/TokeniserState.java b/src/main/java/org/jsoup/parser/TokeniserState.java
@@ -2,6 +2,8 @@
 
 import org.jsoup.nodes.DocumentType;
 
+import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml;
+
 /**
  * States and transition activations for the Tokeniser.
  */
@@ -105,8 +107,12 @@ enum TokeniserState {
                     t.advanceTransition(EndTagOpen);
                     break;
                 case '?':
-                    t.createBogusCommentPending();
-                    t.transition(BogusComment);
+                    if (t.syntax == xml) {
+                        t.advanceTransition(MarkupProcessingOpen);
+                    } else {
+                        t.createBogusCommentPending();
+                        t.transition(BogusComment);
+                    }
                     break;
                 default:
                     if (r.matchesAsciiAlpha()) {
@@ -590,6 +596,10 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
                     t.tagPending.appendAttributeName(c, r.pos()-1, r.pos());
                     t.transition(AttributeName);
                     break;
+                case '?': // Handle trailing ? in <?xml...?>
+                    if (t.tagPending instanceof Token.XmlDecl)
+                        break;
+                    // otherwise fall through to default
                 default: // A-Z, anything else
                     t.tagPending.newAttribute();
                     r.unconsume();
@@ -634,6 +644,11 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
                     t.error(this);
                     t.tagPending.appendAttributeName(c, pos, r.pos());
                     break;
+                case '?':
+                    if (t.syntax == xml && t.tagPending instanceof Token.XmlDecl) {
+                        t.transition(AfterAttributeName);
+                        break;
+                    } // otherwise default - take it
                 default: // buffer underrun
                     t.tagPending.appendAttributeName(c, pos, r.pos());
             }
@@ -917,7 +932,7 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
             }
         }
     },
-    MarkupDeclarationOpen {
+    MarkupDeclarationOpen { // from <!
         @Override void read(Tokeniser t, CharacterReader r) {
             if (r.matchConsume("--")) {
                 t.createCommentPending();
@@ -930,9 +945,27 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
                 //} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) {
                 t.createTempBuffer();
                 t.transition(CdataSection);
+            } else {
+                if (t.syntax == xml && r.matchesAsciiAlpha()) {
+                    t.createXmlDeclPending(true);
+                    t.transition(TagName); // treat <!ENTITY as XML Declaration, with tag-like handling
+                } else {
+                    t.error(this);
+                    t.createBogusCommentPending();
+                    t.transition(BogusComment);
+                }
+            }
+        }
+    },
+    MarkupProcessingOpen { // From <? in syntax XML
+        @Override void read(Tokeniser t, CharacterReader r) {
+            if (r.matchesAsciiAlpha()) {
+                t.createXmlDeclPending(false);
+                t.transition(TagName); // treat <?xml... as XML Declaration (processing instruction), with tag-like handling
             } else {
                 t.error(this);
                 t.createBogusCommentPending();
+                t.commentPending.append('?'); // push the ? to the start of the comment
                 t.transition(BogusComment);
             }
         }
@@ -1623,7 +1656,7 @@ else if (r.matches('>')) {
 
     static final char nullChar = '\u0000';
     // char searches. must be sorted, used in inSorted. MUST update TokenisetStateTest if more arrays are added.
-    static final char[] attributeNameCharsSorted = new char[]{'\t', '\n', '\f', '\r', ' ', '"', '\'', '/', '<', '=', '>'};
+    static final char[] attributeNameCharsSorted = new char[]{'\t', '\n', '\f', '\r', ' ', '"', '\'', '/', '<', '=', '>', '?'};
     static final char[] attributeValueUnquoted = new char[]{nullChar, '\t', '\n', '\f', '\r', ' ', '"', '&', '\'', '<', '=', '>', '`'};
 
     private static final char replacementChar = Tokeniser.replacementChar;
diff --git a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java
@@ -64,7 +64,7 @@ XmlTreeBuilder newInstance() {
     protected boolean process(Token token) {
         currentToken = token;
 
-        // start tag, end tag, doctype, comment, character, eof
+        // start tag, end tag, doctype, xmldecl, comment, character, eof
         switch (token.type) {
             case StartTag:
                 insertElementFor(token.asStartTag());
@@ -81,6 +81,9 @@ protected boolean process(Token token) {
             case Doctype:
                 insertDoctypeFor(token.asDoctype());
                 break;
+            case XmlDecl:
+                insertXmlDeclarationFor(token.asXmlDecl());
+                break;
             case EOF: // could put some normalisation here if desired
                 break;
             default:
@@ -134,6 +137,12 @@ void insertDoctypeFor(Token.Doctype token) {
         insertLeafNode(doctypeNode);
     }
 
+    void insertXmlDeclarationFor(Token.XmlDecl token) {
+        XmlDeclaration decl = new XmlDeclaration(token.name(), token.isDeclaration);
+        if (token.attributes != null) decl.attributes().addAll(token.attributes);
+        insertLeafNode(decl);
+    }
+
     /**
      * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not
      * found, skips.
diff --git a/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java b/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java
@@ -2,7 +2,6 @@
 
 import org.jsoup.Jsoup;
 import org.jsoup.TextUtil;
-import org.jsoup.internal.StringUtil;
 import org.jsoup.nodes.*;
 import org.jsoup.select.Elements;
 import org.junit.jupiter.api.Disabled;
@@ -353,4 +352,37 @@ private static void assertXmlNamespace(Element el) {
         assertEquals(Parser.NamespaceXml, el.tag().namespace(), String.format("Element %s not in XML namespace", el.tagName()));
     }
 
+    @Test void declarations() {
+        String xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><!DOCTYPE html\n" +
+            "  PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n" +
+            "  \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">" +
+            "<!ELEMENT footnote (#PCDATA|a)*>";
+        Document doc = Jsoup.parse(xml, Parser.xmlParser());
+
+        XmlDeclaration proc = (XmlDeclaration) doc.childNode(0);
+        DocumentType doctype = (DocumentType) doc.childNode(1);
+        XmlDeclaration decl = (XmlDeclaration) doc.childNode(2);
+
+        assertEquals("xml", proc.name());
+        assertEquals("1.0", proc.attr("version"));
+        assertEquals("utf-8", proc.attr("encoding"));
+        assertEquals("version=\"1.0\" encoding=\"utf-8\"", proc.getWholeDeclaration());
+        assertEquals("<?xml version=\"1.0\" encoding=\"utf-8\"?>", proc.outerHtml());
+
+        assertEquals("html", doctype.name());
+        assertEquals("-//W3C//DTD XHTML 1.0 Transitional//EN", doctype.attr("publicId"));
+        assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd", doctype.attr("systemId"));
+        assertEquals("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">", doctype.outerHtml());
+
+        assertEquals("ELEMENT", decl.name());
+        assertEquals("footnote (#PCDATA|a)*", decl.getWholeDeclaration());
+        assertTrue(decl.hasAttr("footNote"));
+        assertFalse(decl.hasAttr("ELEMENT"));
+        assertEquals("<!ELEMENT footnote (#PCDATA|a)*>", decl.outerHtml());
+
+        assertEquals("<?xml version=\"1.0\" encoding=\"utf-8\"?>" +
+            "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">" +
+            "<!ELEMENT footnote (#PCDATA|a)*>", doc.outerHtml());
+    }
+
 }