22
33import org .jsoup .nodes .DocumentType ;
44
5+ import static org .jsoup .nodes .Document .OutputSettings .Syntax .xml ;
6+
57/**
68 * States and transition activations for the Tokeniser.
79 */
@@ -105,8 +107,12 @@ enum TokeniserState {
105107 t .advanceTransition (EndTagOpen );
106108 break ;
107109 case '?' :
108- t .createBogusCommentPending ();
109- t .transition (BogusComment );
110+ if (t .syntax == xml ) {
111+ t .advanceTransition (MarkupProcessingOpen );
112+ } else {
113+ t .createBogusCommentPending ();
114+ t .transition (BogusComment );
115+ }
110116 break ;
111117 default :
112118 if (r .matchesAsciiAlpha ()) {
@@ -590,6 +596,10 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
590596 t .tagPending .appendAttributeName (c , r .pos ()-1 , r .pos ());
591597 t .transition (AttributeName );
592598 break ;
599+ case '?' : // Handle trailing ? in <?xml...?>
600+ if (t .tagPending instanceof Token .XmlDecl )
601+ break ;
602+ // otherwise fall through to default
593603 default : // A-Z, anything else
594604 t .tagPending .newAttribute ();
595605 r .unconsume ();
@@ -634,6 +644,11 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
634644 t .error (this );
635645 t .tagPending .appendAttributeName (c , pos , r .pos ());
636646 break ;
647+ case '?' :
648+ if (t .syntax == xml && t .tagPending instanceof Token .XmlDecl ) {
649+ t .transition (AfterAttributeName );
650+ break ;
651+ } // otherwise default - take it
637652 default : // buffer underrun
638653 t .tagPending .appendAttributeName (c , pos , r .pos ());
639654 }
@@ -917,7 +932,7 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
917932 }
918933 }
919934 },
920- MarkupDeclarationOpen {
935+ MarkupDeclarationOpen { // from <!
921936 @ Override void read (Tokeniser t , CharacterReader r ) {
922937 if (r .matchConsume ("--" )) {
923938 t .createCommentPending ();
@@ -930,9 +945,27 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
930945 //} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) {
931946 t .createTempBuffer ();
932947 t .transition (CdataSection );
948+ } else {
949+ if (t .syntax == xml && r .matchesAsciiAlpha ()) {
950+ t .createXmlDeclPending (true );
951+ t .transition (TagName ); // treat <!ENTITY as XML Declaration, with tag-like handling
952+ } else {
953+ t .error (this );
954+ t .createBogusCommentPending ();
955+ t .transition (BogusComment );
956+ }
957+ }
958+ }
959+ },
960+ MarkupProcessingOpen { // From <? in syntax XML
961+ @ Override void read (Tokeniser t , CharacterReader r ) {
962+ if (r .matchesAsciiAlpha ()) {
963+ t .createXmlDeclPending (false );
964+ t .transition (TagName ); // treat <?xml... as XML Declaration (processing instruction), with tag-like handling
933965 } else {
934966 t .error (this );
935967 t .createBogusCommentPending ();
968+ t .commentPending .append ('?' ); // push the ? to the start of the comment
936969 t .transition (BogusComment );
937970 }
938971 }
@@ -1623,7 +1656,7 @@ else if (r.matches('>')) {
16231656
16241657 static final char nullChar = '\u0000' ;
16251658 // char searches. must be sorted, used in inSorted. MUST update TokeniserStateTest if more arrays are added.
1626- static final char [] attributeNameCharsSorted = new char []{'\t' , '\n' , '\f' , '\r' , ' ' , '"' , '\'' , '/' , '<' , '=' , '>' };
1659+ static final char [] attributeNameCharsSorted = new char []{'\t' , '\n' , '\f' , '\r' , ' ' , '"' , '\'' , '/' , '<' , '=' , '>' , '?' };
16271660 static final char [] attributeValueUnquoted = new char []{nullChar , '\t' , '\n' , '\f' , '\r' , ' ' , '"' , '&' , '\'' , '<' , '=' , '>' , '`' };
16281661
16291662 private static final char replacementChar = Tokeniser .replacementChar ;
0 commit comments