@@ -45,9 +45,15 @@ typedef struct bracket {
4545 bool in_bracket_image1 ;
4646} bracket ;
4747
// Bit flags stored in subject.flags (cleared to 0 when a subject is set up).
// handle_pointy_brace sets a flag when a scan for the corresponding raw-HTML
// construct could not be completed within the input, and tests the flag
// before scanning again, so later calls on the same subject skip a scan that
// is already known to fail (avoids quadratic rescanning).
// CDATA section: prefix "![", suffix "]]>".
48+ #define FLAG_SKIP_HTML_CDATA (1u << 0)
// Declaration: prefix "!", suffix ">".
49+ #define FLAG_SKIP_HTML_DECLARATION (1u << 1)
// Processing instruction: prefix "?", suffix "?>".
50+ #define FLAG_SKIP_HTML_PI (1u << 2)
// Comment: no closing "-->" was found through the end of input.
51+ #define FLAG_SKIP_HTML_COMMENT (1u << 3)
52+
4853typedef struct subject {
4954 cmark_mem * mem ;
5055 cmark_chunk input ;
56+ unsigned flags ;
5157 int line ;
5258 bufsize_t pos ;
5359 int block_offset ;
@@ -164,6 +170,7 @@ static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset,
164170 int i ;
165171 e -> mem = mem ;
166172 e -> input = * chunk ;
173+ e -> flags = 0 ;
167174 e -> line = line_number ;
168175 e -> pos = 0 ;
169176 e -> block_offset = block_offset ;
@@ -904,7 +911,63 @@ static cmark_node *handle_pointy_brace(subject *subj, int options) {
904911 }
905912
906913 // finally, try to match an html tag
907- matchlen = scan_html_tag (& subj -> input , subj -> pos );
914+ if (subj -> pos + 2 <= subj -> input .len ) {
915+ int c = subj -> input .data [subj -> pos ];
916+ if (c == '!' && (subj -> flags & FLAG_SKIP_HTML_COMMENT ) == 0 ) {
917+ c = subj -> input .data [subj -> pos + 1 ];
918+ if (c == '-' && subj -> input .data [subj -> pos + 2 ] == '-' ) {
919+ if (subj -> input .data [subj -> pos + 3 ] == '>' ) {
920+ matchlen = 4 ;
921+ } else if (subj -> input .data [subj -> pos + 3 ] == '-' &&
922+ subj -> input .data [subj -> pos + 4 ] == '>' ) {
923+ matchlen = 5 ;
924+ } else {
925+ matchlen = scan_html_comment (& subj -> input , subj -> pos + 1 );
926+ if (matchlen > 0 ) {
927+ matchlen += 1 ; // prefix "<"
928+ } else { // no match through end of input: set a flag so
929+ // we don't reparse looking for -->:
930+ subj -> flags |= FLAG_SKIP_HTML_COMMENT ;
931+ }
932+ }
933+ } else if (c == '[' ) {
934+ if ((subj -> flags & FLAG_SKIP_HTML_CDATA ) == 0 ) {
935+ matchlen = scan_html_cdata (& subj -> input , subj -> pos + 2 );
936+ if (matchlen > 0 ) {
937+ // The regex doesn't require the final "]]>". But if we're not at
938+ // the end of input, it must come after the match. Otherwise,
939+ // disable subsequent scans to avoid quadratic behavior.
940+ matchlen += 5 ; // prefix "![", suffix "]]>"
941+ if (subj -> pos + matchlen > subj -> input .len ) {
942+ subj -> flags |= FLAG_SKIP_HTML_CDATA ;
943+ matchlen = 0 ;
944+ }
945+ }
946+ }
947+ } else if ((subj -> flags & FLAG_SKIP_HTML_DECLARATION ) == 0 ) {
948+ matchlen = scan_html_declaration (& subj -> input , subj -> pos + 1 );
949+ if (matchlen > 0 ) {
950+ matchlen += 2 ; // prefix "!", suffix ">"
951+ if (subj -> pos + matchlen > subj -> input .len ) {
952+ subj -> flags |= FLAG_SKIP_HTML_DECLARATION ;
953+ matchlen = 0 ;
954+ }
955+ }
956+ }
957+ } else if (c == '?' ) {
958+ if ((subj -> flags & FLAG_SKIP_HTML_PI ) == 0 ) {
959+ // Note that we allow an empty match.
960+ matchlen = scan_html_pi (& subj -> input , subj -> pos + 1 );
961+ matchlen += 3 ; // prefix "?", suffix "?>"
962+ if (subj -> pos + matchlen > subj -> input .len ) {
963+ subj -> flags |= FLAG_SKIP_HTML_PI ;
964+ matchlen = 0 ;
965+ }
966+ }
967+ } else {
968+ matchlen = scan_html_tag (& subj -> input , subj -> pos );
969+ }
970+ }
908971 if (matchlen > 0 ) {
909972 contents = cmark_chunk_dup (& subj -> input , subj -> pos - 1 , matchlen + 1 );
910973 subj -> pos += matchlen ;
0 commit comments