@@ -45,9 +45,15 @@ typedef struct bracket {
45
45
bool in_bracket_image1 ;
46
46
} bracket ;
47
47
48
// Bit flags kept in the `flags` field of `subject` (cleared to 0 when the
// subject is initialized). handle_pointy_brace sets the corresponding bit when
// a scan for that raw-HTML construct (CDATA section, declaration, processing
// instruction, comment) does not yield a complete match within the remaining
// input, and tests the bit before scanning again, so the same tail of input is
// not rescanned for every later '<' (avoids quadratic behavior).
+ #define FLAG_SKIP_HTML_CDATA (1u << 0)
49
+ #define FLAG_SKIP_HTML_DECLARATION (1u << 1)
50
+ #define FLAG_SKIP_HTML_PI (1u << 2)
51
+ #define FLAG_SKIP_HTML_COMMENT (1u << 3)
52
+
48
53
typedef struct subject {
49
54
cmark_mem * mem ;
50
55
cmark_chunk input ;
56
+ unsigned flags ;
51
57
int line ;
52
58
bufsize_t pos ;
53
59
int block_offset ;
@@ -164,6 +170,7 @@ static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset,
164
170
int i ;
165
171
e -> mem = mem ;
166
172
e -> input = * chunk ;
173
+ e -> flags = 0 ;
167
174
e -> line = line_number ;
168
175
e -> pos = 0 ;
169
176
e -> block_offset = block_offset ;
@@ -904,7 +911,63 @@ static cmark_node *handle_pointy_brace(subject *subj, int options) {
904
911
}
905
912
906
913
// finally, try to match an html tag
907
- matchlen = scan_html_tag (& subj -> input , subj -> pos );
914
+ if (subj -> pos + 2 <= subj -> input .len ) {
915
+ int c = subj -> input .data [subj -> pos ];
916
+ if (c == '!' && (subj -> flags & FLAG_SKIP_HTML_COMMENT ) == 0 ) {
917
+ c = subj -> input .data [subj -> pos + 1 ];
918
+ if (c == '-' && subj -> input .data [subj -> pos + 2 ] == '-' ) {
919
+ if (subj -> input .data [subj -> pos + 3 ] == '>' ) {
920
+ matchlen = 4 ;
921
+ } else if (subj -> input .data [subj -> pos + 3 ] == '-' &&
922
+ subj -> input .data [subj -> pos + 4 ] == '>' ) {
923
+ matchlen = 5 ;
924
+ } else {
925
+ matchlen = scan_html_comment (& subj -> input , subj -> pos + 1 );
926
+ if (matchlen > 0 ) {
927
+ matchlen += 1 ; // prefix "<"
928
+ } else { // no match through end of input: set a flag so
929
+ // we don't reparse looking for -->:
930
+ subj -> flags |= FLAG_SKIP_HTML_COMMENT ;
931
+ }
932
+ }
933
+ } else if (c == '[' ) {
934
+ if ((subj -> flags & FLAG_SKIP_HTML_CDATA ) == 0 ) {
935
+ matchlen = scan_html_cdata (& subj -> input , subj -> pos + 2 );
936
+ if (matchlen > 0 ) {
937
+ // The regex doesn't require the final "]]>". But if we're not at
938
+ // the end of input, it must come after the match. Otherwise,
939
+ // disable subsequent scans to avoid quadratic behavior.
940
+ matchlen += 5 ; // prefix "![", suffix "]]>"
941
+ if (subj -> pos + matchlen > subj -> input .len ) {
942
+ subj -> flags |= FLAG_SKIP_HTML_CDATA ;
943
+ matchlen = 0 ;
944
+ }
945
+ }
946
+ }
947
+ } else if ((subj -> flags & FLAG_SKIP_HTML_DECLARATION ) == 0 ) {
948
+ matchlen = scan_html_declaration (& subj -> input , subj -> pos + 1 );
949
+ if (matchlen > 0 ) {
950
+ matchlen += 2 ; // prefix "!", suffix ">"
951
+ if (subj -> pos + matchlen > subj -> input .len ) {
952
+ subj -> flags |= FLAG_SKIP_HTML_DECLARATION ;
953
+ matchlen = 0 ;
954
+ }
955
+ }
956
+ }
957
+ } else if (c == '?' ) {
958
+ if ((subj -> flags & FLAG_SKIP_HTML_PI ) == 0 ) {
959
+ // Note that we allow an empty match.
960
+ matchlen = scan_html_pi (& subj -> input , subj -> pos + 1 );
961
+ matchlen += 3 ; // prefix "?", suffix "?>"
962
+ if (subj -> pos + matchlen > subj -> input .len ) {
963
+ subj -> flags |= FLAG_SKIP_HTML_PI ;
964
+ matchlen = 0 ;
965
+ }
966
+ }
967
+ } else {
968
+ matchlen = scan_html_tag (& subj -> input , subj -> pos );
969
+ }
970
+ }
908
971
if (matchlen > 0 ) {
909
972
contents = cmark_chunk_dup (& subj -> input , subj -> pos - 1 , matchlen + 1 );
910
973
subj -> pos += matchlen ;
0 commit comments