Skip to content

Commit 8d4b76b

Browse files
nwellnhof authored and kevinbackhouse committed
Fix quadratic behavior with inline HTML
Repeated starting sequences like `<?`, `<!DECL ` or `<![CDATA[` could lead to quadratic behavior if no matching ending sequence was found. Separate the inline HTML scanners. Remember if scanning the whole input for a specific ending sequence failed and skip subsequent scans. The basic idea is to remove suffixes `>`, `?>` and `]]>` from the respective regex. Since these regexes are already constructed to match lazily, they will stop before an ending sequence. To check whether an ending sequence was found, we can simply test whether the input buffer is large enough to hold the match plus a potential suffix. If the regex doesn't find the ending sequence, it will match so many characters that this test is guaranteed to fail. In this case, we set a flag to avoid further attempts to execute the regex. To check which inline HTML regex to use, we inspect the start of the text buffer. This allows some fixed characters to be removed from the start of some regexes. `matchlen` is adjusted with a single addition that accounts for both the relevant prefix and suffix. Fixes commonmark#299.
1 parent 9d57d8a commit 8d4b76b

File tree

4 files changed

+14024
-10378
lines changed

4 files changed

+14024
-10378
lines changed

src/inlines.c

Lines changed: 52 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,9 +45,14 @@ typedef struct bracket {
4545
bool in_bracket_image1;
4646
} bracket;
4747

48+
#define FLAG_SKIP_HTML_CDATA (1u << 0)
49+
#define FLAG_SKIP_HTML_DECLARATION (1u << 1)
50+
#define FLAG_SKIP_HTML_PI (1u << 2)
51+
4852
typedef struct subject{
4953
cmark_mem *mem;
5054
cmark_chunk input;
55+
unsigned flags;
5156
int line;
5257
bufsize_t pos;
5358
int block_offset;
@@ -163,6 +168,7 @@ static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset,
163168
int i;
164169
e->mem = mem;
165170
e->input = *chunk;
171+
e->flags = 0;
166172
e->line = line_number;
167173
e->pos = 0;
168174
e->block_offset = block_offset;
@@ -899,7 +905,52 @@ static cmark_node *handle_pointy_brace(subject *subj, int options) {
899905
}
900906

901907
// finally, try to match an html tag
902-
matchlen = scan_html_tag(&subj->input, subj->pos);
908+
if (subj->pos + 2 <= subj->input.len) {
909+
int c = subj->input.data[subj->pos];
910+
if (c == '!') {
911+
c = subj->input.data[subj->pos+1];
912+
if (c == '-') {
913+
matchlen = scan_html_comment(&subj->input, subj->pos + 2);
914+
if (matchlen > 0)
915+
matchlen += 2; // prefix "<-"
916+
} else if (c == '[') {
917+
if ((subj->flags & FLAG_SKIP_HTML_CDATA) == 0) {
918+
matchlen = scan_html_cdata(&subj->input, subj->pos + 2);
919+
if (matchlen > 0) {
920+
// The regex doesn't require the final "]]>". But if we're not at
921+
// the end of input, it must come after the match. Otherwise,
922+
// disable subsequent scans to avoid quadratic behavior.
923+
matchlen += 5; // prefix "![", suffix "]]>"
924+
if (subj->pos + matchlen > subj->input.len) {
925+
subj->flags |= FLAG_SKIP_HTML_CDATA;
926+
matchlen = 0;
927+
}
928+
}
929+
}
930+
} else if ((subj->flags & FLAG_SKIP_HTML_DECLARATION) == 0) {
931+
matchlen = scan_html_declaration(&subj->input, subj->pos + 1);
932+
if (matchlen > 0) {
933+
matchlen += 2; // prefix "!", suffix ">"
934+
if (subj->pos + matchlen > subj->input.len) {
935+
subj->flags |= FLAG_SKIP_HTML_DECLARATION;
936+
matchlen = 0;
937+
}
938+
}
939+
}
940+
} else if (c == '?') {
941+
if ((subj->flags & FLAG_SKIP_HTML_PI) == 0) {
942+
// Note that we allow an empty match.
943+
matchlen = scan_html_pi(&subj->input, subj->pos + 1);
944+
matchlen += 3; // prefix "?", suffix "?>"
945+
if (subj->pos + matchlen > subj->input.len) {
946+
subj->flags |= FLAG_SKIP_HTML_PI;
947+
matchlen = 0;
948+
}
949+
}
950+
} else {
951+
matchlen = scan_html_tag(&subj->input, subj->pos);
952+
}
953+
}
903954
if (matchlen > 0) {
904955
contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1);
905956
subj->pos += matchlen;

0 commit comments

Comments
 (0)