Skip to content

Commit d84de00

Browse files
authored
Merge pull request from GHSA-c944-cv5f-hpvr
Improve performance in postprocess_text
2 parents (f787273 + c7b272b), commit d84de00

File tree

1 file changed

+125
-100
lines changed

1 file changed

+125
-100
lines changed

extensions/autolink.c

Lines changed: 125 additions & 100 deletions
Original file line number | Diff line number | Diff line change
@@ -269,14 +269,20 @@ static cmark_node *match(cmark_syntax_extension *ext, cmark_parser *parser,
269269
// inline was finished in inlines.c.
270270
}
271271

272-
static bool validate_protocol(char protocol[], uint8_t *data, int rewind) {
272+
static bool validate_protocol(char protocol[], uint8_t *data, int rewind, int max_rewind) {
273273
size_t len = strlen(protocol);
274274

275+
if (len > (size_t)(max_rewind - rewind)) {
276+
return false;
277+
}
278+
275279
// Check that the protocol matches
276-
for (int i = 1; i <= len; i++) {
277-
if (data[-rewind - i] != protocol[len - i]) {
278-
return false;
279-
}
280+
if (memcmp(data - rewind - len, protocol, len) != 0) {
281+
return false;
282+
}
283+
284+
if (len == (size_t)(max_rewind - rewind)) {
285+
return true;
280286
}
281287

282288
char prev_char = data[-rewind - len - 1];
@@ -285,126 +291,145 @@ static bool validate_protocol(char protocol[], uint8_t *data, int rewind) {
285291
return !cmark_isalnum(prev_char);
286292
}
287293

288-
static void postprocess_text(cmark_parser *parser, cmark_node *text, int offset, int depth) {
289-
// postprocess_text can recurse very deeply if there is a very long line of
290-
// '@' only. Stop at a reasonable depth to ensure it cannot crash.
291-
if (depth > 1000) return;
294+
static void postprocess_text(cmark_parser *parser, cmark_node *text) {
295+
size_t start = 0;
296+
size_t offset = 0;
297+
// `text` is going to be split into a list of nodes containing shorter segments
298+
// of text, so we detach the memory buffer from text and use `cmark_chunk_dup` to
299+
// create references to it. Later, `cmark_chunk_to_cstr` is used to convert
300+
// the references into allocated buffers. The detached buffer is freed before we
301+
// return.
302+
cmark_chunk detached_chunk = text->as.literal;
303+
text->as.literal = cmark_chunk_dup(&detached_chunk, 0, detached_chunk.len);
304+
305+
uint8_t *data = text->as.literal.data;
306+
size_t remaining = text->as.literal.len;
307+
308+
while (true) {
309+
size_t link_end;
310+
uint8_t *at;
311+
bool auto_mailto = true;
312+
bool is_xmpp = false;
313+
size_t rewind;
314+
size_t max_rewind;
315+
size_t np = 0;
316+
317+
if (offset >= remaining)
318+
break;
292319

293-
size_t link_end;
294-
uint8_t *data = text->as.literal.data,
295-
*at;
296-
size_t size = text->as.literal.len;
297-
bool auto_mailto = true;
298-
bool is_xmpp = false;
299-
int rewind, max_rewind,
300-
nb = 0, np = 0, ns = 0;
320+
at = (uint8_t *)memchr(data + start + offset, '@', remaining - offset);
321+
if (!at)
322+
break;
301323

302-
if (offset < 0 || (size_t)offset >= size)
303-
return;
324+
max_rewind = at - (data + start + offset);
304325

305-
data += offset;
306-
size -= offset;
326+
found_at:
327+
for (rewind = 0; rewind < max_rewind; ++rewind) {
328+
uint8_t c = data[start + offset + max_rewind - rewind - 1];
307329

308-
at = (uint8_t *)memchr(data, '@', size);
309-
if (!at)
310-
return;
330+
if (cmark_isalnum(c))
331+
continue;
311332

312-
max_rewind = (int)(at - data);
313-
data += max_rewind;
314-
size -= max_rewind;
333+
if (strchr(".+-_", c) != NULL)
334+
continue;
315335

316-
for (rewind = 0; rewind < max_rewind; ++rewind) {
317-
uint8_t c = data[-rewind - 1];
336+
if (strchr(":", c) != NULL) {
337+
if (validate_protocol("mailto:", data + start + offset + max_rewind, rewind, max_rewind)) {
338+
auto_mailto = false;
339+
continue;
340+
}
341+
342+
if (validate_protocol("xmpp:", data + start + offset + max_rewind, rewind, max_rewind)) {
343+
auto_mailto = false;
344+
is_xmpp = true;
345+
continue;
346+
}
347+
}
318348

319-
if (cmark_isalnum(c))
320-
continue;
349+
break;
350+
}
321351

322-
if (strchr(".+-_", c) != NULL)
352+
if (rewind == 0) {
353+
offset += max_rewind + 1;
323354
continue;
355+
}
356+
357+
assert(data[start + offset + max_rewind] == '@');
358+
for (link_end = 1; link_end < remaining - offset - max_rewind; ++link_end) {
359+
uint8_t c = data[start + offset + max_rewind + link_end];
324360

325-
if (strchr(":", c) != NULL) {
326-
if (validate_protocol("mailto:", data, rewind)) {
327-
auto_mailto = false;
361+
if (cmark_isalnum(c))
328362
continue;
329-
}
330363

331-
if (validate_protocol("xmpp:", data, rewind)) {
332-
auto_mailto = false;
333-
is_xmpp = true;
364+
if (c == '@') {
365+
// Found another '@', so go back and try again with an updated offset and max_rewind.
366+
offset += max_rewind + 1;
367+
max_rewind = link_end - 1;
368+
goto found_at;
369+
} else if (c == '.' && link_end < remaining - offset - max_rewind - 1 &&
370+
cmark_isalnum(data[start + offset + max_rewind + link_end + 1]))
371+
np++;
372+
else if (c == '/' && is_xmpp)
334373
continue;
335-
}
374+
else if (c != '-' && c != '_')
375+
break;
336376
}
337377

338-
break;
339-
}
340-
341-
if (rewind == 0 || ns > 0) {
342-
postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
343-
return;
344-
}
345-
346-
for (link_end = 0; link_end < size; ++link_end) {
347-
uint8_t c = data[link_end];
348-
349-
if (cmark_isalnum(c))
350-
continue;
351-
352-
if (c == '@')
353-
nb++;
354-
else if (c == '.' && link_end < size - 1 && cmark_isalnum(data[link_end + 1]))
355-
np++;
356-
else if (c == '/' && is_xmpp)
378+
if (link_end < 2 || np == 0 ||
379+
(!cmark_isalpha(data[start + offset + max_rewind + link_end - 1]) &&
380+
data[start + offset + max_rewind + link_end - 1] != '.')) {
381+
offset += max_rewind + link_end;
357382
continue;
358-
else if (c != '-' && c != '_')
359-
break;
360-
}
383+
}
361384

362-
if (link_end < 2 || nb != 1 || np == 0 ||
363-
(!cmark_isalpha(data[link_end - 1]) && data[link_end - 1] != '.')) {
364-
postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
365-
return;
366-
}
385+
link_end = autolink_delim(data + start + offset + max_rewind, link_end);
367386

368-
link_end = autolink_delim(data, link_end);
387+
if (link_end == 0) {
388+
offset += max_rewind + 1;
389+
continue;
390+
}
369391

370-
if (link_end == 0) {
371-
postprocess_text(parser, text, max_rewind + 1 + offset, depth + 1);
372-
return;
373-
}
392+
cmark_node *link_node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
393+
cmark_strbuf buf;
394+
cmark_strbuf_init(parser->mem, &buf, 10);
395+
if (auto_mailto)
396+
cmark_strbuf_puts(&buf, "mailto:");
397+
cmark_strbuf_put(&buf, data + start + offset + max_rewind - rewind, (bufsize_t)(link_end + rewind));
398+
link_node->as.link.url = cmark_chunk_buf_detach(&buf);
399+
400+
cmark_node *link_text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
401+
cmark_chunk email = cmark_chunk_dup(
402+
&detached_chunk,
403+
start + offset + max_rewind - rewind,
404+
(bufsize_t)(link_end + rewind));
405+
cmark_chunk_to_cstr(parser->mem, &email);
406+
link_text->as.literal = email;
407+
cmark_node_append_child(link_node, link_text);
374408

375-
cmark_chunk_to_cstr(parser->mem, &text->as.literal);
409+
cmark_node_insert_after(text, link_node);
376410

377-
cmark_node *link_node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
378-
cmark_strbuf buf;
379-
cmark_strbuf_init(parser->mem, &buf, 10);
380-
if (auto_mailto)
381-
cmark_strbuf_puts(&buf, "mailto:");
382-
cmark_strbuf_put(&buf, data - rewind, (bufsize_t)(link_end + rewind));
383-
link_node->as.link.url = cmark_chunk_buf_detach(&buf);
384-
385-
cmark_node *link_text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
386-
cmark_chunk email = cmark_chunk_dup(
387-
&text->as.literal,
388-
offset + max_rewind - rewind,
389-
(bufsize_t)(link_end + rewind));
390-
cmark_chunk_to_cstr(parser->mem, &email);
391-
link_text->as.literal = email;
392-
cmark_node_append_child(link_node, link_text);
411+
cmark_node *post = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
412+
post->as.literal = cmark_chunk_dup(&detached_chunk,
413+
(bufsize_t)(start + offset + max_rewind + link_end),
414+
(bufsize_t)(remaining - offset - max_rewind - link_end));
393415

394-
cmark_node_insert_after(text, link_node);
416+
cmark_node_insert_after(link_node, post);
395417

396-
cmark_node *post = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
397-
post->as.literal = cmark_chunk_dup(&text->as.literal,
398-
(bufsize_t)(offset + max_rewind + link_end),
399-
(bufsize_t)(size - link_end));
400-
cmark_chunk_to_cstr(parser->mem, &post->as.literal);
418+
text->as.literal = cmark_chunk_dup(&detached_chunk, start, offset + max_rewind - rewind);
419+
cmark_chunk_to_cstr(parser->mem, &text->as.literal);
401420

402-
cmark_node_insert_after(link_node, post);
421+
text = post;
422+
start += offset + max_rewind + link_end;
423+
remaining -= offset + max_rewind + link_end;
424+
offset = 0;
425+
}
403426

404-
text->as.literal.len = offset + max_rewind - rewind;
405-
text->as.literal.data[text->as.literal.len] = 0;
427+
// Convert the reference to allocated memory.
428+
assert(!text->as.literal.alloc);
429+
cmark_chunk_to_cstr(parser->mem, &text->as.literal);
406430

407-
postprocess_text(parser, post, 0, depth + 1);
431+
// Free the detached buffer.
432+
cmark_chunk_free(parser->mem, &detached_chunk);
408433
}
409434

410435
static cmark_node *postprocess(cmark_syntax_extension *ext, cmark_parser *parser, cmark_node *root) {
@@ -431,7 +456,7 @@ static cmark_node *postprocess(cmark_syntax_extension *ext, cmark_parser *parser
431456
}
432457

433458
if (ev == CMARK_EVENT_ENTER && node->type == CMARK_NODE_TEXT) {
434-
postprocess_text(parser, node, 0, /*depth*/0);
459+
postprocess_text(parser, node);
435460
}
436461
}
437462

0 commit comments

Comments
 (0)