From 7de92ff28cc95e56a8f5723a9cb37ff341bc706e Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Sat, 9 Aug 2025 11:08:07 +0200 Subject: [PATCH 1/4] =?UTF-8?q?=F0=9F=91=8C=20fix=20quadratic=20complexity?= =?UTF-8?q?=20in=20reference=20parser?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ports: https://github.com/markdown-it/markdown-it/commit/de814cae739e1546aa0528f3ece849aa7a436265 --- markdown_it/helpers/parse_link_destination.py | 5 +- markdown_it/helpers/parse_link_title.py | 79 +++++---- markdown_it/rules_block/reference.py | 152 ++++++++++-------- markdown_it/rules_inline/image.py | 2 +- tests/test_port/fixtures/commonmark_extras.md | 34 ++++ 5 files changed, 169 insertions(+), 103 deletions(-) diff --git a/markdown_it/helpers/parse_link_destination.py b/markdown_it/helpers/parse_link_destination.py index 93989eb5..c98323c0 100644 --- a/markdown_it/helpers/parse_link_destination.py +++ b/markdown_it/helpers/parse_link_destination.py @@ -6,17 +6,15 @@ class _Result: - __slots__ = ("lines", "ok", "pos", "str") + __slots__ = ("ok", "pos", "str") def __init__(self) -> None: self.ok = False self.pos = 0 - self.lines = 0 self.str = "" def parseLinkDestination(string: str, pos: int, maximum: int) -> _Result: - lines = 0 start = pos result = _Result() @@ -80,7 +78,6 @@ def parseLinkDestination(string: str, pos: int, maximum: int) -> _Result: return result result.str = unescapeAll(string[start:pos]) - result.lines = lines result.pos = pos result.ok = True return result diff --git a/markdown_it/helpers/parse_link_title.py b/markdown_it/helpers/parse_link_title.py index f002c7c4..a38ff0d9 100644 --- a/markdown_it/helpers/parse_link_title.py +++ b/markdown_it/helpers/parse_link_title.py @@ -3,58 +3,73 @@ from ..common.utils import charCodeAt, unescapeAll -class _Result: - __slots__ = ("lines", "ok", "pos", "str") +class _State: + __slots__ = ("can_continue", "marker", "ok", "pos", "str") def __init__(self) -> None: self.ok = False + """if `true`, this is a valid link title""" + self.can_continue = False + """if `true`, this link can be continued on the next line""" self.pos = 0 - self.lines = 0 + """if `ok`, it's the position of the first character after the closing marker""" self.str = "" + """if `ok`, it's the unescaped title""" + self.marker = 0 + """expected closing marker character code""" def __str__(self) -> str: return self.str -def parseLinkTitle(string: str, pos: int, maximum: int) -> _Result: - lines = 0 - start = pos - result = _Result() +def parseLinkTitle( + string: str, start: int, maximum: int, prev_state: _State | None = None +) -> _State: + """Parse link title within `str` in [start, max] range, + or continue previous parsing if `prev_state` is defined (equal to result of last execution). + """ + pos = start + state = _State() - if pos >= maximum: - return result + if prev_state is not None: + # this is a continuation of a previous parseLinkTitle call on the next line, + # used in reference links only + state.str = prev_state.str + state.marker = prev_state.marker + else: + if pos >= maximum: + return state - marker = charCodeAt(string, pos) + marker = charCodeAt(string, pos) - # /* " */ /* ' */ /* ( */ - if marker != 0x22 and marker != 0x27 and marker != 0x28: - return result + # /* " */ /* ' */ /* ( */ + if marker != 0x22 and marker != 0x27 and marker != 0x28: + return state - pos += 1 + start += 1 + pos += 1 + + # if opening marker is "(", switch it to closing marker ")" + if marker == 0x28: + marker = 0x29 - # if opening marker is "(", switch it to closing marker ")" - if marker == 0x28: - marker = 0x29 + state.marker = marker while pos < maximum: code = charCodeAt(string, pos) - if code == marker: - title = string[start + 1 : pos] - title = unescapeAll(title) - result.pos = pos + 1 - result.lines = lines - result.str = title - result.ok = True - return result - elif code == 0x28 and marker == 0x29: # /* ( */ /* ) */ - return result - elif code == 0x0A: - lines += 1 + if code == state.marker: + state.pos = pos + 1 + state.str += unescapeAll(string[start:pos]) + state.ok = True + return state + elif code == 0x28 and state.marker == 0x29: # /* ( */ /* ) */ + return state elif code == 0x5C and pos + 1 < maximum: # /* \ */ pos += 1 - if charCodeAt(string, pos) == 0x0A: - lines += 1 pos += 1 - return result + # no closing marker found, but this link title may continue on the next line (for references) + state.can_continue = True + state.str += unescapeAll(string[start:pos]) + return state diff --git a/markdown_it/rules_block/reference.py b/markdown_it/rules_block/reference.py index b77944b2..ad94d409 100644 --- a/markdown_it/rules_block/reference.py +++ b/markdown_it/rules_block/reference.py @@ -11,7 +11,6 @@ def reference(state: StateBlock, startLine: int, _endLine: int, silent: bool) -> "entering reference: %s, %s, %s, %s", state, startLine, _endLine, silent ) - lines = 0 pos = state.bMarks[startLine] + state.tShift[startLine] maximum = state.eMarks[startLine] nextLine = startLine + 1 @@ -22,51 +21,9 @@ def reference(state: StateBlock, startLine: int, _endLine: int, silent: bool) -> if state.src[pos] != "[": return False - # Simple check to quickly interrupt scan on [link](url) at the start of line. - # Can be useful on practice: https:#github.com/markdown-it/markdown-it/issues/54 - while pos < maximum: - # /* ] */ /* \ */ /* : */ - if state.src[pos] == "]" and state.src[pos - 1] != "\\": - if pos + 1 == maximum: - return False - if state.src[pos + 1] != ":": - return False - break - pos += 1 - - endLine = state.lineMax - - # jump line-by-line until empty one or EOF - terminatorRules = state.md.block.ruler.getRules("reference") + string = state.src[pos : maximum + 1] - oldParentType = state.parentType - state.parentType = "reference" - - while nextLine < endLine and not state.isEmpty(nextLine): - # this would be a code block normally, but after paragraph - # it's considered a lazy continuation regardless of what's there - if state.sCount[nextLine] - state.blkIndent > 3: - nextLine += 1 - continue - - # quirk for blockquotes, this line should already be checked by that rule - if state.sCount[nextLine] < 0: - nextLine += 1 - continue - - # Some tags can terminate paragraph without empty line. - terminate = False - for terminatorRule in terminatorRules: - if terminatorRule(state, nextLine, endLine, True): - terminate = True - break - - if terminate: - break - - nextLine += 1 - - string = state.getLines(startLine, nextLine, state.blkIndent, False).strip() + # string = state.getLines(startLine, nextLine, state.blkIndent, False).strip() maximum = len(string) labelEnd = None @@ -79,11 +36,20 @@ def reference(state: StateBlock, startLine: int, _endLine: int, silent: bool) -> labelEnd = pos break elif ch == 0x0A: # /* \n */ - lines += 1 + if (lineContent := getNextLine(state, nextLine)) is not None: + string += lineContent + maximum = len(string) + nextLine += 1 elif ch == 0x5C: # /* \ */ pos += 1 - if pos < maximum and charCodeAt(string, pos) == 0x0A: - lines += 1 + if ( + pos < maximum + and charCodeAt(string, pos) == 0x0A + and (lineContent := getNextLine(state, nextLine)) is not None + ): + string += lineContent + maximum = len(string) + nextLine += 1 pos += 1 if ( @@ -97,7 +63,10 @@ def reference(state: StateBlock, startLine: int, _endLine: int, silent: bool) -> while pos < maximum: ch = charCodeAt(string, pos) if ch == 0x0A: - lines += 1 + if (lineContent := getNextLine(state, nextLine)) is not None: + string += lineContent + maximum = len(string) + nextLine += 1 elif isSpace(ch): pass else: @@ -106,20 +75,19 @@ def reference(state: StateBlock, startLine: int, _endLine: int, silent: bool) -> # [label]: destination 'title' # ^^^^^^^^^^^ parse this - res = state.md.helpers.parseLinkDestination(string, pos, maximum) - if not res.ok: + destRes = state.md.helpers.parseLinkDestination(string, pos, maximum) + if not destRes.ok: return False - href = state.md.normalizeLink(res.str) + href = state.md.normalizeLink(destRes.str) if not state.md.validateLink(href): return False - pos = res.pos - lines += res.lines + pos = destRes.pos # save cursor state, we could require to rollback later destEndPos = pos - destEndLineNo = lines + destEndLineNo = nextLine # [label]: destination 'title' # ^^^ skipping those spaces @@ -127,7 +95,10 @@ def reference(state: StateBlock, startLine: int, _endLine: int, silent: bool) -> while pos < maximum: ch = charCodeAt(string, pos) if ch == 0x0A: - lines += 1 + if (lineContent := getNextLine(state, nextLine)) is not None: + string += lineContent + maximum = len(string) + nextLine += 1 elif isSpace(ch): pass else: @@ -136,15 +107,23 @@ def reference(state: StateBlock, startLine: int, _endLine: int, silent: bool) -> # [label]: destination 'title' # ^^^^^^^ parse this - res = state.md.helpers.parseLinkTitle(string, pos, maximum) - if pos < maximum and start != pos and res.ok: - title = res.str - pos = res.pos - lines += res.lines + titleRes = state.md.helpers.parseLinkTitle(string, pos, maximum, None) + while titleRes.can_continue: + if (lineContent := getNextLine(state, nextLine)) is None: + break + string += lineContent + pos = maximum + maximum = len(string) + nextLine += 1 + titleRes = state.md.helpers.parseLinkTitle(string, pos, maximum, titleRes) + + if pos < maximum and start != pos and titleRes.ok: + title = titleRes.str + pos = titleRes.pos else: title = "" pos = destEndPos - lines = destEndLineNo + nextLine = destEndLineNo # skip trailing spaces until the rest of the line while pos < maximum: @@ -158,7 +137,7 @@ def reference(state: StateBlock, startLine: int, _endLine: int, silent: bool) -> # but it could still be a valid reference if we roll back title = "" pos = destEndPos - lines = destEndLineNo + nextLine = destEndLineNo while pos < maximum: ch = charCodeAt(string, pos) if not isSpace(ch): @@ -181,7 +160,7 @@ def reference(state: StateBlock, startLine: int, _endLine: int, silent: bool) -> if "references" not in state.env: state.env["references"] = {} - state.line = startLine + lines + 1 + state.line = nextLine # note, this is not part of markdown-it JS, but is useful for renderers if state.md.options.get("inline_definitions", False): @@ -210,6 +189,47 @@ def reference(state: StateBlock, startLine: int, _endLine: int, silent: bool) -> } ) - state.parentType = oldParentType - return True + + +def getNextLine(state: StateBlock, nextLine: int) -> None | str: + endLine = state.lineMax + + if nextLine >= endLine or state.isEmpty(nextLine): + # empty line or end of input + return None + + isContinuation = False + + # this would be a code block normally, but after paragraph + # it's considered a lazy continuation regardless of what's there + if state.is_code_block(nextLine): + isContinuation = True + + # quirk for blockquotes, this line should already be checked by that rule + if state.sCount[nextLine] < 0: + isContinuation = True + + if not isContinuation: + terminatorRules = state.md.block.ruler.getRules("reference") + oldParentType = state.parentType + state.parentType = "reference" + + # Some tags can terminate paragraph without empty line. + terminate = False + for terminatorRule in terminatorRules: + if terminatorRule(state, nextLine, endLine, True): + terminate = True + break + + state.parentType = oldParentType + + if terminate: + # terminated by another block + return None + + pos = state.bMarks[nextLine] + state.tShift[nextLine] + maximum = state.eMarks[nextLine] + + # max + 1 explicitly includes the newline + return state.src[pos : maximum + 1] diff --git a/markdown_it/rules_inline/image.py b/markdown_it/rules_inline/image.py index b4a32a9f..005105b1 100644 --- a/markdown_it/rules_inline/image.py +++ b/markdown_it/rules_inline/image.py @@ -66,7 +66,7 @@ def image(state: StateInline, silent: bool) -> bool: # [link]( "title" ) # ^^^^^^^ parsing link title - res = state.md.helpers.parseLinkTitle(state.src, pos, state.posMax) + res = state.md.helpers.parseLinkTitle(state.src, pos, state.posMax, None) if pos < max and start != pos and res.ok: title = res.str pos = res.pos diff --git a/tests/test_port/fixtures/commonmark_extras.md b/tests/test_port/fixtures/commonmark_extras.md index 5d13d859..f0b31dbd 100644 --- a/tests/test_port/fixtures/commonmark_extras.md +++ b/tests/test_port/fixtures/commonmark_extras.md @@ -49,6 +49,40 @@ Reference labels: support ligatures (equivalent according to unicode case foldin

fffifl

. +Reference can be interrupted by other rules +. +[foo]: /url 'title + - - - +' + +[foo] +. +

[foo]: /url 'title

+
+

'

+

[foo]

+. + +Escape character in link reference title doesn't escape newlines +. +[foo]: /url " +hello +\ +\ +\ +world +" + +[foo] +. +

foo

+. Issue #35. `<` should work as punctuation . From dcbaeb55237dd4ebd5e511ca6cdd848e36af043a Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Sat, 9 Aug 2025 11:24:03 +0200 Subject: [PATCH 2/4] Update conf.py --- docs/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/conf.py b/docs/conf.py index 290eac7f..e468b853 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -52,6 +52,7 @@ ".*_NodeType", ".*Literal.*", ".*_Result", + ".*_State", "EnvType", "Path", "Ellipsis", From 4265be17429b22d40e4e1aeeb87524a039cd859a Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Sat, 9 Aug 2025 11:25:55 +0200 Subject: [PATCH 3/4] Update fuzz.yml --- .github/workflows/fuzz.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml index 5c5ed478..8529e3ab 100644 --- a/.github/workflows/fuzz.yml +++ b/.github/workflows/fuzz.yml @@ -27,7 +27,7 @@ jobs: with: oss-fuzz-project-name: 'markdown-it-py' language: python - fuzz-seconds: 60 + fuzz-seconds: 300 - name: Upload Crash uses: actions/upload-artifact@v4 if: failure() && steps.build.outcome == 'success' From aba376686115be15500ab6aba0cfb50c638908c5 Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Sat, 9 Aug 2025 13:05:16 +0200 Subject: [PATCH 4/4] Update fuzz.yml --- .github/workflows/fuzz.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml index 8529e3ab..5c5ed478 100644 --- a/.github/workflows/fuzz.yml +++ b/.github/workflows/fuzz.yml @@ -27,7 +27,7 @@ jobs: with: oss-fuzz-project-name: 'markdown-it-py' language: python - fuzz-seconds: 300 + fuzz-seconds: 60 - name: Upload Crash uses: actions/upload-artifact@v4 if: failure() && steps.build.outcome == 'success'