From 3fa0662de45295739ab34827ff0a9eed3b049876 Mon Sep 17 00:00:00 2001 From: Matt Linville Date: Mon, 20 Oct 2025 12:49:39 -0700 Subject: [PATCH 1/3] fix: fix URL parsing in docstrings Fixes #95 - Restrict _RE_ARGSTART pattern to valid Python identifiers only - Add heuristics to detect non-argument lines containing URLs - Prevent lines with URLs from being incorrectly parsed as arguments This fixes the issue where URLs in docstrings were being split and wrapped with incorrect HTML tags. --- src/lazydocs/generation.py | 41 +++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/src/lazydocs/generation.py b/src/lazydocs/generation.py index 7013dd9..1bc0ce9 100755 --- a/src/lazydocs/generation.py +++ b/src/lazydocs/generation.py @@ -33,7 +33,8 @@ ) _RE_TYPED_ARGSTART = re.compile(r"^([\w\[\]_]{1,}?)[ ]*?\((.*?)\):[ ]+(.{2,})", re.IGNORECASE) -_RE_ARGSTART = re.compile(r"^(.+):[ ]+(.{2,})$", re.IGNORECASE) +# Restrict to valid Python identifier-like patterns to avoid matching URLs +_RE_ARGSTART = re.compile(r"^([\w\[\]_]+):[ ]+(.{2,})$", re.IGNORECASE) _RE_CODE_TEXT = re.compile(r"^```[\w\-\.]*[ ]*$", re.IGNORECASE) @@ -583,11 +584,41 @@ def _lines_isvalid(lines: list, start_index: int, blockindent: int, argindent = indent elif arg_list and not literal_block and _RE_ARGSTART.match(line): # start of an exception-type block - out.append( - "- " - + _RE_ARGSTART.sub(r"`\1`: \2", line) + # Check if this looks like a URL being incorrectly parsed + match = _RE_ARGSTART.match(line) + # Check if the part before the colon contains URL indicators or + # is likely descriptive text rather than an argument name + before_colon = match.group(1) if match else "" + after_colon = match.group(2) if match else "" + + # Heuristics to detect non-argument lines: + # 1. The text before colon contains "http" (part of a URL) + # 2. The line contains "://" (URL protocol) + # 3. The text before colon is too long to be an argument name (>40 chars) + # 4. The text before colon contains common English words that aren't argument names + is_not_argument = ( + "http" in before_colon.lower() or + "://" in line or + len(before_colon) > 40 or + # Check for common descriptive phrases (without trailing space) + any(word in before_colon.lower() for word in ["see", "to find", "refer", "documentation", "available"]) ) - argindent = indent + + if match and is_not_argument: + # This is likely descriptive text with a colon, not an argument + # Treat it as regular text continuation + if argindent > 0: + padding = max(indent - argindent + offset, 0) + out.append(" " * padding + line.replace("\n", "\n" + " " * padding)) + else: + out.append(line) + else: + # This is a real argument + out.append( + "- " + + _RE_ARGSTART.sub(r"`\1`: \2", line) + ) + argindent = indent elif indent > argindent: # attach docs text of argument # * (blockindent + 2) From 7226bd220738a301f39534745b16b20612759929 Mon Sep 17 00:00:00 2001 From: Matt Linville Date: Mon, 20 Oct 2025 12:58:42 -0700 Subject: [PATCH 2/3] fix: address code quality issues - Remove unused variable 'after_colon' - Extract common_words list to avoid long line - Improve code readability --- src/lazydocs/generation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/lazydocs/generation.py b/src/lazydocs/generation.py index 1bc0ce9..2da18af 100755 --- a/src/lazydocs/generation.py +++ b/src/lazydocs/generation.py @@ -589,19 +589,18 @@ def _lines_isvalid(lines: list, start_index: int, blockindent: int, # Check if the part before the colon contains URL indicators or # is likely descriptive text rather than an argument name before_colon = match.group(1) if match else "" - after_colon = match.group(2) if match else "" # Heuristics to detect non-argument lines: # 1. The text before colon contains "http" (part of a URL) # 2. The line contains "://" (URL protocol) # 3. The text before colon is too long to be an argument name (>40 chars) # 4. The text before colon contains common English words that aren't argument names + common_words = ["see", "to find", "refer", "documentation", "available"] is_not_argument = ( "http" in before_colon.lower() or "://" in line or len(before_colon) > 40 or - # Check for common descriptive phrases (without trailing space) - any(word in before_colon.lower() for word in ["see", "to find", "refer", "documentation", "available"]) + any(word in before_colon.lower() for word in common_words) ) if match and is_not_argument: From 58126142f056d502b49f5165ee6c58c0d9ee49cd Mon Sep 17 00:00:00 2001 From: Matt Linville Date: Mon, 20 Oct 2025 13:00:57 -0700 Subject: [PATCH 3/3] fix: remove trailing whitespace - Remove trailing whitespace from line 589 --- src/lazydocs/generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lazydocs/generation.py b/src/lazydocs/generation.py index 2da18af..c9d00a9 100755 --- a/src/lazydocs/generation.py +++ b/src/lazydocs/generation.py @@ -586,7 +586,7 @@ def _lines_isvalid(lines: list, start_index: int, blockindent: int, # start of an exception-type block # Check if this looks like a URL being incorrectly parsed match = _RE_ARGSTART.match(line) - # Check if the part before the colon contains URL indicators or + # Check if the part before the colon contains URL indicators or # is likely descriptive text rather than an argument name before_colon = match.group(1) if match else ""