Skip to content

Commit 7f7f5a8

Browse files
committed
fix(rules): support multi-line code spans in MkDocs containers
Replace the per-line backtick scanner with a dedent-and-reparse approach that delegates all CommonMark parsing to pulldown-cmark. This handles multi-line code spans natively without manual reimplementation of CommonMark edge cases. The approach: identify contiguous runs of MkDocs container lines, strip the container indentation, reparse with pulldown-cmark, then map byte offsets back to the original document. Container openers (admonition markers, tab markers) are excluded from the minimum indent calculation so nested containers (e.g., admonition inside content tab) are handled correctly. Also fix MD034 and MD052 code span filters to use byte-offset comparison instead of single-line-only line/column checks, which correctly handles multi-line code spans for both URL and email detection.
1 parent ff9237e commit 7f7f5a8

File tree

5 files changed

+506
-20
lines changed

5 files changed

+506
-20
lines changed

src/lint_context/element_parsers.rs

Lines changed: 120 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
use crate::config::MarkdownFlavor;
22
use crate::utils::code_block_utils::CodeBlockUtils;
3+
use crate::utils::mkdocs_admonitions;
4+
use crate::utils::mkdocs_tabs;
35
use crate::utils::regex_cache::URL_SIMPLE_REGEX;
46
use pulldown_cmark::{Event, Options, Parser};
57
use regex::Regex;
@@ -11,7 +13,11 @@ use super::types::*;
1113
static BARE_EMAIL_PATTERN: LazyLock<Regex> =
1214
LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
1315

14-
/// Parse all inline code spans in the content using pulldown-cmark streaming parser
16+
/// Parse all inline code spans in the content using pulldown-cmark streaming parser.
17+
///
18+
/// Note: For MkDocs content, `scan_mkdocs_container_code_spans()` must be called separately
19+
/// to detect code spans that pulldown-cmark misses inside 4-space-indented containers.
20+
/// This is done during LintContext construction in mod.rs.
1521
pub(super) fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpan> {
1622
// Quick check - if no backticks, no code spans
1723
if !content.contains('`') {
@@ -31,6 +37,119 @@ pub(super) fn parse_code_spans(content: &str, lines: &[LineInfo]) -> Vec<CodeSpa
3137
build_code_spans_from_ranges(content, lines, &ranges)
3238
}
3339

40+
/// Scan MkDocs container lines for code spans that pulldown-cmark missed.
41+
///
42+
/// pulldown-cmark treats 4-space-indented MkDocs content (admonitions, content tabs,
43+
/// markdown HTML blocks) as indented code blocks, so it never emits `Event::Code` for
44+
/// backtick spans within those regions. This function dedents contiguous runs of container
45+
/// lines and reparses them with pulldown-cmark, which correctly handles both single-line
46+
/// and multi-line code spans including all CommonMark edge cases.
47+
pub(super) fn scan_mkdocs_container_code_spans(
48+
content: &str,
49+
lines: &[LineInfo],
50+
existing_ranges: &[(usize, usize)],
51+
) -> Vec<CodeSpan> {
52+
let mut extra_ranges: Vec<(usize, usize)> = Vec::new();
53+
54+
// Process contiguous runs of MkDocs container lines
55+
let mut i = 0;
56+
while i < lines.len() {
57+
// Find start of a container run
58+
if !lines[i].in_mkdocs_container() || lines[i].in_code_block {
59+
i += 1;
60+
continue;
61+
}
62+
63+
// Collect the contiguous run
64+
let run_start = i;
65+
while i < lines.len() && lines[i].in_mkdocs_container() && !lines[i].in_code_block {
66+
i += 1;
67+
}
68+
let run_end = i;
69+
70+
// Quick check: any backticks in this run?
71+
let has_backticks = lines[run_start..run_end]
72+
.iter()
73+
.any(|li| li.content(content).contains('`'));
74+
if !has_backticks {
75+
continue;
76+
}
77+
78+
// Compute minimum indentation across content lines only.
79+
// Container openers (e.g., `=== "Tab"`, `!!! note`) are structural markers
80+
// that should be excluded from the min_indent calculation. For nested
81+
// containers (admonition inside content tab), including openers would
82+
// prevent stripping enough indent from deeply nested content, causing
83+
// pulldown-cmark to misinterpret it as indented code blocks.
84+
let min_indent = lines[run_start..run_end]
85+
.iter()
86+
.filter(|li| {
87+
if li.is_blank || li.indent == 0 {
88+
return false;
89+
}
90+
let line_text = li.content(content);
91+
// Exclude container openers from min_indent calculation
92+
if mkdocs_admonitions::is_admonition_start(line_text) || mkdocs_tabs::is_tab_marker(line_text) {
93+
return false;
94+
}
95+
true
96+
})
97+
.map(|li| li.indent)
98+
.min()
99+
.unwrap_or(0);
100+
101+
// Build dedented string and line map for offset translation.
102+
// Each entry: (byte offset in dedented string, byte offset in original document)
103+
let mut dedented = String::new();
104+
let mut line_map: Vec<(usize, usize)> = Vec::new();
105+
106+
for li in &lines[run_start..run_end] {
107+
let dedented_line_start = dedented.len();
108+
let line_content = li.content(content);
109+
let bytes_to_strip = min_indent.min(li.indent);
110+
let stripped = &line_content[bytes_to_strip..];
111+
let original_start = li.byte_offset + bytes_to_strip;
112+
line_map.push((dedented_line_start, original_start));
113+
dedented.push_str(stripped);
114+
dedented.push('\n');
115+
}
116+
117+
// Parse the dedented string with pulldown-cmark
118+
let parser = Parser::new(&dedented).into_offset_iter();
119+
for (event, range) in parser {
120+
if let Event::Code(_) = event {
121+
let orig_start = dedented_to_original(range.start, &line_map);
122+
let orig_end = dedented_to_original(range.end, &line_map);
123+
124+
// Skip ranges already detected by the initial pulldown-cmark pass
125+
let overlaps = existing_ranges.iter().any(|&(s, e)| s < orig_end && e > orig_start);
126+
if !overlaps {
127+
extra_ranges.push((orig_start, orig_end));
128+
}
129+
}
130+
}
131+
}
132+
133+
if extra_ranges.is_empty() {
134+
return Vec::new();
135+
}
136+
137+
extra_ranges.sort_unstable_by_key(|&(start, _)| start);
138+
build_code_spans_from_ranges(content, lines, &extra_ranges)
139+
}
140+
141+
/// Convert a byte offset in the dedented string back to the original document offset.
///
/// `line_map` entries are `(dedented_line_start, original_line_start)` and must be sorted
/// by `dedented_line_start` in ascending order, one entry per dedented line.
///
/// The offset is resolved against the last line whose dedented start is not greater than
/// `dedented_offset`; its distance from that line start is then added to the line's
/// original start.
///
/// Degenerate inputs are handled without panicking:
/// * an empty `line_map` returns `dedented_offset` unchanged;
/// * an offset that precedes the first entry maps to that entry's original start.
fn dedented_to_original(dedented_offset: usize, line_map: &[(usize, usize)]) -> usize {
    // partition_point yields the count of entries starting at or before the offset, so the
    // owning line is the one just before it (clamped to the first entry).
    let idx = line_map
        .partition_point(|&(ds, _)| ds <= dedented_offset)
        .saturating_sub(1);

    // Checked lookup: an empty map has no line to resolve against.
    let Some(&(dedented_line_start, original_line_start)) = line_map.get(idx) else {
        return dedented_offset;
    };

    // Saturate so an offset before the first mapped line cannot underflow.
    original_line_start + dedented_offset.saturating_sub(dedented_line_start)
}
152+
34153
pub(super) fn build_code_spans_from_ranges(
35154
content: &str,
36155
lines: &[LineInfo],

src/lint_context/mod.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,12 +294,27 @@ impl<'a> LintContext<'a> {
294294
}
295295

296296
// Parse code spans early so we can exclude them from link/image parsing
297-
let code_spans = profile_section!(
297+
let mut code_spans = profile_section!(
298298
"Code spans",
299299
profile,
300300
element_parsers::build_code_spans_from_ranges(content, &lines, &code_span_ranges)
301301
);
302302

303+
// Supplement code spans for MkDocs container content that pulldown-cmark missed.
304+
// pulldown-cmark treats 4-space-indented MkDocs content as indented code blocks,
305+
// so backtick code spans within admonitions/tabs/markdown HTML are invisible to it.
306+
if flavor == MarkdownFlavor::MkDocs {
307+
let extra = profile_section!(
308+
"MkDocs code spans",
309+
profile,
310+
element_parsers::scan_mkdocs_container_code_spans(content, &lines, &code_span_ranges,)
311+
);
312+
if !extra.is_empty() {
313+
code_spans.extend(extra);
314+
code_spans.sort_by_key(|span| span.byte_offset);
315+
}
316+
}
317+
303318
// Mark lines that are continuations of multi-line code spans
304319
// This is needed for parse_list_blocks to correctly handle list items with multi-line code spans
305320
for span in &code_spans {

src/rules/md034_no_bare_urls.rs

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -392,10 +392,10 @@ impl MD034NoBareUrls {
392392
continue;
393393
}
394394

395-
// Check if email is inside a code span
395+
// Check if email is inside a code span (byte offsets handle multi-line spans)
396396
let is_in_code_span = code_spans
397397
.iter()
398-
.any(|span| span.line == line_number && start >= span.start_col && start < span.end_col);
398+
.any(|span| absolute_pos >= span.byte_offset && absolute_pos < span.byte_end);
399399

400400
if !is_in_code_span {
401401
let email_len = end - start;
@@ -485,14 +485,19 @@ impl Rule for MD034NoBareUrls {
485485
let mut line_warnings =
486486
self.check_line(line.content, ctx, line.line_num, &code_spans, &mut buffers, line_index);
487487

488-
// Filter out warnings that are inside code spans
488+
// Filter out warnings that are inside code spans (handles multi-line spans via byte offsets)
489489
line_warnings.retain(|warning| {
490-
// Check if the URL is inside a code span
491490
!code_spans.iter().any(|span| {
492-
span.line == warning.line &&
493-
warning.column > 0 && // column is 1-indexed
494-
(warning.column - 1) >= span.start_col &&
495-
(warning.column - 1) < span.end_col
491+
if let Some(fix) = &warning.fix {
492+
// Byte-offset check handles both single-line and multi-line code spans
493+
fix.range.start >= span.byte_offset && fix.range.start < span.byte_end
494+
} else {
495+
span.line == warning.line
496+
&& span.end_line == warning.line
497+
&& warning.column > 0
498+
&& (warning.column - 1) >= span.start_col
499+
&& (warning.column - 1) < span.end_col
500+
}
496501
})
497502
});
498503

src/rules/md052_reference_links_images.rs

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -262,11 +262,11 @@ impl MD052ReferenceLinkImages {
262262
false
263263
}
264264

265-
/// Check if a position is inside any code span
266-
fn is_in_code_span(line: usize, col: usize, code_spans: &[crate::lint_context::CodeSpan]) -> bool {
265+
/// Check if a byte position is inside any code span
266+
fn is_in_code_span(byte_pos: usize, code_spans: &[crate::lint_context::CodeSpan]) -> bool {
267267
code_spans
268268
.iter()
269-
.any(|span| span.line == line && col >= span.start_col && col < span.end_col)
269+
.any(|span| byte_pos >= span.byte_offset && byte_pos < span.byte_end)
270270
}
271271

272272
/// Check if a byte position is within an HTML tag
@@ -340,7 +340,7 @@ impl MD052ReferenceLinkImages {
340340
}
341341

342342
// Skip links inside code spans
343-
if Self::is_in_code_span(link.line, link.start_col, &code_spans) {
343+
if Self::is_in_code_span(link.byte_offset, &code_spans) {
344344
continue;
345345
}
346346

@@ -447,7 +447,7 @@ impl MD052ReferenceLinkImages {
447447
}
448448

449449
// Skip images inside code spans
450-
if Self::is_in_code_span(image.line, image.start_col, &code_spans) {
450+
if Self::is_in_code_span(image.byte_offset, &code_spans) {
451451
continue;
452452
}
453453

@@ -709,17 +709,15 @@ impl MD052ReferenceLinkImages {
709709
if !references.contains(&reference_lower) && !reported_refs.contains_key(&reference_lower) {
710710
let full_match = cap.get(0).unwrap();
711711
let col = full_match.start();
712+
let line_start_byte = ctx.line_offsets[line_num];
713+
let byte_pos = line_start_byte + col;
712714

713715
// Skip if inside code span
714716
let code_spans = ctx.code_spans();
715-
if Self::is_in_code_span(line_num + 1, col, &code_spans) {
717+
if Self::is_in_code_span(byte_pos, &code_spans) {
716718
continue;
717719
}
718720

719-
// Check if this position is within a covered range
720-
let line_start_byte = ctx.line_offsets[line_num];
721-
let byte_pos = line_start_byte + col;
722-
723721
// Skip if inside Jinja template
724722
if ctx.is_in_jinja_range(byte_pos) {
725723
continue;

0 commit comments

Comments
 (0)