Skip to content

Commit 3cd7e33

Browse files
committed
fix(parser): bare autolinks no longer capture a trailing dot; nested autolinks are no longer detected
Bare autolinks no longer capture trailing sentence punctuation (. , ; ! ? :) or unbalanced closing parens. A new `bare_url()` PEG rule handles balanced parentheses so URLs like `https://en.wikipedia.org/wiki/Foo_(bar)` work correctly while `(see https://example.com)` keeps the `)` outside.
1 parent c81c45e commit 3cd7e33

File tree

13 files changed

+1247
-164
lines changed

13 files changed

+1247
-164
lines changed

acdc-parser/CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,18 @@ All notable changes to `acdc-parser` will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [Unreleased]
9+
10+
### Fixed
11+
12+
- **Bare autolinks no longer capture trailing punctuation** — URLs like
13+
`https://example.com.` now correctly exclude the trailing `.` from the link target.
14+
A new `bare_url()` rule with balanced parenthesis handling ensures sentence-level
15+
punctuation (`.`, `,`, `;`, `!`, `?`, `:`) and surrounding parens are not consumed.
16+
- **URL macro display text no longer produces nested autolinks** — display text in
17+
`http://example.com[http://example.com]` is now parsed with autolinks suppressed,
18+
preventing the inner URL from being double-linked.
19+
820
## [0.4.0] - 2026-02-07
921

1022
### Fixed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
http://example.com/test1[http://example.com/test1]
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
{
2+
"name": "document",
3+
"type": "block",
4+
"blocks": [
5+
{
6+
"name": "paragraph",
7+
"type": "block",
8+
"inlines": [
9+
{
10+
"name": "ref",
11+
"type": "inline",
12+
"variant": "link",
13+
"target": {
14+
"type": "url",
15+
"value": "http://example.com/test1"
16+
},
17+
"location": [
18+
{
19+
"line": 1,
20+
"col": 1
21+
},
22+
{
23+
"line": 1,
24+
"col": 50
25+
}
26+
],
27+
"attributes": {}
28+
}
29+
],
30+
"location": [
31+
{
32+
"line": 1,
33+
"col": 1
34+
},
35+
{
36+
"line": 1,
37+
"col": 50
38+
}
39+
]
40+
}
41+
],
42+
"location": [
43+
{
44+
"line": 1,
45+
"col": 1
46+
},
47+
{
48+
"line": 1,
49+
"col": 50
50+
}
51+
]
52+
}

acdc-parser/src/grammar/document.rs

Lines changed: 76 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ use crate::{
1717
inline_preprocessor::InlinePreprocessorParserState,
1818
inline_processing::{
1919
adjust_and_log_parse_error, parse_inlines, preprocess_inline_content, process_inlines,
20+
process_inlines_no_autolinks,
2021
},
2122
location_mapping::map_inline_locations,
2223
manpage::{
@@ -3555,9 +3556,12 @@ peg::parser! {
35553556
}
35563557

35573558
/// Parses one or more inline nodes, trying `non_plain_text` before `plain_text` at each
/// position. The added (`+`) version passes `true` as the third argument of both rules,
/// which is their `allow_autolinks` flag, so bare autolink detection stays enabled here.
pub(crate) rule inlines(offset: usize, block_metadata: &BlockParsingMetadata) -> Vec<InlineNode>
3558-
= (non_plain_text(offset, block_metadata) / plain_text(offset, block_metadata))+
3559+
= (non_plain_text(offset, block_metadata, true) / plain_text(offset, block_metadata, true))+
35593560

3560-
rule non_plain_text(offset: usize, block_metadata: &BlockParsingMetadata) -> InlineNode
3561+
/// Same shape as `inlines`, but passes `false` for the `allow_autolinks` flag of
/// `non_plain_text` and `plain_text`, so the `check_autolinks` gate in those rules fails
/// and `inline_autolink` is never tried. Text that looks like a URL is therefore not
/// turned into a link by the autolink alternative.
pub(crate) rule inlines_no_autolinks(offset: usize, block_metadata: &BlockParsingMetadata) -> Vec<InlineNode>
3562+
= (non_plain_text(offset, block_metadata, false) / plain_text(offset, block_metadata, false))+
3563+
3564+
rule non_plain_text(offset: usize, block_metadata: &BlockParsingMetadata, allow_autolinks: bool) -> InlineNode
35613565
= inline:(
35623566
// Escaped superscript/subscript must come first - produces RawText to prevent re-parsing
35633567
escaped_super_sub:escaped_superscript_subscript(offset) { escaped_super_sub }
@@ -3587,7 +3591,7 @@ peg::parser! {
35873591
/ url_macro:url_macro(offset, block_metadata) { url_macro }
35883592
/ pass:inline_pass(offset) { pass }
35893593
/ link_macro:link_macro(offset) { link_macro }
3590-
/ inline_autolink:inline_autolink(offset) { inline_autolink }
3594+
/ check_autolinks(allow_autolinks) inline_autolink:inline_autolink(offset) { inline_autolink }
35913595
/ inline_line_break:inline_line_break(offset) { inline_line_break }
35923596
/ bold_text_unconstrained:bold_text_unconstrained(offset, block_metadata) { bold_text_unconstrained }
35933597
/ bold_text_constrained:bold_text_constrained(offset, block_metadata) { bold_text_constrained }
@@ -4000,7 +4004,7 @@ peg::parser! {
40004004
}
40014005
}
40024006
let text = if let Some(text) = text {
4003-
process_inlines(state, block_metadata, &start, end, offset, &text)
4007+
process_inlines_no_autolinks(state, block_metadata, &start, end, offset, &text)
40044008
.map_err(|e| {
40054009
tracing::error!(?e, url_text = text, "could not process URL macro text");
40064010
"could not process URL macro text"
@@ -4048,7 +4052,7 @@ peg::parser! {
40484052
}
40494053
}
40504054
let text = if let Some(text) = text {
4051-
process_inlines(state, block_metadata, &start, end, offset, &text)
4055+
process_inlines_no_autolinks(state, block_metadata, &start, end, offset, &text)
40524056
.map_err(|e| {
40534057
tracing::error!(?e, url_text = text, "could not process mailto macro text");
40544058
"could not process mailto macro text"
@@ -4065,12 +4069,16 @@ peg::parser! {
40654069
})))
40664070
}
40674071

4072+
/// Gate placed in front of `inline_autolink`. The rule has no pattern, only a
/// conditional action: it succeeds when `allow` is true and fails with the
/// message "autolinks suppressed" when `allow` is false, which makes the
/// enclosing alternative fail so the parser moves on to the next one.
rule check_autolinks(allow: bool) -> ()
4073+
= {? if allow { Ok(()) } else { Err("autolinks suppressed") } }
4074+
40684075
rule inline_autolink(offset: usize) -> InlineNode
4069-
= start:position!()
4076+
=
4077+
start:position!()
40704078
url_info:(
40714079
"<" url:url() ">" { (url, true) }
40724080
/ "<" url:email_address() ">" { (format!("mailto:{url}"), true) }
4073-
/ url:url() { (url, false) }
4081+
/ url:bare_url() { (url, false) }
40744082
/ url:email_address() { (format!("mailto:{url}"), false) }
40754083
)
40764084
end:position!()
@@ -4299,7 +4307,7 @@ peg::parser! {
42994307
if trimmed.is_empty() {
43004308
vec![]
43014309
} else {
4302-
process_inlines(state, block_metadata, &start, end, offset, trimmed)
4310+
process_inlines_no_autolinks(state, block_metadata, &start, end, offset, trimmed)
43034311
.map_err(|e| {
43044312
tracing::error!(?e, xref_text = trimmed, "could not process xref text");
43054313
"could not process xref text"
@@ -4331,7 +4339,7 @@ peg::parser! {
43314339
let text = if raw_text.is_empty() {
43324340
vec![]
43334341
} else {
4334-
process_inlines(state, block_metadata, &start, end, offset, raw_text)
4342+
process_inlines_no_autolinks(state, block_metadata, &start, end, offset, raw_text)
43354343
.map_err(|e| {
43364344
tracing::error!(?e, xref_text = raw_text, "could not process xref text");
43374345
"could not process xref text"
@@ -4860,14 +4868,14 @@ peg::parser! {
48604868
}))
48614869
}
48624870

4863-
rule plain_text(offset: usize, block_metadata: &BlockParsingMetadata) -> InlineNode
4871+
rule plain_text(offset: usize, block_metadata: &BlockParsingMetadata, allow_autolinks: bool) -> InlineNode
48644872
= start_pos:position!()
48654873
content:$((
48664874
// Escape sequences for superscript/subscript markers - only when NOT followed by
48674875
// a complete pattern (those are handled by escaped_superscript_subscript rule)
48684876
"\\" "^" !([^'^' | ' ' | '\t' | '\n']+ "^")
48694877
/ "\\" "~" !([^'~' | ' ' | '\t' | '\n']+ "~")
4870-
/ (!(eol()*<2,> / ![_] / escaped_syntax_match() / index_term_match() / inline_anchor_match() / cross_reference_shorthand_match() / cross_reference_macro_match() / hard_wrap(offset) / footnote_match(offset, block_metadata) / inline_image(start_pos, block_metadata) / inline_icon(start_pos, block_metadata) / inline_stem(start_pos) / inline_keyboard(start_pos) / inline_button(start_pos) / inline_menu(start_pos) / mailto_macro(start_pos, block_metadata) / url_macro(start_pos, block_metadata) / inline_pass(start_pos) / link_macro(start_pos) / inline_autolink(start_pos) / inline_line_break(start_pos) / bold_text_unconstrained(start_pos, block_metadata) / bold_text_constrained_match() / italic_text_unconstrained(start_pos, block_metadata) / italic_text_constrained_match() / monospace_text_unconstrained(start_pos, block_metadata) / monospace_text_constrained_match() / highlight_text_unconstrained(start_pos, block_metadata) / highlight_text_constrained_match() / superscript_text(start_pos, block_metadata) / subscript_text(start_pos, block_metadata) / curved_quotation_text(start_pos, block_metadata) / curved_apostrophe_text(start_pos, block_metadata) / standalone_curved_apostrophe(start_pos, block_metadata)) [_])
4878+
/ (!(eol()*<2,> / ![_] / escaped_syntax_match() / index_term_match() / inline_anchor_match() / cross_reference_shorthand_match() / cross_reference_macro_match() / hard_wrap(offset) / footnote_match(offset, block_metadata) / inline_image(start_pos, block_metadata) / inline_icon(start_pos, block_metadata) / inline_stem(start_pos) / inline_keyboard(start_pos) / inline_button(start_pos) / inline_menu(start_pos) / mailto_macro(start_pos, block_metadata) / url_macro(start_pos, block_metadata) / inline_pass(start_pos) / link_macro(start_pos) / (check_autolinks(allow_autolinks) inline_autolink(start_pos)) / inline_line_break(start_pos) / bold_text_unconstrained(start_pos, block_metadata) / bold_text_constrained_match() / italic_text_unconstrained(start_pos, block_metadata) / italic_text_constrained_match() / monospace_text_unconstrained(start_pos, block_metadata) / monospace_text_constrained_match() / highlight_text_unconstrained(start_pos, block_metadata) / highlight_text_constrained_match() / superscript_text(start_pos, block_metadata) / subscript_text(start_pos, block_metadata) / curved_quotation_text(start_pos, block_metadata) / curved_apostrophe_text(start_pos, block_metadata) / standalone_curved_apostrophe(start_pos, block_metadata)) [_])
48714879
)+)
48724880
end:position!()
48734881
{
@@ -5590,6 +5598,63 @@ peg::parser! {
55905598
Ok(strip_url_backslash_escapes(&processed.text))
55915599
}
55925600

5601+
/// URL for bare autolinks, i.e. the `inline_autolink` alternative that is not wrapped in `<...>`.
5602+
/// Matches one of the schemes `https`, `http`, `ftp` or `irc`, then `://`, then a
/// `bare_url_path()`, and returns the pieces re-joined as `{proto}://{path}`.
/// Trailing sentence punctuation (`.`, `;`, `!`, etc.) is kept out of the link because
/// `bare_url_path()` only consumes punctuation when more URL chars follow.
5603+
rule bare_url() -> String =
5604+
// "https" is listed before "http": with ordered choice, "http" would otherwise match the
// prefix of "https" and the following "://" would then fail on the leftover "s".
proto:$("https" / "http" / "ftp" / "irc") "://" path:bare_url_path()
5605+
{ format!("{proto}://{path}") }
5606+
5607+
/// URL path for bare autolinks. Like url_path() but:
5608+
/// - Trailing punctuation (. , ; ! ? : ' *) is consumed only as a whole run that is
///   immediately followed by a non-punctuation URL char or `(`. A sentence ending such as
///   `http://example.com...` or `http://example.com?!` therefore leaves every mark
///   outside the link, not just the last one.
5609+
/// - `)` only consumed as part of a balanced `(...)` group, preventing capture of
5610+
/// sentence-level parens like `(see http://example.com)`.
///
/// The path must start with a `bare_url_safe_char()`. The matched text is passed through
/// `inline_preprocessing::run` with the document attributes, any warnings raised there are
/// forwarded to the parser state, and the result of `strip_url_backslash_escapes` on the
/// processed text is returned. The rule fails with "could not preprocess bare url path"
/// when preprocessing returns an error.
5611+
rule bare_url_path() -> String = path:$(
5612+
bare_url_safe_char()
5613+
( bare_url_safe_char()
5614+
/ bare_url_paren_group()
5615+
// A lone "(" is accepted when no balanced group could be matched at this position.
5615+
/ "("
5616+
// `+` takes the whole punctuation run at once. After a greedy run the next char cannot
// be punctuation, so the lookahead only succeeds on a safe char or "(". Checking one
// mark at a time let each mark vouch for the previous one and captured `..` out of `...`.
/ bare_url_trailing_char()+ &bare_url_char()
5617+
)*
5618+
)
5619+
{?
5620+
// Fresh preprocessor state scoped to the matched path text only.
let inline_state = InlinePreprocessorParserState::new(
5621+
path,
5622+
state.line_map.clone(),
5623+
&state.input,
5624+
);
5625+
let processed = inline_preprocessing::run(path, &state.document_attributes, &inline_state)
5626+
.map_err(|e| {
5627+
tracing::error!(?e, "could not preprocess bare url path");
5628+
"could not preprocess bare url path"
5629+
})?;
5630+
// Surface warnings collected by the nested preprocessor on the outer parser state.
for warning in inline_state.drain_warnings() {
5631+
state.add_warning(warning);
5632+
}
5633+
Ok(strip_url_backslash_escapes(&processed.text))
5634+
}
5635+
5636+
/// Balanced parenthesized group in a URL path: `(`, any number of inner items, then `)`.
5637+
/// Handles nested parens: `http://example.com/wiki/Foo_(bar_(baz))`
5638+
/// Among the bare-URL character rules this is the only one that consumes `)`, so a `)`
/// without a matching `(` inside the URL is never captured.
/// Inside the group, punctuation from `bare_url_trailing_char()` is accepted without any
/// lookahead, and a lone `(` is accepted when a nested group cannot be matched.
5639+
rule bare_url_paren_group()
5640+
= "(" (bare_url_safe_char() / bare_url_trailing_char() / bare_url_paren_group() / "(")* ")"
5641+
5642+
/// URL chars that may end a bare URL because they are not sentence punctuation:
/// ASCII letters and digits plus `-` `_` `~` `/` `#` `@` `$` `&` `+` `=` `%` and backslash.
5643+
/// Excludes `(` and `)` which are handled separately via `bare_url_paren_group`.
/// NOTE(review): backslash is presumably allowed so escapes survive until
/// `strip_url_backslash_escapes` runs in `bare_url_path` — confirm.
5644+
rule bare_url_safe_char() = ['A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_' | '~'
5645+
| '/' | '#' | '@' | '$' | '&'
5646+
| '+' | '=' | '%' | '\\']
5647+
5648+
/// Punctuation that is valid in the middle of a URL but should not end a bare URL:
/// `.` `,` `;` `!` `?` `:` `'` `*`.
5649+
/// Excludes `)` which is only consumed via balanced `bare_url_paren_group`.
5650+
rule bare_url_trailing_char() = ['.' | ',' | ';' | '!' | '?' | ':' | '\'' | '*']
5651+
5652+
/// Any single char that can continue a bare URL path: a safe char, a punctuation char,
/// or `(`. Used as the lookahead target in `bare_url_path`.
5653+
/// Includes `(` because it can start a paren group.
5654+
/// Excludes `)` so that punctuation directly before a `)` is not consumed
5655+
/// (e.g., `http://example.com.)` keeps both `.` and `)` outside).
5656+
rule bare_url_char() = bare_url_safe_char() / bare_url_trailing_char() / "("
5657+
55935658
/// Filesystem path - conservative character set for cross-platform compatibility
55945659
/// Includes '{' and '}' for `AsciiDoc` attribute substitution
55955660
pub rule path() -> String = path:$(['A'..='Z' | 'a'..='z' | '0'..='9' | '{' | '}' | '_' | '-' | '.' | '/' | '\\' ]+)

acdc-parser/src/grammar/inline_processing.rs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,38 @@ pub(crate) fn parse_inlines(
137137
Ok(inlines)
138138
}
139139

140+
/// Parse already-preprocessed inline content with bare autolink detection disabled.
///
/// Builds a fresh `ParserState` over `processed.text`, copies the document attributes and
/// footnote tracker from `state` into it, and runs the `inlines_no_autolinks` grammar rule
/// from offset 0. On success the footnote tracker of the temporary state is copied back
/// into `state` and the parsed nodes are returned.
///
/// # Errors
///
/// When the grammar rule fails, returns the error produced by `adjust_peg_error_position`
/// for that failure, using `location.absolute_start` as the base position.
#[tracing::instrument(skip_all, fields(processed=?processed, block_metadata=?block_metadata))]
141+
pub(crate) fn parse_inlines_no_autolinks(
142+
processed: &ProcessedContent,
143+
state: &mut ParserState,
144+
block_metadata: &BlockParsingMetadata,
145+
location: &Location,
146+
) -> Result<Vec<InlineNode>, Error> {
147+
// Separate parser state for the nested parse; only attributes and footnotes are carried over.
let mut inline_peg_state = ParserState::new(&processed.text);
148+
inline_peg_state.document_attributes = state.document_attributes.clone();
149+
inline_peg_state.footnote_tracker = state.footnote_tracker.clone();
150+
151+
let inlines = match document_parser::inlines_no_autolinks(
152+
&processed.text,
153+
&mut inline_peg_state,
154+
0,
155+
block_metadata,
156+
) {
157+
Ok(inlines) => inlines,
158+
Err(err) => {
159+
// Early return: the footnote tracker in `state` is left untouched on failure.
return Err(adjust_peg_error_position(
160+
&err,
161+
&processed.text,
162+
location.absolute_start,
163+
state,
164+
));
165+
}
166+
};
167+
168+
// NOTE(review): presumably keeps footnotes found in the nested content visible to the
// caller's state — confirm against `parse_inlines`.
state.footnote_tracker = inline_peg_state.footnote_tracker.clone();
169+
Ok(inlines)
170+
}
171+
140172
/// Process inlines
141173
///
142174
/// This function processes inline content by first preprocessing it and then parsing it
@@ -161,3 +193,25 @@ pub(crate) fn process_inlines(
161193
let content = parse_inlines(&processed, state, block_metadata, &location)?;
162194
super::location_mapping::map_inline_locations(state, &processed, &content, &location)
163195
}
196+
197+
/// Process inlines with autolinks suppressed.
198+
///
199+
/// Used inside URL macros, mailto macros, and cross-references where nested
200+
/// autolinks would cause incorrect parsing.
///
/// Runs `preprocess_inline_content` on `content`, returns an empty vector when the
/// preprocessed text is empty or whitespace-only, otherwise parses it with
/// `parse_inlines_no_autolinks` and returns the result of `map_inline_locations`
/// for the parsed nodes.
///
/// # Errors
///
/// Propagates any error from preprocessing, parsing, or location mapping.
201+
#[tracing::instrument(skip_all, fields(?content_start, end, offset))]
202+
pub(crate) fn process_inlines_no_autolinks(
203+
state: &mut ParserState,
204+
block_metadata: &BlockParsingMetadata,
205+
content_start: &PositionWithOffset,
206+
end: usize,
207+
offset: usize,
208+
content: &str,
209+
) -> Result<Vec<InlineNode>, Error> {
210+
let (location, processed) =
211+
preprocess_inline_content(state, content_start, end, offset, content)?;
212+
// Nothing to parse: skip the grammar entirely for blank content.
if processed.text.trim().is_empty() {
213+
return Ok(Vec::new());
214+
}
215+
let content = parse_inlines_no_autolinks(&processed, state, block_metadata, &location)?;
216+
super::location_mapping::map_inline_locations(state, &processed, &content, &location)
217+
}

0 commit comments

Comments
 (0)