Skip to content

Commit 5057787

Browse files
authored
Merge pull request rust-lang#20854 from epage/frontmatter
feat(parser): Don't error on frontmatter
2 parents 2edfc82 + 77d9b8e commit 5057787

File tree

12 files changed

+407
-9
lines changed

12 files changed

+407
-9
lines changed

src/tools/rust-analyzer/Cargo.lock

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1583,6 +1583,7 @@ dependencies = [
15831583
"rustc-literal-escaper 0.0.4",
15841584
"stdx",
15851585
"tracing",
1586+
"winnow",
15861587
]
15871588

15881589
[[package]]

src/tools/rust-analyzer/crates/parser/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ rustc-literal-escaper.workspace = true
1919
tracing = { workspace = true, optional = true }
2020

2121
edition.workspace = true
22+
winnow = { version = "0.7.13", default-features = false }
2223

2324
[dev-dependencies]
2425
expect-test = "1.5.1"
Lines changed: 348 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,348 @@
1+
// Copied from https://github.com/rust-lang/cargo/blob/367fd9f213750cd40317803dd0a5a3ce3f0c676d/src/cargo/util/frontmatter.rs
2+
#![expect(dead_code)] // avoid editing
3+
#![expect(unreachable_pub)] // avoid editing
4+
#![expect(clippy::useless_format)] // avoid editing
5+
6+
/// Byte range into the raw source text.
type Span = std::ops::Range<usize>;

/// A source file broken up into an optional shebang line, an optional
/// frontmatter block and the remaining content.
///
/// Each part is recorded as a byte [`Span`] into `raw` rather than as a copy.
#[derive(Debug)]
pub struct ScriptSource<'s> {
    /// The complete, unmodified file text
    raw: &'s str,
    /// Span of the `#!/usr/bin/env cargo` line, when the file has one
    shebang: Option<Span>,
    /// Span of the opening code fence (`---`)
    open: Option<Span>,
    /// Span of the text trailing `ScriptSource::open`; it identifies how
    /// `ScriptSource::frontmatter` is meant to be interpreted
    info: Option<Span>,
    /// Span of the lines enclosed by `ScriptSource::open` and `ScriptSource::close`
    frontmatter: Option<Span>,
    /// Span of the closing code fence (`---`)
    close: Option<Span>,
    /// Span of everything that follows the shebang and the frontmatter
    content: Span,
}
26+
27+
impl<'s> ScriptSource<'s> {
28+
pub fn parse(raw: &'s str) -> Result<Self, FrontmatterError> {
29+
use winnow::stream::FindSlice as _;
30+
use winnow::stream::Location as _;
31+
use winnow::stream::Offset as _;
32+
use winnow::stream::Stream as _;
33+
34+
let content_end = raw.len();
35+
let mut source = Self {
36+
raw,
37+
shebang: None,
38+
open: None,
39+
info: None,
40+
frontmatter: None,
41+
close: None,
42+
content: 0..content_end,
43+
};
44+
45+
let mut input = winnow::stream::LocatingSlice::new(raw);
46+
47+
if let Some(shebang_end) = strip_shebang(input.as_ref()) {
48+
let shebang_start = input.current_token_start();
49+
let _ = input.next_slice(shebang_end);
50+
let shebang_end = input.current_token_start();
51+
source.shebang = Some(shebang_start..shebang_end);
52+
source.content = shebang_end..content_end;
53+
}
54+
55+
// Whitespace may precede a frontmatter but must end with a newline
56+
if let Some(nl_end) = strip_ws_lines(input.as_ref()) {
57+
let _ = input.next_slice(nl_end);
58+
}
59+
60+
// Opens with a line that starts with 3 or more `-` followed by an optional identifier
61+
const FENCE_CHAR: char = '-';
62+
let fence_length = input
63+
.as_ref()
64+
.char_indices()
65+
.find_map(|(i, c)| (c != FENCE_CHAR).then_some(i))
66+
.unwrap_or_else(|| input.eof_offset());
67+
let open_start = input.current_token_start();
68+
let fence_pattern = input.next_slice(fence_length);
69+
let open_end = input.current_token_start();
70+
match fence_length {
71+
0 => {
72+
return Ok(source);
73+
}
74+
1 | 2 => {
75+
// either not a frontmatter or invalid frontmatter opening
76+
return Err(FrontmatterError::new(
77+
format!(
78+
"found {fence_length} `{FENCE_CHAR}` in rust frontmatter, expected at least 3"
79+
),
80+
raw.len()..raw.len(),
81+
).push_visible_span(open_start..open_end));
82+
}
83+
_ => {}
84+
}
85+
source.open = Some(open_start..open_end);
86+
let Some(info_nl) = input.find_slice("\n") else {
87+
return Err(FrontmatterError::new(
88+
format!("unclosed frontmatter; expected `{fence_pattern}`"),
89+
raw.len()..raw.len(),
90+
)
91+
.push_visible_span(open_start..open_end));
92+
};
93+
let info = input.next_slice(info_nl.start);
94+
let info = info.strip_suffix('\r').unwrap_or(info); // already excludes `\n`
95+
let info = info.trim_matches(is_horizontal_whitespace);
96+
if !info.is_empty() {
97+
let info_start = info.offset_from(&raw);
98+
let info_end = info_start + info.len();
99+
source.info = Some(info_start..info_end);
100+
}
101+
102+
// Ends with a line that starts with a matching number of `-` only followed by whitespace
103+
let nl_fence_pattern = format!("\n{fence_pattern}");
104+
let Some(frontmatter_nl) = input.find_slice(nl_fence_pattern.as_str()) else {
105+
for len in (2..(nl_fence_pattern.len() - 1)).rev() {
106+
let Some(frontmatter_nl) = input.find_slice(&nl_fence_pattern[0..len]) else {
107+
continue;
108+
};
109+
let _ = input.next_slice(frontmatter_nl.start + 1);
110+
let close_start = input.current_token_start();
111+
let _ = input.next_slice(len);
112+
let close_end = input.current_token_start();
113+
let fewer_dashes = fence_length - len;
114+
return Err(FrontmatterError::new(
115+
format!(
116+
"closing code fence has {fewer_dashes} less `-` than the opening fence"
117+
),
118+
close_start..close_end,
119+
)
120+
.push_visible_span(open_start..open_end));
121+
}
122+
return Err(FrontmatterError::new(
123+
format!("unclosed frontmatter; expected `{fence_pattern}`"),
124+
raw.len()..raw.len(),
125+
)
126+
.push_visible_span(open_start..open_end));
127+
};
128+
let frontmatter_start = input.current_token_start() + 1; // skip nl from infostring
129+
let _ = input.next_slice(frontmatter_nl.start + 1);
130+
let frontmatter_end = input.current_token_start();
131+
source.frontmatter = Some(frontmatter_start..frontmatter_end);
132+
let close_start = input.current_token_start();
133+
let _ = input.next_slice(fence_length);
134+
let close_end = input.current_token_start();
135+
source.close = Some(close_start..close_end);
136+
137+
let nl = input.find_slice("\n");
138+
let after_closing_fence =
139+
input.next_slice(nl.map(|span| span.end).unwrap_or_else(|| input.eof_offset()));
140+
let content_start = input.current_token_start();
141+
let extra_dashes = after_closing_fence.chars().take_while(|b| *b == FENCE_CHAR).count();
142+
if 0 < extra_dashes {
143+
let extra_start = close_end;
144+
let extra_end = extra_start + extra_dashes;
145+
return Err(FrontmatterError::new(
146+
format!("closing code fence has {extra_dashes} more `-` than the opening fence"),
147+
extra_start..extra_end,
148+
)
149+
.push_visible_span(open_start..open_end));
150+
} else {
151+
let after_closing_fence = strip_newline(after_closing_fence);
152+
let after_closing_fence = after_closing_fence.trim_matches(is_horizontal_whitespace);
153+
if !after_closing_fence.is_empty() {
154+
// extra characters beyond the original fence pattern
155+
let after_start = after_closing_fence.offset_from(&raw);
156+
let after_end = after_start + after_closing_fence.len();
157+
return Err(FrontmatterError::new(
158+
format!("unexpected characters after frontmatter close"),
159+
after_start..after_end,
160+
)
161+
.push_visible_span(open_start..open_end));
162+
}
163+
}
164+
165+
source.content = content_start..content_end;
166+
167+
if let Some(nl_end) = strip_ws_lines(input.as_ref()) {
168+
let _ = input.next_slice(nl_end);
169+
}
170+
let fence_length = input
171+
.as_ref()
172+
.char_indices()
173+
.find_map(|(i, c)| (c != FENCE_CHAR).then_some(i))
174+
.unwrap_or_else(|| input.eof_offset());
175+
if 0 < fence_length {
176+
let fence_start = input.current_token_start();
177+
let fence_end = fence_start + fence_length;
178+
return Err(FrontmatterError::new(
179+
format!("only one frontmatter is supported"),
180+
fence_start..fence_end,
181+
)
182+
.push_visible_span(open_start..open_end)
183+
.push_visible_span(close_start..close_end));
184+
}
185+
186+
Ok(source)
187+
}
188+
189+
pub fn shebang(&self) -> Option<&'s str> {
190+
self.shebang.clone().map(|span| &self.raw[span])
191+
}
192+
193+
pub fn shebang_span(&self) -> Option<Span> {
194+
self.shebang.clone()
195+
}
196+
197+
pub fn open_span(&self) -> Option<Span> {
198+
self.open.clone()
199+
}
200+
201+
pub fn info(&self) -> Option<&'s str> {
202+
self.info.clone().map(|span| &self.raw[span])
203+
}
204+
205+
pub fn info_span(&self) -> Option<Span> {
206+
self.info.clone()
207+
}
208+
209+
pub fn frontmatter(&self) -> Option<&'s str> {
210+
self.frontmatter.clone().map(|span| &self.raw[span])
211+
}
212+
213+
pub fn frontmatter_span(&self) -> Option<Span> {
214+
self.frontmatter.clone()
215+
}
216+
217+
pub fn close_span(&self) -> Option<Span> {
218+
self.close.clone()
219+
}
220+
221+
pub fn content(&self) -> &'s str {
222+
&self.raw[self.content.clone()]
223+
}
224+
225+
pub fn content_span(&self) -> Span {
226+
self.content.clone()
227+
}
228+
}
229+
230+
/// Returns the byte index just past the shebang line, or `None` when `input`
/// does not begin with a shebang.
///
/// A line without a trailing `\n` yields `input.len()`.
pub fn strip_shebang(input: &str) -> Option<usize> {
    // Mirrors `strip_shebang` in rust-lang/rust's compiler/rustc_lexer/src/lib.rs.
    // A shebang is a literal leading `#!` with nothing in front of it; platform
    // specific restrictions on shebangs are deliberately not enforced.
    let after_marker = input.strip_prefix("#!")?;

    // `#!` followed (after whitespace) by `[` may be an inner attribute, so it is
    // left to be handled as Rust code.
    //
    // NOTE: rustc also skips line and block comments here; that is not replicated
    // to keep this function unaware of Rust grammar.
    if after_marker.trim_start().starts_with('[') {
        return None;
    }

    let line_end = match input.find('\n') {
        Some(newline) => newline + 1,
        None => input.len(),
    };
    Some(line_end)
}
250+
251+
/// Returns the index after any lines with only whitespace, if present
252+
pub fn strip_ws_lines(input: &str) -> Option<usize> {
253+
let ws_end = input.find(|c| !is_whitespace(c)).unwrap_or(input.len());
254+
if ws_end == 0 {
255+
return None;
256+
}
257+
258+
let nl_start = input[0..ws_end].rfind('\n')?;
259+
let nl_end = nl_start + 1;
260+
Some(nl_end)
261+
}
262+
263+
/// True if `c` is considered a whitespace according to Rust language definition.
/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for definitions of these classes.
fn is_whitespace(c: char) -> bool {
    // The `Pattern_White_Space` set. It is stable across Unicode versions, so the
    // members are listed explicitly.
    const PATTERN_WHITE_SPACE: [char; 11] = [
        // Horizontal space characters
        '\u{0009}', // tab (\t)
        '\u{0020}', // space
        // End-of-line characters
        '\u{000A}', // line feed (\n)
        '\u{000B}', // vertical tab
        '\u{000C}', // form feed
        '\u{000D}', // carriage return (\r)
        '\u{0085}', // next line (from latin1)
        '\u{2028}', // LINE SEPARATOR
        '\u{2029}', // PARAGRAPH SEPARATOR
        // `Default_Ignorable_Code_Point` characters
        '\u{200E}', // LEFT-TO-RIGHT MARK
        '\u{200F}', // RIGHT-TO-LEFT MARK
    ];

    PATTERN_WHITE_SPACE.contains(&c)
}
292+
293+
/// True if `c` is a horizontal whitespace character, i.e. a tab (`\t`) or a space.
fn is_horizontal_whitespace(c: char) -> bool {
    c == '\u{0009}' || c == '\u{0020}'
}
307+
308+
/// Removes a single trailing line ending (`\r\n` or `\n`) from `text`, if present.
fn strip_newline(text: &str) -> &str {
    match text.strip_suffix('\n') {
        // A `\r` only counts as part of the line ending when it directly precedes the `\n`
        Some(line) => line.strip_suffix('\r').unwrap_or(line),
        None => text,
    }
}
311+
312+
/// An error produced while parsing frontmatter: a message, the span it
/// primarily refers to, and any further spans attached to it.
#[derive(Debug)]
pub struct FrontmatterError {
    message: String,
    primary_span: Span,
    visible_spans: Vec<Span>,
}

impl FrontmatterError {
    /// Creates an error with the given message and primary span and no extra spans.
    pub fn new(message: impl Into<String>, span: Span) -> Self {
        let message = message.into();
        Self { message, primary_span: span, visible_spans: Vec::new() }
    }

    /// Appends `span` to the list of extra spans, builder-style.
    pub fn push_visible_span(self, span: Span) -> Self {
        let mut err = self;
        err.visible_spans.push(span);
        err
    }

    /// The error message.
    pub fn message(&self) -> &str {
        &self.message
    }

    /// The span passed to [`FrontmatterError::new`].
    pub fn primary_span(&self) -> Span {
        self.primary_span.start..self.primary_span.end
    }

    /// The spans added through [`FrontmatterError::push_visible_span`], in insertion order.
    pub fn visible_spans(&self) -> &[Span] {
        self.visible_spans.as_slice()
    }
}

impl std::fmt::Display for FrontmatterError {
    /// Formats as the bare message, honoring the formatter's width/padding options.
    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Display::fmt(self.message.as_str(), fmt)
    }
}

impl std::error::Error for FrontmatterError {}

src/tools/rust-analyzer/crates/parser/src/lexed_str.rs

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,17 @@ impl<'a> LexedStr<'a> {
3737
pub fn new(edition: Edition, text: &'a str) -> LexedStr<'a> {
3838
let _p = tracing::info_span!("LexedStr::new").entered();
3939
let mut conv = Converter::new(edition, text);
40-
if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
41-
conv.res.push(SHEBANG, conv.offset);
42-
conv.offset = shebang_len;
43-
};
40+
if let Ok(script) = crate::frontmatter::ScriptSource::parse(text) {
41+
if let Some(shebang) = script.shebang_span() {
42+
conv.push(SHEBANG, shebang.end - shebang.start, Vec::new());
43+
}
44+
if script.frontmatter().is_some() {
45+
conv.push(FRONTMATTER, script.content_span().start - conv.offset, Vec::new());
46+
}
47+
} else if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
48+
// Leave error reporting to `rustc_lexer`
49+
conv.push(SHEBANG, shebang_len, Vec::new());
50+
}
4451

4552
// Re-create the tokenizer from scratch every token because `GuardedStrPrefix` is one token in the lexer
4653
// but we want to split it to two in edition <2024.

src/tools/rust-analyzer/crates/parser/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ extern crate ra_ap_rustc_lexer as rustc_lexer;
2626
extern crate rustc_lexer;
2727

2828
mod event;
29+
mod frontmatter;
2930
mod grammar;
3031
mod input;
3132
mod lexed_str;

0 commit comments

Comments
 (0)