|
| 1 | +// Copied from https://github.com/rust-lang/cargo/blob/367fd9f213750cd40317803dd0a5a3ce3f0c676d/src/cargo/util/frontmatter.rs |
| 2 | +#![expect(dead_code)] // avoid editing |
| 3 | +#![expect(unreachable_pub)] // avoid editing |
| 4 | +#![expect(clippy::useless_format)] // avoid editing |
| 5 | + |
/// Half-open range of byte offsets into the source text
type Span = std::ops::Range<usize>;
| 7 | + |
/// A source file split into its shebang, frontmatter, and remaining content.
///
/// Every span is a byte range into `raw`; the accessor methods slice `raw` with them.
#[derive(Debug)]
pub struct ScriptSource<'s> {
    /// The full file
    raw: &'s str,
    /// The `#!/usr/bin/env cargo` line, if present (through the end of its newline)
    shebang: Option<Span>,
    /// The code fence opener (three or more `-`)
    open: Option<Span>,
    /// Trailing text after `ScriptSource::open` that identifies the meaning of
    /// `ScriptSource::frontmatter`, with surrounding horizontal whitespace removed
    info: Option<Span>,
    /// The lines between `ScriptSource::open` and `ScriptSource::close`
    frontmatter: Option<Span>,
    /// The code fence closer (the same number of `-` as `ScriptSource::open`)
    close: Option<Span>,
    /// All content after the frontmatter and shebang
    content: Span,
}
| 26 | + |
| 27 | +impl<'s> ScriptSource<'s> { |
| 28 | + pub fn parse(raw: &'s str) -> Result<Self, FrontmatterError> { |
| 29 | + use winnow::stream::FindSlice as _; |
| 30 | + use winnow::stream::Location as _; |
| 31 | + use winnow::stream::Offset as _; |
| 32 | + use winnow::stream::Stream as _; |
| 33 | + |
| 34 | + let content_end = raw.len(); |
| 35 | + let mut source = Self { |
| 36 | + raw, |
| 37 | + shebang: None, |
| 38 | + open: None, |
| 39 | + info: None, |
| 40 | + frontmatter: None, |
| 41 | + close: None, |
| 42 | + content: 0..content_end, |
| 43 | + }; |
| 44 | + |
| 45 | + let mut input = winnow::stream::LocatingSlice::new(raw); |
| 46 | + |
| 47 | + if let Some(shebang_end) = strip_shebang(input.as_ref()) { |
| 48 | + let shebang_start = input.current_token_start(); |
| 49 | + let _ = input.next_slice(shebang_end); |
| 50 | + let shebang_end = input.current_token_start(); |
| 51 | + source.shebang = Some(shebang_start..shebang_end); |
| 52 | + source.content = shebang_end..content_end; |
| 53 | + } |
| 54 | + |
| 55 | + // Whitespace may precede a frontmatter but must end with a newline |
| 56 | + if let Some(nl_end) = strip_ws_lines(input.as_ref()) { |
| 57 | + let _ = input.next_slice(nl_end); |
| 58 | + } |
| 59 | + |
| 60 | + // Opens with a line that starts with 3 or more `-` followed by an optional identifier |
| 61 | + const FENCE_CHAR: char = '-'; |
| 62 | + let fence_length = input |
| 63 | + .as_ref() |
| 64 | + .char_indices() |
| 65 | + .find_map(|(i, c)| (c != FENCE_CHAR).then_some(i)) |
| 66 | + .unwrap_or_else(|| input.eof_offset()); |
| 67 | + let open_start = input.current_token_start(); |
| 68 | + let fence_pattern = input.next_slice(fence_length); |
| 69 | + let open_end = input.current_token_start(); |
| 70 | + match fence_length { |
| 71 | + 0 => { |
| 72 | + return Ok(source); |
| 73 | + } |
| 74 | + 1 | 2 => { |
| 75 | + // either not a frontmatter or invalid frontmatter opening |
| 76 | + return Err(FrontmatterError::new( |
| 77 | + format!( |
| 78 | + "found {fence_length} `{FENCE_CHAR}` in rust frontmatter, expected at least 3" |
| 79 | + ), |
| 80 | + raw.len()..raw.len(), |
| 81 | + ).push_visible_span(open_start..open_end)); |
| 82 | + } |
| 83 | + _ => {} |
| 84 | + } |
| 85 | + source.open = Some(open_start..open_end); |
| 86 | + let Some(info_nl) = input.find_slice("\n") else { |
| 87 | + return Err(FrontmatterError::new( |
| 88 | + format!("unclosed frontmatter; expected `{fence_pattern}`"), |
| 89 | + raw.len()..raw.len(), |
| 90 | + ) |
| 91 | + .push_visible_span(open_start..open_end)); |
| 92 | + }; |
| 93 | + let info = input.next_slice(info_nl.start); |
| 94 | + let info = info.strip_suffix('\r').unwrap_or(info); // already excludes `\n` |
| 95 | + let info = info.trim_matches(is_horizontal_whitespace); |
| 96 | + if !info.is_empty() { |
| 97 | + let info_start = info.offset_from(&raw); |
| 98 | + let info_end = info_start + info.len(); |
| 99 | + source.info = Some(info_start..info_end); |
| 100 | + } |
| 101 | + |
| 102 | + // Ends with a line that starts with a matching number of `-` only followed by whitespace |
| 103 | + let nl_fence_pattern = format!("\n{fence_pattern}"); |
| 104 | + let Some(frontmatter_nl) = input.find_slice(nl_fence_pattern.as_str()) else { |
| 105 | + for len in (2..(nl_fence_pattern.len() - 1)).rev() { |
| 106 | + let Some(frontmatter_nl) = input.find_slice(&nl_fence_pattern[0..len]) else { |
| 107 | + continue; |
| 108 | + }; |
| 109 | + let _ = input.next_slice(frontmatter_nl.start + 1); |
| 110 | + let close_start = input.current_token_start(); |
| 111 | + let _ = input.next_slice(len); |
| 112 | + let close_end = input.current_token_start(); |
| 113 | + let fewer_dashes = fence_length - len; |
| 114 | + return Err(FrontmatterError::new( |
| 115 | + format!( |
| 116 | + "closing code fence has {fewer_dashes} less `-` than the opening fence" |
| 117 | + ), |
| 118 | + close_start..close_end, |
| 119 | + ) |
| 120 | + .push_visible_span(open_start..open_end)); |
| 121 | + } |
| 122 | + return Err(FrontmatterError::new( |
| 123 | + format!("unclosed frontmatter; expected `{fence_pattern}`"), |
| 124 | + raw.len()..raw.len(), |
| 125 | + ) |
| 126 | + .push_visible_span(open_start..open_end)); |
| 127 | + }; |
| 128 | + let frontmatter_start = input.current_token_start() + 1; // skip nl from infostring |
| 129 | + let _ = input.next_slice(frontmatter_nl.start + 1); |
| 130 | + let frontmatter_end = input.current_token_start(); |
| 131 | + source.frontmatter = Some(frontmatter_start..frontmatter_end); |
| 132 | + let close_start = input.current_token_start(); |
| 133 | + let _ = input.next_slice(fence_length); |
| 134 | + let close_end = input.current_token_start(); |
| 135 | + source.close = Some(close_start..close_end); |
| 136 | + |
| 137 | + let nl = input.find_slice("\n"); |
| 138 | + let after_closing_fence = |
| 139 | + input.next_slice(nl.map(|span| span.end).unwrap_or_else(|| input.eof_offset())); |
| 140 | + let content_start = input.current_token_start(); |
| 141 | + let extra_dashes = after_closing_fence.chars().take_while(|b| *b == FENCE_CHAR).count(); |
| 142 | + if 0 < extra_dashes { |
| 143 | + let extra_start = close_end; |
| 144 | + let extra_end = extra_start + extra_dashes; |
| 145 | + return Err(FrontmatterError::new( |
| 146 | + format!("closing code fence has {extra_dashes} more `-` than the opening fence"), |
| 147 | + extra_start..extra_end, |
| 148 | + ) |
| 149 | + .push_visible_span(open_start..open_end)); |
| 150 | + } else { |
| 151 | + let after_closing_fence = strip_newline(after_closing_fence); |
| 152 | + let after_closing_fence = after_closing_fence.trim_matches(is_horizontal_whitespace); |
| 153 | + if !after_closing_fence.is_empty() { |
| 154 | + // extra characters beyond the original fence pattern |
| 155 | + let after_start = after_closing_fence.offset_from(&raw); |
| 156 | + let after_end = after_start + after_closing_fence.len(); |
| 157 | + return Err(FrontmatterError::new( |
| 158 | + format!("unexpected characters after frontmatter close"), |
| 159 | + after_start..after_end, |
| 160 | + ) |
| 161 | + .push_visible_span(open_start..open_end)); |
| 162 | + } |
| 163 | + } |
| 164 | + |
| 165 | + source.content = content_start..content_end; |
| 166 | + |
| 167 | + if let Some(nl_end) = strip_ws_lines(input.as_ref()) { |
| 168 | + let _ = input.next_slice(nl_end); |
| 169 | + } |
| 170 | + let fence_length = input |
| 171 | + .as_ref() |
| 172 | + .char_indices() |
| 173 | + .find_map(|(i, c)| (c != FENCE_CHAR).then_some(i)) |
| 174 | + .unwrap_or_else(|| input.eof_offset()); |
| 175 | + if 0 < fence_length { |
| 176 | + let fence_start = input.current_token_start(); |
| 177 | + let fence_end = fence_start + fence_length; |
| 178 | + return Err(FrontmatterError::new( |
| 179 | + format!("only one frontmatter is supported"), |
| 180 | + fence_start..fence_end, |
| 181 | + ) |
| 182 | + .push_visible_span(open_start..open_end) |
| 183 | + .push_visible_span(close_start..close_end)); |
| 184 | + } |
| 185 | + |
| 186 | + Ok(source) |
| 187 | + } |
| 188 | + |
| 189 | + pub fn shebang(&self) -> Option<&'s str> { |
| 190 | + self.shebang.clone().map(|span| &self.raw[span]) |
| 191 | + } |
| 192 | + |
| 193 | + pub fn shebang_span(&self) -> Option<Span> { |
| 194 | + self.shebang.clone() |
| 195 | + } |
| 196 | + |
| 197 | + pub fn open_span(&self) -> Option<Span> { |
| 198 | + self.open.clone() |
| 199 | + } |
| 200 | + |
| 201 | + pub fn info(&self) -> Option<&'s str> { |
| 202 | + self.info.clone().map(|span| &self.raw[span]) |
| 203 | + } |
| 204 | + |
| 205 | + pub fn info_span(&self) -> Option<Span> { |
| 206 | + self.info.clone() |
| 207 | + } |
| 208 | + |
| 209 | + pub fn frontmatter(&self) -> Option<&'s str> { |
| 210 | + self.frontmatter.clone().map(|span| &self.raw[span]) |
| 211 | + } |
| 212 | + |
| 213 | + pub fn frontmatter_span(&self) -> Option<Span> { |
| 214 | + self.frontmatter.clone() |
| 215 | + } |
| 216 | + |
| 217 | + pub fn close_span(&self) -> Option<Span> { |
| 218 | + self.close.clone() |
| 219 | + } |
| 220 | + |
| 221 | + pub fn content(&self) -> &'s str { |
| 222 | + &self.raw[self.content.clone()] |
| 223 | + } |
| 224 | + |
| 225 | + pub fn content_span(&self) -> Span { |
| 226 | + self.content.clone() |
| 227 | + } |
| 228 | +} |
| 229 | + |
/// Returns the index just past the shebang line, or `None` when `input` has no shebang
///
/// The returned index is one past the first `\n`, or `input.len()` when the shebang is the
/// only line.
pub fn strip_shebang(input: &str) -> Option<usize> {
    // Mirrors `strip_shebang` in rust-lang/rust's compiler/rustc_lexer/src/lib.rs.
    // A shebang has to begin with a literal `#!` with nothing in front of it; beyond that, any
    // line starting with `#!` is accepted, whatever limits individual platforms impose.
    let after_marker = input.strip_prefix("#!")?;

    // `#!` followed (after optional whitespace) by `[` may be an inner attribute, i.e. valid
    // Rust code, so it is left alone.
    //
    // NOTE: rustc also skips line and block comments here; that is deliberately not replicated
    // to avoid depending on more of the Rust grammar.
    if after_marker.trim_start().starts_with('[') {
        return None;
    }

    // Anything else starting with `#!` is treated as a shebang.
    let line_end = match input.find('\n') {
        Some(newline) => newline + 1,
        None => input.len(),
    };
    Some(line_end)
}
| 250 | + |
| 251 | +/// Returns the index after any lines with only whitespace, if present |
| 252 | +pub fn strip_ws_lines(input: &str) -> Option<usize> { |
| 253 | + let ws_end = input.find(|c| !is_whitespace(c)).unwrap_or(input.len()); |
| 254 | + if ws_end == 0 { |
| 255 | + return None; |
| 256 | + } |
| 257 | + |
| 258 | + let nl_start = input[0..ws_end].rfind('\n')?; |
| 259 | + let nl_end = nl_start + 1; |
| 260 | + Some(nl_end) |
| 261 | +} |
| 262 | + |
/// Reports whether `c` counts as whitespace under the Rust language definition.
/// See the [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for the definition of this character class.
fn is_whitespace(c: char) -> bool {
    // This is `Pattern_White_Space`. The set does not change between Unicode versions, so
    // its members are simply listed here.
    const PATTERN_WHITE_SPACE: [char; 11] = [
        // End-of-line characters
        '\u{000A}', // line feed (\n)
        '\u{000B}', // vertical tab
        '\u{000C}', // form feed
        '\u{000D}', // carriage return (\r)
        '\u{0085}', // next line (from latin1)
        '\u{2028}', // LINE SEPARATOR
        '\u{2029}', // PARAGRAPH SEPARATOR
        // `Default_Ignorable_Code_Point` characters
        '\u{200E}', // LEFT-TO-RIGHT MARK
        '\u{200F}', // RIGHT-TO-LEFT MARK
        // Horizontal space characters
        '\u{0009}', // tab (\t)
        '\u{0020}', // space
    ];

    PATTERN_WHITE_SPACE.contains(&c)
}
| 292 | + |
/// Reports whether `c` is horizontal whitespace, i.e. a tab or a space.
fn is_horizontal_whitespace(c: char) -> bool {
    // Only the horizontal-space members of the whitespace set: line terminators and the
    // directional marks are deliberately excluded.
    matches!(c, '\t' | ' ')
}
| 307 | + |
/// Removes a single trailing line terminator (`\r\n` or `\n`) from `text`, if there is one.
fn strip_newline(text: &str) -> &str {
    // `\r\n` is checked first so the `\r` does not get left behind
    if let Some(without_crlf) = text.strip_suffix("\r\n") {
        return without_crlf;
    }
    match text.strip_suffix('\n') {
        Some(without_lf) => without_lf,
        None => text,
    }
}
| 311 | + |
/// A frontmatter parse failure: a message plus the source ranges it refers to.
#[derive(Debug)]
pub struct FrontmatterError {
    /// Description of the problem; this is the text `Display` prints
    message: String,
    /// The range the error is reported at
    primary_span: Span,
    /// Further ranges attached through `push_visible_span`
    // NOTE(review): presumably extra context to show alongside the primary span; confirm
    // with the code that renders these errors.
    visible_spans: Vec<Span>,
}
| 318 | + |
| 319 | +impl FrontmatterError { |
| 320 | + pub fn new(message: impl Into<String>, span: Span) -> Self { |
| 321 | + Self { message: message.into(), primary_span: span, visible_spans: Vec::new() } |
| 322 | + } |
| 323 | + |
| 324 | + pub fn push_visible_span(mut self, span: Span) -> Self { |
| 325 | + self.visible_spans.push(span); |
| 326 | + self |
| 327 | + } |
| 328 | + |
| 329 | + pub fn message(&self) -> &str { |
| 330 | + self.message.as_str() |
| 331 | + } |
| 332 | + |
| 333 | + pub fn primary_span(&self) -> Span { |
| 334 | + self.primary_span.clone() |
| 335 | + } |
| 336 | + |
| 337 | + pub fn visible_spans(&self) -> &[Span] { |
| 338 | + &self.visible_spans |
| 339 | + } |
| 340 | +} |
| 341 | + |
| 342 | +impl std::fmt::Display for FrontmatterError { |
| 343 | + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
| 344 | + self.message.fmt(fmt) |
| 345 | + } |
| 346 | +} |
| 347 | + |
| 348 | +impl std::error::Error for FrontmatterError {} |
0 commit comments