Skip to content

Commit dd44c4c

Browse files
authored
Rollup merge of #146106 - epage:whitespace, r=fee1-dead
fix(lexer): Only allow horizontal whitespace in frontmatter In writing up the reference for frontmatter, I realized that we probably shouldn't be accepting Unicode Line Ending characters between the code fence and infostring or trailing after the infostring or a code fence. In digging into the unicode specification we use for Whitespace, it divides it up into categories, so I'm deferring to what it says for horizontal whitespace for what should be used within a line. Note, I am leaving out support for Unicode Default Ignorable characters. I figure that can be discussed outside of this change within the reference and tracking issue. Fixes #145971 Frontmatter tracking issue: #136889
2 parents 24d59e3 + 6f0da97 commit dd44c4c

File tree

7 files changed

+73
-21
lines changed

7 files changed

+73
-21
lines changed

compiler/rustc_lexer/src/lib.rs

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -331,24 +331,37 @@ pub fn is_whitespace(c: char) -> bool {
331331

332332
matches!(
333333
c,
334-
// Usual ASCII suspects
335-
'\u{0009}' // \t
336-
| '\u{000A}' // \n
334+
// End-of-line characters
335+
| '\u{000A}' // line feed (\n)
337336
| '\u{000B}' // vertical tab
338337
| '\u{000C}' // form feed
339-
| '\u{000D}' // \r
340-
| '\u{0020}' // space
341-
342-
// NEXT LINE from latin1
343-
| '\u{0085}'
338+
| '\u{000D}' // carriage return (\r)
339+
| '\u{0085}' // next line (from latin1)
340+
| '\u{2028}' // LINE SEPARATOR
341+
| '\u{2029}' // PARAGRAPH SEPARATOR
344342

345-
// Bidi markers
343+
// `Default_Ignorable_Code_Point` characters
346344
| '\u{200E}' // LEFT-TO-RIGHT MARK
347345
| '\u{200F}' // RIGHT-TO-LEFT MARK
348346

349-
// Dedicated whitespace characters from Unicode
350-
| '\u{2028}' // LINE SEPARATOR
351-
| '\u{2029}' // PARAGRAPH SEPARATOR
347+
// Horizontal space characters
348+
| '\u{0009}' // tab (\t)
349+
| '\u{0020}' // space
350+
)
351+
}
352+
353+
/// True if `c` is considered horizontal whitespace according to the Rust language definition.
354+
pub fn is_horizontal_whitespace(c: char) -> bool {
355+
// This is the horizontal-space subset of Pattern_White_Space (tab and space only).
356+
//
357+
// Note that this set is stable (ie, it doesn't change with different
358+
// Unicode versions), so it's ok to just hard-code the values.
359+
360+
matches!(
361+
c,
362+
// Horizontal space characters
363+
'\u{0009}' // tab (\t)
364+
| '\u{0020}' // space
352365
)
353366
}
354367

@@ -538,7 +551,7 @@ impl Cursor<'_> {
538551
debug_assert!(length_opening >= 3);
539552

540553
// whitespace between the opening and the infostring.
541-
self.eat_while(|ch| ch != '\n' && is_whitespace(ch));
554+
self.eat_while(|ch| ch != '\n' && is_horizontal_whitespace(ch));
542555

543556
// copied from `eat_identifier`, but allows `-` and `.` in infostring to allow something like
544557
// `---Cargo.toml` as a valid opener
@@ -547,7 +560,7 @@ impl Cursor<'_> {
547560
self.eat_while(|c| is_id_continue(c) || c == '-' || c == '.');
548561
}
549562

550-
self.eat_while(|ch| ch != '\n' && is_whitespace(ch));
563+
self.eat_while(|ch| ch != '\n' && is_horizontal_whitespace(ch));
551564
let invalid_infostring = self.first() != '\n';
552565

553566
let mut found = false;
@@ -588,7 +601,7 @@ impl Cursor<'_> {
588601
// on a standalone line. Might be wrong.
589602
while let Some(closing) = rest.find("---") {
590603
let preceding_chars_start = rest[..closing].rfind("\n").map_or(0, |i| i + 1);
591-
if rest[preceding_chars_start..closing].chars().all(is_whitespace) {
604+
if rest[preceding_chars_start..closing].chars().all(is_horizontal_whitespace) {
592605
// candidate found
593606
potential_closing = Some(closing);
594607
break;

compiler/rustc_parse/src/lexer/mod.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use rustc_ast::util::unicode::{TEXT_FLOW_CONTROL_CHARS, contains_text_flow_contr
66
use rustc_errors::codes::*;
77
use rustc_errors::{Applicability, Diag, DiagCtxtHandle, StashKey};
88
use rustc_lexer::{
9-
Base, Cursor, DocStyle, FrontmatterAllowed, LiteralKind, RawStrError, is_whitespace,
9+
Base, Cursor, DocStyle, FrontmatterAllowed, LiteralKind, RawStrError, is_horizontal_whitespace,
1010
};
1111
use rustc_literal_escaper::{EscapeError, Mode, check_for_errors};
1212
use rustc_session::lint::BuiltinLintDiag;
@@ -597,7 +597,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
597597

598598
let last_line_start = within.rfind('\n').map_or(0, |i| i + 1);
599599
let last_line = &within[last_line_start..];
600-
let last_line_trimmed = last_line.trim_start_matches(is_whitespace);
600+
let last_line_trimmed = last_line.trim_start_matches(is_horizontal_whitespace);
601601
let last_line_start_pos = frontmatter_opening_end_pos + BytePos(last_line_start as u32);
602602

603603
let frontmatter_span = self.mk_sp(frontmatter_opening_pos, self.pos);
@@ -640,7 +640,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
640640
});
641641
}
642642

643-
if !rest.trim_matches(is_whitespace).is_empty() {
643+
if !rest.trim_matches(is_horizontal_whitespace).is_empty() {
644644
let span = self.mk_sp(last_line_start_pos, self.pos);
645645
self.dcx().emit_err(errors::FrontmatterExtraCharactersAfterClose { span });
646646
}

tests/ui/.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@ json-bom-plus-crlf.rs -text
33
json-bom-plus-crlf-multifile.rs -text
44
json-bom-plus-crlf-multifile-aux.rs -text
55
trailing-carriage-return-in-string.rs -text
6+
frontmatter-crlf.rs -text
67
*.bin -text
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env -S cargo -Zscript
2+
---cargo
3+
# Beware editing: it has numerous whitespace characters which are important.
4+
# It contains characters from the ranges of the 'PATTERN_WHITE_SPACE' property outlined in
5+
# https://unicode.org/Public/UNIDATA/PropList.txt
6+
#
7+
# The characters in the description string below can be generated
8+
# from: "4\u{0C}+\n\t\r7\t*\u{20}2\u{85}/\u{200E}3\u{200F}*\u{2028}2\u{2029}"
9+
package.description = """
10+
4 +
11+
12+
7 * 2…/‎3*2
13+
"""
14+
---
15+
16+
//@ check-pass
17+
18+
// Ensure the frontmatter can contain any whitespace
19+
20+
#![feature(frontmatter)]
21+
22+
fn main() {}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#!/usr/bin/env -S cargo -Zscript
2+
---
3+
[dependencies]
4+
clap = "4"
5+
---
6+
7+
//@ check-pass
8+
// ignore-tidy-cr
9+
10+
// crlf line endings should be accepted
11+
12+
#![feature(frontmatter)]
13+
14+
fn main() {}

tests/ui/frontmatter/frontmatter-whitespace-3.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

22

3-
---cargo
4-
---
3+
---cargo
4+
---
55

66
// please note the whitespace characters after the first four lines.
77
// This ensures that we accept whitespaces before the frontmatter, after
@@ -10,6 +10,7 @@
1010
//@ check-pass
1111
// ignore-tidy-end-whitespace
1212
// ignore-tidy-leading-newlines
13+
// ignore-tidy-tab
1314

1415
#![feature(frontmatter)]
1516

tests/ui/frontmatter/frontmatter-whitespace-4.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
--- cargo
1+
--- cargo
22
---
33

44
//@ check-pass
5+
// ignore-tidy-tab
56
// A frontmatter infostring can have leading whitespace.
67

78
#![feature(frontmatter)]

0 commit comments

Comments
 (0)