Skip to content

Commit 15edbcd

Browse files
Remove CR LF normalization to align with spec
The specification now says that CR LF normalization is part of the pre-processing that happens prior to tokenization. Since it is not part of the lexical grammar, this commit removes it from the parsing code as well. CR characters are now disallowed entirely. This is technically a breaking change, but it is unlikely to be noticed with any real-world input. In proc macros, all input has already been normalized by the Rust compiler anyway, so only very unusual input (CR CR LF) would have been accepted previously but is rejected after this commit.
See:
- rust-lang/reference#1944
- rust-lang/reference@fa56fdb
- rust-lang/reference@27e1ec9
1 parent d3cf292 commit 15edbcd

File tree

6 files changed

+27
-104
lines changed

6 files changed

+27
-104
lines changed

src/bytestr/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ impl<B: Buffer> fmt::Display for ByteStringLit<B> {
115115
fn parse_impl(input: &str) -> Result<(Option<Vec<u8>>, Option<u32>, usize), ParseError> {
116116
if input.starts_with("br") {
117117
scan_raw_string::<u8>(&input, 2)
118-
.map(|(v, num, start_suffix)| (v.map(String::into_bytes), Some(num), start_suffix))
118+
.map(|(num, start_suffix)| (None, Some(num), start_suffix))
119119
} else {
120120
unescape_string::<u8>(&input, 2)
121121
.map(|(v, start_suffix)| (v.map(String::into_bytes), None, start_suffix))

src/bytestr/tests.rs

Lines changed: 7 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,6 @@ fn special_whitespace() {
5959
assert_eq!(ByteStringLit::parse(&*input).unwrap().into_value(), s.as_bytes());
6060
}
6161
}
62-
63-
let res = ByteStringLit::parse("br\"\r\"").expect("failed to parse");
64-
assert_eq!(res.value(), b"\r");
6562
}
6663

6764
#[test]
@@ -97,35 +94,14 @@ bar", true, None);
9794
banana", true, None);
9895

9996
// Weird whitespace characters
100-
let lit = ByteStringLit::parse("b\"foo\\\n\r\t\n \n\tbar\"").expect("failed to parse");
97+
let lit = ByteStringLit::parse("b\"foo\\\n\t\n \n\tbar\"").expect("failed to parse");
10198
assert_eq!(lit.value(), b"foobar");
10299

103100
// Raw strings do not handle "string continues"
104101
check!(br"foo\
105102
bar", false, Some(0));
106103
}
107104

108-
#[test]
109-
fn crlf_newlines() {
110-
let lit = ByteStringLit::parse("b\"foo\r\nbar\"").expect("failed to parse");
111-
assert_eq!(lit.value(), b"foo\nbar");
112-
113-
let lit = ByteStringLit::parse("b\"\r\nbar\"").expect("failed to parse");
114-
assert_eq!(lit.value(), b"\nbar");
115-
116-
let lit = ByteStringLit::parse("b\"foo\r\n\"").expect("failed to parse");
117-
assert_eq!(lit.value(), b"foo\n");
118-
119-
let lit = ByteStringLit::parse("br\"foo\r\nbar\"").expect("failed to parse");
120-
assert_eq!(lit.value(), b"foo\nbar");
121-
122-
let lit = ByteStringLit::parse("br#\"\r\nbar\"#").expect("failed to parse");
123-
assert_eq!(lit.value(), b"\nbar");
124-
125-
let lit = ByteStringLit::parse("br##\"foo\r\n\"##").expect("failed to parse");
126-
assert_eq!(lit.value(), b"foo\n");
127-
}
128-
129105
#[test]
130106
fn raw_byte_string() {
131107
check!(br"", false, Some(0));
@@ -172,8 +148,12 @@ fn parse_err() {
172148
assert_err!(ByteStringLit, r#"b"fox"peter""#, InvalidSuffix, 6);
173149
assert_err!(ByteStringLit, r###"br#"foo "# bar"#"###, UnexpectedChar, 10);
174150

175-
assert_err!(ByteStringLit, "b\"\r\"", IsolatedCr, 2);
176-
assert_err!(ByteStringLit, "b\"fo\rx\"", IsolatedCr, 4);
151+
assert_err!(ByteStringLit, "b\"\r\"", CarriageReturn, 2);
152+
assert_err!(ByteStringLit, "b\"fo\rx\"", CarriageReturn, 4);
153+
assert_err!(ByteStringLit, "br\"\r\"", CarriageReturn, 3);
154+
assert_err!(ByteStringLit, "br\"fo\rx\"", CarriageReturn, 5);
155+
assert_err!(ByteStringLit, "b\"a\\\r\"", UnknownEscape, 3..5);
156+
assert_err!(ByteStringLit, "br\"a\\\r\"", CarriageReturn, 5);
177157

178158
assert_err!(ByteStringLit, r##"br####""##, UnterminatedRawString, None);
179159
assert_err!(ByteStringLit, r#####"br##"foo"#bar"#####, UnterminatedRawString, None);

src/err.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -299,9 +299,8 @@ pub(crate) enum ParseErrorKind {
299299

300300
InvalidByteStringLiteralStart,
301301

302-
/// An literal `\r` character not followed by a `\n` character in a
303-
/// (raw) string or byte string literal.
304-
IsolatedCr,
302+
/// `\r` in a (raw) string or (raw) byte string literal.
303+
CarriageReturn,
305304

306305
/// Literal suffix is not a valid identifier.
307306
InvalidSuffix,
@@ -355,7 +354,7 @@ impl fmt::Display for ParseError {
355354
InvalidStringLiteralStart => "invalid start for string literal",
356355
InvalidByteLiteralStart => "invalid start for byte literal",
357356
InvalidByteStringLiteralStart => "invalid start for byte string literal",
358-
IsolatedCr => r"`\r` not immediately followed by `\n` in string",
357+
CarriageReturn => r"`\r` not allowed in string literals",
359358
InvalidSuffix => "literal suffix is not a valid identifier",
360359
UnexpectedIntegerLit => "expected float literal, but found integer",
361360
IntegerSuffixStartingWithE => "integer literal suffix must not start with 'e' or 'E'",

src/escape.rs

Lines changed: 10 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ impl Escapee for char {
109109
/// Checks whether the character is skipped after a string continue start
110110
/// (unescaped backslash followed by `\n`).
111111
fn is_string_continue_skipable_whitespace(b: u8) -> bool {
112-
b == b' ' || b == b'\t' || b == b'\n' || b == b'\r'
112+
b == b' ' || b == b'\t' || b == b'\n'
113113
}
114114

115115
/// Unescapes a whole string or byte string.
@@ -143,16 +143,7 @@ pub(crate) fn unescape_string<E: Escapee>(
143143
i += len;
144144
end_last_escape = i;
145145
}
146-
b'\r' => {
147-
if input.as_bytes().get(i + 1) == Some(&b'\n') {
148-
value.push_str(&input[end_last_escape..i]);
149-
value.push('\n');
150-
i += 2;
151-
end_last_escape = i;
152-
} else {
153-
return Err(perr(i, IsolatedCr))
154-
}
155-
}
146+
b'\r' => return Err(perr(i, CarriageReturn)),
156147
b'"' => {
157148
closing_quote_pos = Some(i);
158149
break;
@@ -184,14 +175,13 @@ pub(crate) fn unescape_string<E: Escapee>(
184175
Ok((value, start_suffix))
185176
}
186177

187-
/// Reads and checks a raw (byte) string literal, converting `\r\n` sequences to
188-
/// just `\n` sequences. Returns an optional new string (if the input contained
189-
/// any `\r\n`) and the number of hashes used by the literal.
178+
/// Reads and checks a raw (byte) string literal. Returns the number of hashes
179+
/// and the index where the suffix starts.
190180
#[inline(never)]
191181
pub(crate) fn scan_raw_string<E: Escapee>(
192182
input: &str,
193183
offset: usize,
194-
) -> Result<(Option<String>, u32, usize), ParseError> {
184+
) -> Result<(u32, usize), ParseError> {
195185
// Raw string literal
196186
let num_hashes = input[offset..].bytes().position(|b| b != b'#')
197187
.ok_or(perr(None, InvalidLiteral))?;
@@ -204,31 +194,18 @@ pub(crate) fn scan_raw_string<E: Escapee>(
204194

205195
let mut closing_quote_pos = None;
206196
let mut i = start_inner;
207-
let mut end_last_escape = start_inner;
208-
let mut value = String::new();
209197
while i < input.len() {
210198
let b = input.as_bytes()[i];
211199
if b == b'"' && input[i + 1..].starts_with(hashes) {
212200
closing_quote_pos = Some(i);
213201
break;
214202
}
215203

204+
// CR characters are always disallowed in all (raw) strings. Rust performs
205+
// a normalization of CR LF to just LF in a pass prior to lexing. But
206+
// in lexing, it's disallowed.
216207
if b == b'\r' {
217-
// Convert `\r\n` into `\n`. This is currently not well documented
218-
// in the Rust reference, but is done even for raw strings. That's
219-
// because rustc simply converts all line endings when reading
220-
// source files.
221-
if input.as_bytes().get(i + 1) == Some(&b'\n') {
222-
value.push_str(&input[end_last_escape..i]);
223-
value.push('\n');
224-
i += 2;
225-
end_last_escape = i;
226-
continue;
227-
} else if E::SUPPORTS_UNICODE {
228-
// If no \n follows the \r and we are scanning a raw string
229-
// (not raw byte string), we error.
230-
return Err(perr(i, IsolatedCr))
231-
}
208+
return Err(perr(i, CarriageReturn));
232209
}
233210

234211
if !E::SUPPORTS_UNICODE {
@@ -246,17 +223,5 @@ pub(crate) fn scan_raw_string<E: Escapee>(
246223
let suffix = &input[start_suffix..];
247224
check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?;
248225

249-
// `value` is only empty if there was no \r\n in the input string (with the
250-
// special case of the input being empty). This means the string value
251-
// equals the input, so we store `None`.
252-
let value = if value.is_empty() {
253-
None
254-
} else {
255-
// There was an \r\n in the string, so we need to push the remaining
256-
// unescaped part of the string still.
257-
value.push_str(&input[end_last_escape..closing_quote_pos]);
258-
Some(value)
259-
};
260-
261-
Ok((value, num_hashes as u32, start_suffix))
226+
Ok((num_hashes as u32, start_suffix))
262227
}

src/string/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ impl<B: Buffer> fmt::Display for StringLit<B> {
113113
pub(crate) fn parse_impl(input: &str) -> Result<(Option<String>, Option<u32>, usize), ParseError> {
114114
if input.starts_with('r') {
115115
scan_raw_string::<char>(&input, 1)
116-
.map(|(v, hashes, start_suffix)| (v, Some(hashes), start_suffix))
116+
.map(|(hashes, start_suffix)| (None, Some(hashes), start_suffix))
117117
} else {
118118
unescape_string::<char>(&input, 1)
119119
.map(|(v, start_suffix)| (v, None, start_suffix))

src/string/tests.rs

Lines changed: 5 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ fn string_continue() {
127127
banana", true, None);
128128

129129
// Weird whitespace characters
130-
let lit = StringLit::parse("\"foo\\\n\r\t\n \n\tbar\"").expect("failed to parse");
130+
let lit = StringLit::parse("\"foo\\\n\t\n \n\tbar\"").expect("failed to parse");
131131
assert_eq!(lit.value(), "foobar");
132132
let lit = StringLit::parse("\"foo\\\n\u{85}bar\"").expect("failed to parse");
133133
assert_eq!(lit.value(), "foo\u{85}bar");
@@ -139,27 +139,6 @@ fn string_continue() {
139139
bar", false, Some(0));
140140
}
141141

142-
#[test]
143-
fn crlf_newlines() {
144-
let lit = StringLit::parse("\"foo\r\nbar\"").expect("failed to parse");
145-
assert_eq!(lit.value(), "foo\nbar");
146-
147-
let lit = StringLit::parse("\"\r\nbar\"").expect("failed to parse");
148-
assert_eq!(lit.value(), "\nbar");
149-
150-
let lit = StringLit::parse("\"лиса\r\n\"").expect("failed to parse");
151-
assert_eq!(lit.value(), "лиса\n");
152-
153-
let lit = StringLit::parse("r\"foo\r\nbar\"").expect("failed to parse");
154-
assert_eq!(lit.value(), "foo\nbar");
155-
156-
let lit = StringLit::parse("r#\"\r\nbar\"#").expect("failed to parse");
157-
assert_eq!(lit.value(), "\nbar");
158-
159-
let lit = StringLit::parse("r##\"лиса\r\n\"##").expect("failed to parse");
160-
assert_eq!(lit.value(), "лиса\n");
161-
}
162-
163142
#[test]
164143
fn raw_string() {
165144
check!(r"", false, Some(0));
@@ -211,10 +190,10 @@ fn parse_err() {
211190
assert_err!(StringLit, r#""fox"peter""#, InvalidSuffix, 5);
212191
assert_err!(StringLit, r###"r#"foo "# bar"#"###, UnexpectedChar, 9);
213192

214-
assert_err!(StringLit, "\"\r\"", IsolatedCr, 1);
215-
assert_err!(StringLit, "\"fo\rx\"", IsolatedCr, 3);
216-
assert_err!(StringLit, "r\"\r\"", IsolatedCr, 2);
217-
assert_err!(StringLit, "r\"fo\rx\"", IsolatedCr, 4);
193+
assert_err!(StringLit, "\"\r\"", CarriageReturn, 1);
194+
assert_err!(StringLit, "\"fo\rx\"", CarriageReturn, 3);
195+
assert_err!(StringLit, "r\"\r\"", CarriageReturn, 2);
196+
assert_err!(StringLit, "r\"fo\rx\"", CarriageReturn, 4);
218197

219198
assert_err!(StringLit, r##"r####""##, UnterminatedRawString, None);
220199
assert_err!(StringLit, r#####"r##"foo"#bar"#####, UnterminatedRawString, None);

0 commit comments

Comments
 (0)