Remove CR LF normalization to align with spec

LukasKalbertodt · LukasKalbertodt · commit 15edbcd13887 · 2025-07-25T14:45:32.000+02:00
The specification now says that CR LF normalization is part of the pre-processing that happens prior to tokenization. Since it is not part of the lexical grammar, this commit removes it from the parsing code as well. CR characters are now disallowed entirely in (raw) string and (raw) byte string literals. This is technically a breaking change, but it is unlikely to be noticed with any real-world input. In proc macros, all input has already been normalized by the Rust compiler anyway, so only very unusual input (CR CR LF) would have been accepted previously but is rejected after this commit. See: - rust-lang/reference#1944 - rust-lang/reference@fa56fdb - rust-lang/reference@27e1ec9
diff --git a/src/bytestr/mod.rs b/src/bytestr/mod.rs
@@ -115,7 +115,7 @@ impl<B: Buffer> fmt::Display for ByteStringLit<B> {
 fn parse_impl(input: &str) -> Result<(Option<Vec<u8>>, Option<u32>, usize), ParseError> {
     if input.starts_with("br") {
         scan_raw_string::<u8>(&input, 2)
-            .map(|(v, num, start_suffix)| (v.map(String::into_bytes), Some(num), start_suffix))
+            .map(|(num, start_suffix)| (None, Some(num), start_suffix))
     } else {
         unescape_string::<u8>(&input, 2)
             .map(|(v, start_suffix)| (v.map(String::into_bytes), None, start_suffix))
diff --git a/src/bytestr/tests.rs b/src/bytestr/tests.rs
@@ -59,9 +59,6 @@ fn special_whitespace() {
             assert_eq!(ByteStringLit::parse(&*input).unwrap().into_value(), s.as_bytes());
         }
     }
-
-    let res = ByteStringLit::parse("br\"\r\"").expect("failed to parse");
-    assert_eq!(res.value(), b"\r");
 }
 
 #[test]
@@ -97,35 +94,14 @@ bar", true, None);
         banana", true, None);
 
     // Weird whitespace characters
-    let lit = ByteStringLit::parse("b\"foo\\\n\r\t\n \n\tbar\"").expect("failed to parse");
+    let lit = ByteStringLit::parse("b\"foo\\\n\t\n \n\tbar\"").expect("failed to parse");
     assert_eq!(lit.value(), b"foobar");
 
     // Raw strings do not handle "string continues"
     check!(br"foo\
         bar", false, Some(0));
 }
 
-#[test]
-fn crlf_newlines() {
-    let lit = ByteStringLit::parse("b\"foo\r\nbar\"").expect("failed to parse");
-    assert_eq!(lit.value(), b"foo\nbar");
-
-    let lit = ByteStringLit::parse("b\"\r\nbar\"").expect("failed to parse");
-    assert_eq!(lit.value(), b"\nbar");
-
-    let lit = ByteStringLit::parse("b\"foo\r\n\"").expect("failed to parse");
-    assert_eq!(lit.value(), b"foo\n");
-
-    let lit = ByteStringLit::parse("br\"foo\r\nbar\"").expect("failed to parse");
-    assert_eq!(lit.value(), b"foo\nbar");
-
-    let lit = ByteStringLit::parse("br#\"\r\nbar\"#").expect("failed to parse");
-    assert_eq!(lit.value(), b"\nbar");
-
-    let lit = ByteStringLit::parse("br##\"foo\r\n\"##").expect("failed to parse");
-    assert_eq!(lit.value(), b"foo\n");
-}
-
 #[test]
 fn raw_byte_string() {
     check!(br"", false, Some(0));
@@ -172,8 +148,12 @@ fn parse_err() {
     assert_err!(ByteStringLit, r#"b"fox"peter""#, InvalidSuffix, 6);
     assert_err!(ByteStringLit, r###"br#"foo "# bar"#"###, UnexpectedChar, 10);
 
-    assert_err!(ByteStringLit, "b\"\r\"", IsolatedCr, 2);
-    assert_err!(ByteStringLit, "b\"fo\rx\"", IsolatedCr, 4);
+    assert_err!(ByteStringLit, "b\"\r\"", CarriageReturn, 2);
+    assert_err!(ByteStringLit, "b\"fo\rx\"", CarriageReturn, 4);
+    assert_err!(ByteStringLit, "br\"\r\"", CarriageReturn, 3);
+    assert_err!(ByteStringLit, "br\"fo\rx\"", CarriageReturn, 5);
+    assert_err!(ByteStringLit, "b\"a\\\r\"", UnknownEscape, 3..5);
+    assert_err!(ByteStringLit, "br\"a\\\r\"", CarriageReturn, 5);
 
     assert_err!(ByteStringLit, r##"br####""##, UnterminatedRawString, None);
     assert_err!(ByteStringLit, r#####"br##"foo"#bar"#####, UnterminatedRawString, None);
diff --git a/src/err.rs b/src/err.rs
@@ -299,9 +299,8 @@ pub(crate) enum ParseErrorKind {
 
     InvalidByteStringLiteralStart,
 
-    /// An literal `\r` character not followed by a `\n` character in a
-    /// (raw) string or byte string literal.
-    IsolatedCr,
+    /// A literal `\r` character in a (raw) string or (raw) byte string literal.
+    CarriageReturn,
 
     /// Literal suffix is not a valid identifier.
     InvalidSuffix,
@@ -355,7 +354,7 @@ impl fmt::Display for ParseError {
             InvalidStringLiteralStart => "invalid start for string literal",
             InvalidByteLiteralStart => "invalid start for byte literal",
             InvalidByteStringLiteralStart => "invalid start for byte string literal",
-            IsolatedCr => r"`\r` not immediately followed by `\n` in string",
+            CarriageReturn => r"`\r` not allowed in string literals",
             InvalidSuffix => "literal suffix is not a valid identifier",
             UnexpectedIntegerLit => "expected float literal, but found integer",
             IntegerSuffixStartingWithE => "integer literal suffix must not start with 'e' or 'E'",
diff --git a/src/escape.rs b/src/escape.rs
@@ -109,7 +109,7 @@ impl Escapee for char {
 /// Checks whether the character is skipped after a string continue start
 /// (unescaped backlash followed by `\n`).
 fn is_string_continue_skipable_whitespace(b: u8) -> bool {
-    b == b' ' || b == b'\t' || b == b'\n' || b == b'\r'
+    b == b' ' || b == b'\t' || b == b'\n'
 }
 
 /// Unescapes a whole string or byte string.
@@ -143,16 +143,7 @@ pub(crate) fn unescape_string<E: Escapee>(
                 i += len;
                 end_last_escape = i;
             }
-            b'\r' => {
-                if input.as_bytes().get(i + 1) == Some(&b'\n') {
-                    value.push_str(&input[end_last_escape..i]);
-                    value.push('\n');
-                    i += 2;
-                    end_last_escape = i;
-                } else {
-                    return Err(perr(i, IsolatedCr))
-                }
-            }
+            b'\r' => return Err(perr(i, CarriageReturn)),
             b'"' => {
                 closing_quote_pos = Some(i);
                 break;
@@ -184,14 +175,13 @@ pub(crate) fn unescape_string<E: Escapee>(
     Ok((value, start_suffix))
 }
 
-/// Reads and checks a raw (byte) string literal, converting `\r\n` sequences to
-/// just `\n` sequences. Returns an optional new string (if the input contained
-/// any `\r\n`) and the number of hashes used by the literal.
+/// Reads and checks a raw (byte) string literal. Returns the number of hashes
+/// and the index where the suffix starts.
 #[inline(never)]
 pub(crate) fn scan_raw_string<E: Escapee>(
     input: &str,
     offset: usize,
-) -> Result<(Option<String>, u32, usize), ParseError> {
+) -> Result<(u32, usize), ParseError> {
     // Raw string literal
     let num_hashes = input[offset..].bytes().position(|b| b != b'#')
         .ok_or(perr(None, InvalidLiteral))?;
@@ -204,31 +194,18 @@ pub(crate) fn scan_raw_string<E: Escapee>(
 
     let mut closing_quote_pos = None;
     let mut i = start_inner;
-    let mut end_last_escape = start_inner;
-    let mut value = String::new();
     while i < input.len() {
         let b = input.as_bytes()[i];
         if b == b'"' && input[i + 1..].starts_with(hashes) {
             closing_quote_pos = Some(i);
             break;
         }
 
+        // CR is always disallowed in all (raw) strings. Rust normalizes CR LF
+        // to just LF in a pass prior to lexing, so any CR that is still
+        // present during lexing is an error.
         if b == b'\r' {
-            // Convert `\r\n` into `\n`. This is currently not well documented
-            // in the Rust reference, but is done even for raw strings. That's
-            // because rustc simply converts all line endings when reading
-            // source files.
-            if input.as_bytes().get(i + 1) == Some(&b'\n') {
-                value.push_str(&input[end_last_escape..i]);
-                value.push('\n');
-                i += 2;
-                end_last_escape = i;
-                continue;
-            } else if E::SUPPORTS_UNICODE {
-                // If no \n follows the \r and we are scanning a raw string
-                // (not raw byte string), we error.
-                return Err(perr(i, IsolatedCr))
-            }
+            return Err(perr(i, CarriageReturn));
         }
 
         if !E::SUPPORTS_UNICODE {
@@ -246,17 +223,5 @@ pub(crate) fn scan_raw_string<E: Escapee>(
     let suffix = &input[start_suffix..];
     check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?;
 
-    // `value` is only empty if there was no \r\n in the input string (with the
-    // special case of the input being empty). This means the string value
-    // equals the input, so we store `None`.
-    let value = if value.is_empty() {
-        None
-    } else {
-        // There was an \r\n in the string, so we need to push the remaining
-        // unescaped part of the string still.
-        value.push_str(&input[end_last_escape..closing_quote_pos]);
-        Some(value)
-    };
-
-    Ok((value, num_hashes as u32, start_suffix))
+    Ok((num_hashes as u32, start_suffix))
 }
diff --git a/src/string/mod.rs b/src/string/mod.rs
@@ -113,7 +113,7 @@ impl<B: Buffer> fmt::Display for StringLit<B> {
 pub(crate) fn parse_impl(input: &str) -> Result<(Option<String>, Option<u32>, usize), ParseError> {
     if input.starts_with('r') {
         scan_raw_string::<char>(&input, 1)
-            .map(|(v, hashes, start_suffix)| (v, Some(hashes), start_suffix))
+            .map(|(hashes, start_suffix)| (None, Some(hashes), start_suffix))
     } else {
         unescape_string::<char>(&input, 1)
             .map(|(v, start_suffix)| (v, None, start_suffix))
diff --git a/src/string/tests.rs b/src/string/tests.rs
@@ -127,7 +127,7 @@ fn string_continue() {
         banana", true, None);
 
     // Weird whitespace characters
-    let lit = StringLit::parse("\"foo\\\n\r\t\n \n\tbar\"").expect("failed to parse");
+    let lit = StringLit::parse("\"foo\\\n\t\n \n\tbar\"").expect("failed to parse");
     assert_eq!(lit.value(), "foobar");
     let lit = StringLit::parse("\"foo\\\n\u{85}bar\"").expect("failed to parse");
     assert_eq!(lit.value(), "foo\u{85}bar");
@@ -139,27 +139,6 @@ fn string_continue() {
         bar", false, Some(0));
 }
 
-#[test]
-fn crlf_newlines() {
-    let lit = StringLit::parse("\"foo\r\nbar\"").expect("failed to parse");
-    assert_eq!(lit.value(), "foo\nbar");
-
-    let lit = StringLit::parse("\"\r\nbar\"").expect("failed to parse");
-    assert_eq!(lit.value(), "\nbar");
-
-    let lit = StringLit::parse("\"лиса\r\n\"").expect("failed to parse");
-    assert_eq!(lit.value(), "лиса\n");
-
-    let lit = StringLit::parse("r\"foo\r\nbar\"").expect("failed to parse");
-    assert_eq!(lit.value(), "foo\nbar");
-
-    let lit = StringLit::parse("r#\"\r\nbar\"#").expect("failed to parse");
-    assert_eq!(lit.value(), "\nbar");
-
-    let lit = StringLit::parse("r##\"лиса\r\n\"##").expect("failed to parse");
-    assert_eq!(lit.value(), "лиса\n");
-}
-
 #[test]
 fn raw_string() {
     check!(r"", false, Some(0));
@@ -211,10 +190,10 @@ fn parse_err() {
     assert_err!(StringLit, r#""fox"peter""#, InvalidSuffix, 5);
     assert_err!(StringLit, r###"r#"foo "# bar"#"###, UnexpectedChar, 9);
 
-    assert_err!(StringLit, "\"\r\"", IsolatedCr, 1);
-    assert_err!(StringLit, "\"fo\rx\"", IsolatedCr, 3);
-    assert_err!(StringLit, "r\"\r\"", IsolatedCr, 2);
-    assert_err!(StringLit, "r\"fo\rx\"", IsolatedCr, 4);
+    assert_err!(StringLit, "\"\r\"", CarriageReturn, 1);
+    assert_err!(StringLit, "\"fo\rx\"", CarriageReturn, 3);
+    assert_err!(StringLit, "r\"\r\"", CarriageReturn, 2);
+    assert_err!(StringLit, "r\"fo\rx\"", CarriageReturn, 4);
 
     assert_err!(StringLit, r##"r####""##, UnterminatedRawString, None);
     assert_err!(StringLit, r#####"r##"foo"#bar"#####, UnterminatedRawString, None);