diff --git a/Cargo.lock b/Cargo.lock index f882e59a2834..b7e9a54f2dc7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2521,7 +2521,7 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "hstr" -version = "3.0.1" +version = "3.0.3" dependencies = [ "compact_str", "criterion", diff --git a/crates/hstr/Cargo.toml b/crates/hstr/Cargo.toml index 9834315feab8..0fce3074d522 100644 --- a/crates/hstr/Cargo.toml +++ b/crates/hstr/Cargo.toml @@ -5,7 +5,7 @@ edition = { workspace = true } license = { workspace = true } name = "hstr" repository = { workspace = true } -version = "3.0.1" +version = "3.0.3" [lib] bench = false diff --git a/crates/hstr/src/wtf8_atom.rs b/crates/hstr/src/wtf8_atom.rs index 6fabec997bb4..dcb9df27be92 100644 --- a/crates/hstr/src/wtf8_atom.rs +++ b/crates/hstr/src/wtf8_atom.rs @@ -100,9 +100,43 @@ impl serde::ser::Serialize for Wtf8Atom { // By escaping literal '\u' to '\\u', we ensure: // - Unpaired surrogates serialize as '\uXXXX' // - Literal '\u' text serializes as '\\uXXXX' + // + // However, we should only escape '\u' if it's followed by exactly 4 hex digits, + // which would indicate a Unicode escape sequence. Otherwise, '\u' followed by + // non-hex characters (like '\util') should not be escaped. if c == '\\' && iter.peek().map(|cp| cp.to_u32()) == Some('u' as u32) { - iter.next(); // skip 'u' - result.push_str("\\\\u"); + // Look ahead to see if this is followed by exactly 4 hex digits + let mut lookahead = iter.clone(); + lookahead.next(); // skip 'u' + + let mut hex_count = 0; + let mut all_hex = true; + for _ in 0..4 { + if let Some(next_cp) = lookahead.next() { + if let Some(next_c) = next_cp.to_char() { + if next_c.is_ascii_hexdigit() { + hex_count += 1; + } else { + all_hex = false; + break; + } + } else { + all_hex = false; + break; + } + } else { + all_hex = false; + break; + } + } + + // Only escape if we have exactly 4 hex digits after '\u' + if hex_count == 4 && all_hex { + iter.next(); // skip 'u' + result.push_str("\\\\u"); + } else { + result.push(c); + } } else { result.push(c) } @@ -553,4 +587,32 @@ mod tests { let err_atom = result.unwrap_err(); assert_eq!(err_atom.to_string_lossy(), "\u{FFFD}"); } + + #[test] + fn test_backslash_util_issue_11214() { + let atom = + Wtf8Atom::from("C:\\github\\swc-plugin-coverage-instrument\\spec\\util\\verifier.ts"); + let serialized = serde_json::to_string(&atom).unwrap(); + + assert!( + !serialized.contains("spec\\\\\\\\util"), + "Found quadruple backslashes in spec segment! Serialized: {serialized}" + ); + + assert!( + serialized.contains("spec\\\\util"), + "Expected double backslashes in spec segment not found! Serialized: {serialized}", + ); + + // The expected serialized value should have consistent escaping + let expected = r#""C:\\github\\swc-plugin-coverage-instrument\\spec\\util\\verifier.ts""#; + assert_eq!( + serialized, expected, + "Serialized value should have consistent backslash escaping" + ); + + // Test round-trip + let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap(); + assert_eq!(atom, deserialized); + } } diff --git a/crates/swc_ecma_lexer/src/common/lexer/mod.rs b/crates/swc_ecma_lexer/src/common/lexer/mod.rs index 852877860f3f..74ce7b40ef03 100644 --- a/crates/swc_ecma_lexer/src/common/lexer/mod.rs +++ b/crates/swc_ecma_lexer/src/common/lexer/mod.rs @@ -1462,6 +1462,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { let c = match c { '\\' => '\\', + '\'' => '\'', + '"' => '"', 'n' => '\n', 'r' => '\r', 't' => '\t', @@ -1557,7 +1559,26 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { return Ok(CodePoint::from_u32(value as u32)); } - _ => c, + // For unrecognized escape sequences, return the backslash and don't consume + // the following character. According to ECMAScript, when a backslash precedes + // a character that doesn't form a valid escape sequence, both the backslash + // and the character should be preserved in the string value. + // + // However, in strict mode, unrecognized escape sequences are syntax errors. + // In template literals, they should always be errors (pre-ES2018 behavior). + _ => { + // In template literals, unrecognized escape sequences are always errors + if in_template { + self.error(start, SyntaxError::InvalidStrEscape)? + } + + // In strict mode, unrecognized escape sequences are syntax errors + self.emit_strict_mode_error(start, SyntaxError::InvalidStrEscape); + + // Don't bump - let the following character be read normally in the next + // iteration + return Ok(Some(CodePoint::from_char('\\'))); + } }; unsafe { diff --git a/crates/swc_ecma_lexer/src/lexer/tests.rs b/crates/swc_ecma_lexer/src/lexer/tests.rs index 338521f071ec..0c5539861c94 100644 --- a/crates/swc_ecma_lexer/src/lexer/tests.rs +++ b/crates/swc_ecma_lexer/src/lexer/tests.rs @@ -2241,3 +2241,51 @@ fn issue_9106() { ] ); } + +#[test] +fn issue_11214_windows_path_escape() { + // Test for Windows file paths with backslashes + // When a backslash precedes a character that doesn't form a valid escape + // sequence, the backslash should be treated as a literal backslash + assert_eq!( + lex_tokens( + Syntax::default(), + r#""C:\\github\\swc-plugin-coverage-instrument\\spec\\util\\verifier.ts""# + ), + vec![Token::Str { + value: atom!("C:\\github\\swc-plugin-coverage-instrument\\spec\\util\\verifier.ts") + .into(), + raw: atom!(r#""C:\\github\\swc-plugin-coverage-instrument\\spec\\util\\verifier.ts""#), + }] + ); +} + +#[test] +fn issue_11214_unrecognized_escape_sequences() { + // Test various unrecognized escape sequences + // According to ECMAScript, \s, \g, \a etc. (when not part of a valid escape) + // preserve the backslash: the value should be backslash + character + assert_eq!( + lex_tokens(Syntax::default(), r#""\s""#), + vec![Token::Str { + value: atom!(r"\s").into(), + raw: atom!(r#""\s""#), + }] + ); + + assert_eq!( + lex_tokens(Syntax::default(), r#""\g""#), + vec![Token::Str { + value: atom!(r"\g").into(), + raw: atom!(r#""\g""#), + }] + ); + + assert_eq!( + lex_tokens(Syntax::default(), r#""\a""#), + vec![Token::Str { + value: atom!(r"\a").into(), + raw: atom!(r#""\a""#), + }] + ); +}