From 82dece19c7b7132d5e7ab480ddf3d1e6d59e301e Mon Sep 17 00:00:00 2001 From: MayankRaj435 <1bi22ri032@bit-bangalore.edu.in> Date: Mon, 16 Mar 2026 12:19:22 +0530 Subject: [PATCH] Refactor: Optimize hex escape parsing in lexer --- core/parser/src/lexer/cursor.rs | 21 +----- core/parser/src/lexer/string.rs | 122 ++++++++++++++------------------ 2 files changed, 53 insertions(+), 90 deletions(-) diff --git a/core/parser/src/lexer/cursor.rs b/core/parser/src/lexer/cursor.rs index 97624b11e64..cd562068257 100644 --- a/core/parser/src/lexer/cursor.rs +++ b/core/parser/src/lexer/cursor.rs @@ -2,7 +2,7 @@ use crate::source::{ReadChar, UTF8Input}; use boa_ast::{LinearPosition, Position, PositionGroup, SourceText}; -use std::io::{self, Error, ErrorKind}; +use std::io::{self, Error}; /// Cursor over the source code. #[derive(Debug)] @@ -139,25 +139,6 @@ impl Cursor { }) } - /// Fills the buffer with all bytes until the stop byte is found. - /// Returns error when reaching the end of the buffer. - /// - /// Note that all bytes up until the stop byte are added to the buffer, including the byte right before. - pub(super) fn take_until(&mut self, stop: u32, buf: &mut Vec) -> io::Result<()> { - loop { - if self.next_if(stop)? { - return Ok(()); - } else if let Some(c) = self.next_char()? { - buf.push(c); - } else { - return Err(Error::new( - ErrorKind::UnexpectedEof, - format!("Unexpected end of file when looking for character {stop}"), - )); - } - } - } - /// Fills a mutable slice up to the ends while characters are alphabetic. Returns /// the number of characters read, or `N+1` if the buffer was filled but there were /// still characters after. diff --git a/core/parser/src/lexer/string.rs b/core/parser/src/lexer/string.rs index 064b7dfcbaf..ec75df03664 100644 --- a/core/parser/src/lexer/string.rs +++ b/core/parser/src/lexer/string.rs @@ -250,69 +250,63 @@ impl StringLiteral { { // Support \u{X..X} (Unicode CodePoint) if cursor.next_if(0x7B /* { */)? { - // TODO: use bytes for a bit better performance (using stack) - let mut code_point_buf = Vec::with_capacity(6); - cursor.take_until(0x7D /* } */, &mut code_point_buf)?; - - let mut s = String::with_capacity(code_point_buf.len()); - for c in code_point_buf { - if let Some(c) = char::from_u32(c) { - s.push(c); - } else { + let mut code_point = 0u32; + let mut first_digit = true; + loop { + let pos = cursor.pos(); + let Some(c) = cursor.next_char()? else { + return Err(Error::syntax( + "Unexpected end of file when looking for character }", + pos, + )); + }; + if c == 0x7D + /* } */ + { + if first_digit { + return Err(Error::syntax( + "malformed Unicode character escape sequence", + start_pos, + )); + } + break; + } + + let Some(digit) = char::from_u32(c).and_then(|c| c.to_digit(16)) else { return Err(Error::syntax( "malformed Unicode character escape sequence", start_pos, )); - } - } + }; - let Ok(code_point) = u32::from_str_radix(&s, 16) else { - return Err(Error::syntax( - "malformed Unicode character escape sequence", - start_pos, - )); - }; + code_point = (code_point << 4) | digit; - // UTF16Encoding of a numeric code point value - if code_point > 0x10_FFFF { - return Err(Error::syntax( - "Unicode codepoint must not be greater than 0x10FFFF in escape sequence", - start_pos, - )); + if code_point > 0x10_FFFF { + return Err(Error::syntax( + "Unicode codepoint must not be greater than 0x10FFFF in escape sequence", + start_pos, + )); + } + first_digit = false; } Ok(code_point) } else { // Grammar: Hex4Digits // Collect each character after \u e.g \uD83D will give "D83D" - let mut buffer = [0u32; 4]; - buffer[0] = cursor - .next_char()? - .ok_or_else(|| Error::syntax("invalid Unicode escape sequence", start_pos))?; - buffer[1] = cursor - .next_char()? - .ok_or_else(|| Error::syntax("invalid Unicode escape sequence", start_pos))?; - buffer[2] = cursor - .next_char()? - .ok_or_else(|| Error::syntax("invalid Unicode escape sequence", start_pos))?; - buffer[3] = cursor - .next_char()? - .ok_or_else(|| Error::syntax("invalid Unicode escape sequence", start_pos))?; - - let mut s = String::with_capacity(buffer.len()); - for c in buffer { - if let Some(c) = char::from_u32(c) { - s.push(c); - } else { + let mut code_point = 0u32; + for _ in 0..4 { + let pos = cursor.pos(); + let c = cursor + .next_char()? + .ok_or_else(|| Error::syntax("invalid Unicode escape sequence", pos))?; + let Some(digit) = char::from_u32(c).and_then(|c| c.to_digit(16)) else { return Err(Error::syntax("invalid Unicode escape sequence", start_pos)); - } + }; + code_point = (code_point << 4) | digit; } - let Ok(code_point) = u16::from_str_radix(&s, 16) else { - return Err(Error::syntax("invalid Unicode escape sequence", start_pos)); - }; - - Ok(u32::from(code_point)) + Ok(code_point) } } @@ -323,34 +317,22 @@ impl StringLiteral { where R: ReadChar, { - let mut buffer = [0u32; 2]; - buffer[0] = cursor - .next_char()? - .ok_or_else(|| Error::syntax("invalid Hexadecimal escape sequence", start_pos))?; - buffer[1] = cursor - .next_char()? - .ok_or_else(|| Error::syntax("invalid Hexadecimal escape sequence", start_pos))?; - - let mut s = String::with_capacity(buffer.len()); - for c in buffer { - if let Some(c) = char::from_u32(c) { - s.push(c); - } else { + let mut code_point = 0u32; + for _ in 0..2 { + let pos = cursor.pos(); + let c = cursor + .next_char()? + .ok_or_else(|| Error::syntax("invalid Hexadecimal escape sequence", pos))?; + let Some(digit) = char::from_u32(c).and_then(|c| c.to_digit(16)) else { return Err(Error::syntax( "invalid Hexadecimal escape sequence", start_pos, )); - } + }; + code_point = (code_point << 4) | digit; } - let Ok(code_point) = u16::from_str_radix(&s, 16) else { - return Err(Error::syntax( - "invalid Hexadecimal escape sequence", - start_pos, - )); - }; - - Ok(u32::from(code_point)) + Ok(code_point) } fn take_legacy_octal_escape_sequence(