perf: detect Latin1-encodable strings at intern time instead of per-c… (boa-dev#4896)

linisha15 · MayankRaj435 · commit 3c0fb8823795 · 2026-03-17T08:49:25.000+05:30
This Pull Request closes boa-dev#4881.

Background: When the bytecode compiler converts an interned string (`Sym`) to a `JsString`, it needs to decide whether to store it as Latin1 (1 byte per character) or UTF-16 (2 bytes per character). Previously, this was done by scanning every character of the string on each call — even if the same string was used many times.

What changed:
- The `Interner` now checks once, at the moment a string is first stored, whether all its characters fit in Latin1 (code point ≤ U+00FF). The result is saved in a new `latin1_flags` field.
- A new `is_latin1(sym)` method lets callers read that saved result instantly, without re-scanning the string.
- `ToJsString for Sym` in both `boa_ast` and `boa_engine::bytecompiler` now calls `is_latin1()` instead of scanning the string's characters every time.
- `From<&str> for JsString` was also fixed to correctly produce a Latin1 string for characters in the U+0080–U+00FF range, not just plain ASCII.
diff --git a/core/ast/src/lib.rs b/core/ast/src/lib.rs
@@ -121,15 +121,13 @@ pub(crate) trait ToJsString {
 impl ToJsString for Sym {
     #[allow(clippy::cast_possible_truncation)]
     fn to_js_string(&self, interner: &Interner) -> JsString {
-        // TODO: Identify latin1 encodeable strings during parsing to avoid this check.
-        let string = interner.resolve_expect(*self).utf16();
-        for c in string {
-            if u8::try_from(*c).is_err() {
-                return JsString::from(string);
-            }
+        let utf16 = interner.resolve_expect(*self).utf16();
+        if interner.is_latin1(*self) {
+            let bytes: Vec<u8> = utf16.iter().map(|&c| c as u8).collect();
+            JsString::from(JsStr::latin1(&bytes))
+        } else {
+            JsString::from(utf16)
         }
-        let string = string.iter().map(|c| *c as u8).collect::<Vec<_>>();
-        JsString::from(JsStr::latin1(&string))
     }
 }
 
diff --git a/core/engine/src/bytecompiler/mod.rs b/core/engine/src/bytecompiler/mod.rs
@@ -83,16 +83,15 @@ pub(crate) trait ToJsString {
 }
 
 impl ToJsString for Sym {
+    #[allow(clippy::cast_possible_truncation)]
     fn to_js_string(&self, interner: &Interner) -> JsString {
-        // TODO: Identify latin1 encodeable strings during parsing to avoid this check.
-        let string = interner.resolve_expect(*self).utf16();
-        for c in string {
-            if u8::try_from(*c).is_err() {
-                return js_string!(string);
-            }
+        let utf16 = interner.resolve_expect(*self).utf16();
+        if interner.is_latin1(*self) {
+            let bytes: Vec<u8> = utf16.iter().map(|&c| c as u8).collect();
+            js_string!(JsStr::latin1(&bytes))
+        } else {
+            js_string!(utf16)
         }
-        let string = string.iter().map(|c| *c as u8).collect::<Vec<_>>();
-        js_string!(JsStr::latin1(&string))
     }
 }
 
diff --git a/core/interner/src/lib.rs b/core/interner/src/lib.rs
@@ -18,7 +18,9 @@
 #![allow(
     clippy::redundant_pub_crate,
     // TODO deny once false positive is fixed (https://github.com/rust-lang/rust-clippy/issues/9626).
-    clippy::trait_duplication_in_bounds
+    clippy::trait_duplication_in_bounds,
+    // Field names intentionally mirror the encoding type they store.
+    clippy::struct_field_names
 )]
 #![cfg_attr(not(feature = "arbitrary"), no_std)]
 
@@ -32,7 +34,7 @@ mod sym;
 #[cfg(test)]
 mod tests;
 
-use alloc::{borrow::Cow, format, string::String};
+use alloc::{borrow::Cow, format, string::String, vec::Vec};
 use raw::RawInterner;
 
 pub use sym::*;
@@ -251,6 +253,8 @@ impl core::fmt::Display for JSInternedStrRef<'_, '_> {
 pub struct Interner {
     utf8_interner: RawInterner<u8>,
     utf16_interner: RawInterner<u16>,
+    /// Latin1-encodability cache for dynamically-interned strings (all code units ≤ 0xFF).
+    latin1_flags: Vec<bool>,
 }
 
 impl Interner {
@@ -288,6 +292,7 @@ impl Interner {
         Self {
             utf8_interner: RawInterner::with_capacity(capacity),
             utf16_interner: RawInterner::with_capacity(capacity),
+            latin1_flags: Vec::with_capacity(capacity),
         }
     }
 
@@ -410,6 +415,8 @@ impl Interner {
 
             assert_eq!(index, utf16_index);
 
+            self.latin1_flags.push(utf16.iter().all(|&c| c <= 0xFF));
+
             index
                 .checked_add(1 + COMMON_STRINGS_UTF8.len())
                 .and_then(Sym::new)
@@ -453,6 +460,8 @@ impl Interner {
 
             debug_assert_eq!(index, utf16_index);
 
+            self.latin1_flags.push(utf16.iter().all(|&c| c <= 0xFF));
+
             index
                 .checked_add(1 + COMMON_STRINGS_UTF8.len())
                 .and_then(Sym::new)
@@ -538,6 +547,39 @@ impl Interner {
         self.resolve(symbol).expect("string disappeared")
     }
 
+    /// Returns `true` if the string identified by `symbol` can be encoded as Latin1
+    /// (i.e. all code units are in the range `0x00..=0xFF`).
+    ///
+    /// This information is computed **once** when the string is first interned, so callers pay no
+    /// O(n) scanning cost beyond the initial intern call.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use boa_interner::Interner;
+    ///
+    /// let mut interner = Interner::new();
+    /// let ascii = interner.get_or_intern("hello");
+    /// assert!(interner.is_latin1(ascii));
+    ///
+    /// let non_latin1: Vec<u16> = vec![0x4e2d, 0x6587]; // "中文"
+    /// let sym = interner.get_or_intern(non_latin1.as_slice());
+    /// assert!(!interner.is_latin1(sym));
+    /// ```
+    #[inline]
+    #[must_use]
+    pub fn is_latin1(&self, symbol: Sym) -> bool {
+        let index = symbol.get() - 1;
+        if index < COMMON_STRINGS_UTF8.len() {
+            return true;
+        }
+        let dynamic_index = index - COMMON_STRINGS_UTF8.len();
+        self.latin1_flags
+            .get(dynamic_index)
+            .copied()
+            .unwrap_or(false)
+    }
+
     fn get_common(string: JStrRef<'_>) -> Option<Sym> {
         match string {
             JStrRef::Utf8(s) => COMMON_STRINGS_UTF8.get_index(s).map(|idx| {
diff --git a/core/interner/src/tests.rs b/core/interner/src/tests.rs
@@ -130,3 +130,40 @@ fn check_capacity() {
 
     assert!(interner.resolve(sym).is_none());
 }
+
+#[test]
+fn check_is_latin1() {
+    static STATIC_STR: &str = "static_latin1";
+    static STATIC_UTF16: &[u16] = &[
+        's' as u16, 't' as u16, 'a' as u16, 't' as u16, 'i' as u16, 'c' as u16, '_' as u16,
+        'l' as u16, 'a' as u16, 't' as u16, 'i' as u16, 'n' as u16, '1' as u16,
+    ];
+
+    let mut interner = Interner::default();
+
+    // Common/static strings (e.g. keywords) are always Latin1.
+    let common_sym = interner.get_or_intern("break");
+    assert!(interner.is_latin1(common_sym));
+
+    // Dynamic ASCII string.
+    let ascii_sym = interner.get_or_intern("hello_world");
+    assert!(interner.is_latin1(ascii_sym));
+
+    // Dynamic non-ASCII but Latin1-encodable (U+0080..=U+00FF).
+    let latin1_sym = interner.get_or_intern(&[0x00E9u16, 0x00FC, 0x00F1][..]);
+    assert!(interner.is_latin1(latin1_sym));
+
+    // Dynamic non-Latin1 (code unit > 0xFF).
+    let non_latin1_sym = interner.get_or_intern(&[0x4E2Du16, 0x6587][..]);
+    assert!(!interner.is_latin1(non_latin1_sym));
+
+    // Boundary: U+00FF (last Latin1) and U+0100 (first non-Latin1).
+    let boundary_sym = interner.get_or_intern(&[0x00FFu16][..]);
+    assert!(interner.is_latin1(boundary_sym));
+    let boundary_non_sym = interner.get_or_intern(&[0x0100u16][..]);
+    assert!(!interner.is_latin1(boundary_non_sym));
+
+    // get_or_intern_static also caches correctly.
+    let static_sym = interner.get_or_intern_static(STATIC_STR, STATIC_UTF16);
+    assert!(interner.is_latin1(static_sym));
+}
diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs
@@ -824,12 +824,18 @@ impl From<&[u16]> for JsString {
 impl From<&str> for JsString {
     #[inline]
     fn from(s: &str) -> Self {
-        // TODO: Check for latin1 encoding
         if s.is_ascii() {
             let js_str = JsStr::latin1(s.as_bytes());
             return StaticJsStrings::get_string(&js_str)
                 .unwrap_or_else(|| JsString::from_slice_skip_interning(js_str));
         }
+        // Non-ASCII but still Latin1-encodable (U+0080..=U+00FF): chars map 1-to-1 to u8.
+        if s.chars().all(|c| c as u32 <= 0xFF) {
+            let bytes: Vec<u8> = s.chars().map(|c| c as u8).collect();
+            let js_str = JsStr::latin1(&bytes);
+            return StaticJsStrings::get_string(&js_str)
+                .unwrap_or_else(|| JsString::from_slice_skip_interning(js_str));
+        }
         let s = s.encode_utf16().collect::<Vec<_>>();
         JsString::from_slice_skip_interning(JsStr::utf16(&s[..]))
     }

Original file line number	Diff line number	Diff line change
`@@ -83,16 +83,15 @@ pub(crate) trait ToJsString {`
`83`	`83`	`}`
`84`	`84`
`85`	`85`	`impl ToJsString for Sym {`
	`86`	`+ #[allow(clippy::cast_possible_truncation)]`
`86`	`87`	`fn to_js_string(&self, interner: &Interner) -> JsString {`
`87`		`- // TODO: Identify latin1 encodeable strings during parsing to avoid this check.`
`88`		`- let string = interner.resolve_expect(*self).utf16();`
`89`		`- for c in string {`
`90`		`- if u8::try_from(*c).is_err() {`
`91`		`- return js_string!(string);`
`92`		`- }`
	`88`	`+ let utf16 = interner.resolve_expect(*self).utf16();`
	`89`	`+ if interner.is_latin1(*self) {`
	`90`	`+ let bytes: Vec<u8> = utf16.iter().map(\|&c\| c as u8).collect();`
	`91`	`+ js_string!(JsStr::latin1(&bytes))`
	`92`	`+ } else {`
	`93`	`+ js_string!(utf16)`
`93`	`94`	`}`
`94`		`- let string = string.iter().map(\|c\| *c as u8).collect::<Vec<_>>();`
`95`		`- js_string!(JsStr::latin1(&string))`
`96`	`95`	`}`
`97`	`96`	`}`
`98`	`97`