boa-dev · jedel1043 · Mar 13, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
@@ -121,15 +121,13 @@ pub(crate) trait ToJsString {
 impl ToJsString for Sym {
     #[allow(clippy::cast_possible_truncation)]
     fn to_js_string(&self, interner: &Interner) -> JsString {
-        // TODO: Identify latin1 encodeable strings during parsing to avoid this check.
-        let string = interner.resolve_expect(*self).utf16();
-        for c in string {
-            if u8::try_from(*c).is_err() {
-                return JsString::from(string);
-            }
+        let utf16 = interner.resolve_expect(*self).utf16();
+        if interner.is_latin1(*self) { // all code units <= 0xFF, so `as u8` is lossless
+            let bytes: Vec<u8> = utf16.iter().map(|&c| c as u8).collect();
+            JsString::from(JsStr::latin1(&bytes))
+        } else {
+            JsString::from(utf16)
         }
-        let string = string.iter().map(|c| *c as u8).collect::<Vec<_>>();
-        JsString::from(JsStr::latin1(&string))
     }
 }
 

@@ -74,16 +74,15 @@ pub(crate) trait ToJsString {
 }
 
 impl ToJsString for Sym {
+    #[allow(clippy::cast_possible_truncation)] // cast only runs once `is_latin1` holds
     fn to_js_string(&self, interner: &Interner) -> JsString {
-        // TODO: Identify latin1 encodeable strings during parsing to avoid this check.
-        let string = interner.resolve_expect(*self).utf16();
-        for c in string {
-            if u8::try_from(*c).is_err() {
-                return js_string!(string);
-            }
+        let utf16 = interner.resolve_expect(*self).utf16();
+        if interner.is_latin1(*self) {
+            let bytes: Vec<u8> = utf16.iter().map(|&c| c as u8).collect();
+            js_string!(JsStr::latin1(&bytes))
+        } else {
+            js_string!(utf16)
         }
-        let string = string.iter().map(|c| *c as u8).collect::<Vec<_>>();
-        js_string!(JsStr::latin1(&string))
     }
 }
 

@@ -18,7 +18,9 @@
 #![allow(
     clippy::redundant_pub_crate,
     // TODO deny once false positive is fixed (https://github.com/rust-lang/rust-clippy/issues/9626).
-    clippy::trait_duplication_in_bounds
+    clippy::trait_duplication_in_bounds,
+    // Field names intentionally mirror the encoding type they store.
+    clippy::struct_field_names
 )]
 #![cfg_attr(not(feature = "arbitrary"), no_std)]
 
@@ -32,7 +34,7 @@ mod sym;
 #[cfg(test)]
 mod tests;
 
-use alloc::{borrow::Cow, format, string::String};
+use alloc::{borrow::Cow, format, string::String, vec::Vec};
 use raw::RawInterner;
 
 pub use sym::*;
@@ -251,6 +253,8 @@ impl core::fmt::Display for JSInternedStrRef<'_, '_> {
 pub struct Interner {
     utf8_interner: RawInterner<u8>,
     utf16_interner: RawInterner<u16>,
+    /// Per-string Latin1 flags for dynamic strings, in intern order (`true`: all units ≤ 0xFF).
+    latin1_flags: Vec<bool>,
 }
 
 impl Interner {
@@ -288,6 +292,7 @@ impl Interner {
         Self {
             utf8_interner: RawInterner::with_capacity(capacity),
             utf16_interner: RawInterner::with_capacity(capacity),
+            latin1_flags: Vec::with_capacity(capacity),
         }
     }
 
@@ -410,6 +415,8 @@ impl Interner {
 
             assert_eq!(index, utf16_index);
 
+            self.latin1_flags.push(utf16.iter().all(|&c| c <= 0xFF));
+
             index
                 .checked_add(1 + COMMON_STRINGS_UTF8.len())
                 .and_then(Sym::new)
@@ -453,6 +460,8 @@ impl Interner {
 
             debug_assert_eq!(index, utf16_index);
 
+            self.latin1_flags.push(utf16.iter().all(|&c| c <= 0xFF));
+
             index
                 .checked_add(1 + COMMON_STRINGS_UTF8.len())
                 .and_then(Sym::new)
@@ -538,6 +547,39 @@ impl Interner {
         self.resolve(symbol).expect("string disappeared")
     }
 
+    /// Returns `true` if the string identified by `symbol` can be encoded as Latin1
+    /// (i.e. all code units are in the range `0x00..=0xFF`); unknown symbols yield `false`.
+    ///
+    /// For dynamically interned strings the answer is computed **once**, at intern time, so this
+    /// lookup never re-scans the string. Common (static) strings always report `true`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use boa_interner::Interner;
+    ///
+    /// let mut interner = Interner::new();
+    /// let ascii = interner.get_or_intern("hello");
+    /// assert!(interner.is_latin1(ascii));
+    ///
+    /// let non_latin1: Vec<u16> = vec![0x4e2d, 0x6587]; // "中文"
+    /// let sym = interner.get_or_intern(non_latin1.as_slice());
+    /// assert!(!interner.is_latin1(sym));
+    /// ```
+    #[inline]
+    #[must_use]
+    pub fn is_latin1(&self, symbol: Sym) -> bool {
+        let index = symbol.get() - 1; // undo the +1 offset applied when symbols are created
+        if index < COMMON_STRINGS_UTF8.len() {
+            return true; // common strings are treated as Latin1 without a lookup
+        }
+        let dynamic_index = index - COMMON_STRINGS_UTF8.len();
+        self.latin1_flags
+            .get(dynamic_index)
+            .copied()
+            .unwrap_or(false) // no recorded flag: report non-Latin1
+    }
+
     fn get_common(string: JStrRef<'_>) -> Option<Sym> {
         match string {
             JStrRef::Utf8(s) => COMMON_STRINGS_UTF8.get_index(s).map(|idx| {

@@ -130,3 +130,40 @@ fn check_capacity() {
 
     assert!(interner.resolve(sym).is_none());
 }
+
+#[test]
+fn check_is_latin1() {
+    static STATIC_STR: &str = "static_latin1";
+    static STATIC_UTF16: &[u16] = &[
+        's' as u16, 't' as u16, 'a' as u16, 't' as u16, 'i' as u16, 'c' as u16, '_' as u16,
+        'l' as u16, 'a' as u16, 't' as u16, 'i' as u16, 'n' as u16, '1' as u16,
+    ];
+
+    let mut interner = Interner::default();
+
+    // Common/static strings (e.g. keywords) are always Latin1.
+    let common_sym = interner.get_or_intern("break");
+    assert!(interner.is_latin1(common_sym));
+
+    // Dynamic ASCII string.
+    let ascii_sym = interner.get_or_intern("hello_world");
+    assert!(interner.is_latin1(ascii_sym));
+
+    // Dynamic non-ASCII but Latin1-encodable (U+0080..=U+00FF).
+    let latin1_sym = interner.get_or_intern(&[0x00E9u16, 0x00FC, 0x00F1][..]); // "éüñ"
+    assert!(interner.is_latin1(latin1_sym));
+
+    // Dynamic non-Latin1 (code unit > 0xFF).
+    let non_latin1_sym = interner.get_or_intern(&[0x4E2Du16, 0x6587][..]); // "中文"
+    assert!(!interner.is_latin1(non_latin1_sym));
+
+    // Boundary: U+00FF (last Latin1) and U+0100 (first non-Latin1).
+    let boundary_sym = interner.get_or_intern(&[0x00FFu16][..]);
+    assert!(interner.is_latin1(boundary_sym));
+    let boundary_non_sym = interner.get_or_intern(&[0x0100u16][..]);
+    assert!(!interner.is_latin1(boundary_non_sym));
+
+    // Strings interned through `get_or_intern_static` report the flag as well.
+    let static_sym = interner.get_or_intern_static(STATIC_STR, STATIC_UTF16);
+    assert!(interner.is_latin1(static_sym));
+}
@@ -824,12 +824,18 @@ impl From<&[u16]> for JsString {
 impl From<&str> for JsString {
     #[inline]
     fn from(s: &str) -> Self {
-        // TODO: Check for latin1 encoding
         if s.is_ascii() {
             let js_str = JsStr::latin1(s.as_bytes());
             return StaticJsStrings::get_string(&js_str)
                 .unwrap_or_else(|| JsString::from_slice_skip_interning(js_str));
         }
+        // Non-ASCII but still Latin1-encodable (U+0080..=U+00FF): chars map 1-to-1 to u8.
+        if s.chars().all(|c| c as u32 <= 0xFF) { // guarantees `c as u8` below is lossless
+            let bytes: Vec<u8> = s.chars().map(|c| c as u8).collect();
+            let js_str = JsStr::latin1(&bytes);
+            return StaticJsStrings::get_string(&js_str)
+                .unwrap_or_else(|| JsString::from_slice_skip_interning(js_str));
+        }
         let s = s.encode_utf16().collect::<Vec<_>>();
         JsString::from_slice_skip_interning(JsStr::utf16(&s[..]))
     }