Skip to content

Commit 3c0fb88

Browse files
linisha15 MayankRaj435
authored and committed
perf: detect Latin1-encodable strings at intern time instead of per-c… (boa-dev#4896)
This Pull Request closes boa-dev#4881.

Background: When the bytecode compiler converts an interned string (`Sym`) to a `JsString`, it needs to decide whether to store it as Latin1 (1 byte per character) or UTF-16 (2 bytes per code unit). Previously, this was done by scanning every code unit of the string on each call, even if the same string was used many times.

What changed:
- The `Interner` now checks once, at the moment a string is first stored, whether all its characters fit in Latin1 (code point ≤ U+00FF). The result is saved in a new `latin1_flags` field.
- A new `is_latin1(sym)` method lets callers read that saved result instantly, without re-scanning the string.
- `ToJsString for Sym` in both `boa_ast` and `boa_engine::bytecompiler` now calls `is_latin1()` instead of scanning the string's characters every time.
- `From<&str> for JsString` was also fixed to correctly produce a Latin1 string for characters in the U+0080–U+00FF range, not just plain ASCII.
1 parent dd9ac1e commit 3c0fb88

File tree

5 files changed

+101
-19
lines changed

5 files changed

+101
-19
lines changed

core/ast/src/lib.rs

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -121,15 +121,13 @@ pub(crate) trait ToJsString {
121121
impl ToJsString for Sym {
122122
#[allow(clippy::cast_possible_truncation)]
123123
fn to_js_string(&self, interner: &Interner) -> JsString {
124-
// TODO: Identify latin1 encodeable strings during parsing to avoid this check.
125-
let string = interner.resolve_expect(*self).utf16();
126-
for c in string {
127-
if u8::try_from(*c).is_err() {
128-
return JsString::from(string);
129-
}
124+
let utf16 = interner.resolve_expect(*self).utf16();
125+
if interner.is_latin1(*self) {
126+
let bytes: Vec<u8> = utf16.iter().map(|&c| c as u8).collect();
127+
JsString::from(JsStr::latin1(&bytes))
128+
} else {
129+
JsString::from(utf16)
130130
}
131-
let string = string.iter().map(|c| *c as u8).collect::<Vec<_>>();
132-
JsString::from(JsStr::latin1(&string))
133131
}
134132
}
135133

core/engine/src/bytecompiler/mod.rs

Lines changed: 7 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -83,16 +83,15 @@ pub(crate) trait ToJsString {
8383
}
8484

8585
impl ToJsString for Sym {
86+
#[allow(clippy::cast_possible_truncation)]
8687
fn to_js_string(&self, interner: &Interner) -> JsString {
87-
// TODO: Identify latin1 encodeable strings during parsing to avoid this check.
88-
let string = interner.resolve_expect(*self).utf16();
89-
for c in string {
90-
if u8::try_from(*c).is_err() {
91-
return js_string!(string);
92-
}
88+
let utf16 = interner.resolve_expect(*self).utf16();
89+
if interner.is_latin1(*self) {
90+
let bytes: Vec<u8> = utf16.iter().map(|&c| c as u8).collect();
91+
js_string!(JsStr::latin1(&bytes))
92+
} else {
93+
js_string!(utf16)
9394
}
94-
let string = string.iter().map(|c| *c as u8).collect::<Vec<_>>();
95-
js_string!(JsStr::latin1(&string))
9695
}
9796
}
9897

core/interner/src/lib.rs

Lines changed: 44 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,9 @@
1818
#![allow(
1919
clippy::redundant_pub_crate,
2020
// TODO deny once false positive is fixed (https://github.com/rust-lang/rust-clippy/issues/9626).
21-
clippy::trait_duplication_in_bounds
21+
clippy::trait_duplication_in_bounds,
22+
// Field names intentionally mirror the encoding type they store.
23+
clippy::struct_field_names
2224
)]
2325
#![cfg_attr(not(feature = "arbitrary"), no_std)]
2426

@@ -32,7 +34,7 @@ mod sym;
3234
#[cfg(test)]
3335
mod tests;
3436

35-
use alloc::{borrow::Cow, format, string::String};
37+
use alloc::{borrow::Cow, format, string::String, vec::Vec};
3638
use raw::RawInterner;
3739

3840
pub use sym::*;
@@ -251,6 +253,8 @@ impl core::fmt::Display for JSInternedStrRef<'_, '_> {
251253
pub struct Interner {
252254
utf8_interner: RawInterner<u8>,
253255
utf16_interner: RawInterner<u16>,
256+
/// Latin1-encodability cache for dynamically-interned strings (all code units ≤ 0xFF).
257+
latin1_flags: Vec<bool>,
254258
}
255259

256260
impl Interner {
@@ -288,6 +292,7 @@ impl Interner {
288292
Self {
289293
utf8_interner: RawInterner::with_capacity(capacity),
290294
utf16_interner: RawInterner::with_capacity(capacity),
295+
latin1_flags: Vec::with_capacity(capacity),
291296
}
292297
}
293298

@@ -410,6 +415,8 @@ impl Interner {
410415

411416
assert_eq!(index, utf16_index);
412417

418+
self.latin1_flags.push(utf16.iter().all(|&c| c <= 0xFF));
419+
413420
index
414421
.checked_add(1 + COMMON_STRINGS_UTF8.len())
415422
.and_then(Sym::new)
@@ -453,6 +460,8 @@ impl Interner {
453460

454461
debug_assert_eq!(index, utf16_index);
455462

463+
self.latin1_flags.push(utf16.iter().all(|&c| c <= 0xFF));
464+
456465
index
457466
.checked_add(1 + COMMON_STRINGS_UTF8.len())
458467
.and_then(Sym::new)
@@ -538,6 +547,39 @@ impl Interner {
538547
self.resolve(symbol).expect("string disappeared")
539548
}
540549

550+
/// Returns `true` if the string identified by `symbol` can be encoded as Latin1
551+
/// (i.e. all code units are in the range `0x00..=0xFF`).
552+
///
553+
/// This information is computed **once** when the string is first interned, so callers pay no
554+
/// O(n) scanning cost beyond the initial intern call.
555+
///
556+
/// # Examples
557+
///
558+
/// ```
559+
/// use boa_interner::Interner;
560+
///
561+
/// let mut interner = Interner::new();
562+
/// let ascii = interner.get_or_intern("hello");
563+
/// assert!(interner.is_latin1(ascii));
564+
///
565+
/// let non_latin1: Vec<u16> = vec![0x4e2d, 0x6587]; // "中文"
566+
/// let sym = interner.get_or_intern(non_latin1.as_slice());
567+
/// assert!(!interner.is_latin1(sym));
568+
/// ```
569+
#[inline]
570+
#[must_use]
571+
pub fn is_latin1(&self, symbol: Sym) -> bool {
572+
let index = symbol.get() - 1;
573+
if index < COMMON_STRINGS_UTF8.len() {
574+
return true;
575+
}
576+
let dynamic_index = index - COMMON_STRINGS_UTF8.len();
577+
self.latin1_flags
578+
.get(dynamic_index)
579+
.copied()
580+
.unwrap_or(false)
581+
}
582+
541583
fn get_common(string: JStrRef<'_>) -> Option<Sym> {
542584
match string {
543585
JStrRef::Utf8(s) => COMMON_STRINGS_UTF8.get_index(s).map(|idx| {

core/interner/src/tests.rs

Lines changed: 37 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -130,3 +130,40 @@ fn check_capacity() {
130130

131131
assert!(interner.resolve(sym).is_none());
132132
}
133+
134+
#[test]
135+
fn check_is_latin1() {
136+
static STATIC_STR: &str = "static_latin1";
137+
static STATIC_UTF16: &[u16] = &[
138+
's' as u16, 't' as u16, 'a' as u16, 't' as u16, 'i' as u16, 'c' as u16, '_' as u16,
139+
'l' as u16, 'a' as u16, 't' as u16, 'i' as u16, 'n' as u16, '1' as u16,
140+
];
141+
142+
let mut interner = Interner::default();
143+
144+
// Common/static strings (e.g. keywords) are always Latin1.
145+
let common_sym = interner.get_or_intern("break");
146+
assert!(interner.is_latin1(common_sym));
147+
148+
// Dynamic ASCII string.
149+
let ascii_sym = interner.get_or_intern("hello_world");
150+
assert!(interner.is_latin1(ascii_sym));
151+
152+
// Dynamic non-ASCII but Latin1-encodable (U+0080..=U+00FF).
153+
let latin1_sym = interner.get_or_intern(&[0x00E9u16, 0x00FC, 0x00F1][..]);
154+
assert!(interner.is_latin1(latin1_sym));
155+
156+
// Dynamic non-Latin1 (code unit > 0xFF).
157+
let non_latin1_sym = interner.get_or_intern(&[0x4E2Du16, 0x6587][..]);
158+
assert!(!interner.is_latin1(non_latin1_sym));
159+
160+
// Boundary: U+00FF (last Latin1) and U+0100 (first non-Latin1).
161+
let boundary_sym = interner.get_or_intern(&[0x00FFu16][..]);
162+
assert!(interner.is_latin1(boundary_sym));
163+
let boundary_non_sym = interner.get_or_intern(&[0x0100u16][..]);
164+
assert!(!interner.is_latin1(boundary_non_sym));
165+
166+
// get_or_intern_static also caches correctly.
167+
let static_sym = interner.get_or_intern_static(STATIC_STR, STATIC_UTF16);
168+
assert!(interner.is_latin1(static_sym));
169+
}

core/string/src/lib.rs

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -824,12 +824,18 @@ impl From<&[u16]> for JsString {
824824
impl From<&str> for JsString {
825825
#[inline]
826826
fn from(s: &str) -> Self {
827-
// TODO: Check for latin1 encoding
828827
if s.is_ascii() {
829828
let js_str = JsStr::latin1(s.as_bytes());
830829
return StaticJsStrings::get_string(&js_str)
831830
.unwrap_or_else(|| JsString::from_slice_skip_interning(js_str));
832831
}
832+
// Non-ASCII but still Latin1-encodable (U+0080..=U+00FF): chars map 1-to-1 to u8.
833+
if s.chars().all(|c| c as u32 <= 0xFF) {
834+
let bytes: Vec<u8> = s.chars().map(|c| c as u8).collect();
835+
let js_str = JsStr::latin1(&bytes);
836+
return StaticJsStrings::get_string(&js_str)
837+
.unwrap_or_else(|| JsString::from_slice_skip_interning(js_str));
838+
}
833839
let s = s.encode_utf16().collect::<Vec<_>>();
834840
JsString::from_slice_skip_interning(JsStr::utf16(&s[..]))
835841
}

0 commit comments

Comments
 (0)