Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions core/ast/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,15 +121,13 @@ pub(crate) trait ToJsString {
impl ToJsString for Sym {
#[allow(clippy::cast_possible_truncation)]
fn to_js_string(&self, interner: &Interner) -> JsString {
// TODO: Identify latin1 encodeable strings during parsing to avoid this check.
let string = interner.resolve_expect(*self).utf16();
for c in string {
if u8::try_from(*c).is_err() {
return JsString::from(string);
}
let utf16 = interner.resolve_expect(*self).utf16();
if interner.is_latin1(*self) {
let bytes: Vec<u8> = utf16.iter().map(|&c| c as u8).collect();
JsString::from(JsStr::latin1(&bytes))
} else {
JsString::from(utf16)
}
let string = string.iter().map(|c| *c as u8).collect::<Vec<_>>();
JsString::from(JsStr::latin1(&string))
}
}

Expand Down
15 changes: 7 additions & 8 deletions core/engine/src/bytecompiler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,16 +74,15 @@ pub(crate) trait ToJsString {
}

impl ToJsString for Sym {
#[allow(clippy::cast_possible_truncation)]
fn to_js_string(&self, interner: &Interner) -> JsString {
// TODO: Identify latin1 encodeable strings during parsing to avoid this check.
let string = interner.resolve_expect(*self).utf16();
for c in string {
if u8::try_from(*c).is_err() {
return js_string!(string);
}
let utf16 = interner.resolve_expect(*self).utf16();
if interner.is_latin1(*self) {
let bytes: Vec<u8> = utf16.iter().map(|&c| c as u8).collect();
js_string!(JsStr::latin1(&bytes))
} else {
js_string!(utf16)
}
let string = string.iter().map(|c| *c as u8).collect::<Vec<_>>();
js_string!(JsStr::latin1(&string))
}
}

Expand Down
46 changes: 44 additions & 2 deletions core/interner/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
#![allow(
clippy::redundant_pub_crate,
// TODO deny once false positive is fixed (https://github.com/rust-lang/rust-clippy/issues/9626).
clippy::trait_duplication_in_bounds
clippy::trait_duplication_in_bounds,
// Field names intentionally mirror the encoding type they store.
clippy::struct_field_names
)]
#![cfg_attr(not(feature = "arbitrary"), no_std)]

Expand All @@ -32,7 +34,7 @@ mod sym;
#[cfg(test)]
mod tests;

use alloc::{borrow::Cow, format, string::String};
use alloc::{borrow::Cow, format, string::String, vec::Vec};
use raw::RawInterner;

pub use sym::*;
Expand Down Expand Up @@ -251,6 +253,8 @@ impl core::fmt::Display for JSInternedStrRef<'_, '_> {
pub struct Interner {
/// Raw interner over `u8` data; presumably holds the UTF-8 form of each
/// dynamically interned string (per the field name — confirm against `RawInterner`).
utf8_interner: RawInterner<u8>,
/// Raw interner over `u16` data; presumably holds the UTF-16 form of the same
/// strings. The interning paths assert that both interners hand back the same index.
utf16_interner: RawInterner<u16>,
/// Latin1-encodability cache for dynamically-interned strings (all code units ≤ 0xFF).
///
/// One flag is pushed each time a new dynamic string is interned, and `is_latin1`
/// looks the flag up by the symbol's dynamic index (the symbol value minus one,
/// minus the number of common strings).
latin1_flags: Vec<bool>,
}

impl Interner {
Expand Down Expand Up @@ -288,6 +292,7 @@ impl Interner {
Self {
utf8_interner: RawInterner::with_capacity(capacity),
utf16_interner: RawInterner::with_capacity(capacity),
latin1_flags: Vec::with_capacity(capacity),
}
}

Expand Down Expand Up @@ -410,6 +415,8 @@ impl Interner {

assert_eq!(index, utf16_index);

self.latin1_flags.push(utf16.iter().all(|&c| c <= 0xFF));

index
.checked_add(1 + COMMON_STRINGS_UTF8.len())
.and_then(Sym::new)
Expand Down Expand Up @@ -453,6 +460,8 @@ impl Interner {

debug_assert_eq!(index, utf16_index);

self.latin1_flags.push(utf16.iter().all(|&c| c <= 0xFF));

index
.checked_add(1 + COMMON_STRINGS_UTF8.len())
.and_then(Sym::new)
Expand Down Expand Up @@ -538,6 +547,39 @@ impl Interner {
self.resolve(symbol).expect("string disappeared")
}

/// Reports whether the string behind `symbol` fits in Latin1, meaning every one of
/// its code units lies in `0x00..=0xFF`.
///
/// The answer is a cached flag recorded at the moment the string was interned, so
/// this call never re-scans the string.
///
/// Symbols that fall inside the common-strings table are reported as Latin1 without
/// any lookup. A symbol with no recorded flag yields `false`, which makes callers
/// fall back to the UTF-16 representation.
///
/// # Examples
///
/// ```
/// use boa_interner::Interner;
///
/// let mut interner = Interner::new();
/// let ascii = interner.get_or_intern("hello");
/// assert!(interner.is_latin1(ascii));
///
/// let non_latin1: Vec<u16> = vec![0x4e2d, 0x6587]; // "中文"
/// let sym = interner.get_or_intern(non_latin1.as_slice());
/// assert!(!interner.is_latin1(sym));
/// ```
#[inline]
#[must_use]
pub fn is_latin1(&self, symbol: Sym) -> bool {
    // Symbol values are offset by one relative to the zero-based string index.
    let zero_based = symbol.get() - 1;

    // `checked_sub` is `None` exactly when the index is inside the common table.
    match zero_based.checked_sub(COMMON_STRINGS_UTF8.len()) {
        None => true,
        Some(slot) => self.latin1_flags.get(slot).copied().unwrap_or(false),
    }
}

fn get_common(string: JStrRef<'_>) -> Option<Sym> {
match string {
JStrRef::Utf8(s) => COMMON_STRINGS_UTF8.get_index(s).map(|idx| {
Expand Down
37 changes: 37 additions & 0 deletions core/interner/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,40 @@ fn check_capacity() {

assert!(interner.resolve(sym).is_none());
}

#[test]
fn check_is_latin1() {
    static STATIC_STR: &str = "static_latin1";
    static STATIC_UTF16: &[u16] = &[
        's' as u16, 't' as u16, 'a' as u16, 't' as u16, 'i' as u16, 'c' as u16, '_' as u16,
        'l' as u16, 'a' as u16, 't' as u16, 'i' as u16, 'n' as u16, '1' as u16,
    ];

    let mut interner = Interner::default();

    // A keyword is expected to come from the common-strings table, which is
    // always reported as Latin1.
    let keyword = interner.get_or_intern("break");
    assert!(interner.is_latin1(keyword));

    // A plain ASCII string interned dynamically.
    let ascii = interner.get_or_intern("hello_world");
    assert!(interner.is_latin1(ascii));

    // Dynamically interned UTF-16 inputs paired with the expected answer, in order:
    // non-ASCII Latin1 (U+0080..=U+00FF), non-Latin1, then both sides of the
    // U+00FF / U+0100 boundary.
    let utf16_cases: [(&[u16], bool); 4] = [
        (&[0x00E9, 0x00FC, 0x00F1], true),
        (&[0x4E2D, 0x6587], false),
        (&[0x00FF], true),
        (&[0x0100], false),
    ];
    for (units, expected) in utf16_cases {
        let sym = interner.get_or_intern(units);
        assert_eq!(interner.is_latin1(sym), expected, "code units: {units:X?}");
    }

    // The static interning path must record the flag as well.
    let from_static = interner.get_or_intern_static(STATIC_STR, STATIC_UTF16);
    assert!(interner.is_latin1(from_static));
}
8 changes: 7 additions & 1 deletion core/string/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -824,12 +824,18 @@ impl From<&[u16]> for JsString {
impl From<&str> for JsString {
/// Builds a `JsString` from a UTF-8 `&str`, choosing the narrowest encoding.
///
/// * ASCII input is viewed directly as Latin1 bytes.
/// * Non-ASCII input whose chars all lie in `U+0080..=U+00FF` is narrowed to one
///   byte per char and stored as Latin1.
/// * Anything else is re-encoded as UTF-16.
///
/// Latin1 results are first looked up among the static strings; only a miss
/// allocates a new string.
#[inline]
fn from(s: &str) -> Self {
    // Shared tail for both Latin1 paths: reuse a static string when one matches.
    let from_latin1 = |bytes: &[u8]| {
        let js_str = JsStr::latin1(bytes);
        StaticJsStrings::get_string(&js_str)
            .unwrap_or_else(|| JsString::from_slice_skip_interning(js_str))
    };

    if s.is_ascii() {
        return from_latin1(s.as_bytes());
    }

    // Non-ASCII but still Latin1-encodable: each char maps 1-to-1 onto a `u8`.
    let fits_latin1 = !s.chars().any(|c| c as u32 > 0xFF);
    if fits_latin1 {
        let narrowed: Vec<u8> = s.chars().map(|c| c as u8).collect();
        return from_latin1(&narrowed);
    }

    let units: Vec<u16> = s.encode_utf16().collect();
    JsString::from_slice_skip_interning(JsStr::utf16(&units[..]))
}
Expand Down
Loading