Skip to content

Commit cba4cd1

Browse files
GearsDatapacks lpil
authored and committed
Support UTF-16 and UTF-32 in decision tree generation
1 parent 3016aaa commit cba4cd1

File tree

3 files changed

+120
-34
lines changed

3 files changed

+120
-34
lines changed

compiler-core/src/exhaustiveness.rs

Lines changed: 62 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,11 @@ mod missing_patterns;
8585
pub mod printer;
8686

8787
use crate::{
88-
ast::{self, AssignName, Endianness, TypedClause, TypedPattern, TypedPatternBitArraySegment},
89-
strings::convert_string_escape_chars,
88+
ast::{
89+
self, AssignName, BitArrayOption, Endianness, TypedClause, TypedPattern,
90+
TypedPatternBitArraySegment,
91+
},
92+
strings::{convert_string_escape_chars, length_utf16, length_utf32},
9093
type_::{
9194
Environment, Type, TypeValueConstructor, TypeValueConstructorField, TypeVar,
9295
TypeVariantConstructors, collapse_links, error::UnreachablePatternReason,
@@ -424,7 +427,9 @@ impl Body {
424427
let value = match value {
425428
BitArrayMatchedValue::LiteralFloat(value) => BoundValue::LiteralFloat(value.clone()),
426429
BitArrayMatchedValue::LiteralInt(value) => BoundValue::LiteralInt(value.clone()),
427-
BitArrayMatchedValue::LiteralString(value) => BoundValue::LiteralString(value.clone()),
430+
BitArrayMatchedValue::LiteralString { value, .. } => {
431+
BoundValue::LiteralString(value.clone())
432+
}
428433
BitArrayMatchedValue::Variable(_)
429434
| BitArrayMatchedValue::Discard(_)
430435
| BitArrayMatchedValue::Assign { .. } => {
@@ -1003,7 +1008,10 @@ pub struct MatchTest {
10031008
pub enum BitArrayMatchedValue {
10041009
LiteralFloat(EcoString),
10051010
LiteralInt(BigInt),
1006-
LiteralString(EcoString),
1011+
LiteralString {
1012+
value: EcoString,
1013+
encoding: StringEncoding,
1014+
},
10071015
Variable(EcoString),
10081016
Discard(EcoString),
10091017
Assign {
@@ -1012,13 +1020,20 @@ pub enum BitArrayMatchedValue {
10121020
},
10131021
}
10141022

1023+
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
1024+
pub enum StringEncoding {
1025+
Utf8,
1026+
Utf16,
1027+
Utf32,
1028+
}
1029+
10151030
impl BitArrayMatchedValue {
10161031
pub fn is_discard(&self) -> bool {
10171032
match self {
10181033
BitArrayMatchedValue::Discard(_) => true,
10191034
BitArrayMatchedValue::LiteralFloat(_)
10201035
| BitArrayMatchedValue::LiteralInt(_)
1021-
| BitArrayMatchedValue::LiteralString(_)
1036+
| BitArrayMatchedValue::LiteralString { .. }
10221037
| BitArrayMatchedValue::Variable(_)
10231038
| BitArrayMatchedValue::Assign { .. } => false,
10241039
}
@@ -2816,7 +2831,7 @@ impl CaseToCompile {
28162831

28172832
// Each segment is also turned into a match test, checking the
28182833
// selected bits match with the pattern's value.
2819-
let value = segment_matched_value(&segment.value);
2834+
let value = segment_matched_value(&segment.value, &segment.options);
28202835

28212836
let type_ = match &segment.type_ {
28222837
type_ if type_.is_int() => ReadType::Int,
@@ -2841,7 +2856,7 @@ impl CaseToCompile {
28412856
match &value {
28422857
BitArrayMatchedValue::LiteralFloat(_)
28432858
| BitArrayMatchedValue::LiteralInt(_)
2844-
| BitArrayMatchedValue::LiteralString(_)
2859+
| BitArrayMatchedValue::LiteralString { .. }
28452860
| BitArrayMatchedValue::Discard(_) => {}
28462861
BitArrayMatchedValue::Variable(name)
28472862
| BitArrayMatchedValue::Assign { name, .. } => {
@@ -2857,16 +2872,42 @@ impl CaseToCompile {
28572872
}
28582873
}
28592874

2860-
fn segment_matched_value(pattern: &TypedPattern) -> BitArrayMatchedValue {
2875+
fn segment_matched_value(
2876+
pattern: &TypedPattern,
2877+
options: &[BitArrayOption<TypedPattern>],
2878+
) -> BitArrayMatchedValue {
28612879
match pattern {
28622880
ast::Pattern::Int { int_value, .. } => BitArrayMatchedValue::LiteralInt(int_value.clone()),
28632881
ast::Pattern::Float { value, .. } => BitArrayMatchedValue::LiteralFloat(value.clone()),
2864-
ast::Pattern::String { value, .. } => BitArrayMatchedValue::LiteralString(value.clone()),
2882+
ast::Pattern::String { value, .. }
2883+
if options
2884+
.iter()
2885+
.any(|x| matches!(x, BitArrayOption::Utf16 { .. })) =>
2886+
{
2887+
BitArrayMatchedValue::LiteralString {
2888+
value: value.clone(),
2889+
encoding: StringEncoding::Utf16,
2890+
}
2891+
}
2892+
ast::Pattern::String { value, .. }
2893+
if options
2894+
.iter()
2895+
.any(|x| matches!(x, BitArrayOption::Utf32 { .. })) =>
2896+
{
2897+
BitArrayMatchedValue::LiteralString {
2898+
value: value.clone(),
2899+
encoding: StringEncoding::Utf32,
2900+
}
2901+
}
2902+
ast::Pattern::String { value, .. } => BitArrayMatchedValue::LiteralString {
2903+
value: value.clone(),
2904+
encoding: StringEncoding::Utf8,
2905+
},
28652906
ast::Pattern::Variable { name, .. } => BitArrayMatchedValue::Variable(name.clone()),
28662907
ast::Pattern::Discard { name, .. } => BitArrayMatchedValue::Discard(name.clone()),
28672908
ast::Pattern::Assign { name, pattern, .. } => BitArrayMatchedValue::Assign {
28682909
name: name.clone(),
2869-
value: Box::new(segment_matched_value(pattern)),
2910+
value: Box::new(segment_matched_value(pattern, options)),
28702911
},
28712912
x => panic!("unexpected segment value pattern {:?}", x),
28722913
}
@@ -2906,10 +2947,17 @@ fn segment_size(
29062947
// segments, and 64 for anything else.
29072948
None if segment.type_.is_int() => ReadSize::ConstantBits(8.into()),
29082949
None => match segment.value.as_ref() {
2909-
ast::Pattern::String { .. }
2910-
if segment.has_utf16_option() || segment.has_utf32_option() =>
2911-
{
2912-
panic!("non utf8 string in bit array pattern on js target")
2950+
ast::Pattern::String { value, .. } if segment.has_utf16_option() => {
2951+
ReadSize::ConstantBits(
2952+
// Each utf16 code unit is 16 bits
2953+
length_utf16(&convert_string_escape_chars(value)) * BigInt::from(16),
2954+
)
2955+
}
2956+
ast::Pattern::String { value, .. } if segment.has_utf32_option() => {
2957+
// Each utf32 codepoint is 32 bits
2958+
ReadSize::ConstantBits(
2959+
length_utf32(&convert_string_escape_chars(value)) * BigInt::from(32),
2960+
)
29132961
}
29142962
// If the segment is a literal string then it has an automatic size
29152963
// given by its number of bytes.

compiler-core/src/javascript/decision.rs

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,15 @@ use crate::{
88
exhaustiveness::{
99
BitArrayMatchedValue, BitArrayTest, Body, BoundValue, CompiledCase, Decision,
1010
FallbackCheck, MatchTest, Offset, ReadAction, ReadSize, ReadType, RuntimeCheck,
11-
SizeOperator, SizeTest, Variable, VariableUsage,
11+
SizeOperator, SizeTest, StringEncoding, Variable, VariableUsage,
1212
},
1313
format::break_block,
1414
javascript::{
1515
expression::{eco_string_int, string},
1616
maybe_escape_property,
1717
},
1818
pretty::{Document, Documentable, break_, join, line, nil},
19-
strings::convert_string_escape_chars,
19+
strings::{convert_string_escape_chars, string_to_utf16_bytes, string_to_utf32_bytes},
2020
};
2121
use ecow::{EcoString, eco_format};
2222
use itertools::Itertools;
@@ -902,8 +902,16 @@ impl<'generator, 'module, 'a> Variables<'generator, 'module, 'a> {
902902
value: expected,
903903
read_action,
904904
}) => match expected {
905-
BitArrayMatchedValue::LiteralString(expected) => self
906-
.literal_string_segment_bytes_check(value, expected, read_action, negation),
905+
BitArrayMatchedValue::LiteralString {
906+
value: expected,
907+
encoding,
908+
} => self.literal_string_segment_bytes_check(
909+
value,
910+
expected,
911+
read_action,
912+
negation,
913+
*encoding,
914+
),
907915
BitArrayMatchedValue::LiteralFloat(expected) => self
908916
.literal_float_segment_bytes_check(value, expected, read_action, negation),
909917
BitArrayMatchedValue::LiteralInt(expected) => self
@@ -1213,6 +1221,7 @@ impl<'generator, 'module, 'a> Variables<'generator, 'module, 'a> {
12131221
literal_string: &EcoString,
12141222
read_action: &ReadAction,
12151223
check_negation: CheckNegation,
1224+
encoding: StringEncoding,
12161225
) -> Document<'a> {
12171226
let ReadAction {
12181227
from: start,
@@ -1228,19 +1237,36 @@ impl<'generator, 'module, 'a> Variables<'generator, 'module, 'a> {
12281237
" === "
12291238
};
12301239

1240+
let escaped = convert_string_escape_chars(literal_string);
1241+
// We need to have this vector here so that we don't run into lifetime
1242+
// issues when calling `.as_slice` on the local vectors created when this
1243+
// isn't a UTF-8 string.
1244+
let mut _bytes_vec = Vec::new();
1245+
let bytes = match encoding {
1246+
StringEncoding::Utf8 => escaped.as_bytes(),
1247+
StringEncoding::Utf16 => {
1248+
_bytes_vec = string_to_utf16_bytes(&escaped, read_action.endianness);
1249+
_bytes_vec.as_slice()
1250+
}
1251+
StringEncoding::Utf32 => {
1252+
_bytes_vec = string_to_utf32_bytes(&escaped, read_action.endianness);
1253+
_bytes_vec.as_slice()
1254+
}
1255+
};
1256+
12311257
if let Some(mut from_byte) = start.constant_bytes() {
12321258
// If the string starts at a compile-time known byte, then we can
12331259
// optimise this by reading all the subsequent bytes and checking
12341260
// they have a specific value.
1235-
for byte in convert_string_escape_chars(literal_string).as_bytes() {
1261+
for byte in bytes {
12361262
let byte_access = docvec![bit_array.clone(), ".byteAt(", from_byte.clone(), ")"];
12371263
checks.push(docvec![byte_access, equality, byte]);
12381264
from_byte += 1;
12391265
}
12401266
} else {
12411267
// If the string doesn't start at a byte aligned offset then we'll
12421268
// have to take slices out of it to check that each byte matches.
1243-
for byte in convert_string_escape_chars(literal_string).as_bytes() {
1269+
for byte in bytes {
12441270
let end = self.offset_to_doc(&start.add_constant(8), false);
12451271
let from = self.offset_to_doc(start, false);
12461272
let byte_access =

compiler-core/src/strings.rs

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -117,23 +117,18 @@ pub fn to_upper_camel_case(string: &str) -> EcoString {
117117
pub fn string_to_utf16_bytes(string: &str, endianness: Endianness) -> Vec<u8> {
118118
let mut bytes = Vec::with_capacity(string.len() * 2);
119119

120+
let mut character_buffer = [0, 0];
120121
for character in string.chars() {
121-
let mut character_buffer = [0, 0];
122+
let segments = character.encode_utf16(&mut character_buffer);
122123

123-
_ = character.encode_utf16(&mut character_buffer);
124-
125-
let first_two_bytes = match endianness {
126-
Endianness::Big => character_buffer[0].to_be_bytes(),
127-
Endianness::Little => character_buffer[0].to_le_bytes(),
128-
};
129-
bytes.extend(first_two_bytes);
130-
131-
if character_buffer[1] != 0 {
132-
let next_two_bytes = match endianness {
133-
Endianness::Big => character_buffer[1].to_be_bytes(),
134-
Endianness::Little => character_buffer[1].to_le_bytes(),
124+
for segment in segments {
125+
let segment_bytes = match endianness {
126+
Endianness::Big => segment.to_be_bytes(),
127+
Endianness::Little => segment.to_le_bytes(),
135128
};
136-
bytes.extend(next_two_bytes);
129+
130+
bytes.push(segment_bytes[0]);
131+
bytes.push(segment_bytes[1]);
137132
}
138133
}
139134

@@ -154,3 +149,20 @@ pub fn string_to_utf32_bytes(string: &str, endianness: Endianness) -> Vec<u8> {
154149

155150
bytes
156151
}
152+
153+
/// Gets the number of UTF-16 code units it would take to encode a given string.
154+
pub fn length_utf16(string: &str) -> usize {
155+
let mut length = 0;
156+
157+
for char in string.chars() {
158+
length += char.len_utf16()
159+
}
160+
161+
length
162+
}
163+
164+
/// Gets the number of UTF-32 codepoints in a string (also known as the number of
165+
/// characters).
166+
pub fn length_utf32(string: &str) -> usize {
167+
string.chars().count()
168+
}

0 commit comments

Comments
 (0)