Skip to content

Commit d6ac366

Browse files
anonrig, erikcorry, lemire
committed
util: improve textencoder encodeInto performance
Co-authored-by: Erik Corry <[email protected]> Co-authored-by: Daniel Lemire <[email protected]>
1 parent 340e619 commit d6ac366

File tree

1 file changed

+138
-8
lines changed

1 file changed

+138
-8
lines changed

src/encoding_binding.cc

Lines changed: 138 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,84 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
7171
return info;
7272
}
7373

74+
namespace {
75+
// NOTE(review): the name suggests a size cutoff below which temporaries are
// kept on the stack, but nothing in the visible part of this change reads
// this constant - confirm it is used elsewhere in the file or remove it.
constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4096;
76+
77+
// Tells whether two consecutive UTF-16 code units form a surrogate pair:
// `lead` must lie in 0xD800..0xDBFF and `trail` in 0xDC00..0xDFFF.
constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
  // The top six bits identify which surrogate half a code unit belongs to.
  constexpr uint16_t kSurrogateMask = 0xfc00;
  const bool leadIsHigh = (lead & kSurrogateMask) == 0xd800;
  const bool trailIsLow = (trail & kSurrogateMask) == 0xdc00;
  return leadIsHigh && trailIsLow;
}
80+
81+
// Number of UTF-8 bytes needed for a single UTF-16 code unit looked at in
// isolation: 1 byte below U+0080, 2 bytes below U+0800, 3 bytes otherwise.
// "Simple" because surrogate halves are not paired up here: each half is
// costed at 3 bytes, so a pair is over-estimated as 6 bytes instead of 4.
constexpr size_t simpleUtfEncodingLength(uint16_t c) {
  if (c < 0x80) return 1;
  // Two-byte UTF-8 sequences cover U+0080..U+07FF. The boundary is 0x800;
  // using 0x400 would charge U+0400..U+07FF one byte too many.
  if (c < 0x800) return 2;
  return 3;
}
86+
87+
template <typename Char>
88+
size_t findBestFit(const Char* data, size_t length, size_t bufferSize) {
89+
size_t pos = 0;
90+
size_t utf8Accumulated = 0;
91+
constexpr size_t CHUNK = 257;
92+
constexpr bool UTF16 = sizeof(Char) == 2;
93+
constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2;
94+
95+
double expansion = 1.15;
96+
97+
while (pos < length && utf8Accumulated < bufferSize) {
98+
size_t remainingInput = length - pos;
99+
size_t spaceRemaining = bufferSize - utf8Accumulated;
100+
DCHECK_GE(expansion, 1.15);
101+
102+
size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
103+
if (guaranteedToFit >= remainingInput) {
104+
return length;
105+
}
106+
size_t likelyToFit =
107+
std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK);
108+
size_t fitEstimate =
109+
std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit));
110+
size_t chunkSize = std::min(remainingInput, fitEstimate);
111+
if (chunkSize == 1) break;
112+
DCHECK_GE(chunkSize, 1);
113+
114+
size_t chunkUtf8Len;
115+
if constexpr (UTF16) {
116+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
117+
// available For now, validate and use utf8_length_from_utf16
118+
chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize);
119+
} else {
120+
chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize);
121+
}
122+
123+
if (utf8Accumulated + chunkUtf8Len > bufferSize) {
124+
DCHECK_GT(chunkSize, guaranteedToFit);
125+
expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize);
126+
} else {
127+
expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize);
128+
pos += chunkSize;
129+
utf8Accumulated += chunkUtf8Len;
130+
}
131+
}
132+
133+
while (pos < length && utf8Accumulated < bufferSize) {
134+
size_t extra = simpleUtfEncodingLength(data[pos]);
135+
if (utf8Accumulated + extra > bufferSize) break;
136+
pos++;
137+
utf8Accumulated += extra;
138+
}
139+
140+
if (UTF16 && pos != 0 && pos != length &&
141+
isSurrogatePair(data[pos - 1], data[pos])) {
142+
if (utf8Accumulated < bufferSize) {
143+
pos++;
144+
} else {
145+
pos--;
146+
}
147+
}
148+
return pos;
149+
}
150+
} // namespace
151+
74152
void BindingData::Deserialize(Local<Context> context,
75153
Local<Object> holder,
76154
int index,
@@ -101,15 +179,67 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
101179
char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset();
102180
size_t dest_length = dest->ByteLength();
103181

104-
size_t nchars;
105-
size_t written = source->WriteUtf8V2(isolate,
106-
write_result,
107-
dest_length,
108-
String::WriteFlags::kReplaceInvalidUtf8,
109-
&nchars);
182+
size_t read = 0;
183+
size_t written = 0;
184+
v8::String::ValueView view(isolate, source);
185+
uint32_t length = view.length();
186+
187+
if (view.is_one_byte()) {
188+
auto data = reinterpret_cast<const char*>(view.data8());
189+
simdutf::result result = simdutf::validate_ascii_with_errors(
190+
data, std::min(static_cast<size_t>(length), dest_length));
191+
written = read = result.count;
192+
auto out_addr = write_result;
193+
memcpy(out_addr, data, read);
194+
out_addr += read;
195+
data += read;
196+
length -= read;
197+
dest_length -= read;
198+
if (length != 0 && dest_length != 0) {
199+
size_t rest = findBestFit(data, length, dest_length);
200+
if (rest != 0) {
201+
DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length);
202+
written += simdutf::convert_latin1_to_utf8(data, rest, out_addr);
203+
read += rest;
204+
}
205+
}
206+
} else {
207+
auto data = reinterpret_cast<const char16_t*>(view.data16());
208+
209+
// Check if input has unpaired surrogates - if so, convert to well-formed
210+
// first
211+
simdutf::result validation_result =
212+
simdutf::validate_utf16_with_errors(data, length);
213+
214+
if (validation_result.error == simdutf::SUCCESS) {
215+
// Valid UTF-16 - use the fast path
216+
read = findBestFit(data, length, dest_length);
217+
if (read != 0) {
218+
DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length);
219+
written = simdutf::convert_utf16_to_utf8(data, read, write_result);
220+
}
221+
} else {
222+
// Invalid UTF-16 with unpaired surrogates - convert to well-formed first
223+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
224+
// available
225+
std::vector<char16_t> conversion_buffer(length);
226+
simdutf::to_well_formed_utf16(data, length, conversion_buffer.data());
227+
228+
// Now use findBestFit with the well-formed data
229+
read = findBestFit(conversion_buffer.data(), length, dest_length);
230+
if (read != 0) {
231+
DCHECK_LE(
232+
simdutf::utf8_length_from_utf16(conversion_buffer.data(), read),
233+
dest_length);
234+
written = simdutf::convert_utf16_to_utf8(
235+
conversion_buffer.data(), read, write_result);
236+
}
237+
}
238+
}
239+
DCHECK_LE(written, dest_length);
110240

111-
binding_data->encode_into_results_buffer_[0] = nchars;
112-
binding_data->encode_into_results_buffer_[1] = written;
241+
binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
242+
binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
113243
}
114244

115245
// Encode a single string to a UTF-8 Uint8Array (not Buffer).

0 commit comments

Comments
 (0)