Skip to content

Commit 699bf56

Browse files
anonrig, erikcorry, lemire
committed
util: improve textencoder encodeInto performance
Co-authored-by: Erik Corry <[email protected]> Co-authored-by: Daniel Lemire <[email protected]>
1 parent 340e619 commit 699bf56

File tree

1 file changed

+142
-8
lines changed

1 file changed

+142
-8
lines changed

src/encoding_binding.cc

Lines changed: 142 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,88 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
7171
return info;
7272
}
7373

74+
// The following code is adapted from Cloudflare workers.
75+
// Particularly from: https://github.com/cloudflare/workerd/pull/5448
76+
//
77+
// Copyright (c) 2017-2025 Cloudflare, Inc.
78+
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
79+
// https://opensource.org/licenses/Apache-2.0
80+
namespace {
81+
// Reports whether two consecutive UTF-16 code units form a surrogate pair:
// `lead` must lie in the high-surrogate range [0xD800, 0xDBFF] and `trail`
// in the low-surrogate range [0xDC00, 0xDFFF].
constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
  // The top six bits of a code unit identify which surrogate half it is.
  constexpr uint16_t kSurrogateMask = 0xfc00;
  constexpr uint16_t kLeadTag = 0xd800;
  constexpr uint16_t kTrailTag = 0xdc00;

  const bool leadIsHigh = (lead & kSurrogateMask) == kLeadTag;
  const bool trailIsLow = (trail & kSurrogateMask) == kTrailTag;
  return leadIsHigh && trailIsLow;
}
84+
85+
// Returns the number of UTF-8 bytes needed to encode the single UTF-16 code
// unit `c` taken on its own:
//   0x0000..0x007F -> 1 byte
//   0x0080..0x07FF -> 2 bytes
//   0x0800..0xFFFF -> 3 bytes
// Surrogate code units fall in the last range and therefore report 3; a
// complete surrogate pair actually encodes to 4 bytes in total, so summing
// this value over both halves of a pair over-estimates by 2.
constexpr size_t simpleUtfEncodingLength(uint16_t c) {
  if (c < 0x80) return 1;
  // The two-byte UTF-8 form covers every code point up to and including
  // U+07FF, so the boundary is 0x800.
  if (c < 0x800) return 2;
  return 3;
}
90+
91+
template <typename Char>
92+
size_t findBestFit(const Char* data, size_t length, size_t bufferSize) {
93+
size_t pos = 0;
94+
size_t utf8Accumulated = 0;
95+
constexpr size_t CHUNK = 257;
96+
constexpr bool UTF16 = sizeof(Char) == 2;
97+
constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2;
98+
99+
double expansion = 1.15;
100+
101+
while (pos < length && utf8Accumulated < bufferSize) {
102+
size_t remainingInput = length - pos;
103+
size_t spaceRemaining = bufferSize - utf8Accumulated;
104+
DCHECK_GE(expansion, 1.15);
105+
106+
size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
107+
if (guaranteedToFit >= remainingInput) {
108+
return length;
109+
}
110+
size_t likelyToFit =
111+
std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK);
112+
size_t fitEstimate =
113+
std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit));
114+
size_t chunkSize = std::min(remainingInput, fitEstimate);
115+
if (chunkSize == 1) break;
116+
DCHECK_GE(chunkSize, 1);
117+
118+
size_t chunkUtf8Len;
119+
if constexpr (UTF16) {
120+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
121+
// available For now, validate and use utf8_length_from_utf16
122+
chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize);
123+
} else {
124+
chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize);
125+
}
126+
127+
if (utf8Accumulated + chunkUtf8Len > bufferSize) {
128+
DCHECK_GT(chunkSize, guaranteedToFit);
129+
expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize);
130+
} else {
131+
expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize);
132+
pos += chunkSize;
133+
utf8Accumulated += chunkUtf8Len;
134+
}
135+
}
136+
137+
while (pos < length && utf8Accumulated < bufferSize) {
138+
size_t extra = simpleUtfEncodingLength(data[pos]);
139+
if (utf8Accumulated + extra > bufferSize) break;
140+
pos++;
141+
utf8Accumulated += extra;
142+
}
143+
144+
if (UTF16 && pos != 0 && pos != length &&
145+
isSurrogatePair(data[pos - 1], data[pos])) {
146+
if (utf8Accumulated < bufferSize) {
147+
pos++;
148+
} else {
149+
pos--;
150+
}
151+
}
152+
return pos;
153+
}
154+
} // namespace
155+
74156
void BindingData::Deserialize(Local<Context> context,
75157
Local<Object> holder,
76158
int index,
@@ -101,15 +183,67 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
101183
char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset();
102184
size_t dest_length = dest->ByteLength();
103185

104-
size_t nchars;
105-
size_t written = source->WriteUtf8V2(isolate,
106-
write_result,
107-
dest_length,
108-
String::WriteFlags::kReplaceInvalidUtf8,
109-
&nchars);
186+
size_t read = 0;
187+
size_t written = 0;
188+
v8::String::ValueView view(isolate, source);
189+
uint32_t length = view.length();
190+
191+
if (view.is_one_byte()) {
192+
auto data = reinterpret_cast<const char*>(view.data8());
193+
simdutf::result result = simdutf::validate_ascii_with_errors(
194+
data, std::min(static_cast<size_t>(length), dest_length));
195+
written = read = result.count;
196+
auto out_addr = write_result;
197+
memcpy(out_addr, data, read);
198+
out_addr += read;
199+
data += read;
200+
length -= read;
201+
dest_length -= read;
202+
if (length != 0 && dest_length != 0) {
203+
size_t rest = findBestFit(data, length, dest_length);
204+
if (rest != 0) {
205+
DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length);
206+
written += simdutf::convert_latin1_to_utf8(data, rest, out_addr);
207+
read += rest;
208+
}
209+
}
210+
} else {
211+
auto data = reinterpret_cast<const char16_t*>(view.data16());
212+
213+
// Check if input has unpaired surrogates - if so, convert to well-formed
214+
// first
215+
simdutf::result validation_result =
216+
simdutf::validate_utf16_with_errors(data, length);
217+
218+
if (validation_result.error == simdutf::SUCCESS) {
219+
// Valid UTF-16 - use the fast path
220+
read = findBestFit(data, length, dest_length);
221+
if (read != 0) {
222+
DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length);
223+
written = simdutf::convert_utf16_to_utf8(data, read, write_result);
224+
}
225+
} else {
226+
// Invalid UTF-16 with unpaired surrogates - convert to well-formed first
227+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
228+
// available
229+
std::vector<char16_t> conversion_buffer(length);
230+
simdutf::to_well_formed_utf16(data, length, conversion_buffer.data());
231+
232+
// Now use findBestFit with the well-formed data
233+
read = findBestFit(conversion_buffer.data(), length, dest_length);
234+
if (read != 0) {
235+
DCHECK_LE(
236+
simdutf::utf8_length_from_utf16(conversion_buffer.data(), read),
237+
dest_length);
238+
written = simdutf::convert_utf16_to_utf8(
239+
conversion_buffer.data(), read, write_result);
240+
}
241+
}
242+
}
243+
DCHECK_LE(written, dest_length);
110244

111-
binding_data->encode_into_results_buffer_[0] = nchars;
112-
binding_data->encode_into_results_buffer_[1] = written;
245+
binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
246+
binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
113247
}
114248

115249
// Encode a single string to a UTF-8 Uint8Array (not Buffer).

0 commit comments

Comments
 (0)