Skip to content

Commit f993411

Browse files
anonrigerikcorrylemire
committed
util: improve textencoder encodeInto performance
Co-authored-by: Erik Corry <[email protected]> Co-authored-by: Daniel Lemire <[email protected]>
1 parent 340e619 commit f993411

File tree

1 file changed

+164
-8
lines changed

1 file changed

+164
-8
lines changed

src/encoding_binding.cc

Lines changed: 164 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
#include "node_external_reference.h"
77
#include "simdutf.h"
88
#include "string_bytes.h"
9+
#include "util.h"
910
#include "v8.h"
1011

12+
#include <algorithm>
1113
#include <cstdint>
1214

1315
namespace node {
@@ -71,6 +73,90 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
7173
return info;
7274
}
7375

76+
// The following code is adapted from Cloudflare workers.
77+
// Particularly from: https://github.com/cloudflare/workerd/pull/5448
78+
//
79+
// Copyright (c) 2017-2025 Cloudflare, Inc.
80+
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
81+
// https://opensource.org/licenses/Apache-2.0
82+
namespace {
83+
// Size template argument for the MaybeStackBuffer<char16_t, ...> that
// EncodeInto uses as scratch space when it has to repair ill-formed UTF-16.
// NOTE(review): presumably the number of char16_t elements kept in inline
// storage before the buffer falls back to the heap - confirm against
// MaybeStackBuffer in util.h.
constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4096;
84+
85+
// Reports whether |lead| followed by |trail| forms a UTF-16 surrogate pair:
// |lead| must be a high surrogate (0xD800..0xDBFF) and |trail| a low
// surrogate (0xDC00..0xDFFF).
constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
  const bool leadIsHigh = lead >= 0xd800 && lead <= 0xdbff;
  const bool trailIsLow = trail >= 0xdc00 && trail <= 0xdfff;
  return leadIsHigh && trailIsLow;
}
88+
89+
// Returns the number of UTF-8 bytes needed to encode the single code unit |c|
// taken on its own:
//   U+0000..U+007F -> 1 byte
//   U+0080..U+07FF -> 2 bytes
//   anything else  -> 3 bytes
// Surrogate halves land in the 3-byte bucket. A complete surrogate pair
// actually encodes to 4 bytes, so callers that walk UTF-16 one code unit at a
// time must account for pairs themselves (findBestFit does so after its
// final loop).
constexpr size_t simpleUtfEncodingLength(uint16_t c) {
  if (c < 0x80) return 1;
  // The two-byte UTF-8 form covers 11 bits of payload, i.e. up to U+07FF.
  if (c < 0x800) return 2;
  return 3;
}
94+
95+
template <typename Char>
96+
size_t findBestFit(const Char* data, size_t length, size_t bufferSize) {
97+
size_t pos = 0;
98+
size_t utf8Accumulated = 0;
99+
constexpr size_t CHUNK = 257;
100+
constexpr bool UTF16 = sizeof(Char) == 2;
101+
constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2;
102+
103+
double expansion = 1.15;
104+
105+
while (pos < length && utf8Accumulated < bufferSize) {
106+
size_t remainingInput = length - pos;
107+
size_t spaceRemaining = bufferSize - utf8Accumulated;
108+
DCHECK_GE(expansion, 1.15);
109+
110+
size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
111+
if (guaranteedToFit >= remainingInput) {
112+
return length;
113+
}
114+
size_t likelyToFit =
115+
std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK);
116+
size_t fitEstimate =
117+
std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit));
118+
size_t chunkSize = std::min(remainingInput, fitEstimate);
119+
if (chunkSize == 1) break;
120+
DCHECK_GE(chunkSize, 1);
121+
122+
size_t chunkUtf8Len;
123+
if constexpr (UTF16) {
124+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
125+
// available For now, validate and use utf8_length_from_utf16
126+
chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize);
127+
} else {
128+
chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize);
129+
}
130+
131+
if (utf8Accumulated + chunkUtf8Len > bufferSize) {
132+
DCHECK_GT(chunkSize, guaranteedToFit);
133+
expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize);
134+
} else {
135+
expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize);
136+
pos += chunkSize;
137+
utf8Accumulated += chunkUtf8Len;
138+
}
139+
}
140+
141+
while (pos < length && utf8Accumulated < bufferSize) {
142+
size_t extra = simpleUtfEncodingLength(data[pos]);
143+
if (utf8Accumulated + extra > bufferSize) break;
144+
pos++;
145+
utf8Accumulated += extra;
146+
}
147+
148+
if (UTF16 && pos != 0 && pos != length &&
149+
isSurrogatePair(data[pos - 1], data[pos])) {
150+
if (utf8Accumulated < bufferSize) {
151+
pos++;
152+
} else {
153+
pos--;
154+
}
155+
}
156+
return pos;
157+
}
158+
} // namespace
159+
74160
void BindingData::Deserialize(Local<Context> context,
75161
Local<Object> holder,
76162
int index,
@@ -98,18 +184,88 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
98184

99185
Local<Uint8Array> dest = args[1].As<Uint8Array>();
100186
Local<ArrayBuffer> buf = dest->Buffer();
187+
188+
// Handle detached buffers - return {read: 0, written: 0}
189+
if (buf->Data() == nullptr) {
190+
binding_data->encode_into_results_buffer_[0] = 0;
191+
binding_data->encode_into_results_buffer_[1] = 0;
192+
return;
193+
}
194+
101195
char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset();
102196
size_t dest_length = dest->ByteLength();
103197

104-
size_t nchars;
105-
size_t written = source->WriteUtf8V2(isolate,
106-
write_result,
107-
dest_length,
108-
String::WriteFlags::kReplaceInvalidUtf8,
109-
&nchars);
198+
size_t read = 0;
199+
size_t written = 0;
200+
v8::String::ValueView view(isolate, source);
201+
size_t length = view.length();
202+
203+
if (view.is_one_byte()) {
204+
auto data = reinterpret_cast<const char*>(view.data8());
205+
simdutf::result result = simdutf::validate_ascii_with_errors(data, length);
206+
// Only copy what fits in the destination
207+
written = read = std::min(result.count, dest_length);
208+
if (read > 0) {
209+
memcpy(write_result, data, read);
210+
write_result += read;
211+
data += read;
212+
length -= read;
213+
dest_length -= read;
214+
}
215+
if (length != 0 && dest_length != 0) {
216+
size_t rest = findBestFit(data, length, dest_length);
217+
if (rest != 0) {
218+
DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length);
219+
written += simdutf::convert_latin1_to_utf8(data, rest, write_result);
220+
read += rest;
221+
}
222+
}
223+
} else {
224+
auto data = reinterpret_cast<const char16_t*>(view.data16());
225+
226+
// Check if input has unpaired surrogates - if so, convert to well-formed
227+
// first
228+
simdutf::result validation_result =
229+
simdutf::validate_utf16_with_errors(data, length);
230+
231+
if (validation_result.error == simdutf::SUCCESS) {
232+
// Valid UTF-16 - use the fast path
233+
read = findBestFit(data, length, dest_length);
234+
if (read != 0) {
235+
DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length);
236+
written = simdutf::convert_utf16_to_utf8(data, read, write_result);
237+
}
238+
} else {
239+
// Invalid UTF-16 with unpaired surrogates - convert to well-formed first
240+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
241+
// available
242+
// Limit conversion to what could fit in destination, avoiding splitting
243+
// a valid surrogate pair at the boundary
244+
size_t safe_length = std::min(length, dest_length);
245+
if (safe_length > 0 && safe_length < view.length() &&
246+
isSurrogatePair(data[safe_length - 1], data[safe_length])) {
247+
safe_length--;
248+
}
249+
250+
MaybeStackBuffer<char16_t, MAX_SIZE_FOR_STACK_ALLOC> conversion_buffer(
251+
safe_length);
252+
simdutf::to_well_formed_utf16(data, safe_length, conversion_buffer.out());
253+
254+
// Now use findBestFit with the well-formed data
255+
read = findBestFit(conversion_buffer.out(), safe_length, dest_length);
256+
if (read != 0) {
257+
DCHECK_LE(
258+
simdutf::utf8_length_from_utf16(conversion_buffer.out(), read),
259+
dest_length);
260+
written = simdutf::convert_utf16_to_utf8(
261+
conversion_buffer.out(), read, write_result);
262+
}
263+
}
264+
}
265+
DCHECK_LE(written, dest_length);
110266

111-
binding_data->encode_into_results_buffer_[0] = nchars;
112-
binding_data->encode_into_results_buffer_[1] = written;
267+
binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
268+
binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
113269
}
114270

115271
// Encode a single string to a UTF-8 Uint8Array (not Buffer).

0 commit comments

Comments
 (0)