Skip to content

Commit d47585e

Browse files
anonrig, erikcorry, and lemire
committed
util: improve textencoder encodeInto performance
Co-authored-by: Erik Corry <[email protected]>
Co-authored-by: Daniel Lemire <[email protected]>
1 parent 340e619 commit d47585e

File tree

1 file changed

+136
-8
lines changed

1 file changed

+136
-8
lines changed

src/encoding_binding.cc

Lines changed: 136 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,82 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
7171
return info;
7272
}
7373

74+
namespace {
75+
// Reports whether two consecutive UTF-16 code units form a surrogate pair:
// `lead` must lie in 0xD800..0xDBFF and `trail` in 0xDC00..0xDFFF. Masking
// with 0xfc00 keeps the top six bits, which identify each of those ranges.
constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
  constexpr uint16_t kSurrogateMask = 0xfc00;
  const bool leadIsHigh = (lead & kSurrogateMask) == 0xd800;
  const bool trailIsLow = (trail & kSurrogateMask) == 0xdc00;
  return leadIsHigh && trailIsLow;
}
78+
79+
// Returns the number of UTF-8 bytes needed to encode the single code unit `c`
// when it is treated as a code point on its own:
//   U+0000..U+007F -> 1 byte
//   U+0080..U+07FF -> 2 bytes
//   U+0800..U+FFFF -> 3 bytes
// Surrogate code units therefore report 3 bytes each, which over-estimates a
// well-formed pair (4 bytes in total) and keeps the estimate on the safe side.
constexpr size_t simpleUtfEncodingLength(uint16_t c) {
  if (c < 0x80) return 1;
  // Two-byte UTF-8 sequences carry 11 payload bits, so they reach U+07FF.
  if (c < 0x800) return 2;
  return 3;
}
84+
85+
template <typename Char>
86+
size_t findBestFit(const Char* data, size_t length, size_t bufferSize) {
87+
size_t pos = 0;
88+
size_t utf8Accumulated = 0;
89+
constexpr size_t CHUNK = 257;
90+
constexpr bool UTF16 = sizeof(Char) == 2;
91+
constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2;
92+
93+
double expansion = 1.15;
94+
95+
while (pos < length && utf8Accumulated < bufferSize) {
96+
size_t remainingInput = length - pos;
97+
size_t spaceRemaining = bufferSize - utf8Accumulated;
98+
DCHECK_GE(expansion, 1.15);
99+
100+
size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
101+
if (guaranteedToFit >= remainingInput) {
102+
return length;
103+
}
104+
size_t likelyToFit =
105+
std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK);
106+
size_t fitEstimate =
107+
std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit));
108+
size_t chunkSize = std::min(remainingInput, fitEstimate);
109+
if (chunkSize == 1) break;
110+
DCHECK_GE(chunkSize, 1);
111+
112+
size_t chunkUtf8Len;
113+
if constexpr (UTF16) {
114+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
115+
// available For now, validate and use utf8_length_from_utf16
116+
chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize);
117+
} else {
118+
chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize);
119+
}
120+
121+
if (utf8Accumulated + chunkUtf8Len > bufferSize) {
122+
DCHECK_GT(chunkSize, guaranteedToFit);
123+
expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize);
124+
} else {
125+
expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize);
126+
pos += chunkSize;
127+
utf8Accumulated += chunkUtf8Len;
128+
}
129+
}
130+
131+
while (pos < length && utf8Accumulated < bufferSize) {
132+
size_t extra = simpleUtfEncodingLength(data[pos]);
133+
if (utf8Accumulated + extra > bufferSize) break;
134+
pos++;
135+
utf8Accumulated += extra;
136+
}
137+
138+
if (UTF16 && pos != 0 && pos != length &&
139+
isSurrogatePair(data[pos - 1], data[pos])) {
140+
if (utf8Accumulated < bufferSize) {
141+
pos++;
142+
} else {
143+
pos--;
144+
}
145+
}
146+
return pos;
147+
}
148+
} // namespace
149+
74150
void BindingData::Deserialize(Local<Context> context,
75151
Local<Object> holder,
76152
int index,
@@ -101,15 +177,67 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
101177
char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset();
102178
size_t dest_length = dest->ByteLength();
103179

104-
size_t nchars;
105-
size_t written = source->WriteUtf8V2(isolate,
106-
write_result,
107-
dest_length,
108-
String::WriteFlags::kReplaceInvalidUtf8,
109-
&nchars);
180+
size_t read = 0;
181+
size_t written = 0;
182+
v8::String::ValueView view(isolate, source);
183+
uint32_t length = view.length();
184+
185+
if (view.is_one_byte()) {
186+
auto data = reinterpret_cast<const char*>(view.data8());
187+
simdutf::result result = simdutf::validate_ascii_with_errors(
188+
data, std::min(static_cast<size_t>(length), dest_length));
189+
written = read = result.count;
190+
auto out_addr = write_result;
191+
memcpy(out_addr, data, read);
192+
out_addr += read;
193+
data += read;
194+
length -= read;
195+
dest_length -= read;
196+
if (length != 0 && dest_length != 0) {
197+
size_t rest = findBestFit(data, length, dest_length);
198+
if (rest != 0) {
199+
DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length);
200+
written += simdutf::convert_latin1_to_utf8(data, rest, out_addr);
201+
read += rest;
202+
}
203+
}
204+
} else {
205+
auto data = reinterpret_cast<const char16_t*>(view.data16());
206+
207+
// Check if input has unpaired surrogates - if so, convert to well-formed
208+
// first
209+
simdutf::result validation_result =
210+
simdutf::validate_utf16_with_errors(data, length);
211+
212+
if (validation_result.error == simdutf::SUCCESS) {
213+
// Valid UTF-16 - use the fast path
214+
read = findBestFit(data, length, dest_length);
215+
if (read != 0) {
216+
DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length);
217+
written = simdutf::convert_utf16_to_utf8(data, read, write_result);
218+
}
219+
} else {
220+
// Invalid UTF-16 with unpaired surrogates - convert to well-formed first
221+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
222+
// available
223+
std::vector<char16_t> conversion_buffer(length);
224+
simdutf::to_well_formed_utf16(data, length, conversion_buffer.data());
225+
226+
// Now use findBestFit with the well-formed data
227+
read = findBestFit(conversion_buffer.data(), length, dest_length);
228+
if (read != 0) {
229+
DCHECK_LE(
230+
simdutf::utf8_length_from_utf16(conversion_buffer.data(), read),
231+
dest_length);
232+
written = simdutf::convert_utf16_to_utf8(
233+
conversion_buffer.data(), read, write_result);
234+
}
235+
}
236+
}
237+
DCHECK_LE(written, dest_length);
110238

111-
binding_data->encode_into_results_buffer_[0] = nchars;
112-
binding_data->encode_into_results_buffer_[1] = written;
239+
binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
240+
binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
113241
}
114242

115243
// Encode a single string to a UTF-8 Uint8Array (not Buffer).

0 commit comments

Comments
 (0)