util: improve textencoder encodeInto performance

anonrig · erikcorry · lemire · web-flow · commit b1e941e7b259 · 2025-12-01T01:17:44.000Z
Co-authored-by: Erik Corry <ecorry@cloudflare.com> Co-authored-by: Daniel Lemire <daniel@lemire.me> PR-URL: #60843 Reviewed-By: Daniel Lemire <daniel@lemire.me> Reviewed-By: Rafael Gonzaga <rafael.nunu@hotmail.com>
diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc
@@ -6,8 +6,10 @@
 #include "node_external_reference.h"
 #include "simdutf.h"
 #include "string_bytes.h"
+#include "util.h"
 #include "v8.h"
 
+#include <algorithm>
 #include <cstdint>
 
 namespace node {
@@ -71,6 +73,113 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
   return info;
 }
 
+// The following code is adapted from Cloudflare workers.
+// Particularly from: https://github.com/cloudflare/workerd/pull/5448
+//
+// Copyright (c) 2017-2025 Cloudflare, Inc.
+// Licensed under the Apache 2.0 license found in the LICENSE file or at:
+//     https://opensource.org/licenses/Apache-2.0
+namespace {
+constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4096;  // MaybeStackBuffer template arg
+
+constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
+  return (lead >> 10) == 0x36 && (trail >> 10) == 0x37;  // D800-DBFF, DC00-DFFF
+}
+
+constexpr size_t simpleUtfEncodingLength(uint16_t c) {  // UTF-8 bytes for c
+  if (c < 0x80) return 1;   // U+0000..U+007F
+  if (c < 0x800) return 2;  // U+0080..U+07FF
+  return 3;                 // U+0800..U+FFFF, surrogate halves included
+}
+
+// Returns how many leading code units of `data` (UTF-16 or Latin1, `length`
+// units in total) can be encoded into a UTF-8 buffer of `bufferSize` bytes.
+//
+// The challenge is that UTF-8 encoding expands characters by variable amounts:
+// - ASCII (< 0x80): 1 byte
+// - Code points < 0x800: 2 bytes
+// - Other BMP characters: 3 bytes
+// - Surrogate pairs (supplementary planes): 4 bytes total
+//
+// This function uses an adaptive chunking algorithm:
+// 1. Process the input in chunks, estimating how many characters will fit
+// 2. Calculate the actual UTF-8 length for each chunk using simdutf
+// 3. Adjust the expansion factor based on observed encoding ratios
+// 4. Fall back to character-by-character processing near the buffer boundary
+// 5. Handle UTF-16 surrogate pairs to avoid splitting them across boundaries
+//
+// The algorithm starts with a low expansion estimate (1.15 bytes per code
+// unit) and dynamically adjusts based on actual character distribution,
+// making it efficient for common ASCII-heavy text while remaining correct
+// for multi-byte heavy content.
+template <typename Char>
+size_t findBestFit(const Char* data, size_t length, size_t bufferSize) {
+  size_t pos = 0;
+  size_t utf8Accumulated = 0;
+  constexpr size_t CHUNK = 257;
+  constexpr bool UTF16 = sizeof(Char) == 2;
+  constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2;
+  constexpr int UNIT_MASK = UTF16 ? 0xffff : 0xff;  // undoes sign-extension
+  double expansion = 1.15;
+
+  while (pos < length && utf8Accumulated < bufferSize) {
+    size_t remainingInput = length - pos;
+    size_t spaceRemaining = bufferSize - utf8Accumulated;
+    DCHECK_GE(expansion, 1.15);
+    size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
+    if (guaranteedToFit >= remainingInput) {
+      return length;
+    }
+    size_t likelyToFit =
+        std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK);
+    size_t fitEstimate =
+        std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit));
+    size_t chunkSize = std::min(remainingInput, fitEstimate);
+    if (chunkSize == 1) break;  // leave the tail to the per-unit loop below
+    CHECK_GT(chunkSize, 1);
+
+    size_t chunkUtf8Len;
+    if constexpr (UTF16) {
+      // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
+      // available. For now, validate and use utf8_length_from_utf16.
+      size_t newPos = pos + chunkSize;
+      if (newPos < length && isSurrogatePair(data[newPos - 1], data[newPos]))
+        chunkSize--;
+      chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize);
+    } else {
+      chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize);
+    }
+    // On overshoot only raise the estimate; otherwise commit the chunk.
+    if (utf8Accumulated + chunkUtf8Len > bufferSize) {
+      DCHECK_GT(chunkSize, guaranteedToFit);
+      expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize);
+    } else {
+      expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize);
+      pos += chunkSize;
+      utf8Accumulated += chunkUtf8Len;
+    }
+  }
+  // Tail: step unit by unit; the mask keeps a signed Latin1 `char` in 0..0xff.
+  while (pos < length && utf8Accumulated < bufferSize) {
+    size_t extra =
+        simpleUtfEncodingLength(static_cast<uint16_t>(data[pos] & UNIT_MASK));
+    if (utf8Accumulated + extra > bufferSize) break;
+    pos++;
+    utf8Accumulated += extra;
+  }
+  // Never stop inside a surrogate pair: take both halves or neither.
+  if (UTF16 && pos != 0 && pos != length &&
+      isSurrogatePair(data[pos - 1], data[pos])) {
+    if (utf8Accumulated < bufferSize) {
+      pos++;
+    } else {
+      pos--;
+    }
+  }
+  return pos;
+}
+}  // namespace
+
 void BindingData::Deserialize(Local<Context> context,
                               Local<Object> holder,
                               int index,
@@ -98,18 +207,102 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
 
   Local<Uint8Array> dest = args[1].As<Uint8Array>();
   Local<ArrayBuffer> buf = dest->Buffer();
+
+  // Handle detached buffers - return {read: 0, written: 0}
+  if (buf->Data() == nullptr) {
+    binding_data->encode_into_results_buffer_[0] = 0;
+    binding_data->encode_into_results_buffer_[1] = 0;
+    return;
+  }
+
   char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset();
   size_t dest_length = dest->ByteLength();
+  size_t read = 0;
+  size_t written = 0;
+
+  // For small strings (length <= 32), use the old V8 path for better
+  // performance
+  static constexpr int kSmallStringThreshold = 32;
+  if (source->Length() <= kSmallStringThreshold) {
+    written = source->WriteUtf8V2(isolate,
+                                  write_result,
+                                  dest_length,
+                                  String::WriteFlags::kReplaceInvalidUtf8,
+                                  &read);
+    binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
+    binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
+    return;
+  }
 
-  size_t nchars;
-  size_t written = source->WriteUtf8V2(isolate,
-                                       write_result,
-                                       dest_length,
-                                       String::WriteFlags::kReplaceInvalidUtf8,
-                                       &nchars);
+  v8::String::ValueView view(isolate, source);
+  size_t length_that_fits =
+      std::min(static_cast<size_t>(view.length()), dest_length);
+
+  if (view.is_one_byte()) {
+    auto data = reinterpret_cast<const char*>(view.data8());
+    simdutf::result result =
+        simdutf::validate_ascii_with_errors(data, length_that_fits);
+    written = read = result.count;
+    memcpy(write_result, data, read);
+    write_result += read;
+    data += read;
+    length_that_fits -= read;
+    dest_length -= read;
+    if (length_that_fits != 0 && dest_length != 0) {
+      if (size_t rest = findBestFit(data, length_that_fits, dest_length)) {
+        DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length);
+        written += simdutf::convert_latin1_to_utf8(data, rest, write_result);
+        read += rest;
+      }
+    }
+  } else {
+    auto data = reinterpret_cast<const char16_t*>(view.data16());
+
+    // Limit conversion to what could fit in destination, avoiding splitting
+    // a valid surrogate pair at the boundary, which could cause a spurious call
+    // of simdutf::to_well_formed_utf16()
+    if (length_that_fits > 0 && length_that_fits < view.length() &&
+        isSurrogatePair(data[length_that_fits - 1], data[length_that_fits])) {
+      length_that_fits--;
+    }
+
+    // Check if input has unpaired surrogates - if so, convert to well-formed
+    // first
+    simdutf::result validation_result =
+        simdutf::validate_utf16_with_errors(data, length_that_fits);
+
+    if (validation_result.error == simdutf::SUCCESS) {
+      // Valid UTF-16 - use the fast path
+      read = findBestFit(data, length_that_fits, dest_length);
+      if (read != 0) {
+        DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length);
+        written = simdutf::convert_utf16_to_utf8(data, read, write_result);
+      }
+    } else {
+      // Invalid UTF-16 with unpaired surrogates - convert to well-formed first
+      // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
+      // available
+      MaybeStackBuffer<char16_t, MAX_SIZE_FOR_STACK_ALLOC> conversion_buffer(
+          length_that_fits);
+      simdutf::to_well_formed_utf16(
+          data, length_that_fits, conversion_buffer.out());
+
+      // Now use findBestFit with the well-formed data
+      read =
+          findBestFit(conversion_buffer.out(), length_that_fits, dest_length);
+      if (read != 0) {
+        DCHECK_LE(
+            simdutf::utf8_length_from_utf16(conversion_buffer.out(), read),
+            dest_length);
+        written = simdutf::convert_utf16_to_utf8(
+            conversion_buffer.out(), read, write_result);
+      }
+    }
+  }
+  DCHECK_LE(written, dest->ByteLength());
 
-  binding_data->encode_into_results_buffer_[0] = nchars;
-  binding_data->encode_into_results_buffer_[1] = written;
+  binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
+  binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
 }
 
 // Encode a single string to a UTF-8 Uint8Array (not Buffer).