Skip to content

Commit 8892f7f

Browse files
macvincent authored and meta-codesync[bot] committed
Integrate Max Stream Size Chunking in Velox Writer (facebookincubator#249)
Summary: Pull Request resolved: facebookincubator#249 This diff completes the new chunking policy described in [this doc](https://fburl.com/gdoc/gkdwwju1). It introduces integration of the `StreamChunker` with the `VeloxWriter`, enabling large streams to be broken down into multiple chunks of size up to `maxStreamChunkRawSize`. This prevents readers from materializing excessively large chunks and improves memory management. **Two-Phase Chunking Policy** **Phase 1 – Memory Pressure Management (`shouldChunk`)** - Monitors total in-memory data size. - When memory usage exceeds the threshold, initiates chunking for streams above `maxStreamChunkRawSize`. - If memory pressure persists, continues chunking by sorting streams by memory footprint and processing them in batches until pressure is relieved or no more streams can be chunked. **Phase 2 – Storage Size Optimization (`shouldFlush`)** - Implements compression-aware stripe size prediction. - If chunking fails to resolve memory pressure, forces a full stripe flush. - Calculates the anticipated final compressed stripe size and triggers a flush when the predicted size reaches the target threshold. Reviewed By: helfman Differential Revision: D82175496
1 parent 14ba441 commit 8892f7f

File tree

6 files changed

+177
-101
lines changed

6 files changed

+177
-101
lines changed

dwio/nimble/velox/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ add_library(
153153
VeloxWriter.cpp
154154
ChunkedStreamWriter.cpp
155155
VeloxWriterDefaultMetadataOSS.cpp
156+
StreamChunker.cpp
156157
)
157158
target_link_libraries(
158159
nimble_velox_writer

dwio/nimble/velox/VeloxWriter.cpp

Lines changed: 63 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "dwio/nimble/velox/SchemaSerialization.h"
3434
#include "dwio/nimble/velox/SchemaTypes.h"
3535
#include "dwio/nimble/velox/StatsGenerated.h"
36+
#include "dwio/nimble/velox/StreamChunker.h"
3637
#include "velox/common/time/CpuWallTimer.h"
3738
#include "velox/dwio/common/ExecutorBarrier.h"
3839
#include "velox/type/Type.h"
@@ -808,6 +809,7 @@ void VeloxWriter::writeChunk(bool lastChunk) {
808809

809810
bool VeloxWriter::writeChunks(
810811
std::span<const uint32_t> streamIndices,
812+
bool ensureFullChunks,
811813
bool lastChunk) {
812814
uint64_t previousFlushWallTime = context_->stripeFlushTiming.wallNanos;
813815
std::atomic<uint64_t> chunkSize = 0;
@@ -823,56 +825,34 @@ bool VeloxWriter::writeChunks(
823825
streams_.resize(context_->schemaBuilder.nodeCount());
824826

825827
auto processStream = [&](StreamData& streamData) {
826-
// TODO: Breakdown large streams above a threshold into smaller chunks.
827-
const auto minStreamSize =
828-
lastChunk ? 0 : context_->options.minStreamChunkRawSize;
829-
const auto* context =
830-
streamData.descriptor().context<WriterStreamContext>();
831-
bool isNullStream = context && context->isNullStream;
832-
bool shouldChunkStream = false;
833-
if (isNullStream) {
834-
// We apply the same null logic, where if all values
835-
// are non-nulls, we omit the entire stream.
836-
shouldChunkStream = streamData.hasNulls() &&
837-
streamData.nonNulls().size() > minStreamSize;
838-
} else {
839-
shouldChunkStream = streamData.data().size() > minStreamSize;
840-
}
841-
842-
// If we have previous written chunks for this stream, during final
843-
// chunk, always write any remaining data.
844-
const auto offset = streamData.descriptor().offset();
845-
NIMBLE_DASSERT(offset < streams_.size(), "Stream offset out of range.");
846-
auto& stream = streams_[offset];
847-
if (lastChunk && !shouldChunkStream && !stream.content.empty()) {
848-
shouldChunkStream =
849-
!streamData.empty() || !streamData.nonNulls().empty();
850-
}
851-
852-
if (shouldChunkStream) {
853-
std::string_view encoded;
854-
if (isNullStream) {
855-
// For null streams we promote the null values to be written as
856-
// boolean data.
857-
encoded = encodeStream(
858-
*context_, *encodingBuffer_, NullsAsDataStreamData(streamData));
859-
} else {
860-
encoded = encodeStream(*context_, *encodingBuffer_, streamData);
861-
}
862-
828+
const auto& offset = streamData.descriptor().offset();
829+
auto& streamSize = context_->columnStats[offset].physicalSize;
830+
logicalSizeBeforeEncoding += streamData.memoryUsed();
831+
auto& streamContent = streams_[offset].content;
832+
auto chunker = getStreamChunker(
833+
streamData,
834+
StreamChunkerOptions{
835+
.minChunkSize =
836+
lastChunk ? 0 : context_->options.minStreamChunkRawSize,
837+
.maxChunkSize = context_->options.maxStreamChunkRawSize,
838+
.ensureFullChunks = ensureFullChunks,
839+
.isFirstChunk = streamContent.empty()});
840+
while (auto streamDataView = chunker->next()) {
841+
std::string_view encoded =
842+
encodeStream(*context_, *encodingBuffer_, *streamDataView);
863843
if (!encoded.empty()) {
864-
auto& streamSize = context_->columnStats[offset].physicalSize;
865844
ChunkedStreamWriter chunkWriter{*encodingBuffer_};
866845
for (auto& buffer : chunkWriter.encode(encoded)) {
867846
streamSize += buffer.size();
868847
chunkSize += buffer.size();
869-
stream.content.push_back(std::move(buffer));
848+
streamContent.push_back(std::move(buffer));
870849
}
871850
}
872851
wroteChunk = true;
873-
logicalSizeBeforeEncoding += streamData.memoryUsed();
874-
streamData.reset();
875852
}
853+
// Compact erases processed stream data to reclaim memory.
854+
chunker->compact();
855+
logicalSizeBeforeEncoding -= streamData.memoryUsed();
876856
};
877857

878858
const auto& streams = context_->streams();
@@ -918,7 +898,7 @@ bool VeloxWriter::writeStripe() {
918898
// Chunk all streams.
919899
std::vector<uint32_t> streamIndices(context_->streams().size());
920900
std::iota(streamIndices.begin(), streamIndices.end(), 0);
921-
writeChunks(streamIndices, true);
901+
writeChunks(streamIndices, /*ensureFullChunks=*/false, /*lastChunk=*/true);
922902
} else {
923903
writeChunk(true);
924904
}
@@ -997,32 +977,50 @@ bool VeloxWriter::evalauateFlushPolicy() {
997977
};
998978

999979
if (context_->options.enableChunking && shouldChunk()) {
1000-
const auto& streams = context_->streams();
1001-
const size_t streamCount = streams.size();
1002-
// Sort streams for chunking based on raw memory usage.
1003-
// TODO(T240072104): Improve performance by bucketing the streams by size
1004-
// (most significant bit) instead of sorting.
1005-
std::vector<uint32_t> streamIndices(streamCount);
1006-
std::iota(streamIndices.begin(), streamIndices.end(), 0);
1007-
std::sort(
1008-
streamIndices.begin(),
1009-
streamIndices.end(),
1010-
[&](const uint32_t& a, const uint32_t& b) {
1011-
return streams[a]->memoryUsed() > streams[b]->memoryUsed();
1012-
});
980+
auto batchChunkStreams = [&](const std::vector<uint32_t>& indices,
981+
bool ensureFullChunks) {
982+
const size_t indicesCount = indices.size();
983+
const auto batchSize = context_->options.chunkedStreamBatchSize;
984+
for (size_t index = 0; index < indicesCount; index += batchSize) {
985+
size_t currentBatchSize = std::min(batchSize, indicesCount - index);
986+
std::span<const uint32_t> batchIndices(
987+
indices.begin() + index, currentBatchSize);
988+
// Stop attempting chunking once streams are too small to chunk or
989+
// memory pressure is relieved.
990+
if (!writeChunks(batchIndices, ensureFullChunks) || !shouldChunk()) {
991+
return false;
992+
}
993+
}
994+
return true;
995+
};
1013996

1014-
// Chunk streams in batches.
1015-
const auto batchSize = context_->options.chunkedStreamBatchSize;
1016-
for (size_t index = 0; index < streamCount; index += batchSize) {
1017-
const size_t currentBatchSize = std::min(batchSize, streamCount - index);
1018-
std::span<const uint32_t> batchIndices(
1019-
streamIndices.begin() + index, currentBatchSize);
1020-
// Stop attempting chunking once streams are too small to chunk or
1021-
// memory pressure is relieved.
1022-
if (!(writeChunks(batchIndices, false) && shouldChunk())) {
1023-
break;
997+
// Relieve memory pressure by chunking streams above max size.
998+
const auto& streams = context_->streams();
999+
std::vector<uint32_t> streamIndices;
1000+
streamIndices.reserve(streams.size());
1001+
for (auto streamIndex = 0; streamIndex < streams.size(); ++streamIndex) {
1002+
if (streams[streamIndex]->memoryUsed() >=
1003+
context_->options.maxStreamChunkRawSize) {
1004+
streamIndices.push_back(streamIndex);
10241005
}
10251006
}
1007+
const bool continueChunking =
1008+
batchChunkStreams(streamIndices, /*ensureFullChunks=*/true);
1009+
if (continueChunking) {
1010+
// Relieve memory pressure by chunking small streams.
1011+
// Sort streams for chunking based on raw memory usage.
1012+
// TODO(T240072104): Improve performance by bucketing the streams
1013+
// by size (by most significant bit) instead of sorting them.
1014+
streamIndices.resize(streams.size());
1015+
std::iota(streamIndices.begin(), streamIndices.end(), 0);
1016+
std::sort(
1017+
streamIndices.begin(),
1018+
streamIndices.end(),
1019+
[&](const uint32_t& a, const uint32_t& b) {
1020+
return streams[a]->memoryUsed() > streams[b]->memoryUsed();
1021+
});
1022+
batchChunkStreams(streamIndices, /*ensureFullChunks=*/false);
1023+
}
10261024
}
10271025

10281026
if (shouldFlush()) {

dwio/nimble/velox/VeloxWriter.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ class VeloxWriter {
9292
// Returns 'true' if chunks were written.
9393
bool writeChunks(
9494
std::span<const uint32_t> streamIndices,
95-
bool lastChunk = true);
95+
bool ensureFullChunks = false,
96+
bool lastChunk = false);
9697
};
9798

9899
} // namespace facebook::nimble

dwio/nimble/velox/VeloxWriterOptions.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,11 @@ struct VeloxWriterOptions {
9696
// Note: this threshold is ignored when it is time to flush a stripe.
9797
uint64_t minStreamChunkRawSize = 1024;
9898

99+
// When flushing data streams into chunks, streams with raw data size larger
100+
// than this threshold will be broken down into multiple smaller chunks. Each
101+
// chunk will be at most this size.
102+
uint64_t maxStreamChunkRawSize = 4 << 20;
103+
99104
// Number of streams to try chunking between memory pressure evaluations.
100105
// Note: this is ignored when it is time to flush a stripe.
101106
size_t chunkedStreamBatchSize = 1024;

dwio/nimble/velox/tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ target_link_libraries(
4646
nimble_common_file_writer
4747
nimble_common
4848
nimble_encodings
49+
nimble_encodings_tests_utils
4950
velox_vector
5051
velox_vector_fuzzer
5152
velox_vector_test_lib

0 commit comments

Comments (0)