Skip to content

Commit 3930c67

Browse files
macvincent authored and meta-codesync[bot] committed
Support Per Stream Chunking to Relieve Memory Pressure (facebookincubator#243)
Summary: Pull Request resolved: facebookincubator#243 This is an implementation of a detail in the new chunking policy described in this [doc](https://fburl.com/gdoc/gkdwwju1). Rather than chunking all eligible streams, we chunk individual streams in the order of their raw size until memory pressure is relieved. For our unit tests, the maximum number of chunks produced is identical to the previous implementation. But there may be differences for large file sizes. This requires more experimentation and tuning to determine the right threshold value that takes advantage of this. Differential Revision: D81715655
1 parent 92978c7 commit 3930c67

File tree

4 files changed

+83
-7
lines changed

4 files changed

+83
-7
lines changed

dwio/nimble/velox/VeloxWriter.cpp

Lines changed: 51 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -791,7 +791,9 @@ void VeloxWriter::writeChunk(bool lastChunk) {
791791
<< ", chunk bytes: " << chunkSize;
792792
}
793793

794-
bool VeloxWriter::writeChunks(bool lastChunk) {
794+
bool VeloxWriter::writeChunks(
795+
bool lastChunk,
796+
std::span<const uint32_t> streamIndices) {
795797
uint64_t previousFlushWallTime = context_->stripeFlushTiming.wallNanos;
796798
std::atomic<uint64_t> chunkSize = 0;
797799
std::atomic<uint64_t> logicalSizeBeforeEncoding = 0;
@@ -865,15 +867,26 @@ bool VeloxWriter::writeChunks(bool lastChunk) {
865867
}
866868
};
867869

870+
const auto& streams = context_->streams();
871+
std::vector<uint32_t> streamIndicesVec;
872+
if (streamIndices.empty()) {
873+
// Chunk all streams if no stream indices are provided
874+
streamIndicesVec.resize(streams.size());
875+
std::iota(streamIndicesVec.begin(), streamIndicesVec.end(), 0);
876+
streamIndices = streamIndicesVec;
877+
}
878+
868879
if (context_->options.encodingExecutor) {
869880
velox::dwio::common::ExecutorBarrier barrier{
870881
context_->options.encodingExecutor};
871-
for (auto& streamData : context_->streams()) {
882+
for (auto streamIndex : streamIndices) {
883+
auto& streamData = streams[streamIndex];
872884
barrier.add([&] { processStream(*streamData); });
873885
}
874886
barrier.waitAll();
875887
} else {
876-
for (auto& streamData : context_->streams()) {
888+
for (auto streamIndex : streamIndices) {
889+
auto& streamData = streams[streamIndex];
877890
processStream(*streamData);
878891
}
879892
}
@@ -972,8 +985,41 @@ bool VeloxWriter::tryWriteStripe(bool force) {
972985

973986
try {
974987
// TODO: we can improve merge the last chunk write with stripe
975-
if (context_->options.enableChunking) {
976-
while (shouldChunk() == ChunkDecision::Chunk && writeChunks(false)) {
988+
if (context_->options.enableChunking &&
989+
shouldChunk() == ChunkDecision::Chunk) {
990+
const auto& streams = context_->streams();
991+
const uint32_t streamCount = streams.size();
992+
// Sort streams for chunking based on raw memory usage.
993+
// TODO(T240072104): Improve performance by bucketing the streams by size
994+
// (most significant bit) instead of sorting.
995+
std::vector<uint32_t> streamIndices(streamCount);
996+
std::iota(streamIndices.begin(), streamIndices.end(), 0);
997+
std::sort(
998+
streamIndices.begin(),
999+
streamIndices.end(),
1000+
[&](const uint32_t& a, const uint32_t& b) {
1001+
return streams[a]->memoryUsed() > streams[b]->memoryUsed();
1002+
});
1003+
1004+
// Chunk streams in batches.
1005+
uint32_t currentIndex = 0;
1006+
ChunkDecision decision = ChunkDecision::Chunk;
1007+
NIMBLE_DASSERT(
1008+
context_->options.chunkedStreamBatchSize > 0,
1009+
"streamEncodingBatchSize must be greater than 0");
1010+
while (currentIndex < streams.size() &&
1011+
decision == ChunkDecision::Chunk) {
1012+
uint32_t endStreamIndex = std::min(
1013+
streamCount,
1014+
currentIndex + context_->options.chunkedStreamBatchSize);
1015+
std::span<const uint32_t> batchIndices(
1016+
streamIndices.data() + currentIndex, endStreamIndex - currentIndex);
1017+
// Stop attempting chunking once streams are too small to chunk.
1018+
if (!writeChunks(false, batchIndices)) {
1019+
break;
1020+
}
1021+
currentIndex = endStreamIndex;
1022+
decision = shouldChunk();
9771023
}
9781024
}
9791025

dwio/nimble/velox/VeloxWriter.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,9 @@ class VeloxWriter {
8888
bool tryWriteStripe(bool force = false);
8989
void writeChunk(bool lastChunk = true);
9090
// Returns 'true' if chunks were written.
91-
bool writeChunks(bool lastChunk = true);
91+
bool writeChunks(
92+
bool lastChunk = true,
93+
std::span<const uint32_t> streamIndices = {});
9294
uint32_t writeStripe();
9395
};
9496

dwio/nimble/velox/VeloxWriterOptions.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ struct VeloxWriterOptions {
9696
// Note: this threshold is ignored when it is time to flush a stripe.
9797
uint64_t minStreamChunkRawSize = 1024;
9898

99+
// Number of streams to try chunking between memory pressure evaluations.
100+
// Note: this is ignored when it is time to flush a stripe.
101+
uint32_t chunkedStreamBatchSize = 1024;
102+
99103
// The factory function that produces the root encoding selection policy.
100104
// Encoding selection policy is the way to balance the tradeoffs of
101105
// different performance factors (at both read and write times). Heuristics

dwio/nimble/velox/tests/VeloxWriterTests.cpp

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1965,6 +1965,7 @@ struct ChunkFlushPolicyTestCase {
19651965
const uint32_t expectedStripeCount{0};
19661966
const uint32_t expectedMaxChunkCount{0};
19671967
const uint32_t expectedMinChunkCount{0};
1968+
const uint32_t chunkedStreamBatchSize{2};
19681969
};
19691970

19701971
class ChunkFlushPolicyTest
@@ -1976,6 +1977,7 @@ TEST_P(ChunkFlushPolicyTest, ChunkFlushPolicyIntegration) {
19761977
{{"BIGINT", velox::BIGINT()}, {"SMALLINT", velox::SMALLINT()}});
19771978
nimble::VeloxWriterOptions writerOptions{
19781979
.minStreamChunkRawSize = GetParam().minStreamChunkRawSize,
1980+
.chunkedStreamBatchSize = GetParam().chunkedStreamBatchSize,
19791981
.flushPolicyFactory = GetParam().enableChunking
19801982
? []() -> std::unique_ptr<nimble::FlushPolicy> {
19811983
return std::make_unique<nimble::ChunkFlushPolicy>(
@@ -2097,6 +2099,7 @@ INSTANTIATE_TEST_CASE_P(
20972099
.expectedStripeCount = 4,
20982100
.expectedMaxChunkCount = 1,
20992101
.expectedMinChunkCount = 1,
2102+
.chunkedStreamBatchSize = 10,
21002103
},
21012104
// Base case with default settings (has chunking)
21022105
ChunkFlushPolicyTestCase{
@@ -2110,6 +2113,7 @@ INSTANTIATE_TEST_CASE_P(
21102113
.expectedStripeCount = 7,
21112114
.expectedMaxChunkCount = 2,
21122115
.expectedMinChunkCount = 1,
2116+
.chunkedStreamBatchSize = 10,
21132117
},
21142118
// High memory regression threshold
21152119
// Produces file identical to RawStripeSizeFlushPolicy
@@ -2124,6 +2128,7 @@ INSTANTIATE_TEST_CASE_P(
21242128
.expectedStripeCount = 4,
21252129
.expectedMaxChunkCount = 1,
21262130
.expectedMinChunkCount = 1,
2131+
.chunkedStreamBatchSize = 10,
21272132
},
21282133
// Low memory regression threshold
21292134
// Produces file with more chunks per stripe
@@ -2138,6 +2143,7 @@ INSTANTIATE_TEST_CASE_P(
21382143
.expectedStripeCount = 10,
21392144
.expectedMaxChunkCount = 2,
21402145
.expectedMinChunkCount = 2,
2146+
.chunkedStreamBatchSize = 10,
21412147
},
21422148
// High target stripe size bytes (with disabled memory pressure
21432149
// optimization) produces fewer stripes. Single chunks.
@@ -2152,6 +2158,8 @@ INSTANTIATE_TEST_CASE_P(
21522158
.expectedStripeCount = 1, // -2 stripes
21532159
.expectedMaxChunkCount = 1,
21542160
.expectedMinChunkCount = 1,
2161+
.chunkedStreamBatchSize = 10,
2162+
21552163
},
21562164
// Low target stripe size bytes (with disabled memory pressure
21572165
// optimization) produces more stripes. Single chunks.
@@ -2166,5 +2174,21 @@ INSTANTIATE_TEST_CASE_P(
21662174
.expectedStripeCount = 7, // +6 stripes
21672175
.expectedMaxChunkCount = 1,
21682176
.expectedMinChunkCount = 1,
2169-
}));
2177+
.chunkedStreamBatchSize = 10,
2178+
2179+
},
2180+
// Higher chunked stream batch size (no change in policy)
2181+
ChunkFlushPolicyTestCase{
2182+
.batchCount = 20,
2183+
.enableChunking = true,
2184+
.targetStripeSizeBytes = 250 << 10, // 250KB
2185+
.writerMemoryHighThresholdBytes = 80 << 10,
2186+
.writerMemoryLowThresholdBytes = 75 << 10,
2187+
.estimatedCompressionFactor = 1.0,
2188+
.minStreamChunkRawSize = 100,
2189+
.expectedStripeCount = 7,
2190+
.expectedMaxChunkCount = 2,
2191+
.expectedMinChunkCount = 1,
2192+
.chunkedStreamBatchSize = 3} // +1
2193+
));
21702194
} // namespace facebook

0 commit comments

Comments (0)