Skip to content

Commit e6e7d0a

Browse files
jiangtian
authored and committed
fix
1 parent 52a4015 commit e6e7d0a

File tree

3 files changed

+15
-12
lines changed

3 files changed

+15
-12
lines changed

velox/exec/Aggregate.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ class Aggregate {
302302

303303
for (auto* group : groups) {
304304
group[initializedByte_] &= ~initializedMask_;
305+
clearNull(group);
305306
}
306307
}
307308

velox/exec/AggregateWindow.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,6 @@ class AggregateWindowFunction : public exec::WindowFunction {
150150
// This is the start of a new incremental aggregation. So the
151151
// aggregate_ function object should be initialized.
152152
auto singleGroup = std::vector<vector_size_t>{0};
153-
aggregate_->clear();
154153
aggregate_->destroy(folly::Range<char**>(&rawSingleGroupRow_, 1));
155154
aggregate_->initializeNewGroups(&rawSingleGroupRow_, singleGroup);
156155
aggregateInitialized_ = true;
@@ -337,7 +336,6 @@ class AggregateWindowFunction : public exec::WindowFunction {
337336
// TODO : Try to re-use previous computations by advancing and retracting
338337
// the aggregation based on the frame changes with each row. This would
339338
// require adding new APIs to the Aggregate framework.
340-
aggregate_->clear();
341339
aggregate_->destroy(folly::Range<char**>(&rawSingleGroupRow_, 1));
342340
aggregate_->initializeNewGroups(&rawSingleGroupRow_, kSingleGroup);
343341
aggregateInitialized_ = true;

velox/functions/sparksql/window/tests/SparkWindowTest.cpp

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,8 @@ class SparkAggregateWindowLimitMemoryTest
115115
static void SetUpTestCase() {
116116
OperatorTestBase::SetUpTestCase();
117117
OperatorTestBase::setupMemory(
118-
256 << 20, // allocatorCapacity
119-
256 << 20, // arbitratorCapacity
118+
192 << 20, // allocatorCapacity
119+
192 << 20, // arbitratorCapacity
120120
0, // arbitratorReservedCapacity
121121
0, // memoryPoolInitCapacity
122122
0, // memoryPoolReservedCapacity
@@ -137,18 +137,22 @@ class SparkAggregateWindowLimitMemoryTest
137137
// the limit and cause failure.
138138
// The capacity is calculated as:
139139
// 1. size of input data: 1 rows * 32KB (the length of the string) = 32KB
140-
// 2. size of RowContainer: ((1,024 rows * 32KB (data) + 1,024 rows * 32KB
141-
// (Accumulators))) * 3 = 192MB
142-
// (Accumulators won't be destroyed now)
140+
// 2. size of RowContainer: ((1,024 rows * 32KB (data))) * 3 +
141+
// 32KB(Accumulator) = 96MB
142+
// (Accumulators will be destroyed when processing a new partition.)
143143
// 3. size of results: 10 rows * 32KB * 2 (column 'd' and the result column) =
144144
// 640KB
145145
// 4. other overheads
146-
// Total: ~ 192MB
146+
// Total: ~ 96MB
147147
// If we don't clear the string buffers in time, the size of the string buffers
148-
// would be at least 1,024 rows * 32KB * 3 = 96MB. So without the fix, we need
149-
// capacity to be set to more than 192MB + 96MB = 288MB to pass the test.
150-
// So we set the capacity to 256MB here.
151-
TEST_F(SparkAggregateWindowLimitMemoryTest, clearStringBuffersInTime) {
148+
// would be at least 1,024 rows * 32KB * 3 = 96MB. If we don't destroy the
149+
// previously created accumulator, the memory size of accumulators accumulated
150+
// will be (32KB * 1024 rows * 3) = 96MB. So without the fix, we need capacity
151+
// to be set to more than 96MB + 96MB + 96MB = 288MB to pass the test. We set
152+
// the capacity to 192MB here to verify that our fixes work.
153+
TEST_F(
154+
SparkAggregateWindowLimitMemoryTest,
155+
clearStringBuffersAndAccumulatorsInTime) {
152156
constexpr vector_size_t size = 1'024 * 3;
153157
constexpr vector_size_t resultSize = 10;
154158
// For this test, it is important to create a single-row partition.

0 commit comments

Comments
 (0)