fix(accumulators): preserve state in evaluate() for window frame queries

GaneshPatil7517 · GaneshPatil7517 · commit 6954497600e4 · 2026-01-03T13:57:36.000+05:30
This commit fixes issue #19612 where accumulators that don't implement retract_batch exhibit buggy behavior in window frame queries. ## Problem When aggregate functions are used with window frames like `ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW`, DataFusion uses PlainAggregateWindowExpr which calls evaluate() multiple times on the same accumulator instance. Accumulators that use std::mem::take() in their evaluate() method consume their internal state, causing incorrect results on subsequent calls. ## Solution 1. **percentile_cont**: Modified evaluate() to use mutable reference instead of consuming the Vec. Added retract_batch() support for both PercentileContAccumulator and DistinctPercentileContAccumulator. 2. **string_agg**: Changed SimpleStringAggAccumulator::evaluate() to clone the accumulated string instead of taking it. ## Changes - datafusion/functions-aggregate/src/percentile_cont.rs: - Changed calculate_percentile() to take &mut [T::Native] instead of Vec<T::Native> - Updated PercentileContAccumulator::evaluate() to pass reference - Updated DistinctPercentileContAccumulator::evaluate() to clone values - Added retract_batch() implementation using HashMap for efficient removal - Updated PercentileContGroupsAccumulator::evaluate() for consistency - datafusion/functions-aggregate/src/string_agg.rs: - Changed evaluate() to use clone() instead of std::mem::take() - datafusion/sqllogictest/test_files/aggregate.slt: - Added test cases for percentile_cont with window frames - Added test comparing median() vs percentile_cont(0.5) behavior - Added test for string_agg cumulative window frame - docs/source/library-user-guide/functions/adding-udfs.md: - Added documentation about window-compatible accumulators - Explained evaluate() state preservation requirements - Documented retract_batch() implementation guidance Closes #19612
diff --git a/datafusion/functions-aggregate/src/percentile_cont.rs b/datafusion/functions-aggregate/src/percentile_cont.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::collections::HashMap;
 use std::fmt::{Debug, Formatter};
 use std::mem::{size_of, size_of_val};
 use std::sync::Arc;
@@ -52,7 +53,7 @@ use datafusion_expr::{
 };
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::accumulate;
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls::filtered_null_mask;
-use datafusion_functions_aggregate_common::utils::GenericDistinctBuffer;
+use datafusion_functions_aggregate_common::utils::{GenericDistinctBuffer, Hashable};
 use datafusion_macros::user_doc;
 
 use crate::utils::validate_percentile_expr;
@@ -533,14 +534,57 @@ impl<T: ArrowNumericType> Accumulator for PercentileContAccumulator<T> {
     }
 
     fn evaluate(&mut self) -> Result<ScalarValue> {
-        let d = std::mem::take(&mut self.all_values);
-        let value = calculate_percentile::<T>(d, self.percentile);
+        let value = calculate_percentile::<T>(&mut self.all_values, self.percentile);
         ScalarValue::new_primitive::<T>(value, &self.data_type)
     }
 
     fn size(&self) -> usize {
         size_of_val(self) + self.all_values.capacity() * size_of::<T::Native>()
     }
+
+    fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        // Cast to target type if needed (e.g., integer to Float64)
+        let values = if values[0].data_type() != &self.data_type {
+            arrow::compute::cast(&values[0], &self.data_type)?
+        } else {
+            Arc::clone(&values[0])
+        };
+
+        let mut to_remove: HashMap<ScalarValue, usize> = HashMap::new();
+        for i in 0..values.len() {
+            let v = ScalarValue::try_from_array(&values, i)?;
+            if !v.is_null() {
+                *to_remove.entry(v).or_default() += 1;
+            }
+        }
+
+        let mut i = 0;
+        while i < self.all_values.len() {
+            let k = ScalarValue::new_primitive::<T>(
+                Some(self.all_values[i]),
+                &self.data_type,
+            )?;
+            if let Some(count) = to_remove.get_mut(&k)
+                && *count > 0
+            {
+                self.all_values.swap_remove(i);
+                *count -= 1;
+                if *count == 0 {
+                    to_remove.remove(&k);
+                    if to_remove.is_empty() {
+                        break;
+                    }
+                }
+            } else {
+                i += 1;
+            }
+        }
+        Ok(())
+    }
+
+    fn supports_retract_batch(&self) -> bool {
+        true
+    }
 }
 
 /// The percentile_cont groups accumulator accumulates the raw input values
@@ -665,13 +709,13 @@ impl<T: ArrowNumericType + Send> GroupsAccumulator
 
     fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef> {
         // Emit values
-        let emit_group_values = emit_to.take_needed(&mut self.group_values);
+        let mut emit_group_values = emit_to.take_needed(&mut self.group_values);
 
         // Calculate percentile for each group
         let mut evaluate_result_builder =
             PrimitiveBuilder::<T>::new().with_data_type(self.data_type.clone());
-        for values in emit_group_values {
-            let value = calculate_percentile::<T>(values, self.percentile);
+        for values in &mut emit_group_values {
+            let value = calculate_percentile::<T>(values.as_mut_slice(), self.percentile);
             evaluate_result_builder.append_option(value);
         }
 
@@ -768,17 +812,35 @@ impl<T: ArrowNumericType + Debug> Accumulator for DistinctPercentileContAccumula
     }
 
     fn evaluate(&mut self) -> Result<ScalarValue> {
-        let d = std::mem::take(&mut self.distinct_values.values)
-            .into_iter()
+        let mut values: Vec<T::Native> = self
+            .distinct_values
+            .values
+            .iter()
             .map(|v| v.0)
-            .collect::<Vec<_>>();
-        let value = calculate_percentile::<T>(d, self.percentile);
+            .collect();
+        let value = calculate_percentile::<T>(&mut values, self.percentile);
         ScalarValue::new_primitive::<T>(value, &self.data_type)
     }
 
     fn size(&self) -> usize {
         size_of_val(self) + self.distinct_values.size()
     }
+
+    /// Reports that this accumulator cannot retract values.
+    ///
+    /// The state is a set of distinct values with no per-value occurrence
+    /// count. When one of several equal rows leaves a sliding frame, removing
+    /// the value from the set would also drop it for the copies that are still
+    /// inside the frame and yield a wrong percentile. Retraction is therefore
+    /// left unsupported and the trait's default `retract_batch` stays in effect.
+    fn supports_retract_batch(&self) -> bool {
+        false
+    }
 }
 
 /// Calculate the percentile value for a given set of values.
@@ -788,8 +850,12 @@ impl<T: ArrowNumericType + Debug> Accumulator for DistinctPercentileContAccumula
 /// For percentile p and n values:
 /// - If p * (n-1) is an integer, return the value at that position
 /// - Otherwise, interpolate between the two closest values
+///
+/// Note: This function takes a mutable slice and sorts it in place, but does not
+/// consume the data. This is important for window frame queries where evaluate()
+/// may be called multiple times on the same accumulator state.
 fn calculate_percentile<T: ArrowNumericType>(
-    mut values: Vec<T::Native>,
+    values: &mut [T::Native],
     percentile: f64,
 ) -> Option<T::Native> {
     let cmp = |x: &T::Native, y: &T::Native| x.compare(*y);
diff --git a/datafusion/functions-aggregate/src/string_agg.rs b/datafusion/functions-aggregate/src/string_agg.rs
@@ -384,14 +384,11 @@ impl Accumulator for SimpleStringAggAccumulator {
     }
 
     fn evaluate(&mut self) -> Result<ScalarValue> {
-        let result = if self.has_value {
-            ScalarValue::LargeUtf8(Some(std::mem::take(&mut self.accumulated_string)))
+        if self.has_value {
+            Ok(ScalarValue::LargeUtf8(Some(self.accumulated_string.clone())))
         } else {
-            ScalarValue::LargeUtf8(None)
-        };
-
-        self.has_value = false;
-        Ok(result)
+            Ok(ScalarValue::LargeUtf8(None))
+        }
     }
 
     fn size(&self) -> usize {
diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt
@@ -8241,3 +8241,137 @@ NULL NULL NULL NULL
 
 statement ok
 drop table distinct_avg;
+
+###########
+# Issue #19612: Test that percentile_cont and median produce identical results
+# in window frame queries with ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW.
+# Previously percentile_cont consumed its internal state during evaluate(),
+# causing incorrect results when called multiple times in window queries.
+###########
+
+# Test percentile_cont window frame behavior (fix for issue #19612)
+statement ok
+CREATE TABLE percentile_window_test (
+    timestamp INT,
+    tags VARCHAR,
+    value DOUBLE
+);
+
+statement ok
+INSERT INTO percentile_window_test (timestamp, tags, value) VALUES
+(1, 'tag1', 10.0),
+(2, 'tag1', 20.0),
+(3, 'tag1', 30.0),
+(4, 'tag1', 40.0),
+(5, 'tag1', 50.0),
+(1, 'tag2', 60.0),
+(2, 'tag2', 70.0),
+(3, 'tag2', 80.0),
+(4, 'tag2', 90.0),
+(5, 'tag2', 100.0);
+
+# Test that median and percentile_cont(0.5) produce the same results
+# with ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW frame.
+# Both functions should maintain state correctly across multiple evaluate() calls.
+query ITRRR
+SELECT
+    timestamp,
+    tags,
+    value,
+    median(value) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+    ) AS value_median,
+    percentile_cont(value, 0.5) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+    ) AS value_percentile_50
+FROM percentile_window_test
+ORDER BY tags, timestamp;
+----
+1 tag1 10 10 10
+2 tag1 20 15 15
+3 tag1 30 20 20
+4 tag1 40 25 25
+5 tag1 50 30 30
+1 tag2 60 60 60
+2 tag2 70 65 65
+3 tag2 80 70 70
+4 tag2 90 75 75
+5 tag2 100 80 80
+
+# Test percentile_cont with different percentile values
+query ITRRR
+SELECT
+    timestamp,
+    tags,
+    value,
+    percentile_cont(value, 0.25) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+    ) AS p25,
+    percentile_cont(value, 0.75) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+    ) AS p75
+FROM percentile_window_test
+ORDER BY tags, timestamp;
+----
+1 tag1 10 10 10
+2 tag1 20 12.5 17.5
+3 tag1 30 15 25
+4 tag1 40 17.5 32.5
+5 tag1 50 20 40
+1 tag2 60 60 60
+2 tag2 70 62.5 67.5
+3 tag2 80 65 75
+4 tag2 90 67.5 82.5
+5 tag2 100 70 90
+
+statement ok
+DROP TABLE percentile_window_test;
+
+# Test string_agg window frame behavior (fix for issue #19612)
+statement ok
+CREATE TABLE string_agg_window_test (
+    id INT,
+    grp VARCHAR,
+    val VARCHAR
+);
+
+statement ok
+INSERT INTO string_agg_window_test (id, grp, val) VALUES
+(1, 'A', 'a'),
+(2, 'A', 'b'),
+(3, 'A', 'c'),
+(1, 'B', 'x'),
+(2, 'B', 'y'),
+(3, 'B', 'z');
+
+# Test string_agg with ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+# The function should maintain state correctly across multiple evaluate() calls
+query ITT
+SELECT
+    id,
+    grp,
+    string_agg(val, ',') OVER (
+        PARTITION BY grp
+        ORDER BY id
+        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+    ) AS cumulative_string
+FROM string_agg_window_test
+ORDER BY grp, id;
+----
+1 A a
+2 A a,b
+3 A a,b,c
+1 B x
+2 B x,y
+3 B x,y,z
+
+statement ok
+DROP TABLE string_agg_window_test;
diff --git a/docs/source/library-user-guide/functions/adding-udfs.md b/docs/source/library-user-guide/functions/adding-udfs.md
@@ -1350,6 +1350,71 @@ async fn main() -> Result<()> {
 [`create_udaf`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/fn.create_udaf.html
 [`advanced_udaf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udaf.rs
 
+### Window Frame Compatible Accumulators
+
+When an aggregate function is used as a window function with a frame that grows from the start of the partition (e.g., `ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW`),
+DataFusion may call `evaluate()` multiple times on the same accumulator instance, once for the frame of each row in the partition.
+This has important implications for how you implement your accumulator:
+
+#### The `evaluate()` Method Must Not Consume State
+
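+The `evaluate()` method should return the current aggregate value **without consuming or discarding the accumulator's internal state**. Reordering the state in place (for example, sorting a buffer of values) is fine, as long as later `update_batch()` and `evaluate()` calls still see every value accumulated so far.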
+This is critical because:
+
+1. **Multiple evaluations**: For window queries, `evaluate()` is called once per row in the partition
+2. **State preservation**: The internal state must remain intact for subsequent `evaluate()` calls
+
+**Incorrect implementation** (consumes state):
+
+```rust
+fn evaluate(&mut self) -> Result<ScalarValue> {
+    // BAD: std::mem::take() consumes the values, leaving an empty Vec
+    let values = std::mem::take(&mut self.values);
+    // After this call, self.values is empty and subsequent
+    // evaluate() calls will return incorrect results
+    calculate_result(values)
+}
+```
+
+**Correct implementation** (preserves state):
+
+```rust
+fn evaluate(&mut self) -> Result<ScalarValue> {
+    // GOOD: Use a reference or clone to preserve state
+    calculate_result(&mut self.values)
+    // Or: calculate_result(self.values.clone())
+}
+```
+
+#### Implementing `retract_batch` for Sliding Windows
+
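+To use an accumulator with a sliding window frame, i.e. one whose start moves forward as the frame advances (e.g., `ROWS BETWEEN 2 PRECEDING AND CURRENT ROW`), implement the `retract_batch` method.
+This allows DataFusion to remove values that have "left" the window frame instead of recalculating from scratch: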
+
+```rust
+impl Accumulator for MyAccumulator {
+    /// Removes rows that have left the window frame; the inverse of `update_batch`.
+    fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        // `ArrayRef` is a type-erased `Arc<dyn Array>` with no `iter()` of its own,
+        // so view it as the concrete array type first (here `Float64Array`;
+        // `as_primitive` comes from the `arrow::array::AsArray` trait).
+        let array = values[0].as_primitive::<Float64Type>();
+        // `flatten()` skips null entries
+        for value in array.iter().flatten() {
+            self.remove_value(value);
+        }
+        Ok(())
+    }
+
+    /// Tells DataFusion that `retract_batch` is implemented.
+    fn supports_retract_batch(&self) -> bool {
+        true  // Enable this optimization
+    }
+}
+```
+
+For frames that start at `UNBOUNDED PRECEDING`, such as the example at the top of this section, no values ever
+leave the frame, so DataFusion uses `PlainAggregateWindowExpr`, which does not call `retract_batch` and instead
+calls `evaluate()` multiple times on the same accumulator as rows are added. It is therefore **essential** that
+your `evaluate()` method does not consume the accumulator's state, whether or not `supports_retract_batch()` returns `true`.
+
+See [issue #19612](https://github.com/apache/datafusion/issues/19612) for more details on this behavior.
+
 ## Adding a Table UDF
 
 A User-Defined Table Function (UDTF) is a function that takes parameters and returns a `TableProvider`.