Skip to content

Commit c386ba0

Browse files
committed
docs(aggregate): clarify AccumulatorArgs schema handling and usage
- Improve documentation for `AccumulatorArgs.schema` and `exprs`:
  - Add an example showing how to retrieve field metadata and the return field.
  - Explain synthesized schema behavior for literal-only inputs.
  - Clarify precedence when inputs are mixed (physical schema metadata wins; synthesized metadata is used only when the physical schema is empty).
- Update `AggregateFunctionExpr::args_schema` docs:
  - Explain field order guarantees, synthesized schema usage, and that `std::borrow::Cow` is used to avoid allocations when possible.
- Add a TODO to factor `AccumulatorArgs` construction into a private helper.

Documentation-only changes; no behavioral changes.
1 parent fea8c1c commit c386ba0

File tree

3 files changed

+38
-12
lines changed

3 files changed

+38
-12
lines changed

datafusion/expr/src/udaf.rs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -752,7 +752,19 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
752752
///
753753
/// acc_args: [`AccumulatorArgs`] contains information about how the
754754
/// aggregate function was called. Use `acc_args.exprs` together with
755-
/// `acc_args.schema` to inspect the [`FieldRef`] of each input. When an
755+
/// `acc_args.schema` to inspect the [`FieldRef`] of each input.
756+
///
757+
/// Example: retrieving metadata and return field for input `i`:
758+
/// ```rust
759+
/// let metadata = acc_args.schema.field(i).metadata();
760+
/// let field = acc_args.exprs[i].return_field(&acc_args.schema)?;
761+
/// ```
762+
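/// Multi-argument functions: when the schema is synthesized from literals,
/// `exprs[i]` corresponds to `schema.field(i)`. With a physical input schema,
/// prefer `exprs[i].return_field(&acc_args.schema)` rather than relying on
/// positional correspondence (NOTE(review): confirm against callers).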
763+
/// Mixed inputs (columns and literals): the physical input schema is used
764+
/// when not empty; `acc_args.schema` is only synthesized from literals when
765+
/// the physical schema is empty.
766+
///
767+
/// When an
756768
/// aggregate is invoked with literal values only, `acc_args.schema` is
757769
/// synthesized from those literals so that any field metadata (for
758770
/// example Arrow extension types) is available to the accumulator

datafusion/functions-aggregate-common/src/accumulator.rs

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,12 @@ pub struct AccumulatorArgs<'a> {
3232

3333
/// The schema of the input arguments.
3434
///
35-
/// This schema contains the fields corresponding to [`Self::exprs`]. When
36-
/// an aggregate is invoked with only literal values, this schema is
37-
/// synthesized from those literals so that field-level metadata (such as
38-
/// Arrow extension types) remains available to accumulator
39-
/// implementations.
35+
/// This schema contains the fields corresponding to the function’s input
36+
/// expressions (`exprs`). When an aggregate is invoked with only literal
37+
/// values, this schema is synthesized from those literals to preserve
38+
/// field-level metadata (such as Arrow extension types). In mixed column
39+
/// and literal inputs, metadata from the physical schema takes precedence;
40+
/// synthesized metadata is only used when the physical schema is empty.
4041
pub schema: &'a Schema,
4142

4243
/// Whether to ignore nulls.
@@ -74,6 +75,12 @@ pub struct AccumulatorArgs<'a> {
7475
/// The physical expressions for the aggregate function's arguments.
7576
/// Use these expressions together with [`Self::schema`] to obtain the
7677
/// [`FieldRef`] of each input via `expr.return_field(schema)`.
78+
///
79+
/// Example:
80+
/// ```rust,ignore
81+
/// let input_field = exprs[i].return_field(&schema)?;
82+
/// ```
83+
/// Note: physical schema metadata takes precedence in mixed inputs.
7784
pub exprs: &'a [Arc<dyn PhysicalExpr>],
7885
}
7986

datafusion/physical-expr/src/aggregate.rs

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -378,13 +378,19 @@ impl AggregateFunctionExpr {
378378
}
379379

380380
/// Returns a schema containing the fields corresponding to this
381-
/// aggregate's input expressions.
381+
/// aggregate's input expressions in the same order as `input_fields`/`exprs`.
382382
///
383-
/// When an aggregate is invoked with only literal values, the
384-
/// physical input schema is empty. In this case a schema is
385-
/// synthesized from the literal expressions so that field metadata
386-
/// (such as Arrow extension types) remains available to the
387-
/// [`Accumulator`].
383+
/// If the physical input schema is empty (literal-only inputs),
384+
/// synthesizes a new schema from the literal expressions to preserve
385+
/// field-level metadata (such as Arrow extension types).
386+
/// Field order is guaranteed to match the order of input expressions.
387+
/// In mixed column and literal inputs, existing physical schema fields
388+
/// win; synthesized metadata is only applied when the physical schema
389+
/// has no fields.
390+
///
391+
/// Uses [`std::borrow::Cow`] to avoid allocation when the existing
392+
/// schema is non-empty. For micro-optimizations, implementers may
393+
/// cache the owned schema if multiple calls are made per instance.
388394
fn args_schema(&self) -> Cow<'_, Schema> {
389395
if self.schema.fields().is_empty() {
390396
Cow::Owned(Schema::new(
@@ -401,6 +407,7 @@ impl AggregateFunctionExpr {
401407
/// the accumulator used to accumulate values from the expressions.
402408
/// the accumulator expects the same number of arguments as `expressions` and must
403409
/// return states with the same description as `state_fields`
410+
// TODO: factor AccumulatorArgs construction into a private helper to avoid duplication
404411
pub fn create_accumulator(&self) -> Result<Box<dyn Accumulator>> {
405412
let schema = self.args_schema();
406413
let acc_args = AccumulatorArgs {

0 commit comments

Comments
 (0)