@@ -4,7 +4,7 @@ use crate::config_extension_ext::{
44} ;
55use crate :: distributed_planner:: set_distributed_task_estimator;
66use crate :: protobuf:: { set_distributed_user_codec, set_distributed_user_codec_arc} ;
7- use crate :: { ChannelResolver , DistributedPhysicalOptimizerRule , TaskEstimator } ;
7+ use crate :: { ChannelResolver , DistributedConfig , DistributedPhysicalOptimizerRule , TaskEstimator } ;
88use datafusion:: common:: DataFusionError ;
99use datafusion:: config:: ConfigExtension ;
1010use datafusion:: execution:: SessionStateBuilder ;
@@ -250,6 +250,45 @@ pub trait DistributedExt: Sized {
250250 & mut self ,
251251 estimator : T ,
252252 ) ;
253+
254+ /// Sets the maximum number of files each task in a stage with a FileScanConfig node will
255+ /// handle. Reducing this number will increase the number of tasks. By default, this
256+ /// is close to the number of cores in the machine.
257+ fn with_distributed_files_per_task (
258+ self ,
259+ files_per_task : usize ,
260+ ) -> Result < Self , DataFusionError > ;
261+
262+ /// Same as [DistributedExt::with_distributed_files_per_task] but with an in-place mutation.
263+ fn set_distributed_files_per_task (
264+ & mut self ,
265+ files_per_task : usize ,
266+ ) -> Result < ( ) , DataFusionError > ;
267+
268+ /// The number of tasks in each stage is calculated in a bottom-to-top fashion.
269+ ///
270+ /// Bottom stages containing leaf nodes will provide an estimate of the number of tasks
271+ /// for those stages, but upper stages might see a reduction (or increase) in the number
272+ /// of tasks based on the cardinality effect bottom stages have in the data.
273+ ///
274+ /// For example: If there are two stages, and the leaf stage is estimated to use 10 tasks,
275+ /// the upper stage might use fewer (e.g. 5) if it sees that the leaf stage is returning
276+ /// less data because of filters or aggregations.
277+ ///
278+ /// This function sets the scale factor applied when encountering nodes that change the
279+ /// cardinality of the data. For example, if a stage with 10 tasks contains an AggregateExec
280+ /// node, and the scale factor is 2.0, the following stage will use 10 / 2.0 = 5 tasks.
281+ fn with_distributed_cardinality_effect_task_scale_factor (
282+ self ,
283+ factor : f64 ,
284+ ) -> Result < Self , DataFusionError > ;
285+
286+ /// Same as [DistributedExt::with_distributed_cardinality_effect_task_scale_factor] but with
287+ /// an in-place mutation.
288+ fn set_distributed_cardinality_effect_task_scale_factor (
289+ & mut self ,
290+ factor : f64 ,
291+ ) -> Result < ( ) , DataFusionError > ;
253292}
254293
255294impl DistributedExt for SessionStateBuilder {
@@ -295,6 +334,26 @@ impl DistributedExt for SessionStateBuilder {
295334 set_distributed_task_estimator ( self . config ( ) . get_or_insert_default ( ) , estimator)
296335 }
297336
337+ fn set_distributed_files_per_task (
338+ & mut self ,
339+ files_per_task : usize ,
340+ ) -> Result < ( ) , DataFusionError > {
341+ let cfg = self . config ( ) . get_or_insert_default ( ) ;
342+ let d_cfg = DistributedConfig :: from_config_options_mut ( cfg. options_mut ( ) ) ?;
343+ d_cfg. files_per_task = files_per_task;
344+ Ok ( ( ) )
345+ }
346+
347+ fn set_distributed_cardinality_effect_task_scale_factor (
348+ & mut self ,
349+ factor : f64 ,
350+ ) -> Result < ( ) , DataFusionError > {
351+ let cfg = self . config ( ) . get_or_insert_default ( ) ;
352+ let d_cfg = DistributedConfig :: from_config_options_mut ( cfg. options_mut ( ) ) ?;
353+ d_cfg. cardinality_task_count_factor = factor;
354+ Ok ( ( ) )
355+ }
356+
298357 delegate ! {
299358 to self {
300359 #[ call( set_distributed_option_extension) ]
@@ -320,6 +379,14 @@ impl DistributedExt for SessionStateBuilder {
320379 #[ call( set_distributed_task_estimator) ]
321380 #[ expr( $; self ) ]
322381 fn with_distributed_task_estimator<T : TaskEstimator + Send + Sync + ' static >( mut self , estimator: T ) -> Self ;
382+
383+ #[ call( set_distributed_files_per_task) ]
384+ #[ expr( $?; Ok ( self ) ) ]
385+ fn with_distributed_files_per_task( mut self , files_per_task: usize ) -> Result <Self , DataFusionError >;
386+
387+ #[ call( set_distributed_cardinality_effect_task_scale_factor) ]
388+ #[ expr( $?; Ok ( self ) ) ]
389+ fn with_distributed_cardinality_effect_task_scale_factor( mut self , factor: f64 ) -> Result <Self , DataFusionError >;
323390 }
324391 }
325392}
0 commit comments