@@ -84,6 +84,9 @@ async fn main() -> Result<()> {
8484 // See how to analyze boundaries in different kinds of expressions.
8585 boundary_analysis_and_selectivity_demo ( ) ?;
8686
87+ // See how boundary analysis works for `AND` & `OR` conjunctions.
88+ boundary_analysis_in_conjuctions_demo ( ) ?;
89+
8790 // See how to determine the data types of expressions
8891 expression_type_demo ( ) ?;
8992
@@ -279,15 +282,15 @@ fn range_analysis_demo() -> Result<()> {
279282 Ok ( ( ) )
280283}
281284
282- // DataFusion's analysis can infer boundary statistics and selectivity in
283- // various situations which can be helpful in building more efficient
284- // query plans.
285+ /// DataFusion's analysis can infer boundary statistics and selectivity in
286+ /// various situations which can be helpful in building more efficient
287+ /// query plans.
285288fn boundary_analysis_and_selectivity_demo ( ) -> Result < ( ) > {
286289 // Consider the example where we want all rows with an `id` greater than
287290 // 5000.
288291 let id_greater_5000 = col ( "id" ) . gt_eq ( lit ( 5000i64 ) ) ;
289292
290- // As in most examples we must tell DaataFusion the type of the column.
293+ // As in most examples we must tell DataFusion the type of the column.
291294 let schema = Arc :: new ( Schema :: new ( vec ! [ make_field( "id" , DataType :: Int64 ) ] ) ) ;
292295
293296 // DataFusion is able to do cardinality estimation on various column types
@@ -312,10 +315,10 @@ fn boundary_analysis_and_selectivity_demo() -> Result<()> {
312315 let df_schema = DFSchema :: try_from ( schema. clone ( ) ) ?;
313316
314317 // Analysis case id >= 5000
315- let physical_expr1 =
318+ let physical_expr =
316319 SessionContext :: new ( ) . create_physical_expr ( id_greater_5000, & df_schema) ?;
317320 let analysis = analyze (
318- & physical_expr1 ,
321+ & physical_expr ,
319322 AnalysisContext :: new ( initial_boundaries. clone ( ) ) ,
320323 df_schema. as_ref ( ) ,
321324 ) ?;
@@ -347,14 +350,112 @@ fn boundary_analysis_and_selectivity_demo() -> Result<()> {
347350 Ok ( ( ) )
348351}
349352
350- fn make_field ( name : & str , data_type : DataType ) -> Field {
351- let nullable = false ;
352- Field :: new ( name, data_type, nullable)
353- }
353+ /// This function shows how to think about and leverage the analysis API
354+ /// to infer boundaries in `AND` & `OR` conjunctions.
355+ fn boundary_analysis_in_conjuctions_demo ( ) -> Result < ( ) > {
356+ // Let us consider the more common case of AND & OR conjunctions.
357+ //
358+ // age > 18 AND age <= 25
359+ let age_between_18_25 = col ( "age" ) . gt ( lit ( 18i64 ) ) . and ( col ( "age" ) . lt_eq ( lit ( 25 ) ) ) ;
354360
355- fn make_ts_field ( name : & str ) -> Field {
356- let tz = None ;
357- make_field ( name, DataType :: Timestamp ( TimeUnit :: Nanosecond , tz) )
361+ // As always we need to tell DataFusion the type of the column.
362+ let schema = Arc :: new ( Schema :: new ( vec ! [ make_field( "age" , DataType :: Int64 ) ] ) ) ;
363+
364+ // Similarly to the example in `boundary_analysis_and_selectivity_demo` we
365+ // can establish column statistics that can be used to describe certain
366+ // column properties.
367+ let column_stats = ColumnStatistics {
368+ null_count : Precision :: Exact ( 0 ) ,
369+ max_value : Precision :: Exact ( ScalarValue :: Int64 ( Some ( 79 ) ) ) ,
370+ min_value : Precision :: Exact ( ScalarValue :: Int64 ( Some ( 14 ) ) ) ,
371+ sum_value : Precision :: Absent ,
372+ distinct_count : Precision :: Absent ,
373+ } ;
374+
375+ let initial_boundaries =
376+ vec ! [ ExprBoundaries :: try_from_column( & schema, & column_stats, 0 ) ?] ;
377+
378+ // Before we run the analysis pass; let us describe what we can infer from
379+ // the initial information.
380+ //
381+ // To recap, the expression is `age > 18 AND age <= 25`.
382+ //
383+ // The column `age` can take any value in the `Int64` range.
384+ //
385+ // But using the `min`, `max` statistics we can reduce that initial range
386+ // to `[min_value, max_value]` which is [14, 79].
387+ //
388+ // During analysis, when evaluating, let's say the left-hand side of the `AND`
389+ // expression, we know that `age` must be greater than 18. Therefore our range
390+ // is now [19, 79].
391+ // And by evaluating the right-hand side we can get an upper bound, allowing
392+ // us to infer that `age` must be in the range [19, 25] inclusive.
393+ let df_schema = DFSchema :: try_from ( schema. clone ( ) ) ?;
394+
395+ let physical_expr =
396+ SessionContext :: new ( ) . create_physical_expr ( age_between_18_25, & df_schema) ?;
397+ let analysis = analyze (
398+ & physical_expr,
399+ // We re-use initial_boundaries elsewhere so we must clone it.
400+ AnalysisContext :: new ( initial_boundaries. clone ( ) ) ,
401+ df_schema. as_ref ( ) ,
402+ ) ?;
403+
404+ // We can check that DataFusion's analysis inferred the same bounds.
405+ assert_eq ! (
406+ analysis. boundaries. first( ) . map( |boundary| boundary
407+ . interval
408+ . clone( )
409+ . unwrap( )
410+ . into_bounds( ) ) ,
411+ Some ( ( ScalarValue :: Int64 ( Some ( 19 ) ) , ScalarValue :: Int64 ( Some ( 25 ) ) ) )
412+ ) ;
413+
414+ // We can also infer the selectivity using the same approach as before.
415+ //
416+ // Granted a column such as age will more likely follow a Normal distribution
417+ // as such our selectivity estimation will not be as good as it can.
418+ assert ! ( analysis
419+ . selectivity
420+ . is_some_and( |selectivity| ( 0.1 ..=0.2 ) . contains( & selectivity) ) ) ;
421+
422+ // The above example was a good way to look at how we can derive better
423+ // interval and get a lower selectivity during boundary analysis.
424+ //
425+ // But `AND` conjunctions are easier to reason with because their interval
426+ // arithmetic follows naturally from set intersection operations, let us
427+ // now look at an example that is a tad more complicated `OR` conjunctions.
428+
429+ // The expression we will look at is `age > 60 OR age <= 18`.
430+ let age_greater_than_60_less_than_18 =
431+ col ( "age" ) . gt ( lit ( 64i64 ) ) . or ( col ( "age" ) . lt_eq ( lit ( 18i64 ) ) ) ;
432+
433+ // We can re-use the same schema, initial boundaries and column statistics
434+ // described above. So let's think about this for a bit.
435+ //
436+ // Initial range: [14, 79] as described in our column statistics.
437+ //
438+ // From the left-hand side and right-hand side of our `OR` conjunctions
439+ // we end up with two ranges, instead of just one.
440+ //
441+ // - age > 60: [61, 79]
442+ // - age <= 18: [14, 18]
443+ //
444+ // Thus the range of possible values the `age` column might take is a
445+ // union of both sets [14, 18] U [61, 79].
446+ let physical_expr = SessionContext :: new ( )
447+ . create_physical_expr ( age_greater_than_60_less_than_18, & df_schema) ?;
448+
449+ // Since we don't handle interval arithmetic for `OR` operator this will error out.
450+ let analysis = analyze (
451+ & physical_expr,
452+ AnalysisContext :: new ( initial_boundaries) ,
453+ df_schema. as_ref ( ) ,
454+ ) ;
455+
456+ assert ! ( analysis. is_err( ) ) ;
457+
458+ Ok ( ( ) )
358459}
359460
360461/// This function shows how to use `Expr::get_type` to retrieve the DataType
@@ -494,3 +595,13 @@ fn type_coercion_demo() -> Result<()> {
494595
495596 Ok ( ( ) )
496597}
598+
599+ fn make_field ( name : & str , data_type : DataType ) -> Field {
600+ let nullable = false ;
601+ Field :: new ( name, data_type, nullable)
602+ }
603+
604+ fn make_ts_field ( name : & str ) -> Field {
605+ let tz = None ;
606+ make_field ( name, DataType :: Timestamp ( TimeUnit :: Nanosecond , tz) )
607+ }
0 commit comments