@@ -39,56 +39,74 @@ def df():
     )
     return ctx.create_dataframe([[batch]])
 
+
 @pytest.fixture
 def df_aggregate_100():
     ctx = SessionContext()
     ctx.register_csv("aggregate_test_data", "./testing/data/csv/aggregate_test_100.csv")
     return ctx.table("aggregate_test_data")
 
 
-@pytest.mark.parametrize("agg_expr, calc_expected", [
-    (f.avg(column("a")), lambda a, b, c, d: np.array(np.average(a))),
-    (f.corr(column("a"), column("b")), lambda a, b, c, d: np.array(np.corrcoef(a, b)[0][1])),
-    (f.count(column("a")), lambda a, b, c, d: pa.array([len(a)])),
-    # Sample (co)variance -> ddof=1
-    # Population (co)variance -> ddof=0
-    (f.covar(column("a"), column("b")), lambda a, b, c, d: np.array(np.cov(a, b, ddof=1)[0][1])),
-    (f.covar_pop(column("a"), column("c")), lambda a, b, c, d: np.array(np.cov(a, c, ddof=0)[0][1])),
-    (f.covar_samp(column("b"), column("c")), lambda a, b, c, d: np.array(np.cov(b, c, ddof=1)[0][1])),
-    # f.grouping(col_a),  # No physical plan implemented yet
-    (f.max(column("a")), lambda a, b, c, d: np.array(np.max(a))),
-    (f.mean(column("b")), lambda a, b, c, d: np.array(np.mean(b))),
-    (f.median(column("b")), lambda a, b, c, d: np.array(np.median(b))),
-    (f.min(column("a")), lambda a, b, c, d: np.array(np.min(a))),
-    (f.sum(column("b")), lambda a, b, c, d: np.array(np.sum(b.to_pylist()))),
-    # Sample stdev -> ddof=1
-    # Population stdev -> ddof=0
-    (f.stddev(column("a")), lambda a, b, c, d: np.array(np.std(a, ddof=1))),
-    (f.stddev_pop(column("b")), lambda a, b, c, d: np.array(np.std(b, ddof=0))),
-    (f.stddev_samp(column("c")), lambda a, b, c, d: np.array(np.std(c, ddof=1))),
-    (f.var(column("a")), lambda a, b, c, d: np.array(np.var(a, ddof=1))),
-    (f.var_pop(column("b")), lambda a, b, c, d: np.array(np.var(b, ddof=0))),
-    (f.var_samp(column("c")), lambda a, b, c, d: np.array(np.var(c, ddof=1))),
-])
+@pytest.mark.parametrize(
+    "agg_expr, calc_expected",
+    [
+        (f.avg(column("a")), lambda a, b, c, d: np.array(np.average(a))),
+        (
+            f.corr(column("a"), column("b")),
+            lambda a, b, c, d: np.array(np.corrcoef(a, b)[0][1]),
+        ),
+        (f.count(column("a")), lambda a, b, c, d: pa.array([len(a)])),
+        # Sample (co)variance -> ddof=1
+        # Population (co)variance -> ddof=0
+        (
+            f.covar(column("a"), column("b")),
+            lambda a, b, c, d: np.array(np.cov(a, b, ddof=1)[0][1]),
+        ),
+        (
+            f.covar_pop(column("a"), column("c")),
+            lambda a, b, c, d: np.array(np.cov(a, c, ddof=0)[0][1]),
+        ),
+        (
+            f.covar_samp(column("b"), column("c")),
+            lambda a, b, c, d: np.array(np.cov(b, c, ddof=1)[0][1]),
+        ),
+        # f.grouping(col_a),  # No physical plan implemented yet
+        (f.max(column("a")), lambda a, b, c, d: np.array(np.max(a))),
+        (f.mean(column("b")), lambda a, b, c, d: np.array(np.mean(b))),
+        (f.median(column("b")), lambda a, b, c, d: np.array(np.median(b))),
+        (f.min(column("a")), lambda a, b, c, d: np.array(np.min(a))),
+        (f.sum(column("b")), lambda a, b, c, d: np.array(np.sum(b.to_pylist()))),
+        # Sample stdev -> ddof=1
+        # Population stdev -> ddof=0
+        (f.stddev(column("a")), lambda a, b, c, d: np.array(np.std(a, ddof=1))),
+        (f.stddev_pop(column("b")), lambda a, b, c, d: np.array(np.std(b, ddof=0))),
+        (f.stddev_samp(column("c")), lambda a, b, c, d: np.array(np.std(c, ddof=1))),
+        (f.var(column("a")), lambda a, b, c, d: np.array(np.var(a, ddof=1))),
+        (f.var_pop(column("b")), lambda a, b, c, d: np.array(np.var(b, ddof=0))),
+        (f.var_samp(column("c")), lambda a, b, c, d: np.array(np.var(c, ddof=1))),
+    ],
+)
 def test_aggregation_stats(df, agg_expr, calc_expected):
-
     agg_df = df.aggregate([], [agg_expr])
     result = agg_df.collect()[0]
     values_a, values_b, values_c, values_d = df.collect()[0]
     expected = calc_expected(values_a, values_b, values_c, values_d)
     np.testing.assert_array_almost_equal(result.column(0), expected)
 
 
-@pytest.mark.parametrize("agg_expr, expected", [
-    (f.approx_distinct(column("b")), pa.array([2], type=pa.uint64())),
-    (f.approx_median(column("b")), pa.array([4])),
-    (f.approx_percentile_cont(column("b"), lit(0.5)), pa.array([4])),
-    (
-        f.approx_percentile_cont_with_weight(column("b"), lit(0.6), lit(0.5)),
-        pa.array([6], type=pa.float64())
-    ),
-    (f.array_agg(column("b")), pa.array([[4, 4, 6]])),
-])
+@pytest.mark.parametrize(
+    "agg_expr, expected",
+    [
+        (f.approx_distinct(column("b")), pa.array([2], type=pa.uint64())),
+        (f.approx_median(column("b")), pa.array([4])),
+        (f.approx_percentile_cont(column("b"), lit(0.5)), pa.array([4])),
+        (
+            f.approx_percentile_cont_with_weight(column("b"), lit(0.6), lit(0.5)),
+            pa.array([6], type=pa.float64()),
+        ),
+        (f.array_agg(column("b")), pa.array([[4, 4, 6]])),
+    ],
+)
 def test_aggregation(df, agg_expr, expected):
     agg_df = df.aggregate([], [agg_expr])
     result = agg_df.collect()[0]
@@ -98,20 +116,21 @@ def test_aggregation(df, agg_expr, expected):
 def test_aggregate_100(df_aggregate_100):
     # https://github.com/apache/datafusion/blob/bddb6415a50746d2803dd908d19c3758952d74f9/datafusion/sqllogictest/test_files/aggregate.slt#L1490-L1498
 
-    result = df_aggregate_100.aggregate(
-        [
-            column("c1")
-        ],
-        [
-            f.approx_percentile_cont(column("c3"), lit(0.95), lit(200)).alias("c3")
-        ]
-    ).sort(column("c1").sort(ascending=True)).collect()
+    result = (
+        df_aggregate_100.aggregate(
+            [column("c1")],
+            [f.approx_percentile_cont(column("c3"), lit(0.95), lit(200)).alias("c3")],
+        )
+        .sort(column("c1").sort(ascending=True))
+        .collect()
+    )
 
     assert len(result) == 1
     result = result[0]
     assert result.column("c1") == pa.array(["a", "b", "c", "d", "e"])
     assert result.column("c3") == pa.array([73, 68, 122, 124, 115])
 
+
 def test_bit_add_or_xor(df):
     df = df.aggregate(
         [],