@@ -68,6 +68,7 @@ def __init__(self, properties: QueryProperties, **parameters):
6868        self .column_map , self .aggregate_functions  =  build_aggregations (self .aggregates )
6969
7070        self .buffer  =  []
71+         self .max_buffer_size  =  50   # Process in chunks to avoid excessive memory usage 
7172
7273    @property  
7374    def  config (self ):  # pragma: no cover 
@@ -85,18 +86,19 @@ def execute(self, morsel: pyarrow.Table, **kwargs):
8586                yield  EOS 
8687                return 
8788
88-             # merge all the morsels together into one table, selecting only the columns 
89-             # we're pretty sure we're going to use - this will fail for datasets 
90-             # larger than memory 
91-             table  =  pyarrow .concat_tables (
92-                 self .buffer ,
93-                 promote_options = "permissive" ,
94-             )
89+             # If we have partial results in buffer, do final aggregation 
90+             if  len (self .buffer ) >  0 :
91+                 table  =  pyarrow .concat_tables (
92+                     self .buffer ,
93+                     promote_options = "permissive" ,
94+                 )
95+                 table  =  table .combine_chunks ()
96+                 groups  =  table .group_by (self .group_by_columns )
97+                 groups  =  groups .aggregate (self .aggregate_functions )
98+                 self .buffer  =  [groups ]  # Replace buffer with final result 
9599
96-             # do the group by and aggregates 
97-             table  =  table .combine_chunks ()
98-             groups  =  table .group_by (self .group_by_columns )
99-             groups  =  groups .aggregate (self .aggregate_functions )
100+             # Now buffer has the final aggregated result 
101+             groups  =  self .buffer [0 ]
100102
101103            # do the secondary activities for ARRAY_AGG 
102104            for  node  in  get_all_nodes_of_type (self .aggregates , select_nodes = (NodeType .AGGREGATOR ,)):
@@ -135,4 +137,16 @@ def execute(self, morsel: pyarrow.Table, **kwargs):
135137        morsel  =  evaluate_and_append (self .groups , morsel )
136138
137139        self .buffer .append (morsel )
140+ 
141+         # If buffer is full, do partial aggregation 
142+         if  len (self .buffer ) >=  self .max_buffer_size :
143+             table  =  pyarrow .concat_tables (
144+                 self .buffer ,
145+                 promote_options = "permissive" ,
146+             )
147+             table  =  table .combine_chunks ()
148+             groups  =  table .group_by (self .group_by_columns )
149+             groups  =  groups .aggregate (self .aggregate_functions )
150+             self .buffer  =  [groups ]  # Replace buffer with partial result 
151+ 
138152        yield  None 
0 commit comments