group by improvements

joocer · joocer · commit 85989f9caa75 · 2025-10-30T08:02:35.000Z
diff --git a/opteryx/__version__.py b/opteryx/__version__.py
@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY
 
-__build__ = 1712
+__build__ = 1713
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1712"
+__version__ = "0.26.0-beta.1713"
 
 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py
diff --git a/opteryx/operators/aggregate_and_group_node.py b/opteryx/operators/aggregate_and_group_node.py
@@ -68,8 +68,9 @@ def __init__(self, properties: QueryProperties, **parameters):
         self.column_map, self.aggregate_functions = build_aggregations(self.aggregates)
 
         self.buffer = []
-        self.max_buffer_size = 100  # Process in chunks to avoid excessive memory usage
+        self.max_buffer_size = 250  # Buffer size before partial aggregation (kept for future parallelization)
         self._partial_aggregated = False  # Track if we've done a partial aggregation
+        self._disable_partial_agg = False  # Can disable if partial agg isn't helping
 
     @property
     def config(self):  # pragma: no cover
@@ -221,15 +222,26 @@ def execute(self, morsel: pyarrow.Table, **kwargs):
         self.buffer.append(morsel)
 
         # If buffer is full, do partial aggregation
-        if len(self.buffer) >= self.max_buffer_size:
+        # BUT: Skip partial aggregation if it's not reducing data effectively
+        if len(self.buffer) >= self.max_buffer_size and not self._disable_partial_agg:
             table = pyarrow.concat_tables(
                 self.buffer,
                 promote_options="permissive",
             )
 
             groups = table.group_by(self.group_by_columns)
             groups = groups.aggregate(self.aggregate_functions)
-            self.buffer = [groups]  # Replace buffer with partial result
-            self._partial_aggregated = True  # Mark that we've done a partial aggregation
+            
+            # Check if partial aggregation is effective
+            # If we're not reducing the row count significantly, stop doing partial aggs
+            reduction_ratio = groups.num_rows / table.num_rows if table.num_rows > 0 else 1
+            if reduction_ratio > 0.75:  # Kept more than 75% of rows - high cardinality!
+                # Partial aggregation isn't helping, disable it and keep buffering
+                self._disable_partial_agg = True
+                # Don't replace buffer with partial result, keep accumulating
+            else:
+                # Good reduction, keep using partial aggregation
+                self.buffer = [groups]  # Replace buffer with partial result
+                self._partial_aggregated = True  # Mark that we've done a partial aggregation
 
         yield None
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "opteryx"
-version = "0.26.0-beta.1712"
+version = "0.26.0-beta.1713"
 description = "Query your data, where it lives"
 requires-python = '>=3.11'
 readme = {file = "README.md", content-type = "text/markdown"}
diff --git a/tests/performance/clickbench/compare_runs.py b/tests/performance/clickbench/compare_runs.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""
+Compare two ClickBench benchmark runs and show differences.
+
+Usage:
+    python compare_runs.py baseline.txt comparison.txt [--threshold 10]
+    
+Example:
+    python compare_runs.py 8500T@0.25.0b1444.txt 8500T@0.26.0b1710.txt
+    python compare_runs.py old_run.txt new_run.txt --threshold 5
+"""
+
+import argparse
+import re
+import sys
+from pathlib import Path
+
+
+def parse_benchmark(filename):
+    """Parse a benchmark file and return ``(version, times)``; exit(1) if it is missing.
+
+    ``times`` holds one entry per ``[a,b,c]`` result line: the third value (the first if
+    fewer than three), or ``None`` if the line has a null or the value is unparseable."""
+    try:
+        with open(filename, 'r', encoding='utf-8') as f:
+            content = f.read()
+    except FileNotFoundError:
+        print(f"Error: File '{filename}' not found", file=sys.stderr)
+        sys.exit(1)
+
+    # Extract version - the first pattern (in list order) found anywhere wins, so
+    # the "b" form must be tried before the plain form that matches its prefix.
+    version = None
+    for pattern in [
+        r'0\.\d+\.\d+-beta\.\d+',  # 0.25.0-beta.1444
+        r'\d+\.\d+\.\d+b\d+',        # 0.25.0b1444
+        r'v?\d+\.\d+\.\d+',          # v1.0.0 or 1.0.0
+    ]:
+        version_match = re.search(pattern, content)
+        if version_match:
+            version = version_match.group(0)
+            break
+    if not version:
+        version = Path(filename).stem  # Use filename as fallback
+
+    # Extract timing arrays
+    times = []
+    for line in content.split('\n'):
+        match = re.match(r'\[([0-9.,null]+)\]', line)
+        if match:
+            values = match.group(1).split(',')
+            try:
+                if 'null' in values:
+                    times.append(None)
+                else:
+                    times.append(float(values[2]) if len(values) >= 3 else float(values[0]))
+            except (ValueError, IndexError):
+                times.append(None)
+    return version, times
+
+
+def main():
+    """Compare two benchmark files and print a per-query table plus a summary.
+
+    Queries whose timing changed by more than ``--threshold`` percent are listed
+    as regressions (slower) or improvements (faster), largest change first."""
+    parser = argparse.ArgumentParser(
+        description='Compare two ClickBench benchmark runs',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__
+    )
+    parser.add_argument('baseline', help='Baseline benchmark file')
+    parser.add_argument('comparison', help='Comparison benchmark file')
+    parser.add_argument('--threshold', '-t', type=float, default=10.0,
+                       help='Percentage threshold for flagging regressions/improvements (default: 10%%)')
+    parser.add_argument('--top', '-n', type=int, default=10,
+                       help='Number of top regressions/improvements to show (default: 10)')
+    args = parser.parse_args()
+
+    baseline_version, baseline_times = parse_benchmark(args.baseline)
+    comparison_version, comparison_times = parse_benchmark(args.comparison)
+    if not baseline_times or not comparison_times:
+        print("Error: Could not parse timing data from one or both files", file=sys.stderr)
+        sys.exit(1)
+
+    print(f'Comparing {baseline_version} (baseline) vs {comparison_version} (comparison)')
+    print(f'Found {len(baseline_times)} queries in baseline and {len(comparison_times)} queries in comparison')
+    print()
+    print('Query # | Baseline | Compare | Delta   | Change')
+    print('--------|----------|---------|---------|--------')
+
+    regressions = []
+    improvements = []
+    # zip() stops at the shorter run, so only queries present in both files are compared
+    for i, (t_base, t_comp) in enumerate(zip(baseline_times, comparison_times), 1):
+        if t_base is None or t_comp is None:
+            delta_str = 'N/A'
+            change_str = 'N/A'
+        else:
+            delta = t_comp - t_base
+            pct_change = ((t_comp / t_base) - 1) * 100 if t_base > 0 else 0
+            delta_str = f'{delta:+.2f}s'
+            change_str = f'{pct_change:+.1f}%'
+            if pct_change > args.threshold:
+                regressions.append((i, t_base, t_comp, pct_change))
+            elif pct_change < -args.threshold:
+                improvements.append((i, t_base, t_comp, pct_change))
+        # Test against None explicitly so a legitimate 0.0 timing is not shown as "null"
+        print(f'{i:7} | {t_base if t_base is not None else "null":8} | {t_comp if t_comp is not None else "null":7} | {delta_str:7} | {change_str:>7}')
+
+    print()
+    print(f'Total queries: {min(len(baseline_times), len(comparison_times))}')
+    print(f'Regressions (>{args.threshold}% slower): {len(regressions)}')
+    print(f'Improvements (>{args.threshold}% faster): {len(improvements)}')
+    print()
+    if regressions:
+        print(f'Top {min(args.top, len(regressions))} Regressions:')
+        regressions.sort(key=lambda x: x[3], reverse=True)
+        for q, t_base, t_comp, pct in regressions[:args.top]:
+            print(f'  Query {q:2}: {t_base:.2f}s → {t_comp:.2f}s ({pct:+.1f}%)')
+
+    if improvements:
+        print()
+        print(f'Top {min(args.top, len(improvements))} Improvements:')
+        improvements.sort(key=lambda x: x[3])
+        for q, t_base, t_comp, pct in improvements[:args.top]:
+            print(f'  Query {q:2}: {t_base:.2f}s → {t_comp:.2f}s ({pct:+.1f}%)')
+
+
+if __name__ == '__main__':
+    main()