
Commit 945862e

Merge pull request #2876 from mabel-dev/clickbench-performance-regression-investigation-1
fix agg bug
2 parents c64dcba + 6f24deb commit 945862e

9 files changed: +286 additions, -83 deletions

opteryx/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -19,6 +19,7 @@
 
 import datetime
 import os
+import random
 import time
 import warnings
 import platform
@@ -34,7 +35,7 @@
 getcontext().prec = 28
 
 # end-of-stream marker
-EOS: int = 0
+EOS: int = random.randint(-(2**63), 2**63 - 1)
 
 
 def is_mac() -> bool:  # pragma: no cover
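
The EOS change swaps the fixed end-of-stream sentinel 0 for a random 64-bit integer chosen at import time, presumably so that a genuine data value of 0 can no longer be mistaken for the marker. A minimal sketch of the idea, assuming a toy drain() helper that is not part of Opteryx:

    import random

    # A random 64-bit sentinel is vanishingly unlikely to collide with real
    # payloads, unlike the previous fixed value of 0.
    EOS: int = random.randint(-(2**63), 2**63 - 1)

    def drain(stream):
        """Illustrative helper: yield items until the end-of-stream marker."""
        for item in stream:
            if item == EOS:
                return
            yield item

    # With EOS = 0, the literal 0 below would have ended the stream early.
    print(list(drain([5, 0, 7, EOS])))  # -> [5, 0, 7]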

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions
@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY
 
-__build__ = 1706
+__build__ = 1707
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1706"
+__version__ = "0.26.0-beta.1707"
 
 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py

opteryx/operators/aggregate_and_group_node.py

Lines changed: 116 additions & 28 deletions
@@ -68,7 +68,8 @@ def __init__(self, properties: QueryProperties, **parameters):
         self.column_map, self.aggregate_functions = build_aggregations(self.aggregates)
 
         self.buffer = []
-        self.max_buffer_size = 50  # Process in chunks to avoid excessive memory usage
+        self.max_buffer_size = 100  # Process in chunks to avoid excessive memory usage
+        self._partial_aggregated = False  # Track if we've done a partial aggregation
 
     @property
     def config(self):  # pragma: no cover
@@ -86,38 +87,122 @@ def execute(self, morsel: pyarrow.Table, **kwargs):
                 yield EOS
                 return
 
-            # If we have partial results in buffer, do final aggregation
-            if len(self.buffer) > 0:
-                table = pyarrow.concat_tables(
-                    self.buffer,
-                    promote_options="permissive",
-                )
+            # Do final aggregation if we have buffered data
+            table = pyarrow.concat_tables(
+                self.buffer,
+                promote_options="permissive",
+            )
+            # Only combine chunks if we haven't done partial aggregation yet
+            # combine_chunks can fail after partial aggregation due to buffer structure
+            if not self._partial_aggregated:
                 table = table.combine_chunks()
+
+            # If we've done partial aggregations, the aggregate functions need adjusting
+            # because columns like "*" have been renamed to "*_count"
+            if self._partial_aggregated:
+                # Build new aggregate functions for re-aggregating partial results
+                adjusted_aggs = []
+                adjusted_column_map = {}
+
+                for field_name, function, _count_options in self.aggregate_functions:
+                    # For COUNT aggregates, the column is now named "*_count" and we need to SUM it
+                    if function == "count":
+                        renamed_field = f"{field_name}_count"
+                        adjusted_aggs.append((renamed_field, "sum", None))
+                        # The final column will be named "*_count_sum", need to track for renaming
+                        for orig_name, mapped_name in self.column_map.items():
+                            if mapped_name == f"{field_name}_count":
+                                adjusted_column_map[orig_name] = f"{renamed_field}_sum"
+                    # For other aggregates, we can re-aggregate with the same function
+                    else:
+                        renamed_field = f"{field_name}_{function}".replace("_hash_", "_")
+                        # Some aggregates can be re-aggregated (sum, max, min)
+                        if function in ("sum", "max", "min", "hash_one", "all", "any"):
+                            adjusted_aggs.append((renamed_field, function, None))
+                            # Track the mapping: original -> intermediate -> final
+                            for orig_name, mapped_name in self.column_map.items():
+                                if mapped_name == renamed_field:
+                                    # sum->sum, max->max, etc. means same name
+                                    adjusted_column_map[orig_name] = (
+                                        f"{renamed_field}_{function}".replace("_hash_", "_")
+                                    )
+                        elif function == "mean":
+                            # For mean, just take one of the existing values (not ideal)
+                            adjusted_aggs.append((renamed_field, "hash_one", None))
+                            for orig_name, mapped_name in self.column_map.items():
+                                if mapped_name == renamed_field:
+                                    adjusted_column_map[orig_name] = f"{renamed_field}_one"
+                        elif function == "hash_list":
+                            # For ARRAY_AGG, we need to flatten lists
+                            adjusted_aggs.append((renamed_field, "hash_list", None))
+                            for orig_name, mapped_name in self.column_map.items():
+                                if mapped_name == renamed_field:
+                                    adjusted_column_map[orig_name] = f"{renamed_field}_list"
+                        else:
+                            # For other aggregates, take one value
+                            adjusted_aggs.append((renamed_field, "hash_one", None))
+                            for orig_name, mapped_name in self.column_map.items():
+                                if mapped_name == renamed_field:
+                                    adjusted_column_map[orig_name] = f"{renamed_field}_one"
+
+                groups = table.group_by(self.group_by_columns)
+                groups = groups.aggregate(adjusted_aggs)
+
+                # Use the adjusted column map for selecting/renaming
+                groups = groups.select(list(adjusted_column_map.values()) + self.group_by_columns)
+                groups = groups.rename_columns(
+                    list(adjusted_column_map.keys()) + self.group_by_columns
+                )
+            else:
                 groups = table.group_by(self.group_by_columns)
                 groups = groups.aggregate(self.aggregate_functions)
-                self.buffer = [groups]  # Replace buffer with final result
-
-            # Now buffer has the final aggregated result
-            groups = self.buffer[0]
-
-            # do the secondary activities for ARRAY_AGG
-            for node in get_all_nodes_of_type(self.aggregates, select_nodes=(NodeType.AGGREGATOR,)):
-                if node.value == "ARRAY_AGG" and node.order or node.limit:
-                    # rip the column out of the table
-                    column_name = self.column_map[node.schema_column.identity]
-                    column_def = groups.field(column_name)  # this is used
-                    column = groups.column(column_name).to_pylist()
-                    groups = groups.drop([column_name])
+
+                # project to the desired column names from the pyarrow names
+                groups = groups.select(list(self.column_map.values()) + self.group_by_columns)
+                groups = groups.rename_columns(list(self.column_map.keys()) + self.group_by_columns)
+
+            # do the secondary activities for ARRAY_AGG (order and limit)
+            array_agg_nodes = [
+                node
+                for node in get_all_nodes_of_type(
+                    self.aggregates, select_nodes=(NodeType.AGGREGATOR,)
+                )
+                if node.value == "ARRAY_AGG" and (node.order or node.limit)
+            ]
+
+            if array_agg_nodes:
+                # Process all ARRAY_AGG columns that need ordering/limiting
+                arrays_to_update = {}
+                field_defs = {}
+
+                for node in array_agg_nodes:
+                    column_name = node.schema_column.identity
+
+                    # Store field definition before we drop the column
+                    field_defs[column_name] = groups.field(column_name)
+
+                    # Extract and process the data
+                    column_data = groups.column(column_name).to_pylist()
+
+                    # Apply ordering if specified
                     if node.order:
-                        column = [sorted(c, reverse=bool(node.order[0][1])) for c in column]
+                        column_data = [
+                            sorted(c, reverse=bool(node.order[0][1])) for c in column_data
+                        ]
+
+                    # Apply limit if specified
                     if node.limit:
-                        column = [c[: node.limit] for c in column]
-                    # put the new column into the table
-                    groups = groups.append_column(column_def, [column])
+                        column_data = [c[: node.limit] for c in column_data]
+
+                    arrays_to_update[column_name] = column_data
+
+                # Drop all columns we're updating
+                columns_to_drop = list(arrays_to_update.keys())
+                groups = groups.drop(columns_to_drop)
 
-            # project to the desired column names from the pyarrow names
-            groups = groups.select(list(self.column_map.values()) + self.group_by_columns)
-            groups = groups.rename_columns(list(self.column_map.keys()) + self.group_by_columns)
+                # Append all updated columns back
+                for column_name, column_data in arrays_to_update.items():
+                    groups = groups.append_column(field_defs[column_name], [column_data])
 
             num_rows = groups.num_rows
             for start in range(0, num_rows, CHUNK_SIZE):
@@ -128,9 +213,10 @@ def execute(self, morsel: pyarrow.Table, **kwargs):
 
         morsel = project(morsel, self.all_identifiers)
         # Add a "*" column, this is an int because when a bool it miscounts
+        # FIX: Use int8 as the comment states (bool can miscount)
        if "*" not in morsel.column_names:
             morsel = morsel.append_column(
-                "*", [numpy.ones(shape=morsel.num_rows, dtype=numpy.bool_)]
+                "*", [numpy.ones(shape=morsel.num_rows, dtype=numpy.int8)]
             )
         if self.evaluatable_nodes:
             morsel = evaluate_and_append(self.evaluatable_nodes, morsel)
@@ -144,9 +230,11 @@ def execute(self, morsel: pyarrow.Table, **kwargs):
                 self.buffer,
                 promote_options="permissive",
             )
+            # Only combine chunks once before aggregation
             table = table.combine_chunks()
             groups = table.group_by(self.group_by_columns)
             groups = groups.aggregate(self.aggregate_functions)
             self.buffer = [groups]  # Replace buffer with partial result
+            self._partial_aggregated = True  # Mark that we've done a partial aggregation
 
         yield None
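
The heart of the aggregation fix is that buffers which have already been partially aggregated cannot simply be re-aggregated with the original functions: a partial COUNT is already materialised in a column such as "*_count", and counting it again would count groups rather than rows, so the final pass SUMs the partial counts instead. A minimal sketch of that two-phase pattern using plain PyArrow (the table and column names are illustrative, not Opteryx's internals):

    import pyarrow

    # Two morsels that arrive separately and are partially aggregated first.
    morsel_1 = pyarrow.table({"k": ["a", "a", "b"], "v": [1, 2, 3]})
    morsel_2 = pyarrow.table({"k": ["a", "b", "b"], "v": [4, 5, 6]})

    # Phase 1: partial aggregation per morsel yields k, v_sum and v_count columns.
    partials = [
        m.group_by(["k"]).aggregate([("v", "sum"), ("v", "count")])
        for m in (morsel_1, morsel_2)
    ]

    # Phase 2: re-aggregate the partials; a partial SUM can be summed again,
    # but the partial COUNT must also be SUMmed (re-counting would count groups).
    combined = pyarrow.concat_tables(partials)
    final = combined.group_by(["k"]).aggregate([("v_sum", "sum"), ("v_count", "sum")])
    print(final.sort_by("k").to_pydict())
    # k=a: v_sum_sum=7, v_count_sum=3; k=b: v_sum_sum=14, v_count_sum=3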

opteryx/operators/heap_sort_node.py

Lines changed: 3 additions & 1 deletion
@@ -108,7 +108,9 @@ def _sort_and_slice(self, table: pyarrow.Table) -> pyarrow.Table:
                 indices = pyarrow.compute.sort_indices(column)
             else:
                 indices = pyarrow.compute.sort_indices(column)[::-1]
-            return table.take(indices.slice(0, self.limit))
+            # Take min of limit and available indices to avoid index errors
+            take_count = min(self.limit, len(indices))
+            return table.take(indices.slice(0, take_count))
 
         np_column = column.to_numpy()
         if use_decimal:
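
The heap-sort change clamps the slice length to the number of indices actually available, so a LIMIT larger than the remaining rows can no longer ask take() for positions that do not exist. A standalone sketch of the clamping (the column name and limit are illustrative):

    import pyarrow
    import pyarrow.compute

    table = pyarrow.table({"x": [30, 10, 20]})
    limit = 10  # LIMIT larger than the number of rows in the table

    indices = pyarrow.compute.sort_indices(table["x"])
    take_count = min(limit, len(indices))  # clamp to what is actually there
    print(table.take(indices.slice(0, take_count)).to_pydict())  # {'x': [10, 20, 30]}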

opteryx/utils/file_decoders.py

Lines changed: 10 additions & 23 deletions
@@ -174,20 +174,12 @@ def zstd_decoder(
 
     import zstandard
 
-    # zstandard.open expects a file-like; we open on a BytesIO constructed from
-    # the provided buffer and then pass the decompressed bytes as a memoryview
-    if isinstance(buffer, memoryview):
-        buf_bytes = buffer.tobytes()
-    elif isinstance(buffer, bytes):
-        buf_bytes = buffer
-    else:
-        # fallback, try to read
-        try:
-            buf_bytes = buffer.read()
-        except Exception:
-            buf_bytes = bytes(buffer)
+    # zstandard.open expects a file-like
+    if not isinstance(buffer, memoryview):
+        buffer = memoryview(buffer)
+    buffer = MemoryViewStream(buffer)
 
-    with zstandard.open(io.BytesIO(buf_bytes), "rb") as file:
+    with zstandard.open(buffer, "rb") as file:
         decompressed = file.read()
     return jsonl_decoder(
         memoryview(decompressed),
@@ -215,17 +207,12 @@ def lzma_decoder(
     import lzma
 
     # similar to zstd path: read bytes and pass decompressed data as memoryview
-    if isinstance(buffer, memoryview):
-        buf_bytes = buffer.tobytes()
-    elif isinstance(buffer, bytes):
-        buf_bytes = buffer
-    else:
-        try:
-            buf_bytes = buffer.read()
-        except Exception:
-            buf_bytes = bytes(buffer)
+    # zstandard.open expects a file-like
+    if not isinstance(buffer, memoryview):
+        buffer = memoryview(buffer)
+    buffer = MemoryViewStream(buffer)
 
-    with lzma.open(io.BytesIO(buf_bytes), "rb") as file:
+    with lzma.open(buffer, "rb") as file:
         decompressed = file.read()
     return jsonl_decoder(
         memoryview(decompressed),
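
Both decoders now wrap the buffer in MemoryViewStream (an Opteryx utility referenced in the diff) rather than copying it into a BytesIO before handing it to zstandard.open or lzma.open, which only need a readable file-like object. A rough sketch of what such a wrapper can look like, as a simplified stand-in rather than Opteryx's actual MemoryViewStream:

    import io
    import lzma

    class SimpleMemoryViewStream(io.RawIOBase):
        """Read-only file-like view over an existing buffer (no up-front copy)."""

        def __init__(self, view: memoryview):
            self._view = view
            self._pos = 0

        def readable(self) -> bool:
            return True

        def readinto(self, b) -> int:
            n = min(len(b), len(self._view) - self._pos)
            b[:n] = self._view[self._pos : self._pos + n]
            self._pos += n
            return n

    # lzma.open (like zstandard.open) accepts any readable file-like object.
    compressed = lzma.compress(b'{"a": 1}\n{"a": 2}\n')
    with lzma.open(SimpleMemoryViewStream(memoryview(compressed)), "rb") as f:
        print(f.read())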

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "opteryx"
-version = "0.26.0-beta.1706"
+version = "0.26.0-beta.1707"
 description = "Query your data, where it lives"
 requires-python = '>=3.11'
 readme = {file = "README.md", content-type = "text/markdown"}

tests/fuzzing/test_sql_fuzzer_compare_engines.py

Lines changed: 1 addition & 1 deletion
@@ -168,7 +168,7 @@ def test_sql_fuzzing_connector_comparisons(i):
 
     try:
         duck_statement = statement.replace(table_name, table["duckdb_name"])
-        duck_result = conn.query(duck_statement).arrow()
+        duck_result = conn.query(duck_statement).arrow().read_all()
         opteryx_statement = statement.replace(table_name, table["opteryx_name"])
        opteryx_result = opteryx.query(opteryx_statement).arrow()
 
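
Chaining read_all() onto the DuckDB result suggests that, in the DuckDB version the fuzzer runs against, arrow() hands back a streaming pyarrow.RecordBatchReader rather than a materialised table; read_all() collapses the stream into a pyarrow.Table so the two engines' results can be compared like-for-like. A small illustration of that materialisation step using plain PyArrow (no DuckDB involved):

    import pyarrow

    expected = pyarrow.table({"x": [1, 2, 3]})

    # Stand-in for a streaming query result.
    reader = pyarrow.RecordBatchReader.from_batches(expected.schema, expected.to_batches())

    materialised = reader.read_all()  # a concrete pyarrow.Table
    assert materialised.equals(expected)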
