Skip to content

Commit 345e1af

Browse files
committed
feat: parquet reader
1 parent e8485ac commit 345e1af

File tree

16 files changed

+782
-17
lines changed

16 files changed

+782
-17
lines changed

dev/build_counter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class VersionStatus(Enum):
2929

3030
__major_version__ = 0
3131
__minor_version__ = 6
32-
__revision_version__ = 29
32+
__revision_version__ = 30
3333
__author__ = "@joocer"
3434
__status__ = VersionStatus.RELEASE
3535

opteryx/__version__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 361
4+
__build__ = 364
55
__author__ = "@joocer"
6-
__version__ = "0.6.29"
6+
__version__ = "0.6.30"
77
__lib__ = "opteryx-core"
8-
__build_date__ = "2026-02-26T21:07:51.277997+00:00Z"
8+
__build_date__ = "2026-02-26T23:35:55.839493+00:00Z"
99

1010
# Store the version here so:
1111
# 1) we don't load dependencies by storing it in __init__.py

opteryx/compiled/aggregations/group_state_store.pyx

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,9 @@ from opteryx.compiled.aggregations.aggregate_kernels cimport update_state
2121
from opteryx.draken.vectors.int64_vector cimport Int64Vector
2222
from opteryx.draken.vectors.float64_vector cimport Float64Vector
2323
from opteryx.draken.vectors.integer_vector cimport IntegerVector
24-
from opteryx.draken.vectors.int64_vector cimport from_sequence as int64_from_sequence
25-
from opteryx.draken.vectors.float64_vector cimport from_sequence as float64_from_sequence
2624

2725
from libc.stdint cimport int8_t, int16_t, int32_t, int64_t, uint8_t, uint64_t
2826
from libc.stdlib cimport malloc, free
29-
from libc.math cimport NAN
3027
from libc.stddef cimport size_t
3128
from cython.operator cimport dereference, preincrement
3229
from opteryx.third_party.abseil.containers cimport IdentityHash
@@ -273,7 +270,6 @@ cdef class GroupStateStore:
273270
cdef uint64_t[::1] key_hashes
274271
cdef uint64_t key_hash
275272
cdef uint64_t distinct_value_u64
276-
cdef IntegerVector key_int_vector
277273
cdef IntegerVector value_int_vector
278274
cdef DrakenFixedBuffer* int_value_ptr
279275
cdef uint64_t* _narrow_key_buf

opteryx/config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,11 @@ class Features:
177177
disable_predicate_ordering = bool(get("FEATURE_DISABLE_PREDICATE_ORDERING", False))
178178
disable_predicate_pushdown = bool(get("FEATURE_DISABLE_PREDICATE_PUSHDOWN", False))
179179
disable_manifest_pruning = bool(get("FEATURE_DISABLE_MANIFEST_PRUNING", False))
180+
use_parquet_reader = str(get("FEATURE_USE_PARQUET_READER", "1")).lower() in (
181+
"1",
182+
"true",
183+
"yes",
184+
)
180185

181186

182187
features = Features()

opteryx/connectors/io_systems/gcs_filesystem.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import os
1010
import urllib.parse
1111
from typing import List
12+
from typing import Tuple
1213
from typing import Union
1314

1415
from opteryx.exceptions import DatasetReadError
@@ -137,6 +138,41 @@ def get_file_info(self, paths: Union[str, List[str]]):
137138

138139
return infos[0] if single_path else infos
139140

141+
def read_ranges(self, path: str, ranges: List[Tuple[int, int]]) -> List[bytes]:
    """Read multiple byte ranges from a GCS object using HTTP range requests.

    Args:
        path: GCS object path, with or without the ``gs://`` prefix.
        ranges: List of (offset, length) tuples specifying byte ranges to read.

    Returns:
        List of byte buffers in the same order as ranges.

    Raises:
        DatasetReadError: If GCS responds with a non-success status for any range.
    """
    # Normalize path
    if path.startswith("gs://"):
        path = path[5:]

    from opteryx.utils import paths as path_utils

    bucket, _, _, _ = path_utils.get_parts(path)
    object_full_path = urllib.parse.quote(path[(len(bucket) + 1) :], safe="")
    url = f"https://storage.googleapis.com/{bucket}/{object_full_path}"

    result = []
    for offset, length in ranges:
        # GCS range request: Range: bytes=offset-end (inclusive)
        end = offset + length - 1
        response = self.session.get(
            url,
            headers={
                "Authorization": f"Bearer {self.access_token}",
                "Range": f"bytes={offset}-{end}",
            },
            timeout=30,
        )
        # 206 (Partial Content) is the expected reply; 200 means the server
        # ignored the Range header and returned the entire object. Anything
        # else (403, 404, 416, ...) is an error body, not object bytes — do
        # not let it masquerade as data.
        if response.status_code not in (200, 206):
            raise DatasetReadError(
                f"Unable to read bytes {offset}-{end} of '{path}' "
                f"(HTTP {response.status_code})"
            )
        content = response.content
        if response.status_code == 200 and len(content) > length:
            # Full-object reply: slice out just the requested window.
            content = content[offset : offset + length]
        result.append(content)
    return result
175+
140176
def stream_to(self, path: str, sink, chunk_size: int = 1 << 20) -> int:
141177
"""Stream a GCS object directly into *sink* without an intermediate buffer.
142178

opteryx/connectors/io_systems/local_filesystem.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
import datetime
99
import os
10+
from typing import List
11+
from typing import Tuple
1012

1113

1214
class MemoryMappedFile:
@@ -214,6 +216,24 @@ def get_file_info(self, paths):
214216

215217
return infos[0] if single_path else infos
216218

219+
def read_ranges(self, path: str, ranges: List[Tuple[int, int]]) -> List[bytes]:
    """Read multiple byte ranges from a local file.

    Args:
        path: Absolute or relative path to the local file.
        ranges: List of (offset, length) tuples specifying byte ranges to read.

    Returns:
        List of byte buffers in the same order as ranges.
    """
    with open(path, "rb") as stream:

        def _read_span(span: Tuple[int, int]) -> bytes:
            # Position at the span's start, then pull exactly `size` bytes.
            start, size = span
            stream.seek(start)
            return stream.read(size)

        # The comprehension is fully materialized before the file closes.
        return [_read_span(span) for span in ranges]
236+
217237
def stream_to(self, path: str, sink, chunk_size: int = 1 << 20) -> int:
218238
"""Stream a local file directly into *sink* without an intermediate buffer.
219239

opteryx/connectors/io_systems/s3_filesystem.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@
1010
from dataclasses import dataclass
1111
from typing import List
1212
from typing import Optional
13+
from typing import Tuple
1314
from typing import Union
1415

1516
from minio.select import OutputSerialization
1617
from minio.xml import SubElement
1718

1819
from opteryx.connectors.capabilities import PredicatePushable
20+
from opteryx.exceptions import DatasetReadError
1921
from opteryx.exceptions import MissingDependencyError
2022
from opteryx.exceptions import UnmetRequirementError
2123
from opteryx.third_party.alantsd.base64 import encode
@@ -255,6 +257,37 @@ def get_file_info(self, paths: Union[str, List[str]]):
255257

256258
return infos[0] if single_path else infos
257259

260+
def read_ranges(self, path: str, ranges: List[Tuple[int, int]]) -> List[bytes]:
    """Read multiple byte ranges from an S3 object using HTTP range requests.

    Args:
        path: S3 object path including bucket as first component
            (e.g. ``my-bucket/path/to/file.parquet``).
        ranges: List of (offset, length) tuples specifying byte ranges to read.

    Returns:
        List of byte buffers in the same order as ranges.
    """
    from opteryx.utils import paths as path_utils

    bucket, object_path, name, extension = path_utils.get_parts(path)
    full_object_name = object_path + "/" + name + extension

    result = []
    for offset, length in ranges:
        response = self.minio.get_object(
            bucket_name=bucket,
            object_name=full_object_name,
            offset=offset,
            length=length,
        )
        try:
            result.append(response.read())
        finally:
            # The MinIO client requires BOTH close() and release_conn();
            # close() alone leaves the urllib3 connection checked out of
            # the pool, leaking one connection per range read.
            response.close()
            response.release_conn()
    return result
290+
258291
def stream_to(self, path: str, sink, chunk_size: int = 1 << 20) -> int:
259292
"""Stream an S3 object directly into *sink* without an intermediate buffer.
260293

opteryx/operators/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ def execute(self, morsel):
9090
from .aggregate_node import AggregateNode # aggregate data
9191
from .iops_read_node import IopsReadNode
9292
from .null_reader_node import NullReaderNode # empty table for contradictory predicates
93+
from .parquet_read_node import ParquetReadNode
9394
from .simple_aggregate_node import SimpleAggregateNode # aggregate data
9495
from .simple_aggregate_and_group_node import SimpleAggregateAndGroupNode # aggregate data
9596

@@ -136,6 +137,7 @@ def execute(self, morsel):
136137
"AggregateNode",
137138
"IopsReadNode",
138139
"NullReaderNode",
140+
"ParquetReadNode",
139141
"SimpleAggregateNode",
140142
"SimpleAggregateAndGroupNode",
141143
"CrossJoinNode",
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# Licensed under the Apache License, Version 2.0 (the "License");
2+
# you may not use this file except in compliance with the License.
3+
# See the License at http://www.apache.org/licenses/LICENSE-2.0
4+
# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
5+
6+
"""
7+
Parquet Read Node
8+
9+
SQL Query Execution Plan Node that reads Parquet files using the column-chunk
10+
range-read design (docs/parquet-column-reads-design.md).
11+
12+
Instead of downloading whole blobs into a shared-memory ring, this node:
13+
14+
1. Fetches the Parquet footer for each file (two small range reads each).
15+
2. Fans out (file × row-group) work units to a thread pool.
16+
3. For each unit, batches all projected column ranges into one read_ranges()
17+
call, decodes with rugo, and yields the assembled row group.
18+
19+
The filesystem layer is taken directly from the connector (every catalog-backed
20+
connector already exposes ``self.filesystem``), so this node works identically
21+
for local disk, GCS, and S3.
22+
23+
Row groups are yielded in completion order — the thread pool handles overlap
24+
between I/O and decode across all files and row groups simultaneously.
25+
"""
26+
27+
from __future__ import annotations
28+
29+
import time
30+
from typing import Generator
31+
32+
from opteryx import EOS
33+
from opteryx.draken.morsels.morsel import Morsel
34+
from opteryx.models import QueryProperties
35+
from opteryx.parquet_io import InMemoryParquetCache
36+
from opteryx.parquet_io import fetch_footer
37+
from opteryx.parquet_io import iter_row_groups
38+
from opteryx.utils.file_decoders import get_decoder
39+
40+
from .read_node import ReaderNode
41+
42+
43+
class ParquetReadNode(ReaderNode):
    """Read node backed by column-chunk range reads via ``parquet_io``.

    Activated for filesystem-backed connectors (GCS, S3, local) when the
    manifest contains only ``.parquet`` files. Falls back to the existing
    ``IopsReadNode`` / ``ReaderNode`` paths for mixed or non-Parquet manifests.
    """

    def __init__(self, properties: QueryProperties, **parameters) -> None:
        ReaderNode.__init__(self, properties=properties, **parameters)
        # Optional pushed-down predicates; currently carried, not applied here.
        self.predicates = parameters.get("predicates")

    @property
    def name(self) -> str:  # pragma: no cover
        return "Parquet Read"

    def to_mermaid(self, nid):  # pragma: no cover
        mermaid = f'NODE_{nid}[("**{self.name.upper()}**<br />'
        mermaid += f"{self.connector.dataset}<br />"
        mermaid += f"({self.execution_time / 1_000_000:,.2f}ms)"
        return mermaid + '")]'

    @staticmethod
    def _empty_table(orso_schema):
        """Build a zero-row Arrow table matching *orso_schema*, with columns
        renamed to their query-engine identities."""
        from orso import DataFrame

        as_arrow = DataFrame(rows=[], schema=orso_schema).arrow()
        renames = [orso_schema.column(col).identity for col in as_arrow.column_names]
        return as_arrow.rename_columns(renames)

    def execute(self, morsel, **kwargs) -> Generator:
        if morsel == EOS:
            yield None
            return

        orso_schema = self.parameters["schema"]

        # ── Empty manifest ────────────────────────────────────────────────────
        if not self.manifest or self.manifest.get_file_count() == 0:
            yield self._empty_table(orso_schema)
            return

        # ── Project schema to requested columns only ──────────────────────────
        requested_identities = {c.schema_column.identity for c in self.columns}
        orso_schema.columns = [
            col for col in orso_schema.columns if col.identity in requested_identities
        ]
        self.readings["columns_read"] += len(orso_schema.columns)

        records_to_read = self.limit if self.limit is not None else float("inf")

        filesystem = self.connector.filesystem
        # Column names as they appear in the Parquet file (Parquet uses the
        # original names, not identity aliases).
        column_names = [col.name for col in orso_schema.columns]
        # Map data-file column name → query-engine identity for Morsel construction.
        name_to_identity = {col.name: col.identity for col in orso_schema.columns}
        blob_paths = self.manifest.get_file_paths()

        # One cache per execute() call: footers shared across all row groups of
        # the same file; column chunks cached for reuse across row groups with
        # identical content (rare but free).
        cache = InMemoryParquetCache()
        result_morsel = None

        decode_start = time.monotonic_ns()
        try:
            for row_group in iter_row_groups(filesystem, blob_paths, column_names, cache):
                # Strip the bookkeeping keys before treating the dict as columns.
                row_group.pop("__path__")
                row_group.pop("__row_group__")

                # Assemble the projected columns into a Draken Morsel directly.
                # Each value is a DrakenVector; we map data-file names to identity
                # names so the morsel arrives downstream already correctly labelled.
                identity_names = [name_to_identity[col] for col in row_group]
                vectors = list(row_group.values())
                result_morsel = Morsel.from_vectors(identity_names, vectors)

                num_rows = result_morsel.num_rows
                self.readings["rows_seen"] += num_rows
                self.readings["blobs_seen"] += 1

                # ── LIMIT enforcement ─────────────────────────────────────────
                if records_to_read < num_rows:
                    result_morsel = result_morsel.slice(0, int(records_to_read))
                    records_to_read = 0
                else:
                    records_to_read -= num_rows

                self.readings["blobs_read"] += 1
                self.telemetry.blobs_read += 1
                self.readings["rows_read"] += result_morsel.num_rows
                self.telemetry.rows_read += result_morsel.num_rows
                self.readings["bytes_processed"] += result_morsel.nbytes
                self.telemetry.bytes_processed += result_morsel.nbytes

                yield result_morsel

                if records_to_read <= 0:
                    break

        finally:
            decode_ns = time.monotonic_ns() - decode_start
            self.readings["time_decoding_blobs"] = (
                self.readings.get("time_decoding_blobs", 0) + decode_ns
            )
            self.telemetry.time_decoding_blobs += decode_ns

        # ── Empty result guard ────────────────────────────────────────────────
        # NOTE: previously this referenced `pyarrow` and `arrow_schema`, neither
        # of which exists in this module — any empty result raised NameError.
        # Reuse the same empty-table construction as the empty-manifest path.
        if result_morsel is None:
            self.readings["empty_datasets"] += 1
            yield self._empty_table(orso_schema)

0 commit comments

Comments
 (0)