Merge pull request #58 from mabel-dev/missing-dep

joocer · web-flow · commit 97c3b0b2cf73 · 2025-12-31T19:36:48.000Z
fix compression and limit pushdown
diff --git a/dev/build_counter.py b/dev/build_counter.py
@@ -26,7 +26,7 @@ class VersionStatus(Enum):
 
 __major_version__ = 0
 __minor_version__ = 5
-__revision_version__ = 3
+__revision_version__ = 4
 __author__ = "@joocer"
 __status__ = VersionStatus.RELEASE
 
diff --git a/opteryx/__version__.py b/opteryx/__version__.py
@@ -1,11 +1,11 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY
 
-__build__ = 116
+__build__ = 117
 __author__ = "@joocer"
-__version__ = "0.5.3"
+__version__ = "0.5.4"
 __lib__ = "opteryx-core"
-__build_date__ = "2025-12-30T17:48:32.909410+00:00Z"
+__build_date__ = "2025-12-31T19:34:31.469445+00:00Z"
 
 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py
diff --git a/opteryx/connectors/filesystem_connector.py b/opteryx/connectors/filesystem_connector.py
@@ -185,12 +185,13 @@ def blocking_read():
         telemetry.bytes_read += len(data)
         return ref
 
-    def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
+    def get_list_of_blob_names(self, *, prefix: str, predicates: list = []) -> List[str]:
         """
         List all blobs matching the prefix.
 
         Args:
             prefix: Path prefix to search
+            predicates: Optional predicates for filtering (subclasses may use this)
 
         Returns:
             List of blob paths
@@ -230,7 +231,7 @@ def read_dataset(
         Yields:
             PyArrow Tables or schemas
         """
-        blob_names = self.get_list_of_blob_names(prefix=self.dataset)
+        blob_names = self.get_list_of_blob_names(prefix=self.dataset, predicates=predicates or [])
 
         if just_schema:
             for blob_name in blob_names:
diff --git a/opteryx/connectors/opteryx_connector.py b/opteryx/connectors/opteryx_connector.py
@@ -4,11 +4,11 @@
 # Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
 
 """
-Iceberg Connector - Refactored Architecture
+Opteryx Connector - Refactored Architecture
 
 Architecture:
-- IcebergConnector: Long-lived catalog gateway (handles catalog operations, views, introspection)
-- IcebergTable: Transient table-specific engine (handles data reading for one table)
+- OpteryxConnector: Long-lived catalog gateway (handles catalog operations, views, introspection)
+- OpteryxTable: Transient table-specific engine (handles data reading for one table)
 """
 
 import datetime
@@ -105,7 +105,7 @@ def __init__(self, dataset: str, catalog, workspace: str, **kwargs):
 
         # Call FileSystemTable.__init__ which calls BaseTable.__init__
         FileSystemTable.__init__(
-            self, dataset=dataset, filesystem=filesystem, storage_type="ICEBERG", **kwargs
+            self, dataset=dataset, filesystem=filesystem, storage_type="OPTERYX", **kwargs
         )
         Diachronic.__init__(self, **kwargs)
         Statistics.__init__(self, **kwargs)
@@ -180,7 +180,7 @@ def get_dataset_schema(self) -> RelationSchema:
         # Use Parquet manifest reader instead of Opteryx inspect API to avoid Avro
         try:
             import pyarrow as pa
-            from opteryx_catalof.parquet_manifest import read_parquet_manifest
+            from opteryx_catalog.parquet_manifest import read_parquet_manifest
 
             parquet_records = read_parquet_manifest(
                 self.table.metadata,
@@ -230,16 +230,6 @@ def get_dataset_schema(self) -> RelationSchema:
 
         relation_statistics.record_count = pyarrow.compute.sum(files.column("record_count")).as_py()
 
-        if "distinct_counts" in files.columns:
-            for file in files.column("distinct_counts"):
-                for k, v in file:
-                    relation_statistics.set_cardinality_estimate(column_names[k], v)
-
-        if "value_counts" in files.columns:
-            for file in files.column("value_counts"):
-                for k, v in file:
-                    relation_statistics.add_count(column_names[k], v)
-
         self.relation_statistics = relation_statistics
 
         return self.schema
@@ -250,6 +240,7 @@ def get_list_of_blob_names(self, *, prefix: str = None, predicates: list = []) -
         # Get the list of data files to read
         data_files = self.table.scan(
             #row_filter=pushed_filters,
+            row_limit=self.limit,
             snapshot_id=self.snapshot_id,
         )
         return [data_file.file_path for data_file in data_files]
@@ -461,9 +452,8 @@ def get_view(self, view_name: str):
         # Parse relative_id into collection and name
         # For "clickbench.q01": collection="clickbench", name="q01"
         parts = relative_id.split(".")
-        if len(parts) >= 2:
-            name = parts[-1]
-            collection = ".".join(parts[:-1])
+        name = parts[-1]
+        collection = ".".join(parts[:-1])
 
         identifier = (collection, name)
         view = catalog.load_view(identifier)
diff --git a/opteryx/draken/vectors/arrow_vector.py b/opteryx/draken/vectors/arrow_vector.py
@@ -42,13 +42,77 @@ def __init__(self, arrow_array: "pyarrow.Array"):
             raise TypeError("ArrowVector requires a pyarrow.Array")
         self._arr = arrow_array
         self._pa = pa
-        self._pc = pa.compute
+        try:
+            # Prefer direct access if available
+            self._pc = pa.compute
+        except Exception:
+            try:
+                # Some pyarrow builds expose compute as a submodule
+                import pyarrow.compute as _pc
+
+                self._pc = _pc
+            except Exception:
+                # Minimal shim for required compute operations used by ArrowVector
+                class _Shim:
+                    @staticmethod
+                    def take(arr, indices_arr):
+                        indices = indices_arr.to_pylist()
+                        vals = arr.to_pylist()
+                        return pa.array([vals[i] for i in indices])
+
+                    @staticmethod
+                    def equal(arr, value):
+                        return pa.array([x == value for x in arr.to_pylist()])
+
+                    @staticmethod
+                    def not_equal(arr, value):
+                        return pa.array([x != value for x in arr.to_pylist()])
+
+                    @staticmethod
+                    def greater(arr, value):
+                        return pa.array([x > value for x in arr.to_pylist()])
+
+                    @staticmethod
+                    def greater_equal(arr, value):
+                        return pa.array([x >= value for x in arr.to_pylist()])
+
+                    @staticmethod
+                    def less(arr, value):
+                        return pa.array([x < value for x in arr.to_pylist()])
+
+                    @staticmethod
+                    def less_equal(arr, value):
+                        return pa.array([x <= value for x in arr.to_pylist()])
+
+                    @staticmethod
+                    def sum(arr):
+                        s = sum(x for x in arr.to_pylist() if x is not None)
+                        return pa.scalar(s)
+
+                    @staticmethod
+                    def min(arr):
+                        vals = [x for x in arr.to_pylist() if x is not None]
+                        return pa.scalar(min(vals)) if vals else pa.scalar(None)
+
+                    @staticmethod
+                    def max(arr):
+                        vals = [x for x in arr.to_pylist() if x is not None]
+                        return pa.scalar(max(vals)) if vals else pa.scalar(None)
+
+                    @staticmethod
+                    def is_null(arr):
+                        return pa.array([x is None for x in arr.to_pylist()])
+
+                self._pc = _Shim()
 
     # -------- Core metadata --------
     @property
     def length(self) -> int:
         return len(self._arr)
 
+    def __len__(self) -> int:
+        """Return the element count so ``len(vector)`` works; same value as ``length``."""
+        return self.length
+
     @property
     def dtype(self):
         from opteryx.draken.interop.arrow import arrow_type_to_draken
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "opteryx_core"
-version = "0.5.3"
+version = "0.5.4"
 description = "Opteryx Query Engine"
 requires-python = '>=3.13'
 readme = {file = "README.md", content-type = "text/markdown"}
diff --git a/third_party/mabel/draken/vectors/date32_vector.pyx b/third_party/mabel/draken/vectors/date32_vector.pyx
@@ -366,6 +366,37 @@ cdef class Date32Vector(Vector):
 
             dst[i] = mix_hash(dst[i], value)
 
+    cdef void compress_into(self, int64_t[::1] out_buf, Py_ssize_t offset=0) except *:
+        """Fast compress for Date32Vector: extend int32 days to int64.
+
+        Writes one int64 per element into ``out_buf`` starting at index
+        ``offset``. A valid element is its int32 value widened to int64; a null
+        element is written as the int64-minimum sentinel.
+
+        Args:
+            out_buf: Contiguous int64 buffer that receives the values.
+            offset: Index in ``out_buf`` of the first element written.
+
+        Raises:
+            ValueError: If ``offset`` is negative or ``out_buf`` has fewer than
+                ``ptr.length`` slots from ``offset`` onwards.
+        """
+        cdef DrakenFixedBuffer* ptr = self.ptr
+        cdef int32_t* data = <int32_t*> ptr.data
+        cdef Py_ssize_t n = ptr.length
+        # int64 minimum, written in place of any null element
+        cdef int64_t NULL_FLAG = <int64_t> -9223372036854775808
+
+        # Empty vector: return before out_buf is indexed at all
+        if n == 0:
+            return
+
+        # Validate the target range before taking a raw pointer into out_buf
+        if offset < 0 or offset + n > out_buf.shape[0]:
+            raise ValueError("Date32Vector.compress: output buffer too small")
+
+        cdef int64_t* dst = &out_buf[offset]
+        cdef uint8_t* null_bitmap = ptr.null_bitmap
+        # A NULL bitmap pointer is treated as "no nulls"
+        cdef bint has_nulls = null_bitmap != NULL
+        cdef Py_ssize_t i
+        cdef uint8_t byte, bit
+
+        if has_nulls:
+            # NOTE(review): bits are read starting at bit 0 of the bitmap with no
+            # bit offset applied, whereas TimestampVector.compress_into passes
+            # self.null_bit_offset - confirm this vector's bitmap never starts at
+            # a non-zero bit offset.
+            for i in range(n):
+                # Element i is bit (i & 7) of byte (i >> 3); 1 = valid, 0 = null
+                byte = null_bitmap[i >> 3]
+                bit = (byte >> (i & 7)) & 1
+                if bit:
+                    dst[i] = <int64_t> data[i]
+                else:
+                    dst[i] = NULL_FLAG
+        else:
+            # No bitmap: every element is valid, so just widen each value
+            for i in range(n):
+                dst[i] = <int64_t> data[i]
+
     def __str__(self):
         cdef list vals = []
         cdef Py_ssize_t i, k = min(<Py_ssize_t>buf_length(self.ptr), 10)
diff --git a/third_party/mabel/draken/vectors/interval_vector.pyx b/third_party/mabel/draken/vectors/interval_vector.pyx
@@ -328,6 +328,34 @@ cdef class IntervalVector(Vector):
                 value = mix_hash(partial, <uint64_t>data[i].microseconds)
             dst[i] = mix_hash(dst[i], value)
 
+    cdef void compress_into(self, int64_t[::1] out_buf, Py_ssize_t offset=0) except *:
+        """Fast compress for IntervalVector: use months component for ordering.
+
+        Writes one int64 per element into ``out_buf`` starting at index
+        ``offset``. A valid element is represented by its ``months`` field only;
+        a null element is written as the int64-minimum sentinel.
+
+        Args:
+            out_buf: Contiguous int64 buffer that receives the values.
+            offset: Index in ``out_buf`` of the first element written.
+
+        Raises:
+            ValueError: If ``offset`` is negative or ``out_buf`` has fewer than
+                ``ptr.length`` slots from ``offset`` onwards.
+        """
+        cdef DrakenFixedBuffer* ptr = self.ptr
+        cdef Py_ssize_t n = ptr.length
+        # int64 minimum, written in place of any null element
+        cdef int64_t NULL_FLAG = <int64_t> -9223372036854775808
+
+        # Empty vector: return before out_buf is indexed at all
+        if n == 0:
+            return
+
+        # Validate the target range before taking a raw pointer into out_buf
+        if offset < 0 or offset + n > out_buf.shape[0]:
+            raise ValueError("IntervalVector.compress: output buffer too small")
+
+        cdef IntervalValue* data = <IntervalValue*> ptr.data
+        cdef int64_t* dst = &out_buf[offset]
+        # A NULL bitmap pointer is treated as "no nulls"
+        cdef bint has_nulls = ptr.null_bitmap != NULL
+        cdef Py_ssize_t i
+
+        # NOTE(review): only `months` is written; the `microseconds` field is
+        # dropped, so intervals that differ only there get the same output
+        # value - confirm callers accept that.
+        if has_nulls:
+            for i in range(n):
+                # _is_valid presumably tests element i's validity bit - confirm
+                if _is_valid(ptr, i):
+                    # Use months as primary component for ordering
+                    dst[i] = data[i].months
+                else:
+                    dst[i] = NULL_FLAG
+        else:
+            for i in range(n):
+                dst[i] = data[i].months
+
     def __str__(self):
         cdef list preview = []
         cdef Py_ssize_t i, n = buf_length(self.ptr)
diff --git a/third_party/mabel/draken/vectors/time_vector.pyx b/third_party/mabel/draken/vectors/time_vector.pyx
@@ -27,7 +27,7 @@ from libc.stdint cimport intptr_t
 from libc.stdint cimport uint64_t
 from libc.stdint cimport uint8_t
 from libc.stdlib cimport malloc
-from libc.string cimport memset
+from libc.string cimport memset, memcpy
 
 from opteryx.draken.core.buffers cimport DrakenFixedBuffer
 from opteryx.draken.core.buffers cimport DRAKEN_TIME32
@@ -299,6 +299,52 @@ cdef class TimeVector(Vector):
 
                 dst[i] = mix_hash(dst[i], value)
 
+    cdef void compress_into(self, int64_t[::1] out_buf, Py_ssize_t offset=0) except *:
+        """Fast compress for TimeVector: handle both time32 and time64.
+
+        Writes one int64 per element into ``out_buf`` starting at index
+        ``offset``. When ``self.is_time64`` is true the data is read as int64 and
+        copied as-is; otherwise it is read as int32 and widened. A null element
+        is written as the int64-minimum sentinel.
+
+        Args:
+            out_buf: Contiguous int64 buffer that receives the values.
+            offset: Index in ``out_buf`` of the first element written.
+
+        Raises:
+            ValueError: If ``offset`` is negative or ``out_buf`` has fewer than
+                ``ptr.length`` slots from ``offset`` onwards.
+        """
+        cdef DrakenFixedBuffer* ptr = self.ptr
+        cdef Py_ssize_t n = ptr.length
+        # int64 minimum, written in place of any null element
+        cdef int64_t NULL_FLAG = <int64_t> -9223372036854775808
+
+        # Empty vector: return before out_buf is indexed at all
+        if n == 0:
+            return
+
+        # Validate the target range before taking a raw pointer into out_buf
+        if offset < 0 or offset + n > out_buf.shape[0]:
+            raise ValueError("TimeVector.compress: output buffer too small")
+
+        cdef int64_t* dst = &out_buf[offset]
+        cdef uint8_t* null_bitmap = ptr.null_bitmap
+        # A NULL bitmap pointer is treated as "no nulls"
+        cdef bint has_nulls = null_bitmap != NULL
+        cdef Py_ssize_t i
+        cdef uint8_t byte, bit
+        cdef int64_t* data64
+        cdef int32_t* data32
+
+        # NOTE(review): both null-aware loops read the bitmap from bit 0 with no
+        # bit offset applied - confirm this vector's bitmap never starts at a
+        # non-zero bit offset.
+        if self.is_time64:
+            data64 = <int64_t*> ptr.data
+            if not has_nulls:
+                # Source and destination are both int64: copy all n values at once
+                memcpy(<void*>dst, <const void*>data64, <size_t>(n * sizeof(int64_t)))
+                return
+            for i in range(n):
+                # Element i is bit (i & 7) of byte (i >> 3); 1 = valid, 0 = null
+                byte = null_bitmap[i >> 3]
+                bit = (byte >> (i & 7)) & 1
+                if bit:
+                    dst[i] = data64[i]
+                else:
+                    dst[i] = NULL_FLAG
+        else:
+            # 32-bit storage: each value has to be widened, so no bulk copy here
+            data32 = <int32_t*> ptr.data
+            if has_nulls:
+                for i in range(n):
+                    byte = null_bitmap[i >> 3]
+                    bit = (byte >> (i & 7)) & 1
+                    if bit:
+                        dst[i] = <int64_t> data32[i]
+                    else:
+                        dst[i] = NULL_FLAG
+            else:
+                for i in range(n):
+                    dst[i] = <int64_t> data32[i]
+
     def __str__(self):
         cdef list vals = []
         cdef Py_ssize_t i, k = min(<Py_ssize_t>buf_length(self.ptr), 10)
diff --git a/third_party/mabel/draken/vectors/timestamp_vector.pyx b/third_party/mabel/draken/vectors/timestamp_vector.pyx
@@ -27,7 +27,7 @@ from libc.stdint cimport intptr_t
 from libc.stdint cimport uint64_t
 from libc.stdint cimport uint8_t
 from libc.stdlib cimport malloc
-from libc.string cimport memset
+from libc.string cimport memset, memcpy
 
 from opteryx.draken.core.buffers cimport DrakenFixedBuffer
 from opteryx.draken.core.buffers cimport DRAKEN_TIMESTAMP64
@@ -348,6 +348,37 @@ cdef class TimestampVector(Vector):
 
             dst[i] = mix_hash(dst[i], value)
 
+    cdef void compress_into(self, int64_t[::1] out_buf, Py_ssize_t offset=0) except *:
+        """Fast compress for TimestampVector: timestamps are already int64.
+
+        Writes one int64 per element into ``out_buf`` starting at index
+        ``offset``. Valid elements are copied unchanged; a null element is
+        written as the int64-minimum sentinel.
+
+        Args:
+            out_buf: Contiguous int64 buffer that receives the values.
+            offset: Index in ``out_buf`` of the first element written.
+
+        Raises:
+            ValueError: If ``offset`` is negative or ``out_buf`` has fewer than
+                ``ptr.length`` slots from ``offset`` onwards.
+        """
+        cdef DrakenFixedBuffer* ptr = self.ptr
+        cdef int64_t* src = <int64_t*> ptr.data
+        cdef Py_ssize_t n = ptr.length
+        cdef int64_t* dst_base
+        # int64 minimum, written in place of any null element
+        cdef int64_t NULL_FLAG = <int64_t> -9223372036854775808
+
+        # Empty vector: return before out_buf is indexed at all
+        if n == 0:
+            return
+
+        # Validate the target range before taking a raw pointer into out_buf
+        if offset < 0 or offset + n > out_buf.shape[0]:
+            raise ValueError("TimestampVector.compress: output buffer too small")
+
+        # Address of the first slot, then advance by `offset` elements
+        dst_base = &out_buf[0]
+        cdef int64_t* dst = dst_base + offset
+        cdef uint8_t* null_bitmap = ptr.null_bitmap
+        # A NULL bitmap pointer is treated as "no nulls"
+        cdef bint has_nulls = null_bitmap != NULL
+        cdef Py_ssize_t i
+
+        if not has_nulls:
+            # Fast path: bulk copy
+            memcpy(<void*>dst, <const void*>src, <size_t>(n * sizeof(int64_t)))
+            return
+
+        for i in range(n):
+            # _bitmap_is_valid presumably tests element i's validity bit, shifted
+            # by self.null_bit_offset - confirm against its definition
+            if _bitmap_is_valid(null_bitmap, i, self.null_bit_offset):
+                dst[i] = src[i]
+            else:
+                dst[i] = NULL_FLAG
+
     def __str__(self):
         cdef list vals = []
         cdef Py_ssize_t i, k = min(<Py_ssize_t>buf_length(self.ptr), 10)
diff --git a/third_party/mabel/rugo/jsonl/text_search.hpp b/third_party/mabel/rugo/jsonl/text_search.hpp