mabel-dev
diff --git a/‎opteryx/__version__.py‎
Lines changed: 2 additions & 2 deletions b/‎opteryx/__version__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎opteryx/compiled/io/iouring.pxd‎
Lines changed: 70 additions & 0 deletions b/‎opteryx/compiled/io/iouring.pxd‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎opteryx/compiled/io/iouring.pyx‎
Lines changed: 156 additions & 0 deletions b/‎opteryx/compiled/io/iouring.pyx‎
Lines changed: 156 additions & 0 deletions
diff --git a/‎opteryx/connectors/disk_connector.py‎
Lines changed: 0 additions & 1 deletion b/‎opteryx/connectors/disk_connector.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎opteryx/utils/file_decoders.py‎
Lines changed: 28 additions & 36 deletions b/‎opteryx/utils/file_decoders.py‎
Lines changed: 28 additions & 36 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY
 
-__build__ = 1698
+__build__ = 1701
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1698"
+__version__ = "0.26.0-beta.1701"
 
 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py
 
@@ -0,0 +1,70 @@
+# distutils: language = c++
+# cython: language_level=3
+
+# C-level declarations (libc / POSIX / liburing) used by the io_uring wrapper.
+# Only the symbols and struct fields declared here are visible to Cython code.
+
+# errno is read right after a failed open()/close() to build the OSError.
+cdef extern from "errno.h":
+    int errno
+
+# Aligned allocation for the buffer pool; memory from posix_memalign is
+# released with free().
+cdef extern from "stdlib.h":
+    int posix_memalign(void **memptr, size_t alignment, size_t size)
+    void free(void *ptr)
+
+cdef extern from "unistd.h":
+    int close(int fd)
+
+cdef extern from "fcntl.h":
+    int open(const char *pathname, int flags, ...)
+
+# No symbols are declared from this header; the block only names the header.
+cdef extern from "sys/types.h":
+    pass
+
+# Scatter/gather descriptor; an array of these is handed to
+# io_uring_register_buffers.
+cdef extern from "sys/uio.h":
+    ctypedef struct iovec:
+        void   *iov_base
+        size_t  iov_len
+
+cdef extern from "liburing.h":
+    # Declared without fields: Cython code never touches the ring internals.
+    ctypedef struct io_uring:
+        pass
+
+    # Declared without fields: submission entries are filled in through the
+    # prep/set_data helpers below.
+    ctypedef struct io_uring_sqe:
+        pass
+
+    # Completion entry; only these three fields are exposed to Cython.
+    ctypedef struct io_uring_cqe:
+        unsigned int    flags
+        int             res
+        unsigned long long user_data
+
+    # ring lifecycle
+    int  io_uring_queue_init(unsigned entries, io_uring *ring, unsigned flags)
+    void io_uring_queue_exit(io_uring *ring)
+
+    # submission
+    io_uring_sqe* io_uring_get_sqe(io_uring *ring)
+    int  io_uring_submit(io_uring *ring)
+
+    # completion
+    int  io_uring_wait_cqe(io_uring *ring, io_uring_cqe **cqe_ptr)
+    int  io_uring_peek_cqe(io_uring *ring, io_uring_cqe **cqe_ptr)
+    void io_uring_cqe_seen(io_uring *ring, io_uring_cqe *cqe)
+
+    # buffer registration
+    int  io_uring_register_buffers(io_uring *ring, const iovec *iovecs, unsigned nr_iovecs)
+    int  io_uring_unregister_buffers(io_uring *ring)
+
+    # prep helpers
+    void io_uring_prep_read_fixed(io_uring_sqe *sqe, int fd, void *buf, unsigned nbytes, long long offset, int buf_index)
+
+    # user_data helpers (declared in liburing.h as static inline)
+    void               io_uring_sqe_set_data64(io_uring_sqe *sqe, unsigned long long data)
+    unsigned long long io_uring_cqe_get_data64(const io_uring_cqe *cqe)
+
+    # common setup flags
+    cdef unsigned IORING_SETUP_CLAMP
+    cdef unsigned IORING_SETUP_COOP_TASKRUN
+    cdef unsigned IORING_SETUP_SINGLE_ISSUER
+
+# open(2) flags
+cdef extern from "fcntl.h":
+    int O_RDONLY
+    int O_DIRECT
+    int O_CLOEXEC
+
+# Helper (defined in .pyx)
+# Returns rc unchanged when it is non-negative; raises OSError(-rc) otherwise.
+# NOTE(review): there is no explicit exception-value clause here, so whether
+# the OSError reaches the caller depends on the Cython version's default for
+# cdef functions - confirm against the Cython version used by the build.
+cdef int _check_errno(int rc)
@@ -0,0 +1,156 @@
+# distutils: language = c++
+# cython: language_level=3
+
+from cpython.mem cimport PyMem_Malloc, PyMem_Free
+from libc.stdint cimport uint64_t, uintptr_t
+from libc.string cimport memset
+
+# IMPORTANT: cimport the module by its full package path
+cimport opteryx.compiled.io.iouring as C
+
+cdef int _check_errno(int rc):
+    # Negative return codes carry the error number negated, so flip the sign
+    # for OSError; anything else is passed straight back to the caller.
+    if rc >= 0:
+        return rc
+    raise OSError(-rc, "io_uring error")
+
+cdef class BufferPool:
+    """A fixed set of equally sized, aligned, zero-filled C buffers.
+
+    Each buffer is allocated with ``posix_memalign`` and described by one
+    ``iovec`` entry so the whole pool can be handed to
+    ``io_uring_register_buffers``.
+
+    Args:
+        nbuf: number of buffers to allocate.
+        buf_size: size of every buffer in bytes.
+        alignment: alignment passed to ``posix_memalign`` (default 4096).
+
+    Raises:
+        MemoryError: if any allocation fails; everything allocated so far is
+            released before the error is raised.
+    """
+    # ptrs[i] is the start of buffer i (NULL until it has been allocated)
+    cdef void **ptrs
+    # iov[i] mirrors ptrs[i] together with the buffer length
+    cdef C.iovec *iov
+    cdef size_t nbuf
+    cdef size_t buf_size
+    cdef size_t alignment
+
+    def __cinit__(self, size_t nbuf, size_t buf_size, size_t alignment=4096):
+        cdef size_t i
+        cdef void *p = NULL
+
+        self.buf_size = buf_size
+        self.alignment = alignment
+
+        # Guard the `nbuf * sizeof(...)` products below against wrap-around.
+        # iovec holds a pointer plus a size_t, so it is the larger element.
+        if nbuf > (<size_t>-1) // sizeof(C.iovec):
+            raise MemoryError("alloc ptrs")
+
+        self.ptrs = <void **>PyMem_Malloc(nbuf * sizeof(void *))
+        if self.ptrs == NULL:
+            raise MemoryError("alloc ptrs")
+        # NULL-fill before anything else can fail so that _cleanup never
+        # sees an uninitialised slot.
+        for i in range(nbuf):
+            self.ptrs[i] = NULL
+        self.nbuf = nbuf
+
+        self.iov = <C.iovec *>PyMem_Malloc(nbuf * sizeof(C.iovec))
+        if self.iov == NULL:
+            self._cleanup(0)
+            raise MemoryError("alloc iov")
+
+        for i in range(nbuf):
+            if C.posix_memalign(&p, alignment, buf_size) != 0:
+                self._cleanup(i)
+                raise MemoryError(f"posix_memalign failed for buffer {i}")
+            memset(p, 0, buf_size)
+            self.ptrs[i] = p
+            self.iov[i].iov_base = p
+            self.iov[i].iov_len = buf_size
+
+    cdef void _cleanup(self, size_t upto):
+        """Free the first ``upto`` buffers and both tables.
+
+        Idempotent: every pointer is reset to NULL once released, so a second
+        call (for example from ``__dealloc__`` after a failed ``__cinit__``)
+        does nothing instead of double-freeing or reading through a NULL or
+        dangling table pointer.
+        """
+        cdef size_t j
+        if self.ptrs != NULL:
+            for j in range(upto):
+                if self.ptrs[j] != NULL:
+                    C.free(self.ptrs[j])
+                    self.ptrs[j] = NULL
+            PyMem_Free(self.ptrs)
+            self.ptrs = NULL
+        if self.iov != NULL:
+            PyMem_Free(self.iov)
+            self.iov = NULL
+        # With the tables gone no index is valid any more.
+        self.nbuf = 0
+
+    def __dealloc__(self):
+        self._cleanup(self.nbuf)
+
+    @property
+    def n(self):
+        """Number of buffers in the pool."""
+        return self.nbuf
+
+    @property
+    def size(self):
+        """Size in bytes of each buffer."""
+        return self.buf_size
+
+    def addr(self, size_t idx) -> int:
+        """Return the address of buffer ``idx`` as an integer.
+
+        Raises:
+            IndexError: if ``idx`` is not a valid buffer index.
+        """
+        if idx >= self.nbuf:
+            raise IndexError
+        return <uintptr_t>self.ptrs[idx]
+
+    def view(self, size_t idx, Py_ssize_t length):
+        """Return an ``unsigned char`` memoryview over the first ``length`` bytes of buffer ``idx``.
+
+        NOTE: the view is built from a raw pointer and does not hold a
+        reference to this pool - the caller must keep the pool alive for as
+        long as the view is in use.
+
+        Raises:
+            IndexError: if ``idx`` is out of range, or ``length`` is negative
+                or larger than the buffer size.
+        """
+        if idx >= self.nbuf or length < 0 or <size_t>length > self.buf_size:
+            raise IndexError
+        cdef unsigned char[:] mv = <unsigned char[:length]> (<unsigned char *>self.ptrs[idx])
+        return mv
+
+
+cdef class Uring:
+    """Thin wrapper around a liburing ``io_uring`` instance for fixed-buffer reads.
+
+    Args:
+        entries: requested queue depth passed to ``io_uring_queue_init``.
+        flags: setup flags; ``0`` selects
+            ``IORING_SETUP_CLAMP | IORING_SETUP_COOP_TASKRUN | IORING_SETUP_SINGLE_ISSUER``.
+
+    Raises:
+        OSError: if ``io_uring_queue_init`` reports an error.
+    """
+    cdef C.io_uring ring
+    # Held so the registered buffers stay alive as long as the ring does
+    cdef BufferPool pool
+    cdef bint buffers_registered
+    # True only once io_uring_queue_init has succeeded; __dealloc__ must not
+    # tear down a ring that was never set up.
+    cdef bint ring_ready
+
+    def __cinit__(self, unsigned entries=4096, unsigned flags=0):
+        cdef int rc
+        self.pool = None
+        self.buffers_registered = False
+        self.ring_ready = False
+        if flags == 0:
+            flags = C.IORING_SETUP_CLAMP | C.IORING_SETUP_COOP_TASKRUN | C.IORING_SETUP_SINGLE_ISSUER
+        rc = C.io_uring_queue_init(entries, &self.ring, flags)
+        # Record the outcome before raising so __dealloc__ sees the truth.
+        self.ring_ready = rc >= 0
+        _check_errno(rc)
+
+    def __dealloc__(self):
+        # Nothing to undo when queue_init failed: calling queue_exit on an
+        # uninitialised ring struct would operate on garbage.
+        if not self.ring_ready:
+            return
+        if self.buffers_registered:
+            # Best effort: the return code is deliberately ignored here.
+            C.io_uring_unregister_buffers(&self.ring)
+            self.buffers_registered = False
+        C.io_uring_queue_exit(&self.ring)
+        self.ring_ready = False
+
+    def register_buffers(self, BufferPool pool):
+        """Register every buffer of ``pool`` with the ring and keep a reference to it.
+
+        Raises:
+            ValueError: if ``pool`` is None.
+            OSError: if ``io_uring_register_buffers`` reports an error.
+        """
+        if pool is None:
+            raise ValueError("pool is None")
+        _check_errno(C.io_uring_register_buffers(&self.ring, pool.iov, <unsigned>pool.nbuf))
+        self.pool = pool
+        self.buffers_registered = True
+
+    def submit_read_fixed(self, int fd, size_t buf_index, size_t nbytes, long long offset, uint64_t user_data=0):
+        """Queue a read of ``nbytes`` from ``fd`` at ``offset`` into registered buffer ``buf_index``.
+
+        The request is only prepared here; call ``submit()`` to hand queued
+        requests to the kernel. ``user_data`` is attached to the request and
+        returned with its completion.
+
+        Raises:
+            RuntimeError: if no buffers are registered or no SQE is available.
+            IndexError: if ``buf_index`` is out of range.
+            ValueError: if ``nbytes`` exceeds the buffer size.
+        """
+        if not self.buffers_registered:
+            raise RuntimeError("buffers not registered")
+        if buf_index >= self.pool.nbuf:
+            raise IndexError
+        if nbytes > self.pool.buf_size:
+            raise ValueError("nbytes > buffer size")
+
+        cdef C.io_uring_sqe* sqe = C.io_uring_get_sqe(&self.ring)
+        if sqe == NULL:
+            raise RuntimeError("no available SQE (ring full)")
+
+        C.io_uring_prep_read_fixed(sqe, fd, self.pool.ptrs[buf_index],
+                                   <unsigned>nbytes, offset, <int>buf_index)
+        # Use helper instead of touching struct fields
+        C.io_uring_sqe_set_data64(sqe, user_data)
+
+    def submit(self) -> int:
+        """Submit queued requests; returns the value reported by ``io_uring_submit``.
+
+        Raises:
+            OSError: if ``io_uring_submit`` reports an error.
+        """
+        return _check_errno(C.io_uring_submit(&self.ring))
+
+    def wait_cqe(self):
+        """Wait for one completion and return ``(res, user_data)``.
+
+        ``res`` is the raw result field of the completion entry. The entry is
+        marked as seen before returning.
+
+        Raises:
+            OSError: if ``io_uring_wait_cqe`` reports an error.
+        """
+        cdef C.io_uring_cqe* cqe = NULL
+        _check_errno(C.io_uring_wait_cqe(&self.ring, &cqe))
+        res = cqe.res
+        ud = C.io_uring_cqe_get_data64(cqe)
+        C.io_uring_cqe_seen(&self.ring, cqe)
+        return res, ud
+
+    def peek_cqe(self):
+        """Return ``(res, user_data)`` for a ready completion, or ``None``.
+
+        ``None`` is returned whenever ``io_uring_peek_cqe`` does not yield a
+        completion entry, whatever return code it reports.
+        """
+        cdef C.io_uring_cqe* cqe = NULL
+        rc = C.io_uring_peek_cqe(&self.ring, &cqe)
+        if rc == 0 and cqe != NULL:
+            res = cqe.res
+            ud = C.io_uring_cqe_get_data64(cqe)
+            C.io_uring_cqe_seen(&self.ring, cqe)
+            return res, ud
+        return None
+
+
+def open_direct(path: bytes) -> int:
+    """Open O_RDONLY|O_DIRECT|O_CLOEXEC. Caller must close(fd).
+
+    Args:
+        path: file system path as bytes.
+
+    Returns:
+        The file descriptor returned by ``open``.
+
+    Raises:
+        ValueError: if ``path`` contains a NUL byte.
+        OSError: if ``open`` fails; carries ``errno`` and the path.
+    """
+    cdef int fd
+    cdef int err
+    # The C string stops at the first NUL, so an embedded NUL would make
+    # open() act on a different (truncated) path than the caller passed.
+    if b"\x00" in path:
+        raise ValueError("embedded null byte")
+    fd = C.open(<const char*>path, C.O_RDONLY | C.O_DIRECT | C.O_CLOEXEC)
+    if fd < 0:
+        # Read errno straight away, before any other call can overwrite it.
+        err = C.errno
+        raise OSError(err, "open(O_DIRECT) failed", path)
+    return fd
+
+
+def close_fd(int fd):
+    """Close ``fd``; raise OSError carrying errno if ``close`` reports failure."""
+    cdef int status = C.close(fd)
+    if status == 0:
+        return
+    raise OSError(C.errno, "close failed")
@@ -9,7 +9,6 @@
 """
 
 import contextlib
-import ctypes
 import mmap
 import os
 import platform
 
@@ -243,7 +243,7 @@ def parquet_decoder(
     just_schema: bool = False,
     just_statistics: bool = False,
     force_read: bool = False,
-    use_threads: bool = False,
+    use_threads: bool = True,
     statistics: Optional[RelationStatistics] = None,
 ) -> Tuple[int, int, pyarrow.Table]:
     """
@@ -334,21 +334,28 @@ def parquet_decoder(
 
         return statistics
 
-    # If we're here, we can't use rugo - we need to read the file with pyarrow
-
-    # Open the parquet file only once. Prefer pyarrow.BufferReader with a
-    # pyarrow.Buffer when we have a memoryview to avoid creating intermediate
-    # Python bytes objects.
+    # Use rugo's lightweight metadata reader first (faster than pyarrow)
     if isinstance(buffer, memoryview):
-        pa_buf = pyarrow.py_buffer(buffer)
-        stream = pyarrow.BufferReader(pa_buf)
-    elif isinstance(buffer, bytes):
-        stream = pyarrow.BufferReader(buffer)
+        rmeta = parquet_meta.read_metadata_from_memoryview(buffer)
     else:
-        stream = pyarrow.input_stream(buffer)
+        rmeta = parquet_meta.read_metadata_from_memoryview(memoryview(buffer))
 
-    pq_meta = parquet.read_metadata(stream)
-    stream.seek(0)
+    # Build the pieces we need from the rugo metadata
+    # schema names (parquet has same columns across row groups usually)
+    if rmeta.get("row_groups"):
+        schema_names = [c["name"] for c in rmeta["row_groups"][0]["columns"]]
+    else:
+        schema_names = []
+
+    num_rows = rmeta.get("num_rows")
+    # number of columns - try to derive, fallback to length of schema_names
+    num_columns = rmeta.get("num_columns") or len(schema_names)
+
+    # total uncompressed size (rugo uses total_byte_size)
+    uncompressed_size = sum(
+        sum(col.get("total_byte_size", 0) for col in rg.get("columns", []))
+        for rg in rmeta.get("row_groups", [])
+    )
 
     # we need to work out if we have a selection which may force us
     # fetching columns just for filtering
@@ -361,36 +368,21 @@ def parquet_decoder(
     filter_columns = {
         c.value for c in get_all_nodes_of_type(processed_selection, (NodeType.IDENTIFIER,))
     }
-    selected_columns = list(projection_set.union(filter_columns).intersection(pq_meta.schema.names))
+    selected_columns = list(projection_set.union(filter_columns).intersection(schema_names))
 
     # Read all columns if none are selected, unless force_read is set
     if not selected_columns and not force_read:
         selected_columns = []
 
-    # get the full data size of the file to see how effective projection/selection is
-    uncompressed_size = sum(
-        row_group.column(j).total_uncompressed_size
-        for i in range(pq_meta.num_row_groups)
-        for row_group in [pq_meta.row_group(i)]
-        for j in range(row_group.num_columns)
-    )
-
-    # If it's COUNT(*), we don't need to create a full dataset
-    # We have a handler later to sum up the $COUNT(*) column
-    if projection == [] and selection == []:
-        table = pyarrow.Table.from_arrays([[pq_meta.num_rows]], names=["$COUNT(*)"])
-        return (
-            pq_meta.num_rows,
-            pq_meta.num_columns,
-            uncompressed_size,
-            table,
-        )
+    # Open the parquet file only once. Fake a file-like object around the buffer
+    if isinstance(buffer, memoryview):
+        buffer = MemoryViewStream(buffer)
 
     # Read the parquet table with the optimized column list and selection filters
     table = parquet.read_table(
-        stream,
+        buffer,
         columns=selected_columns,
-        pre_buffer=False,
+        pre_buffer=True,
         filters=dnf_filter,
         use_threads=use_threads,
         use_pandas_metadata=False,
@@ -401,8 +393,8 @@ def parquet_decoder(
         table = filter_records(processed_selection, table)
 
     return (
-        pq_meta.num_rows,
-        pq_meta.num_columns,
+        num_rows,
+        num_columns,
         uncompressed_size,
         table,
     )
 
@@ -1,6 +1,6 @@
 [project]
 name = "opteryx"
-version = "0.26.0-beta.1698"
+version = "0.26.0-beta.1701"
 description = "Query your data, where it lives"
 requires-python = '>=3.11'
 readme = {file = "README.md", content-type = "text/markdown"}