38 changes: 38 additions & 0 deletions dev/documents/list-hash-fastpath.md
@@ -0,0 +1,38 @@
List hashing fast-path — opteryx/compiled/table_ops/hash_ops

Purpose
- Document when the list hashing implementation takes a buffer-aware fast path (no Python object allocation) and when it falls back to per-element Python hashing.

Where it lives
- Implementation: `opteryx/compiled/table_ops/hash_ops.pyx` — function `process_list_chunk`.
- Tests: `tests/unit/diagnostic/test_list_fast_paths.py`.

Fast-path conditions
- The list handler will use buffer-aware, zero-Python-object inner loops when the list child type is one of:
- integer types (signed/unsigned, fixed-width)
- floating point types
- temporal types (timestamps/dates)
- string or binary child types (string buffers + offsets)

- For these child types the code reads the child buffers directly and computes element hashes without creating Python objects, which gives a large performance win for dense numeric and string lists (a rough dispatch sketch follows).
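
A minimal sketch of the dispatch test, written in plain Python over pyarrow's type predicates rather than the Cython internals; the function name is illustrative:

```python
import pyarrow as pa
import pyarrow.types as pat

def child_qualifies_for_fast_path(list_type: pa.DataType) -> bool:
    # Hypothetical predicate mirroring the conditions listed above.
    child = list_type.value_type
    return (
        pat.is_integer(child)       # signed/unsigned fixed-width integers
        or pat.is_floating(child)   # floating point
        or pat.is_temporal(child)   # timestamps, dates, times
        or pat.is_string(child)     # string buffers + offsets
        or pat.is_binary(child)
    )

assert child_qualifies_for_fast_path(pa.list_(pa.int64()))
assert not child_qualifies_for_fast_path(pa.list_(pa.struct([("a", pa.int32())])))
```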

Fallback cases
- If the list child type is a complex or unrecognized Arrow type (for example structs, maps, or arbitrary Python objects), the implementation falls back to slicing the child array and calling Python-level hashing for each element; this is correct but slower (sketched below).
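
A rough sketch of the fallback shape, in plain Python; the real code works on Arrow slices, and `str()` here is only a stand-in for the per-element hash the implementation computes:

```python
import pyarrow as pa

def hash_lists_slow(arr: pa.ListArray) -> list:
    # Fallback: materialise each element as a Python object and hash it.
    hashes = []
    for scalar in arr:           # yields pyarrow ListScalar objects
        if not scalar.is_valid:
            hashes.append(None)  # null handling shown here is illustrative
        else:
            # str() sidesteps unhashable nesting (dicts, lists) and stands
            # in for whatever element hash the real implementation uses
            hashes.append(hash(str(scalar.as_py())))
    return hashes
```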

Correctness notes
- All paths account for the Arrow `chunk.offset` on both the parent list array and the child array. Validity bitmaps are checked with proper bit/byte arithmetic (see the sketch after this list).
- 8-byte primitive loads go through `memcpy` into a local `uint64_t` to avoid unaligned memory reads.
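
The bitmap arithmetic is standard Arrow (LSB-numbered bits within each byte); a Python rendering of both checks, with `struct.unpack_from` standing in for the `memcpy`-into-`uint64_t` trick:

```python
import struct

def bit_is_set(validity: bytes, chunk_offset: int, i: int) -> bool:
    # Arrow validity bitmaps are LSB-first: map the logical position to a
    # byte index (pos // 8) and a bit index within that byte (pos % 8).
    pos = chunk_offset + i
    return (validity[pos >> 3] >> (pos & 7)) & 1 == 1

def load_u64(buf: bytes, byte_offset: int) -> int:
    # Analogue of the memcpy trick: copy the 8 bytes out rather than
    # reinterpreting possibly unaligned memory in place (little-endian).
    return struct.unpack_from("<Q", buf, byte_offset)[0]
```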

Testing and benchmarks
- Unit tests in `tests/unit/diagnostic/test_list_fast_paths.py` validate parity between flat and chunked arrays, plus basic correctness for nested and boolean lists (the parity idea is sketched after this list).
- Benchmarks live in `tests/performance/benchmarks/bench_hash_ops.py`.
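
The parity idea in miniature (a hypothetical harness; the real tests call `process_list_chunk` directly and compare hash outputs):

```python
import pyarrow as pa

data = [[1, 2], [3], None, [4, 5, 6]]
flat = pa.array(data, type=pa.list_(pa.int64()))
# slicing produces chunks with non-zero offsets, exercising the
# chunk.offset handling described under "Correctness notes"
chunked = pa.chunked_array([flat.slice(0, 2), flat.slice(2)])
```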

When to extend
- If nested lists of primitives are common in your workloads, consider a dedicated stack-based nested-list fast path to avoid repeated `slice()` allocations (a rough sketch follows this list).
- If child types are frequently small fixed-width types, further micro-optimizations (incrementally advancing bit/byte pointers rather than recomputing shifts) can pay off.
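
One possible shape for that nested fast path, sketched with plain offset arrays; everything here is an assumption about a future implementation, not existing code:

```python
def iter_leaf_indices(offsets_by_level, row):
    # offsets_by_level: one Arrow-style offsets array per nesting level,
    # outermost first. An explicit stack replaces recursion and slice()
    # allocations; children are pushed in reverse to keep leaf order.
    stack = [(0, offsets_by_level[0][row], offsets_by_level[0][row + 1])]
    while stack:
        level, lo, hi = stack.pop()
        if level + 1 < len(offsets_by_level):
            nxt = offsets_by_level[level + 1]
            for j in range(hi - 1, lo - 1, -1):
                stack.append((level + 1, nxt[j], nxt[j + 1]))
        else:
            for j in range(lo, hi):
                yield j  # the real code would hash leaf_values[j] here

# list<list<int64>>: row 0 spans inner lists 0..1, whose leaves are 0..2
assert list(iter_leaf_indices([[0, 2, 3], [0, 1, 3, 6]], 0)) == [0, 1, 2]
```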

"Why not always buffer-aware?"
- Some Arrow child types are not stored as simple contiguous buffers addressable by offset arithmetic (for example, structs and other nested or variable-width complex types). For those, the safe and correct approach is to create Python objects and hash them.

Contact
- If you have a representative large dataset that still performs poorly, attach it or a small reproducer and I'll benchmark and iterate.
4 changes: 2 additions & 2 deletions opteryx/__version__.py
@@ -1,9 +1,9 @@
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
# DO NOT EDIT THIS FILE DIRECTLY

__build__ = 1666
__build__ = 1676
__author__ = "@joocer"
__version__ = "0.26.0-beta.1666"
__version__ = "0.26.0-beta.1676"

# Store the version here so:
# 1) we don't load dependencies by storing it in __init__.py
4 changes: 4 additions & 0 deletions opteryx/compiled/structures/buffers.pxd
@@ -17,6 +17,7 @@ cdef extern from "intbuffer.h" namespace "":
CIntBuffer(size_t size_hint)
void append(int64_t value)
void extend(const vector[int64_t]& values)
void extend(const int64_t* values, size_t count)
const int64_t* data() const
size_t size() const

@@ -29,3 +30,6 @@
cpdef void extend(self, iterable)
cpdef numpy.ndarray[int64_t, ndim=1] to_numpy(self)
cpdef size_t size(self)
cpdef void extend_numpy(self, numpy.ndarray[int64_t, ndim=1] arr)
cpdef void reserve(self, size_t capacity)
cpdef void append_batch(self, int64_t[::1] values)
75 changes: 52 additions & 23 deletions opteryx/compiled/structures/buffers.pyx
@@ -13,52 +13,81 @@ numpy.import_array()

from libc.stdint cimport int64_t
from libcpp.vector cimport vector
from libc.string cimport memcpy

cdef extern from "intbuffer.h":
cdef cppclass CIntBuffer:

CIntBuffer(size_t size_hint)
inline void append(int64_t value)
inline void extend(const vector[int64_t]& values)
inline const int64_t* data() const
inline size_t size()
void append(int64_t value) nogil
void extend(const vector[int64_t]& values) nogil
void extend(const int64_t* data, size_t count) nogil
const int64_t* data() nogil
size_t size() nogil

cdef class IntBuffer:
"""
Python wrapper for the C++ IntBuffer class.
"""
#cdef CIntBuffer* c_buffer

def __cinit__(self, size_hint: int = 1024):
def __cinit__(self, size_t size_hint = 1024):
self.c_buffer = new CIntBuffer(size_hint)

def __dealloc__(self):
del self.c_buffer

cpdef void append(self, int64_t value):
""" Append an integer to the buffer. """
"""Append an integer to the buffer."""
self.c_buffer.append(value)

cpdef void extend(self, iterable):
""" Extend the buffer with an iterable of integers. """
cdef vector[int64_t] values = iterable
self.c_buffer.extend(values)
cpdef void append_batch(self, int64_t[::1] values):
"""Append a batch of values efficiently."""
cdef size_t n = values.shape[0]
if n > 0:
self.c_buffer.extend(&values[0], n)

cpdef void extend(self, object iterable):
"""Extend the buffer with an iterable of integers."""
# Fast path for numpy arrays
if isinstance(iterable, numpy.ndarray):
arr = numpy.ascontiguousarray(iterable, dtype=numpy.int64)
self.extend_numpy(arr)
return

# Fast path for lists/tuples - pre-allocate and copy
cdef size_t estimated_size
estimated_size = len(iterable)

cdef vector[int64_t] vec
if estimated_size > 1000: # For large iterables, use vector
vec.reserve(estimated_size)
for item in iterable:
vec.push_back(item)
self.c_buffer.extend(vec)
else:
# Small iterables - just append one by one
for item in iterable:
self.c_buffer.append(item)

cpdef void extend_numpy(self, numpy.ndarray[int64_t, ndim=1] arr):
"""Extend with numpy array - fastest method."""
cdef size_t n = arr.shape[0]
if n > 0:
self.c_buffer.extend(&arr[0], n)

cpdef numpy.ndarray[int64_t, ndim=1] to_numpy(self):
""" Convert the buffer to a NumPy array by copying data. """
"""Convert the buffer to a NumPy array using memcpy."""
cdef size_t size = self.c_buffer.size()
cdef const int64_t* data_ptr = self.c_buffer.data()

if size == 0:
return numpy.empty(0, dtype=numpy.int64) # Handle empty buffer case
return numpy.empty(0, dtype=numpy.int64)

# Allocate a NumPy array and copy data
arr = numpy.empty(size, dtype=numpy.int64)
cdef int64_t[::1] arr_view = arr
for i in range(size):
arr_view[i] = data_ptr[i] # Copy values manually
cdef const int64_t* data_ptr = self.c_buffer.data()
cdef numpy.ndarray[int64_t, ndim=1] arr = numpy.empty(size, dtype=numpy.int64)

memcpy(<void*>&arr[0], <const void*>data_ptr, size * sizeof(int64_t))
return arr

cpdef size_t size(self):
return self.c_buffer.size()

cpdef void reserve(self, size_t capacity):
"""Reserve capacity for future appends."""
# We'll need to add this method to the C++ class
pass