38 changes: 38 additions & 0 deletions dev/documents/list-hash-fastpath.md
@@ -0,0 +1,38 @@
List hashing fast-path — opteryx/compiled/table_ops/hash_ops

Purpose
- Document when the list hashing implementation takes a buffer-aware fast path (no Python object allocation) and when it falls back to per-element Python hashing.

Where it lives
- Implementation: `opteryx/compiled/table_ops/hash_ops.pyx` — function `process_list_chunk`.
- Tests: `tests/unit/diagnostic/test_list_fast_paths.py`.

Fast-path conditions
- The list handler will use buffer-aware, zero-Python-object inner loops when the list child type is one of:
- integer types (signed/unsigned, fixed-width)
- floating point types
- temporal types (timestamps/dates)
- string or binary child types (string buffers + offsets)

- For these child types the code reads the child buffers directly and computes element hashes without creating Python objects, which gives a large performance win for dense numeric and string lists (a rough dispatch sketch follows).
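
A minimal sketch of the dispatch test, written in plain Python over pyarrow's type predicates rather than the Cython internals; the function name is illustrative:

```python
import pyarrow as pa
import pyarrow.types as pat

def child_qualifies_for_fast_path(list_type: pa.DataType) -> bool:
    # Hypothetical predicate mirroring the conditions listed above.
    child = list_type.value_type
    return (
        pat.is_integer(child)       # signed/unsigned fixed-width integers
        or pat.is_floating(child)   # floating point
        or pat.is_temporal(child)   # timestamps, dates, times
        or pat.is_string(child)     # string buffers + offsets
        or pat.is_binary(child)
    )

assert child_qualifies_for_fast_path(pa.list_(pa.int64()))
assert not child_qualifies_for_fast_path(pa.list_(pa.struct([("a", pa.int32())])))
```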

Fallback cases
- If the list child type is a complex or unrecognized Arrow type (for example structs, maps, or arbitrary Python objects), the implementation falls back to slicing the child array and calling Python-level hashing for each element; this is correct but slower (sketched below).
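
A rough sketch of the fallback shape, in plain Python; the real code works on Arrow slices, and `str()` here is only a stand-in for the per-element hash the implementation computes:

```python
import pyarrow as pa

def hash_lists_slow(arr: pa.ListArray) -> list:
    # Fallback: materialise each element as a Python object and hash it.
    hashes = []
    for scalar in arr:           # yields pyarrow ListScalar objects
        if not scalar.is_valid:
            hashes.append(None)  # null handling shown here is illustrative
        else:
            # str() sidesteps unhashable nesting (dicts, lists) and stands
            # in for whatever element hash the real implementation uses
            hashes.append(hash(str(scalar.as_py())))
    return hashes
```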

Correctness notes
- All paths account for the Arrow `chunk.offset` on both the parent list array and the child array. Validity bitmaps are checked with proper bit/byte arithmetic (see the sketch after this list).
- 8-byte primitive loads go through `memcpy` into a local `uint64_t` to avoid unaligned memory reads.
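
The bitmap arithmetic is standard Arrow (LSB-numbered bits within each byte); a Python rendering of both checks, with `struct.unpack_from` standing in for the `memcpy`-into-`uint64_t` trick:

```python
import struct

def bit_is_set(validity: bytes, chunk_offset: int, i: int) -> bool:
    # Arrow validity bitmaps are LSB-first: map the logical position to a
    # byte index (pos // 8) and a bit index within that byte (pos % 8).
    pos = chunk_offset + i
    return (validity[pos >> 3] >> (pos & 7)) & 1 == 1

def load_u64(buf: bytes, byte_offset: int) -> int:
    # Analogue of the memcpy trick: copy the 8 bytes out rather than
    # reinterpreting possibly unaligned memory in place (little-endian).
    return struct.unpack_from("<Q", buf, byte_offset)[0]
```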

Testing and benchmarks
- Unit tests in `tests/unit/diagnostic/test_list_fast_paths.py` validate parity between flat and chunked arrays, plus basic correctness for nested and boolean lists (the parity idea is sketched after this list).
- Benchmarks live in `tests/performance/benchmarks/bench_hash_ops.py`.
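
The parity idea in miniature (a hypothetical harness; the real tests call `process_list_chunk` directly and compare hash outputs):

```python
import pyarrow as pa

data = [[1, 2], [3], None, [4, 5, 6]]
flat = pa.array(data, type=pa.list_(pa.int64()))
# slicing produces chunks with non-zero offsets, exercising the
# chunk.offset handling described under "Correctness notes"
chunked = pa.chunked_array([flat.slice(0, 2), flat.slice(2)])
```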

When to extend
- If nested lists of primitives are common in your workloads, consider a dedicated stack-based nested-list fast path to avoid repeated `slice()` allocations (a rough sketch follows this list).
- If child types are frequently small fixed-width types, further micro-optimizations (incrementally advancing bit/byte pointers rather than recomputing shifts) can pay off.
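
One possible shape for that nested fast path, sketched with plain offset arrays; everything here is an assumption about a future implementation, not existing code:

```python
def iter_leaf_indices(offsets_by_level, row):
    # offsets_by_level: one Arrow-style offsets array per nesting level,
    # outermost first. An explicit stack replaces recursion and slice()
    # allocations; children are pushed in reverse to keep leaf order.
    stack = [(0, offsets_by_level[0][row], offsets_by_level[0][row + 1])]
    while stack:
        level, lo, hi = stack.pop()
        if level + 1 < len(offsets_by_level):
            nxt = offsets_by_level[level + 1]
            for j in range(hi - 1, lo - 1, -1):
                stack.append((level + 1, nxt[j], nxt[j + 1]))
        else:
            for j in range(lo, hi):
                yield j  # the real code would hash leaf_values[j] here

# list<list<int64>>: row 0 spans inner lists 0..1, whose leaves are 0..2
assert list(iter_leaf_indices([[0, 2, 3], [0, 1, 3, 6]], 0)) == [0, 1, 2]
```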

"Why not always buffer-aware?"
- Some Arrow child types are not stored as simple contiguous buffers addressable by offset arithmetic (for example, structs and other nested or variable-width complex types). For those, the safe and correct approach is to create Python objects and hash them.

Contact
- If you have a representative large dataset that still performs poorly, attach it or a small reproducer and I'll benchmark and iterate.
4 changes: 2 additions & 2 deletions opteryx/__version__.py
@@ -1,9 +1,9 @@
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
# DO NOT EDIT THIS FILE DIRECTLY

__build__ = 1666
__build__ = 1676
__author__ = "@joocer"
__version__ = "0.26.0-beta.1666"
__version__ = "0.26.0-beta.1676"

# Store the version here so:
# 1) we don't load dependencies by storing it in __init__.py
4 changes: 4 additions & 0 deletions opteryx/compiled/structures/buffers.pxd
@@ -17,6 +17,7 @@ cdef extern from "intbuffer.h" namespace "":
CIntBuffer(size_t size_hint)
void append(int64_t value)
void extend(const vector[int64_t]& values)
void extend(const int64_t* values, size_t count)
const int64_t* data() const
size_t size() const

@@ -29,3 +30,6 @@
cpdef void extend(self, iterable)
cpdef numpy.ndarray[int64_t, ndim=1] to_numpy(self)
cpdef size_t size(self)
cpdef void extend_numpy(self, numpy.ndarray[int64_t, ndim=1] arr)
cpdef void reserve(self, size_t capacity)
cpdef void append_batch(self, int64_t[::1] values)
75 changes: 52 additions & 23 deletions opteryx/compiled/structures/buffers.pyx
@@ -13,52 +13,81 @@ numpy.import_array()

from libc.stdint cimport int64_t
from libcpp.vector cimport vector
from libc.string cimport memcpy

cdef extern from "intbuffer.h":
cdef cppclass CIntBuffer:

CIntBuffer(size_t size_hint)
inline void append(int64_t value)
inline void extend(const vector[int64_t]& values)
inline const int64_t* data() const
inline size_t size()
void append(int64_t value) nogil
void extend(const vector[int64_t]& values) nogil
void extend(const int64_t* data, size_t count) nogil
const int64_t* data() nogil
size_t size() nogil

cdef class IntBuffer:
"""
Python wrapper for the C++ IntBuffer class.
"""
#cdef CIntBuffer* c_buffer

def __cinit__(self, size_hint: int = 1024):
def __cinit__(self, size_t size_hint = 1024):
self.c_buffer = new CIntBuffer(size_hint)

def __dealloc__(self):
del self.c_buffer

cpdef void append(self, int64_t value):
""" Append an integer to the buffer. """
"""Append an integer to the buffer."""
self.c_buffer.append(value)

cpdef void extend(self, iterable):
""" Extend the buffer with an iterable of integers. """
cdef vector[int64_t] values = iterable
self.c_buffer.extend(values)
cpdef void append_batch(self, int64_t[::1] values):
"""Append a batch of values efficiently."""
cdef size_t n = values.shape[0]
if n > 0:
self.c_buffer.extend(&values[0], n)

cpdef void extend(self, object iterable):
"""Extend the buffer with an iterable of integers."""
# Fast path for numpy arrays
if isinstance(iterable, numpy.ndarray):
arr = numpy.ascontiguousarray(iterable, dtype=numpy.int64)
self.extend_numpy(arr)
return

# Fast path for lists/tuples - pre-allocate and copy
cdef size_t estimated_size
estimated_size = len(iterable)

cdef vector[int64_t] vec
if estimated_size > 1000: # For large iterables, use vector
vec.reserve(estimated_size)
for item in iterable:
vec.push_back(item)
self.c_buffer.extend(vec)
else:
# Small iterables - just append one by one
for item in iterable:
self.c_buffer.append(item)

cpdef void extend_numpy(self, numpy.ndarray[int64_t, ndim=1] arr):
"""Extend with numpy array - fastest method."""
cdef size_t n = arr.shape[0]
if n > 0:
self.c_buffer.extend(&arr[0], n)

cpdef numpy.ndarray[int64_t, ndim=1] to_numpy(self):
""" Convert the buffer to a NumPy array by copying data. """
"""Convert the buffer to a NumPy array using memcpy."""
cdef size_t size = self.c_buffer.size()
cdef const int64_t* data_ptr = self.c_buffer.data()

if size == 0:
return numpy.empty(0, dtype=numpy.int64) # Handle empty buffer case
return numpy.empty(0, dtype=numpy.int64)

# Allocate a NumPy array and copy data
arr = numpy.empty(size, dtype=numpy.int64)
cdef int64_t[::1] arr_view = arr
for i in range(size):
arr_view[i] = data_ptr[i] # Copy values manually
cdef const int64_t* data_ptr = self.c_buffer.data()
cdef numpy.ndarray[int64_t, ndim=1] arr = numpy.empty(size, dtype=numpy.int64)

memcpy(<void*>&arr[0], <const void*>data_ptr, size * sizeof(int64_t))
return arr

cpdef size_t size(self):
return self.c_buffer.size()

cpdef void reserve(self, size_t capacity):
"""Reserve capacity for future appends."""
# We'll need to add this method to the C++ class
pass