Skip to content

Commit d6bda3a

Browse files
committed
perf: draken group by
1 parent 1504ab7 commit d6bda3a

File tree

19 files changed

+33185
-211
lines changed

19 files changed

+33185
-211
lines changed

dev/build_counter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class VersionStatus(Enum):
2929

3030
__major_version__ = 0
3131
__minor_version__ = 6
32-
__revision_version__ = 28
32+
__revision_version__ = 29
3333
__author__ = "@joocer"
3434
__status__ = VersionStatus.RELEASE
3535

opteryx/__version__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 356
4+
__build__ = 359
55
__author__ = "@joocer"
6-
__version__ = "0.6.28"
6+
__version__ = "0.6.29"
77
__lib__ = "opteryx-core"
8-
__build_date__ = "2026-02-24T19:37:10.770016+00:00Z"
8+
__build_date__ = "2026-02-25T21:41:37.549756+00:00Z"
99

1010
# Store the version here so:
1111
# 1) we don't load dependencies by storing it in __init__.py

opteryx/compiled/aggregations/group_by_draken_kernels/10_count_star_int64.pyx

Lines changed: 56 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -32,41 +32,66 @@ cdef class Int64CountStarKernel:
3232
if self._counts.size() == 0 and row_count > 0:
3333
self._counts.reserve(<size_t>(row_count * 2))
3434

35-
if key_nulls == NULL:
36-
for row_idx in range(row_count):
37-
key_value = key_data[row_idx]
38-
self._counts[<uint64_t>key_value] += 1
39-
return True
40-
41-
for row_idx in range(row_count):
42-
if (key_nulls[row_idx >> 3] >> (row_idx & 7)) & 1:
43-
key_value = key_data[row_idx]
44-
self._counts[<uint64_t>key_value] += 1
35+
# perform the tight counting loop without touching Python objects
36+
# - extract locals so we can release the GIL
37+
cdef uint64_t* dptr = <uint64_t*>NULL
38+
cdef uint8_t* nptr = <uint8_t*>NULL
39+
cdef flat_hash_map[uint64_t, int64_t]* cmap
40+
cdef bint seen = self._seen_null
41+
cdef int64_t nulls = self._null_count
42+
43+
dptr = <uint64_t*>key_data
44+
nptr = key_nulls
45+
cmap = &self._counts
46+
47+
with nogil:
48+
if nptr == NULL:
49+
for row_idx in range(row_count):
50+
cmap[0][dptr[row_idx]] += 1
4551
else:
46-
self._seen_null = True
47-
self._null_count += 1
52+
for row_idx in range(row_count):
53+
if (nptr[row_idx >> 3] >> (row_idx & 7)) & 1:
54+
cmap[0][dptr[row_idx]] += 1
55+
else:
56+
seen = True
57+
nulls += 1
58+
59+
# write back the bits that had to live under the GIL
60+
self._seen_null = seen
61+
self._null_count = nulls
4862
return True
4963

64+
# finalize_rows is the slow, fully‑Python path; we keep it for
65+
# compatibility but it's only invoked if the caller cannot consume
66+
# the fast columns returned by ``finalize_fast_columns``.
5067
cpdef list finalize_rows(self):
68+
# delegate to fast columns and then convert to Python list in a single
69+
# loop; this avoids repeated append() calls in the Python layer.
70+
cdef object res
5171
cdef list rows
5272
cdef flat_hash_map[uint64_t, int64_t].iterator count_it
73+
cdef object keys, counts
5374

54-
if self._counts.size() == 0 and not self._seen_null:
55-
return []
56-
57-
rows = []
58-
count_it = self._counts.begin()
59-
while count_it != self._counts.end():
60-
rows.append(
61-
(
62-
(<int64_t>dereference(count_it).first,),
63-
[dereference(count_it).second],
75+
res = self.finalize_fast_columns()
76+
if res is None:
77+
# nulls present; fall back to the old behaviour for correctness
78+
rows = []
79+
count_it = self._counts.begin()
80+
while count_it != self._counts.end():
81+
rows.append(
82+
(
83+
(<int64_t>dereference(count_it).first,),
84+
[dereference(count_it).second],
85+
)
6486
)
65-
)
66-
preincrement(count_it)
67-
if self._seen_null:
68-
rows.append(((None,), [self._null_count]))
69-
return rows
87+
preincrement(count_it)
88+
if self._seen_null:
89+
rows.append(((None,), [self._null_count]))
90+
return rows
91+
else:
92+
# res is (keys, counts) tuple
93+
keys, counts = res
94+
return [( (k,), [c] ) for k, c in zip(keys, counts)]
7095

7196
cpdef object finalize_fast_columns(self):
7297
cdef Py_ssize_t n
@@ -88,6 +113,10 @@ cdef class Int64CountStarKernel:
88113

89114
count_it = self._counts.begin()
90115
idx = 0
116+
# iterate over the C++ map while holding the GIL; the body does not
117+
# perform any Python calls so the loop is already very tight. if
118+
# desired this could be executed nogil, but the improvement is
119+
# negligible compared to the cost of the aggregation itself.
91120
while count_it != self._counts.end():
92121
key_view[idx] = <int64_t>dereference(count_it).first
93122
count_view[idx] = dereference(count_it).second

0 commit comments

Comments
 (0)