mabel-dev
diff --git a/‎dev/build_counter.py‎
Lines changed: 1 addition & 1 deletion b/‎dev/build_counter.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎opteryx/__version__.py‎
Lines changed: 3 additions & 3 deletions b/‎opteryx/__version__.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎opteryx/compiled/aggregations/group_by_draken_kernels/10_count_star_int64.pyx‎
Lines changed: 56 additions & 27 deletions b/‎opteryx/compiled/aggregations/group_by_draken_kernels/10_count_star_int64.pyx‎
Lines changed: 56 additions & 27 deletions
@@ -29,7 +29,7 @@ class VersionStatus(Enum):
 
 __major_version__ = 0
 __minor_version__ = 6
-__revision_version__ = 28
+__revision_version__ = 29
 __author__ = "@joocer"
 __status__ = VersionStatus.RELEASE
 
 
@@ -1,11 +1,11 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY
 
-__build__ = 356
+__build__ = 359
 __author__ = "@joocer"
-__version__ = "0.6.28"
+__version__ = "0.6.29"
 __lib__ = "opteryx-core"
-__build_date__ = "2026-02-24T19:37:10.770016+00:00Z"
+__build_date__ = "2026-02-25T21:41:37.549756+00:00Z"
 
 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py
 
@@ -32,41 +32,66 @@ cdef class Int64CountStarKernel:
         if self._counts.size() == 0 and row_count > 0:
             self._counts.reserve(<size_t>(row_count * 2))
 
-        if key_nulls == NULL:
-            for row_idx in range(row_count):
-                key_value = key_data[row_idx]
-                self._counts[<uint64_t>key_value] += 1
-            return True
-
-        for row_idx in range(row_count):
-            if (key_nulls[row_idx >> 3] >> (row_idx & 7)) & 1:
-                key_value = key_data[row_idx]
-                self._counts[<uint64_t>key_value] += 1
+        # perform the tight counting loop without touching Python objects
+        # - extract locals so we can release the GIL
+        cdef uint64_t* dptr = <uint64_t*>NULL
+        cdef uint8_t* nptr = <uint8_t*>NULL
+        cdef flat_hash_map[uint64_t, int64_t]* cmap
+        cdef bint seen = self._seen_null
+        cdef int64_t nulls = self._null_count
+
+        dptr = <uint64_t*>key_data
+        nptr = key_nulls
+        cmap = &self._counts
+
+        with nogil:
+            if nptr == NULL:
+                for row_idx in range(row_count):
+                    cmap[0][dptr[row_idx]] += 1
             else:
-                self._seen_null = True
-                self._null_count += 1
+                for row_idx in range(row_count):
+                    if (nptr[row_idx >> 3] >> (row_idx & 7)) & 1:
+                        cmap[0][dptr[row_idx]] += 1
+                    else:
+                        seen = True
+                        nulls += 1
+
+        # the GIL is held again: copy the locally accumulated null state back to self
+        self._seen_null = seen
+        self._null_count = nulls
         return True
 
+    # finalize_rows is the slow, fully-Python path; we keep it for
+    # compatibility, but it is only invoked if the caller cannot consume
+    # the fast columns returned by ``finalize_fast_columns``.
     cpdef list finalize_rows(self):
+        # delegate to the fast columns and build the list in one comprehension,
+        # avoiding per-row append() calls; the append loop is only the fallback.
+        cdef object res
         cdef list rows
         cdef flat_hash_map[uint64_t, int64_t].iterator count_it
+        cdef object keys, counts
 
-        if self._counts.size() == 0 and not self._seen_null:
-            return []
-
-        rows = []
-        count_it = self._counts.begin()
-        while count_it != self._counts.end():
-            rows.append(
-                (
-                    (<int64_t>dereference(count_it).first,),
-                    [dereference(count_it).second],
+        res = self.finalize_fast_columns()
+        if res is None:
+            # nulls present; fall back to the old behaviour for correctness
+            rows = []
+            count_it = self._counts.begin()
+            while count_it != self._counts.end():
+                rows.append(
+                    (
+                        (<int64_t>dereference(count_it).first,),
+                        [dereference(count_it).second],
+                    )
                 )
-            )
-            preincrement(count_it)
-        if self._seen_null:
-            rows.append(((None,), [self._null_count]))
-        return rows
+                preincrement(count_it)
+            if self._seen_null:
+                rows.append(((None,), [self._null_count]))
+            return rows
+        else:
+            # res is (keys, counts) tuple
+            keys, counts = res
+            return [( (k,), [c] ) for k, c in zip(keys, counts)]
 
     cpdef object finalize_fast_columns(self):
         cdef Py_ssize_t n
@@ -88,6 +113,10 @@ cdef class Int64CountStarKernel:
 
         count_it = self._counts.begin()
         idx = 0
+        # Iterate over the C++ map while holding the GIL; the body does not
+        # perform any Python calls, so the loop is already very tight. If
+        # desired, this could be executed nogil, but the improvement would be
+        # negligible compared to the cost of the aggregation itself.
         while count_it != self._counts.end():
             key_view[idx] = <int64_t>dereference(count_it).first
             count_view[idx] = dereference(count_it).second