@@ -32,41 +32,66 @@ cdef class Int64CountStarKernel:
3232 if self ._counts.size() == 0 and row_count > 0 :
3333 self ._counts.reserve(< size_t> (row_count * 2 ))
3434
35- if key_nulls == NULL :
36- for row_idx in range (row_count):
37- key_value = key_data[row_idx]
38- self ._counts[< uint64_t> key_value] += 1
39- return True
40-
41- for row_idx in range (row_count):
42- if (key_nulls[row_idx >> 3 ] >> (row_idx & 7 )) & 1 :
43- key_value = key_data[row_idx]
44- self ._counts[< uint64_t> key_value] += 1
35+ # perform the tight counting loop without touching Python objects
36+ # - extract locals so we can release the GIL
37+ cdef uint64_t* dptr = < uint64_t* > NULL
38+ cdef uint8_t* nptr = < uint8_t* > NULL
39+ cdef flat_hash_map[uint64_t, int64_t]* cmap
40+ cdef bint seen = self ._seen_null
41+ cdef int64_t nulls = self ._null_count
42+
43+ dptr = < uint64_t* > key_data
44+ nptr = key_nulls
45+ cmap = & self ._counts
46+
47+ with nogil:
48+ if nptr == NULL :
49+ for row_idx in range (row_count):
50+ cmap[0 ][dptr[row_idx]] += 1
4551 else :
46- self ._seen_null = True
47- self ._null_count += 1
52+ for row_idx in range (row_count):
53+ if (nptr[row_idx >> 3 ] >> (row_idx & 7 )) & 1 :
54+ cmap[0 ][dptr[row_idx]] += 1
55+ else :
56+ seen = True
57+ nulls += 1
58+
59+ # write back the bits that had to live under the GIL
60+ self ._seen_null = seen
61+ self ._null_count = nulls
4862 return True
4963
64+ # finalize_rows is the slow, fully‑Python path; we keep it for
65+ # compatibility but it's only invoked if the caller cannot consume
66+ # the fast columns returned by ``finalize_fast_columns``.
5067 cpdef list finalize_rows(self ):
68+ # delegate to fast columns and then convert to Python list in a single
69+ # loop; this avoids repeated append() calls in the Python layer.
70+ cdef object res
5171 cdef list rows
5272 cdef flat_hash_map[uint64_t, int64_t].iterator count_it
73+ cdef object keys, counts
5374
54- if self ._counts.size() == 0 and not self ._seen_null:
55- return []
56-
57- rows = []
58- count_it = self ._counts.begin()
59- while count_it != self ._counts.end():
60- rows.append(
61- (
62- (< int64_t> dereference(count_it).first,),
63- [dereference(count_it).second],
75+ res = self .finalize_fast_columns()
76+ if res is None :
77+ # nulls present; fall back to the old behaviour for correctness
78+ rows = []
79+ count_it = self ._counts.begin()
80+ while count_it != self ._counts.end():
81+ rows.append(
82+ (
83+ (< int64_t> dereference(count_it).first,),
84+ [dereference(count_it).second],
85+ )
6486 )
65- )
66- preincrement(count_it)
67- if self ._seen_null:
68- rows.append(((None ,), [self ._null_count]))
69- return rows
87+ preincrement(count_it)
88+ if self ._seen_null:
89+ rows.append(((None ,), [self ._null_count]))
90+ return rows
91+ else :
92+ # res is (keys, counts) tuple
93+ keys, counts = res
94+ return [( (k,), [c] ) for k, c in zip (keys, counts)]
7095
7196 cpdef object finalize_fast_columns(self ):
7297 cdef Py_ssize_t n
@@ -88,6 +113,10 @@ cdef class Int64CountStarKernel:
88113
89114 count_it = self ._counts.begin()
90115 idx = 0
116+ # iterate over the C++ map while holding the GIL; the body does not
117+ # perform any Python calls so the loop is already very tight. if
118+ # desired this could be executed nogil, but the improvement is
119+ # negligible compared to the cost of the aggregation itself.
91120 while count_it != self ._counts.end():
92121 key_view[idx] = < int64_t> dereference(count_it).first
93122 count_view[idx] = dereference(count_it).second
0 commit comments