Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ cdef class ObjectFactorizer(Factorizer):
"""
cdef:
ndarray[intp_t] labels
bint seen_na

if mask is not None:
raise NotImplementedError("mask not supported for ObjectFactorizer.")
Expand All @@ -119,7 +120,7 @@ cdef class ObjectFactorizer(Factorizer):
uniques = ObjectVector()
uniques.extend(self.uniques.to_array())
self.uniques = uniques
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel, na_value)
labels, seen_na = self.table.get_labels(values, self.uniques,
self.count, na_sentinel, na_value)
self.count = len(self.uniques)
return labels
return labels, seen_na
47 changes: 31 additions & 16 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,8 @@ cdef class {{name}}HashTable(HashTable):
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
object mask=None, bint return_inverse=False, bint use_result_mask=False):
object mask=None, bint return_inverse=False, bint use_result_mask=False,
bint return_labels_only=False):
"""
Calculate unique values and labels (no sorting!)

Expand Down Expand Up @@ -684,6 +685,7 @@ cdef class {{name}}HashTable(HashTable):
if ignore_na and use_mask:
if mask_values[i]:
labels[i] = na_sentinel
seen_na = True
continue
elif ignore_na and (
is_nan_{{c_type}}(val) or
Expand All @@ -693,6 +695,7 @@ cdef class {{name}}HashTable(HashTable):
# ignore_na is True), skip the hashtable entry for them,
# and replace the corresponding label with na_sentinel
labels[i] = na_sentinel
seen_na = True
continue
elif not ignore_na and use_result_mask:
if mask_values[i]:
Expand Down Expand Up @@ -749,6 +752,8 @@ cdef class {{name}}HashTable(HashTable):
idx = self.table.vals[k]
labels[i] = idx

if return_inverse and return_labels_only:
return labels.base, seen_na # .base -> underlying ndarray
if return_inverse:
return uniques.to_array(), labels.base # .base -> underlying ndarray
if use_result_mask:
Expand Down Expand Up @@ -824,10 +829,11 @@ cdef class {{name}}HashTable(HashTable):
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, object mask=None):
# -> np.ndarray[np.intp]
_, labels = self._unique(values, uniques, count_prior=count_prior,
labels, seen_na = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True, mask=mask)
return labels
ignore_na=True, return_inverse=True, mask=mask,
return_labels_only=True)
return labels, seen_na

{{if dtype == 'int64'}}
@cython.boundscheck(False)
Expand Down Expand Up @@ -904,16 +910,17 @@ cdef class {{name}}Factorizer(Factorizer):
"""
cdef:
ndarray[intp_t] labels
bint seen_na

if self.uniques.external_view_exists:
uniques = {{name}}Vector()
uniques.extend(self.uniques.to_array())
self.uniques = uniques
labels = self.table.get_labels(values, self.uniques,
labels, seen_na = self.table.get_labels(values, self.uniques,
self.count, na_sentinel,
na_value=na_value, mask=mask)
self.count = len(self.uniques)
return labels
return labels, seen_na

{{endfor}}

Expand Down Expand Up @@ -1080,7 +1087,7 @@ cdef class StringHashTable(HashTable):
def _unique(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
bint return_inverse=False):
bint return_inverse=False, bint return_labels_only=False):
"""
Calculate unique values and labels (no sorting!)

Expand Down Expand Up @@ -1123,7 +1130,7 @@ cdef class StringHashTable(HashTable):
const char *v
const char **vecs
khiter_t k
bint use_na_value
bint use_na_value, seen_na = False

if return_inverse:
labels = np.zeros(n, dtype=np.intp)
Expand All @@ -1142,6 +1149,7 @@ cdef class StringHashTable(HashTable):
# ignore_na is True), we can skip the actual value, and
# replace the label with na_sentinel directly
labels[i] = na_sentinel
seen_na = True
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of trying to find this iterating by element can we get the same performance by querying up front for missing values? That approach would work better in the future where we are more arrow based

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wouldn't we expect a two-pass implementation to be slower? I don't think we should be making decisions based on a potential arrow-based future since that would likely need a completely separate implementation anyway

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arrow strings are using a completely different code path, this is just for our own strings

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wouldn't we expect a two-pass implementation to be slower? i

Generally I would expect a two pass solution to be faster. Our null-checking implementations are pretty naive and always go elementwise with a boolean value. The Arrow implementation iterates 64 bits at a time, so checking for NA can be up to 64x as fast as this kind of loop. I'm not sure what NumPy does internally, but with a byte mask that same type of logic would be up to 8x as fast.

By separating that out to a different has_na().any() type of check you let other libraries determine that much faster than we can

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be clear not a blocker for now; just something to think about as we make our code base more Arrow friendly

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arrow strings are using a completely different code path, this is just for our own strings

As I said, arrow dispatches to pyarrow compute functions for those things, so it won't impact this part for now anyway

else:
# if ignore_na is False, we also stringify NaN/None/etc.
try:
Expand Down Expand Up @@ -1179,6 +1187,8 @@ cdef class StringHashTable(HashTable):
for i in range(count):
uniques.append(values[uindexer[i]])

if return_inverse and return_labels_only:
return labels.base, seen_na # .base -> underlying ndarray
if return_inverse:
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()
Expand Down Expand Up @@ -1247,10 +1257,11 @@ cdef class StringHashTable(HashTable):
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, object mask=None):
# -> np.ndarray[np.intp]
_, labels = self._unique(values, uniques, count_prior=count_prior,
labels, seen_na = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True)
return labels
ignore_na=True, return_inverse=True,
return_labels_only=True)
return labels, seen_na


cdef class PyObjectHashTable(HashTable):
Expand Down Expand Up @@ -1362,7 +1373,7 @@ cdef class PyObjectHashTable(HashTable):
def _unique(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
bint return_inverse=False):
bint return_inverse=False, bint return_labels_only=False):
"""
Calculate unique values and labels (no sorting!)

Expand Down Expand Up @@ -1402,7 +1413,7 @@ cdef class PyObjectHashTable(HashTable):
int ret = 0
object val
khiter_t k
bint use_na_value
bint use_na_value, seen_na=False

if return_inverse:
labels = np.empty(n, dtype=np.intp)
Expand All @@ -1420,6 +1431,7 @@ cdef class PyObjectHashTable(HashTable):
# ignore_na is True), skip the hashtable entry for them, and
# replace the corresponding label with na_sentinel
labels[i] = na_sentinel
seen_na = True
continue

k = kh_get_pymap(self.table, <PyObject*>val)
Expand All @@ -1437,6 +1449,8 @@ cdef class PyObjectHashTable(HashTable):
idx = self.table.vals[k]
labels[i] = idx

if return_inverse and return_labels_only:
return labels.base, seen_na # .base -> underlying ndarray
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the second argument returned by this is either a bool or an ndarray right? Instead of having dynamic return types like that I think it would be better to just pass in a reference to seen_na - the caller can choose to ignore the result altogether if they'd like. But that way you don't need a return_labels_only argument and can be consistent in what is returned

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah this was my first idea as well, but it isn't a good idea. We drop the result but we still set an internal variable that signals that there is an external view created, which triggers a copy later in the process because we call into the same hashtable again. Avoiding this copy is part of the performance improvement, so we can't use the same signature.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand how passing the seen_na variable by reference as opposed to having a dynamic return type affects that. That seems like a code organization issue outside of this?

Copy link
Member Author

@phofl phofl Mar 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That doesn't matter, but returning the same result as when return_labels_only=False matters, so we need another if condition anyway.

if return_inverse:
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()
Expand Down Expand Up @@ -1505,7 +1519,8 @@ cdef class PyObjectHashTable(HashTable):
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, object mask=None):
# -> np.ndarray[np.intp]
_, labels = self._unique(values, uniques, count_prior=count_prior,
labels, seen_na = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True)
return labels
ignore_na=True, return_inverse=True,
return_labels_only=True)
return labels, seen_na
21 changes: 8 additions & 13 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2530,24 +2530,24 @@ def _factorize_keys(

if isinstance(lk, BaseMaskedArray):
assert isinstance(rk, BaseMaskedArray)
llab = rizer.factorize(lk._data, mask=lk._mask)
rlab = rizer.factorize(rk._data, mask=rk._mask)
llab, lany = rizer.factorize(lk._data, mask=lk._mask)
rlab, rany = rizer.factorize(rk._data, mask=rk._mask)
elif isinstance(lk, ArrowExtensionArray):
assert isinstance(rk, ArrowExtensionArray)
# we can only get here with numeric dtypes
# TODO: Remove when we have a Factorizer for Arrow
llab = rizer.factorize(
llab, lany = rizer.factorize(
lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna()
)
rlab = rizer.factorize(
rlab, rany = rizer.factorize(
rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()
)
else:
# Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
# "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
# ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
llab = rizer.factorize(lk) # type: ignore[arg-type]
rlab = rizer.factorize(rk) # type: ignore[arg-type]
llab, lany = rizer.factorize(lk) # type: ignore[arg-type]
rlab, rany = rizer.factorize(rk) # type: ignore[arg-type]
assert llab.dtype == np.dtype(np.intp), llab.dtype
assert rlab.dtype == np.dtype(np.intp), rlab.dtype

Expand All @@ -2558,16 +2558,11 @@ def _factorize_keys(
llab, rlab = _sort_labels(uniques, llab, rlab)

# NA group
lmask = llab == -1
lany = lmask.any()
rmask = rlab == -1
rany = rmask.any()

if lany or rany:
if lany:
np.putmask(llab, lmask, count)
np.putmask(llab, llab == -1, count)
if rany:
np.putmask(rlab, rmask, count)
np.putmask(rlab, rlab == -1, count)
count += 1

return llab, rlab, count
Expand Down