From aebecfe9e4c529b0f4a988e8320e3e5cc42a0b67 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 21 Feb 2024 22:01:16 +0100 Subject: [PATCH 1/2] PERF: Improve merge performance --- pandas/_libs/hashtable.pyx | 7 ++-- pandas/_libs/hashtable_class_helper.pxi.in | 47 ++++++++++++++-------- pandas/core/reshape/merge.py | 21 ++++------ 3 files changed, 43 insertions(+), 32 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index ccac3d0b50d45..30f7c7a327d1c 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -111,6 +111,7 @@ cdef class ObjectFactorizer(Factorizer): """ cdef: ndarray[intp_t] labels + bint seen_na if mask is not None: raise NotImplementedError("mask not supported for ObjectFactorizer.") @@ -119,7 +120,7 @@ cdef class ObjectFactorizer(Factorizer): uniques = ObjectVector() uniques.extend(self.uniques.to_array()) self.uniques = uniques - labels = self.table.get_labels(values, self.uniques, - self.count, na_sentinel, na_value) + labels, seen_na = self.table.get_labels(values, self.uniques, + self.count, na_sentinel, na_value) self.count = len(self.uniques) - return labels + return labels, seen_na diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 26dcf0b6c4ce3..ca944477d5a03 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -595,7 +595,8 @@ cdef class {{name}}HashTable(HashTable): def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, bint ignore_na=False, - object mask=None, bint return_inverse=False, bint use_result_mask=False): + object mask=None, bint return_inverse=False, bint use_result_mask=False, + bint return_labels_only=False): """ Calculate unique values and labels (no sorting!) @@ -684,6 +685,7 @@ cdef class {{name}}HashTable(HashTable): if ignore_na and use_mask: if mask_values[i]: labels[i] = na_sentinel + seen_na = True continue elif ignore_na and ( is_nan_{{c_type}}(val) or @@ -693,6 +695,7 @@ cdef class {{name}}HashTable(HashTable): # ignore_na is True), skip the hashtable entry for them, # and replace the corresponding label with na_sentinel labels[i] = na_sentinel + seen_na = True continue elif not ignore_na and use_result_mask: if mask_values[i]: @@ -749,6 +752,8 @@ cdef class {{name}}HashTable(HashTable): idx = self.table.vals[k] labels[i] = idx + if return_inverse and return_labels_only: + return labels.base, seen_na # .base -> underlying ndarray if return_inverse: return uniques.to_array(), labels.base # .base -> underlying ndarray if use_result_mask: @@ -824,10 +829,11 @@ cdef class {{name}}HashTable(HashTable): Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, object mask=None): # -> np.ndarray[np.intp] - _, labels = self._unique(values, uniques, count_prior=count_prior, + labels, seen_na = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, - ignore_na=True, return_inverse=True, mask=mask) - return labels + ignore_na=True, return_inverse=True, mask=mask, + return_labels_only=True) + return labels, seen_na {{if dtype == 'int64'}} @cython.boundscheck(False) @@ -904,16 +910,17 @@ cdef class {{name}}Factorizer(Factorizer): """ cdef: ndarray[intp_t] labels + bint seen_na if self.uniques.external_view_exists: uniques = {{name}}Vector() uniques.extend(self.uniques.to_array()) self.uniques = uniques - labels = self.table.get_labels(values, self.uniques, + labels, seen_na = self.table.get_labels(values, self.uniques, self.count, na_sentinel, na_value=na_value, mask=mask) self.count = len(self.uniques) - return labels + return labels, seen_na {{endfor}} @@ -1080,7 +1087,7 @@ cdef class StringHashTable(HashTable): def _unique(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, bint ignore_na=False, - bint return_inverse=False): + bint return_inverse=False, bint return_labels_only=False): """ Calculate unique values and labels (no sorting!) @@ -1123,7 +1130,7 @@ cdef class StringHashTable(HashTable): const char *v const char **vecs khiter_t k - bint use_na_value + bint use_na_value, seen_na = False if return_inverse: labels = np.zeros(n, dtype=np.intp) @@ -1142,6 +1149,7 @@ cdef class StringHashTable(HashTable): # ignore_na is True), we can skip the actual value, and # replace the label with na_sentinel directly labels[i] = na_sentinel + seen_na = True else: # if ignore_na is False, we also stringify NaN/None/etc. try: @@ -1179,6 +1187,8 @@ cdef class StringHashTable(HashTable): for i in range(count): uniques.append(values[uindexer[i]]) + if return_inverse and return_labels_only: + return labels.base, seen_na # .base -> underlying ndarray if return_inverse: return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() @@ -1247,10 +1257,11 @@ cdef class StringHashTable(HashTable): Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, object mask=None): # -> np.ndarray[np.intp] - _, labels = self._unique(values, uniques, count_prior=count_prior, + labels, seen_na = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, - ignore_na=True, return_inverse=True) - return labels + ignore_na=True, return_inverse=True, + return_labels_only=True) + return labels, seen_na cdef class PyObjectHashTable(HashTable): @@ -1362,7 +1373,7 @@ cdef class PyObjectHashTable(HashTable): def _unique(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, bint ignore_na=False, - bint return_inverse=False): + bint return_inverse=False, bint return_labels_only=False): """ Calculate unique values and labels (no sorting!) @@ -1402,7 +1413,7 @@ cdef class PyObjectHashTable(HashTable): int ret = 0 object val khiter_t k - bint use_na_value + bint use_na_value, seen_na=False if return_inverse: labels = np.empty(n, dtype=np.intp) @@ -1420,6 +1431,7 @@ cdef class PyObjectHashTable(HashTable): # ignore_na is True), skip the hashtable entry for them, and # replace the corresponding label with na_sentinel labels[i] = na_sentinel + seen_na = True continue k = kh_get_pymap(self.table, val) @@ -1437,6 +1449,8 @@ cdef class PyObjectHashTable(HashTable): idx = self.table.vals[k] labels[i] = idx + if return_inverse and return_labels_only: + return labels.base, seen_na # .base -> underlying ndarray if return_inverse: return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() @@ -1505,7 +1519,8 @@ cdef class PyObjectHashTable(HashTable): Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, object mask=None): # -> np.ndarray[np.intp] - _, labels = self._unique(values, uniques, count_prior=count_prior, + labels, seen_na = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, - ignore_na=True, return_inverse=True) - return labels + ignore_na=True, return_inverse=True, + return_labels_only=True) + return labels, seen_na diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0494138d1e16f..01b8cd0b9eecd 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2530,24 +2530,24 @@ def _factorize_keys( if isinstance(lk, BaseMaskedArray): assert isinstance(rk, BaseMaskedArray) - llab = rizer.factorize(lk._data, mask=lk._mask) - rlab = rizer.factorize(rk._data, mask=rk._mask) + llab, lany = rizer.factorize(lk._data, mask=lk._mask) + rlab, rany = rizer.factorize(rk._data, mask=rk._mask) elif isinstance(lk, ArrowExtensionArray): assert isinstance(rk, ArrowExtensionArray) # we can only get here with numeric dtypes # TODO: Remove when we have a Factorizer for Arrow - llab = rizer.factorize( + llab, lany = rizer.factorize( lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna() ) - rlab = rizer.factorize( + rlab, rany = rizer.factorize( rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna() ) else: # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" - llab = rizer.factorize(lk) # type: ignore[arg-type] - rlab = rizer.factorize(rk) # type: ignore[arg-type] + llab, lany = rizer.factorize(lk) # type: ignore[arg-type] + rlab, rany = rizer.factorize(rk) # type: ignore[arg-type] assert llab.dtype == np.dtype(np.intp), llab.dtype assert rlab.dtype == np.dtype(np.intp), rlab.dtype @@ -2558,16 +2558,11 @@ def _factorize_keys( llab, rlab = _sort_labels(uniques, llab, rlab) # NA group - lmask = llab == -1 - lany = lmask.any() - rmask = rlab == -1 - rany = rmask.any() - if lany or rany: if lany: - np.putmask(llab, lmask, count) + np.putmask(llab, llab == -1, count) if rany: - np.putmask(rlab, rmask, count) + np.putmask(rlab, rlab == -1, count) count += 1 return llab, rlab, count From 0349979d5d054f290e23a60ce9ba32f1fc50f67d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 22 Feb 2024 17:47:52 +0100 Subject: [PATCH 2/2] Fix tests --- pandas/tests/test_algos.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 057a5a627370e..abb63744a3542 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -217,9 +217,10 @@ def test_factorize_nan(self): key = np.array([1, 2, 1, np.nan], dtype="O") rizer = ht.ObjectFactorizer(len(key)) for na_sentinel in (-1, 20): - ids = rizer.factorize(key, na_sentinel=na_sentinel) + ids, seen_na = rizer.factorize(key, na_sentinel=na_sentinel) expected = np.array([0, 1, 0, na_sentinel], dtype=np.intp) assert len(set(key)) == len(set(expected)) + assert seen_na tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) tm.assert_numpy_array_equal(ids, expected) @@ -228,9 +229,10 @@ def test_factorizer_with_mask(self): data = np.array([1, 2, 3, 1, 1, 0], dtype="int64") mask = np.array([False, False, False, False, False, True]) rizer = ht.Int64Factorizer(len(data)) - result = rizer.factorize(data, mask=mask) + result, seen_na = rizer.factorize(data, mask=mask) expected = np.array([0, 1, 2, 0, 0, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + assert seen_na expected_uniques = np.array([1, 2, 3], dtype="int64") tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques) @@ -238,9 +240,10 @@ def test_factorizer_object_with_nan(self): # GH#49549 data = np.array([1, 2, 3, 1, np.nan]) rizer = ht.ObjectFactorizer(len(data)) - result = rizer.factorize(data.astype(object)) + result, seen_na = rizer.factorize(data.astype(object)) expected = np.array([0, 1, 2, 0, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + assert seen_na expected_uniques = np.array([1, 2, 3], dtype=object) tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques)