Skip to content

Commit 2b90845

Browse files
committed
Faster factorize
1 parent 5fa31f3 commit 2b90845

File tree

1 file changed

+31
-1
lines changed

1 file changed

+31
-1
lines changed

flox/core.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -735,6 +735,26 @@ def offset_labels(labels: np.ndarray, ngroups: int) -> tuple[np.ndarray, int]:
735735
return offset, size
736736

737737

738+
def fast_isin(ar1, ar2, invert):
    """
    Sort-based membership test of the elements of ``ar1`` in ``ar2``.

    ``ar1`` is reduced to its unique values with ``pd.factorize`` before the
    sort, so the sorted array has ``n_unique(ar1) + len(ar2)`` elements
    rather than ``len(ar1) + len(ar2)``.

    Parameters
    ----------
    ar1 : array-like
        Values to test. Must be accepted by ``pd.factorize``.
    ar2 : array-like
        Values to test against. Duplicates are allowed.
    invert : bool
        If True, return True where an element of ``ar1`` is *not* in ``ar2``.

    Returns
    -------
    np.ndarray
        Boolean array with one entry per element of ``ar1``.

    Notes
    -----
    ``pd.factorize`` codes missing values as ``-1``. Those elements are
    reported as "not in ``ar2``" (the result equals ``invert``), even when
    ``ar2`` itself contains missing values.
    """
    # codes[i] is the position of ar1[i] in uniques, or -1 for missing values.
    codes, uniques = pd.factorize(ar1, sort=False)

    if len(uniques) == 0:
        # ar1 is empty or consists only of missing values: nothing can match.
        # Returning here also avoids indexing an empty lookup table with -1.
        return np.full(codes.shape, bool(invert), dtype=bool)

    ar = np.concatenate((uniques, ar2))
    # We need this to be a stable sort, so always use 'mergesort'
    # here. The values from the first array should always come before
    # the values from the second array.
    order = ar.argsort(kind="mergesort")
    sar = ar[order]
    # uniques has no repeats, so an element of uniques is followed by an equal
    # value in the sorted array only when that value also occurs in ar2.
    if invert:
        bool_ar = sar[1:] != sar[:-1]
    else:
        bool_ar = sar[1:] == sar[:-1]
    # The last sorted element has no successor, hence no match.
    flag = np.concatenate((bool_ar, [invert]))
    ret = np.empty(ar.shape, dtype=bool)
    ret[order] = flag

    # Expand from unique values back to every element of ar1.
    result = ret[codes]
    # -1 is a sentinel, not a position: overwrite whatever ret[-1] supplied.
    result[codes == -1] = invert
    return result
756+
757+
738758
@overload
739759
def factorize_(
740760
by: T_Bys,
@@ -830,8 +850,18 @@ def factorize_(
830850
if expect is not None and reindex:
831851
sorter = np.argsort(expect)
832852
groups = expect[(sorter,)] if sort else expect
853+
833854
idx = np.searchsorted(expect, flat, sorter=sorter)
834-
mask = ~np.isin(flat, expect) | isnull(flat) | (idx == len(expect))
855+
mask = fast_isin(flat, expect, invert=True)
856+
if not np.issubdtype(flat.dtype, np.integer):
857+
mask |= isnull(flat)
858+
mask |= idx == len(expect)
859+
860+
# idx = np.full(flat.shape, -1)
861+
# result = np.searchsorted(expect.values, flat[~mask], sorter=sorter)
862+
# idx[~mask] = result
863+
# idx = np.searchsorted(expect.values, flat, sorter=sorter)
864+
# idx[mask] = -1
835865
if not sort:
836866
# idx is the index in to the sorted array.
837867
# if we didn't want sorting, unsort it back

0 commit comments

Comments
 (0)