@@ -735,6 +735,26 @@ def offset_labels(labels: np.ndarray, ngroups: int) -> tuple[np.ndarray, int]:
735
735
return offset , size
736
736
737
737
738
def fast_isin(ar1, ar2, invert: bool) -> np.ndarray:
    """Sort-based membership test of each element of ``ar1`` in ``ar2``.

    ``ar1`` is first reduced to its unique values with ``pd.factorize``;
    those uniques are concatenated with ``ar2``, stably sorted, and a unique
    value is considered present in ``ar2`` when its successor in sorted order
    compares equal to it. The per-unique answer is then expanded back to the
    length of ``ar1`` through the factorize codes.

    Parameters
    ----------
    ar1 : 1-D array-like
        Values to test. Must be accepted by ``pd.factorize``.
    ar2 : 1-D array-like
        Values to test against. Must be accepted by ``np.concatenate``
        together with the uniques of ``ar1``.
    invert : bool
        If True, return "not in ``ar2``" instead of "in ``ar2``".

    Returns
    -------
    np.ndarray
        Boolean array with one entry per element of ``ar1``. Missing values
        in ``ar1`` (those that ``pd.factorize`` codes as ``-1``) are never
        reported as members, i.e. they map to ``invert``.
    """
    codes, uniques = pd.factorize(ar1, sort=False)

    ar = np.concatenate((uniques, ar2))
    # We need this to be a stable sort, so always use 'mergesort'
    # here. The values from the first array should always come before
    # the values from the second array.
    order = ar.argsort(kind="mergesort")
    sar = ar[order]

    # flag[i] answers the question for the i-th *sorted* element by comparing
    # it with its successor. The final sorted element has no successor, so it
    # can never be "found"; slice assignment keeps this safe for empty input.
    flag = np.empty(ar.shape, dtype=bool)
    if invert:
        flag[:-1] = sar[1:] != sar[:-1]
    else:
        flag[:-1] = sar[1:] == sar[:-1]
    flag[-1:] = invert

    # Scatter the answers back to the (unsorted) concatenated order.
    ret = np.empty(ar.shape, dtype=bool)
    ret[order] = flag

    # pd.factorize marks missing values with code -1. Indexing ``ret`` with
    # -1 would silently read the last slot (or raise IndexError when ``ret``
    # is empty), so give those positions the "not a member" answer directly.
    result = np.full(codes.shape, invert, dtype=bool)
    valid = codes >= 0
    result[valid] = ret[codes[valid]]
    return result
756
+
757
+
738
758
@overload
739
759
def factorize_ (
740
760
by : T_Bys ,
@@ -830,8 +850,18 @@ def factorize_(
830
850
if expect is not None and reindex :
831
851
sorter = np .argsort (expect )
832
852
groups = expect [(sorter ,)] if sort else expect
853
+
833
854
idx = np .searchsorted (expect , flat , sorter = sorter )
834
- mask = ~ np .isin (flat , expect ) | isnull (flat ) | (idx == len (expect ))
855
+ mask = fast_isin (flat , expect , invert = True )
856
+ if not np .issubdtype (flat .dtype , np .integer ):
857
+ mask |= isnull (flat )
858
+ mask |= idx == len (expect )
859
+
860
+ # idx = np.full(flat.shape, -1)
861
+ # result = np.searchsorted(expect.values, flat[~mask], sorter=sorter)
862
+ # idx[~mask] = result
863
+ # idx = np.searchsorted(expect.values, flat, sorter=sorter)
864
+ # idx[mask] = -1
835
865
if not sort :
836
866
# idx is the index in to the sorted array.
837
867
# if we didn't want sorting, unsort it back
0 commit comments