Skip to content

Commit 5646179

Browse files
committed
Correctly factorize values outside bin edges
1 parent dcfb1db commit 5646179

File tree

2 files changed

+23
-2
lines changed

2 files changed

+23
-2
lines changed

flox/core.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,7 @@ def factorize_(
431431
# pd.cut with bins = IntervalIndex[datetime64] doesn't work...
432432
if groupvar.dtype.kind == "M":
433433
expect = np.concatenate([expect.left.to_numpy(), [expect.right[-1].to_numpy()]])
434+
# code is -1 for values outside the bounds of all intervals
434435
idx = pd.cut(groupvar.ravel(), bins=expect).codes.copy()
435436
else:
436437
if expect is not None and reindex:
@@ -455,9 +456,12 @@ def factorize_(
455456
grp_shape = tuple(len(grp) for grp in found_groups)
456457
ngroups = np.prod(grp_shape)
457458
if len(by) > 1:
458-
group_idx = np.ravel_multi_index(factorized, grp_shape, mode="wrap").reshape(by[0].shape)
459-
nan_by_mask = reduce(np.logical_or, [isnull(b) for b in by])
459+
group_idx = np.ravel_multi_index(factorized, grp_shape, mode="wrap")
460+
# NaNs, as well as values outside the bins, are coded as -1
461+
# mode="wrap" maps -1 onto a valid index, so restore the -1 codes after raveling
462+
nan_by_mask = reduce(np.logical_or, [(f == -1) for f in factorized])
460463
group_idx[nan_by_mask] = -1
464+
group_idx = group_idx.reshape(by[0].shape)
461465
else:
462466
group_idx = factorized[0]
463467

tests/test_core.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -888,3 +888,20 @@ def test_group_by_datetime(engine, method):
888888
)
889889
expected = np.broadcast_to(expected, (2, 3, expected.shape[-1]))
890890
assert_equal(expected, actual)
891+
892+
893+
def test_factorize_values_outside_bins():
894+
895+
vals = factorize_(
896+
(np.arange(10).reshape(5, 2), np.arange(10).reshape(5, 2)),
897+
axis=(0, 1),
898+
expected_groups=(
899+
pd.IntervalIndex.from_breaks(np.arange(2, 8, 1)),
900+
pd.IntervalIndex.from_breaks(np.arange(2, 8, 1)),
901+
),
902+
reindex=True,
903+
fastpath=True,
904+
)
905+
actual = vals[0]
906+
expected = np.array([[-1, -1], [-1, 0], [6, 12], [18, 24], [-1, -1]])
907+
assert_equal(expected, actual)

0 commit comments

Comments
 (0)