
Commit 8a82c0c

Author: Luke Shaw (committed)
Passes all findex tests
1 parent 5926e8a commit 8a82c0c

File tree

2 files changed (+166, -64 lines)


src/blosc2/ndarray.py

Lines changed: 150 additions & 50 deletions
@@ -27,6 +27,7 @@
 
 import ndindex
 import numpy as np
+from ndindex.subindex_helpers import ceiling
 
 import blosc2
 from blosc2 import SpecialValue, blosc2_ext, compute_chunks_blocks
@@ -1459,7 +1460,7 @@ def T(self):
             raise ValueError("This property only works for 2-dimensional arrays.")
         return permute_dims(self)
 
-    def get_fselection_numpy(self, key: list | np.ndarray) -> np.ndarray:
+    def get_fselection_numpy(self, key: list | np.ndarray) -> np.ndarray:  # noqa: C901
         """
         Select a slice from the array using a fancy index.
         Closely matches NumPy fancy indexing behaviour, except in
@@ -1482,60 +1483,131 @@ def get_fselection_numpy(self, key: list | np.ndarray) -> np.ndarray:
         # return self[:][key]  # load into memory for smallish arrays
         shape = self.shape
         chunks = self.chunks
+        # after this, all indices are slices or arrays of integers
+        # moreover, all arrays are consecutive (otherwise an error is raised)
         _slice = ndindex.ndindex(key).expand(shape)
         out_shape = _slice.newshape(shape)
-        out = np.empty(out_shape, dtype=self.dtype)
-        key = np.array(key)
-
-        if self.ndim == 1 and len(key.shape) == 1:
-            # Fast path so we can avoid the costly lines in slow path
-            # (sub_idx = _slice.as_subindex(c).raw, sel_idx = c.as_subindex(_slice))
-            if np.any(np.diff(key) < 0):
-                idx_order = np.argsort(key)
-                sorted_idxs = key[idx_order]
-            else:
-                idx_order = None
-                sorted_idxs = key
-            chunk_nitems = np.bincount(
-                sorted_idxs // chunks[0], minlength=self.schunk.nchunks
-            )  # number of items per chunk
+        _slice = _slice.raw
+        shape = np.array(shape)
+
+        if np.all([isinstance(s, (slice, np.ndarray)) for s in _slice]) and np.all(
+            [s.dtype is not bool for s in _slice if isinstance(s, np.ndarray)]
+        ):
+            arr = []
+            chunks = np.array(chunks)
+            f_ = 0
+            #       |------|
+            # ------| arrs |------
+            # f_=0    f_=1    f_=2
+            begin = 0
+            end = None
+            extradims = 0
+            flat_shape = ()
+            for num, i in enumerate(_slice):
+                if isinstance(i, np.ndarray):
+                    if f_ == 2:
+                        raise ValueError("Cannot use slices between arrays of integers in index")
+                    arr += [i.flatten()]
+                    extradims += i.ndim - 1
+                    if f_ == 0:
+                        begin = num
+                        f_ = 1
+                else:
+                    if f_ == 1:  # finished adding arrays
+                        arr = np.stack(arr)
+                        end = num
+                        f_ = 2
+                        flat_shape += (arr.shape[-1],)
+                    flat_shape += ((i.stop - i.start) // i.step,)
+            if not isinstance(arr, np.ndarray):  # might have missed last part of loop
+                arr = np.stack(arr)
+                flat_shape += (arr.shape[-1],)
+            # out_shape could have new dims if indexing arrays are not all 1D
+            # (we have just flattened them so need to handle accordingly)
+            idx_order = np.lexsort(
+                tuple(a for a in reversed(arr))
+            )  # sort by column with priority to first column
+            sorted_idxs = arr[:, idx_order]
+            out = np.empty(flat_shape, dtype=self.dtype)
+
+            divider = chunks[begin:end]
+            unique_chunks, chunk_nitems = np.unique(sorted_idxs.T // divider, axis=0, return_counts=True)
             chunk_nitems_cumsum = np.cumsum(chunk_nitems)
-            for chunk_i, c in enumerate(chunk_nitems):
-                if c == 0:
-                    continue  # no items in chunk
+            prior_tuple = _slice[:begin]
+            post_tuple = _slice[end:] if end is not None else ()
+            cprior_slices = [
+                slice_to_chunktuple(s, c) for s, c in zip(prior_tuple, chunks[:begin], strict=True)
+            ]
+            cpost_slices = (
+                [slice_to_chunktuple(s, c) for s, c in zip(post_tuple, chunks[end:], strict=True)]
+                if end is not None
+                else []
+            )
+            for chunk_i, chunk_idx in enumerate(unique_chunks):
                 start = 0 if chunk_i == 0 else chunk_nitems_cumsum[chunk_i - 1]
                 stop = chunk_nitems_cumsum[chunk_i]
-                selection = sorted_idxs[start:stop]
-                out_selection = idx_order[start:stop] if idx_order is not None else slice(start, stop)
-                to_be_loaded = np.empty((selection[-1] - selection[0] + 1,), dtype=self.dtype)
-                # if len(selection) < 10:
-                #     # TODO: optimise for case of few elements and go index-by-index
-                #     selector = out_selection.start + np.arange(out_selection.stop) if isinstance(out_selection, slice) else out_selection
-                #     for j, idx in enumerate(sorted_idxs[start:stop]):
-                #         out[out_selection[j]] = self[idx]
-                # else:
-                super().get_slice_numpy(
-                    to_be_loaded, ((selection[0],), (selection[-1] + 1,))
-                )  # get relevant section of chunk
-                loc_idx = selection - selection[0]
-                out[out_selection] = to_be_loaded[loc_idx]
-
-        else:
-            chunk_size = ndindex.ChunkSize(chunks)
-            # repeated indices are grouped together
-            intersecting_chunks = chunk_size.as_subchunks(
-                _slice, shape
-            )  # if _slice is (), returns all chunks
-            for c in intersecting_chunks:
-                sub_idx = _slice.as_subindex(c).raw
-                sel_idx = c.as_subindex(_slice)
-                new_shape = sel_idx.newshape(out_shape)
-                start, stop, step = get_ndarray_start_stop(self.ndim, c.raw, shape)
-                chunk = np.empty(
-                    tuple(sp - st for st, sp in zip(start, stop, strict=True)), dtype=self.dtype
-                )
-                super().get_slice_numpy(chunk, (start, stop))
-                out[sel_idx.raw] = chunk[sub_idx].reshape(new_shape)
+                selection = sorted_idxs[:, start:stop].T
+                out_selection = idx_order[start:stop].T
+                # loop over chunks coming from slices before and after array indices
+                for cprior_tuple in product(*cprior_slices):
+                    prior_selection = selector(cprior_tuple, prior_tuple, chunks[:begin])
+                    # selection relative to coordinates of out (necessarily step = 1)
+                    out_prior_selection = tuple(
+                        slice(ps.start - pt.start, ps.stop - pt.start, 1)
+                        for ps, pt in zip(prior_selection, prior_tuple, strict=True)
+                    )
+                    for cpost_tuple in product(*cpost_slices):
+                        post_selection = selector(
+                            cpost_tuple, post_tuple, chunks[end:] if end is not None else []
+                        )
+                        # selection relative to coordinates of out (necessarily step = 1)
+                        out_post_selection = tuple(
+                            slice(ps.start - pt.start, ps.stop - pt.start, 1)
+                            for ps, pt in zip(post_selection, post_tuple, strict=True)
+                        )
+                        chunk_begin = chunk_idx * chunks[begin:end]
+                        chunk_end = np.minimum((chunk_idx + 1) * chunks[begin:end], shape[begin:end])
+                        locbegin = np.hstack(
+                            (
+                                [s.start for s in prior_selection],
+                                chunk_begin,
+                                [s.start for s in post_selection],
+                            ),
+                            casting="unsafe",
+                            dtype="int64",
+                        )
+                        locend = np.hstack(
+                            ([s.stop for s in prior_selection], chunk_end, [s.stop for s in post_selection]),
+                            casting="unsafe",
+                            dtype="int64",
+                        )
+                        out_selection = out_prior_selection + (out_selection,) + out_post_selection
+                        to_be_loaded = np.empty(locend - locbegin, dtype=self.dtype)
+                        # basically load whole chunk, except for slice part at beginning and end
+                        super().get_slice_numpy(
+                            to_be_loaded, (locbegin, locend)
+                        )  # get relevant section of chunk
+                        loc_idx = (
+                            tuple(slice(0, s.stop - s.start, s.step) for s in prior_selection)
+                            + tuple(a for a in (selection - chunk_begin).T)
+                            + tuple(slice(0, s.stop - s.start, s.step) for s in post_selection)
+                        )
+                        out[out_selection] = to_be_loaded[loc_idx]
+            return out.reshape(out_shape)  # should have filled in correct order, just need to reshape
+
+        # Default when there are booleans
+        out = np.empty(out_shape, dtype=self.dtype)
+        chunk_size = ndindex.ChunkSize(chunks)
+        # repeated indices are grouped together
+        intersecting_chunks = chunk_size.as_subchunks(_slice, shape)  # if _slice is (), returns all chunks
+        for c in intersecting_chunks:
+            sub_idx = _slice.as_subindex(c).raw
+            sel_idx = c.as_subindex(_slice)
+            new_shape = sel_idx.newshape(out_shape)
+            start, stop, step = get_ndarray_start_stop(self.ndim, c.raw, shape)
+            chunk = np.empty(tuple(sp - st for st, sp in zip(start, stop, strict=True)), dtype=self.dtype)
+            super().get_slice_numpy(chunk, (start, stop))
+            out[sel_idx.raw] = chunk[sub_idx].reshape(new_shape)
 
         return out
 
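Note: the new non-boolean path above boils down to flattening the integer index arrays, lexsorting the index points, and grouping them by the chunk they fall into, so each touched chunk is read only once. Below is a minimal NumPy-only sketch of that grouping step; the 2-D indices and chunk shape are made up for illustration and are not taken from the commit.

import numpy as np

# hypothetical fancy index into a 2-D array stored with chunks of shape (3, 4)
row = np.array([7, 1, 5, 1])
col = np.array([2, 9, 0, 3])
chunks = np.array([3, 4])

arr = np.stack([row.flatten(), col.flatten()])  # shape (n_indexed_axes, n_points)
idx_order = np.lexsort(tuple(a for a in reversed(arr)))  # sort points, first axis has priority
sorted_idxs = arr[:, idx_order]

# group the sorted points by the chunk they land in
unique_chunks, chunk_nitems = np.unique(sorted_idxs.T // chunks, axis=0, return_counts=True)
chunk_nitems_cumsum = np.cumsum(chunk_nitems)

for chunk_i, chunk_idx in enumerate(unique_chunks):
    start = 0 if chunk_i == 0 else chunk_nitems_cumsum[chunk_i - 1]
    stop = chunk_nitems_cumsum[chunk_i]
    # sorted_idxs[:, start:stop] are the points served by chunk `chunk_idx`;
    # idx_order[start:stop] says where each point belongs in the flattened output
    print(chunk_idx, sorted_idxs[:, start:stop].T, idx_order[start:stop])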
@@ -4519,3 +4591,31 @@ def __setitem__(self, selection, input) -> np.ndarray:
 
     # def __setitem__(self, selection, input) -> np.ndarray:
     #     return NotImplementedError
+
+
+def slice_to_chunktuple(s, n):
+    """
+    # credit to ndindex for this function #
+    Parameters
+    ----------
+    s : slice
+        A slice object with start, stop, and step attributes.
+    n : int
+        The number of elements in the chunk axis
+
+    Returns
+    -------
+    out: tuple
+    """
+    start, stop, step = s.start, s.stop, s.step
+    if step > n:
+        return tuple((start + k * step) // n for k in range(ceiling(stop - start, step)))
+    else:
+        return tuple(range(start // n, ceiling(stop, n)))
+
+
+def selector(ctuple, _tuple, chunks):
+    return tuple(
+        slice(builtins.max(s.start, i * csize), builtins.min(csize * (i + 1), s.stop), s.step)
+        for i, s, csize in zip(ctuple, _tuple, chunks, strict=True)
+    )

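The two module-level helpers at the end of the file operate on plain Python slices: slice_to_chunktuple lists the chunk indices along one axis that a slice touches, and selector clips each slice to the extent of a given chunk. A self-contained illustration follows; the ceiling stand-in below only replaces ndindex.subindex_helpers.ceiling so the snippet runs on its own, and the rest mirrors the added helpers.

import builtins


def ceiling(a, b):
    # stand-in for ndindex.subindex_helpers.ceiling: integer ceil(a / b)
    return -(-a // b)


def slice_to_chunktuple(s, n):
    # chunk indices along one axis (chunk length n) that slice s touches
    start, stop, step = s.start, s.stop, s.step
    if step > n:
        return tuple((start + k * step) // n for k in range(ceiling(stop - start, step)))
    return tuple(range(start // n, ceiling(stop, n)))


def selector(ctuple, _tuple, chunks):
    # clip each slice in _tuple to the extent of the chunk index named in ctuple
    return tuple(
        slice(builtins.max(s.start, i * csize), builtins.min(csize * (i + 1), s.stop), s.step)
        for i, s, csize in zip(ctuple, _tuple, chunks, strict=True)
    )


print(slice_to_chunktuple(slice(2, 17, 1), 5))   # (0, 1, 2, 3): chunks overlapping [2, 17)
print(slice_to_chunktuple(slice(0, 30, 7), 5))   # (0, 1, 2, 4, 5): step 7 skips chunk 3 entirely
print(selector((1,), (slice(2, 17, 1),), (5,)))  # (slice(5, 10, 1),): the part of [2, 17) inside chunk 1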
tests/ndarray/test_ndarray.py

Lines changed: 16 additions & 14 deletions
@@ -295,31 +295,33 @@ def test_oindex():
 def test_findex():
     # Test 1d fast path
     ndim = 1
-    d = 1 + int(blosc2.MAX_FAST_PATH_SIZE / 8)  # just over fast path size
-    shape = (d,) * ndim
-    arr = blosc2.linspace(0, 100, num=np.prod(shape), shape=shape, dtype=np.float64)
-    rng = np.random.default_rng()
-    idx = rng.integers(low=0, high=d, size=(d // 4,))
-    nparr = arr[:]
-    b = arr[idx]
-    n = nparr[idx]
-    np.testing.assert_allclose(b, n)
+    dtype = np.dtype("float")
+    # d = 1 + int(blosc2.MAX_FAST_PATH_SIZE / dtype.itemsize)  # just over fast path size
+    d = 10
+    # shape = (d,) * ndim
+    # arr = blosc2.linspace(0, 100, num=np.prod(shape), shape=shape, dtype=dtype)
+    # rng = np.random.default_rng()
+    # idx = rng.integers(low=0, high=d, size=(d // 4,))
+    # nparr = arr[:]
+    # b = arr[idx]
+    # n = nparr[idx]
+    # np.testing.assert_allclose(b, n)
 
     ndim = 3
-    d = 1 + int((blosc2.MAX_FAST_PATH_SIZE / 8) ** (1 / ndim))  # just over fast path size
+    # d = 1 + int((blosc2.MAX_FAST_PATH_SIZE / 8) ** (1 / ndim))  # just over fast path size
     shape = (d,) * ndim
-    arr = blosc2.linspace(0, 100, num=np.prod(shape), shape=shape, dtype=np.float64)
+    arr = blosc2.linspace(0, 100, num=np.prod(shape), shape=shape, dtype=dtype)
     rng = np.random.default_rng()
     idx = rng.integers(low=0, high=d, size=(100,))
 
     row = idx
     col = rng.permutation(idx)
     mask = rng.integers(low=0, high=2, size=(d,)) == 1
-
-    ## Test fancy indexing for different use cases
+    #
+    # ## Test fancy indexing for different use cases
     m, M = np.min(idx), np.max(idx)
     nparr = arr[:]
-    # i)
+    # # i)
     b = arr[[m, M // 2, M]]
     n = nparr[[m, M // 2, M]]
     np.testing.assert_allclose(b, n)

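For reference, the property the test checks is that fancy indexing on a blosc2 NDArray gives the same result as the same indexing applied to its decompressed NumPy copy. A minimal standalone version of that pattern is sketched below; the shape and index values are illustrative, and the further cases using row, col and mask sit in the part of the test that is truncated here.

import numpy as np

import blosc2

shape = (10, 10, 10)
arr = blosc2.linspace(0, 100, num=np.prod(shape), shape=shape, dtype=np.float64)
nparr = arr[:]  # decompressed NumPy reference

rng = np.random.default_rng()
idx = rng.integers(low=0, high=10, size=(100,))
m, M = np.min(idx), np.max(idx)

# integer-list indexing should match NumPy exactly
np.testing.assert_allclose(arr[[m, M // 2, M]], nparr[[m, M // 2, M]])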