Skip to content

Commit abb6b77

Browse files
fix: make vector_of_vectors read robust to empty idx and starts
Co-authored-by: aider (gpt-5.2) <aider@aider.chat>
1 parent 675d6b7 commit abb6b77

File tree

2 files changed

+58
-18
lines changed

2 files changed

+58
-18
lines changed

src/lgdo/lh5/_serializers/read/vector_of_vectors.py

Lines changed: 47 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,28 @@ def _h5_read_vector_of_vectors(
3939
# read out cumulative_length
4040
cumulen_buf = None if obj_buf is None else obj_buf.cumulative_length
4141
h5d_cl = h5py.h5d.open(h5g, b"cumulative_length")
42+
if idx is not None:
43+
ds_n_rows = h5d_cl.get_space().shape[0]
44+
idx = np.asarray(idx)
45+
valid_mask = (idx >= 0) & (idx < ds_n_rows)
46+
n_invalid = np.count_nonzero(~valid_mask)
47+
if n_invalid > 0:
48+
log.warning(
49+
"Index array for '%s' in file '%s' contains %d out-of-range "
50+
"entries (total %d); these indices will be ignored.",
51+
oname,
52+
fname,
53+
n_invalid,
54+
idx.size,
55+
)
56+
idx = idx[valid_mask]
57+
if idx.size == 0:
58+
log.warning(
59+
"All indices for '%s' in file '%s' were out of range; "
60+
"resulting selection is empty.",
61+
oname,
62+
fname,
63+
)
4264
cumulative_length, n_rows_read = _h5_read_array(
4365
h5d_cl,
4466
fname,
@@ -63,29 +85,35 @@ def _h5_read_vector_of_vectors(
6385
# re-read cumulative_length with these indices
6486
# note this will allocate memory for fd_starts!
6587
fd_start = None
66-
if idx2[0] == -1:
88+
if idx2.size > 0 and idx2[0] == -1:
6789
idx2 = idx2[1:]
6890
fd_start = 0 # this variable avoids an ndarray append
6991

70-
fd_starts, _fds_n_rows_read = _h5_read_array(
71-
h5d_cl,
72-
fname,
73-
f"{oname}/cumulative_length",
74-
start_row=start_row,
75-
n_rows=n_rows,
76-
idx=idx2,
77-
use_h5idx=use_h5idx,
78-
obj_buf=None,
79-
)
80-
fd_starts = fd_starts.nda # we just need the nda
81-
if fd_start is None:
82-
fd_start = fd_starts[0]
92+
if idx2.size == 0:
93+
fd_starts = np.empty(0, dtype=this_cumulen_nda.dtype)
94+
else:
95+
fd_starts, _fds_n_rows_read = _h5_read_array(
96+
h5d_cl,
97+
fname,
98+
f"{oname}/cumulative_length",
99+
start_row=start_row,
100+
n_rows=n_rows,
101+
idx=idx2,
102+
use_h5idx=use_h5idx,
103+
obj_buf=None,
104+
)
105+
fd_starts = fd_starts.nda # we just need the nda
106+
if fd_start is None:
107+
fd_start = fd_starts[0]
83108

84109
# compute the length that flattened_data will have after the
85110
# fancy-indexed read
86-
fd_n_rows = np.sum(this_cumulen_nda[-len(fd_starts) :] - fd_starts)
87-
if fd_start == 0:
88-
fd_n_rows += this_cumulen_nda[0]
111+
if fd_starts.size == 0:
112+
fd_n_rows = this_cumulen_nda[0]
113+
else:
114+
fd_n_rows = np.sum(this_cumulen_nda[-len(fd_starts) :] - fd_starts)
115+
if fd_start == 0:
116+
fd_n_rows += this_cumulen_nda[0]
89117

90118
# now make fd_idx
91119
fd_idx = np.empty(fd_n_rows, dtype="int32")
@@ -95,7 +123,8 @@ def _h5_read_vector_of_vectors(
95123
# to match the in-memory version of flattened_data. Note: these
96124
# operations on the view change the original array because they are
97125
# numpy arrays, not lists.
98-
this_cumulen_nda[-len(fd_starts) :] -= fd_starts
126+
if fd_starts.size > 0:
127+
this_cumulen_nda[-len(fd_starts) :] -= fd_starts
99128
np.cumsum(this_cumulen_nda, out=this_cumulen_nda)
100129

101130
else:

tests/lh5/test_lh5_store.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -255,6 +255,17 @@ def test_read_vov_fancy_idx(lh5_file):
255255
assert lh5_obj == types.VectorOfVectors([[[1, 2], [3, 4, 5]], [[5, 3, 1]]])
256256
assert len(lh5_obj) == 2
257257

258+
# Out-of-range indices should be culled (and not raise).
259+
lh5_obj = store.read("/data/struct_full/vov3d", lh5_file, idx=[0, 10_000])
260+
assert isinstance(lh5_obj, types.VectorOfVectors)
261+
assert lh5_obj == types.VectorOfVectors([[[1, 2], [3, 4, 5]]])
262+
assert len(lh5_obj) == 1
263+
264+
# A fully out-of-range idx should yield an empty object.
265+
lh5_obj = store.read("/data/struct_full/vov3d", lh5_file, idx=[10_000])
266+
assert isinstance(lh5_obj, types.VectorOfVectors)
267+
assert len(lh5_obj) == 0
268+
258269

259270
def test_read_voev(lh5_file):
260271
store = lh5.LH5Store()

0 commit comments

Comments
 (0)