@@ -39,6 +39,28 @@ def _h5_read_vector_of_vectors(
3939 # read out cumulative_length
4040 cumulen_buf = None if obj_buf is None else obj_buf .cumulative_length
4141 h5d_cl = h5py .h5d .open (h5g , b"cumulative_length" )
42+ if idx is not None :
43+ ds_n_rows = h5d_cl .get_space ().shape [0 ]
44+ idx = np .asarray (idx )
45+ valid_mask = (idx >= 0 ) & (idx < ds_n_rows )
46+ n_invalid = np .count_nonzero (~ valid_mask )
47+ if n_invalid > 0 :
48+ log .warning (
49+ "Index array for '%s' in file '%s' contains %d out-of-range "
50+ "entries (total %d); these indices will be ignored." ,
51+ oname ,
52+ fname ,
53+ n_invalid ,
54+ idx .size ,
55+ )
56+ idx = idx [valid_mask ]
57+ if idx .size == 0 :
58+ log .warning (
59+ "All indices for '%s' in file '%s' were out of range; "
60+ "resulting selection is empty." ,
61+ oname ,
62+ fname ,
63+ )
4264 cumulative_length , n_rows_read = _h5_read_array (
4365 h5d_cl ,
4466 fname ,
@@ -63,29 +85,35 @@ def _h5_read_vector_of_vectors(
6385 # re-read cumulative_length with these indices
6486 # note this will allocate memory for fd_starts!
6587 fd_start = None
66- if idx2 [0 ] == - 1 :
88+ if idx2 . size > 0 and idx2 [0 ] == - 1 :
6789 idx2 = idx2 [1 :]
6890 fd_start = 0 # this variable avoids an ndarray append
6991
70- fd_starts , _fds_n_rows_read = _h5_read_array (
71- h5d_cl ,
72- fname ,
73- f"{ oname } /cumulative_length" ,
74- start_row = start_row ,
75- n_rows = n_rows ,
76- idx = idx2 ,
77- use_h5idx = use_h5idx ,
78- obj_buf = None ,
79- )
80- fd_starts = fd_starts .nda # we just need the nda
81- if fd_start is None :
82- fd_start = fd_starts [0 ]
92+ if idx2 .size == 0 :
93+ fd_starts = np .empty (0 , dtype = this_cumulen_nda .dtype )
94+ else :
95+ fd_starts , _fds_n_rows_read = _h5_read_array (
96+ h5d_cl ,
97+ fname ,
98+ f"{ oname } /cumulative_length" ,
99+ start_row = start_row ,
100+ n_rows = n_rows ,
101+ idx = idx2 ,
102+ use_h5idx = use_h5idx ,
103+ obj_buf = None ,
104+ )
105+ fd_starts = fd_starts .nda # we just need the nda
106+ if fd_start is None :
107+ fd_start = fd_starts [0 ]
83108
84109 # compute the length that flattened_data will have after the
85110 # fancy-indexed read
86- fd_n_rows = np .sum (this_cumulen_nda [- len (fd_starts ) :] - fd_starts )
87- if fd_start == 0 :
88- fd_n_rows += this_cumulen_nda [0 ]
111+ if fd_starts .size == 0 :
112+ fd_n_rows = this_cumulen_nda [0 ]
113+ else :
114+ fd_n_rows = np .sum (this_cumulen_nda [- len (fd_starts ) :] - fd_starts )
115+ if fd_start == 0 :
116+ fd_n_rows += this_cumulen_nda [0 ]
89117
90118 # now make fd_idx
91119 fd_idx = np .empty (fd_n_rows , dtype = "int32" )
@@ -95,7 +123,8 @@ def _h5_read_vector_of_vectors(
95123 # to match the in-memory version of flattened_data. Note: these
96124 # operations on the view change the original array because they are
97125 # numpy arrays, not lists.
98- this_cumulen_nda [- len (fd_starts ) :] -= fd_starts
126+ if fd_starts .size > 0 :
127+ this_cumulen_nda [- len (fd_starts ) :] -= fd_starts
99128 np .cumsum (this_cumulen_nda , out = this_cumulen_nda )
100129
101130 else :
0 commit comments