Skip to content

Commit 0f10aa1

Browse files
authored
Externallinkfix (#153)
* Show linked chunk counts * support hdf5:// prefix in external link refs * updates for numpy deprecation errors
1 parent ca1f8ae commit 0f10aa1

24 files changed

+165
-344
lines changed

h5pyd/_apps/hsls.py

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -145,32 +145,60 @@ def dump(name, obj, visited=None):
145145
if isinstance(obj.id.layout, dict):
146146
# H5D_CHUNKED_REF layout
147147
chunk_dims = obj.id.layout["dims"]
148-
storage_desc = "Storage " + obj.id.layout["class"]
148+
obj_layout = obj.id.layout["class"]
149149
else:
150150
chunk_dims = obj.chunks
151-
storage_desc = "Storage H5D_CHUNKED"
152-
for chunk_dim in chunk_dims:
151+
obj_layout = "H5D_CHUNKED"
152+
storage_desc = f"Storage {obj_layout}"
153+
max_chunk_count = 1
154+
rank = len(obj.shape)
155+
for i in range(rank):
156+
extent = obj.shape[i]
157+
chunk_dim = chunk_dims[i]
153158
chunk_size *= chunk_dim
159+
max_chunk_count *= -(-extent // chunk_dim)
154160
dset_size = obj.dtype.itemsize
155161
for dim_extent in obj.shape:
156162
dset_size *= dim_extent
157163

158-
num_chunks = obj.num_chunks
159-
allocated_size = obj.allocated_size
164+
if obj_layout == "H5D_CHUNKED_REF_INDIRECT":
165+
chunk_table_id = obj.id.layout["chunk_table"]
166+
chunk_table = obj.file[f"datasets/{chunk_table_id}"]
167+
num_chunks = int(np.prod(chunk_table.shape))
168+
chunk_table_elements = chunk_table[...].reshape((num_chunks,))
169+
num_linked_chunks = 0
170+
allocated_size = 0
171+
for e in chunk_table_elements:
172+
chunk_offset = e[0]
173+
chunk_size = e[1]
174+
if chunk_offset > 0 and chunk_size > 0:
175+
num_linked_chunks += 1
176+
allocated_size += chunk_size
177+
num_chunks = num_linked_chunks
178+
chunk_type = "linked"
179+
180+
else:
181+
num_chunks = obj.num_chunks
182+
allocated_size = obj.allocated_size
183+
chunk_type = "allocated"
184+
160185
if num_chunks is not None and allocated_size is not None:
161-
fstr = " {0:>32}: {1} {2} bytes, {3} allocated chunks"
162-
print(fstr.format("Chunks", chunk_dims, intToStr(chunk_size),
163-
intToStr(num_chunks)))
186+
fstr = " {0:>32}: {1} {2} bytes, {3}/{4} {5} chunks"
187+
188+
s = fstr.format("Chunks", chunk_dims, intToStr(chunk_size), intToStr(num_chunks),
189+
intToStr(max_chunk_count), chunk_type)
190+
print(s)
164191
if dset_size > 0:
165192
utilization = allocated_size / dset_size
166-
fstr = " {0:>32}: {1} logical bytes, {2} allocated bytes, {3:.2f}% utilization"
193+
fstr = " {0:>32}: {1} logical bytes, {2} {3} bytes, {4:.2f}% utilization"
167194
print(fstr.format(storage_desc, intToStr(dset_size),
168195
intToStr(allocated_size),
196+
chunk_type,
169197
utilization * 100.0))
170198
else:
171-
fstr = " {0:>32}: {1} logical bytes, {2} allocated bytes"
199+
fstr = " {0:>32}: {1} logical bytes, {2} {3} bytes"
172200
print(fstr.format(storage_desc, intToStr(dset_size),
173-
intToStr(allocated_size)))
201+
intToStr(allocated_size), chunk_type))
174202

175203
else:
176204
# verbose info not available, just show the chunk layout

h5pyd/_apps/hstouch.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,6 @@ def touchDomain(domain):
6969
parent_domain = getParentDomain(domain)
7070

7171
if parent_domain == "/":
72-
if not domain.endswith("/"):
73-
msg = "Only folders can be created as a top-level domain"
74-
logging.error(msg)
75-
sys.exit(msg)
7672
if len(domain) < 4:
7773
msg = "Top-level folders must be at least three characters"
7874
logging.error(msg)

h5pyd/_hl/attrs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ def create(self, name, data, shape=None, dtype=None):
252252
# Not an array type; make sure to check the number of elements
253253
# is compatible, and reshape if needed.
254254
else:
255-
if numpy.product(shape) != numpy.product(data.shape):
255+
if numpy.prod(shape) != numpy.prod(data.shape):
256256
raise ValueError("Shape of new attribute conflicts with shape of data")
257257

258258
if shape != data.shape:
@@ -321,7 +321,7 @@ def modify(self, name, value):
321321
322322
# Allow the case of () <-> (1,)
323323
if (value.shape != attr.shape) and not \
324-
(numpy.product(value.shape) == 1 and numpy.product(attr.shape) == 1):
324+
(numpy.prod(value.shape) == 1 and numpy.prod(attr.shape) == 1):
325325
raise TypeError("Shape of data is incompatible with existing attribute")
326326
attr.write(value)
327327
"""

h5pyd/_hl/base.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -464,9 +464,10 @@ def getElementCount(buffer, offset):
464464
count_bytes = bytes(buffer[offset:(offset+4)])
465465

466466
try:
467-
count = int(np.frombuffer(count_bytes, dtype="<i4"))
467+
arr =np.frombuffer(count_bytes, dtype="<i4")
468+
count = int(arr[0])
468469
except TypeError as e:
469-
msg = "Unexpected error reading count value for variable length elemennt: {}".format(e)
470+
msg = f"Unexpected error reading count value for variable length elemennt: {e}"
470471
raise TypeError(msg)
471472
if count < 0:
472473
# shouldn't be negative

h5pyd/_hl/dataset.py

Lines changed: 23 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,8 @@ def make_new_dset(
104104
else:
105105
shape = (shape,) if isinstance(shape, int) else tuple(shape)
106106
if data is not None and (
107-
numpy.product(shape, dtype=numpy.ulonglong)
108-
!= numpy.product(data.shape, dtype=numpy.ulonglong)
107+
numpy.prod(shape, dtype=numpy.ulonglong)
108+
!= numpy.prod(data.shape, dtype=numpy.ulonglong)
109109
):
110110
raise ValueError("Shape tuple is incompatible with data")
111111

@@ -399,7 +399,6 @@ def __init__(self, dset, source_sel=None):
399399

400400
if not dset.chunks:
401401
# can only use with chunked datasets
402-
# (currently all datasets are chunked, but check for future compat)
403402
raise TypeError("Chunked dataset required")
404403

405404
if isinstance(dset.chunks, dict):
@@ -426,22 +425,15 @@ def __init__(self, dset, source_sel=None):
426425
for dim in range(rank):
427426
s = self._sel[dim]
428427
if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start:
429-
raise ValueError(
430-
"Invalid selection - selection region must be within dataset space"
431-
)
428+
msg = "Invalid selection - selection region must be within dataset space"
429+
raise ValueError(msg)
432430
index = s.start // self._layout[dim]
433431
self._chunk_index.append(index)
434432

435433
def __iter__(self):
436434
return self
437435

438436
def __next__(self):
439-
def get_ret(item):
440-
if len(item) == 1:
441-
return item[0]
442-
else:
443-
return tuple(item)
444-
445437
rank = len(self._shape)
446438
slices = []
447439
if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop:
@@ -475,7 +467,7 @@ def get_ret(item):
475467
# reset to the start and continue iterating with higher dimension
476468
self._chunk_index[dim] = 0
477469
dim -= 1
478-
return get_ret(slices)
470+
return tuple(slices)
479471

480472

481473
class Dataset(HLObject):
@@ -910,7 +902,7 @@ def _getQueryParam(self, start, stop, step=None):
910902
step = (1,) * rank
911903
param += "["
912904
for i in range(rank):
913-
field = "{}:{}:{}".format(start[i], stop[i], step[i])
905+
field = f"{start[i]}:{stop[i]}:{step[i]}"
914906
param += field
915907
if i != (rank - 1):
916908
param += ","
@@ -973,7 +965,7 @@ def __getitem__(self, args, new_dtype=None):
973965
mshape = sel.guess_shape(sid)
974966
if mshape is None:
975967
return numpy.array((0,), dtype=new_dtype)
976-
if numpy.product(mshape) == 0:
968+
if numpy.prod(mshape) == 0:
977969
return numpy.array(mshape, dtype=new_dtype)
978970
out = numpy.empty(mshape, dtype=new_dtype)
979971
sid_out = h5s.create_simple(mshape)
@@ -993,7 +985,7 @@ def __getitem__(self, args, new_dtype=None):
993985

994986
if self._shape == ():
995987
selection = sel.select(self, args)
996-
self.log.info("selection.mshape: {}".format(selection.mshape))
988+
self.log.info(f"selection.mshape: {selection.mshape}")
997989

998990
# TBD - refactor the following with the code for the non-scalar case
999991
req = "/datasets/" + self.id.uuid + "/value"
@@ -1006,7 +998,7 @@ def __getitem__(self, args, new_dtype=None):
1006998
arr = bytesToArray(rsp, new_dtype, self._shape)
1007999

10081000
if not self.dtype.shape:
1009-
self.log.debug("reshape arr to: {}".format(self._shape))
1001+
self.log.debug(f"reshape arr to: {self._shape}")
10101002
arr = numpy.reshape(arr, self._shape)
10111003
else:
10121004
# got JSON response
@@ -1024,11 +1016,8 @@ def __getitem__(self, args, new_dtype=None):
10241016
arr = numpy.empty((), dtype=new_dtype)
10251017
arr[()] = data
10261018
if selection.mshape is None:
1027-
self.log.info(
1028-
"return scalar selection of: {}, dtype: {}, shape: {}".format(
1029-
arr, arr.dtype, arr.shape
1030-
)
1031-
)
1019+
msg = f"return scalar selection of: {arr}, dtype: {arr.dtype}, shape: {arr.shape}"
1020+
self.log.info(msg)
10321021
val = arr[()]
10331022
if isinstance(val, str):
10341023
# h5py always returns bytes, so encode the str
@@ -1308,9 +1297,7 @@ def __getitem__(self, args, new_dtype=None):
13081297
points, dtype="u8"
13091298
) # must use unsigned 64-bit int
13101299
body = arr_points.tobytes()
1311-
self.log.info(
1312-
"point select binary request, num bytes: {}".format(len(body))
1313-
)
1300+
self.log.info(f"point select binary request, num bytes: {len(body)}")
13141301
else:
13151302
if delistify:
13161303
self.log.info("delistifying point selection")
@@ -1324,7 +1311,7 @@ def __getitem__(self, args, new_dtype=None):
13241311
else:
13251312
# can just assign
13261313
body["points"] = points
1327-
self.log.info("sending point selection request: {}".format(body))
1314+
self.log.info(f"sending point selection request: {body}")
13281315
rsp = self.POST(req, format=format, body=body)
13291316
if type(rsp) in (bytes, bytearray):
13301317
if len(rsp) // mtype.itemsize != selection.mshape[0]:
@@ -1337,18 +1324,14 @@ def __getitem__(self, args, new_dtype=None):
13371324
else:
13381325
data = rsp["value"]
13391326
if len(data) != selection.mshape[0]:
1340-
raise IOError(
1341-
"Expected {} elements, but got {}".format(
1342-
selection.mshape[0], len(data)
1343-
)
1344-
)
1345-
1327+
msg = f"Expected {selection.mshape[0]} elements, but got {len(data)}"
1328+
raise IOError(msg)
13461329
arr = numpy.asarray(data, dtype=mtype, order="C")
13471330

13481331
else:
13491332
raise ValueError("selection type not supported")
13501333

1351-
self.log.info("got arr: {}, cleaning up shape!".format(arr.shape))
1334+
self.log.info(f"got arr: {arr.shape}, cleaning up shape!")
13521335
# Patch up the output for NumPy
13531336
if len(names) == 1:
13541337
arr = arr[names[0]] # Single-field recarray convention
@@ -1368,7 +1351,7 @@ def __setitem__(self, args, val):
13681351
(slices and integers). For advanced indexing, the shapes must
13691352
match.
13701353
"""
1371-
self.log.info("Dataset __setitem__, args: {}".format(args))
1354+
self.log.info(f"Dataset __setitem__, args: {args}")
13721355
use_base64 = True # may need to set this to false below for some types
13731356

13741357
args = args if isinstance(args, tuple) else (args,)
@@ -1378,7 +1361,7 @@ def __setitem__(self, args, val):
13781361
self.log.debug(
13791362
f"val dtype: {val.dtype}, shape: {val.shape} metadata: {val.dtype.metadata}"
13801363
)
1381-
if numpy.product(val.shape) == 0:
1364+
if numpy.prod(val.shape) == 0:
13821365
self.log.info("no elements in numpy array, skipping write")
13831366
except AttributeError:
13841367
self.log.debug("val not ndarray")
@@ -1428,7 +1411,7 @@ def __setitem__(self, args, val):
14281411
i
14291412
for i in val.reshape(
14301413
(
1431-
numpy.product(val.shape[:-1], dtype=numpy.ulonglong),
1414+
numpy.prod(val.shape[:-1], dtype=numpy.ulonglong),
14321415
val.shape[-1],
14331416
)
14341417
)
@@ -1480,7 +1463,7 @@ def __setitem__(self, args, val):
14801463
# TBD - need to handle cases where the type shape is different
14811464
self.log.debug("got numpy array")
14821465
if val.dtype != self.dtype and val.dtype.shape == self.dtype.shape:
1483-
self.log.info("converting {} to {}".format(val.dtype, self.dtype))
1466+
self.log.info(f"converting {val.dtype} to {self.dtype}")
14841467
# convert array
14851468
tmp = numpy.empty(val.shape, dtype=self.dtype)
14861469
tmp[...] = val[...]
@@ -1584,15 +1567,13 @@ def __setitem__(self, args, val):
15841567
data = val.tobytes()
15851568
data = base64.b64encode(data)
15861569
data = data.decode("ascii")
1587-
self.log.debug("data: {}".format(data))
15881570
body["value_base64"] = data
1589-
self.log.debug("writing base64 data, {} bytes".format(len(data)))
1571+
self.log.debug(f"writing base64 data, {len(data)} bytes")
15901572
else:
15911573
if type(val) is not list:
15921574
val = val.tolist()
15931575
val = _decode(val)
1594-
self.log.debug("writing json data, {} elements".format(len(val)))
1595-
self.log.debug("data: {}".format(val))
1576+
self.log.debug(f"writing json data, {len(val)} elements")
15961577
body["value"] = val
15971578

15981579
if selection.select_type != sel.H5S_SELECT_ALL:
@@ -1702,7 +1683,7 @@ def __array__(self, dtype=None):
17021683
arr = numpy.empty(self._shape, dtype=self.dtype if dtype is None else dtype)
17031684

17041685
# Special case for (0,)*-shape datasets
1705-
if self._shape is None or numpy.product(self._shape) == 0:
1686+
if self._shape is None or numpy.prod(self._shape) == 0:
17061687
return arr
17071688

17081689
self.read_direct(arr)

h5pyd/_hl/filters.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ def guess_chunk(shape, maxshape, typesize):
306306

307307
# Determine the optimal chunk size in bytes using a PyTables expression.
308308
# This is kept as a float.
309-
dset_size = np.product(chunks) * typesize
309+
dset_size = np.prod(chunks) * typesize
310310
target_size = CHUNK_BASE * (2 ** np.log10(dset_size / (1024.0 * 1024)))
311311

312312
if target_size > CHUNK_MAX:
@@ -321,15 +321,15 @@ def guess_chunk(shape, maxshape, typesize):
321321
# 1b. We're within 50% of the target chunk size, AND
322322
# 2. The chunk is smaller than the maximum chunk size
323323

324-
chunk_bytes = np.product(chunks) * typesize
324+
chunk_bytes = np.prod(chunks) * typesize
325325

326326
if (
327327
chunk_bytes < target_size
328328
or abs(chunk_bytes - target_size) / target_size < 0.5
329329
) and chunk_bytes < CHUNK_MAX:
330330
break
331331

332-
if np.product(chunks) == 1:
332+
if np.prod(chunks) == 1:
333333
break # Element size larger than CHUNK_MAX
334334

335335
chunks[idx % ndims] = np.ceil(chunks[idx % ndims] / 2.0)

h5pyd/_hl/group.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -651,14 +651,15 @@ def __getitem__(self, name):
651651
# (and hence the httpconn socket won't be closed)
652652
from .files import File
653653
external_domain = link_json['h5domain']
654-
if not op.isabs(external_domain):
654+
if not external_domain.startswith("hdf5://") and not op.isabs(external_domain):
655655
current_domain = self._id.http_conn.domain
656656
external_domain = op.join(op.dirname(current_domain), external_domain)
657657
external_domain = op.normpath(external_domain)
658658
try:
659659
endpoint = self.id.http_conn.endpoint
660660
username = self.id.http_conn.username
661661
password = self.id.http_conn.password
662+
print(external_domain)
662663
f = File(external_domain, endpoint=endpoint, username=username, password=password, mode='r')
663664
except IOError:
664665
# unable to find external link

0 commit comments

Comments
 (0)