Skip to content

Commit 4e6807f

Browse files
committed
Add HDF5 dataset
1 parent 757b706 commit 4e6807f

File tree

6 files changed

+87
-4
lines changed

6 files changed

+87
-4
lines changed

Project.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f"
1414
FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
1515
GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
1616
Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
17+
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
1718
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
1819
MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
1920
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
@@ -29,8 +30,8 @@ ColorTypes = "0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.10, 0.11"
2930
DataDeps = "0.3, 0.4, 0.5, 0.6, 0.7"
3031
DataFrames = "1.3"
3132
FixedPointNumbers = "0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
32-
Glob = "1.3"
3333
GZip = "0.5"
34+
Glob = "1.3"
3435
ImageCore = "0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
3536
JSON3 = "1"
3637
MAT = "0.7, 0.8, 0.9, 0.10"

src/MLDatasets.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@ using DataFrames, CSV, Tables
1111
using FilePathsBase
1212
using FilePathsBase: AbstractPath
1313
using Glob
14+
using HDF5
1415

1516
import MLUtils
1617
using MLUtils: getobs, numobs, AbstractDataContainer
1718

18-
export FileDataset, TableDataset
19+
export FileDataset, TableDataset, HDF5Dataset
1920
export getobs, numobs
2021

2122
# Julia 1.0 compatibility
@@ -48,6 +49,7 @@ include("download.jl")
4849

4950
include("containers/filedataset.jl")
5051
include("containers/tabledataset.jl")
52+
include("containers/hdf5dataset.jl")
5153

5254
# Misc.
5355
include("BostonHousing/BostonHousing.jl")

src/containers/filedataset.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ Wrap a set of file `paths` as a dataset (traversed in the same order as `paths`)
3939
Alternatively, specify a `dir` and collect all paths that match a glob `pattern`
4040
(recursively globbing by `depth`). The glob order determines the traversal order.
4141
"""
42-
struct FileDataset{T} <: AbstractDataContainer
43-
paths::T
42+
struct FileDataset{T<:Union{AbstractPath, AbstractString}} <: AbstractDataContainer
43+
paths::Vector{T}
4444
end
4545

4646
# Convenience constructor: recursively glob `pattern` under `dir` (up to `depth`
# levels) and wrap the resulting paths.
function FileDataset(dir, pattern = "*", depth = 4)
    matches = rglob(pattern, string(dir), depth)
    return FileDataset(matches)
end

src/containers/hdf5dataset.jl

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Return `true` when every non-scalar shape in `shapes` agrees on its last
# (observation) dimension. Scalar datasets (shape `()`) carry no observation
# dimension and are ignored. If there are no array shapes at all, there is
# nothing that can mismatch, so the check passes vacuously.
function _check_hdf5_shapes(shapes)
    nobs = map(last, filter(!isempty, shapes))

    # Guard the empty case: `first(nobs)` would throw on an empty collection.
    isempty(nobs) && return true

    return all(==(first(nobs)), nobs)
end
6+
"""
    HDF5Dataset(file::Union{AbstractString, AbstractPath}, paths)
    HDF5Dataset(fid::HDF5.File, paths::Vector{HDF5.Dataset})
    HDF5Dataset(fid::HDF5.File, paths::Vector{<:AbstractString})
    HDF5Dataset(fid::HDF5.File, paths::Vector{HDF5.Dataset}, shapes)

Wrap several HDF5 datasets (`paths`) as a single dataset container.
Each dataset `p` in `paths` should be accessible as `fid[p]`.
See [`close(::HDF5Dataset)`](@ref) for closing the underlying HDF5 file pointer.

For array datasets, the last dimension is assumed to be the observation dimension.
For scalar datasets, the stored value is returned by `getobs` for any index.

`shapes` must contain one shape tuple per entry of `paths`. An `ArgumentError`
is thrown when the array datasets disagree on the number of observations.
"""
struct HDF5Dataset <: AbstractDataContainer
    fid::HDF5.File
    paths::Vector{HDF5.Dataset}
    shapes::Vector{Tuple}

    function HDF5Dataset(fid::HDF5.File, paths::Vector{HDF5.Dataset}, shapes::Vector)
        # `getobs` walks `paths` and `shapes` in lockstep, so they must pair up.
        length(paths) == length(shapes) ||
            throw(DimensionMismatch("Expected one shape per HDF5 dataset, got $(length(paths)) datasets and $(length(shapes)) shapes."))
        _check_hdf5_shapes(shapes) ||
            throw(ArgumentError("Cannot create HDF5Dataset for datasets with mismatch number of observations."))

        return new(fid, paths, shapes)
    end
end
32+
33+
# Derive each dataset's shape from the dataset handle itself.
function HDF5Dataset(fid::HDF5.File, paths::Vector{HDF5.Dataset})
    shapes = map(size, paths)
    return HDF5Dataset(fid, paths, shapes)
end
35+
# Resolve each path string to the object stored under that name in `fid`.
function HDF5Dataset(fid::HDF5.File, paths::Vector{<:AbstractString})
    handles = map(name -> fid[name], paths)
    return HDF5Dataset(fid, handles)
end
37+
# Open `file` read-only and wrap the requested datasets. The returned container
# owns the file handle; release it with `close(dataset)`.
function HDF5Dataset(file::Union{AbstractString, AbstractPath}, paths)
    fid = h5open(file, "r")
    try
        return HDF5Dataset(fid, paths)
    catch
        # Construction failed (e.g. a missing dataset or mismatched observation
        # counts). Nothing else references `fid`, so close it before propagating
        # the error instead of leaking the handle.
        close(fid)
        rethrow()
    end
end
39+
40+
# Return observation(s) `i` from every wrapped dataset as a tuple, in the same
# order as `dataset.paths`.
function MLUtils.getobs(dataset::HDF5Dataset, i)
    obs = map(dataset.paths, dataset.shapes) do dset, dims
        # Scalar datasets have no observation dimension: return the stored value.
        isempty(dims) && return read(dset)

        # Take everything along the leading dimensions and index the last one.
        leading = map(n -> 1:n, dims[1:(end - 1)])
        return dset[leading..., i]
    end

    return Tuple(obs)
end
48+
# Number of observations: the size of the last dimension of the first
# non-scalar dataset.
function MLUtils.numobs(dataset::HDF5Dataset)
    # `findfirst` avoids allocating a filtered copy of `shapes`.
    idx = findfirst(!isempty, dataset.shapes)
    idx === nothing &&
        throw(ArgumentError("HDF5Dataset contains no array datasets, so the number of observations is undefined."))

    return last(dataset.shapes[idx])
end
49+
50+
"""
    close(dataset::HDF5Dataset)

Close the underlying HDF5 file pointer for `dataset`.
"""
function Base.close(dataset::HDF5Dataset)
    return close(dataset.fid)
end

test/containers/hdf5dataset.jl

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Write a small HDF5 fixture ("test.h5") and return the dataset paths together
# with the arrays stored under them.
function setup_hdf5dataset_test()
    datasets = [
        ("d1", rand(2, 10)),
        ("g1/d1", rand(10)),
        # these are broken at HDF5 level
        # ("g1/d2", string.('a':'j')),
        # ("g2/g1/d1", "test")
    ]

    # The do-block form closes the file even if one of the writes throws.
    h5open("test.h5", "w") do fid
        for (path, data) in datasets
            fid[path] = data
        end
    end

    return first.(datasets), last.(datasets)
end
18+
19+
@testset "HDF5Dataset" begin
    paths, datas = setup_hdf5dataset_test()
    dataset = HDF5Dataset("test.h5", paths)
    try
        for i in 1:10
            data = Tuple(map(x -> getobs(x, i), datas))
            @test getobs(dataset, i) == data
        end
        @test numobs(dataset) == 10
    finally
        # Release the file handle before deleting the fixture; an open handle
        # leaks and can make `rm` fail on platforms that lock open files.
        close(dataset)
        rm("test.h5"; force = true)
    end
end

test/runtests.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ using ImageCore
44
using DataDeps
55
using MLUtils: getobs, numobs
66
using DataFrames, CSV, Tables
7+
using HDF5
78

89
ENV["DATADEPS_ALWAYS_ACCEPT"] = true
910

@@ -30,6 +31,7 @@ dataset_tests = [
3031
container_tests = [
3132
"containers/filedataset.jl",
3233
"containers/tabledataset.jl",
34+
"containers/hdf5dataset.jl",
3335
]
3436

3537
@testset "Datasets" for t in dataset_tests

0 commit comments

Comments
 (0)