Skip to content

Commit 4440c04

Browse files
lazy module loading
1 parent e3cc061 commit 4440c04

24 files changed

+159 -96 lines changed

Project.toml

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@ uuid = "eb30cadb-4394-5ae3-aed4-317e484a6458"
33
version = "0.6.0"
44

55
[deps]
6-
BinDeps = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
76
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
87
ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
98
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
@@ -26,12 +25,11 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
2625
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
2726

2827
[compat]
29-
BinDeps = "1"
3028
CSV = "0.10.2"
3129
ColorTypes = "0.11"
3230
DataDeps = "0.7"
3331
DataFrames = "1.3"
34-
FileIO = "1.13"
32+
FileIO = "1.14"
3533
FixedPointNumbers = "0.8"
3634
GZip = "0.5"
3735
Glob = "1.3"
@@ -41,8 +39,8 @@ JLD2 = "0.4.21"
4139
JSON3 = "1"
4240
MAT = "0.10"
4341
MLUtils = "0.2.0"
44-
Pickle = "0.3"
4542
NPZ = "0.4.1"
43+
Pickle = "0.3"
4644
Requires = "1"
4745
Tables = "1.6"
4846
julia = "1.6"

docs/make.jl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ makedocs(
2828
"Text" => "datasets/text.md",
2929
"Vision" => "datasets/vision.md",
3030
],
31-
"Creating Datasets" => Any["containers/overview.md"],
31+
# "Creating Datasets" => Any["containers/overview.md"], # still experimental
3232
"LICENSE.md",
3333
],
3434
strict = true,

src/MLDatasets.jl

Lines changed: 26 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -2,21 +2,24 @@ module MLDatasets
22

33
using FixedPointNumbers
44
using SparseArrays
5-
using DataFrames, Tables
5+
using Tables
66
using Glob
7-
import ImageCore
8-
using ColorTypes
7+
# using DataFrames
8+
# import ImageCore
9+
using DataDeps
910
import MLUtils
1011
using MLUtils: getobs, numobs, AbstractDataContainer
12+
using ColorTypes
1113

1214
### I/O imports
13-
import NPZ
14-
import Pickle
15-
using MAT: matopen, matread
16-
import CSV
17-
using HDF5
18-
using JLD2
19-
import JSON3
15+
# import NPZ
16+
# import Pickle
17+
# using MAT: matopen, matread
18+
using FileIO
19+
# import CSV
20+
# using HDF5
21+
# using JLD2
22+
# import JSON3
2023
using DelimitedFiles: readdlm
2124
##########
2225

@@ -29,24 +32,26 @@ include("abstract_datasets.jl")
2932
# export AbstractDataset,
3033
# SupervisedDataset
3134

35+
include("imports.jl")
3236
include("utils.jl")
3337
export convert2image
3438

3539
include("io.jl")
36-
# export read_csv, read_npy
40+
# export read_csv, read_npy, ...
3741

3842
include("download.jl")
3943

40-
include("containers/filedataset.jl")
41-
export FileDataset
42-
include("containers/tabledataset.jl")
43-
export TableDataset
44-
include("containers/hdf5dataset.jl")
45-
export HDF5Dataset
46-
include("containers/jld2dataset.jl")
47-
export JLD2Dataset
48-
include("containers/cacheddataset.jl")
49-
export CachedDataset
44+
### API to be revisited with conditional module loading
45+
# include("containers/filedataset.jl")
46+
# export FileDataset
47+
# include("containers/tabledataset.jl")
48+
# export TableDataset
49+
# include("containers/hdf5dataset.jl")
50+
# export HDF5Dataset
51+
# # include("containers/jld2dataset.jl")
52+
# # export JLD2Dataset
53+
# include("containers/cacheddataset.jl")
54+
# export CachedDataset
5055

5156
# Misc.
5257
include("datasets/misc/boston_housing.jl")

src/abstract_datasets.jl

Lines changed: 15 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -42,10 +42,12 @@ function leftalign(s::AbstractString, n::Int)
4242
end
4343
end
4444

45-
_summary(x) = x
45+
_summary(x) = Tables.istable(x) ? summary(x) : x
4646
_summary(x::Symbol) = ":$x"
47-
_summary(x::Union{Dict, AbstractArray, DataFrame}) = summary(x)
48-
_summary(x::Union{Tuple, NamedTuple}) = map(_summary, x)
47+
_summary(x::Dict) = summary(x)
48+
_summary(x::Tuple) = map(_summary, x)
49+
_summary(x::NamedTuple) = map(_summary, x)
50+
_summary(x::AbstractArray) = summary(x)
4951
_summary(x::BitVector) = "$(count(x))-trues BitVector"
5052

5153
"""
@@ -58,11 +60,18 @@ a `features` and a `targets` fields.
5860
abstract type SupervisedDataset <: AbstractDataset end
5961

6062

61-
Base.length(d::SupervisedDataset) = numobs((d.features, d.targets))
63+
Base.length(d::SupervisedDataset) = Tables.istable(d.features) ? numobs_table(d.features) :
64+
numobs((d.features, d.targets))
65+
6266

6367
# We return named tuples
64-
Base.getindex(d::SupervisedDataset, ::Colon) = getobs((; d.features, d.targets))
65-
Base.getindex(d::SupervisedDataset, i) = getobs((; d.features, d.targets), i)
68+
Base.getindex(d::SupervisedDataset, ::Colon) = Tables.istable(d.features) ?
69+
(features = d.features, targets=d.targets) :
70+
getobs((; d.features, d.targets))
71+
72+
Base.getindex(d::SupervisedDataset, i) = Tables.istable(d.features) ?
73+
(features = getobs_table(d.features, i), targets=getobs_table(d.targets, i)) :
74+
getobs((; d.features, d.targets), i)
6675

6776
"""
6877
UnsupervisedDataset <: AbstractDataset

src/containers/tabledataset.jl

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ struct TableDataset{T} <: AbstractDataContainer
1919
end
2020

2121
TableDataset(table::T) where {T} = TableDataset{T}(table)
22-
TableDataset(path::AbstractString) = TableDataset(DataFrame(CSV.File(path)))
22+
# TableDataset(path::AbstractString) = TableDataset(DataFrame(CSV.File(path)))
2323

2424
# slow accesses based on Tables.jl
2525
_getobs_row(x, i) = first(Iterators.peel(Iterators.drop(x, i - 1)))
@@ -54,8 +54,8 @@ Base.getindex(dataset::TableDataset{<:DataFrame}, i) = dataset.table[i, :]
5454
Base.length(dataset::TableDataset{<:DataFrame}) = nrow(dataset.table)
5555

5656
# fast access for CSV.File
57-
Base.getindex(dataset::TableDataset{<:CSV.File}, i) = dataset.table[i]
58-
Base.length(dataset::TableDataset{<:CSV.File}) = length(dataset.table)
57+
# Base.getindex(dataset::TableDataset{<:CSV.File}, i) = dataset.table[i]
58+
# Base.length(dataset::TableDataset{<:CSV.File}) = length(dataset.table)
5959

6060
## Tables.jl interface
6161

src/datasets/graphs/planetoid.jl

Lines changed: 1 addition & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -83,23 +83,12 @@ function read_planetoid_data(DEPNAME; dir=nothing, reverse_edges=true)
8383
return metadata, g
8484
end
8585

86-
function read_pickle_file(filename, name)
87-
out = Pickle.npyload(filename)
88-
if name == "graph"
89-
return out
90-
end
91-
if out isa SparseMatrixCSC
92-
return Matrix(out)
93-
end
94-
return out
95-
end
96-
9786
function read_planetoid_file(DEPNAME, name, dir)
9887
filename = datafile(DEPNAME, name, dir)
9988
if endswith(name, "test.index")
10089
out = 1 .+ vec(readdlm(filename, Int))
10190
else
102-
out = read_pickle_file(filename, name)
91+
out = read_pickle(filename)
10392
if out isa SparseMatrixCSC
10493
out = Matrix(out)
10594
end

src/datasets/graphs/reddit.jl

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -61,9 +61,9 @@ function Reddit(; full=true, dir=nothing)
6161
feat_path = datafile(DEPNAME, DATA[5], dir)
6262

6363
# Read the json files
64-
graph = open(JSON3.read, graph_json)
65-
class_map = open(JSON3.read, class_map_json)
66-
id_map = open(JSON3.read, id_map_json)
64+
graph = read_json(graph_json)
65+
class_map = read_json(class_map_json)
66+
id_map = read_json(id_map_json)
6767

6868
# Metadata
6969
directed = graph["directed"]

src/datasets/misc/boston_housing.jl

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -78,7 +78,8 @@ function BostonHousing(; as_df = true, dir = nothing)
7878
@assert dir === nothing "custom `dir` is not supported at the moment."
7979
path = joinpath(@__DIR__, "..", "..", "..", "data", "boston_housing.csv")
8080
df = read_csv(path)
81-
features = df[!, Not(:MEDV)]
81+
DFs = checked_import(idDataFrames)
82+
features = df[!, DFs.Not(:MEDV)]
8283
targets = df[!, [:MEDV]]
8384

8485
metadata = Dict{String, Any}()

src/datasets/misc/iris.jl

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -80,9 +80,10 @@ end
8080
function Iris(; dir = nothing, as_df = true)
8181
path = datafile("Iris", "iris.data", dir)
8282
df = read_csv(path, header=0)
83-
rename!(df, ["sepallength", "sepalwidth", "petallength", "petalwidth", "class"])
83+
DFs = checked_import(idDataFrames)
84+
DFs.rename!(df, ["sepallength", "sepalwidth", "petallength", "petalwidth", "class"])
8485

85-
features = df[!, Not(:class)]
86+
features = df[!, DFs.Not(:class)]
8687
targets = df[!, [:class]]
8788

8889
metadata = Dict{String, Any}()

src/datasets/misc/mutagenesis.jl

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -81,8 +81,8 @@ function Mutagenesis(split::Symbol; dir=nothing)
8181

8282
data_path = datafile(DEPNAME, DATA, dir)
8383
metadata_path = datafile(DEPNAME, METADATA, dir)
84-
samples = open(JSON3.read, data_path)
85-
metadata = open(JSON3.read, metadata_path)
84+
samples = read_json(data_path)
85+
metadata = read_json(metadata_path)
8686
labelkey = metadata["label"]
8787
targets = map(i -> i[labelkey], samples)
8888
features = map(x->delete!(copy(x), Symbol(labelkey)), samples)

0 commit comments

Comments
 (0)