Skip to content

Commit 0d26602

Browse files
Merge pull request #128 from JuliaML/cl/load
lazy module loading
2 parents e3cc061 + 53a6ab7 commit 0d26602

35 files changed

+349
-1306
lines changed

Project.toml

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
name = "MLDatasets"
22
uuid = "eb30cadb-4394-5ae3-aed4-317e484a6458"
3-
version = "0.6.0"
3+
version = "0.7.0"
44

55
[deps]
6-
BinDeps = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
76
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
8-
ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
97
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
108
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
119
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
@@ -14,9 +12,10 @@ FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
1412
GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
1513
Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
1614
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
17-
ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
15+
ImageShow = "4e3cecfd-b093-5904-9786-8bbb286a6a31"
1816
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
1917
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
18+
LazyModules = "8cdb02fc-e678-4876-92c5-9defec4f444e"
2019
MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
2120
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
2221
NPZ = "15e1cf62-19b3-5cfa-8e77-841668bca605"
@@ -26,29 +25,29 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
2625
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
2726

2827
[compat]
29-
BinDeps = "1"
3028
CSV = "0.10.2"
31-
ColorTypes = "0.11"
3229
DataDeps = "0.7"
3330
DataFrames = "1.3"
34-
FileIO = "1.13"
31+
FileIO = "1.14"
3532
FixedPointNumbers = "0.8"
3633
GZip = "0.5"
3734
Glob = "1.3"
3835
HDF5 = "0.16.2"
39-
ImageCore = "0.9"
36+
ImageShow = "0.3"
4037
JLD2 = "0.4.21"
4138
JSON3 = "1"
39+
LazyModules = "0.3"
4240
MAT = "0.10"
4341
MLUtils = "0.2.0"
44-
Pickle = "0.3"
4542
NPZ = "0.4.1"
43+
Pickle = "0.3"
4644
Requires = "1"
4745
Tables = "1.6"
4846
julia = "1.6"
4947

5048
[extras]
49+
ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
5150
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
5251

5352
[targets]
54-
test = ["Test"]
53+
test = ["Test", "ColorTypes"]

docs/make.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ makedocs(
2828
"Text" => "datasets/text.md",
2929
"Vision" => "datasets/vision.md",
3030
],
31-
"Creating Datasets" => Any["containers/overview.md"],
31+
"Creating Datasets" => Any["containers/overview.md"], # still experimental
3232
"LICENSE.md",
3333
],
3434
strict = true,

docs/src/containers/overview.md

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,16 @@ MLDatasets.jl contains several reusable data containers for accessing datasets i
44

55
```@docs
66
FileDataset
7-
TableDataset
8-
HDF5Dataset
9-
Base.close(::HDF5Dataset)
10-
JLD2Dataset
11-
Base.close(::JLD2Dataset)
127
CachedDataset
138
MLDatasets.make_cache
149
```
10+
11+
<!--
12+
# TODO add back to docs when included again in the pkg
13+
HDF5Dataset
14+
Base.close(::HDF5Dataset)
15+
TableDataset
16+
JLD2Dataset
17+
Base.close(::JLD2Dataset)
18+
-->
19+

docs/src/index.md

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ Pkg.add("MLDatasets")
2525

2626
Datasets are grouped into different categories. Click on the links below for a full list of datasets available in each category.
2727

28-
- [Graphs Datasets](@ref) - datasets with an underlying graph structure: Cora, PubMed, CiteSeer, ...
28+
- [Graph Datasets](@ref) - datasets with an underlying graph structure: Cora, PubMed, CiteSeer, ...
2929
- [Miscellaneuous Datasets](@ref) - datasets that do not fall into any of the other categories: Iris, BostonHousing, ...
3030
- [Text Datasets](@ref) - datasets for language models.
3131
- [Vision Datasets](@ref) - vision related datasets such as MNIST, CIFAR10, CIFAR100, ...
@@ -71,6 +71,8 @@ julia> summary(X_train)
7171
Input features are commonly denoted by `features`, while classification labels and regression targets are denoted by `targets`.
7272

7373
```julia-repl
74+
julia> using MLDatasets, DataFrames
75+
7476
julia> iris = Iris()
7577
dataset Iris:
7678
metadata => Dict{String, Any} with 4 entries
@@ -136,6 +138,35 @@ julia> iris.targets
136138
MLDatasets.jl guarantees compatibility with the [getobs](https://juliaml.github.io/MLUtils.jl/dev/api/#MLUtils.getobs) and [numobs](https://juliaml.github.io/MLUtils.jl/dev/api/#MLUtils.numobs) interface defined in [MLUtils.jl](https://github.com/JuliaML/MLUtils.jl).
137139
In practice, applying `getobs` and `numobs` on datasets is equivalent to applying indexing and `length`.
138140

141+
## Conditional module loading
142+
143+
MLDatasets.jl relies on many different packages in order to load and process the diverse types of datasets it supports. Most likely, any single user of the library will use a limited subset of these functionalities.
144+
In order to reduce the time taken by `using MLDatasets` in users' code,
145+
we use a [lazy import system](https://github.com/johnnychen94/LazyModules.jl) that defers the import of packages inside MLDatasets.jl as much as possible.
146+
For some of the packages, some manual intervention is needed from the user.
147+
As an example, the following code will produce an error:
148+
149+
```julia-repl
150+
julia> using MLDatasets
151+
152+
julia> MNIST(); # fine, MNIST doesn't require DataFrames
153+
154+
julia> Iris() # ERROR: Add `import DataFrames` or `using DataFrames` to your code to unlock this functionality.
155+
```
156+
157+
We can easily fix the error with an additional import as recommended by the error message:
158+
159+
```julia-repl
160+
julia> using MLDatasets, DataFrames
161+
162+
julia> Iris()
163+
dataset Iris:
164+
metadata => Dict{String, Any} with 4 entries
165+
features => 150×4 DataFrame
166+
targets => 150×1 DataFrame
167+
dataframe => 150×5 DataFrame
168+
```
169+
139170
## Download location
140171

141172
MLDatasets.jl is built on top of the package

src/MLDatasets.jl

Lines changed: 37 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,28 +2,34 @@ module MLDatasets
22

33
using FixedPointNumbers
44
using SparseArrays
5-
using DataFrames, Tables
6-
using Glob
7-
import ImageCore
8-
using ColorTypes
5+
using Tables
6+
using DataDeps
97
import MLUtils
108
using MLUtils: getobs, numobs, AbstractDataContainer
11-
12-
### I/O imports
13-
import NPZ
14-
import Pickle
15-
using MAT: matopen, matread
16-
import CSV
17-
using HDF5
18-
using JLD2
19-
import JSON3
9+
using Glob
2010
using DelimitedFiles: readdlm
21-
##########
11+
using FileIO
12+
using LazyModules: @lazy
2213

23-
export getobs, numobs
24-
export convert2image
14+
include("require.jl") # export @require
15+
16+
# Use `@lazy import SomePkg` whenever the returned types are not its own types,
17+
# since for methods applied on the returned types we would otherwise encounter world-age issues
18+
# (see discussion in https://github.com/JuliaML/MLDatasets.jl/pull/128).
19+
# In the other case, use `@require import SomePkg` to force
20+
# the user to manually import it.
2521

22+
@require import JSON3="0f8b85d8-7281-11e9-16c2-39a750bddbf1"
23+
@require import DataFrames="a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
24+
@require import ImageShow="4e3cecfd-b093-5904-9786-8bbb286a6a31"
25+
# @lazy import NPZ # lazy imported by FileIO
26+
@lazy import Pickle="fbb45041-c46e-462f-888f-7c521cafbc2c"
27+
@lazy import MAT="23992714-dd62-5051-b70f-ba57cb901cac"
28+
@lazy import CSV="336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
29+
@lazy import HDF5="f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
30+
# @lazy import JLD2
2631

32+
export getobs, numobs # From MLUtils.jl
2733

2834
include("abstract_datasets.jl")
2935
# export AbstractDataset,
@@ -33,22 +39,26 @@ include("utils.jl")
3339
export convert2image
3440

3541
include("io.jl")
36-
# export read_csv, read_npy
42+
# export read_csv, read_npy, ...
3743

3844
include("download.jl")
3945

4046
include("containers/filedataset.jl")
4147
export FileDataset
42-
include("containers/tabledataset.jl")
43-
export TableDataset
44-
include("containers/hdf5dataset.jl")
45-
export HDF5Dataset
46-
include("containers/jld2dataset.jl")
47-
export JLD2Dataset
4848
include("containers/cacheddataset.jl")
4949
export CachedDataset
50+
# include("containers/tabledataset.jl")
51+
# export TableDataset
52+
53+
## TODO add back when compatible with `@lazy` or `@require`
54+
## which means that they cannot dispatch on types from JLD2 and HDF5
55+
# include("containers/hdf5dataset.jl")
56+
# export HDF5Dataset
57+
# include("containers/jld2dataset.jl")
58+
# export JLD2Dataset
59+
60+
## Misc.
5061

51-
# Misc.
5262
include("datasets/misc/boston_housing.jl")
5363
export BostonHousing
5464
include("datasets/misc/iris.jl")
@@ -59,7 +69,7 @@ include("datasets/misc/titanic.jl")
5969
export Titanic
6070

6171

62-
# Vision
72+
## Vision
6373

6474
include("datasets/vision/emnist.jl")
6575
export EMNIST
@@ -74,11 +84,11 @@ export CIFAR10
7484
include("datasets/vision/cifar100_reader/CIFAR100Reader.jl")
7585
include("datasets/vision/cifar100.jl")
7686
export CIFAR100
77-
7887
include("datasets/vision/svhn2.jl")
7988
export SVHN2
8089

81-
# Text
90+
## Text
91+
8292
include("datasets/text/ptblm.jl")
8393
export PTBLM
8494
include("datasets/text/udenglish.jl")

src/abstract_datasets.jl

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,12 @@ function leftalign(s::AbstractString, n::Int)
4242
end
4343
end
4444

45-
_summary(x) = x
45+
_summary(x) = Tables.istable(x) ? summary(x) : x
4646
_summary(x::Symbol) = ":$x"
47-
_summary(x::Union{Dict, AbstractArray, DataFrame}) = summary(x)
48-
_summary(x::Union{Tuple, NamedTuple}) = map(_summary, x)
47+
_summary(x::Dict) = summary(x)
48+
_summary(x::Tuple) = map(_summary, x)
49+
_summary(x::NamedTuple) = map(_summary, x)
50+
_summary(x::AbstractArray) = summary(x)
4951
_summary(x::BitVector) = "$(count(x))-trues BitVector"
5052

5153
"""
@@ -58,11 +60,18 @@ a `features` and a `targets` fields.
5860
abstract type SupervisedDataset <: AbstractDataset end
5961

6062

61-
Base.length(d::SupervisedDataset) = numobs((d.features, d.targets))
63+
Base.length(d::SupervisedDataset) = Tables.istable(d.features) ? numobs_table(d.features) :
64+
numobs((d.features, d.targets))
65+
6266

6367
# We return named tuples
64-
Base.getindex(d::SupervisedDataset, ::Colon) = getobs((; d.features, d.targets))
65-
Base.getindex(d::SupervisedDataset, i) = getobs((; d.features, d.targets), i)
68+
Base.getindex(d::SupervisedDataset, ::Colon) = Tables.istable(d.features) ?
69+
(features = d.features, targets=d.targets) :
70+
getobs((; d.features, d.targets))
71+
72+
Base.getindex(d::SupervisedDataset, i) = Tables.istable(d.features) ?
73+
(features = getobs_table(d.features, i), targets=getobs_table(d.targets, i)) :
74+
getobs((; d.features, d.targets), i)
6675

6776
"""
6877
UnsupervisedDataset <: AbstractDataset

src/containers/tabledataset.jl

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,15 @@ struct TableDataset{T} <: AbstractDataContainer
1212
# TableDatasets must implement the Tables.jl interface
1313
function TableDataset{T}(table::T) where {T}
1414
Tables.istable(table) ||
15-
throw(ArgumentError("TableDatasets must implement the Tabels.jl interface"))
15+
throw(ArgumentError("The input must implement the Tables.jl interface"))
1616

1717
new{T}(table)
1818
end
1919
end
2020

2121
TableDataset(table::T) where {T} = TableDataset{T}(table)
22-
TableDataset(path::AbstractString) = TableDataset(DataFrame(CSV.File(path)))
22+
TableDataset(path::AbstractString) = TableDataset(read_csv(path))
23+
2324

2425
# slow accesses based on Tables.jl
2526
_getobs_row(x, i) = first(Iterators.peel(Iterators.drop(x, i - 1)))
@@ -29,37 +30,44 @@ function _getobs_column(x, i)
2930

3031
return NamedTuple{colnames}(rowvals)
3132
end
32-
function Base.getindex(dataset::TableDataset, i)
33-
if Tables.rowaccess(dataset.table)
34-
return _getobs_row(Tables.rows(dataset.table), i)
35-
elseif Tables.columnaccess(dataset.table)
36-
return _getobs_column(dataset.table, i)
33+
34+
function getobs_table(table, i)
35+
if Tables.rowaccess(table)
36+
return _getobs_row(Tables.rows(table), i)
37+
elseif Tables.columnaccess(table)
38+
return _getobs_column(table, i)
3739
else
3840
error("The Tables.jl implementation used should have either rowaccess or columnaccess.")
3941
end
4042
end
41-
function Base.length(dataset::TableDataset)
42-
if Tables.columnaccess(dataset.table)
43-
return length(Tables.getcolumn(dataset.table, 1))
44-
elseif Tables.rowaccess(dataset.table)
43+
44+
function numobs_table(table)
45+
if Tables.columnaccess(table)
46+
return length(Tables.getcolumn(table, 1))
47+
elseif Tables.rowaccess(table)
4548
# length might not be defined, but has to be for this to work.
46-
return length(Tables.rows(dataset.table))
49+
return length(Tables.rows(table))
4750
else
4851
error("The Tables.jl implementation used should have either rowaccess or columnaccess.")
4952
end
5053
end
5154

55+
Base.getindex(dataset::TableDataset, i) = getobs_table(dataset.table, i)
56+
Base.length(dataset::TableDataset) = numobs_table(dataset.table)
57+
58+
5259
# fast access for DataFrame
53-
Base.getindex(dataset::TableDataset{<:DataFrame}, i) = dataset.table[i, :]
54-
Base.length(dataset::TableDataset{<:DataFrame}) = nrow(dataset.table)
60+
# Base.getindex(dataset::TableDataset{<:DataFrame}, i) = dataset.table[i, :]
61+
# Base.length(dataset::TableDataset{<:DataFrame}) = nrow(dataset.table)
5562

5663
# fast access for CSV.File
57-
Base.getindex(dataset::TableDataset{<:CSV.File}, i) = dataset.table[i]
58-
Base.length(dataset::TableDataset{<:CSV.File}) = length(dataset.table)
64+
# Base.getindex(dataset::TableDataset{<:CSV.File}, i) = dataset.table[i]
65+
# Base.length(dataset::TableDataset{<:CSV.File}) = length(dataset.table)
5966

6067
## Tables.jl interface
6168

6269
Tables.istable(::TableDataset) = true
70+
6371
for fn in (:rowaccess, :rows, :columnaccess, :columns, :schema, :materializer)
6472
@eval Tables.$fn(dataset::TableDataset) = Tables.$fn(dataset.table)
6573
end

src/datasets/graphs/planetoid.jl

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -83,23 +83,12 @@ function read_planetoid_data(DEPNAME; dir=nothing, reverse_edges=true)
8383
return metadata, g
8484
end
8585

86-
function read_pickle_file(filename, name)
87-
out = Pickle.npyload(filename)
88-
if name == "graph"
89-
return out
90-
end
91-
if out isa SparseMatrixCSC
92-
return Matrix(out)
93-
end
94-
return out
95-
end
96-
9786
function read_planetoid_file(DEPNAME, name, dir)
9887
filename = datafile(DEPNAME, name, dir)
9988
if endswith(name, "test.index")
10089
out = 1 .+ vec(readdlm(filename, Int))
10190
else
102-
out = read_pickle_file(filename, name)
91+
out = read_pickle(filename)
10392
if out isa SparseMatrixCSC
10493
out = Matrix(out)
10594
end

0 commit comments

Comments
 (0)