Skip to content

Commit 87e1984

Browse files
authored
Merge pull request #8 from CarloLucibello/fashion
add outline for FashionMNIST dataset
2 parents 3bbc0ae + cedb9ad commit 87e1984

File tree

14 files changed

+668
-83
lines changed

14 files changed

+668
-83
lines changed

src/CIFAR10.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ module CIFAR10
33

44
using BinDeps
55

6-
const defdir = joinpath(Pkg.dir("MLDatasets"), "datasets/cifar10")
6+
const defdir = joinpath(Pkg.dir("MLDatasets"), "datasets", "cifar10")
77

88
function getdata(dir)
99
mkpath(dir)
@@ -25,7 +25,7 @@ function readdata(data::Vector{UInt8})
2525
end
2626

2727
function traindata(dir=defdir)
28-
files = ["$(dir)/cifar-10-batches-bin/data_batch_$(i).bin" for i=1:5]
28+
files = [joinpath(dir,"cifar-10-batches-bin","data_batch_$i.bin") for i=1:5]
2929
all(isfile, files) || getdata(dir)
3030
data = UInt8[]
3131
for file in files
@@ -35,7 +35,7 @@ function traindata(dir=defdir)
3535
end
3636

3737
function testdata(dir=defdir)
38-
file = "$(dir)/cifar-10-batches-bin/test_batch.bin"
38+
file = joinpath(dir,"cifar-10-batches-bin","test_batch.bin")
3939
isfile(file) || getdata(dir)
4040
readdata(open(read,file))
4141
end

src/CIFAR100.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ module CIFAR100
33

44
using BinDeps
55

6-
const defdir = joinpath(Pkg.dir("MLDatasets"), "datasets/cifar100")
6+
const defdir = joinpath(Pkg.dir("MLDatasets"), "datasets","cifar100")
77

88
function getdata(dir)
99
mkpath(dir)
@@ -25,13 +25,13 @@ function readdata(data::Vector{UInt8})
2525
end
2626

2727
function traindata(dir=defdir)
28-
file = joinpath(dir, "cifar-100-binary/train.bin")
28+
file = joinpath(dir, "cifar-100-binary","train.bin")
2929
isfile(file) || getdata(dir)
3030
readdata(open(read,file))
3131
end
3232

3333
function testdata(dir=defdir)
34-
file = joinpath(dir, "cifar-100-binary/test.bin")
34+
file = joinpath(dir, "cifar-100-binary","test.bin")
3535
isfile(file) || getdata(dir)
3636
readdata(open(read,file))
3737
end

src/FashionMNIST/FashionMNIST.jl

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
export FashionMNIST
2+
module FashionMNIST
3+
using ImageCore
4+
using ColorTypes
5+
6+
export
7+
8+
traintensor,
9+
testtensor,
10+
11+
trainlabels,
12+
testlabels,
13+
14+
traindata,
15+
testdata,
16+
17+
convert2image,
18+
convert2features,
19+
20+
download_helper
21+
22+
const DEFAULT_DIR = abspath(joinpath(dirname(@__FILE__), "..", "..", "datasets", "fashion_mnist"))
23+
24+
include("reader.jl")
25+
include("interface.jl")
26+
include(joinpath("..", "MNIST", "utils.jl"))
27+
end

src/FashionMNIST/interface.jl

Lines changed: 289 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,289 @@
1+
"""
2+
traintensor([indices]; [dir], [decimal=true]) -> Array{Float64}
3+
4+
Returns the FashionMNIST **training** images corresponding to the given
5+
`indices` as a multi-dimensional array.
6+
7+
The corresponding source file of the dataset is expected to be
8+
located in the specified directory `dir`. If `dir` is omitted it
9+
will default to `MLDatasets/datasets/fashion_mnist`. In the case the
10+
source files have not been downloaded yet, you can use
11+
`FashionMNIST.download_helper(dir)` to assist in the process.
12+
13+
```julia
14+
julia> FashionMNIST.traintensor(dir="/home/user/fashion_mnist")
15+
WARNING: The FashionMNIST file "train-images-idx3-ubyte.gz" was not found in "/home/user/fashion_mnist". You can download [...]
16+
```
17+
18+
The image(s) is/are returned in the native horizontal-major
19+
memory layout as a single floating point array. If `decimal=true`
20+
all values are scaled to be between `0.0` and `1.0`, otherwise
21+
the values will be between `0.0` and `255.0`.
22+
23+
If the parameter `indices` is omitted or an `AbstractVector`, the
24+
images are returned as a 3D array (i.e. a `Array{Float64,3}`), in
25+
which the first dimension corresponds to the pixel *rows* (x) of
26+
the image, the second dimension to the pixel *columns* (y) of the
27+
image, and the third dimension denotes the index of the image.
28+
29+
```julia
30+
julia> FashionMNIST.traintensor() # load all training images
31+
28×28×60000 Array{Float64,3}:
32+
[...]
33+
34+
julia> FashionMNIST.traintensor(1:3) # load first three training images
35+
28×28×3 Array{Float64,3}:
36+
[...]
37+
```
38+
39+
If `indices` is an `Integer`, the single image is returned as
40+
`Matrix{Float64}` in horizontal-major layout, which means that
41+
the first dimension denotes the pixel *rows* (x), and the second
42+
dimension denotes the pixel *columns* (y) of the image.
43+
44+
```julia
45+
julia> FashionMNIST.traintensor(1) # load first training image
46+
28×28 Array{Float64,2}:
47+
[...]
48+
```
49+
50+
As mentioned above, the images are returned in the native
51+
horizontal-major layout to preserve the original feature
52+
ordering. You can use the utility function
53+
[`convert2image`](@ref) to convert a FashionMNIST array into a
54+
vertical-major Julia image with the corrected color values.
55+
56+
```
57+
julia> FashionMNIST.convert2image(FashionMNIST.traintensor(1)) # convert to column-major colorant array
58+
28×28 Array{Gray{Float64},2}:
59+
[...]
60+
```
61+
"""
62+
function traintensor(args...; dir=DEFAULT_DIR, decimal=true)
63+
if decimal
64+
Reader.readtrainimages(dir, args...) ./ 255
65+
else
66+
convert(Array{Float64}, Reader.readtrainimages(dir, args...))
67+
end
68+
end
69+
70+
"""
71+
testtensor([indices]; [dir], [decimal=true]) -> Array{Float64}
72+
73+
Returns the FashionMNIST **test** images corresponding to the given
74+
`indices` as a multi-dimensional array.
75+
76+
The corresponding source file of the dataset is expected to be
77+
located in the specified directory `dir`. If `dir` is omitted it
78+
will default to `MLDatasets/datasets/fashion_mnist`. In the case the
79+
source files have not been downloaded yet, you can use
80+
`FashionMNIST.download_helper(dir)` to assist in the process.
81+
82+
```julia
83+
julia> FashionMNIST.testtensor(dir="/home/user/FashionMNIST")
84+
WARNING: The FashionMNIST file "t10k-images-idx3-ubyte.gz" was not found in "/home/user/FashionMNIST". You can download [...]
85+
```
86+
87+
The image(s) is/are returned in the native horizontal-major
88+
memory layout as a single floating point array. If `decimal=true`
89+
all values are scaled to be between `0.0` and `1.0`, otherwise
90+
the values will be between `0.0` and `255.0`.
91+
92+
If the parameter `indices` is omitted or an `AbstractVector`, the
93+
images are returned as a 3D array (i.e. a `Array{Float64,3}`), in
94+
which the first dimension corresponds to the pixel *rows* (x) of
95+
the image, the second dimension to the pixel *columns* (y) of the
96+
image, and the third dimension denotes the index of the image.
97+
98+
```julia
99+
julia> FashionMNIST.testtensor() # load all test images
100+
28×28×10000 Array{Float64,3}:
101+
[...]
102+
103+
julia> FashionMNIST.testtensor(1:3) # load first three test images
104+
28×28×3 Array{Float64,3}:
105+
[...]
106+
```
107+
108+
If `indices` is an `Integer`, the single image is returned as
109+
`Matrix{Float64}` in horizontal-major layout, which means that
110+
the first dimension denotes the pixel *rows* (x), and the second
111+
dimension denotes the pixel *columns* (y) of the image.
112+
113+
```julia
114+
julia> FashionMNIST.testtensor(1) # load first test image
115+
28×28 Array{Float64,2}:
116+
[...]
117+
```
118+
119+
As mentioned above, the images are returned in the native
120+
horizontal-major layout to preserve the original feature
121+
ordering. You can use the utility function
122+
[`convert2image`](@ref) to convert a FashionMNIST array into a
123+
vertical-major Julia image with the corrected color values.
124+
125+
```
126+
julia> FashionMNIST.convert2image(FashionMNIST.testtensor(1)) # convert to column-major colorant array
127+
28×28 Array{Gray{Float64},2}:
128+
[...]
129+
```
130+
"""
131+
function testtensor(args...; dir=DEFAULT_DIR, decimal=true)
132+
if decimal
133+
Reader.readtestimages(dir, args...) ./ 255
134+
else
135+
convert(Array{Float64}, Reader.readtestimages(dir, args...))
136+
end
137+
end
138+
139+
"""
140+
trainlabels([indices]; [dir])
141+
142+
Returns the FashionMNIST **trainset** labels corresponding to the given
143+
`indices` as an `Int` or `Vector{Int}`. The values of the labels
144+
denote the fashion class that they represent. If `indices` is omitted,
145+
all labels are returned.
146+
147+
```julia
148+
julia> FashionMNIST.trainlabels() # full training set
149+
60000-element Array{Int64,1}:
150+
5
151+
0
152+
153+
6
154+
8
155+
156+
julia> FashionMNIST.trainlabels(1:3) # first three labels
157+
3-element Array{Int64,1}:
158+
5
159+
0
160+
4
161+
162+
julia> FashionMNIST.trainlabels(1) # first label
163+
5
164+
```
165+
166+
The corresponding source file of the dataset is expected to be
167+
located in the specified directory `dir`. If `dir` is omitted it
168+
will default to `MLDatasets/datasets/fashion_mnist`. In the case the
169+
source files have not been downloaded yet, you can use
170+
`FashionMNIST.download_helper(dir)` to assist in the process.
171+
172+
```julia
173+
julia> FashionMNIST.trainlabels(dir="/home/user/fashion_mnist")
174+
WARNING: The FashionMNIST file "train-labels-idx1-ubyte.gz" was not found in "/home/user/fashion_mnist". You can download [...]
175+
```
176+
"""
177+
trainlabels(args...; dir=DEFAULT_DIR) = Vector{Int}(Reader.readtrainlabels(dir, args...))
178+
trainlabels(index::Integer; dir=DEFAULT_DIR) = Int(Reader.readtrainlabels(dir, index))
179+
180+
"""
181+
testlabels([indices]; [dir])
182+
183+
Returns the FashionMNIST **testset** labels corresponding to the given
184+
`indices` as an `Int` or `Vector{Int}`. The values of the labels
185+
denote the fashion class that they represent. If `indices` is omitted,
186+
all labels are returned.
187+
188+
```julia
189+
julia> FashionMNIST.testlabels() # full test set
190+
10000-element Array{Int64,1}:
191+
7
192+
2
193+
194+
5
195+
6
196+
197+
julia> FashionMNIST.testlabels(1:3) # first three labels
198+
3-element Array{Int64,1}:
199+
7
200+
2
201+
1
202+
203+
julia> FashionMNIST.testlabels(1) # first label
204+
7
205+
```
206+
207+
The corresponding source file of the dataset is expected to be
208+
located in the specified directory `dir`. If `dir` is omitted it
209+
will default to `MLDatasets/datasets/fashion_mnist`. In the case the
210+
source files have not been downloaded yet, you can use
211+
`FashionMNIST.download_helper(dir)` to assist in the process.
212+
213+
```julia
214+
julia> FashionMNIST.testlabels(dir="/home/user/fashion_mnist")
215+
WARNING: The FashionMNIST file "t10k-labels-idx1-ubyte.gz" was not found in "/home/user/fashion_mnist". You can download [...]
216+
```
217+
"""
218+
testlabels(args...; dir=DEFAULT_DIR) = Vector{Int}(Reader.readtestlabels(dir, args...))
219+
testlabels(index::Integer; dir=DEFAULT_DIR) = Int(Reader.readtestlabels(dir, index))
220+
221+
"""
222+
traindata([indices]; [dir], [decimal=true]) -> Tuple
223+
224+
Returns the FashionMNIST **trainingset** corresponding to the given
225+
`indices` as a two-element tuple. If `indices` is omitted the
226+
full trainingset is returned. The first element of the return
227+
value will be the images as a multi-dimensional array, and the
228+
second element the corresponding labels as integers.
229+
230+
The images are returned in the native horizontal-major memory
231+
layout as a single floating point array. If `decimal=true` all
232+
values are scaled to be between `0.0` and `1.0`, otherwise the
233+
values will be between `0.0` and `255.0`. The integer values of
234+
the labels correspond 1-to-1 to the fashion class that they represent.
235+
236+
```julia
237+
train_x, train_y = FashionMNIST.traindata() # full dataset
238+
train_x, train_y = FashionMNIST.traindata(2) # only second observation
239+
train_x, train_y = FashionMNIST.traindata(dir="./FashionMNIST") # custom folder
240+
```
241+
242+
The corresponding source files of the dataset are expected to be
243+
located in the specified directory `dir`. If `dir` is omitted it
244+
will default to `MLDatasets/datasets/fashion_mnist`. In the case the
245+
source files have not been downloaded yet, you can use
246+
`FashionMNIST.download_helper(dir)` to assist in the process.
247+
248+
Take a look at [`traintensor`](@ref) and [`trainlabels`](@ref)
249+
for more information.
250+
"""
251+
function traindata(args...; dir=DEFAULT_DIR, decimal=true)
252+
(traintensor(args...; dir=dir, decimal=decimal),
253+
trainlabels(args...; dir=dir))
254+
end
255+
256+
"""
257+
testdata([indices]; [dir], [decimal=true]) -> Tuple
258+
259+
Returns the FashionMNIST **testset** corresponding to the given
260+
`indices` as a two-element tuple. If `indices` is omitted the
261+
full testset is returned. The first element of the return value
262+
will be the images as a multi-dimensional array, and the second
263+
element the corresponding labels as integers.
264+
265+
The images are returned in the native horizontal-major memory
266+
layout as a single floating point array. If `decimal=true` all
267+
values are scaled to be between `0.0` and `1.0`, otherwise the
268+
values will be between `0.0` and `255.0`. The integer values of
269+
the labels correspond 1-to-1 to the fashion class that they represent.
270+
271+
```julia
272+
test_x, test_y = FashionMNIST.testdata() # full dataset
273+
test_x, test_y = FashionMNIST.testdata(2) # only second observation
274+
test_x, test_y = FashionMNIST.testdata(dir="./FashionMNIST") # custom folder
275+
```
276+
277+
The corresponding source files of the dataset are expected to be
278+
located in the specified directory `dir`. If `dir` is omitted it
279+
will default to `MLDatasets/datasets/fashion_mnist`. In the case the
280+
source files have not been downloaded yet, you can use
281+
`FashionMNIST.download_helper(dir)` to assist in the process.
282+
283+
Take a look at [`testtensor`](@ref) and [`testlabels`](@ref)
284+
for more information.
285+
"""
286+
function testdata(args...; dir=DEFAULT_DIR, decimal=true)
287+
(testtensor(args...; dir=dir, decimal=decimal),
288+
testlabels(args...; dir=dir))
289+
end

src/FashionMNIST/reader.jl

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import ..MNIST.Reader
2+
3+
Reader.set_msg_prompt("""
4+
Dataset: THE FashionMNIST DATABASE of fashion products
5+
Authors: Han Xiao, Kashif Rasul, Roland Vollgraf
6+
Website: https://github.com/zalandoresearch/fashion-mnist
7+
8+
Paper: Han Xiao, Kashif Rasul, Roland Vollgraf "Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms."
9+
10+
The files are available for download at the official website linked above.
11+
""")
12+
13+
Reader.set_baseurl("http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/")

src/MLDatasets.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
module MLDatasets
22

3+
include("io/download.jl")
34
include("io/CoNLL.jl")
45

56
include("CIFAR10.jl")
67
include("CIFAR100.jl")
78
include("MNIST/MNIST.jl")
9+
include("FashionMNIST/FashionMNIST.jl")
810
include("PTBLM.jl")
911
include("UD_English.jl")
1012

0 commit comments

Comments
 (0)