Skip to content

Commit a1e524a

Browse files
Include EMNIST (#55)
1 parent f6ac2c2 commit a1e524a

File tree

5 files changed

+389
-0
lines changed

5 files changed

+389
-0
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,23 @@ Dataset | Classes | `traintensor` | `trainlabels` | `testtensor` | `testlabels`
7575

7676
(*) Note that the SVHN-2 dataset provides an additional 531131 observations aside from the training- and testset
7777

78+
[**EMNIST**](https://www.nist.gov/itl/products-and-services/emnist-dataset) packages 6 different extensions of the MNIST dataset involving letters and digits, with a variety of train/test split options. Each extension has the standard train/test data and labels nested under it, as shown below.
79+
80+
```julia
81+
traindata = EMNIST.Balanced.traindata()
82+
testdata = EMNIST.Balanced.testdata()
83+
trainlabels = EMNIST.Balanced.trainlabels()
84+
testlabels = EMNIST.Balanced.testlabels()
85+
```
86+
87+
Dataset | Classes | `traindata` | `trainlabels` | `testdata` | `testlabels` | `balanced classes`
88+
:------:|:-------:|:-------------:|:-------------:|:------------:|:------------:|:------------:
89+
**ByClass** | 62 | 697932x28x28 | 697932x1 | 116323x28x28 | 116323x1 | no
90+
**ByMerge** | 47 | 697932x28x28 | 697932x1 | 116323x28x28 | 116323x1 | no
91+
**Balanced** | 47 | 112800x28x28 | 112800x1 | 18800x28x28 | 18800x1 | yes
92+
**Letters** | 26 | 124800x28x28 | 124800x1 | 20800x28x28 | 20800x1 | yes
93+
**Digits** | 10 | 240000x28x28 | 240000x1 | 40000x28x28 | 40000x1 | yes
94+
**MNIST** | 10 | 60000x28x28 | 60000x1 | 10000x28x28 | 10000x1 | yes
7895

7996
### Misc. Datasets
8097

src/EMNIST/EMNIST.jl

Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
export EMNIST
2+
module EMNIST
3+
using DataDeps
4+
using BinDeps
5+
using FixedPointNumbers
6+
using MAT: matopen, matread
7+
using ..MLDatasets: bytes_to_type, datafile, download_dep, download_docstring
8+
using ..MNIST: convert2image
9+
10+
# Name under which the dataset is registered with DataDeps in `__init__`
# and which `download` forwards to `download_dep`.
const DEPNAME = "EMNIST"
11+
12+
"""
13+
download([dir]; [i_accept_the_terms_of_use])
14+
15+
Trigger the (interactive) download of the full dataset into
16+
"`dir`". If no `dir` is provided the dataset will be
17+
downloaded into "~/.julia/datadeps/$DEPNAME".
18+
19+
This function will display an interactive dialog unless
20+
either the keyword parameter `i_accept_the_terms_of_use` or
21+
the environment variable `DATADEPS_ALWAY_ACCEPT` is set to
22+
`true`. Note that using the data responsibly and respecting
23+
copyright/terms-of-use remains your responsibility.
24+
"""
25+
function download(args...; kw...)
    # Thin wrapper: forward every positional and keyword argument to the
    # shared `download_dep` helper, with the dependency name fixed to DEPNAME.
    return download_dep(DEPNAME, args...; kw...)
end
26+
27+
# Module initializer: registers the EMNIST archive with DataDeps under
# DEPNAME. The arguments passed to `DataDep` are, in order: the name, the
# message text, the download URL, the expected hash, and a post-fetch hook.
function __init__()
28+
register(DataDep(
29+
DEPNAME,
30+
"""
31+
Dataset: The EMNIST Dataset
32+
Authors: Gregory Cohen, Saeed Afshar, Jonathan Tapson, and Andre van Schaik
33+
Website: https://www.nist.gov/itl/products-and-services/emnist-dataset
34+
35+
[Cohen et al., 2017]
36+
Cohen, G., Afshar, S., Tapson, J., & van Schaik, A. (2017).
37+
EMNIST: an extension of MNIST to handwritten letters.
38+
Retrieved from http://arxiv.org/abs/1702.05373
39+
40+
The EMNIST dataset is a set of handwritten character digits derived from the
41+
NIST Special Database 19 (https://www.nist.gov/srd/nist-special-database-19)
42+
and converted to a 28x28 pixel image format and dataset structure that directly
43+
matches the MNIST dataset (http://yann.lecun.com/exdb/mnist/). Further information
44+
on the dataset contents and conversion process can be found in the paper available
45+
at https://arxiv.org/abs/1702.05373v1.
46+
47+
The files are available for download at the official
48+
website linked above. Note that using the data
49+
responsibly and respecting copyright remains your
50+
responsibility. For example the website mentions that
51+
the data is for non-commercial use only. Please read
52+
the website to make sure you want to download the
53+
dataset.
54+
""",
55+
# Remote location of the MATLAB-format archive (a single zip file).
"http://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/matlab.zip",
56+
# Expected hash of the downloaded archive (64 hex digits, presumably
# SHA-256 -- TODO confirm against the DataDeps documentation).
"e1fa805cdeae699a52da0b77c2db17f6feb77eed125f9b45c022e7990444df95",
57+
# After the fetch: unpack the zip into the directory that holds it, then
# delete the archive itself so only the extracted files remain.
post_fetch_method = file -> (run(BinDeps.unpack_cmd(file,dirname(file),".zip","")); rm(file))
58+
))
59+
end
60+
61+
# Accessors for the EMNIST split stored in `matlab/emnist-balanced.mat`.
module Balanced
    using DataDeps
    using BinDeps
    using FixedPointNumbers
    using MAT: matopen, matread
    using ...MLDatasets: bytes_to_type, datafile, download_dep, download_docstring
    using ...MNIST: convert2image

    const DEPNAME = "EMNIST"
    const FILENAME = "matlab/emnist-balanced.mat"

    # Resolve the MAT file through `datafile`, read it in full with `matread`,
    # and pick `field` ("images" or "labels") of `subset` ("train" or "test")
    # out of the top-level "dataset" entry.
    function _entry(subset, field, dir)
        contents = matread(datafile(DEPNAME, FILENAME, dir))
        return contents["dataset"][subset][field]
    end

    # Reinterpret the stored image array as 3-d with trailing dimensions 28x28.
    _asimages(raw) = reshape(raw, :, 28, 28)

    """
        traindata(; dir = nothing)

    Return the `"images"` entry of the `"train"` subset stored in
    `$FILENAME`, reshaped to `(:, 28, 28)`. `dir` is forwarded unchanged
    to `datafile` when locating the file.
    """
    traindata(; dir = nothing) = _asimages(_entry("train", "images", dir))

    """
        testdata(; dir = nothing)

    Return the `"images"` entry of the `"test"` subset stored in
    `$FILENAME`, reshaped to `(:, 28, 28)`. `dir` is forwarded unchanged
    to `datafile` when locating the file.
    """
    testdata(; dir = nothing) = _asimages(_entry("test", "images", dir))

    """
        trainlabels(; dir = nothing)

    Return the `"labels"` entry of the `"train"` subset exactly as it is
    stored in `$FILENAME`. `dir` is forwarded unchanged to `datafile`.
    """
    trainlabels(; dir = nothing) = _entry("train", "labels", dir)

    """
        testlabels(; dir = nothing)

    Return the `"labels"` entry of the `"test"` subset exactly as it is
    stored in `$FILENAME`. `dir` is forwarded unchanged to `datafile`.
    """
    testlabels(; dir = nothing) = _entry("test", "labels", dir)
end
96+
97+
# Accessors for the EMNIST split stored in `matlab/emnist-byclass.mat`.
module ByClass
    using DataDeps
    using BinDeps
    using FixedPointNumbers
    using MAT: matopen, matread
    using ...MLDatasets: bytes_to_type, datafile, download_dep, download_docstring
    using ...MNIST: convert2image

    const DEPNAME = "EMNIST"
    const FILENAME = "matlab/emnist-byclass.mat"

    # Resolve the MAT file through `datafile`, read it in full with `matread`,
    # and pick `field` ("images" or "labels") of `subset` ("train" or "test")
    # out of the top-level "dataset" entry.
    function _entry(subset, field, dir)
        contents = matread(datafile(DEPNAME, FILENAME, dir))
        return contents["dataset"][subset][field]
    end

    # Reinterpret the stored image array as 3-d with trailing dimensions 28x28.
    _asimages(raw) = reshape(raw, :, 28, 28)

    """
        traindata(; dir = nothing)

    Return the `"images"` entry of the `"train"` subset stored in
    `$FILENAME`, reshaped to `(:, 28, 28)`. `dir` is forwarded unchanged
    to `datafile` when locating the file.
    """
    traindata(; dir = nothing) = _asimages(_entry("train", "images", dir))

    """
        testdata(; dir = nothing)

    Return the `"images"` entry of the `"test"` subset stored in
    `$FILENAME`, reshaped to `(:, 28, 28)`. `dir` is forwarded unchanged
    to `datafile` when locating the file.
    """
    testdata(; dir = nothing) = _asimages(_entry("test", "images", dir))

    """
        trainlabels(; dir = nothing)

    Return the `"labels"` entry of the `"train"` subset exactly as it is
    stored in `$FILENAME`. `dir` is forwarded unchanged to `datafile`.
    """
    trainlabels(; dir = nothing) = _entry("train", "labels", dir)

    """
        testlabels(; dir = nothing)

    Return the `"labels"` entry of the `"test"` subset exactly as it is
    stored in `$FILENAME`. `dir` is forwarded unchanged to `datafile`.
    """
    testlabels(; dir = nothing) = _entry("test", "labels", dir)
end
132+
133+
# Accessors for the EMNIST split stored in `matlab/emnist-bymerge.mat`.
module ByMerge
    using DataDeps
    using BinDeps
    using FixedPointNumbers
    using MAT: matopen, matread
    using ...MLDatasets: bytes_to_type, datafile, download_dep, download_docstring
    using ...MNIST: convert2image

    const DEPNAME = "EMNIST"
    const FILENAME = "matlab/emnist-bymerge.mat"

    # Resolve the MAT file through `datafile`, read it in full with `matread`,
    # and pick `field` ("images" or "labels") of `subset` ("train" or "test")
    # out of the top-level "dataset" entry.
    function _entry(subset, field, dir)
        contents = matread(datafile(DEPNAME, FILENAME, dir))
        return contents["dataset"][subset][field]
    end

    # Reinterpret the stored image array as 3-d with trailing dimensions 28x28.
    _asimages(raw) = reshape(raw, :, 28, 28)

    """
        traindata(; dir = nothing)

    Return the `"images"` entry of the `"train"` subset stored in
    `$FILENAME`, reshaped to `(:, 28, 28)`. `dir` is forwarded unchanged
    to `datafile` when locating the file.
    """
    traindata(; dir = nothing) = _asimages(_entry("train", "images", dir))

    """
        testdata(; dir = nothing)

    Return the `"images"` entry of the `"test"` subset stored in
    `$FILENAME`, reshaped to `(:, 28, 28)`. `dir` is forwarded unchanged
    to `datafile` when locating the file.
    """
    testdata(; dir = nothing) = _asimages(_entry("test", "images", dir))

    """
        trainlabels(; dir = nothing)

    Return the `"labels"` entry of the `"train"` subset exactly as it is
    stored in `$FILENAME`. `dir` is forwarded unchanged to `datafile`.
    """
    trainlabels(; dir = nothing) = _entry("train", "labels", dir)

    """
        testlabels(; dir = nothing)

    Return the `"labels"` entry of the `"test"` subset exactly as it is
    stored in `$FILENAME`. `dir` is forwarded unchanged to `datafile`.
    """
    testlabels(; dir = nothing) = _entry("test", "labels", dir)
end
168+
169+
# Accessors for the EMNIST split stored in `matlab/emnist-digits.mat`.
module Digits
    using DataDeps
    using BinDeps
    using FixedPointNumbers
    using MAT: matopen, matread
    using ...MLDatasets: bytes_to_type, datafile, download_dep, download_docstring
    using ...MNIST: convert2image

    const DEPNAME = "EMNIST"
    const FILENAME = "matlab/emnist-digits.mat"

    # Resolve the MAT file through `datafile`, read it in full with `matread`,
    # and pick `field` ("images" or "labels") of `subset` ("train" or "test")
    # out of the top-level "dataset" entry.
    function _entry(subset, field, dir)
        contents = matread(datafile(DEPNAME, FILENAME, dir))
        return contents["dataset"][subset][field]
    end

    # Reinterpret the stored image array as 3-d with trailing dimensions 28x28.
    _asimages(raw) = reshape(raw, :, 28, 28)

    """
        traindata(; dir = nothing)

    Return the `"images"` entry of the `"train"` subset stored in
    `$FILENAME`, reshaped to `(:, 28, 28)`. `dir` is forwarded unchanged
    to `datafile` when locating the file.
    """
    traindata(; dir = nothing) = _asimages(_entry("train", "images", dir))

    """
        testdata(; dir = nothing)

    Return the `"images"` entry of the `"test"` subset stored in
    `$FILENAME`, reshaped to `(:, 28, 28)`. `dir` is forwarded unchanged
    to `datafile` when locating the file.
    """
    testdata(; dir = nothing) = _asimages(_entry("test", "images", dir))

    """
        trainlabels(; dir = nothing)

    Return the `"labels"` entry of the `"train"` subset exactly as it is
    stored in `$FILENAME`. `dir` is forwarded unchanged to `datafile`.
    """
    trainlabels(; dir = nothing) = _entry("train", "labels", dir)

    """
        testlabels(; dir = nothing)

    Return the `"labels"` entry of the `"test"` subset exactly as it is
    stored in `$FILENAME`. `dir` is forwarded unchanged to `datafile`.
    """
    testlabels(; dir = nothing) = _entry("test", "labels", dir)
end
204+
205+
# Accessors for the EMNIST split stored in `matlab/emnist-letters.mat`.
module Letters
    using DataDeps
    using BinDeps
    using FixedPointNumbers
    using MAT: matopen, matread
    using ...MLDatasets: bytes_to_type, datafile, download_dep, download_docstring
    using ...MNIST: convert2image

    const DEPNAME = "EMNIST"
    const FILENAME = "matlab/emnist-letters.mat"

    # Resolve the MAT file through `datafile`, read it in full with `matread`,
    # and pick `field` ("images" or "labels") of `subset` ("train" or "test")
    # out of the top-level "dataset" entry.
    function _entry(subset, field, dir)
        contents = matread(datafile(DEPNAME, FILENAME, dir))
        return contents["dataset"][subset][field]
    end

    # Reinterpret the stored image array as 3-d with trailing dimensions 28x28.
    _asimages(raw) = reshape(raw, :, 28, 28)

    """
        traindata(; dir = nothing)

    Return the `"images"` entry of the `"train"` subset stored in
    `$FILENAME`, reshaped to `(:, 28, 28)`. `dir` is forwarded unchanged
    to `datafile` when locating the file.
    """
    traindata(; dir = nothing) = _asimages(_entry("train", "images", dir))

    """
        testdata(; dir = nothing)

    Return the `"images"` entry of the `"test"` subset stored in
    `$FILENAME`, reshaped to `(:, 28, 28)`. `dir` is forwarded unchanged
    to `datafile` when locating the file.
    """
    testdata(; dir = nothing) = _asimages(_entry("test", "images", dir))

    """
        trainlabels(; dir = nothing)

    Return the `"labels"` entry of the `"train"` subset exactly as it is
    stored in `$FILENAME`. `dir` is forwarded unchanged to `datafile`.
    """
    trainlabels(; dir = nothing) = _entry("train", "labels", dir)

    """
        testlabels(; dir = nothing)

    Return the `"labels"` entry of the `"test"` subset exactly as it is
    stored in `$FILENAME`. `dir` is forwarded unchanged to `datafile`.
    """
    testlabels(; dir = nothing) = _entry("test", "labels", dir)
end
240+
241+
# Accessors for the EMNIST split stored in `matlab/emnist-mnist.mat`.
module MNIST
    using DataDeps
    using BinDeps
    using FixedPointNumbers
    using MAT: matopen, matread
    using ...MLDatasets: bytes_to_type, datafile, download_dep, download_docstring
    using ...MNIST: convert2image

    const DEPNAME = "EMNIST"
    const FILENAME = "matlab/emnist-mnist.mat"

    # Resolve the MAT file through `datafile`, read it in full with `matread`,
    # and pick `field` ("images" or "labels") of `subset` ("train" or "test")
    # out of the top-level "dataset" entry.
    function _entry(subset, field, dir)
        contents = matread(datafile(DEPNAME, FILENAME, dir))
        return contents["dataset"][subset][field]
    end

    # Reinterpret the stored image array as 3-d with trailing dimensions 28x28.
    _asimages(raw) = reshape(raw, :, 28, 28)

    """
        traindata(; dir = nothing)

    Return the `"images"` entry of the `"train"` subset stored in
    `$FILENAME`, reshaped to `(:, 28, 28)`. `dir` is forwarded unchanged
    to `datafile` when locating the file.
    """
    traindata(; dir = nothing) = _asimages(_entry("train", "images", dir))

    """
        testdata(; dir = nothing)

    Return the `"images"` entry of the `"test"` subset stored in
    `$FILENAME`, reshaped to `(:, 28, 28)`. `dir` is forwarded unchanged
    to `datafile` when locating the file.
    """
    testdata(; dir = nothing) = _asimages(_entry("test", "images", dir))

    """
        trainlabels(; dir = nothing)

    Return the `"labels"` entry of the `"train"` subset exactly as it is
    stored in `$FILENAME`. `dir` is forwarded unchanged to `datafile`.
    """
    trainlabels(; dir = nothing) = _entry("train", "labels", dir)

    """
        testlabels(; dir = nothing)

    Return the `"labels"` entry of the `"test"` subset exactly as it is
    stored in `$FILENAME`. `dir` is forwarded unchanged to `datafile`.
    """
    testlabels(; dir = nothing) = _entry("test", "labels", dir)
end
276+
end

src/MLDatasets.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ include("FashionMNIST/FashionMNIST.jl")
3535
include("SVHN2/SVHN2.jl")
3636
include("PTBLM/PTBLM.jl")
3737
include("UD_English/UD_English.jl")
38+
include("EMNIST/EMNIST.jl")
3839

3940
function __init__()
4041
# initialize optional dependencies

test/runtests.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ tests = [
1010
"tst_mnist.jl",
1111
"tst_fashion_mnist.jl",
1212
"tst_svhn2.jl",
13+
"tst_emnist.jl",
1314
]
1415

1516
for t in tests

0 commit comments

Comments
 (0)