outline SVHN format 2 dataset

Evizero · Evizero · commit 842e0abfcfb2 · 2018-02-26T17:32:10.000+01:00
diff --git a/README.md b/README.md
@@ -70,6 +70,9 @@ Dataset | Classes | `traintensor` | `trainlabels` | `testtensor` | `testlabels`
 [**FashionMNIST**](https://juliaml.github.io/MLDatasets.jl/latest/datasets/FashionMNIST/) | 10 | 28x28x60000 | 60000 | 28x28x10000 | 10000
 [**CIFAR-10**](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR10/) | 10 | 32x32x3x50000 | 50000 | 32x32x3x10000 | 10000
 [**CIFAR-100**](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR100/) | 100 (20) | 32x32x3x50000 | 50000 (x2) | 32x32x3x10000 | 10000 (x2)
+[**SVHN-2**](https://juliaml.github.io/MLDatasets.jl/latest/datasets/SVHN2/)(*) | 10 | 32x32x3x73257 | 73257 | 32x32x3x26032 | 26032
+
+(*) Note that, in addition to the training set and the test set, the SVHN-2 dataset provides 531131 extra observations
 
 ### Language Modeling
 
diff --git a/REQUIRE b/REQUIRE
@@ -5,3 +5,4 @@ ColorTypes 0.4
 DataDeps
 GZip
 BinDeps
+MAT
diff --git a/src/MLDatasets.jl b/src/MLDatasets.jl
@@ -16,6 +16,7 @@ include("CIFAR10/CIFAR10.jl")
 include("CIFAR100/CIFAR100.jl")
 include("MNIST/MNIST.jl")
 include("FashionMNIST/FashionMNIST.jl")
+include("SVHN2/SVHN2.jl")
 include("PTBLM/PTBLM.jl")
 include("UD_English/UD_English.jl")
 
diff --git a/src/SVHN2/SVHN2.jl b/src/SVHN2/SVHN2.jl
@@ -0,0 +1,92 @@
+export SVHN2
+
+"""
+The Street View House Numbers (SVHN) Dataset
+
+Authors: Yuval Netzer, Tao Wang, Adam Coates, Alessandro Bissacco, Bo Wu, Andrew Y. Ng
+Website: http://ufldl.stanford.edu/housenumbers
+
+SVHN was obtained from house numbers in Google Street View
+images. As such they are quite diverse in terms of orientation
+and image background. Similar to MNIST, SVHN has 10 classes (the
+digits 0-9), but unlike MNIST there is more data and the images
+are a little bigger (32x32 instead of 28x28) with an additional
+RGB color channel. The dataset is split up into three subsets:
+73257 digits for training, 26032 digits for testing, and 531131
+additional digits to use as extra training data.
+
+## Interface
+
+- [SVHN2.traindata](@ref)
+- [SVHN2.testdata](@ref)
+- [SVHN2.extradata](@ref)
+
+## Utilities
+
+- [SVHN2.convert2features](@ref)
+- [SVHN2.convert2image](@ref)
+"""
+module SVHN2
+    # External packages; MAT provides `matread`, which the interface
+    # code uses to read the `.mat` files listed below.
+    using DataDeps
+    using MAT
+    using ImageCore
+    using ColorTypes
+    using FixedPointNumbers
+    # Helpers that live in the enclosing (parent) module.
+    using ..bytes_to_type
+    using ..datafile
+    using ..download_dep
+    using ..download_docstring
+
+    export
+
+        traindata,
+        testdata,
+        extradata,
+
+        convert2image,
+        convert2features,
+
+        download
+
+    # Name under which the dataset is registered with DataDeps, and
+    # the file names of the three subsets (appended to the website
+    # URL in `__init__`).
+    const DEPNAME = "SVHN2"
+    const TRAINDATA = "train_32x32.mat"
+    const TESTDATA  = "test_32x32.mat"
+    const EXTRADATA = "extra_32x32.mat"
+    # The ten digits; returned as-is by `classnames()`.
+    # NOTE(review): the ordering looks like it is meant to be indexed
+    # by label (labels 1-9 are their digit, label 10 is digit 0, as
+    # the interface docstrings describe) -- confirm.
+    const CLASSES = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
+
+    # Forward all arguments to `download_dep` for this module's DataDep.
+    download(args...; kw...) = download_dep(DEPNAME, args...; kw...)
+
+    include("interface.jl")
+    include("utils.jl")
+
+    # Register the DataDep (name, descriptive message, remote paths)
+    # when the module is initialised.
+    function __init__()
+        RegisterDataDep(
+            DEPNAME,
+            """
+            Dataset: The Street View House Numbers (SVHN) Dataset
+            Authors: Yuval Netzer, Tao Wang, Adam Coates, Alessandro Bissacco, Bo Wu, Andrew Y. Ng
+            Website: http://ufldl.stanford.edu/housenumbers
+            Format: Cropped Digits (Format 2 on the website)
+            Note: for non-commercial use only
+
+            [Netzer et al., 2011]
+                Yuval Netzer, Tao Wang, Adam Coates, Alessandro Bissacco, Bo Wu, Andrew Y. Ng
+                "Reading Digits in Natural Images with Unsupervised Feature Learning"
+                NIPS Workshop on Deep Learning and Unsupervised Feature Learning 2011
+
+            The dataset is split up into three subsets: 73257
+            digits for training, 26032 digits for testing, and
+            531131 additional to use as extra training data.
+
+            The files are available for download at the official
+            website linked above. Note that using the data
+            responsibly and respecting copyright remains your
+            responsibility. For example the website mentions that
+            the data is for non-commercial use only. Please read
+            the website to make sure you want to download the
+            dataset.
+            """,
+            # Broadcast the base URL over the file names: one URL per file.
+            "http://ufldl.stanford.edu/housenumbers/" .* [TRAINDATA, TESTDATA, EXTRADATA],
+        )
+    end
+end
diff --git a/src/SVHN2/interface.jl b/src/SVHN2/interface.jl
@@ -0,0 +1,150 @@
+"""
+    classnames() -> Vector{Int}
+
+Return the 10 digits for the SVHN classes as a vector of integers.
+"""
+function classnames()
+    # Hand out the module-level constant itself (no copy is made).
+    return CLASSES
+end
+
+"""
+    traindata([T = N0f8], [indices]; [dir]) -> images, labels
+
+Returns the SVHN **trainset** corresponding to the given
+`indices` as a two-element tuple. If `indices` is omitted the
+full trainset is returned. The first element of the return values
+will be the images as a multi-dimensional array, and the second
+element the corresponding labels as integers.
+
+The image(s) is/are returned in the native vertical-major memory
+layout as a single numeric array of eltype `T`. If `T <:
+Integer`, then all values will be within `0` and `255`, otherwise
+the values are scaled to be between `0` and `1`. You can use the
+utility function [`convert2image`](@ref) to convert an SVHN array
+into a Julia image with the appropriate `RGB` eltype. The integer
+values of the labels correspond 1-to-1 to the digit that they
+represent with the exception of 0 which is encoded as `10`.
+
+Note that because of the nature of how the dataset is stored on
+disk, `SVHN2.traindata` will always load the full trainset,
+regardless of which observations are requested. In the case
+`indices` are provided by the user, it will simply result in a
+sub-setting. This option is just provided for convenience.
+
+```julia
+train_x, train_y = SVHN2.traindata() # full dataset
+train_x, train_y = SVHN2.traindata(2) # only second observation
+train_x, train_y = SVHN2.traindata(dir="./SVHN") # custom folder
+```
+
+$(download_docstring("SVHN", DEPNAME))
+"""
+function traindata(args...; dir = nothing)
+    # No element type given: fall back to `N0f8`.
+    return traindata(N0f8, args...; dir = dir)
+end
+
+function traindata(::Type{T}; dir = nothing) where T
+    # Read the whole training file; "X" holds the images, "y" the labels.
+    contents = matread(datafile(DEPNAME, TRAINDATA, dir))
+    raw_images, raw_labels = contents["X"], contents["y"]
+    # Flatten the labels and convert them to a plain `Vector{Int}`.
+    return bytes_to_type(T, raw_images), Vector{Int}(vec(raw_labels))
+end
+
+function traindata(::Type{T}, indices; dir = nothing) where T
+    # The full set is always loaded; `indices` only selects from it
+    # along the last (observation) dimension.
+    all_images, all_labels = traindata(T, dir = dir)
+    return all_images[:, :, :, indices], all_labels[indices]
+end
+
+"""
+    testdata([T = N0f8], [indices]; [dir]) -> images, labels
+
+Returns the SVHN **testset** corresponding to the given
+`indices` as a two-element tuple. If `indices` is omitted the
+full testset is returned. The first element of the return
+values will be the images as a multi-dimensional array, and the
+second element the corresponding labels as integers.
+
+The image(s) is/are returned in the native vertical-major memory
+layout as a single numeric array of eltype `T`. If `T <:
+Integer`, then all values will be within `0` and `255`, otherwise
+the values are scaled to be between `0` and `1`. You can use the
+utility function [`convert2image`](@ref) to convert an SVHN array
+into a Julia image with the appropriate `RGB` eltype. The integer
+values of the labels correspond 1-to-1 to the digit that they
+represent with the exception of 0 which is encoded as `10`.
+
+Note that because of the nature of how the dataset is stored on
+disk, `SVHN2.testdata` will always load the full testset,
+regardless of which observations are requested. In the case
+`indices` are provided by the user, it will simply result in a
+sub-setting. This option is just provided for convenience.
+
+```julia
+test_x, test_y = SVHN2.testdata() # full dataset
+test_x, test_y = SVHN2.testdata(2) # only second observation
+test_x, test_y = SVHN2.testdata(dir="./SVHN") # custom folder
+```
+
+$(download_docstring("SVHN", DEPNAME))
+"""
+function testdata(args...; dir = nothing)
+    # No element type given: fall back to `N0f8`.
+    return testdata(N0f8, args...; dir = dir)
+end
+
+function testdata(::Type{T}; dir = nothing) where T
+    # Read the whole test file; "X" holds the images, "y" the labels.
+    contents = matread(datafile(DEPNAME, TESTDATA, dir))
+    raw_images, raw_labels = contents["X"], contents["y"]
+    # Flatten the labels and convert them to a plain `Vector{Int}`.
+    return bytes_to_type(T, raw_images), Vector{Int}(vec(raw_labels))
+end
+
+function testdata(::Type{T}, indices; dir = nothing) where T
+    # The full set is always loaded; `indices` only selects from it
+    # along the last (observation) dimension.
+    all_images, all_labels = testdata(T, dir = dir)
+    return all_images[:, :, :, indices], all_labels[indices]
+end
+
+"""
+    extradata([T = N0f8], [indices]; [dir]) -> images, labels
+
+Returns the SVHN **extra trainset** corresponding to the given
+`indices` as a two-element tuple. If `indices` is omitted the
+full dataset is returned. The first element of the return values
+will be the images as a multi-dimensional array, and the second
+element the corresponding labels as integers.
+
+The image(s) is/are returned in the native vertical-major memory
+layout as a single numeric array of eltype `T`. If `T <:
+Integer`, then all values will be within `0` and `255`, otherwise
+the values are scaled to be between `0` and `1`. You can use the
+utility function [`convert2image`](@ref) to convert an SVHN array
+into a Julia image with the appropriate `RGB` eltype. The integer
+values of the labels correspond 1-to-1 to the digit that they
+represent with the exception of 0 which is encoded as `10`.
+
+Note that because of the nature of how the dataset is stored on
+disk, `SVHN2.extradata` will always load the full extra trainset,
+regardless of which observations are requested. In the case
+`indices` are provided by the user, it will simply result in a
+sub-setting. This option is just provided for convenience.
+
+```julia
+extra_x, extra_y = SVHN2.extradata() # full dataset
+extra_x, extra_y = SVHN2.extradata(2) # only second observation
+extra_x, extra_y = SVHN2.extradata(dir="./SVHN") # custom folder
+```
+
+$(download_docstring("SVHN", DEPNAME))
+"""
+function extradata(args...; dir = nothing)
+    # No element type given: fall back to `N0f8`.
+    return extradata(N0f8, args...; dir = dir)
+end
+
+function extradata(::Type{T}; dir = nothing) where T
+    # Read the whole extra file; "X" holds the images, "y" the labels.
+    contents = matread(datafile(DEPNAME, EXTRADATA, dir))
+    raw_images, raw_labels = contents["X"], contents["y"]
+    # Flatten the labels and convert them to a plain `Vector{Int}`.
+    return bytes_to_type(T, raw_images), Vector{Int}(vec(raw_labels))
+end
+
+function extradata(::Type{T}, indices; dir = nothing) where T
+    # The full set is always loaded; `indices` only selects from it
+    # along the last (observation) dimension.
+    all_images, all_labels = extradata(T, dir = dir)
+    return all_images[:, :, :, indices], all_labels[indices]
+end
diff --git a/src/SVHN2/utils.jl b/src/SVHN2/utils.jl
@@ -0,0 +1,81 @@
+"""
+    convert2features(array)
+
+Convert the given SVHN tensor to a feature matrix (or feature
+vector in the case of a single image). The purpose of this
+function is to drop the spatial dimensions such that traditional
+ML algorithms can process the dataset.
+
+```julia
+julia> SVHN2.convert2features(SVHN2.traindata(Float32)[1]) # full training data
+3072×73257 Array{Float32,2}:
+[...]
+
+julia> SVHN2.convert2features(SVHN2.traindata(Float32)[1][:,:,:,1]) # first observation
+3072-element Array{Float32,1}:
+[...]
+```
+"""
+function convert2features(array::AbstractArray{<:Number,3})
+    # Single observation: the third dimension must be the RGB channel.
+    @assert size(array, 3) == 3 "the given array should have the RGB channel in the third dimension"
+    return vec(array)
+end
+
+function convert2features(array::AbstractArray{<:Number,4})
+    # Batch of observations: collapse the first three dimensions into
+    # one feature dimension and keep one column per observation.
+    @assert size(array, 3) == 3 "the given array should have the RGB channel in the third dimension"
+    nfeatures = size(array, 1) * size(array, 2) * size(array, 3)
+    return reshape(array, (nfeatures, size(array, 4)))
+end
+
+# `channelview` puts the color channel in the first dimension
+# (3×H×W), whereas the numeric methods expect it in the third one
+# (H×W×3). The permutation therefore has to be (2,3,1), i.e. the
+# inverse of the (3,1,2) used by `convert2image`; using (3,1,2) here
+# would yield W×3×H and trip the channel assertion.
+convert2features(array::AbstractArray{<:RGB,2}) =
+    convert2features(permutedims(channelview(array), (2,3,1)))
+
+# Batch of images: 3×H×W×N -> H×W×3×N, observation dimension stays last.
+convert2features(array::AbstractArray{<:RGB,3}) =
+    convert2features(permutedims(channelview(array), (2,3,1,4)))
+
+"""
+    convert2image(array) -> Array{RGB}
+
+Convert the given SVHN tensor (or feature vector/matrix) to a
+`RGB` array.
+
+```julia
+julia> SVHN2.convert2image(SVHN2.traindata()[1]) # full training dataset
+32×32×73257 Array{RGB{N0f8},3}:
+[...]
+
+julia> SVHN2.convert2image(SVHN2.traindata()[1][:,:,:,1]) # first training image
+32×32 Array{RGB{N0f8},2}:
+[...]
+```
+"""
+function convert2image(array::AbstractVector{<:Number})
+    # 3072 = 32 * 32 * 3 values per image; only whole images are accepted.
+    @assert length(array) % 3072 == 0
+    # Exactly one image's worth of values: treat it as a single image.
+    length(array) == 3072 && return convert2image(reshape(array, 32, 32, 3))
+    # Any other multiple is treated as a batch of images.
+    nimages = div(length(array), 3072)
+    return convert2image(reshape(array, 32, 32, 3, nimages))
+end
+
+function convert2image(array::AbstractMatrix{<:Number})
+    # Feature matrix: one column of 3072 values per image.
+    @assert size(array, 1) == 3072
+    nimages = size(array, 2)
+    return convert2image(reshape(array, 32, 32, 3, nimages))
+end
+
+function convert2image(array::AbstractArray{<:Number,3})
+    @assert size(array, 3) == 3 "the given array should have the RGB channel in the third dimension"
+    # Move the channel dimension to the front before building the RGB view.
+    channelfirst = permutedims(_norm_array(array), (3,1,2))
+    return colorview(RGB, channelfirst)
+end
+
+function convert2image(array::AbstractArray{<:Number,4})
+    @assert size(array, 3) == 3 "the given array should have the RGB channel in the third dimension"
+    # Same as the single-image method; the observation dimension stays last.
+    channelfirst = permutedims(_norm_array(array), (3,1,2,4))
+    return colorview(RGB, channelfirst)
+end
+
+# Non-integer arrays are passed through untouched.
+_norm_array(array::AbstractArray) = array
+
+# Integer arrays are converted to `UInt8` storage and then
+# reinterpreted as `N0f8`.
+function _norm_array(array::AbstractArray{<:Integer})
+    bytes = convert(Array{UInt8}, array)
+    return reinterpret(N0f8, bytes)
+end

-Original file line number
+Diff line change
 DataDeps
 GZip
 BinDeps
 +MAT