JuliaML
diff --git a/‎README.md
Lines changed: 12 additions & 9 deletions b/‎README.md
Lines changed: 12 additions & 9 deletions
diff --git a/‎REQUIRE
Lines changed: 2 additions & 0 deletions b/‎REQUIRE
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/CIFAR10.jl
Lines changed: 0 additions & 43 deletions b/‎src/CIFAR10.jl
Lines changed: 0 additions & 43 deletions
diff --git a/‎src/CIFAR10/CIFAR10.jl
Lines changed: 93 additions & 0 deletions b/‎src/CIFAR10/CIFAR10.jl
Lines changed: 93 additions & 0 deletions
diff --git a/‎src/CIFAR10/README.md
Lines changed: 81 additions & 0 deletions b/‎src/CIFAR10/README.md
Lines changed: 81 additions & 0 deletions
diff --git a/‎src/CIFAR10/Reader/Reader.jl
Lines changed: 54 additions & 0 deletions b/‎src/CIFAR10/Reader/Reader.jl
Lines changed: 54 additions & 0 deletions
@@ -2,12 +2,9 @@
 
 [![Build Status](https://travis-ci.org/JuliaML/MLDatasets.jl.svg?branch=master)](https://travis-ci.org/JuliaML/MLDatasets.jl)
 
-`MLDatasets` provides an access to common machine learning
-datasets for [Julia](http://julialang.org/). Currently, julia 0.5
-is supported.
-
-The datasets are automatically downloaded to the specified
-directory. The default directory is `MLDatasets/datasets`.
+`MLDatasets` provides access to common machine learning datasets
+for [Julia](http://julialang.org/). Currently, julia 0.6 is
+supported.
 
 ## Installation
 
@@ -33,15 +30,21 @@ Use `traindata(<directory>)` and `testdata(<directory>)` to change the default d
 #### CIFAR-10
 
 The [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html)
-dataset consists of 60000 32x32 color images in 10 classes.
+dataset consists of 60000 32x32 RGB images in 10 classes.
+
+Take a look at the [sub-module](src/CIFAR10/README.md) for more
+information.
 
 #### CIFAR-100
 
 The [CIFAR-100](https://www.cs.toronto.edu/~kriz/cifar.html)
-dataset consists of 600 32x32 color images in 100 classes. The
+dataset consists of 60000 32x32 color images in 100 classes. The
 100 classes are grouped into 20 superclasses (fine and coarse
 labels).
 
+Take a look at the [sub-module](src/CIFAR100/README.md) for more
+information.
+
 #### MNIST
 
 The [MNIST](http://yann.lecun.com/exdb/mnist/) dataset consists
@@ -101,7 +104,7 @@ testdata = UD_English.devdata()
 | | Type | Train x | Train y | Test x | Test y |
 |:---:|:---:|:---:|:---:|:---:|:---:|
 | **CIFAR-10** | image | 32x32x3x50000 | 50000 | 32x32x3x10000 | 10000 |
-| **CIFAR-100** | image | 32x32x3x500 | 2x500 | 32x32x3x100 | 2x100 |
+| **CIFAR-100** | image | 32x32x3x50000 | 50000 (x2) | 32x32x3x10000 | 10000 (x2) |
 | **MNIST** | image | 28x28x60000 | 60000 | 28x28x10000 | 10000 |
 | **FashionMNIST** | image | 28x28x60000 | 60000 | 28x28x10000 | 10000 |
 | **PTBLM** | text | 42068 | 42068 | 3761 | 3761 |
 
@@ -1,5 +1,7 @@
 julia 0.6
 ImageCore 0.1.2
+FixedPointNumbers 0.3
 ColorTypes 0.4
+DataDeps
 GZip
 BinDeps
@@ -0,0 +1,93 @@
+export CIFAR10  # expose the sub-module under the enclosing module's namespace
+module CIFAR10
+    using DataDeps
+    using BinDeps
+    using ImageCore
+    using ColorTypes
+    using FixedPointNumbers
+    using ..bytes_to_type
+    using ..datafile
+    using ..download_dep
+    using ..download_docstring
+    # Public interface of this sub-module; most of these names are defined in the included files.
+    export
+
+        classnames,
+
+        traintensor,
+        testtensor,
+
+        trainlabels,
+        testlabels,
+
+        traindata,
+        testdata,
+
+        convert2image,
+        convert2features,
+
+        download
+    # Name under which the dataset is registered in __init__ and looked up by download_dep.
+    const DEPNAME = "CIFAR10"
+    const NCHUNKS = 5  # presumably the number of data_batch_<i>.bin files; TODO confirm in interface.jl
+    # Relative path of data batch file number `file_index` inside the unpacked archive folder.
+    filename_for_chunk(file_index::Int) =
+        joinpath("cifar-10-batches-bin", "data_batch_$(file_index).bin")
+    # Relative path of the test batch file inside the same folder.
+    const TESTSET_FILENAME =
+        joinpath("cifar-10-batches-bin", "test_batch.bin")
+    # Class names; presumably ordered to match the numeric labels in the files (TODO confirm).
+    const CLASSES = [
+        "airplane",
+        "automobile",
+        "bird",
+        "cat",
+        "deer",
+        "dog",
+        "frog",
+        "horse",
+        "ship",
+        "truck",
+    ]
+    # Forward every positional and keyword argument to download_dep for this dataset.
+    download(args...; kw...) = download_dep(DEPNAME, args...; kw...)
+    # Pull in the Reader sub-module and the remaining source files of this module.
+    include(joinpath("Reader","Reader.jl"))
+    include("interface.jl")
+    include("utils.jl")
+    # Runs when the module is loaded: registers description, URL, hash and post-fetch step.
+    function __init__()
+        RegisterDataDep(
+            DEPNAME,
+            """
+            Dataset: The CIFAR-10 dataset
+            Authors: Alex Krizhevsky, Vinod Nair, Geoffrey Hinton
+            Website: https://www.cs.toronto.edu/~kriz/cifar.html
+            Reference: https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf
+
+            [Krizhevsky, 2009]
+                Alex Krizhevsky.
+                "Learning Multiple Layers of Features from Tiny Images",
+                Tech Report, 2009.
+
+            The CIFAR-10 dataset is a labeled subsets of the 80
+            million tiny images dataset. It consists of 60000
+            32x32 colour images in 10 classes, with 6000 images
+            per class.
+
+            The compressed archive file that contains the
+            complete dataset is available for download at the
+            offical website linked above; specifically the binary
+            version for C programs. Note that using the data
+            responsibly and respecting copyright remains your
+            responsibility. The authors of CIFAR-10 aren't really
+            explicit about any terms of use, so please read the
+            website to make sure you want to download the
+            dataset.
+            """,  # NOTE(review): the message text has typos (labeled subsets, offical)
+            "https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz",
+            "c4a38c50a1bc5f3a1c5537f2155ab9d68f9f25eb1ed8d9ddda3db29a59bca1dd",
+            post_fetch_method = file -> (run(BinDeps.unpack_cmd(file,dirname(file), ".gz", ".tar")); rm(file))  # unpack next to the archive, then delete the archive
+        )
+    end
+end
@@ -0,0 +1,81 @@
+# CIFAR-10
+
+Description from the [original
+website](https://www.cs.toronto.edu/~kriz/cifar.html):
+
+> The CIFAR-10 and CIFAR-100 are labeled subsets of the
+> [80 million tiny images](http://people.csail.mit.edu/torralba/tinyimages/)
+> dataset. They were collected by Alex Krizhevsky, Vinod Nair,
+> and Geoffrey Hinton.
+>
+> The CIFAR-10 dataset consists of 60000 32x32 colour images in
+> 10 classes, with 6000 images per class. There are 50000
+> training images and 10000 test images.
+
+## Usage
+
+This sub-module provides a programmatic interface to download,
+load, and work with the CIFAR-10 dataset.
+
+```julia
+using MLDatasets
+
+# download dataset
+CIFAR10.download()
+
+# load full training set
+train_x, train_y = CIFAR10.traindata()
+
+# load full test set
+test_x,  test_y  = CIFAR10.testdata()
+```
+
+The provided functions also allow for optional arguments, such as
+the directory `dir` where the dataset is located, or the specific
+observation `indices` that one wants to work with. For more
+information on the interface take a look at the documentation
+(e.g. `?CIFAR10.traindata`).
+
+Function | Description
+---------|-------------
+`download([dir])` | Trigger interactive download of the dataset
+`classnames()` | Return the class names as a vector of strings
+`traintensor([T], [indices]; [dir])` | Load the training images as an array of eltype `T`
+`trainlabels([indices]; [dir])` | Load the labels for the training images
+`testtensor([T], [indices]; [dir])` | Load the test images as an array of eltype `T`
+`testlabels([indices]; [dir])` | Load the labels for the test images
+`traindata([T], [indices]; [dir])` | Load images and labels of the training data
+`testdata([T], [indices]; [dir])` | Load images and labels of the test data
+
+This module also provides utility functions to make working with
+the CIFAR-10 dataset in Julia more convenient.
+
+You can use the function `convert2features` to convert the given
+CIFAR10 tensor to a feature matrix (or feature vector in the case
+of a single image). The purpose of this function is to drop the
+spatial dimensions such that traditional ML algorithms can
+process the dataset.
+
+```julia
+julia> CIFAR10.convert2features(CIFAR10.traintensor()) # full training data
+3072×50000 Array{N0f8,2}:
+[...]
+```
+
+To visualize an image or a prediction we provide the function
+`convert2image` to convert the given CIFAR10 horizontal-major
+tensor (or feature matrix) to a vertical-major `Colorant` array.
+
+```julia
+julia> CIFAR10.convert2image(CIFAR10.traintensor(1)) # first training image
+32×32 Array{RGB{N0f8},2}:
+[...]
+```
+
+## References
+
+- **Authors**: Alex Krizhevsky, Vinod Nair, Geoffrey Hinton
+
+- **Website**: https://www.cs.toronto.edu/~kriz/cifar.html
+
+- **[Krizhevsky, 2009]** Alex Krizhevsky. ["Learning Multiple Layers of Features from Tiny Images"](https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf), Tech Report, 2009.
@@ -0,0 +1,54 @@
+module Reader
+# Reads fixed-size binary records: one label byte followed by NROW*NCOL*NCHAN data bytes.
+export
+
+    readdata!,
+    readdata
+# Record layout; buffers handed to the readers presumably hold NBYTE - 1 bytes (TODO confirm).
+const NROW = 32
+const NCOL = 32
+const NCHAN = 3
+const NBYTE = NROW * NCOL * NCHAN + 1 # "+ 1" for label
+const CHUNK_SIZE = 10_000  # number of records readdata(io) consumes in one call
+# Read one record at the current position of `io`: the label byte first, then fill `buffer`.
+function readnext!(buffer::Array{UInt8}, io::IO)
+    y = Int(read(io, UInt8))
+    read!(io, buffer)
+    buffer, y  # the filled buffer together with the label widened to Int
+end
+# Read the 1-based record `index` into `buffer`, seeking to byte offset (index - 1) * NBYTE.
+function readdata!(buffer::Array{UInt8}, io::IO, index::Integer)
+    seek(io, (index - 1) * NBYTE)  # NOTE(review): index is not validated; index < 1 gives a negative offset
+    readnext!(buffer, io)
+end
+# Allocate a fresh NROW x NCOL x NCHAN buffer and read record `index` of `io` into it.
+function readdata(io::IO, index::Integer)
+    buffer = Array{UInt8}(NROW, NCOL, NCHAN)
+    readdata!(buffer, io, index)
+end
+# Read CHUNK_SIZE consecutive records from the current position of `io`; returns (X, Y).
+function readdata(io::IO)
+    X = Array{UInt8}(NROW, NCOL, NCHAN, CHUNK_SIZE)  # record data stacked along the 4th dimension
+    Y = Array{Int}(CHUNK_SIZE)  # one label per record
+    buffer = Array{UInt8}(NROW, NCOL, NCHAN)  # scratch space reused for every record
+    @inbounds for index in 1:CHUNK_SIZE  # X and Y are both sized by CHUNK_SIZE
+        _, ty = readnext!(buffer, io)
+        copy!(view(X,:,:,:,index), buffer)
+        Y[index] = ty
+    end
+    X, Y
+end
+# Convenience method: open `file` read-only and read the single record `index` from it.
+function readdata(file::AbstractString, index::Integer)
+    open(file, "r") do io
+        readdata(io, index)
+    end::Tuple{Array{UInt8,3},Int}  # asserts the concrete result type of the do-block
+end
+# Convenience method: open `file` read-only and read a whole chunk of CHUNK_SIZE records.
+function readdata(file::AbstractString)
+    open(file, "r") do io
+        readdata(io)
+    end::Tuple{Array{UInt8,4},Vector{Int}}  # asserts the concrete result type of the do-block
+end
+
+end