JuliaML
diff --git a/‎.gitignore
Lines changed: 1 addition & 1 deletion b/‎.gitignore
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md
Lines changed: 10 additions & 5 deletions b/‎README.md
Lines changed: 10 additions & 5 deletions
diff --git a/‎REQUIRE
Lines changed: 3 additions & 1 deletion b/‎REQUIRE
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/MLDatasets.jl
Lines changed: 1 addition & 1 deletion b/‎src/MLDatasets.jl
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/MNIST.jl
Lines changed: 0 additions & 39 deletions b/‎src/MNIST.jl
Lines changed: 0 additions & 39 deletions
diff --git a/‎src/MNIST/MNIST.jl
Lines changed: 30 additions & 0 deletions b/‎src/MNIST/MNIST.jl
Lines changed: 30 additions & 0 deletions
diff --git a/‎src/MNIST/README.md
Lines changed: 83 additions & 0 deletions b/‎src/MNIST/README.md
Lines changed: 83 additions & 0 deletions
diff --git a/‎src/MNIST/Reader/Reader.jl
Lines changed: 30 additions & 0 deletions b/‎src/MNIST/Reader/Reader.jl
Lines changed: 30 additions & 0 deletions
diff --git a/‎src/MNIST/Reader/download.jl
Lines changed: 82 additions & 0 deletions b/‎src/MNIST/Reader/download.jl
Lines changed: 82 additions & 0 deletions
diff --git a/‎src/MNIST/Reader/readheader.jl
Lines changed: 56 additions & 0 deletions b/‎src/MNIST/Reader/readheader.jl
Lines changed: 56 additions & 0 deletions
@@ -1,2 +1,2 @@
-/datasets
+datasets/
 sandbox.jl
@@ -1,10 +1,10 @@
 # MLDatasets.jl
 [![Build Status](https://travis-ci.org/JuliaML/MLDatasets.jl.svg?branch=master)](https://travis-ci.org/JuliaML/MLDatasets.jl)
 
-`MLDatasets` provides an access to common machine learning datasets for [Julia](http://julialang.org/).  
+`MLDatasets` provides access to common machine learning datasets for [Julia](http://julialang.org/).
 Currently, julia 0.5 is supported.
 
-The datasets are automatically downloaded to the specified directory.  
+The datasets are automatically downloaded to the specified directory.
 The default directory is `MLDatasets/datasets`.
 
 ## Installation
@@ -27,15 +27,20 @@ Use `traindata(<directory>)` and `testdata(<directory>)` to change the default d
 The [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset consists of 60000 32x32 color images in 10 classes.
 
 #### CIFAR-100
-The [CIFAR-100](https://www.cs.toronto.edu/~kriz/cifar.html) dataset consists of 600 32x32 color images in 100 classes.  
+The [CIFAR-100](https://www.cs.toronto.edu/~kriz/cifar.html) dataset consists of 60000 32x32 color images in 100 classes, with 600 images per class.
 The 100 classes are grouped into 20 superclasses (fine and coarse labels).
 
 #### MNIST
-The [MNIST](http://yann.lecun.com/exdb/mnist/) dataset consists of 60000 28x28 images of handwritten digits.
+
+The [MNIST](http://yann.lecun.com/exdb/mnist/) dataset consists
+of 60000 28x28 images of handwritten digits.
+
+Take a look at the [sub-module](src/MNIST/README.md) for more
+information.
 
 ### Language Modeling
 #### PTBLM
-The `PTBLM` dataset consists of Penn Treebank sentences for language modeling, available from [tomsercu/lstm](https://github.com/tomsercu/lstm).  
+The `PTBLM` dataset consists of Penn Treebank sentences for language modeling, available from [tomsercu/lstm](https://github.com/tomsercu/lstm).
 The unknown words are replaced with `<unk>` so that the total vocabulary size becomes 10000.
 
 This is the first sentence of the PTBLM dataset.
 
@@ -1,3 +1,5 @@
-julia 0.5-
+julia 0.5
+ImageCore 0.1.2
+ColorTypes 0.4
 GZip
 BinDeps
@@ -4,7 +4,7 @@ include("io/CoNLL.jl")
 
 include("CIFAR10.jl")
 include("CIFAR100.jl")
-include("MNIST.jl")
+include("MNIST/MNIST.jl")
 include("PTBLM.jl")
 include("UD_English.jl")
 
 
@@ -0,0 +1,30 @@
+export MNIST
+
+# Sub-module providing access to the MNIST database of handwritten digits.
+module MNIST
+    using ImageCore
+    using ColorTypes
+
+    export traintensor, testtensor,
+           trainlabels, testlabels,
+           traindata, testdata,
+           convert2image, convert2features,
+           download_helper
+
+    # Default directory for the MNIST files; it is what `download_helper`
+    # falls back to when it is called without an explicit directory.
+    const DEFAULT_DIR = abspath(joinpath(dirname(@__FILE__), "..", "..", "datasets", "mnist"))
+
+    include("Reader/Reader.jl")
+    import .Reader.download_helper
+    include("interface.jl")
+    include("utils.jl")
+
+    # Convenience method: forward all keyword arguments to the
+    # directory-based method, using `DEFAULT_DIR` as the directory.
+    function Reader.download_helper(; kwargs...)
+        Reader.download_helper(DEFAULT_DIR; kwargs...)
+    end
+end
@@ -0,0 +1,83 @@
+# THE MNIST DATABASE of handwritten digits
+
+Description from the [official website](http://yann.lecun.com/exdb/mnist/):
+
+> The MNIST database of handwritten digits, available from this
+> page, has a training set of 60,000 examples, and a test set of
+> 10,000 examples. It is a subset of a larger set available from
+> NIST. The digits have been size-normalized and centered in a
+> fixed-size image.
+>
+> It is a good database for people who want to try learning
+> techniques and pattern recognition methods on real-world data
+> while spending minimal efforts on preprocessing and formatting.
+
+## Usage
+
+This sub-module provides a programmatic interface to download,
+load, and work with the MNIST dataset of handwritten digits.
+
+```julia
+using MLDatasets
+
+# download dataset
+MNIST.download_helper()
+
+# load full training set
+train_x, train_y = MNIST.traindata()
+
+# load full test set
+test_x,  test_y  = MNIST.testdata()
+```
+
+The provided functions also allow for optional arguments, such as
+the directory `dir` where the dataset is located, or the specific
+observation `indices` that one wants to work with. For more
+information on the interface take a look at the documentation
+(e.g. `?MNIST.traindata`).
+
+Function | Description
+---------|-------------
+`download_helper([dir])` | Trigger interactive download of the dataset
+`traintensor([indices]; [dir], [decimal=true])` | Load the training images as an array
+`trainlabels([indices]; [dir])` | Load the labels for the training images
+`testtensor([indices]; [dir], [decimal=true])` | Load the test images as an array
+`testlabels([indices]; [dir])` | Load the labels for the test images
+`traindata([indices]; [dir], [decimal=true])` | Load images and labels of the training data
+`testdata([indices]; [dir], [decimal=true])` | Load images and labels of the test data
+
+This module also provides utility functions to make working with
+the MNIST dataset in Julia more convenient.
+
+You can use the function `convert2features` to convert the given
+MNIST tensor to a feature matrix (or feature vector in the case
+of a single image). The purpose of this function is to drop the
+spatial dimensions such that traditional ML algorithms can
+process the dataset.
+
+```julia
+julia> MNIST.convert2features(MNIST.traintensor()) # full training data
+784×60000 Array{Float64,2}:
+[...]
+```
+
+To visualize an image or a prediction we provide the function
+`convert2image` to convert the given MNIST horizontal-major
+tensor (or feature matrix) to a vertical-major `Colorant` array.
+The values are also color corrected according to the website's
+description, which means that the digits are black on a white
+background.
+
+```julia
+julia> MNIST.convert2image(MNIST.traintensor(1)) # first training image
+28×28 Array{Gray{Float64},2}:
+[...]
+```
+
+## References
+
+- **Authors**: Yann LeCun, Corinna Cortes, Christopher J.C. Burges
+
+- **Website**: http://yann.lecun.com/exdb/mnist/
+
+- **[LeCun et al., 1998a]** Y. LeCun, L. Bottou, Y. Bengio, and P. Haffner. "Gradient-based learning applied to document recognition." Proceedings of the IEEE, 86(11):2278-2324, November 1998
@@ -0,0 +1,30 @@
+# Low-level access to the MNIST files: header parsing, reading of
+# images and labels, and the (interactive) download helper.
+module Reader
+    using GZip
+    using BinDeps
+
+    export
+
+        readtrainimages,
+        readtestimages,
+        readtrainlabels,
+        readtestlabels,
+
+        download_helper
+
+    # Constants
+
+    # These values equal the sizes in bytes of the headers described in
+    # "readheader.jl": four 32 bit integers for image files and two
+    # 32 bit integers for label files.
+    const IMAGEOFFSET = 16
+    const LABELOFFSET = 8
+
+    # Names of the four files that make up the full data set
+    const TRAINIMAGES = "train-images-idx3-ubyte.gz"
+    const TRAINLABELS = "train-labels-idx1-ubyte.gz"
+    const TESTIMAGES  = "t10k-images-idx3-ubyte.gz"
+    const TESTLABELS  = "t10k-labels-idx1-ubyte.gz"
+
+    # Includes
+
+    include("readheader.jl")
+    include("readimages.jl")
+    include("readlabels.jl")
+    include("download.jl")
+end
@@ -0,0 +1,82 @@
+# Compose the message reported when the MNIST file `filename` is not
+# present in the directory `dir`.
+function msg_notfound(dir, filename)
+    string("The MNIST file \"", filename, "\" was not found in \"", dir, "\". ",
+           "You can download the dataset at http://yann.lecun.com/exdb/mnist/, ",
+           "or alternatively use MNIST.download_helper(directory) to do it for you.")
+end
+
+# Compose the text printed by `download_helper` before it asks for
+# consent to download. `dir` is the target directory and `files` the
+# collection of file names that are missing from it; the names are
+# quoted and joined as "a", "b" and "c".
+msg_prompt(dir, files) = """
+Interactive session detected. MNIST.download_helper initiated.
+
+Dataset: THE MNIST DATABASE of handwritten digits
+Authors: Yann LeCun, Corinna Cortes, Christopher J.C. Burges
+Website: http://yann.lecun.com/exdb/mnist/
+
+[LeCun et al., 1998a]
+    Y. LeCun, L. Bottou, Y. Bengio, and P. Haffner. "Gradient-based learning applied to document recognition." Proceedings of the IEEE, 86(11):2278-2324, November 1998
+
+The specified directory \"$dir\" is missing the files $(join(map(f->"\"$f\"", files), ", ", " and ")) of the full data set.
+
+The files are available for download at the official website linked above.
+We can download these files for you if you wish, but that doesn't free
+you from the burden of using the data responsibly and respecting copyright.
+The authors of MNIST aren't really explicit about any terms of use,
+so please read the website to make sure you want to download the dataset.
+
+    http://yann.lecun.com/exdb/mnist/
+
+Did you visit the website and want to download the dataset to \"$dir\"? [y/n] """
+
+# Return the path of `filename` inside `dir`. If the file does not
+# exist, an interactive session issues a warning and hands over to
+# `download_helper(dir)`, while a non-interactive session raises an
+# error instead.
+function downloaded_file(dir, filename)
+    filepath = joinpath(dir, filename)
+    isfile(filepath) && return filepath
+    notice = msg_notfound(dir, filename)
+    isinteractive() || error(notice)
+    warn(notice)
+    download_helper(dir)
+    filepath
+end
+
+"""
+    download_helper([dir]; [i_accept_the_terms_of_use = false])
+
+Check if the MNIST dataset is contained in the specified `dir`,
+or if any of the four files are missing. If `dir` is omitted it
+will default to `MLDatasets/datasets/mnist`.
+
+If none of the files are missing the function just reports that
+there is nothing to do. In the case that any of the four files
+are missing and `i_accept_the_terms_of_use=false`, the behaviour
+depends on whether julia is run in an interactive session. If an
+interactive session is detected the user will be presented with
+information and the option to download the missing files to the
+specified `dir`; only an answer that starts with `y` counts as
+consent. Without consent (which includes every non-interactive
+session) an error is thrown.
+
+If the download should happen automatically, please first visit
+the website at http://yann.lecun.com/exdb/mnist, before setting
+`i_accept_the_terms_of_use=true`.
+
+The function returns `nothing`.
+"""
+function download_helper(dir; i_accept_the_terms_of_use = false)
+    files = filter(file->!isfile(joinpath(dir, file)),
+                   [TRAINIMAGES, TRAINLABELS, TESTIMAGES, TESTLABELS])
+    if !isempty(files)
+        if !i_accept_the_terms_of_use && isinteractive()
+            print(msg_prompt(dir, files))
+            # `readline` may hand back an empty string (e.g. at the end
+            # of the input stream), on which `first` would throw. Treat
+            # such an answer like any other answer that is not "y".
+            answer = readline()
+            if startswith(answer, "y")
+                i_accept_the_terms_of_use = true
+            end
+        end
+        if i_accept_the_terms_of_use
+            mkpath(dir)
+            # only the files that are actually missing are fetched
+            for file in files
+                url = "http://yann.lecun.com/exdb/mnist/$file"
+                path = joinpath(dir, file)
+                info("downloading $file from $url to $dir")
+                run(download_cmd(url, path))
+            end
+        else
+            error("Unable to download the dataset. Please visit the website at http://yann.lecun.com/exdb/mnist and download the files manually.")
+        end
+    else
+        info("Nothing to do.")
+    end
+    nothing
+end
@@ -0,0 +1,56 @@
+"""
+    readimageheader(io::IO)
+
+Reads four 32 bit integers at the current position of `io` and
+interprets them as a MNIST-image-file header, which is described
+in detail in the table below
+
+            ║     First    │  Second  │  Third  │   Fourth
+    ════════╬══════════════╪══════════╪═════════╪════════════
+    offset  ║         0000 │     0004 │    0008 │       0012
+    descr   ║ magic number │ # images │  # rows │  # columns
+
+The values are expected to be stored in big-endian byte order and
+are converted to the byte order of the host. These four numbers
+are returned as a `Tuple{UInt32,Int,Int,Int}` in the same storage
+order.
+"""
+function readimageheader(io::IO)
+    # `ntoh` only swaps the bytes if the host is not big-endian itself,
+    # whereas an unconditional `bswap` would corrupt the values there.
+    magic_number = ntoh(read(io, UInt32))
+    total_items  = ntoh(read(io, UInt32))
+    nrows = ntoh(read(io, UInt32))
+    ncols = ntoh(read(io, UInt32))
+    magic_number, Int(total_items), Int(nrows), Int(ncols)
+end
+
+"""
+    readimageheader(file::AbstractString)
+
+Opens `file` for reading using `gzopen`, reads its first four
+32 bit values, and returns them interpreted as an
+MNIST-image-file header
+"""
+function readimageheader(file::AbstractString)
+    header = gzopen(file, "r") do io
+        readimageheader(io)
+    end
+    header::Tuple{UInt32,Int,Int,Int}
+end
+
+"""
+    readlabelheader(io::IO)
+
+Reads two 32 bit integers at the current position of `io` and
+interprets them as a MNIST-label-file header, which consists of a
+*magic number* and the *total number of labels* stored in the
+file. The values are expected to be stored in big-endian byte
+order and are converted to the byte order of the host. These two
+numbers are returned as a `Tuple{UInt32,Int}` in the same storage
+order.
+"""
+function readlabelheader(io::IO)
+    # `ntoh` only swaps the bytes if the host is not big-endian itself,
+    # whereas an unconditional `bswap` would corrupt the values there.
+    magic_number = ntoh(read(io, UInt32))
+    total_items  = ntoh(read(io, UInt32))
+    magic_number, Int(total_items)
+end
+
+"""
+    readlabelheader(file::AbstractString)
+
+Opens `file` for reading using `gzopen`, reads its first two
+32 bit values, and returns them interpreted as an
+MNIST-label-file header
+"""
+function readlabelheader(file::AbstractString)
+    header = gzopen(file, "r") do io
+        readlabelheader(io)
+    end
+    header::Tuple{UInt32,Int}
+end
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`		`-/datasets`
	`1`	`+datasets/`
`2`	`2`	`sandbox.jl`