JuliaML
diff --git a/‎README.md
Lines changed: 45 additions & 10 deletions b/‎README.md
Lines changed: 45 additions & 10 deletions
diff --git a/‎src/FashionMNIST/FashionMNIST.jl
Lines changed: 51 additions & 4 deletions b/‎src/FashionMNIST/FashionMNIST.jl
Lines changed: 51 additions & 4 deletions
diff --git a/‎src/FashionMNIST/README.md
Lines changed: 82 additions & 0 deletions b/‎src/FashionMNIST/README.md
Lines changed: 82 additions & 0 deletions
diff --git a/‎src/FashionMNIST/interface.jl
Lines changed: 22 additions & 14 deletions b/‎src/FashionMNIST/interface.jl
Lines changed: 22 additions & 14 deletions
diff --git a/‎src/FashionMNIST/reader.jl
Lines changed: 0 additions & 13 deletions b/‎src/FashionMNIST/reader.jl
Lines changed: 0 additions & 13 deletions
diff --git a/‎src/MNIST/MNIST.jl
Lines changed: 37 additions & 3 deletions b/‎src/MNIST/MNIST.jl
Lines changed: 37 additions & 3 deletions
@@ -1,34 +1,46 @@
 # MLDatasets.jl
+
 [![Build Status](https://travis-ci.org/JuliaML/MLDatasets.jl.svg?branch=master)](https://travis-ci.org/JuliaML/MLDatasets.jl)
 
-`MLDatasets` provides an access to common machine learning datasets for [Julia](http://julialang.org/).
-Currently, julia 0.5 is supported.
+`MLDatasets` provides access to common machine learning
+datasets for [Julia](http://julialang.org/). Currently, julia 0.5
+is supported.
 
-The datasets are automatically downloaded to the specified directory.
-The default directory is `MLDatasets/datasets`.
+The datasets are automatically downloaded to the specified
+directory. The default directory is `MLDatasets/datasets`.
 
 ## Installation
+
 ```julia
 julia> Pkg.clone("https://github.com/JuliaML/MLDatasets.jl.git")
 ```
 
 ## Basic Usage
+
 ```julia
 using MLDatasets
 
 train_x, train_y = MNIST.traindata()
 test_x, test_y = MNIST.testdata()
 ```
+
 Use `traindata(<directory>)` and `testdata(<directory>)` to change the default directory.
 
 ## Available Datasets
+
 ### Image Classification
+
 #### CIFAR-10
-The [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset consists of 60000 32x32 color images in 10 classes.
+
+The [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html)
+dataset consists of 60000 32x32 color images in 10 classes.
 
 #### CIFAR-100
-The [CIFAR-100](https://www.cs.toronto.edu/~kriz/cifar.html) dataset consists of 600 32x32 color images in 100 classes.
-The 100 classes are grouped into 20 superclasses (fine and coarse labels).
+
+The [CIFAR-100](https://www.cs.toronto.edu/~kriz/cifar.html)
+dataset consists of 60000 32x32 color images in 100 classes
+(600 images per class). The 100 classes are grouped into 20
+superclasses (fine and coarse labels).
 
 #### MNIST
 
@@ -38,12 +50,27 @@ of 60000 28x28 images of handwritten digits.
 Take a look at the [sub-module](src/MNIST/README.md) for more
 information.
 
+#### Fashion-MNIST
+
+The [Fashion-MNIST](https://github.com/zalandoresearch/fashion-mnist)
+dataset consists of 60000 28x28 images of fashion products. It
+was designed to be a drop-in replacement for the MNIST dataset.
+
+Take a look at the [sub-module](src/FashionMNIST/README.md) for more
+information.
+
 ### Language Modeling
+
 #### PTBLM
-The `PTBLM` dataset consists of Penn Treebank sentences for language modeling, available from [tomsercu/lstm](https://github.com/tomsercu/lstm).
-The unknown words are replaced with `<unk>` so that the total vocaburary size becomes 10000.
+
+The `PTBLM` dataset consists of Penn Treebank sentences for
+language modeling, available from
+[tomsercu/lstm](https://github.com/tomsercu/lstm). The unknown
+words are replaced with `<unk>` so that the total vocabulary size
+becomes 10000.
 
 This is the first sentence of the PTBLM dataset.
+
 ```julia
 x, y = PTBLM.traindata()
 
@@ -52,11 +79,18 @@ x[1]
 y[1]
 > ["it", "was", "n't", "black", "monday", "<eos>"]
 ```
+
 where `MLDatasets` adds the special word `<eos>` to the end of `y`.
 
 ### Text Analysis (POS-Tagging, Parsing)
+
 #### UD English
-The [UD_English](https://github.com/UniversalDependencies/UD_English) dataset is an annotated corpus of morphological features, POS-tags and syntactic trees. The dataset follows CoNLL-style format.
+
+The [UD_English](https://github.com/UniversalDependencies/UD_English)
+dataset is an annotated corpus of morphological features,
+POS-tags and syntactic trees. The dataset follows CoNLL-style
+format.
+
 ```julia
 traindata = UD_English.traindata()
 devdata = UD_English.devdata()
@@ -69,5 +103,6 @@ testdata = UD_English.devdata()
 | **CIFAR-10** | image | 32x32x3x50000 | 50000 | 32x32x3x10000 | 10000 |
 | **CIFAR-100** | image | 32x32x3x500 | 2x500 | 32x32x3x100 | 2x100 |
 | **MNIST** | image | 28x28x60000 | 60000 | 28x28x10000 | 10000 |
+| **FashionMNIST** | image | 28x28x60000 | 60000 | 28x28x10000 | 10000 |
 | **PTBLM** | text | 42068 | 42068 | 3761 | 3761 |
 | **UD_English** | text | 12543 | - | 2077 | - |
@@ -2,6 +2,12 @@ export FashionMNIST
 module FashionMNIST
     using ImageCore
     using ColorTypes
+    import ..downloaded_file
+    import ..download_helper
+    import ..DownloadSettings
+    import ..MNIST.convert2image
+    import ..MNIST.convert2features
+    import ..MNIST.Reader
 
     export
 
@@ -17,11 +23,52 @@ module FashionMNIST
         convert2image,
         convert2features,
 
-        download_helper
+        download
 
-    const DEFAULT_DIR = abspath(joinpath(dirname(@__FILE__), "..", "..", "datasets", "fashion_mnist"))
+    # Absolute path of the default storage location: the directory
+    # `datasets/fashion_mnist` located two levels above the directory
+    # that contains this source file.
+    const DEFAULT_DIR = abspath(joinpath(@__DIR__, "..", "..", "datasets", "fashion_mnist"))
+
+    # File names of the four gzip-compressed archives that make up the
+    # dataset (training/test images and training/test labels). The same
+    # names are listed in SETTINGS and looked up by the loader functions.
+    const TRAINIMAGES = "train-images-idx3-ubyte.gz"
+    const TRAINLABELS = "train-labels-idx1-ubyte.gz"
+    const TESTIMAGES  = "t10k-images-idx3-ubyte.gz"
+    const TESTLABELS  = "t10k-labels-idx1-ubyte.gz"
+
+    # Human-readable names of the ten Fashion-MNIST classes.
+    # NOTE(review): presumably ordered so that `CLASSES[label + 1]` names
+    # the zero-based numeric label — confirm against the dataset docs.
+    const CLASSES = [
+        "T-Shirt",
+        "Trouser",
+        "Pullover",
+        "Dress",
+        "Coat",
+        "Sandal",
+        "Shirt",
+        "Sneaker",
+        "Bag",
+        "Ankle boot"
+    ]
+
+    # Download configuration for Fashion-MNIST, built from three
+    # positional arguments: the base URL of the file host, a notice with
+    # authorship/license/citation information, and the list of archive
+    # file names that belong to the dataset.
+    # NOTE(review): presumably the notice is what the interactive download
+    # prompt shows to the user — confirm against `DownloadSettings`.
+    const SETTINGS = DownloadSettings(
+        "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/",
+        """
+        Dataset: Fashion-MNIST
+        Authors: Han Xiao, Kashif Rasul, Roland Vollgraf
+        Website: https://github.com/zalandoresearch/fashion-mnist
+        License: MIT
+
+        [Han Xiao et al. 2017]
+            Han Xiao, Kashif Rasul, and Roland Vollgraf.
+            "Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms."
+            arXiv:1708.07747
+
+        The files are available for download at the official
+        website linked above. We can download these files for you
+        if you wish, but that doesn't free you from the burden of
+        using the data responsibly and respecting license and
+        authorship.
+        """,
+        [TRAINIMAGES, TRAINLABELS, TESTIMAGES, TESTLABELS]
+    )
+
+    # Forward a download request for the Fashion-MNIST files to the shared
+    # `download_helper`, using this module's SETTINGS. `dir` defaults to
+    # DEFAULT_DIR; any keyword arguments are passed through unchanged.
+    function download(dir = DEFAULT_DIR; kw...)
+        return download_helper(SETTINGS, dir; kw...)
+    end
 
-    include("reader.jl")
     include("interface.jl")
-    include(joinpath("..", "MNIST", "utils.jl"))
 end
@@ -0,0 +1,82 @@
+# Fashion-MNIST
+
+Description from the [official website](https://github.com/zalandoresearch/fashion-mnist)
+
+> Fashion-MNIST is a dataset of Zalando's article
+> images—consisting of a training set of 60,000 examples and a
+> test set of 10,000 examples. Each example is a 28x28 grayscale
+> image, associated with a label from 10 classes. We intend
+> Fashion-MNIST to serve as a direct drop-in replacement for the
+> original MNIST dataset for benchmarking machine learning
+> algorithms. It shares the same image size and structure of
+> training and testing splits.
+
+## Usage
+
+This sub-module provides a programmatic interface to download,
+load, and work with the Fashion-MNIST dataset of fashion product images.
+
+```julia
+using MLDatasets
+
+# download dataset
+FashionMNIST.download()
+
+# load full training set
+train_x, train_y = FashionMNIST.traindata()
+
+# load full test set
+test_x,  test_y  = FashionMNIST.testdata()
+```
+
+The provided functions also allow for optional arguments, such as
+the directory `dir` where the dataset is located, or the specific
+observation `indices` that one wants to work with. For more
+information on the interface take a look at the documentation
+(e.g. `?FashionMNIST.traindata`).
+
+Function | Description
+---------|-------------
+`download([dir])` | Trigger interactive download of the dataset
+`traintensor([indices]; [dir], [decimal=true])` | Load the training images as an array
+`trainlabels([indices]; [dir])` | Load the labels for the training images
+`testtensor([indices]; [dir], [decimal=true])` | Load the test images as an array
+`testlabels([indices]; [dir])` | Load the labels for the test images
+`traindata([indices]; [dir], [decimal=true])` | Load images and labels of the training data
+`testdata([indices]; [dir], [decimal=true])` | Load images and labels of the test data
+
+This module also provides utility functions to make working with
+the FashionMNIST dataset in Julia more convenient.
+
+You can use the function `convert2features` to convert the given
+FashionMNIST tensor to a feature matrix (or feature vector in the case
+of a single image). The purpose of this function is to drop the
+spatial dimensions such that traditional ML algorithms can
+process the dataset.
+
+```julia
+julia> FashionMNIST.convert2features(FashionMNIST.traintensor()) # full training data
+784×60000 Array{Float64,2}:
+[...]
+```
+
+To visualize an image or a prediction we provide the function
+`convert2image` to convert the given FashionMNIST horizontal-major
+tensor (or feature matrix) to a vertical-major `Colorant` array.
+The values are also color corrected according to the website's
+description, which means that the items are black on a white
+background.
+
+```julia
+julia> FashionMNIST.convert2image(FashionMNIST.traintensor(1)) # first training image
+28×28 Array{Gray{Float64},2}:
+[...]
+```
+
+## References
+
+- **Authors**: Han Xiao, Kashif Rasul, Roland Vollgraf
+
+- **Website**: https://github.com/zalandoresearch/fashion-mnist
+
+- **[Han Xiao et al. 2017]** Han Xiao, Kashif Rasul, and Roland Vollgraf. "Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms." arXiv:1708.07747
@@ -60,11 +60,8 @@ julia> FashionMNIST.convert2image(FashionMNIST.traintensor(1)) # convert to colu
 ```
 """
 function traintensor(args...; dir=DEFAULT_DIR, decimal=true)
-    if decimal
-        Reader.readtrainimages(dir, args...) ./ 255
-    else
-        convert(Array{Float64}, Reader.readtrainimages(dir, args...))
-    end
+    # Look up the training-image archive for `dir`, then read the
+    # requested observations (all of them when `args` is empty).
+    imagefile = downloaded_file(SETTINGS, dir, TRAINIMAGES)
+    pixels = Reader.readimages(imagefile, args...)
+    if decimal
+        # divide the raw values element-wise by 255
+        return pixels ./ 255
+    else
+        # keep the raw magnitudes, only change the element type
+        return convert(Array{Float64}, pixels)
+    end
 end
 
 """
@@ -129,11 +126,8 @@ julia> FashionMNIST.convert2image(FashionMNIST.testtensor(1)) # convert to colum
 ```
 """
 function testtensor(args...; dir=DEFAULT_DIR, decimal=true)
-    if decimal
-        Reader.readtestimages(dir, args...) ./ 255
-    else
-        convert(Array{Float64}, Reader.readtestimages(dir, args...))
-    end
+    # Look up the test-image archive for `dir`, then read the
+    # requested observations (all of them when `args` is empty).
+    imagefile = downloaded_file(SETTINGS, dir, TESTIMAGES)
+    pixels = Reader.readimages(imagefile, args...)
+    if decimal
+        # divide the raw values element-wise by 255
+        return pixels ./ 255
+    else
+        # keep the raw magnitudes, only change the element type
+        return convert(Array{Float64}, pixels)
+    end
 end
 
 """
@@ -174,8 +168,15 @@ julia> FashionMNIST.trainlabels(dir="/home/user/fashion_mnist")
 WARNING: The FashionMNIST file "train-labels-idx1-ubyte.gz" was not found in "/home/user/fashion_mnist". You can download [...]
 ```
 """
-trainlabels(args...; dir=DEFAULT_DIR) = Vector{Int}(Reader.readtrainlabels(dir, args...))
-trainlabels(index::Integer; dir=DEFAULT_DIR) = Int(Reader.readtrainlabels(dir, index))
+function trainlabels(args...; dir=DEFAULT_DIR)
+    # Generic method: whatever the reader returns for the requested
+    # observations is materialized as a `Vector{Int}`.
+    labelfile = downloaded_file(SETTINGS, dir, TRAINLABELS)
+    rawlabels = Reader.readlabels(labelfile, args...)
+    return Vector{Int}(rawlabels)
+end
+
+function trainlabels(index::Integer; dir=DEFAULT_DIR)
+    # Single-index method: the label read for `index` is converted to a
+    # scalar `Int` instead of being wrapped in a vector.
+    labelfile = downloaded_file(SETTINGS, dir, TRAINLABELS)
+    rawlabel = Reader.readlabels(labelfile, index)
+    return Int(rawlabel)
+end
 
 """
     testlabels([indices]; [dir])
@@ -215,8 +216,15 @@ julia> FashionMNIST.testlabels(dir="/home/user/fashion_mnist")
 WARNING: The FashionMNIST file "t10k-labels-idx1-ubyte.gz" was not found in "/home/user/fashion_mnist". You can download [...]
 ```
 """
-testlabels(args...; dir=DEFAULT_DIR) = Vector{Int}(Reader.readtestlabels(dir, args...))
-testlabels(index::Integer; dir=DEFAULT_DIR) = Int(Reader.readtestlabels(dir, index))
+function testlabels(args...; dir=DEFAULT_DIR)
+    # Generic method: whatever the reader returns for the requested
+    # observations is materialized as a `Vector{Int}`.
+    labelfile = downloaded_file(SETTINGS, dir, TESTLABELS)
+    rawlabels = Reader.readlabels(labelfile, args...)
+    return Vector{Int}(rawlabels)
+end
+
+function testlabels(index::Integer; dir=DEFAULT_DIR)
+    # Single-index method: the label read for `index` is converted to a
+    # scalar `Int` instead of being wrapped in a vector.
+    labelfile = downloaded_file(SETTINGS, dir, TESTLABELS)
+    rawlabel = Reader.readlabels(labelfile, index)
+    return Int(rawlabel)
+end
 
 """
     traindata([indices]; [dir], [decimal=true]) -> Tuple
 
@@ -2,6 +2,9 @@ export MNIST
 module MNIST
     using ImageCore
     using ColorTypes
+    import ..downloaded_file
+    import ..download_helper
+    import ..DownloadSettings
 
     export
 
@@ -17,9 +20,40 @@ module MNIST
         convert2image,
         convert2features,
 
-        download_helper
-
-    const DEFAULT_DIR = abspath(joinpath(dirname(@__FILE__), "..", "..", "datasets", "mnist"))
+        download
+
+    # Absolute path of the default storage location: the directory
+    # `datasets/mnist` located two levels above the directory that
+    # contains this source file.
+    const DEFAULT_DIR = abspath(joinpath(@__DIR__, "..", "..", "datasets", "mnist"))
+
+    # File names of the four gzip-compressed archives that make up the
+    # dataset (training/test images and training/test labels). The same
+    # names are listed in SETTINGS below.
+    const TRAINIMAGES = "train-images-idx3-ubyte.gz"
+    const TRAINLABELS = "train-labels-idx1-ubyte.gz"
+    const TESTIMAGES  = "t10k-images-idx3-ubyte.gz"
+    const TESTLABELS  = "t10k-labels-idx1-ubyte.gz"
+
+    # Download configuration for MNIST, built from three positional
+    # arguments: the base URL of the file host, a notice with
+    # authorship/citation information, and the list of archive file
+    # names that belong to the dataset.
+    # NOTE(review): presumably the notice is what the interactive download
+    # prompt shows to the user — confirm against `DownloadSettings`.
+    const SETTINGS = DownloadSettings(
+        "http://yann.lecun.com/exdb/mnist/",
+        """
+        Dataset: THE MNIST DATABASE of handwritten digits
+        Authors: Yann LeCun, Corinna Cortes, Christopher J.C. Burges
+        Website: http://yann.lecun.com/exdb/mnist/
+
+        [LeCun et al., 1998a]
+            Y. LeCun, L. Bottou, Y. Bengio, and P. Haffner.
+            "Gradient-based learning applied to document recognition."
+            Proceedings of the IEEE, 86(11):2278-2324, November 1998
+
+        The files are available for download at the official
+        website linked above. We can download these files for you
+        if you wish, but that doesn't free you from the burden of
+        using the data responsibly and respecting copyright. The
+        authors of MNIST aren't really explicit about any terms
+        of use, so please read the website to make sure you want
+        to download the dataset.
+        """,
+        [TRAINIMAGES, TRAINLABELS, TESTIMAGES, TESTLABELS]
+    )
+
+    # Forward a download request for the MNIST files to the shared
+    # `download_helper`, using this module's SETTINGS. `dir` defaults to
+    # DEFAULT_DIR; any keyword arguments are passed through unchanged.
+    function download(dir = DEFAULT_DIR; kw...)
+        return download_helper(SETTINGS, dir; kw...)
+    end
 
     include(joinpath("Reader","Reader.jl"))
     include("interface.jl")