Merge pull request #31 from JuliaML/iris

CarloLucibello · web-flow · commit 3d6cddf1f793 · 2020-02-29T18:59:42.000+01:00
add Iris Dataset
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
 sandbox.jl
+Manifest.toml
diff --git a/.travis.yml b/.travis.yml
@@ -4,7 +4,7 @@ os:
   - osx
 julia:
   - 1.0
-  - 1.2
+  - 1.3
   - nightly
 notifications:
   email: false
diff --git a/Project.toml b/Project.toml
@@ -1,11 +1,12 @@
 name = "MLDatasets"
 uuid = "eb30cadb-4394-5ae3-aed4-317e484a6458"
-version = "0.4.0"
+version = "0.4.1"
 
 [deps]
 BinDeps = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
 ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
 DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
+DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
 GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
diff --git a/README.md b/README.md
@@ -75,6 +75,12 @@ Dataset | Classes | `traintensor` | `trainlabels` | `testtensor` | `testlabels`
 
 (*) Note that the SVHN-2 dataset provides an additional 531131 observations aside from the training- and testset
 
+
+### Misc. Datasets
+Dataset | Classes | `traintensor` | `trainlabels` | `testtensor` | `testlabels`
+:------:|:-------:|:-------------:|:-------------:|:------------:|:------------:
+**Iris** | 3 | 4x150 | 150 | - | -
+
 ### Language Modeling
 
 #### PTBLM
diff --git a/REQUIRE b/REQUIRE
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -69,17 +69,64 @@ the purpose of image classification.
 
 Dataset | Classes | `traintensor` | `trainlabels` | `testtensor` | `testlabels`
 :------:|:-------:|:-------------:|:-------------:|:------------:|:------------:
-[**MNIST**](@ref MNIST) | 10 | 28x28x60000 | 60000 | 28x28x10000 | 10000
-[**FashionMNIST**](@ref FashionMNIST) | 10 | 28x28x60000 | 60000 | 28x28x10000 | 10000
-[**CIFAR-10**](@ref CIFAR10) | 10 | 32x32x3x50000 | 50000 | 32x32x3x10000 | 10000
-[**CIFAR-100**](@ref CIFAR100) | 100 (20) | 32x32x3x50000 | 50000 (x2) | 32x32x3x10000 | 10000 (x2)
-[**SVHN-2**](@ref SVHN2) (*) | 10 | 32x32x3x73257 | 73257 | 32x32x3x26032 | 26032
+[**MNIST**](https://juliaml.github.io/MLDatasets.jl/latest/datasets/MNIST/) | 10 | 28x28x60000 | 60000 | 28x28x10000 | 10000
+[**FashionMNIST**](https://juliaml.github.io/MLDatasets.jl/latest/datasets/FashionMNIST/) | 10 | 28x28x60000 | 60000 | 28x28x10000 | 10000
+[**CIFAR-10**](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR10/) | 10 | 32x32x3x50000 | 50000 | 32x32x3x10000 | 10000
+[**CIFAR-100**](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR100/) | 100 (20) | 32x32x3x50000 | 50000 (x2) | 32x32x3x10000 | 10000 (x2)
+[**SVHN-2**](https://juliaml.github.io/MLDatasets.jl/latest/datasets/SVHN2/) (*) | 10 | 32x32x3x73257 | 73257 | 32x32x3x26032 | 26032
 
 (*) Note that the SVHN-2 dataset provides an additional 531131 observations aside from the training- and testset
 
+
+### Misc. Datasets
+Dataset | Classes | `traintensor` | `trainlabels` | `testtensor` | `testlabels`
+:------:|:-------:|:-------------:|:-------------:|:------------:|:------------:
+**Iris** | 3 | 4x150 | 150 | - | -
+
 ### Language Modeling
 
-Work in progress
+#### PTBLM
+
+The `PTBLM` dataset consists of Penn Treebank sentences for
+language modeling, available from
+[tomsercu/lstm](https://github.com/tomsercu/lstm). The unknown
+words are replaced with `<unk>` so that the total vocabulary size
+becomes 10000.
+
+This is the first sentence of the PTBLM dataset.
+
+```julia
+x, y = PTBLM.traindata()
+
+x[1]
+> ["no", "it", "was", "n't", "black", "monday"]
+y[1]
+> ["it", "was", "n't", "black", "monday", "<eos>"]
+```
+
+where `MLDatasets` appends the special word `<eos>` to the end of each sentence in `y`.
+
+### Text Analysis (POS-Tagging, Parsing)
+
+#### UD English
+
+The [UD_English](https://github.com/UniversalDependencies/UD_English-EWT)
+Universal Dependencies English Web Treebank dataset is an annotated corpus of morphological features,
+POS-tags and syntactic trees. The dataset follows CoNLL-style
+format.
+
+```julia
+traindata = UD_English.traindata()
+devdata = UD_English.devdata()
+testdata = UD_English.testdata()
+```
+
+## Data Size
+|    | Train x | Train y | Test x | Test y |
+|:--:|:-------:|:-------:|:------:|:------:|
+| **PTBLM** | 42068 | 42068 | 3761 | 3761 |
+| **UD_English** | 12543 | - | 2077 | - |
+
 
 ## Index
 
diff --git a/src/Iris/Iris.jl b/src/Iris/Iris.jl
@@ -0,0 +1,89 @@
+export Iris
+
+"""
+Fisher's classic iris dataset.
+
+Measurements from 3 different species of iris: setosa, versicolor and
+virginica.  There are 50 examples of each species.
+
+There are 4 measurements for each example: sepal length, sepal width, petal
+length and petal width.  The measurements are in centimeters.
+
+The module retrieves the data from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/iris).
+
+NOTE: there is no pre-defined train-test split for this dataset; `features` and `labels` return the whole dataset.
+
+## Interface
+
+- [`Iris.features`](@ref)
+- [`Iris.labels`](@ref)
+
+## Utilities
+
+- [`Iris.download`](@ref)
+"""
+module Iris
+
+using DataDeps
+using ..MLDatasets: bytes_to_type, datafile, download_dep, download_docstring
+using DelimitedFiles
+
+export features, labels, download
+
+const DEPNAME = "Iris"
+const LINK = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/"
+const DOCS = "https://archive.ics.uci.edu/ml/datasets/Iris"
+const DATA = "iris.data"
+
+"""
+    download([dir]; [i_accept_the_terms_of_use])
+
+Trigger the (interactive) download of the full dataset into
+"`dir`". If no `dir` is provided the dataset will be
+downloaded into "~/.julia/datadeps/$DEPNAME".
+
+This function will display an interactive dialog unless
+either the keyword parameter `i_accept_the_terms_of_use` or
+the environment variable `DATADEPS_ALWAY_ACCEPT` is set to
+`true`. Note that using the data responsibly and respecting
+copyright/terms-of-use remains your responsibility.
+"""
+download(args...; kw...) = download_dep(DEPNAME, args...; kw...)
+
+function __init__()  # runs when the module is loaded: registers the data dependency
+    register(DataDep(
+        DEPNAME,
+        """
+        Dataset: The Iris dataset
+        Website: $DOCS
+        """,
+        LINK .* [DATA],  # one-element vector holding the URL of the data file
+        "1ec014c249120402fc228dbab231129b87a7359699675059035af0f4adc3b863"  # if checksum omitted, will be generated by DataDeps
+    ))
+end
+
+"""
+    labels(; dir = nothing)
+
+Return a string vector of length 150 containing observations' labels,
+read from the last column of the data file.
+"""
+function labels(; dir = nothing)
+    iris = readdlm(datafile(DEPNAME, DATA, dir), ',')
+    # The label is the last comma-separated field of every record.
+    return Vector{String}(iris[:, end])
+end
+
+"""
+    features(; dir = nothing)
+
+Return a 4x150 matrix containing the 4-dimensional features of each observation.
+"""
+function features(; dir = nothing)
+    iris = readdlm(datafile(DEPNAME, DATA, dir), ',')
+    # Rows of the file are observations; permute so that each column is one.
+    return permutedims(Matrix{Float64}(iris[:, 1:4]))
+end
+
+end # module
+
diff --git a/src/MLDatasets.jl b/src/MLDatasets.jl
@@ -39,7 +39,7 @@ end
 
 include("download.jl")
 include("CoNLL.jl")
-
+include("Iris/Iris.jl")
 include("CIFAR10/CIFAR10.jl")
 include("CIFAR100/CIFAR100.jl")
 include("MNIST/MNIST.jl")
diff --git a/test/REQUIRE b/test/REQUIRE
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -2,6 +2,7 @@ using Test
 using MLDatasets
 
 tests = [
+    "tst_iris.jl",
     "tst_cifar10.jl",
     "tst_cifar100.jl",
     "tst_mnist.jl",
diff --git a/test/tst_cifar100.jl b/test/tst_cifar100.jl
@@ -32,7 +32,7 @@ if parse(Bool, get(ENV, "CI", "false"))
     @info "CI detected: skipping dataset download"
 else
     data_dir = withenv("DATADEPS_ALWAY_ACCEPT"=>"true") do
-        datadep"CIFAR100"
+        datadep"CIFAR100"  # resolve the CIFAR-100 data dependency that this suite tests
     end
 
     @testset "classnames" begin
diff --git a/test/tst_iris.jl b/test/tst_iris.jl
@@ -0,0 +1,23 @@
+module Iris_Tests
+using Test
+using DataDeps
+using MLDatasets
+
+# The loaders need the downloaded data file, so they are only exercised outside CI.
+if !parse(Bool, get(ENV, "CI", "false"))
+    data_dir = withenv("DATADEPS_ALWAY_ACCEPT"=>"true") do
+        datadep"Iris"
+    end
+
+    @testset "Iris" begin
+        feats = Iris.features()
+        labs = Iris.labels()
+        @test feats isa Matrix{Float64}
+        @test size(feats) == (4, 150)
+        @test labs isa Vector{String}
+        @test size(labs) == (150,)
+    end
+else
+    @info "CI detected: skipping dataset download"
+end
+
+end