Merge pull request #9 from jbrea/dev

ablaom · web-flow · commit ebba5f9739ee · 2021-07-27T11:52:33.000+12:00
faster parsing of arff
diff --git a/Project.toml b/Project.toml
@@ -4,11 +4,14 @@ authors = ["Anthony D. Blaom <anthony.blaom@gmail.com>"]
 version = "1.0.0"
 
 [deps]
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
+ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
 
 [compat]
+CSV = "0.8"
 HTTP = "^0.8, 0.9"
 JSON = "^0.21"
 julia = "1"
diff --git a/src/openml.jl b/src/openml.jl
@@ -1,5 +1,7 @@
 using HTTP
 using JSON
+using CSV
+import ScientificTypes: Continuous, Count, Textual, Multiclass, coerce, autotype
 using Markdown
 
 const API_URL = "https://www.openml.org/api/v1/json"
@@ -9,10 +11,10 @@ const API_URL = "https://www.openml.org/api/v1/json"
 # https://github.com/openml/OpenML/tree/master/openml_OS/views/pages/api_new/v1/xsd
 # https://www.openml.org/api_docs#!/data/get_data_id
 
-# To do:
-# - Save the file in a local folder
-# - Check downloaded files in local folder before downloading it again
-# - Use local stored file whenever possible
+# TODO:
+# - Use e.g. DataDeps to cache data locally
+# - Move the ARFF parser into a separate package, or use ARFFFiles once
+#   https://github.com/cjdoris/ARFFFiles.jl/issues/4 is fixed.
 
 """
 Returns information about a dataset. The information includes the name,
@@ -42,74 +44,163 @@ function load_Dataset_Description(id::Int; api_key::String="")
     return nothing
 end
 
+function _parse(openml, val) # convert one raw sparse-ARFF token using its declared OpenML type
+    val == "?" && return missing # "?" denotes a missing value
+    openml ∈ ("real", "numeric", "integer") && return Meta.parse(val) # NOTE(review): Meta.parse returns a Symbol/Expr for non-numeric text; confirm tokens are always numeric literals
+    return val # any other declared type: keep the raw text
+end
+
+emptyvec(::Type{String}, length) = fill("", length) # default entry for strings is ""
+emptyvec(T::Any, length) = zeros(T, length) # default entry otherwise is zero(T)
+function _vec(idxs, vals::AbstractVector{<:Union{Missing, T}}, length) where T # densify: result[idxs[k]] = vals[k]
+    result = convert(Vector{eltype(vals)}, emptyvec(T, length)) # keep `Missing` in the eltype so missing entries can be stored
+    for k in eachindex(idxs)
+        result[idxs[k]] = vals[k] # positions not listed in idxs keep the default entry
+    end
+    result
+end
+
+_scitype(scitype, ::DataType) = scitype # inferred type is a plain DataType: return the scitype unchanged
+_scitype(scitype, ::Type{Union{Missing, T}}) where T = Union{Missing, scitype} # inferred type admits missing: so does the scitype
+function scitype(openml, inferred) # map an OpenML attribute type plus the inferred eltype to a scientific type
+    (openml == "real" || (openml == "numeric" && inferred <: Union{Missing, <:Real})) && return _scitype(Continuous, inferred)
+    (openml == "integer" || (openml == "numeric" && inferred <: Union{Missing, <:Integer})) && return _scitype(Count, inferred) # NOTE(review): for "numeric" this branch is shadowed by the Real check above
+    openml == "string" && return _scitype(Textual, inferred)
+    openml[1] == '{' && return _scitype(Multiclass, inferred) # nominal attribute, e.g. "{a,b,c}"
+    error("Cannot infer the scientific type for OpenML metadata $openml and inferred type $inferred.")
+end
+
+function needs_coercion(is, shouldbe, name, verbosity) # true if inferred type `is` disagrees with OpenML type `shouldbe`; `verbosity` is used as a Bool
+    if (shouldbe == "numeric" && !(is <: Union{Missing, <:Number})) ||
+       (shouldbe == "integer" && !(is <: Union{Missing, <:Integer})) ||
+       (shouldbe == "real" && !(is <: Union{Missing, <:Real})) ||
+       (shouldbe == "string" && !(is <: Union{Missing, <:AbstractString})) ||
+        shouldbe[1] == '{' # attribute types starting with '{' are always flagged
+        verbosity && @info "Inferred type `$is` does not match the OpenML metadata `$shouldbe` for feature `$name`. Please coerce to the desired type manually, or specify `parser = :openml` or `parser = :auto`. To suppress this message, specify `verbosity = 0`."
+        true
+    else
+        false
+    end
+end
+
 """
 Returns a Vector of NamedTuples.
 Receives an `HTTP.Message.response` that has an
 ARFF file format in the `body` of the `Message`.
 """
-function convert_ARFF_to_rowtable(response)
-    data = String(response.body)
-    data2 = split(data, "\n")
-
-    featureNames = String[]
+function convert_ARFF_to_columntable(response, verbosity, parser; kwargs...)
+    featureNames = Symbol[]
     dataTypes = String[]
-    # TODO: make this more performant by anticipating types?
-    named_tuples = [] # `Any` type here bad
-    for line in data2
+    io = IOBuffer(response.body)
+    for line in eachline(io)
         if length(line) > 0
             if line[1:1] != "%"
                 d = []
                 if occursin("@attribute", lowercase(line))
-                    push!(featureNames, replace(replace(split(line, " ")[2], "'" => ""), "-" => "_"))
-                    push!(dataTypes, split(line, " ")[3])
+                    splitline = split(line)
+                    push!(featureNames, Symbol(splitline[2]))
+                    push!(dataTypes, lowercase(join(splitline[3:end], "")))
                 elseif occursin("@relation", lowercase(line))
                     nothing
                 elseif occursin("@data", lowercase(line))
                     # it means the data starts
-                    nothing
-                else
-                    values = split(line, ",")
-                    for i in eachindex(featureNames)
-                        if lowercase(dataTypes[i]) in ["real","numeric"]
-                            push!(d, featureNames[i] => Meta.parse(values[i]))
-                        else
-                            # all the rest will be considered as String
-                            push!(d, featureNames[i] => values[i])
-                        end
-                    end
-                    push!(named_tuples, (; (Symbol(k) => v for (k,v) in d)...))
+                    break
                 end
             end
         end
     end
-    return identity.(named_tuples) # not performant; see above
+    while !eof(io) && Base.peek(io) ∈ (0x0a, 0x25) # skip empty lines and '%' comments; eof guard avoids a BoundsError on an empty data section
+        readline(io)
+    end
+    if !eof(io) && Base.peek(io) == 0x7b # '{' opens a sparse ARFF row
+        tmp = [(Int[], Union{Missing, type ∈ ("numeric", "real") ? Float64 : type == "integer" ? Int :  String}[]) for type in dataTypes]
+        i = 0
+        for line in eachline(io)
+            if !isempty(line) && line[1:1] != "%" # empty lines would otherwise throw on `line[1:1]`
+                splitline = split(line[2:end-1], ",")
+                splitline == [""] && continue
+                i += 1
+                for entry in splitline
+                    idx_string, val = split(entry)
+                    idx = parse(Int, idx_string) + 1 # shift the file's index to Julia's 1-based indexing
+                    push!(tmp[idx][1], i)
+                    push!(tmp[idx][2], _parse(dataTypes[idx], val))
+                end
+            end
+        end
+        tmpd = Dict(featureNames[k] => _vec(tmp[k][1], identity.(tmp[k][2]), i)
+                    for k in eachindex(featureNames))
+        inferred = [eltype(tmpd[k]) for k in featureNames]
+        result = CSV.Tables.DictColumnTable(CSV.Tables.Schema(featureNames, inferred),
+                                            tmpd)
+    else
+        result = CSV.File(io;
+                          header = featureNames,
+                          comment = "%",
+                          missingstring = "?",
+                          quotechar = ''',
+                          escapechar = '\\',
+                          kwargs...)
+        inferred = CSV.gettypes(result)
+        result = CSV.Tables.dictcolumntable(result)
+    end
+    if parser != :csv && length(featureNames) > 2000
+        @info "Parser $parser is very slow for more than 2000 features. Returning result of csv parser."
+        parser = :csv
+    end
+    idxs = needs_coercion.(inferred, dataTypes, featureNames, parser == :csv && verbosity > 0)
+    if parser ∈ (:openml, :auto)
+        result = coerce(result, [name => scitype(type, inferred)
+                                 for (name, type, inferred) in
+                                 zip(featureNames[idxs], dataTypes[idxs], inferred[idxs])]...)
+    end
+    if parser == :auto
+        result = coerce(result, autotype(result))
+    end
+    return result
 end
 
 """
-    MLJOpenML.load(id)
+    MLJOpenML.load(id; verbosity = 1, parser = :csv, kwargs...)
+
+Load the OpenML dataset with specified `id`, from those listed by
+[`list_datasets`](@ref) or on the [OpenML site](https://www.openml.org/search?type=data).
+If `parser = :csv`, the column types are detected automatically by
+`CSV.File`. A message is shown if `verbosity > 0` and the detected
+type does not match the OpenML metadata. If `parser = :openml` the OpenML metadata
+is used to `coerce` the columns to scientific types according to the rules:
+| metadata | inferred type | scientific type |
+|----------|---------------|-----------------|
+|numeric   | <: Real       | Continuous      |
+|numeric   | <: Integer    | Count           |
+|real      | <: Any        | Continuous      |
+|integer   | <: Any        | Count           |
+|string    | <: Any        | Textual         |
+|{ANYTHING}| <: Any        | Multiclass      |
+
+See [here](https://waikato.github.io/weka-wiki/formats_and_processing/arff_developer/)
+for info on the OpenML metadata.
+
+With `parser = :auto`, the `autotype`'s of the output of `parser = :openml` are
+used to coerce the data further.
+
+For data with more than 2000 features (columns) `parser = :csv` is used always,
+because `parser = :openml` can be much slower.
+
+Returns a table.
 
-Load the OpenML dataset with specified `id`, from those listed on the
-[OpenML site](https://www.openml.org/search?type=data).
-
-Returns a "row table", i.e., a `Vector` of identically typed
-`NamedTuple`s. A row table is compatible with the
-[Tables.jl](https://github.com/JuliaData/Tables.jl) interface and can
-therefore be readily converted to other compatible formats. For
-example:
+# Examples
 
 ```julia
 using DataFrames
-rowtable = MLJOpenML.load(61);
-df = DataFrame(rowtable);
-
-using MLJ
-df2 = coerce(df, :class=>Multiclass)
+table = MLJOpenML.load(61);
+df = DataFrame(table);
 ```
 """
-function load(id::Int)
+function load(id::Int; verbosity = 1, parser = :csv, kwargs...)
     response = load_Dataset_Description(id)
     arff_file = HTTP.request("GET", response["data_set_description"]["url"])
-    return convert_ARFF_to_rowtable(arff_file)
+    return convert_ARFF_to_columntable(arff_file, verbosity, parser; kwargs...)
 end
 
 
diff --git a/test/openml.jl b/test/openml.jl
@@ -23,8 +23,8 @@ end
 
 @testset "ARFF file conversion to NamedTuples" begin
     @test isempty(ntp_test) == false
-    @test length(ntp_test) == 150
-    @test length(ntp_test[1]) == 5
+    @test length(ntp_test[1]) == 150
+    @test length(ntp_test) == 5
 end
 
 @testset "data api functions" begin