Commit 23d2f2d

use ARFFFiles

1 parent: 138f28a

3 files changed: +5, -145 lines
Project.toml

Lines changed: 2 additions & 2 deletions

```diff
@@ -4,17 +4,17 @@ authors = ["Anthony D. Blaom <[email protected]>"]
 version = "1.1.0"
 
 [deps]
-CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8"
 HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
 ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
 
 [compat]
-CSV = "0.8"
 HTTP = "0.8, 0.9"
 JSON = "0.21"
 ScientificTypes = "2"
+ARFFFiles = "1.3"
 julia = "1"
 
 [extras]
```
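
Not part of this commit: a rough sketch of how the dependency swap above could be reproduced with Pkg in a local checkout. The package names are taken from the diff; the `[compat]` bound (`ARFFFiles = "1.3"`) is still edited by hand in Project.toml.

```julia
# Hedged sketch (not part of the commit): swap the CSV dependency for
# ARFFFiles in the activated project environment.
using Pkg
Pkg.activate(".")        # activate the MLJOpenML project
Pkg.rm("CSV")            # drop the old dependency
Pkg.add("ARFFFiles")     # add the new one; then set ARFFFiles = "1.3" under [compat]
```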

README.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -24,7 +24,7 @@ Load the iris data set from OpenML:
 
 ```julia
 using MLJOpenML
-table = MLJOpenML.load(61) # a CSV.File object
+table = MLJOpenML.load(61) # a Tables.DictColumnTable
 ```
 
 Convert to a `DataFrame`:
````
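
The conversion step the README goes on to show is outside this hunk, but the unchanged docstring context in the source diff below (`df = DataFrame(table);`) suggests it remains the usual one-liner. A minimal sketch, assuming DataFrames.jl is available:

```julia
# The loaded table is Tables.jl-compatible, so it converts directly.
using DataFrames
df = DataFrame(table)
```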

src/openml.jl

Lines changed: 2 additions & 142 deletions

```diff
@@ -1,6 +1,6 @@
 using HTTP
 using JSON
-using CSV
+import ARFFFiles
 import ScientificTypes: Continuous, Count, Textual, Multiclass, coerce, autotype
 using Markdown
 
@@ -44,151 +44,11 @@ function load_Dataset_Description(id::Int; api_key::String="")
     return nothing
 end
 
-function _parse(openml, val)
-    val == "?" && return missing
-    openml ∈ ("real", "numeric", "integer") && return Meta.parse(val)
-    return val
-end
-
-emptyvec(::Type{String}, length) = fill("", length)
-emptyvec(T::Any, length) = zeros(T, length)
-function _vec(idxs, vals::AbstractVector{<:Union{Missing, T}}, length) where T
-    result = emptyvec(T, length)
-    for k in eachindex(idxs)
-        result[idxs[k]] = vals[k]
-    end
-    result
-end
-
-_scitype(scitype, ::DataType) = scitype
-_scitype(scitype, ::Type{Union{Missing, T}}) where T = Union{Missing, scitype}
-function scitype(openml, inferred)
-    (openml == "real" || (openml == "numeric" && inferred <: Union{Missing, <:Real})) && return _scitype(Continuous, inferred)
-    (openml == "integer" || (openml == "numeric" && inferred <: Union{Missing, <:Integer})) && return _scitype(Count, inferred)
-    openml == "string" && return _scitype(Textual, inferred)
-    openml[1] == '{' && return _scitype(Multiclass, inferred)
-    error("Cannot infer the scientific type for OpenML metadata $openml and inferred type $inferred.")
-end
-
-function needs_coercion(is, shouldbe, name, verbosity)
-    if (shouldbe == "numeric" && !(is <: Union{Missing, <:Number})) ||
-       (shouldbe == "integer" && !(is <: Union{Missing, <:Integer})) ||
-       (shouldbe == "real" && !(is <: Union{Missing, <:Real})) ||
-       (shouldbe == "string" && !(is <: Union{Missing, <:AbstractString})) ||
-       shouldbe[1] == '{'
-        verbosity && @info "Inferred type `$is` does not match the OpenML metadata `$shouldbe` for feature `$name`. Please coerce to the desired type manually, or specify `parser = :openml` or `parser = :auto`. To suppress this message, specify `verbosity = 0`."
-        true
-    else
-        false
-    end
-end
-
-"""
-Returns a Vector of NamedTuples.
-Receives an `HTTP.Message.response` that has an
-ARFF file format in the `body` of the `Message`.
-"""
-function convert_ARFF_to_columntable(response, verbosity, parser; kwargs...)
-    featureNames = Symbol[]
-    dataTypes = String[]
-    io = IOBuffer(response.body)
-    for line in eachline(io)
-        if length(line) > 0
-            if line[1:1] != "%"
-                d = []
-                if occursin("@attribute", lowercase(line))
-                    splitline = split(line)
-                    push!(featureNames, Symbol(splitline[2]))
-                    push!(dataTypes, lowercase(join(splitline[3:end], "")))
-                elseif occursin("@relation", lowercase(line))
-                    nothing
-                elseif occursin("@data", lowercase(line))
-                    # it means the data starts
-                    break
-                end
-            end
-        end
-    end
-    while io.data[io.ptr] ∈ (0x0a, 0x25) # skip empty new lines and comments
-        readline(io)
-    end
-    if io.data[io.ptr] == 0x7b # sparse ARFF file
-        tmp = [(Int[], Union{Missing, type ∈ ("numeric", "real") ? Float64 : type == "integer" ? Int : String}[]) for type in dataTypes]
-        i = 0
-        for line in eachline(io)
-            if line[1:1] != "%"
-                splitline = split(line[2:end-1], ",")
-                splitline == [""] && continue
-                i += 1
-                for entry in splitline
-                    idx_string, val = split(entry)
-                    idx = parse(Int, idx_string) + 1
-                    push!(tmp[idx][1], i)
-                    push!(tmp[idx][2], _parse(dataTypes[idx], val))
-                end
-            end
-        end
-        tmpd = Dict(featureNames[k] => _vec(tmp[k][1], identity.(tmp[k][2]), i)
-                    for k in eachindex(featureNames))
-        inferred = [eltype(tmpd[k]) for k in featureNames]
-        result = CSV.Tables.DictColumnTable(CSV.Tables.Schema(featureNames, inferred),
-                                            tmpd)
-    else
-        result = CSV.File(io;
-                          header = featureNames,
-                          comment = "%",
-                          missingstring = "?",
-                          quotechar = ''',
-                          escapechar = '\\',
-                          kwargs...)
-        inferred = CSV.gettypes(result)
-        result = CSV.Tables.dictcolumntable(result)
-    end
-    if parser != :csv && length(featureNames) > 2000
-        @info "Parser $parser is very slow for more than 2000 features. Returning result of csv parser."
-        parser = :csv
-    end
-    idxs = needs_coercion.(inferred, dataTypes, featureNames, parser == :csv && verbosity > 0)
-    if parser ∈ (:openml, :auto)
-        result = coerce(result, [name => scitype(type, inferred)
-                                 for (name, type, inferred) in
-                                 zip(featureNames[idxs], dataTypes[idxs], inferred[idxs])]...)
-    end
-    if parser == :auto
-        result = coerce(result, autotype(result))
-    end
-    return result
-end
-
 """
     MLJOpenML.load(id; verbosity = 1, parser = :csv, kwargs...)
 
 Load the OpenML dataset with specified `id`, from those listed by
 [`list_datasets`](@ref) or on the [OpenML site](https://www.openml.org/search?type=data).
-If `parser = :csv` the types of the columns are automatically detected by the
-`CSV.read` function. A message is shown, if `verbosity > 0` and the detected
-type does not match the OpenML metadata. If `parser = :openml` the OpenML metadata
-is used to `coerce` the columns to scientific types according to the rules:
-
-| metadata   | inferred type | scientific type |
-|------------|---------------|-----------------|
-| numeric    | <: Real       | Continuous      |
-| numeric    | <: Integer    | Count           |
-| real       | <: Any        | Continuous      |
-| integer    | <: Any        | Count           |
-| string     | <: Any        | Textual         |
-| {ANYTHING} | <: Any        | Multiclass      |
-
-See [here](https://waikato.github.io/weka-wiki/formats_and_processing/arff_developer/)
-for info on the OpenML metadata.
-
-With `parser = :auto`, the `autotype`'s of the output of `parser = :openml` are
-used to coerce the data further.
-
-For data with more than 2000 features (columns) `parser = :csv` is used always,
-because `parser = :openml` can be much slower.
-
-Extra `kwargs` are passed to the CSV parser, `CSV.File(...)`.
 
 Returns a table.
 
@@ -203,7 +63,7 @@ df = DataFrame(table);
 function load(id::Int; verbosity = 1, parser = :csv, kwargs...)
     response = load_Dataset_Description(id)
     arff_file = HTTP.request("GET", response["data_set_description"]["url"])
-    return convert_ARFF_to_columntable(arff_file, verbosity, parser; kwargs...)
+    return ARFFFiles.load(IOBuffer(arff_file.body))
 end
 
```
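
The replacement parser is a one-liner: the ARFF payload fetched over HTTP is handed to `ARFFFiles.load`. A minimal sketch of the new path, mirroring the committed code; the URL below is a placeholder for the one returned by `load_Dataset_Description`:

```julia
# Hedged sketch of the new load path. The URL is hypothetical; in the package
# it comes from the OpenML data set description.
import ARFFFiles
using HTTP

url = "https://example.org/some/dataset.arff"      # placeholder ARFF URL
arff_file = HTTP.request("GET", url)               # HTTP response with ARFF body
table = ARFFFiles.load(IOBuffer(arff_file.body))   # Tables.DictColumnTable, per the README
```

Per the README change above, the return type is now a `Tables.DictColumnTable` rather than a `CSV.File`, and the `DataFrame(table)` conversion shown in the docstring continues to apply.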
