Commit 23d2f2d

use ARFFFiles

1 parent: 138f28a

3 files changed: +5, -145 lines
Project.toml

Lines changed: 2 additions & 2 deletions

```diff
@@ -4,17 +4,17 @@ authors = ["Anthony D. Blaom <[email protected]>"]
 version = "1.1.0"
 
 [deps]
-CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8"
 HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
 ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
 
 [compat]
-CSV = "0.8"
 HTTP = "0.8, 0.9"
 JSON = "0.21"
 ScientificTypes = "2"
+ARFFFiles = "1.3"
 julia = "1"
 
 [extras]
```
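
Not part of this commit: a rough sketch of how the dependency swap above could be reproduced with Pkg in a local checkout. The package names are taken from the diff; the `[compat]` bound (`ARFFFiles = "1.3"`) is still edited by hand in Project.toml.

```julia
# Hedged sketch (not part of the commit): swap the CSV dependency for
# ARFFFiles in the activated project environment.
using Pkg
Pkg.activate(".")        # activate the MLJOpenML project
Pkg.rm("CSV")            # drop the old dependency
Pkg.add("ARFFFiles")     # add the new one; then set ARFFFiles = "1.3" under [compat]
```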

README.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -24,7 +24,7 @@ Load the iris data set from OpenML:
 
 ```julia
 using MLJOpenML
-table = MLJOpenML.load(61) # a CSV.File object
+table = MLJOpenML.load(61) # a Tables.DictColumnTable
 ```
 
 Convert to a `DataFrame`:
````
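
The conversion step the README goes on to show is outside this hunk, but the unchanged docstring context in the source diff below (`df = DataFrame(table);`) suggests it remains the usual one-liner. A minimal sketch, assuming DataFrames.jl is available:

```julia
# The loaded table is Tables.jl-compatible, so it converts directly.
using DataFrames
df = DataFrame(table)
```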

src/openml.jl

Lines changed: 2 additions & 142 deletions

```diff
@@ -1,6 +1,6 @@
 using HTTP
 using JSON
-using CSV
+import ARFFFiles
 import ScientificTypes: Continuous, Count, Textual, Multiclass, coerce, autotype
 using Markdown
 
@@ -44,151 +44,11 @@ function load_Dataset_Description(id::Int; api_key::String="")
     return nothing
 end
 
-function _parse(openml, val)
-    val == "?" && return missing
-    openml ∈ ("real", "numeric", "integer") && return Meta.parse(val)
-    return val
-end
-
-emptyvec(::Type{String}, length) = fill("", length)
-emptyvec(T::Any, length) = zeros(T, length)
-function _vec(idxs, vals::AbstractVector{<:Union{Missing, T}}, length) where T
-    result = emptyvec(T, length)
-    for k in eachindex(idxs)
-        result[idxs[k]] = vals[k]
-    end
-    result
-end
-
-_scitype(scitype, ::DataType) = scitype
-_scitype(scitype, ::Type{Union{Missing, T}}) where T = Union{Missing, scitype}
-function scitype(openml, inferred)
-    (openml == "real" || (openml == "numeric" && inferred <: Union{Missing, <:Real})) && return _scitype(Continuous, inferred)
-    (openml == "integer" || (openml == "numeric" && inferred <: Union{Missing, <:Integer})) && return _scitype(Count, inferred)
-    openml == "string" && return _scitype(Textual, inferred)
-    openml[1] == '{' && return _scitype(Multiclass, inferred)
-    error("Cannot infer the scientific type for OpenML metadata $openml and inferred type $inferred.")
-end
-
-function needs_coercion(is, shouldbe, name, verbosity)
-    if (shouldbe == "numeric" && !(is <: Union{Missing, <:Number})) ||
-       (shouldbe == "integer" && !(is <: Union{Missing, <:Integer})) ||
-       (shouldbe == "real" && !(is <: Union{Missing, <:Real})) ||
-       (shouldbe == "string" && !(is <: Union{Missing, <:AbstractString})) ||
-       shouldbe[1] == '{'
-        verbosity && @info "Inferred type `$is` does not match the OpenML metadata `$shouldbe` for feature `$name`. Please coerce to the desired type manually, or specify `parser = :openml` or `parser = :auto`. To suppress this message, specify `verbosity = 0`."
-        true
-    else
-        false
-    end
-end
-
-"""
-Returns a Vector of NamedTuples.
-Receives an `HTTP.Message.response` that has an
-ARFF file format in the `body` of the `Message`.
-"""
-function convert_ARFF_to_columntable(response, verbosity, parser; kwargs...)
-    featureNames = Symbol[]
-    dataTypes = String[]
-    io = IOBuffer(response.body)
-    for line in eachline(io)
-        if length(line) > 0
-            if line[1:1] != "%"
-                d = []
-                if occursin("@attribute", lowercase(line))
-                    splitline = split(line)
-                    push!(featureNames, Symbol(splitline[2]))
-                    push!(dataTypes, lowercase(join(splitline[3:end], "")))
-                elseif occursin("@relation", lowercase(line))
-                    nothing
-                elseif occursin("@data", lowercase(line))
-                    # it means the data starts
-                    break
-                end
-            end
-        end
-    end
-    while io.data[io.ptr] ∈ (0x0a, 0x25) # skip empty new lines and comments
-        readline(io)
-    end
-    if io.data[io.ptr] == 0x7b # sparse ARFF file
-        tmp = [(Int[], Union{Missing, type ∈ ("numeric", "real") ? Float64 : type == "integer" ? Int : String}[]) for type in dataTypes]
-        i = 0
-        for line in eachline(io)
-            if line[1:1] != "%"
-                splitline = split(line[2:end-1], ",")
-                splitline == [""] && continue
-                i += 1
-                for entry in splitline
-                    idx_string, val = split(entry)
-                    idx = parse(Int, idx_string) + 1
-                    push!(tmp[idx][1], i)
-                    push!(tmp[idx][2], _parse(dataTypes[idx], val))
-                end
-            end
-        end
-        tmpd = Dict(featureNames[k] => _vec(tmp[k][1], identity.(tmp[k][2]), i)
-                    for k in eachindex(featureNames))
-        inferred = [eltype(tmpd[k]) for k in featureNames]
-        result = CSV.Tables.DictColumnTable(CSV.Tables.Schema(featureNames, inferred),
-                                            tmpd)
-    else
-        result = CSV.File(io;
-                          header = featureNames,
-                          comment = "%",
-                          missingstring = "?",
-                          quotechar = ''',
-                          escapechar = '\\',
-                          kwargs...)
-        inferred = CSV.gettypes(result)
-        result = CSV.Tables.dictcolumntable(result)
-    end
-    if parser != :csv && length(featureNames) > 2000
-        @info "Parser $parser is very slow for more than 2000 features. Returning result of csv parser."
-        parser = :csv
-    end
-    idxs = needs_coercion.(inferred, dataTypes, featureNames, parser == :csv && verbosity > 0)
-    if parser ∈ (:openml, :auto)
-        result = coerce(result, [name => scitype(type, inferred)
-                                 for (name, type, inferred) in
-                                 zip(featureNames[idxs], dataTypes[idxs], inferred[idxs])]...)
-    end
-    if parser == :auto
-        result = coerce(result, autotype(result))
-    end
-    return result
-end
-
 """
     MLJOpenML.load(id; verbosity = 1, parser = :csv, kwargs...)
 
 Load the OpenML dataset with specified `id`, from those listed by
 [`list_datasets`](@ref) or on the [OpenML site](https://www.openml.org/search?type=data).
-If `parser = :csv` the types of the columns are automatically detected by the
-`CSV.read` function. A message is shown, if `verbosity > 0` and the detected
-type does not match the OpenML metadata. If `parser = :openml` the OpenML metadata
-is used to `coerce` the columns to scientific types according to the rules:
-
-| metadata   | inferred type | scientific type |
-|------------|---------------|-----------------|
-| numeric    | <: Real       | Continuous      |
-| numeric    | <: Integer    | Count           |
-| real       | <: Any        | Continuous      |
-| integer    | <: Any        | Count           |
-| string     | <: Any        | Textual         |
-| {ANYTHING} | <: Any        | Multiclass      |
-
-See [here](https://waikato.github.io/weka-wiki/formats_and_processing/arff_developer/)
-for info on the OpenML metadata.
-
-With `parser = :auto`, the `autotype`'s of the output of `parser = :openml` are
-used to coerce the data further.
-
-For data with more than 2000 features (columns) `parser = :csv` is used always,
-because `parser = :openml` can be much slower.
-
-Extra `kwargs` are passed to the CSV parser, `CSV.File(...)`.
 
 Returns a table.
 
@@ -203,7 +63,7 @@ df = DataFrame(table);
 function load(id::Int; verbosity = 1, parser = :csv, kwargs...)
     response = load_Dataset_Description(id)
     arff_file = HTTP.request("GET", response["data_set_description"]["url"])
-    return convert_ARFF_to_columntable(arff_file, verbosity, parser; kwargs...)
+    return ARFFFiles.load(IOBuffer(arff_file.body))
 end
 
```
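
The replacement parser is a one-liner: the ARFF payload fetched over HTTP is handed to `ARFFFiles.load`. A minimal sketch of the new path, mirroring the committed code; the URL below is a placeholder for the one returned by `load_Dataset_Description`:

```julia
# Hedged sketch of the new load path. The URL is hypothetical; in the package
# it comes from the OpenML data set description.
import ARFFFiles
using HTTP

url = "https://example.org/some/dataset.arff"      # placeholder ARFF URL
arff_file = HTTP.request("GET", url)               # HTTP response with ARFF body
table = ARFFFiles.load(IOBuffer(arff_file.body))   # Tables.DictColumnTable, per the README
```

Per the README change above, the return type is now a `Tables.DictColumnTable` rather than a `CSV.File`, and the `DataFrame(table)` conversion shown in the docstring continues to apply.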
