Skip to content

Commit ebba5f9

Browse files
authored
Merge pull request #9 from jbrea/dev
faster parsing of arff
2 parents 2526384 + 700ddc7 commit ebba5f9

File tree

3 files changed

+139
-45
lines changed

3 files changed

+139
-45
lines changed

Project.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,14 @@ authors = ["Anthony D. Blaom <[email protected]>"]
44
version = "1.0.0"
55

66
[deps]
7+
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
78
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
89
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
910
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
11+
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
1012

1113
[compat]
14+
CSV = "0.8"
1215
HTTP = "^0.8, 0.9"
1316
JSON = "^0.21"
1417
julia = "1"

src/openml.jl

Lines changed: 134 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
using HTTP
22
using JSON
3+
using CSV
4+
import ScientificTypes: Continuous, Count, Textual, Multiclass, coerce
35
using Markdown
46

57
const API_URL = "https://www.openml.org/api/v1/json"
@@ -9,10 +11,10 @@ const API_URL = "https://www.openml.org/api/v1/json"
911
# https://github.com/openml/OpenML/tree/master/openml_OS/views/pages/api_new/v1/xsd
1012
# https://www.openml.org/api_docs#!/data/get_data_id
1113

12-
# To do:
13-
# - Save the file in a local folder
14-
# - Check downloaded files in local folder before downloading it again
15-
# - Use local stored file whenever possible
14+
# TODO:
15+
# - Use e.g. DataDeps to cache data locally
16+
# - Put the ARFF parser to a separate package or use ARFFFiles when
17+
# https://github.com/cjdoris/ARFFFiles.jl/issues/4 is fixed.
1618

1719
"""
1820
Returns information about a dataset. The information includes the name,
@@ -42,74 +44,163 @@ function load_Dataset_Description(id::Int; api_key::String="")
4244
return nothing
4345
end
4446

47+
"""
    _parse(openml, val)

Convert the raw ARFF token `val` according to the OpenML attribute type `openml`.

- The token `"?"` yields `missing`.
- For the types `"real"`, `"numeric"` and `"integer"` the token is parsed as a
  number: an `Int` when it is an integer literal, otherwise a `Float64`.
- For any other type `val` is returned unchanged.

Throws an `ArgumentError` if a numeric attribute holds a token that is not a number.
"""
function _parse(openml, val)
    val == "?" && return missing
    openml in ("real", "numeric", "integer") || return val
    # Use `tryparse` rather than `Meta.parse`: the latter turns data tokens such
    # as `NaN` or `a+b` into Symbols/expressions instead of numbers.
    number = tryparse(Int, val)
    number === nothing || return number
    number = tryparse(Float64, val)
    number === nothing &&
        throw(ArgumentError("Cannot parse `$val` as a number (OpenML type `$openml`)."))
    return number
end
52+
53+
# Dense vector with `length` default entries: "" for strings, `zero(T)` otherwise.
emptyvec(::Type{String}, length) = fill("", length)
emptyvec(T::Any, length) = zeros(T, length)

"""
    _vec(idxs, vals, length)

Expand a sparsely stored column into a dense vector with `length` entries.
`vals[k]` is written to position `idxs[k]`; all other positions keep the default
produced by `emptyvec`. The result has the element type of `vals`, so `missing`
entries in `vals` are preserved.
"""
function _vec(idxs, vals::AbstractVector{<:Union{Missing, T}}, length) where T
    # Widen to `eltype(vals)`: a plain `Vector{T}` cannot store `missing`, so
    # assigning a missing value below would otherwise throw.
    result = convert(Vector{eltype(vals)}, emptyvec(T, length))
    for k in eachindex(idxs)
        result[idxs[k]] = vals[k]
    end
    return result
end
62+
63+
# Return `scitype` as is for a plain element type, and `Union{Missing, scitype}`
# when the inferred element type is a `Union{Missing, T}`.
_scitype(scitype, ::DataType) = scitype
_scitype(scitype, ::Type{Union{Missing, T}}) where T = Union{Missing, scitype}

"""
    scitype(openml, inferred)

Return the scientific type to coerce a column to, given its OpenML attribute
type `openml` (a string) and the element type `inferred` found by the parser.

| `openml`             | `inferred`  | result       |
|----------------------|-------------|--------------|
| `"integer"`          | any         | `Count`      |
| `"numeric"`          | `<:Integer` | `Count`      |
| `"real"`             | any         | `Continuous` |
| `"numeric"`          | `<:Real`    | `Continuous` |
| `"string"`           | any         | `Textual`    |
| starts with `{`      | any         | `Multiclass` |

The result is wrapped in `Union{Missing, ...}` when `inferred` allows `missing`.
Throws an error for any other combination.
"""
function scitype(openml, inferred)
    # The Integer test must precede the Real test: every Integer is also a Real,
    # so the opposite order would never map a numeric integer column to `Count`.
    if openml == "integer" ||
       (openml == "numeric" && inferred <: Union{Missing, <:Integer})
        return _scitype(Count, inferred)
    elseif openml == "real" ||
           (openml == "numeric" && inferred <: Union{Missing, <:Real})
        return _scitype(Continuous, inferred)
    elseif openml == "string"
        return _scitype(Textual, inferred)
    elseif startswith(openml, "{")  # nominal attribute; also safe for empty strings
        return _scitype(Multiclass, inferred)
    end
    error("Cannot infer the scientific type for OpenML metadata $openml and inferred type $inferred.")
end
72+
73+
"""
    needs_coercion(is, shouldbe, name, verbosity)

Return `true` if the inferred element type `is` of feature `name` does not agree
with the OpenML attribute type `shouldbe`, and `false` otherwise. Attribute types
starting with `{` always count as a mismatch. When a mismatch is found and the
Boolean `verbosity` is `true`, an informational message is logged.
"""
function needs_coercion(is, shouldbe, name, verbosity)
    mismatch = if shouldbe == "numeric"
        !(is <: Union{Missing, <:Number})
    elseif shouldbe == "integer"
        !(is <: Union{Missing, <:Integer})
    elseif shouldbe == "real"
        !(is <: Union{Missing, <:Real})
    elseif shouldbe == "string"
        !(is <: Union{Missing, <:AbstractString})
    else
        shouldbe[1] == '{'
    end
    mismatch || return false
    if verbosity
        @info "Inferred type `$is` does not match the OpenML metadata `$shouldbe` for feature `$name`. Please coerce to the desired type manually, or specify `parser = :openml` or `parser = :auto`. To suppress this message, specify `verbosity = 0`."
    end
    return true
end
85+
4586
"""
4687
Returns a column table (one column per feature).
4788
Receives an `HTTP.Message.response` that has an
4889
ARFF file format in the `body` of the `Message`.
4990
"""
50-
function convert_ARFF_to_rowtable(response)
51-
data = String(response.body)
52-
data2 = split(data, "\n")
53-
54-
featureNames = String[]
91+
function convert_ARFF_to_columntable(response, verbosity, parser; kwargs...)
92+
featureNames = Symbol[]
5593
dataTypes = String[]
56-
# TODO: make this more performant by anticipating types?
57-
named_tuples = [] # `Any` type here bad
58-
for line in data2
94+
io = IOBuffer(response.body)
95+
for line in eachline(io)
5996
if length(line) > 0
6097
if line[1:1] != "%"
6198
d = []
6299
if occursin("@attribute", lowercase(line))
63-
push!(featureNames, replace(replace(split(line, " ")[2], "'" => ""), "-" => "_"))
64-
push!(dataTypes, split(line, " ")[3])
100+
splitline = split(line)
101+
push!(featureNames, Symbol(splitline[2]))
102+
push!(dataTypes, lowercase(join(splitline[3:end], "")))
65103
elseif occursin("@relation", lowercase(line))
66104
nothing
67105
elseif occursin("@data", lowercase(line))
68106
# it means the data starts
69-
nothing
70-
else
71-
values = split(line, ",")
72-
for i in eachindex(featureNames)
73-
if lowercase(dataTypes[i]) in ["real","numeric"]
74-
push!(d, featureNames[i] => Meta.parse(values[i]))
75-
else
76-
# all the rest will be considered as String
77-
push!(d, featureNames[i] => values[i])
78-
end
79-
end
80-
push!(named_tuples, (; (Symbol(k) => v for (k,v) in d)...))
107+
break
81108
end
82109
end
83110
end
84111
end
85-
return identity.(named_tuples) # not performant; see above
112+
while io.data[io.ptr] ∈ (0x0a, 0x25) # skip empty new lines and comments
113+
readline(io)
114+
end
115+
if io.data[io.ptr] == 0x7b # sparse ARFF file
116+
tmp = [(Int[], Union{Missing, type ∈ ("numeric", "real") ? Float64 : type == "integer" ? Int : String}[]) for type in dataTypes]
117+
i = 0
118+
for line in eachline(io)
119+
if line[1:1] != "%"
120+
splitline = split(line[2:end-1], ",")
121+
splitline == [""] && continue
122+
i += 1
123+
for entry in splitline
124+
idx_string, val = split(entry)
125+
idx = parse(Int, idx_string) + 1
126+
push!(tmp[idx][1], i)
127+
push!(tmp[idx][2], _parse(dataTypes[idx], val))
128+
end
129+
end
130+
end
131+
tmpd = Dict(featureNames[k] => _vec(tmp[k][1], identity.(tmp[k][2]), i)
132+
for k in eachindex(featureNames))
133+
inferred = [eltype(tmpd[k]) for k in featureNames]
134+
result = CSV.Tables.DictColumnTable(CSV.Tables.Schema(featureNames, inferred),
135+
tmpd)
136+
else
137+
result = CSV.File(io;
138+
header = featureNames,
139+
comment = "%",
140+
missingstring = "?",
141+
quotechar = ''',
142+
escapechar = '\\',
143+
kwargs...)
144+
inferred = CSV.gettypes(result)
145+
result = CSV.Tables.dictcolumntable(result)
146+
end
147+
if parser != :csv && length(featureNames) > 2000
148+
@info "Parser $parser is very slow for more than 2000 features. Returning result of csv parser."
149+
parser = :csv
150+
end
151+
idxs = needs_coercion.(inferred, dataTypes, featureNames, parser == :csv && verbosity > 0)
152+
if parser ∈ (:openml, :auto)
153+
result = coerce(result, [name => scitype(type, inferred)
154+
for (name, type, inferred) in
155+
zip(featureNames[idxs], dataTypes[idxs], inferred[idxs])]...)
156+
end
157+
if parser == :auto
158+
result = coerce(result, autotype(result))
159+
end
160+
return result
86161
end
87162

88163
"""
89-
MLJOpenML.load(id)
164+
MLJOpenML.load(id; verbosity = 1, parser = :csv, kwargs...)
165+
166+
Load the OpenML dataset with specified `id`, from those listed by
167+
[`list_datasets`](@ref) or on the [OpenML site](https://www.openml.org/search?type=data).
168+
If `parser = :csv` the types of the columns are automatically detected by the
169+
`CSV.File` constructor. A message is shown if `verbosity > 0` and the detected
170+
type does not match the OpenML metadata. If `parser = :openml` the OpenML metadata
171+
is used to `coerce` the columns to scientific types according to the rules:
172+
| metadata | inferred type | scientific type |
173+
|----------|---------------|-----------------|
174+
|numeric | <: Real | Continuous |
175+
|numeric | <: Integer | Count |
176+
|real | <: Any | Continuous |
177+
|integer | <: Any | Count |
178+
|string | <: Any | Textual |
179+
|{ANYTHING}| <: Any | Multiclass |
180+
181+
See [here](https://waikato.github.io/weka-wiki/formats_and_processing/arff_developer/)
182+
for info on the OpenML metadata.
183+
184+
With `parser = :auto`, the `autotype`'s of the output of `parser = :openml` are
185+
used to coerce the data further.
186+
187+
For data with more than 2000 features (columns) `parser = :csv` is used always,
188+
because `parser = :openml` can be much slower.
189+
190+
Returns a table.
90191
91-
Load the OpenML dataset with specified `id`, from those listed on the
92-
[OpenML site](https://www.openml.org/search?type=data).
93-
94-
Returns a "row table", i.e., a `Vector` of identically typed
95-
`NamedTuple`s. A row table is compatible with the
96-
[Tables.jl](https://github.com/JuliaData/Tables.jl) interface and can
97-
therefore be readily converted to other compatible formats. For
98-
example:
192+
# Examples
99193
100194
```julia
101195
using DataFrames
102-
rowtable = MLJOpenML.load(61);
103-
df = DataFrame(rowtable);
104-
105-
using MLJ
106-
df2 = coerce(df, :class=>Multiclass)
196+
table = MLJOpenML.load(61);
197+
df = DataFrame(table);
107198
```
108199
"""
109-
function load(id::Int)
200+
function load(id::Int; verbosity = 1, parser = :csv, kwargs...)
110201
response = load_Dataset_Description(id)
111202
arff_file = HTTP.request("GET", response["data_set_description"]["url"])
112-
return convert_ARFF_to_rowtable(arff_file)
203+
return convert_ARFF_to_columntable(arff_file, verbosity, parser; kwargs...)
113204
end
114205

115206

test/openml.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ end
2323

2424
@testset "ARFF file conversion to NamedTuples" begin
2525
@test isempty(ntp_test) == false
26-
@test length(ntp_test) == 150
27-
@test length(ntp_test[1]) == 5
26+
@test length(ntp_test[1]) == 150
27+
@test length(ntp_test) == 5
2828
end
2929

3030
@testset "data api functions" begin

0 commit comments

Comments
 (0)