1
1
using HTTP
2
2
using JSON
3
+ using CSV
4
+ import ScientificTypes: Continuous, Count, Textual, Multiclass, coerce
3
5
using Markdown
4
6
5
7
# Base URL of the OpenML v1 JSON API; endpoint paths are appended to this.
const API_URL = "https://www.openml.org/api/v1/json"
@@ -9,10 +11,10 @@ const API_URL = "https://www.openml.org/api/v1/json"
9
11
# https://github.com/openml/OpenML/tree/master/openml_OS/views/pages/api_new/v1/xsd
10
12
# https://www.openml.org/api_docs#!/data/get_data_id
11
13
12
- # To do :
13
- # - Save the file in a local folder
14
- # - Check downloaded files in local folder before downloading it again
15
- # - Use local stored file whenever possible
14
+ # TODO :
15
+ # - Use e.g. DataDeps to cache data locally
16
+ # - Put the ARFF parser to a separate package or use ARFFFiles when
17
+ # https://github.com/cjdoris/ARFFFiles.jl/issues/4 is fixed.
16
18
17
19
"""
18
20
Returns information about a dataset. The information includes the name,
@@ -42,74 +44,163 @@ function load_Dataset_Description(id::Int; api_key::String="")
42
44
return nothing
43
45
end
44
46
47
"""
    _parse(openml, val)

Convert one raw ARFF data token `val` according to the declared OpenML
attribute type `openml`. A literal `"?"` denotes a missing entry; for
the numeric attribute types the token is parsed with `Meta.parse`
(yielding an `Int` or `Float64` as written); any other attribute type
passes the token through unchanged as a string.
"""
function _parse(openml, val)
    if val == "?"
        return missing
    elseif openml in ("real", "numeric", "integer")
        return Meta.parse(val)
    else
        return val
    end
end
52
+
53
"""
    emptyvec(T, length)

Allocate a vector of `length` default entries: empty strings when
`T == String`, zeros of type `T` otherwise.
"""
emptyvec(::Type{String}, length) = fill("", length)
emptyvec(T::Any, length) = zeros(T, length)

"""
    _vec(idxs, vals, length)

Densify a sparse column: return a vector of the given `length` filled
with [`emptyvec`](@ref) defaults, with `vals[k]` written at row
`idxs[k]` for every `k`.
"""
function _vec(idxs, vals::AbstractVector{<:Union{Missing, T}}, length) where T
    dense = emptyvec(T, length)
    for k in eachindex(idxs)
        dense[idxs[k]] = vals[k]
    end
    return dense
end
62
+
63
# Lift a scientific type over an optional `Missing` in the inferred
# element type: if parsing produced missings, the target scientific
# type must also admit `Missing`.
_scitype(scitype, ::DataType) = scitype
_scitype(scitype, ::Type{Union{Missing, T}}) where T = Union{Missing, scitype}

"""
    scitype(openml, inferred)

Map an OpenML attribute declaration `openml` (`"real"`, `"numeric"`,
`"integer"`, `"string"`, or a nominal `"{...}"` specification) together
with the parser-inferred element type `inferred` to the scientific type
the column should be coerced to. `Missing` in `inferred` is propagated
into the result. Throws an error for unrecognized combinations.
"""
function scitype(openml, inferred)
    # Test the Count case before Continuous: `Integer <: Real`, so a
    # "numeric" column with an integer eltype would otherwise match the
    # Continuous branch, contradicting the documented mapping
    # (numeric + Integer -> Count).
    # Also fixes `Union{Missing <: Integer}` (a typo for
    # `Union{Missing, <:Integer}`), which throws a TypeError at runtime
    # because `Missing <: Integer` evaluates to `false`.
    (openml == "integer" || (openml == "numeric" && inferred <: Union{Missing, <:Integer})) && return _scitype(Count, inferred)
    (openml == "real" || (openml == "numeric" && inferred <: Union{Missing, <:Real})) && return _scitype(Continuous, inferred)
    openml == "string" && return _scitype(Textual, inferred)
    openml[1] == '{' && return _scitype(Multiclass, inferred)
    error("Cannot infer the scientific type for OpenML metadata $openml and inferred type $inferred.")
end
72
+
73
"""
    needs_coercion(is, shouldbe, name, verbosity)

Return `true` when the parser-inferred element type `is` of feature
`name` disagrees with the OpenML attribute declaration `shouldbe`.
Nominal `"{...}"` declarations always count as a mismatch (the CSV
parser reads them as plain strings). When `verbosity` is true, log a
hint explaining how to obtain coerced columns.
"""
function needs_coercion(is, shouldbe, name, verbosity)
    mismatch =
        (shouldbe == "numeric" && !(is <: Union{Missing, <:Number})) ||
        (shouldbe == "integer" && !(is <: Union{Missing, <:Integer})) ||
        (shouldbe == "real" && !(is <: Union{Missing, <:Real})) ||
        (shouldbe == "string" && !(is <: Union{Missing, <:AbstractString})) ||
        shouldbe[1] == '{'
    mismatch || return false
    verbosity && @info "Inferred type `$is` does not match the OpenML metadata `$shouldbe` for feature `$name`. Please coerce to the desired type manually, or specify `parser = :openml` or `parser = :auto`. To suppress this message, specify `verbosity = 0`."
    return true
end
85
+
45
86
"""
46
87
Returns a Vector of NamedTuples.
47
88
Receives an `HTTP.Message.response` that has an
48
89
ARFF file format in the `body` of the `Message`.
49
90
"""
50
- function convert_ARFF_to_rowtable (response)
51
- data = String (response. body)
52
- data2 = split (data, " \n " )
53
-
54
- featureNames = String[]
91
+ function convert_ARFF_to_columntable (response, verbosity, parser; kwargs... )
92
+ featureNames = Symbol[]
55
93
dataTypes = String[]
56
- # TODO : make this more performant by anticipating types?
57
- named_tuples = [] # `Any` type here bad
58
- for line in data2
94
+ io = IOBuffer (response. body)
95
+ for line in eachline (io)
59
96
if length (line) > 0
60
97
if line[1 : 1 ] != " %"
61
98
d = []
62
99
if occursin (" @attribute" , lowercase (line))
63
- push! (featureNames, replace (replace (split (line, " " )[2 ], " '" => " " ), " -" => " _" ))
64
- push! (dataTypes, split (line, " " )[3 ])
100
+ splitline = split (line)
101
+ push! (featureNames, Symbol (splitline[2 ]))
102
+ push! (dataTypes, lowercase (join (splitline[3 : end ], " " )))
65
103
elseif occursin (" @relation" , lowercase (line))
66
104
nothing
67
105
elseif occursin (" @data" , lowercase (line))
68
106
# it means the data starts
69
- nothing
70
- else
71
- values = split (line, " ," )
72
- for i in eachindex (featureNames)
73
- if lowercase (dataTypes[i]) in [" real" ," numeric" ]
74
- push! (d, featureNames[i] => Meta. parse (values[i]))
75
- else
76
- # all the rest will be considered as String
77
- push! (d, featureNames[i] => values[i])
78
- end
79
- end
80
- push! (named_tuples, (; (Symbol (k) => v for (k,v) in d). .. ))
107
+ break
81
108
end
82
109
end
83
110
end
84
111
end
85
- return identity .(named_tuples) # not performant; see above
112
+ while io. data[io. ptr] ∈ (0x0a , 0x25 ) # skip empty new lines and comments
113
+ readline (io)
114
+ end
115
+ if io. data[io. ptr] == 0x7b # sparse ARFF file
116
+ tmp = [(Int[], Union{Missing, type ∈ (" numeric" , " real" ) ? Float64 : type == " integer" ? Int : String}[]) for type in dataTypes]
117
+ i = 0
118
+ for line in eachline (io)
119
+ if line[1 : 1 ] != " %"
120
+ splitline = split (line[2 : end - 1 ], " ," )
121
+ splitline == [" " ] && continue
122
+ i += 1
123
+ for entry in splitline
124
+ idx_string, val = split (entry)
125
+ idx = parse (Int, idx_string) + 1
126
+ push! (tmp[idx][1 ], i)
127
+ push! (tmp[idx][2 ], _parse (dataTypes[idx], val))
128
+ end
129
+ end
130
+ end
131
+ tmpd = Dict (featureNames[k] => _vec (tmp[k][1 ], identity .(tmp[k][2 ]), i)
132
+ for k in eachindex (featureNames))
133
+ inferred = [eltype (tmpd[k]) for k in featureNames]
134
+ result = CSV. Tables. DictColumnTable (CSV. Tables. Schema (featureNames, inferred),
135
+ tmpd)
136
+ else
137
+ result = CSV. File (io;
138
+ header = featureNames,
139
+ comment = " %" ,
140
+ missingstring = " ?" ,
141
+ quotechar = ' '' ,
142
+ escapechar = ' \\ ' ,
143
+ kwargs... )
144
+ inferred = CSV. gettypes (result)
145
+ result = CSV. Tables. dictcolumntable (result)
146
+ end
147
+ if parser != :csv && length (featureNames) > 2000
148
+ @info " Parser $parser is very slow for more than 2000 features. Returning result of csv parser."
149
+ parser = :csv
150
+ end
151
+ idxs = needs_coercion .(inferred, dataTypes, featureNames, parser == :csv && verbosity > 0 )
152
+ if parser ∈ (:openml , :auto )
153
+ result = coerce (result, [name => scitype (type, inferred)
154
+ for (name, type, inferred) in
155
+ zip (featureNames[idxs], dataTypes[idxs], inferred[idxs])]. .. )
156
+ end
157
+ if parser == :auto
158
+ result = coerce (result, autotype (result))
159
+ end
160
+ return result
86
161
end
87
162
88
163
"""
89
- MLJOpenML.load(id)
164
+ MLJOpenML.load(id; verbosity = 1, parser = :csv, kwargs...)
165
+
166
+ Load the OpenML dataset with specified `id`, from those listed by
167
+ [`list_datasets`](@ref) or on the [OpenML site](https://www.openml.org/search?type=data).
168
+ If `parser = :csv` the types of the columns are automatically detected by the
169
+ `CSV.read` function. A message is shown, if `verbosity > 0` and the detected
170
+ type does not match the OpenML metadata. If `parser = :openml` the OpenML metadata
171
+ is used to `coerce` the columns to scientific types according to the rules:
172
+ | metadata | inferred type | scientific type |
173
+ |----------|---------------|-----------------|
174
+ |numeric | <: Real | Continuous |
175
+ |numeric | <: Integer | Count |
176
+ |real | <: Any | Continuous |
177
+ |integer | <: Any | Count |
178
+ |string | <: Any | Textual |
179
+ |{ANYTHING}| <: Any | Multiclass |
180
+
181
+ See [here](https://waikato.github.io/weka-wiki/formats_and_processing/arff_developer/)
182
+ for info on the OpenML metadata.
183
+
184
+ With `parser = :auto`, the `autotype`'s of the output of `parser = :openml` are
185
+ used to coerce the data further.
186
+
187
+ For data with more than 2000 features (columns) `parser = :csv` is used always,
188
+ because `parser = :openml` can be much slower.
189
+
190
+ Returns a table.
90
191
91
- Load the OpenML dataset with specified `id`, from those listed on the
92
- [OpenML site](https://www.openml.org/search?type=data).
93
-
94
- Returns a "row table", i.e., a `Vector` of identically typed
95
- `NamedTuple`s. A row table is compatible with the
96
- [Tables.jl](https://github.com/JuliaData/Tables.jl) interface and can
97
- therefore be readily converted to other compatible formats. For
98
- example:
192
+ # Examples
99
193
100
194
```julia
101
195
using DataFrames
102
- rowtable = MLJOpenML.load(61);
103
- df = DataFrame(rowtable);
104
-
105
- using MLJ
106
- df2 = coerce(df, :class=>Multiclass)
196
+ table = MLJOpenML.load(61);
197
+ df = DataFrame(table);
107
198
```
108
199
"""
109
- function load (id:: Int )
200
+ function load (id:: Int ; verbosity = 1 , parser = :csv , kwargs ... )
110
201
response = load_Dataset_Description (id)
111
202
arff_file = HTTP. request (" GET" , response[" data_set_description" ][" url" ])
112
- return convert_ARFF_to_rowtable (arff_file)
203
+ return convert_ARFF_to_columntable (arff_file, verbosity, parser; kwargs ... )
113
204
end
114
205
115
206
0 commit comments