1
1
using HTTP
2
2
using JSON
3
- using CSV
3
+ import ARFFFiles
4
4
import ScientificTypes: Continuous, Count, Textual, Multiclass, coerce, autotype
5
5
using Markdown
6
6
@@ -44,151 +44,11 @@ function load_Dataset_Description(id::Int; api_key::String="")
44
44
return nothing
45
45
end
46
46
47
"""
    _parse(openml, val)

Convert the raw ARFF token `val` according to the OpenML attribute type `openml`.

- `"?"` (the ARFF missing-value marker) becomes `missing`.
- For `"real"`, `"numeric"` and `"integer"` attributes the token is parsed as a
  number: an `Int` when it is an integer literal, otherwise a `Float64`.
- For any other attribute type, `val` is returned unchanged.

Throws an `ArgumentError` if a numeric attribute holds a token that is not a number.
"""
function _parse(openml, val)
    val == "?" && return missing
    openml ∈ ("real", "numeric", "integer") || return val
    # Use `tryparse` rather than `Meta.parse`: the token comes from a downloaded
    # file and must be read as data, never as Julia syntax (`Meta.parse` would
    # turn e.g. `NaN` into the Symbol `:NaN`).
    asint = tryparse(Int, val)
    asint === nothing || return asint
    asfloat = tryparse(Float64, val)
    asfloat === nothing &&
        throw(ArgumentError("Cannot parse `$val` as a number for OpenML type `$openml`."))
    return asfloat
end
52
-
53
# Default fill for positions that a sparse ARFF row does not mention:
# the empty string for `String` columns, numeric zero for everything else.
emptyvec(::Type{String}, length) = fill("", length)
emptyvec(T::Any, length) = zeros(T, length)

"""
    _vec(idxs, vals, length)

Build a dense vector of the given `length` from sparse data: position `idxs[k]`
receives `vals[k]`, and every other position holds the default from `emptyvec`.

The result has element type `T` (the non-missing element type of `vals`) unless
`vals` actually contains a `missing`, in which case it is `Union{Missing, T}`.
"""
function _vec(idxs, vals::AbstractVector{<:Union{Missing, T}}, length) where T
    # A plain `Vector{T}` rejects `missing` on assignment, so widen the element
    # type, but only when a missing value is really present, so that columns
    # without missings keep their concrete element type.
    ElT = any(ismissing, vals) ? Union{Missing, T} : T
    result = convert(Vector{ElT}, emptyvec(T, length))
    for k in eachindex(idxs)
        result[idxs[k]] = vals[k]
    end
    return result
end
62
-
63
# Wrap `scitype` in `Union{Missing, ...}` exactly when the inferred element type
# is itself of the form `Union{Missing, T}`.
_scitype(scitype, ::DataType) = scitype
_scitype(scitype, ::Type{Union{Missing, T}}) where T = Union{Missing, scitype}

"""
    scitype(openml, inferred)

Return the scientific type to coerce a column to, given its OpenML attribute
type `openml` (a lowercase string) and the element type `inferred` by the parser.

| `openml`      | `inferred`   | result       |
|---------------|--------------|--------------|
| `"integer"`   | any          | `Count`      |
| `"numeric"`   | `<: Integer` | `Count`      |
| `"real"`      | any          | `Continuous` |
| `"numeric"`   | `<: Real`    | `Continuous` |
| `"string"`    | any          | `Textual`    |
| `"{...}"`     | any          | `Multiclass` |

The result is wrapped in `Union{Missing, ...}` when `inferred` allows `missing`.
Throws an `ErrorException` when no rule applies.
"""
function scitype(openml, inferred)
    numeric = openml == "numeric"
    # The Integer rule must be tested before the Real rule: `Integer <: Real`,
    # so the other order would never yield `Count` for a "numeric" column.
    if openml == "integer" || (numeric && inferred <: Union{Missing, <:Integer})
        return _scitype(Count, inferred)
    elseif openml == "real" || (numeric && inferred <: Union{Missing, <:Real})
        return _scitype(Continuous, inferred)
    elseif openml == "string"
        return _scitype(Textual, inferred)
    elseif startswith(openml, '{')  # nominal attribute; also safe for ""
        return _scitype(Multiclass, inferred)
    end
    error("Cannot infer the scientific type for OpenML metadata $openml and inferred type $inferred.")
end
72
-
73
"""
    needs_coercion(is, shouldbe, name, verbosity)

Return `true` when the element type `is` inferred for feature `name` does not
match the OpenML attribute type `shouldbe`, and `false` otherwise.

Nominal attributes (metadata starting with `{`) always need coercion. When
`verbosity` is `true`, an `@info` message is logged for each mismatch.
"""
function needs_coercion(is, shouldbe, name, verbosity)
    mismatch =
        (shouldbe == "numeric" && !(is <: Union{Missing, <:Number})) ||
        (shouldbe == "integer" && !(is <: Union{Missing, <:Integer})) ||
        (shouldbe == "real" && !(is <: Union{Missing, <:Real})) ||
        (shouldbe == "string" && !(is <: Union{Missing, <:AbstractString})) ||
        startswith(shouldbe, '{')  # `startswith` is safe for an empty string
    mismatch || return false
    verbosity && @info "Inferred type `$is` does not match the OpenML metadata `$shouldbe` for feature `$name`. Please coerce to the desired type manually, or specify `parser = :openml` or `parser = :auto`. To suppress this message, specify `verbosity = 0`."
    return true
end
85
-
86
"""
    convert_ARFF_to_columntable(response, verbosity, parser; kwargs...)

Parse the ARFF document held in `response.body` and return it as a column table.

The `@attribute` header lines supply the feature names and the OpenML attribute
types. The data section is then read either as sparse ARFF (rows of the form
`{index value, ...}`) or, otherwise, with `CSV.File`, to which `kwargs` are
forwarded.

- `parser == :csv`: the parsed table is returned as is; if `verbosity > 0`, a
  message is logged for every column whose inferred type disagrees with the
  OpenML metadata.
- `parser == :openml`: mismatching columns are coerced to the scientific type
  given by `scitype`.
- `parser == :auto`: as `:openml`, followed by a coercion to `autotype(result)`.

With more than 2000 features, `parser` falls back to `:csv`.
"""
function convert_ARFF_to_columntable(response, verbosity, parser; kwargs...)
    featureNames = Symbol[]
    dataTypes = String[]
    io = IOBuffer(response.body)

    # Header: collect attribute names and types until the `@data` marker.
    for line in eachline(io)
        (isempty(line) || startswith(line, '%')) && continue
        lowered = lowercase(line)
        if occursin("@attribute", lowered)
            splitline = split(line)
            push!(featureNames, Symbol(splitline[2]))
            push!(dataTypes, lowercase(join(splitline[3:end], "")))
        elseif occursin("@relation", lowered)
            nothing  # the relation name is not used
        elseif occursin("@data", lowered)
            break  # the data section starts on the next line
        end
    end

    # Skip blank lines (LF or CRLF) and comment lines that precede the data.
    while !eof(io) && Base.peek(io) ∈ (0x0a, 0x0d, 0x25)
        readline(io)
    end

    if !eof(io) && Base.peek(io) == 0x7b  # '{' opens a sparse ARFF row
        # One (row indices, values) pair per feature.
        tmp = [(Int[],
                Union{Missing,
                      type ∈ ("numeric", "real") ? Float64 :
                      type == "integer" ? Int : String}[])
               for type in dataTypes]
        i = 0
        for line in eachline(io)
            (isempty(line) || startswith(line, '%')) && continue
            splitline = split(line[2:end-1], ",")  # strip the enclosing braces
            # NOTE(review): a row written as `{}` is skipped here and does not
            # count as an instance; confirm that this is intended.
            splitline == [""] && continue
            i += 1
            for entry in splitline
                idx_string, val = split(entry)
                idx = parse(Int, idx_string) + 1  # ARFF indices are 0-based
                push!(tmp[idx][1], i)
                push!(tmp[idx][2], _parse(dataTypes[idx], val))
            end
        end
        # `identity.(...)` narrows each value vector to its tightest element type.
        tmpd = Dict(featureNames[k] => _vec(tmp[k][1], identity.(tmp[k][2]), i)
                    for k in eachindex(featureNames))
        inferred = [eltype(tmpd[k]) for k in featureNames]
        result = CSV.Tables.DictColumnTable(CSV.Tables.Schema(featureNames, inferred),
                                            tmpd)
    else
        result = CSV.File(io;
                          header = featureNames,
                          comment = "%",
                          missingstring = "?",
                          quotechar = '\'',
                          escapechar = '\\',
                          kwargs...)
        inferred = CSV.gettypes(result)
        result = CSV.Tables.dictcolumntable(result)
    end

    if parser != :csv && length(featureNames) > 2000
        @info "Parser $parser is very slow for more than 2000 features. Returning result of csv parser."
        parser = :csv
    end

    # Mismatches are only reported when the caller gets the raw csv result.
    idxs = needs_coercion.(inferred, dataTypes, featureNames,
                           parser == :csv && verbosity > 0)
    if parser ∈ (:openml, :auto)
        result = coerce(result, [name => scitype(type, T)
                                 for (name, type, T) in
                                 zip(featureNames[idxs], dataTypes[idxs], inferred[idxs])]...)
    end
    if parser == :auto
        result = coerce(result, autotype(result))
    end
    return result
end
162
-
163
47
"""
164
48
MLJOpenML.load(id; verbosity = 1, parser = :csv, kwargs...)
165
49
166
50
Load the OpenML dataset with specified `id`, from those listed by
167
51
[`list_datasets`](@ref) or on the [OpenML site](https://www.openml.org/search?type=data).
168
- If `parser = :csv` the types of the columns are automatically detected by the
169
- `CSV.read` function. A message is shown, if `verbosity > 0` and the detected
170
- type does not match the OpenML metadata. If `parser = :openml` the OpenML metadata
171
- is used to `coerce` the columns to scientific types according to the rules:
172
-
173
- | metadata | inferred type | scientific type |
174
- |----------|---------------|-----------------|
175
- |numeric | <: Real | Continuous |
176
- |numeric | <: Integer | Count |
177
- |real | <: Any | Continuous |
178
- |integer | <: Any | Count |
179
- |string | <: Any | Textual |
180
- |{ANYTHING}| <: Any | Multiclass |
181
-
182
- See [here](https://waikato.github.io/weka-wiki/formats_and_processing/arff_developer/)
183
- for info on the OpenML metadata.
184
-
185
- With `parser = :auto`, the `autotype`'s of the output of `parser = :openml` are
186
- used to coerce the data further.
187
-
188
- For data with more than 2000 features (columns) `parser = :csv` is used always,
189
- because `parser = :openml` can be much slower.
190
-
191
- Extra `kwargs` are passed to the CSV parser, `CSV.File(...)`.
192
52
193
53
Returns a table.
194
54
@@ -203,7 +63,7 @@ df = DataFrame(table);
203
63
function load(id::Int; verbosity = 1, parser = :csv, kwargs...)
    # `verbosity`, `parser` and `kwargs` are accepted so that existing callers
    # keep working; the ARFFFiles-based reader below does not consult them.
    response = load_Dataset_Description(id)
    # The description lookup can return `nothing`; fail with a clear message
    # rather than an opaque `MethodError` when indexing into it.
    response === nothing &&
        error("Could not retrieve the description of OpenML dataset $id.")
    arff_file = HTTP.request("GET", response["data_set_description"]["url"])
    return ARFFFiles.load(IOBuffer(arff_file.body))
end
208
68
209
69
0 commit comments