using HTTP
using JSON
+ import ARFFFiles
+ import ScientificTypes: Continuous, Count, Textual, Multiclass, coerce, autotype
+ using Markdown

const API_URL = "https://www.openml.org/api/v1/json"

@@ -8,10 +11,10 @@ const API_URL = "https://www.openml.org/api/v1/json"
# https://github.com/openml/OpenML/tree/master/openml_OS/views/pages/api_new/v1/xsd
# https://www.openml.org/api_docs#!/data/get_data_id

- # To do :
- # - Save the file in a local folder
- # - Check downloaded files in local folder before downloading it again
- # - Use local stored file whenever possible
+ # TODO:
+ # - Use e.g. DataDeps to cache data locally, as sketched below
+ # - Put the ARFF parser into a separate package, or use ARFFFiles when
+ #   https://github.com/cjdoris/ARFFFiles.jl/issues/4 is fixed.
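To make the caching TODO concrete, here is a minimal sketch of how DataDeps.jl might cache an OpenML file locally. The registration name and hard-coded URL are hypothetical placeholders, not part of this commit; in practice the URL would come from `load_Dataset_Description(id)["data_set_description"]["url"]`:

```julia
using DataDeps

# Hypothetical registration for dataset 61 (iris); the URL is a placeholder.
register(DataDep(
    "OpenML-61",
    "Dataset 61 (iris), downloaded from OpenML",
    "https://www.openml.org/data/download/61/dataset_61_iris.arff",
))

# First access downloads and caches; subsequent accesses reuse the local copy.
path = joinpath(datadep"OpenML-61", "dataset_61_iris.arff")
```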

"""
Returns information about a dataset. The information includes the name,
@@ -42,73 +45,33 @@ function load_Dataset_Description(id::Int; api_key::String="")
end

"""
-     Returns a Vector of NamedTuples.
-     Receives an `HTTP.Message.response` that has an
-     ARFF file format in the `body` of the `Message`.
- """
- function convert_ARFF_to_rowtable(response)
-     data = String(response.body)
-     data2 = split(data, "\n")
-
-     featureNames = String[]
-     dataTypes = String[]
-     # TODO: make this more performant by anticipating types?
-     named_tuples = [] # `Any` type here bad
-     for line in data2
-         if length(line) > 0
-             if line[1:1] != "%"
-                 d = []
-                 if occursin("@attribute", lowercase(line))
-                     push!(featureNames, replace(replace(split(line, " ")[2], "'" => ""), "-" => "_"))
-                     push!(dataTypes, split(line, " ")[3])
-                 elseif occursin("@relation", lowercase(line))
-                     nothing
-                 elseif occursin("@data", lowercase(line))
-                     # it means the data starts
-                     nothing
-                 else
-                     values = split(line, ",")
-                     for i in eachindex(featureNames)
-                         if lowercase(dataTypes[i]) in ["real", "numeric"]
-                             push!(d, featureNames[i] => Meta.parse(values[i]))
-                         else
-                             # all the rest will be considered as String
-                             push!(d, featureNames[i] => values[i])
-                         end
-                     end
-                     push!(named_tuples, (; (Symbol(k) => v for (k, v) in d)...))
-                 end
-             end
-         end
-     end
-     return identity.(named_tuples) # not performant; see above
- end
+     MLJOpenML.load(id; parser = :arff)

- """
-     MLJOpenML.load(id)
+ Load the OpenML dataset with specified `id`, from those listed by
+ [`list_datasets`](@ref) or on the [OpenML site](https://www.openml.org/search?type=data).
+ With `parser = :arff` (default) the ARFFFiles.jl parser is used.
+ With `parser = :auto` the output of the ARFFFiles parser is coerced to
+ automatically detected scientific types.

- Load the OpenML dataset with specified `id`, from those listed on the
- [OpenML site](https://www.openml.org/search?type=data).
+ Returns a table.

- Returns a "row table", i.e., a `Vector` of identically typed
- `NamedTuple`s. A row table is compatible with the
- [Tables.jl](https://github.com/JuliaData/Tables.jl) interface and can
- therefore be readily converted to other compatible formats. For
- example:
+ # Examples

```julia
using DataFrames
- rowtable = MLJOpenML.load(61);
- df = DataFrame(rowtable);
-
- using MLJ
- df2 = coerce(df, :class=>Multiclass)
+ table = MLJOpenML.load(61);
+ df = DataFrame(table);
```
"""
- function load(id::Int)
+ function load(id::Int; parser = :arff)
    response = load_Dataset_Description(id)
    arff_file = HTTP.request("GET", response["data_set_description"]["url"])
-     return convert_ARFF_to_rowtable(arff_file)
+     data = ARFFFiles.load(IOBuffer(arff_file.body))
+     if parser == :auto
+         return coerce(data, autotype(data))
+     else
+         return data
+     end
end

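As a usage sketch of the new `parser` keyword (assuming the package is installed and network access is available; dataset 61 is the iris set used in the docstring above):

```julia
using MLJOpenML, DataFrames

table = MLJOpenML.load(61)                  # raw ARFFFiles.jl output
auto  = MLJOpenML.load(61; parser = :auto)  # columns coerced to auto-detected scitypes
df    = DataFrame(auto)
```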
@@ -205,33 +168,9 @@ function load_Data_Qualities(id::Int; api_key::String = "")
end

"""
- List datasets, possibly filtered by a range of properties.
- Any number of properties can be combined by listing them one after
- the other in the
- form '/data/list/{filter}/{value}/{filter}/{value}/...'
- Returns an array with all datasets that match the constraints.
-
- Any combination of these filters /limit/{limit}/offset/{offset} -
- returns only {limit} results starting from result number {offset}.
- Useful for paginating results. With /limit/5/offset/10,
- results 11..15 will be returned.
-
- Both limit and offset need to be specified.
- /status/{status} - returns only datasets with a given status,
- either 'active', 'deactivated', or 'in_preparation'.
- /tag/{tag} - returns only datasets tagged with the given tag.
- /{data_quality}/{range} - returns only tasks for which the
- underlying datasets have certain qualities.
- {data_quality} can be data_id, data_name, data_version, number_instances,
- number_features, number_classes, number_missing_values. {range} can be a
- specific value or a range in the form 'low..high'.
- Multiple qualities can be combined, as in
- 'number_instances/0..50/number_features/0..10'.
-
- - 370 - Illegal filter specified.
- - 371 - Filter values/ranges not properly specified.
- - 372 - No results. There where no matches for the given constraints.
- - 373 - Can not specify an offset without a limit.
+     load_List_And_Filter(filters; api_key = "")
+
+ See [OpenML API](https://www.openml.org/api_docs#!/data/get_data_list_filters).
"""
function load_List_And_Filter(filters::String; api_key::String = "")
    if api_key == ""
@@ -257,6 +196,126 @@ function load_List_And_Filter(filters::String; api_key::String = "")
    return nothing
end
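For illustration, a hedged sketch of calling this helper directly; judging from its use in `list_datasets` below, the return value is the parsed JSON response as nested `Dict`s (or `nothing` on failure):

```julia
# results 11..15 of the dataset list, as parsed JSON
data = MLJOpenML.load_List_And_Filter("limit/5/offset/10")
data === nothing || println(length(data["data"]["dataset"]), " datasets returned")
```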

+ # helper: return the "name" field of a data-quality record, if present
+ qualitynames(x) = haskey(x, "name") ? [x["name"]] : []
+
+ """
+     list_datasets(; tag = nothing, filters = "", api_key = "", output_format = NamedTuple)
+
+ List all active OpenML datasets if `tag = nothing` (default).
+ To list only datasets with a given tag, choose one of the tags in [`list_tags()`](@ref).
+ An alternative `output_format` can be chosen, e.g. `DataFrame`, if the
+ `DataFrames` package is loaded.
+
+ A filter is a string of `<data quality>/<range>` or `<data quality>/<value>`
+ pairs, concatenated using `/`, such as
+
+ ```julia
+ filter = "number_features/10/number_instances/500..10000"
+ ```
+
+ The allowed data qualities include `tag`, `status`, `limit`, `offset`,
+ `data_id`, `data_name`, `data_version`, `uploader`,
+ `number_instances`, `number_features`, `number_classes`,
+ `number_missing_values`.
+
+ For more on the format and effect of `filters`, refer to the [OpenML
+ API](https://www.openml.org/api_docs#!/data/get_data_list_filters).
+
+ # Examples
+ ```
+ julia> using DataFrames
+
+ julia> ds = MLJOpenML.list_datasets(
+            tag = "OpenML100",
+            filter = "number_instances/100..1000/number_features/1..10",
+            output_format = DataFrame
+        )
+
+ julia> sort!(ds, :NumberOfFeatures)
+ ```
+ """
+ function list_datasets(; tag = nothing, filter = "", filters = filter,
+                        api_key = "", output_format = NamedTuple)
+     if tag !== nothing
+         if is_valid_tag(tag)
+             filters *= "/tag/$tag"
+         else
+             @warn "$tag is not a valid tag. See `list_tags()` for a list of tags."
+             return
+         end
+     end
+     data = MLJOpenML.load_List_And_Filter(filters; api_key = api_key)
+     datasets = data["data"]["dataset"]
+     qualities = Symbol.(union(vcat([vcat(qualitynames.(entry["quality"])...) for entry in datasets]...)))
+     result = merge((id = Int[], name = String[], status = String[]),
+                    NamedTuple{tuple(qualities...)}(ntuple(i -> Union{Missing, Int}[], length(qualities))))
+     for entry in datasets
+         push!(result.id, entry["did"])
+         push!(result.name, entry["name"])
+         push!(result.status, entry["status"])
+         for quality in entry["quality"]
+             push!(getproperty(result, Symbol(quality["name"])),
+                   Meta.parse(quality["value"]))
+         end
+         for quality in qualities
+             if length(getproperty(result, quality)) < length(result.id)
+                 push!(getproperty(result, quality), missing)
+             end
+         end
+     end
+     output_format(result)
+ end
+
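A short sketch of working with the default `NamedTuple` output. Which quality columns appear depends on what the server reports, so `NumberOfInstances` below is an assumption (though it does occur in the docstring example above):

```julia
ds = MLJOpenML.list_datasets(filter = "number_instances/100..1000")
ds.name[1:3]          # first three dataset names
ds.NumberOfInstances  # assumed quality column; entries may be `missing`
```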
+ is_valid_tag(tag::String) = tag ∈ list_tags()
+ is_valid_tag(tag) = false
+
+ """
+     list_tags()
+
+ List all available tags.
+ """
+ function list_tags()
+     url = string(API_URL, "/data/tag/list")
+     try
+         r = HTTP.request("GET", url)
+         return JSON.parse(String(r.body))["data_tag_list"]["tag"]
+     catch
+         return nothing
+     end
+ end
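For example (network access assumed; the function returns `nothing` on failure):

```julia
tags = MLJOpenML.list_tags()
tags !== nothing && "OpenML100" in tags  # true if the tag exists
```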
+
+ """
288
+ describe_dataset(id)
289
+
290
+ Load and show the OpenML description of the data set `id`.
291
+ Use [`list_datasets`](@ref) to browse available data sets.
292
+
293
+ # Examples
294
+ ```
295
+ julia> MLJOpenML.describe_dataset(6)
296
+ Author: David J. Slate Source: UCI
297
+ (https://archive.ics.uci.edu/ml/datasets/Letter+Recognition) - 01-01-1991 Please cite: P.
298
+ W. Frey and D. J. Slate. "Letter Recognition Using Holland-style Adaptive Classifiers".
299
+ Machine Learning 6(2), 1991
300
+
301
+ 1. TITLE:
302
+
303
+ Letter Image Recognition Data
304
+
305
+ The objective is to identify each of a large number of black-and-white
306
+ rectangular pixel displays as one of the 26 capital letters in the English
307
+ alphabet. The character images were based on 20 different fonts and each
308
+ letter within these 20 fonts was randomly distorted to produce a file of
309
+ 20,000 unique stimuli. Each stimulus was converted into 16 primitive
310
+ numerical attributes (statistical moments and edge counts) which were then
311
+ scaled to fit into a range of integer values from 0 through 15. We
312
+ typically train on the first 16000 items and then use the resulting model
313
+ to predict the letter category for the remaining 4000. See the article
314
+ cited above for more details.
315
+ ```
316
+ """
317
+ describe_dataset (id) = Markdown. parse (load_Dataset_Description (id)[" data_set_description" ][" description" ])
+

# Flow API

# Task API