Skip to content

Commit 738790c

Browse files
authored
Merge pull request #19 from JuliaAI/dev
For a 0.3.0 release
2 parents 4444745 + ec9cd80 commit 738790c

File tree

2 files changed

+66
-150
lines changed

2 files changed

+66
-150
lines changed

Project.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "OpenML"
22
uuid = "8b6db2d4-7670-4922-a472-f9537c81ab66"
33
authors = ["Diego Arenas <[email protected]>", "Anthony D. Blaom <[email protected]>"]
4-
version = "0.2.0"
4+
version = "0.3.0"
55

66
[deps]
77
ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8"
@@ -11,8 +11,8 @@ Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
1111
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
1212

1313
[compat]
14-
ARFFFiles = "1.3"
15-
HTTP = "0.8, 0.9"
14+
ARFFFiles = "1.4.1"
15+
HTTP = "0.8, 0.9,1"
1616
JSON = "0.21"
1717
julia = "1"
1818

src/data.jl

Lines changed: 63 additions & 147 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,51 @@
11
const API_URL = "https://www.openml.org/api/v1/json"
22

3-
# Data API
4-
# The structures are based on these descriptions
5-
# https://github.com/openml/OpenML/tree/master/openml_OS/views/pages/api_new/v1/xsd
6-
# https://www.openml.org/api_docs#!/data/get_data_id
3+
struct OpenMLAPIError <: Exception
4+
msg::String
5+
end
6+
function Base.showerror(io::IO, e::OpenMLAPIError)
7+
print(io, e.msg)
8+
end
79

810

9-
"""
10-
Returns information about a dataset. The information includes the name,
11-
information about the creator, URL to download it and more.
12-
13-
- 110 - Please provide data_id.
14-
- 111 - Unknown dataset. Data set description with data_id was not found in the database.
15-
- 112 - No access granted. This dataset is not shared with you.
16-
"""
17-
function load_Dataset_Description(id::Int; api_key::String="")
18-
url = string(API_URL, "/data/$id")
11+
# Data API. See REST API on https://www.openml.org/apis
12+
function get(query)
1913
try
20-
r = HTTP.request("GET", url)
21-
if r.status == 200
22-
return JSON.parse(String(r.body))
23-
elseif r.status == 110
24-
println("Please provide data_id.")
25-
elseif r.status == 111
26-
println("Unknown dataset. Data set description with data_id was not found in the database.")
27-
elseif r.status == 112
28-
println("No access granted. This dataset is not shared with you.")
29-
end
14+
r = HTTP.request("GET", string(API_URL, query))
15+
return JSON.parse(String(r.body))
3016
catch e
31-
println("Error occurred. Check if there exists a dataset with id $id.")
32-
println("See e.g. OpenML.list_datasets()\n")
33-
println(e)
34-
return nothing
17+
if isa(e, HTTP.StatusError) && e.status == 412
18+
error_string = String(e.response.body)
19+
err = try
20+
JSON.parse(error_string)["error"]
21+
catch
22+
@error(error_string)
23+
throw(OpenMLAPIError("Malformed query \"$query\"."))
24+
end
25+
msg = err["message"]
26+
code = err["code"]
27+
additional_msg = haskey(err, "additional_message") ? err["additional_message"] : ""
28+
if code == "111"
29+
additional_msg *= "Check if there is a dataset with id $(last(split(query, '/'))).\nSee e.g. OpenML.list_datasets(). "
30+
end
31+
throw(OpenMLAPIError(msg * ". " * additional_msg * "(error code $code)"))
32+
else
33+
rethrow()
34+
end
3535
end
3636
return nothing
3737
end
3838

3939
"""
40-
OpenML.load(id)
40+
OpenML.load_Dataset_Description(id::Int)
41+
42+
Returns information about a dataset. The information includes the name,
43+
information about the creator, URL to download it and more.
44+
"""
45+
load_Dataset_Description(id::Int) = get("/data/$id")
46+
47+
"""
48+
OpenML.load(id; maxbytes = nothing)
4149
4250
Load the OpenML dataset with specified `id`, from those listed by
4351
[`list_datasets`](@ref) or on the [OpenML site](https://www.openml.org/search?type=data).
@@ -54,9 +62,11 @@ table = OpenML.load(61)
5462
df = DataFrame(table) # transform to a DataFrame
5563
using ScientificTypes
5664
df2 = coerce(df, autotype(df)) # coerce to automatically detected scientific types
65+
66+
peek_table = OpenML.load(61, maxbytes = 1024) # load only the first 1024 bytes of the table
5767
```
5868
"""
59-
function load(id::Int)
69+
function load(id::Int; maxbytes = nothing)
6070
if VERSION > v"1.3.0"
6171
dir = first(Artifacts.artifacts_dirs())
6272
toml = joinpath(dir, "OpenMLArtifacts.toml")
@@ -74,135 +84,42 @@ function load(id::Int)
7484
filename = tempname()
7585
download(url, filename)
7686
end
77-
ARFFFiles.load(filename)
87+
ARFFFiles.load(x -> ARFFFiles.readcolumns(x; maxbytes = maxbytes), filename)
7888
end
7989

8090

8191
"""
82-
Returns a list of all data qualities in the system.
92+
load_Data_Qualities_List()
8393
84-
- 412 - Precondition failed. An error code and message are returned
85-
- 370 - No data qualities available. There are no data qualities in the system.
94+
Returns a list of all data qualities in the system.
8695
"""
87-
function load_Data_Qualities_List()
88-
url = string(API_URL, "/data/qualities/list")
89-
try
90-
r = HTTP.request("GET", url)
91-
if r.status == 200
92-
return JSON.parse(String(r.body))
93-
elseif r.status == 370
94-
println("No data qualities available. There are no data qualities in the system.")
95-
end
96-
catch e
97-
println("Error occurred : $e")
98-
return nothing
99-
end
100-
return nothing
101-
end
96+
load_Data_Qualities_List() = get("/data/qualities/list")
10297

10398
"""
104-
Returns a list of all data qualities in the system.
99+
load_Data_Qualities(id::Int)
105100
106-
- 271 - Unknown dataset. Data set with the given data ID was not found (or is not shared with you).
107-
- 272 - No features found. The dataset did not contain any features, or we could not extract them.
108-
- 273 - Dataset not processed yet. The dataset was not processed yet, features are not yet available. Please wait for a few minutes.
109-
- 274 - Dataset processed with error. The feature extractor has run into an error while processing the dataset. Please check whether it is a valid supported file. If so, please contact the API admins.
101+
Returns the qualities of dataset `id`.
110102
"""
111-
function load_Data_Features(id::Int; api_key::String = "")
112-
if api_key == ""
113-
url = string(API_URL, "/data/features/$id")
114-
end
115-
try
116-
r = HTTP.request("GET", url)
117-
if r.status == 200
118-
return JSON.parse(String(r.body))
119-
elseif r.status == 271
120-
println("Unknown dataset. Data set with the given data ID was not found (or is not shared with you).")
121-
elseif r.status == 272
122-
println("No features found. The dataset did not contain any features, or we could not extract them.")
123-
elseif r.status == 273
124-
println("Dataset not processed yet. The dataset was not processed yet, features are not yet available. Please wait for a few minutes.")
125-
elseif r.status == 274
126-
println("Dataset processed with error. The feature extractor has run into an error while processing the dataset. Please check whether it is a valid supported file. If so, please contact the API admins.")
127-
end
128-
catch e
129-
println("Error occurred : $e")
130-
return nothing
131-
end
132-
return nothing
133-
end
103+
load_Data_Qualities(id::Int) = get("/data/qualities/$id")
134104

135105
"""
136-
Returns the qualities of a dataset.
137-
138-
- 360 - Please provide data set ID
139-
- 361 - Unknown dataset. The data set with the given ID was not found in the database, or is not shared with you.
140-
- 362 - No qualities found. The registered dataset did not contain any calculated qualities.
141-
- 363 - Dataset not processed yet. The dataset was not processed yet, no qualities are available. Please wait for a few minutes.
142-
- 364 - Dataset processed with error. The quality calculator has run into an error while processing the dataset. Please check whether it is a valid supported file. If so, contact the support team.
143-
- 365 - Interval start or end illegal. There was a problem with the interval start or end.
106+
load_Data_Features(id::Int)
107+
108+
Returns a list of all features of dataset `id`.
144109
"""
145-
function load_Data_Qualities(id::Int; api_key::String = "")
146-
if api_key == ""
147-
url = string(API_URL, "/data/qualities/$id")
148-
end
149-
try
150-
r = HTTP.request("GET", url)
151-
if r.status == 200
152-
return JSON.parse(String(r.body))
153-
elseif r.status == 360
154-
println("Please provide data set ID")
155-
elseif r.status == 361
156-
println("Unknown dataset. The data set with the given ID was not found in the database, or is not shared with you.")
157-
elseif r.status == 362
158-
println("No qualities found. The registered dataset did not contain any calculated qualities.")
159-
elseif r.status == 363
160-
println("Dataset not processed yet. The dataset was not processed yet, no qualities are available. Please wait for a few minutes.")
161-
elseif r.status == 364
162-
println("Dataset processed with error. The quality calculator has run into an error while processing the dataset. Please check whether it is a valid supported file. If so, contact the support team.")
163-
elseif r.status == 365
164-
println("Interval start or end illegal. There was a problem with the interval start or end.")
165-
end
166-
catch e
167-
println("Error occurred : $e")
168-
return nothing
169-
end
170-
return nothing
171-
end
110+
load_Data_Features(id::Int) = get("/data/features/$id")
172111

173112
"""
174-
load_List_And_Filter(filters; api_key = "")
113+
load_List_And_Filter(filters)
175114
176115
See [OpenML API](https://www.openml.org/api_docs#!/data/get_data_list_filters).
177116
"""
178-
function load_List_And_Filter(filters::String; api_key::String = "")
179-
if api_key == ""
180-
url = string(API_URL, "/data/list/$filters")
181-
end
182-
try
183-
r = HTTP.request("GET", url)
184-
if r.status == 200
185-
return JSON.parse(String(r.body))
186-
elseif r.status == 370
187-
println("Illegal filter specified.")
188-
elseif r.status == 371
189-
println("Filter values/ranges not properly specified.")
190-
elseif r.status == 372
191-
println("No results. There where no matches for the given constraints.")
192-
elseif r.status == 373
193-
println("Can not specify an offset without a limit.")
194-
end
195-
catch e
196-
println("Error occurred : $e")
197-
return nothing
198-
end
199-
return nothing
200-
end
117+
load_List_And_Filter(filters::String) = get("/data/list/$filters")
201118

202119
qualitynames(x) = haskey(x, "name") ? [x["name"]] : []
203120

204121
"""
205-
list_datasets(; tag = nothing, filters = "" api_key = "", output_format = NamedTuple)
122+
list_datasets(; tag = nothing, filters = "", output_format = NamedTuple)
206123
207124
Lists all active OpenML datasets, if `tag = nothing` (default).
208125
To list only datasets with a given tag, choose one of the tags in [`list_tags()`](@ref).
@@ -237,8 +154,8 @@ julia> ds = OpenML.list_datasets(
237154
julia> sort!(ds, :NumberOfFeatures)
238155
```
239156
"""
240-
function list_datasets(; tag = nothing, filter = "", filters=filter,
241-
api_key = "", output_format = NamedTuple)
157+
function list_datasets(; tag = nothing, filter = "", filters = filter,
158+
output_format = NamedTuple)
242159
if tag !== nothing
243160
if is_valid_tag(tag)
244161
filters *= "/tag/$tag"
@@ -247,7 +164,7 @@ function list_datasets(; tag = nothing, filter = "", filters=filter,
247164
return
248165
end
249166
end
250-
data = OpenML.load_List_And_Filter(filters; api_key = api_key)
167+
data = OpenML.load_List_And_Filter(filters)
251168
datasets = data["data"]["dataset"]
252169
qualities = Symbol.(union(vcat([vcat(qualitynames.(entry["quality"])...) for entry in datasets]...)))
253170
result = merge((id = Int[], name = String[], status = String[]),
@@ -278,12 +195,9 @@ is_valid_tag(tag) = false
278195
List all available tags.
279196
"""
280197
function list_tags()
281-
url = string(API_URL, "/data/tag/list")
282-
try
283-
r = HTTP.request("GET", url)
284-
return JSON.parse(String(r.body))["data_tag_list"]["tag"]
285-
catch
286-
return nothing
198+
result = get("/data/tag/list")
199+
if !isnothing(result)
200+
return result["data_tag_list"]["tag"]
287201
end
288202
end
289203

@@ -318,7 +232,9 @@ julia> OpenML.describe_dataset(6)
318232
```
319233
"""
320234
function describe_dataset(id)
321-
description = load_Dataset_Description(id)["data_set_description"]["description"]
235+
result = load_Dataset_Description(id)
236+
result === nothing && return
237+
description = result["data_set_description"]["description"]
322238
if isa(description, AbstractString)
323239
Markdown.parse(description)
324240
else

0 commit comments

Comments
 (0)