Skip to content

Commit 5a52b02

Browse files
authored
refactor: do not swallow non-backend errors in datasets() (#85)
* add basic Dataset constructor tests * make sure Dataset only throws JuliaHubError * in mocking, .tags is sometimes Any[] * only silently capture JuliaHubErrors in datasets()
1 parent 9440d85 commit 5a52b02

File tree

5 files changed

+220
-91
lines changed

5 files changed

+220
-91
lines changed

src/datasets.jl

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -131,30 +131,39 @@ Base.@kwdef struct Dataset
131131
end
132132

133133
function Dataset(d::Dict)
134-
owner = d["owner"]["username"]
135-
name = d["name"]
134+
owner = _get_json(
135+
_get_json(d, "owner", Dict),
136+
"username", String,
137+
)
138+
name = _get_json(d, "name", AbstractString)
136139
versions_json = _get_json_or(d, "versions", Vector, [])
137-
versions = sort([DatasetVersion(json; owner, name) for json in versions_json]; by=dsv -> dsv.id)
138-
Dataset(;
139-
uuid=UUIDs.UUID(d["id"]),
140+
versions = sort(
141+
[DatasetVersion(json; owner, name) for json in versions_json];
142+
by=dsv -> dsv.id,
143+
)
144+
_storage = let storage_json = _get_json(d, "storage", Dict)
145+
_DatasetStorage(;
146+
credentials_url=_get_json(d, "credentials_url", AbstractString),
147+
region=_get_json(storage_json, "bucket_region", AbstractString),
148+
bucket=_get_json(storage_json, "bucket", AbstractString),
149+
prefix=_get_json(storage_json, "prefix", AbstractString),
150+
)
151+
end
152+
return Dataset(;
153+
uuid=_get_json_convert(d, "id", UUIDs.UUID),
140154
name, owner, versions,
141-
dtype=d["type"],
142-
description=d["description"],
143-
size=d["size"],
144-
tags=d["tags"],
145-
_downloadURL=d["downloadURL"],
146-
_last_modified=_nothing_or(d["lastModified"]) do last_modified
155+
dtype=_get_json(d, "type", AbstractString),
156+
description=_get_json(d, "description", AbstractString),
157+
size=_get_json(d, "size", Integer),
158+
tags=_get_json(d, "tags", Vector),
159+
_downloadURL=_get_json(d, "downloadURL", AbstractString),
160+
_last_modified=_nothing_or(_get_json(d, "lastModified", AbstractString)) do last_modified
147161
datetime_utc = Dates.DateTime(
148162
last_modified, Dates.dateformat"YYYY-mm-ddTHH:MM:SS.ss"
149163
)
150164
_utc2localtz(datetime_utc)
151165
end,
152-
_storage=_DatasetStorage(;
153-
credentials_url=d["credentials_url"],
154-
region=d["storage"]["bucket_region"],
155-
bucket=d["storage"]["bucket"],
156-
prefix=d["storage"]["prefix"],
157-
),
166+
_storage,
158167
_json=d,
159168
)
160169
end
@@ -351,6 +360,10 @@ function datasets(
351360
end
352361
return Dataset(dataset)
353362
catch e
363+
# If Dataset() fails due to some unexpected value in one of the dataset JSON objects that
364+
# JuliaHub.jl can not handle, it should only throw a JuliaHubError. So we rethrow on other
365+
# error types, as filtering all of them out could potentially hide JuliaHub.jl bugs.
366+
isa(e, JuliaHubError) || rethrow()
354367
@debug "Invalid dataset in GET /datasets response" dataset exception = (
355368
e, catch_backtrace()
356369
)

src/utils.jl

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,34 @@ function _get_json_or(
176176
haskey(json, key) ? _get_json(json, key, T; msg) : default
177177
end
178178

179+
# _get_json_convert is a _get_json-type helper that also does some sort of type conversion
180+
# parsing etc. The general signature is the following:
181+
#
182+
# function _get_json_convert(
183+
# json::Dict, key::AbstractString, ::Type{T};
184+
# msg::Union{AbstractString, Nothing}=nothing
185+
# )::T
186+
#
187+
# Although in practice we implement for each type separately, since the parsing/conversion logic
188+
# can vary dramatically.
189+
#
190+
# A key point, though, is that it will throw a JuliaHubError if the server response is somehow
191+
# invalid and we can't parse/convert it properly.
192+
function _get_json_convert(
193+
json::Dict, key::AbstractString, ::Type{UUIDs.UUID}; msg::Union{AbstractString, Nothing}=nothing
194+
)::UUIDs.UUID
195+
uuid_str = _get_json(json, key, String; msg)
196+
uuid = tryparse(UUIDs.UUID, uuid_str)
197+
if isnothing(uuid)
198+
errormsg = """
199+
Invalid JSON returned by the server: `$key` not a valid UUID string.
200+
Server returned '$(uuid_str)'."""
201+
isnothing(msg) || (errormsg = string(msg, '\n', errormsg))
202+
throw(JuliaHubError(errormsg))
203+
end
204+
return uuid
205+
end
206+
179207
"""
180208
mutable struct Secret
181209

test/datasets.jl

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,81 @@
2121
)
2222
end
2323

24+
# These tests mainly exercise the Dataset() constructor, to ensure that it throws the
25+
# correct error objects.
26+
@testset "Dataset" begin
27+
d0 = () -> _dataset_json("test/test"; version_sizes=[42])
28+
let ds = JuliaHub.Dataset(d0())
29+
@test ds isa JuliaHub.Dataset
30+
@test ds.uuid == Base.UUID("3c4441bd-04bd-59f2-5426-70de923e67c2")
31+
@test ds.owner == "test"
32+
@test ds.name == "test"
33+
@test ds.description == "An example dataset"
34+
@test ds.tags == ["tag1", "tag2"]
35+
@test ds.dtype == "Blob"
36+
@test ds.size == 42
37+
@test length(ds.versions) == 1
38+
@test ds.versions[1].id == 1
39+
@test ds.versions[1].size == 42
40+
end
41+
42+
# We don't verify dtype values (this list might expand in the future)
43+
let d = Dict(d0()..., "type" => "Unknown Dtype")
44+
ds = JuliaHub.Dataset(d)
45+
@test ds.dtype == "Unknown Dtype"
46+
end
47+
48+
# If there are critical fields missing, it will throw
49+
@testset "required property: $(pname)" for pname in (
50+
"id", "owner", "name", "type", "description", "tags",
51+
"downloadURL", "lastModified", "credentials_url", "storage",
52+
)
53+
let d = Dict(d0()...)
54+
delete!(d, pname)
55+
e = @test_throws JuliaHub.JuliaHubError JuliaHub.Dataset(d)
56+
@test startswith(
57+
e.value.msg,
58+
"Invalid JSON returned by the server: `$pname` missing in the response.",
59+
)
60+
end
61+
# Replace the value with a value that's of incorrect type
62+
let d = Dict(d0()..., pname => missing)
63+
e = @test_throws JuliaHub.JuliaHubError JuliaHub.Dataset(d)
64+
@test startswith(
65+
e.value.msg,
66+
"Invalid JSON returned by the server: `$(pname)` of type `Missing`, expected",
67+
)
68+
end
69+
end
70+
# We also need to be able to parse the UUID into UUIDs.UUID
71+
let d = Dict(d0()..., "id" => "1234")
72+
@test_throws JuliaHub.JuliaHubError(
73+
"Invalid JSON returned by the server: `id` not a valid UUID string.\nServer returned '1234'."
74+
) JuliaHub.Dataset(d)
75+
end
76+
# And correctly throw for invalid owner.username
77+
let d = Dict(d0()..., "owner" => nothing)
78+
@test_throws JuliaHub.JuliaHubError(
79+
"Invalid JSON returned by the server: `owner` of type `Nothing`, expected `<: Dict`."
80+
) JuliaHub.Dataset(d)
81+
end
82+
83+
# Missing versions list is okay though. We assume that there are no
84+
# versions then.
85+
let d = d0()
86+
delete!(d, "versions")
87+
ds = JuliaHub.Dataset(d)
88+
@test length(ds.versions) == 0
89+
end
90+
# But a bad type is not okay
91+
let d = Dict(d0()..., "versions" => 0)
92+
e = @test_throws JuliaHub.JuliaHubError JuliaHub.Dataset(d)
93+
@test startswith(
94+
e.value.msg, "Invalid JSON returned by the server: `versions` of type `Int64`"
95+
)
96+
end
97+
end
98+
2499
@testset "JuliaHub.dataset(s)" begin
25100
empty!(MOCK_JULIAHUB_STATE)
26101
Mocking.apply(mocking_patch) do
@@ -119,7 +194,7 @@ end
119194
@test isempty(ds.versions)
120195
end
121196

122-
MOCK_JULIAHUB_STATE[:datasets_erroneous] = ["erroneous_dataset"]
197+
MOCK_JULIAHUB_STATE[:datasets_erroneous] = ["bad-user/erroneous_dataset"]
123198
err_ds_warn = (
124199
:warn,
125200
"The JuliaHub GET /datasets response contains erroneous datasets. Omitting 1 entries.",

test/mocking.jl

Lines changed: 64 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -332,83 +332,25 @@ function _restcall_mocked(method, url, headers, payload; query)
332332
Dict("message" => "", "success" => true) |> jsonresponse(200)
333333
end
334334
elseif (method == :GET) && endswith(url, "datasets")
335-
dataset_params = get(MOCK_JULIAHUB_STATE, :dataset_params, Dict())
336-
dataset_version_sizes = get(MOCK_JULIAHUB_STATE, :dataset_version_sizes, nothing)
337-
zerotime = TimeZones.ZonedDateTime("2022-10-12T05:39:42.906+00:00")
338-
versions_json =
339-
dataset -> begin
340-
version_sizes = something(
341-
dataset_version_sizes,
342-
(dataset == "example-dataset") ? [57, 331] : [57],
343-
)
344-
Dict(
345-
"version" => string("v", length(version_sizes)),
346-
"versions" => map(enumerate(version_sizes)) do (i, sz)
347-
Dict(
348-
"version" => i,
349-
"blobstore_path" => string("u", 2),
350-
"size" => sz,
351-
"date" => string(zerotime + Dates.Day(i) + Dates.Millisecond(sz)),
352-
)
353-
end,
354-
"size" => isempty(version_sizes) ? 0 : sum(version_sizes),
355-
)
356-
end
357-
#! format: off
358-
shared = Dict(
359-
"groups" => Any[],
360-
"storage" => Dict(
361-
"bucket_region" => "us-east-1",
362-
"bucket" => "datasets-bucket",
363-
"prefix" => "datasets",
364-
"vendor" => "aws",
365-
),
366-
"description" => get(dataset_params, "description", "An example dataset"),
367-
"tags" => get(dataset_params, "tags", ["tag1", "tag2"]),
368-
"license" => (
369-
"name" => "MIT License",
370-
"spdx_id" => "MIT",
371-
"text" => nothing,
372-
"url" => "https://opensource.org/licenses/MIT",
373-
),
374-
"lastModified" => "2022-10-12T05:39:42.906",
375-
"downloadURL" => "",
376-
"credentials_url" => "...",
377-
)
378-
#! format: on
379-
datasets = []
380-
for dataset_full_id in existing_datasets
381-
username, dataset = split(dataset_full_id, '/'; limit=2)
382-
push!(datasets,
383-
Dict(
384-
"id" => string(uuidhash(dataset_full_id)),
385-
"name" => dataset,
386-
"owner" => Dict(
387-
"username" => username,
388-
"type" => "User",
389-
),
390-
"type" => occursin("blobtree", dataset) ? "BlobTree" : "Blob",
391-
"visibility" => occursin("public", dataset) ? "public" : "private",
392-
versions_json(dataset)...,
393-
shared...,
335+
datasets = Dict[]
336+
for dataset_name in existing_datasets
337+
d = _dataset_json(
338+
dataset_name;
339+
params=get(MOCK_JULIAHUB_STATE, :dataset_params, Dict()),
340+
version_sizes=something(
341+
get(MOCK_JULIAHUB_STATE, :dataset_version_sizes, nothing),
342+
endswith(dataset_name, "/example-dataset") ? [57, 331] : [57],
394343
),
395344
)
345+
push!(datasets, d)
396346
end
397-
for dataset in get(MOCK_JULIAHUB_STATE, :datasets_erroneous, String[])
398-
push!(datasets,
399-
Dict(
400-
"id" => string(uuidhash(dataset)),
401-
"name" => dataset,
402-
"owner" => Dict(
403-
"username" => nothing,
404-
"type" => "User",
405-
),
406-
"type" => occursin("blobtree", dataset) ? "BlobTree" : "Blob",
407-
"visibility" => occursin("public", dataset) ? "public" : "private",
408-
versions_json(dataset)...,
409-
shared...,
410-
),
347+
for dataset_name in get(MOCK_JULIAHUB_STATE, :datasets_erroneous, String[])
348+
d = _dataset_json(
349+
dataset_name;
350+
version_sizes=(dataset_name == "example-dataset") ? [57, 331] : [57],
411351
)
352+
d["owner"]["username"] = nothing
353+
push!(datasets, d)
412354
end
413355
datasets |> jsonresponse(200)
414356
elseif (method == :DELETE) && endswith(url, DATASET_REGEX)
@@ -776,3 +718,52 @@ function _http_request_mocked(
776718
]
777719
HTTP.Response(200, headers, b"success")
778720
end
721+
722+
function _dataset_json(
723+
dataset_name::AbstractString;
724+
params=Dict(),
725+
version_sizes=[],
726+
)
727+
zerotime = TimeZones.ZonedDateTime("2022-10-12T05:39:42.906+00:00")
728+
username, dataset = string.(split(dataset_name, '/'; limit=2))
729+
return Dict{String, Any}(
730+
"id" => string(uuidhash(dataset_name)),
731+
"name" => dataset,
732+
"owner" => Dict{String, Any}(
733+
"username" => username,
734+
"type" => "User",
735+
),
736+
"type" => occursin("blobtree", dataset) ? "BlobTree" : "Blob",
737+
"visibility" => occursin("public", dataset) ? "public" : "private",
738+
# versions
739+
"version" => string("v", length(version_sizes)),
740+
"versions" => map(enumerate(version_sizes)) do (i, sz)
741+
Dict{String, Any}(
742+
"version" => i,
743+
"blobstore_path" => string("u", 2),
744+
"size" => sz,
745+
"date" => string(zerotime + Dates.Day(i) + Dates.Millisecond(sz)),
746+
)
747+
end,
748+
"size" => isempty(version_sizes) ? 0 : sum(version_sizes),
749+
# shared
750+
"groups" => Any[],
751+
"storage" => Dict{String, Any}(
752+
"bucket_region" => "us-east-1",
753+
"bucket" => "datasets-bucket",
754+
"prefix" => "datasets",
755+
"vendor" => "aws",
756+
),
757+
"description" => get(params, "description", "An example dataset"),
758+
"tags" => get(params, "tags", ["tag1", "tag2"]),
759+
"license" => (
760+
"name" => "MIT License",
761+
"spdx_id" => "MIT",
762+
"text" => nothing,
763+
"url" => "https://opensource.org/licenses/MIT",
764+
),
765+
"lastModified" => "2022-10-12T05:39:42.906",
766+
"downloadURL" => "",
767+
"credentials_url" => "...",
768+
)
769+
end

test/utils.jl

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,3 +77,25 @@ end
7777
@test_throws JuliaHub.JuliaHubError JuliaHub._parse_tz("")
7878
@test_throws JuliaHub.JuliaHubError JuliaHub._parse_tz("bad-string")
7979
end
80+
81+
@testset "_get_json_convert" begin
82+
@test JuliaHub._get_json_convert(
83+
Dict("id" => "123e4567-e89b-12d3-a456-426614174000"), "id", UUIDs.UUID
84+
) == UUIDs.UUID("123e4567-e89b-12d3-a456-426614174000")
85+
# Error cases:
86+
@test_throws JuliaHub.JuliaHubError(
87+
"Invalid JSON returned by the server: `id` not a valid UUID string.\nServer returned '123'."
88+
) JuliaHub._get_json_convert(
89+
Dict("id" => "123"), "id", UUIDs.UUID
90+
)
91+
@test_throws JuliaHub.JuliaHubError(
92+
"Invalid JSON returned by the server: `id` of type `Int64`, expected `<: String`."
93+
) JuliaHub._get_json_convert(
94+
Dict("id" => 123), "id", UUIDs.UUID
95+
)
96+
@test_throws JuliaHub.JuliaHubError(
97+
"Invalid JSON returned by the server: `id` missing in the response.\nKeys present: _id_missing\njson: Dict{String, String} with 1 entry:\n \"_id_missing\" => \"123e4567-e89b-12d3-a456-426614174000\""
98+
) JuliaHub._get_json_convert(
99+
Dict("_id_missing" => "123e4567-e89b-12d3-a456-426614174000"), "id", UUIDs.UUID
100+
)
101+
end

0 commit comments

Comments
 (0)