Skip to content

Commit 5e8d69d

Browse files
committed
Merge branch 'mp/dataset-constructor-tests' into mp/project-datasets
2 parents 8f40681 + 344be60 commit 5e8d69d

File tree

5 files changed

+226
-94
lines changed

5 files changed

+226
-94
lines changed

src/datasets.jl

Lines changed: 36 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -158,15 +158,29 @@ Base.@kwdef struct Dataset
158158
_json::Dict
159159
end
160160

161-
function Dataset(d::Dict; expected_project::Union{UUIDs.UUID, Nothing}=nothing)
162-
owner = d["owner"]["username"]
163-
name = d["name"]
161+
function Dataset(d::Dict)
162+
owner = _get_json(
163+
_get_json(d, "owner", Dict),
164+
"username", String,
165+
)
166+
name = _get_json(d, "name", AbstractString)
164167
versions_json = _get_json_or(d, "versions", Vector, [])
165-
versions = sort([DatasetVersion(json; owner, name) for json in versions_json]; by=dsv -> dsv.id)
168+
versions = sort(
169+
[DatasetVersion(json; owner, name) for json in versions_json];
170+
by=dsv -> dsv.id,
171+
)
172+
_storage = let storage_json = _get_json(d, "storage", Dict)
173+
_DatasetStorage(;
174+
credentials_url=_get_json(d, "credentials_url", AbstractString),
175+
region=_get_json(storage_json, "bucket_region", AbstractString),
176+
bucket=_get_json(storage_json, "bucket", AbstractString),
177+
prefix=_get_json(storage_json, "prefix", AbstractString),
178+
)
179+
end
166180
project = if !isnothing(expected_project)
167181
project_json = _get_json(d, "project", Dict)
168182
project_json_uuid = UUIDs.UUID(
169-
_get_json(project_json, "project_id", String; msg=".project")
183+
_get_json(project_json, "project_id", String)
170184
)
171185
if project_json_uuid != expected_project
172186
msg = "Project UUID mismatch in dataset response: $(project_json_uuid), requested $(project)"
@@ -182,27 +196,22 @@ function Dataset(d::Dict; expected_project::Union{UUIDs.UUID, Nothing}=nothing)
182196
else
183197
nothing
184198
end
185-
Dataset(;
186-
uuid=UUIDs.UUID(d["id"]),
199+
return Dataset(;
200+
uuid=_get_json_convert(d, "id", UUIDs.UUID),
187201
name, owner, versions,
188-
dtype=d["type"],
189-
description=d["description"],
190-
size=d["size"],
191-
tags=d["tags"],
192-
project=project,
193-
_downloadURL=d["downloadURL"],
194-
_last_modified=_nothing_or(d["lastModified"]) do last_modified
202+
dtype=_get_json(d, "type", AbstractString),
203+
description=_get_json(d, "description", AbstractString),
204+
size=_get_json(d, "size", Integer),
205+
tags=_get_json(d, "tags", Vector),
206+
project,
207+
_downloadURL=_get_json(d, "downloadURL", AbstractString),
208+
_last_modified=_nothing_or(_get_json(d, "lastModified", AbstractString)) do last_modified
195209
datetime_utc = Dates.DateTime(
196210
last_modified, Dates.dateformat"YYYY-mm-ddTHH:MM:SS.ss"
197211
)
198212
_utc2localtz(datetime_utc)
199213
end,
200-
_storage=_DatasetStorage(;
201-
credentials_url=d["credentials_url"],
202-
region=d["storage"]["bucket_region"],
203-
bucket=d["storage"]["bucket"],
204-
prefix=d["storage"]["prefix"],
205-
),
214+
_storage,
206215
_json=d,
207216
)
208217
end
@@ -425,12 +434,19 @@ function _parse_dataset_list(
425434
end
426435
return Dataset(dataset; expected_project)
427436
catch e
437+
<<<<<<< HEAD
428438
# If we fail to parse the server response for a dataset, we should always get a JuliaHubError.
429439
# Other errors types might indicate e.g. code errors, so we don't want to swallow those
430440
# here, and instead throw immediately.
431441
if !isa(e, JuliaHubError)
432442
rethrow()
433443
end
444+
=======
445+
# If Dataset() fails due to some unexpected value in one of the dataset JSON objects that
446+
# JuliaHub.jl can not handle, it should only throw a JuliaHubError. So we rethrow on other
447+
# error types, as filtering all of them out could potentially hide JuliaHub.jl bugs.
448+
isa(e, JuliaHubError) || rethrow()
449+
>>>>>>> mp/dataset-constructor-tests
434450
@debug "Invalid dataset in GET /datasets response" dataset exception = (
435451
e, catch_backtrace()
436452
)

src/utils.jl

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,34 @@ function _get_json_or(
195195
haskey(json, key) ? _get_json(json, key, T; msg) : default
196196
end
197197

198+
# _get_json_convert is a _get_json-type helper that also does some sort of type conversion
199+
# parsing etc. The general signature is the following:
200+
#
201+
# function _get_json_convert(
202+
# json::Dict, key::AbstractString, ::Type{T};
203+
# msg::Union{AbstractString, Nothing}=nothing
204+
# )::T
205+
#
206+
# Although in practice we implement for each type separately, since the parsing/conversion logic
207+
# can vary dramatically.
208+
#
209+
# A key point, though, is that it will throw a JuliaHubError if the server response is somehow
210+
# invalid and we can't parse/convert it properly.
211+
function _get_json_convert(
212+
json::Dict, key::AbstractString, ::Type{UUIDs.UUID}; msg::Union{AbstractString, Nothing}=nothing
213+
)::UUIDs.UUID
214+
uuid_str = _get_json(json, key, String; msg)
215+
uuid = tryparse(UUIDs.UUID, uuid_str)
216+
if isnothing(uuid)
217+
errormsg = """
218+
Invalid JSON returned by the server: `$key` not a valid UUID string.
219+
Server returned '$(uuid_str)'."""
220+
isnothing(msg) || (errormsg = string(msg, '\n', errormsg))
221+
throw(JuliaHubError(errormsg))
222+
end
223+
return uuid
224+
end
225+
198226
"""
199227
mutable struct Secret
200228

test/datasets.jl

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,81 @@
2121
)
2222
end
2323

24+
# These tests mainly exercise the Dataset() constructor, to ensure that it throws the
25+
# correct error objects.
26+
@testset "Dataset" begin
27+
d0 = () -> _dataset_json("test/test"; version_sizes=[42])
28+
let ds = JuliaHub.Dataset(d0())
29+
@test ds isa JuliaHub.Dataset
30+
@test ds.uuid == Base.UUID("3c4441bd-04bd-59f2-5426-70de923e67c2")
31+
@test ds.owner == "test"
32+
@test ds.name == "test"
33+
@test ds.description == "An example dataset"
34+
@test ds.tags == ["tag1", "tag2"]
35+
@test ds.dtype == "Blob"
36+
@test ds.size == 42
37+
@test length(ds.versions) == 1
38+
@test ds.versions[1].id == 1
39+
@test ds.versions[1].size == 42
40+
end
41+
42+
# We don't verify dtype values (this list might expand in the future)
43+
let d = Dict(d0()..., "type" => "Unknown Dtype")
44+
ds = JuliaHub.Dataset(d)
45+
@test ds.dtype == "Unknown Dtype"
46+
end
47+
48+
# If there are critical fields missing, it will throw
49+
@testset "required property: $(pname)" for pname in (
50+
"id", "owner", "name", "type", "description", "tags",
51+
"downloadURL", "lastModified", "credentials_url", "storage",
52+
)
53+
let d = Dict(d0()...)
54+
delete!(d, pname)
55+
e = @test_throws JuliaHub.JuliaHubError JuliaHub.Dataset(d)
56+
@test startswith(
57+
e.value.msg,
58+
"Invalid JSON returned by the server: `$pname` missing in the response.",
59+
)
60+
end
61+
# Replace the value with a value that's of incorrect type
62+
let d = Dict(d0()..., pname => missing)
63+
e = @test_throws JuliaHub.JuliaHubError JuliaHub.Dataset(d)
64+
@test startswith(
65+
e.value.msg,
66+
"Invalid JSON returned by the server: `$(pname)` of type `Missing`, expected",
67+
)
68+
end
69+
end
70+
# We also need to be able to parse the UUID into UUIDs.UUID
71+
let d = Dict(d0()..., "id" => "1234")
72+
@test_throws JuliaHub.JuliaHubError(
73+
"Invalid JSON returned by the server: `id` not a valid UUID string.\nServer returned '1234'."
74+
) JuliaHub.Dataset(d)
75+
end
76+
# And correctly throw for invalid owner.username
77+
let d = Dict(d0()..., "owner" => nothing)
78+
@test_throws JuliaHub.JuliaHubError(
79+
"Invalid JSON returned by the server: `owner` of type `Nothing`, expected `<: Dict`."
80+
) JuliaHub.Dataset(d)
81+
end
82+
83+
# Missing versions list is okay though. We assume that there are no
84+
# versions then.
85+
let d = d0()
86+
delete!(d, "versions")
87+
ds = JuliaHub.Dataset(d)
88+
@test length(ds.versions) == 0
89+
end
90+
# But a bad type is not okay
91+
let d = Dict(d0()..., "versions" => 0)
92+
e = @test_throws JuliaHub.JuliaHubError JuliaHub.Dataset(d)
93+
@test startswith(
94+
e.value.msg, "Invalid JSON returned by the server: `versions` of type `Int64`"
95+
)
96+
end
97+
end
98+
2499
@testset "JuliaHub.dataset(s)" begin
25100
empty!(MOCK_JULIAHUB_STATE)
26101
Mocking.apply(mocking_patch) do
@@ -127,7 +202,7 @@ end
127202
@test isempty(ds.versions)
128203
end
129204

130-
MOCK_JULIAHUB_STATE[:datasets_erroneous] = ["erroneous_dataset"]
205+
MOCK_JULIAHUB_STATE[:datasets_erroneous] = ["bad-user/erroneous_dataset"]
131206
err_ds_warn = (
132207
:warn,
133208
"The JuliaHub GET /datasets response contains erroneous datasets. Omitting 1 entries.",

test/mocking.jl

Lines changed: 64 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -332,83 +332,25 @@ function _restcall_mocked(method, url, headers, payload; query)
332332
Dict("message" => "", "success" => true) |> jsonresponse(200)
333333
end
334334
elseif (method == :GET) && endswith(url, "datasets")
335-
dataset_params = get(MOCK_JULIAHUB_STATE, :dataset_params, Dict())
336-
dataset_version_sizes = get(MOCK_JULIAHUB_STATE, :dataset_version_sizes, nothing)
337-
zerotime = TimeZones.ZonedDateTime("2022-10-12T05:39:42.906+00:00")
338-
versions_json =
339-
dataset -> begin
340-
version_sizes = something(
341-
dataset_version_sizes,
342-
(dataset == "example-dataset") ? [57, 331] : [57],
343-
)
344-
Dict(
345-
"version" => string("v", length(version_sizes)),
346-
"versions" => map(enumerate(version_sizes)) do (i, sz)
347-
Dict(
348-
"version" => i,
349-
"blobstore_path" => string("u", 2),
350-
"size" => sz,
351-
"date" => string(zerotime + Dates.Day(i) + Dates.Millisecond(sz)),
352-
)
353-
end,
354-
"size" => isempty(version_sizes) ? 0 : sum(version_sizes),
355-
)
356-
end
357-
#! format: off
358-
shared = Dict(
359-
"groups" => Any[],
360-
"storage" => Dict(
361-
"bucket_region" => "us-east-1",
362-
"bucket" => "datasets-bucket",
363-
"prefix" => "datasets",
364-
"vendor" => "aws",
365-
),
366-
"description" => get(dataset_params, "description", "An example dataset"),
367-
"tags" => get(dataset_params, "tags", ["tag1", "tag2"]),
368-
"license" => (
369-
"name" => "MIT License",
370-
"spdx_id" => "MIT",
371-
"text" => nothing,
372-
"url" => "https://opensource.org/licenses/MIT",
373-
),
374-
"lastModified" => "2022-10-12T05:39:42.906",
375-
"downloadURL" => "",
376-
"credentials_url" => "...",
377-
)
378-
#! format: on
379-
datasets = []
380-
for dataset_full_id in existing_datasets
381-
username, dataset = split(dataset_full_id, '/'; limit=2)
382-
push!(datasets,
383-
Dict(
384-
"id" => string(uuidhash(dataset_full_id)),
385-
"name" => dataset,
386-
"owner" => Dict(
387-
"username" => username,
388-
"type" => "User",
389-
),
390-
"type" => occursin("blobtree", dataset) ? "BlobTree" : "Blob",
391-
"visibility" => occursin("public", dataset) ? "public" : "private",
392-
versions_json(dataset)...,
393-
shared...,
335+
datasets = Dict[]
336+
for dataset_name in existing_datasets
337+
d = _dataset_json(
338+
dataset_name;
339+
params=get(MOCK_JULIAHUB_STATE, :dataset_params, Dict()),
340+
version_sizes=something(
341+
get(MOCK_JULIAHUB_STATE, :dataset_version_sizes, nothing),
342+
endswith(dataset_name, "/example-dataset") ? [57, 331] : [57],
394343
),
395344
)
345+
push!(datasets, d)
396346
end
397-
for dataset in get(MOCK_JULIAHUB_STATE, :datasets_erroneous, String[])
398-
push!(datasets,
399-
Dict(
400-
"id" => string(uuidhash(dataset)),
401-
"name" => dataset,
402-
"owner" => Dict(
403-
"username" => nothing,
404-
"type" => "User",
405-
),
406-
"type" => occursin("blobtree", dataset) ? "BlobTree" : "Blob",
407-
"visibility" => occursin("public", dataset) ? "public" : "private",
408-
versions_json(dataset)...,
409-
shared...,
410-
),
347+
for dataset_name in get(MOCK_JULIAHUB_STATE, :datasets_erroneous, String[])
348+
d = _dataset_json(
349+
dataset_name;
350+
version_sizes=(dataset_name == "example-dataset") ? [57, 331] : [57],
411351
)
352+
d["owner"]["username"] = nothing
353+
push!(datasets, d)
412354
end
413355
datasets |> jsonresponse(200)
414356
elseif (method == :DELETE) && endswith(url, DATASET_REGEX)
@@ -776,3 +718,52 @@ function _http_request_mocked(
776718
]
777719
HTTP.Response(200, headers, b"success")
778720
end
721+
722+
function _dataset_json(
723+
dataset_name::AbstractString;
724+
params=Dict(),
725+
version_sizes=[],
726+
)
727+
zerotime = TimeZones.ZonedDateTime("2022-10-12T05:39:42.906+00:00")
728+
username, dataset = string.(split(dataset_name, '/'; limit=2))
729+
return Dict{String, Any}(
730+
"id" => string(uuidhash(dataset_name)),
731+
"name" => dataset,
732+
"owner" => Dict{String, Any}(
733+
"username" => username,
734+
"type" => "User",
735+
),
736+
"type" => occursin("blobtree", dataset) ? "BlobTree" : "Blob",
737+
"visibility" => occursin("public", dataset) ? "public" : "private",
738+
# versions
739+
"version" => string("v", length(version_sizes)),
740+
"versions" => map(enumerate(version_sizes)) do (i, sz)
741+
Dict{String, Any}(
742+
"version" => i,
743+
"blobstore_path" => string("u", 2),
744+
"size" => sz,
745+
"date" => string(zerotime + Dates.Day(i) + Dates.Millisecond(sz)),
746+
)
747+
end,
748+
"size" => isempty(version_sizes) ? 0 : sum(version_sizes),
749+
# shared
750+
"groups" => Any[],
751+
"storage" => Dict{String, Any}(
752+
"bucket_region" => "us-east-1",
753+
"bucket" => "datasets-bucket",
754+
"prefix" => "datasets",
755+
"vendor" => "aws",
756+
),
757+
"description" => get(params, "description", "An example dataset"),
758+
"tags" => get(params, "tags", ["tag1", "tag2"]),
759+
"license" => (
760+
"name" => "MIT License",
761+
"spdx_id" => "MIT",
762+
"text" => nothing,
763+
"url" => "https://opensource.org/licenses/MIT",
764+
),
765+
"lastModified" => "2022-10-12T05:39:42.906",
766+
"downloadURL" => "",
767+
"credentials_url" => "...",
768+
)
769+
end

test/utils.jl

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,3 +77,25 @@ end
7777
@test_throws JuliaHub.JuliaHubError JuliaHub._parse_tz("")
7878
@test_throws JuliaHub.JuliaHubError JuliaHub._parse_tz("bad-string")
7979
end
80+
81+
@testset "_get_json_convert" begin
82+
@test JuliaHub._get_json_convert(
83+
Dict("id" => "123e4567-e89b-12d3-a456-426614174000"), "id", UUIDs.UUID
84+
) == UUIDs.UUID("123e4567-e89b-12d3-a456-426614174000")
85+
# Error cases:
86+
@test_throws JuliaHub.JuliaHubError(
87+
"Invalid JSON returned by the server: `id` not a valid UUID string.\nServer returned '123'."
88+
) JuliaHub._get_json_convert(
89+
Dict("id" => "123"), "id", UUIDs.UUID
90+
)
91+
@test_throws JuliaHub.JuliaHubError(
92+
"Invalid JSON returned by the server: `id` of type `Int64`, expected `<: String`."
93+
) JuliaHub._get_json_convert(
94+
Dict("id" => 123), "id", UUIDs.UUID
95+
)
96+
@test_throws JuliaHub.JuliaHubError(
97+
"Invalid JSON returned by the server: `id` missing in the response.\nKeys present: _id_missing\njson: Dict{String, String} with 1 entry:\n \"_id_missing\" => \"123e4567-e89b-12d3-a456-426614174000\""
98+
) JuliaHub._get_json_convert(
99+
Dict("_id_missing" => "123e4567-e89b-12d3-a456-426614174000"), "id", UUIDs.UUID
100+
)
101+
end

0 commit comments

Comments
 (0)