Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 29 additions & 16 deletions src/datasets.jl
Original file line number Diff line number Diff line change
Expand Up @@ -131,30 +131,39 @@ Base.@kwdef struct Dataset
end

function Dataset(d::Dict)
owner = d["owner"]["username"]
name = d["name"]
owner = _get_json(
_get_json(d, "owner", Dict),
"username", String,
)
name = _get_json(d, "name", AbstractString)
versions_json = _get_json_or(d, "versions", Vector, [])
versions = sort([DatasetVersion(json; owner, name) for json in versions_json]; by=dsv -> dsv.id)
versions = sort(
[DatasetVersion(json; owner, name) for json in versions_json];
by=dsv -> dsv.id,
)
_storage = let storage_json = _get_json(d, "storage", Dict)
_DatasetStorage(;
credentials_url=_get_json(d, "credentials_url", AbstractString),
region=_get_json(storage_json, "bucket_region", AbstractString),
bucket=_get_json(storage_json, "bucket", AbstractString),
prefix=_get_json(storage_json, "prefix", AbstractString),
)
end
Dataset(;
uuid=UUIDs.UUID(d["id"]),
uuid=_get_json_convert(d, "id", UUIDs.UUID),
name, owner, versions,
dtype=d["type"],
description=d["description"],
size=d["size"],
tags=d["tags"],
_downloadURL=d["downloadURL"],
_last_modified=_nothing_or(d["lastModified"]) do last_modified
dtype=_get_json(d, "type", AbstractString),
description=_get_json(d, "description", AbstractString),
size=_get_json(d, "size", Integer),
tags=_get_json(d, "tags", Vector),
_downloadURL=_get_json(d, "downloadURL", AbstractString),
_last_modified=_nothing_or(_get_json(d, "lastModified", AbstractString)) do last_modified
datetime_utc = Dates.DateTime(
last_modified, Dates.dateformat"YYYY-mm-ddTHH:MM:SS.ss"
)
_utc2localtz(datetime_utc)
end,
_storage=_DatasetStorage(;
credentials_url=d["credentials_url"],
region=d["storage"]["bucket_region"],
bucket=d["storage"]["bucket"],
prefix=d["storage"]["prefix"],
),
_storage,
_json=d,
)
end
Expand Down Expand Up @@ -351,6 +360,10 @@ function datasets(
end
return Dataset(dataset)
catch e
# If Dataset() fails due to some unexpected value in one of the dataset JSON objects that
# JuliaHub.jl can not handle, it should only throw a JuliaHubError. So we rethrow on other
# error types, as filtering all of them out could potentially hide JuliaHub.jl bugs.
isa(e, JuliaHubError) || rethrow()
@debug "Invalid dataset in GET /datasets response" dataset exception = (
e, catch_backtrace()
)
Expand Down
28 changes: 28 additions & 0 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,34 @@ function _get_json_or(
haskey(json, key) ? _get_json(json, key, T; msg) : default
end

# _get_json_convert is a _get_json-type helper that also does some sort of type conversion
# parsing etc. The general signature is the following:
#
# function _get_json_convert(
# json::Dict, key::AbstractString, ::Type{T};
# msg::Union{AbstractString, Nothing}=nothing
# )::T
#
# Although in practice we implement for each type separately, since the parsing/conversion logic
# can vary dramatically.
#
# A key point, though, is that it will throw a JuliaHubError if the server response is somehow
# invalid and we can't parse/convert it properly.
function _get_json_convert(
json::Dict, key::AbstractString, ::Type{UUIDs.UUID}; msg::Union{AbstractString, Nothing}=nothing
)::UUIDs.UUID
uuid_str = _get_json(json, key, String; msg)
uuid = tryparse(UUIDs.UUID, uuid_str)
if isnothing(uuid)
errormsg = """
Invalid JSON returned by the server: `$key` not a valid UUID string.
Server returned '$(uuid_str)'."""
isnothing(msg) || (errormsg = string(msg, '\n', errormsg))
throw(JuliaHubError(errormsg))
end
return uuid
end

"""
mutable struct Secret

Expand Down
76 changes: 75 additions & 1 deletion test/datasets.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,80 @@
)
end

# These tests mainly exercise the Dataset() constructor, to ensure that it throws the
# correct error objects.
@testset "Dataset" begin
d0 = () -> _dataset_json("test/test"; version_sizes=[42])
let ds = JuliaHub.Dataset(d0())
@test ds isa JuliaHub.Dataset
@test ds.uuid == Base.UUID("3c4441bd-04bd-59f2-5426-70de923e67c2")
@test ds.owner == "test"
@test ds.name == "test"
@test ds.description == "An example dataset"
@test ds.tags == ["tag1", "tag2"]
@test ds.dtype == "Blob"
@test ds.size == 42
@test length(ds.versions) == 1
@test ds.versions[1].id == 1
@test ds.versions[1].size == 42
end

# We don't verify dtype values (this list might expand in the future)
let d = Dict(d0()..., "type" => "Unknown Dtype")
ds = JuliaHub.Dataset(d)
@test ds.dtype == "Unknown Dtype"
end

# If there are critical fields missing, it will throw
@testset "required property: $(pname)" for pname in (
"id", "owner", "name", "type", "description", "tags",
"downloadURL", "lastModified", "credentials_url", "storage",
)
let d = Dict(d0()...)
delete!(d, pname)
e = @test_throws JuliaHub.JuliaHubError JuliaHub.Dataset(d)
@test startswith(
e.value.msg,
"Invalid JSON returned by the server: `$pname` missing in the response.",
)
end
# Replace the value with a value that's of incorrect type
let d = Dict(d0()..., pname => missing)
e = @test_throws JuliaHub.JuliaHubError JuliaHub.Dataset(d)
@test startswith(
e.value.msg,
"Invalid JSON returned by the server: `$(pname)` of type `Missing`, expected",
)
end
end
# We also need to be able to parse the UUID into UUIDs.UUID
let d = Dict(d0()..., "id" => "1234")
@test_throws JuliaHub.JuliaHubError(
"Invalid JSON returned by the server: `id` not a valid UUID string.\nServer returned '1234'."
) JuliaHub.Dataset(d)
end
# And correctly throw for invalid owner.username
let d = Dict(d0()..., "owner" => nothing)
@test_throws JuliaHub.JuliaHubError(
"Invalid JSON returned by the server: `owner` of type `Nothing`, expected `<: Dict`."
) JuliaHub.Dataset(d)
end

# Missing versions list is okay though. We assume that there are no
# versions then.
let d = d0()
delete!(d, "versions")
ds = JuliaHub.Dataset(d)
@test length(ds.versions) == 0
end
# But a bad type is not okay
let d = Dict(d0()..., "versions" => 0)
@test_throws JuliaHub.JuliaHubError(
"Invalid JSON returned by the server: `versions` of type `Int64`, expected `<: Vector`."
) JuliaHub.Dataset(d)
end
end

@testset "JuliaHub.dataset(s)" begin
empty!(MOCK_JULIAHUB_STATE)
Mocking.apply(mocking_patch) do
Expand Down Expand Up @@ -119,7 +193,7 @@ end
@test isempty(ds.versions)
end

MOCK_JULIAHUB_STATE[:datasets_erroneous] = ["erroneous_dataset"]
MOCK_JULIAHUB_STATE[:datasets_erroneous] = ["bad-user/erroneous_dataset"]
err_ds_warn = (
:warn,
"The JuliaHub GET /datasets response contains erroneous datasets. Omitting 1 entries.",
Expand Down
137 changes: 64 additions & 73 deletions test/mocking.jl
Original file line number Diff line number Diff line change
Expand Up @@ -332,83 +332,25 @@ function _restcall_mocked(method, url, headers, payload; query)
Dict("message" => "", "success" => true) |> jsonresponse(200)
end
elseif (method == :GET) && endswith(url, "datasets")
dataset_params = get(MOCK_JULIAHUB_STATE, :dataset_params, Dict())
dataset_version_sizes = get(MOCK_JULIAHUB_STATE, :dataset_version_sizes, nothing)
zerotime = TimeZones.ZonedDateTime("2022-10-12T05:39:42.906+00:00")
versions_json =
dataset -> begin
version_sizes = something(
dataset_version_sizes,
(dataset == "example-dataset") ? [57, 331] : [57],
)
Dict(
"version" => string("v", length(version_sizes)),
"versions" => map(enumerate(version_sizes)) do (i, sz)
Dict(
"version" => i,
"blobstore_path" => string("u", 2),
"size" => sz,
"date" => string(zerotime + Dates.Day(i) + Dates.Millisecond(sz)),
)
end,
"size" => isempty(version_sizes) ? 0 : sum(version_sizes),
)
end
#! format: off
shared = Dict(
"groups" => Any[],
"storage" => Dict(
"bucket_region" => "us-east-1",
"bucket" => "datasets-bucket",
"prefix" => "datasets",
"vendor" => "aws",
),
"description" => get(dataset_params, "description", "An example dataset"),
"tags" => get(dataset_params, "tags", ["tag1", "tag2"]),
"license" => (
"name" => "MIT License",
"spdx_id" => "MIT",
"text" => nothing,
"url" => "https://opensource.org/licenses/MIT",
),
"lastModified" => "2022-10-12T05:39:42.906",
"downloadURL" => "",
"credentials_url" => "...",
)
#! format: on
datasets = []
for dataset_full_id in existing_datasets
username, dataset = split(dataset_full_id, '/'; limit=2)
push!(datasets,
Dict(
"id" => string(uuidhash(dataset_full_id)),
"name" => dataset,
"owner" => Dict(
"username" => username,
"type" => "User",
),
"type" => occursin("blobtree", dataset) ? "BlobTree" : "Blob",
"visibility" => occursin("public", dataset) ? "public" : "private",
versions_json(dataset)...,
shared...,
datasets = Dict[]
for dataset_name in existing_datasets
d = _dataset_json(
dataset_name;
params=get(MOCK_JULIAHUB_STATE, :dataset_params, Dict()),
version_sizes=something(
get(MOCK_JULIAHUB_STATE, :dataset_version_sizes, nothing),
endswith(dataset_name, "/example-dataset") ? [57, 331] : [57],
),
)
push!(datasets, d)
end
for dataset in get(MOCK_JULIAHUB_STATE, :datasets_erroneous, String[])
push!(datasets,
Dict(
"id" => string(uuidhash(dataset)),
"name" => dataset,
"owner" => Dict(
"username" => nothing,
"type" => "User",
),
"type" => occursin("blobtree", dataset) ? "BlobTree" : "Blob",
"visibility" => occursin("public", dataset) ? "public" : "private",
versions_json(dataset)...,
shared...,
),
for dataset_name in get(MOCK_JULIAHUB_STATE, :datasets_erroneous, String[])
d = _dataset_json(
dataset_name;
version_sizes=(dataset_name == "example-dataset") ? [57, 331] : [57],
)
d["owner"]["username"] = nothing
push!(datasets, d)
end
datasets |> jsonresponse(200)
elseif (method == :DELETE) && endswith(url, DATASET_REGEX)
Expand Down Expand Up @@ -776,3 +718,52 @@ function _http_request_mocked(
]
HTTP.Response(200, headers, b"success")
end

function _dataset_json(
dataset_name::AbstractString;
params=Dict(),
version_sizes=[],
)
zerotime = TimeZones.ZonedDateTime("2022-10-12T05:39:42.906+00:00")
username, dataset = string.(split(dataset_name, '/'; limit=2))
return Dict{String, Any}(
"id" => string(uuidhash(dataset_name)),
"name" => dataset,
"owner" => Dict{String, Any}(
"username" => username,
"type" => "User",
),
"type" => occursin("blobtree", dataset) ? "BlobTree" : "Blob",
"visibility" => occursin("public", dataset) ? "public" : "private",
# versions
"version" => string("v", length(version_sizes)),
"versions" => map(enumerate(version_sizes)) do (i, sz)
Dict{String, Any}(
"version" => i,
"blobstore_path" => string("u", 2),
"size" => sz,
"date" => string(zerotime + Dates.Day(i) + Dates.Millisecond(sz)),
)
end,
"size" => isempty(version_sizes) ? 0 : sum(version_sizes),
# shared
"groups" => Any[],
"storage" => Dict{String, Any}(
"bucket_region" => "us-east-1",
"bucket" => "datasets-bucket",
"prefix" => "datasets",
"vendor" => "aws",
),
"description" => get(params, "description", "An example dataset"),
"tags" => get(params, "tags", ["tag1", "tag2"]),
"license" => (
"name" => "MIT License",
"spdx_id" => "MIT",
"text" => nothing,
"url" => "https://opensource.org/licenses/MIT",
),
"lastModified" => "2022-10-12T05:39:42.906",
"downloadURL" => "",
"credentials_url" => "...",
)
end
22 changes: 22 additions & 0 deletions test/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,25 @@ end
@test_throws JuliaHub.JuliaHubError JuliaHub._parse_tz("")
@test_throws JuliaHub.JuliaHubError JuliaHub._parse_tz("bad-string")
end

@testset "_get_json_convert" begin
@test JuliaHub._get_json_convert(
Dict("id" => "123e4567-e89b-12d3-a456-426614174000"), "id", UUIDs.UUID
) == UUIDs.UUID("123e4567-e89b-12d3-a456-426614174000")
# Error cases:
@test_throws JuliaHub.JuliaHubError(
"Invalid JSON returned by the server: `id` not a valid UUID string.\nServer returned '123'."
) JuliaHub._get_json_convert(
Dict("id" => "123"), "id", UUIDs.UUID
)
@test_throws JuliaHub.JuliaHubError(
"Invalid JSON returned by the server: `id` of type `Int64`, expected `<: String`."
) JuliaHub._get_json_convert(
Dict("id" => 123), "id", UUIDs.UUID
)
@test_throws JuliaHub.JuliaHubError(
"Invalid JSON returned by the server: `id` missing in the response.\nKeys present: _id_missing\njson: Dict{String, String} with 1 entry:\n \"_id_missing\" => \"123e4567-e89b-12d3-a456-426614174000\""
) JuliaHub._get_json_convert(
Dict("_id_missing" => "123e4567-e89b-12d3-a456-426614174000"), "id", UUIDs.UUID
)
end
Loading