From f67af70a25a8f42a7c34084518c6d8b915be6a59 Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Wed, 12 Mar 2025 18:12:14 +1300 Subject: [PATCH 1/6] add basic Dataset constructor tests --- test/datasets.jl | 57 +++++++++++++++++++- test/mocking.jl | 137 ++++++++++++++++++++++------------------------- 2 files changed, 120 insertions(+), 74 deletions(-) diff --git a/test/datasets.jl b/test/datasets.jl index 0ea3ecf75..2431baee6 100644 --- a/test/datasets.jl +++ b/test/datasets.jl @@ -21,6 +21,61 @@ ) end +# These tests mainly exercise the Dataset() constructor, to ensure that it throws the +# correct error objects. +@testset "Dataset" begin + d0 = () -> _dataset_json("test/test"; version_sizes=[42]) + let ds = JuliaHub.Dataset(d0()) + @test ds isa JuliaHub.Dataset + @test ds.uuid == Base.UUID("3c4441bd-04bd-59f2-5426-70de923e67c2") + @test ds.owner == "test" + @test ds.name == "test" + @test ds.description == "An example dataset" + @test ds.tags == ["tag1", "tag2"] + @test ds.dtype == "Blob" + @test ds.size == 42 + @test length(ds.versions) == 1 + @test ds.versions[1].id == 1 + @test ds.versions[1].size == 42 + end + + # We don't verify dtype values (this list might expand in the future) + let d = Dict(d0()..., "type" => "Unknown Dtype") + ds = JuliaHub.Dataset(d) + @test ds.dtype == "Unknown Dtype" + end + + # If there are critical fields missing, it will throw + @testset "required property: $(pname)" for pname in ( + "id", "owner", "name", "type", "description", "tags", + "downloadURL", "lastModified", "credentials_url", "storage", + ) + d = Dict(d0()...) + delete!(d, pname) + # TODO: should not be a KeyError though.. + @test_throws KeyError JuliaHub.Dataset(d) + end + # We also need to be able to parse the UUID into UUIDs.UUID + let d = Dict(d0()..., "id" => "1234") + # TODO: should not be a ArgumentError though.. + @test_throws ArgumentError JuliaHub.Dataset(d) + end + + # Missing versions list is okay though. We assume that there are no + # versions then. + let d = d0() + delete!(d, "versions") + ds = JuliaHub.Dataset(d) + @test length(ds.versions) == 0 + end + # But a bad type is not okay + let d = Dict(d0()..., "versions" => 0) + @test_throws JuliaHub.JuliaHubError( + "Invalid JSON returned by the server: `versions` of type `Int64`, expected `<: Vector`." + ) JuliaHub.Dataset(d) + end +end + @testset "JuliaHub.dataset(s)" begin empty!(MOCK_JULIAHUB_STATE) Mocking.apply(mocking_patch) do @@ -119,7 +174,7 @@ end @test isempty(ds.versions) end - MOCK_JULIAHUB_STATE[:datasets_erroneous] = ["erroneous_dataset"] + MOCK_JULIAHUB_STATE[:datasets_erroneous] = ["bad-user/erroneous_dataset"] err_ds_warn = ( :warn, "The JuliaHub GET /datasets response contains erroneous datasets. Omitting 1 entries.", diff --git a/test/mocking.jl b/test/mocking.jl index 7a9f2b770..1fb00df4d 100644 --- a/test/mocking.jl +++ b/test/mocking.jl @@ -332,83 +332,25 @@ function _restcall_mocked(method, url, headers, payload; query) Dict("message" => "", "success" => true) |> jsonresponse(200) end elseif (method == :GET) && endswith(url, "datasets") - dataset_params = get(MOCK_JULIAHUB_STATE, :dataset_params, Dict()) - dataset_version_sizes = get(MOCK_JULIAHUB_STATE, :dataset_version_sizes, nothing) - zerotime = TimeZones.ZonedDateTime("2022-10-12T05:39:42.906+00:00") - versions_json = - dataset -> begin - version_sizes = something( - dataset_version_sizes, - (dataset == "example-dataset") ? [57, 331] : [57], - ) - Dict( - "version" => string("v", length(version_sizes)), - "versions" => map(enumerate(version_sizes)) do (i, sz) - Dict( - "version" => i, - "blobstore_path" => string("u", 2), - "size" => sz, - "date" => string(zerotime + Dates.Day(i) + Dates.Millisecond(sz)), - ) - end, - "size" => isempty(version_sizes) ? 0 : sum(version_sizes), - ) - end - #! format: off - shared = Dict( - "groups" => Any[], - "storage" => Dict( - "bucket_region" => "us-east-1", - "bucket" => "datasets-bucket", - "prefix" => "datasets", - "vendor" => "aws", - ), - "description" => get(dataset_params, "description", "An example dataset"), - "tags" => get(dataset_params, "tags", ["tag1", "tag2"]), - "license" => ( - "name" => "MIT License", - "spdx_id" => "MIT", - "text" => nothing, - "url" => "https://opensource.org/licenses/MIT", - ), - "lastModified" => "2022-10-12T05:39:42.906", - "downloadURL" => "", - "credentials_url" => "...", - ) - #! format: on - datasets = [] - for dataset_full_id in existing_datasets - username, dataset = split(dataset_full_id, '/'; limit=2) - push!(datasets, - Dict( - "id" => string(uuidhash(dataset_full_id)), - "name" => dataset, - "owner" => Dict( - "username" => username, - "type" => "User", - ), - "type" => occursin("blobtree", dataset) ? "BlobTree" : "Blob", - "visibility" => occursin("public", dataset) ? "public" : "private", - versions_json(dataset)..., - shared..., + datasets = Dict[] + for dataset_name in existing_datasets + d = _dataset_json( + dataset_name; + params=get(MOCK_JULIAHUB_STATE, :dataset_params, Dict()), + version_sizes=something( + get(MOCK_JULIAHUB_STATE, :dataset_version_sizes, nothing), + endswith(dataset_name, "/example-dataset") ? [57, 331] : [57], ), ) + push!(datasets, d) end - for dataset in get(MOCK_JULIAHUB_STATE, :datasets_erroneous, String[]) - push!(datasets, - Dict( - "id" => string(uuidhash(dataset)), - "name" => dataset, - "owner" => Dict( - "username" => nothing, - "type" => "User", - ), - "type" => occursin("blobtree", dataset) ? "BlobTree" : "Blob", - "visibility" => occursin("public", dataset) ? "public" : "private", - versions_json(dataset)..., - shared..., - ), + for dataset_name in get(MOCK_JULIAHUB_STATE, :datasets_erroneous, String[]) + d = _dataset_json( + dataset_name; + version_sizes=(dataset_name == "example-dataset") ? [57, 331] : [57], ) + d["owner"]["username"] = nothing + push!(datasets, d) end datasets |> jsonresponse(200) elseif (method == :DELETE) && endswith(url, DATASET_REGEX) @@ -776,3 +718,52 @@ function _http_request_mocked( ] HTTP.Response(200, headers, b"success") end + +function _dataset_json( + dataset_name::AbstractString; + params=Dict(), + version_sizes=[], +) + zerotime = TimeZones.ZonedDateTime("2022-10-12T05:39:42.906+00:00") + username, dataset = split(dataset_name, '/'; limit=2) + return Dict{String, Any}( + "id" => string(uuidhash(dataset_name)), + "name" => dataset, + "owner" => Dict{String, Any}( + "username" => username, + "type" => "User", + ), + "type" => occursin("blobtree", dataset) ? "BlobTree" : "Blob", + "visibility" => occursin("public", dataset) ? "public" : "private", + # versions + "version" => string("v", length(version_sizes)), + "versions" => map(enumerate(version_sizes)) do (i, sz) + Dict{String, Any}( + "version" => i, + "blobstore_path" => string("u", 2), + "size" => sz, + "date" => string(zerotime + Dates.Day(i) + Dates.Millisecond(sz)), + ) + end, + "size" => isempty(version_sizes) ? 0 : sum(version_sizes), + # shared + "groups" => Any[], + "storage" => Dict{String, Any}( + "bucket_region" => "us-east-1", + "bucket" => "datasets-bucket", + "prefix" => "datasets", + "vendor" => "aws", + ), + "description" => get(params, "description", "An example dataset"), + "tags" => get(params, "tags", ["tag1", "tag2"]), + "license" => ( + "name" => "MIT License", + "spdx_id" => "MIT", + "text" => nothing, + "url" => "https://opensource.org/licenses/MIT", + ), + "lastModified" => "2022-10-12T05:39:42.906", + "downloadURL" => "", + "credentials_url" => "...", + ) +end From 9d98a22682445ac4203f838e06379ea5f60f84d6 Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Wed, 12 Mar 2025 19:12:07 +1300 Subject: [PATCH 2/6] make sure Dataset only throws JuliaHubError --- src/datasets.jl | 41 +++++++++++++++++++++++++---------------- src/utils.jl | 28 ++++++++++++++++++++++++++++ test/datasets.jl | 31 +++++++++++++++++++++++++------ test/mocking.jl | 2 +- test/utils.jl | 22 ++++++++++++++++++++++ 5 files changed, 101 insertions(+), 23 deletions(-) diff --git a/src/datasets.jl b/src/datasets.jl index 4d81eada5..c7146db99 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -131,30 +131,39 @@ Base.@kwdef struct Dataset end function Dataset(d::Dict) - owner = d["owner"]["username"] - name = d["name"] + owner = _get_json( + _get_json(d, "owner", Dict), + "username", String, + ) + name = _get_json(d, "name", AbstractString) versions_json = _get_json_or(d, "versions", Vector, []) - versions = sort([DatasetVersion(json; owner, name) for json in versions_json]; by=dsv -> dsv.id) + versions = sort( + [DatasetVersion(json; owner, name) for json in versions_json]; + by=dsv -> dsv.id, + ) + _storage = let storage_json = _get_json(d, "storage", Dict) + _DatasetStorage(; + credentials_url=_get_json(d, "credentials_url", AbstractString), + region=_get_json(storage_json, "bucket_region", AbstractString), + bucket=_get_json(storage_json, "bucket", AbstractString), + prefix=_get_json(storage_json, "prefix", AbstractString), + ) + end Dataset(; - uuid=UUIDs.UUID(d["id"]), + uuid=_get_json_convert(d, "id", UUIDs.UUID), name, owner, versions, - dtype=d["type"], - description=d["description"], - size=d["size"], - tags=d["tags"], - _downloadURL=d["downloadURL"], - _last_modified=_nothing_or(d["lastModified"]) do last_modified + dtype=_get_json(d, "type", AbstractString), + description=_get_json(d, "description", AbstractString), + size=_get_json(d, "size", Integer), + tags=_get_json(d, "tags", Vector{<:AbstractString}), + _downloadURL=_get_json(d, "downloadURL", AbstractString), + _last_modified=_nothing_or(_get_json(d, "lastModified", AbstractString)) do last_modified datetime_utc = Dates.DateTime( last_modified, Dates.dateformat"YYYY-mm-ddTHH:MM:SS.ss" ) _utc2localtz(datetime_utc) end, - _storage=_DatasetStorage(; - credentials_url=d["credentials_url"], - region=d["storage"]["bucket_region"], - bucket=d["storage"]["bucket"], - prefix=d["storage"]["prefix"], - ), + _storage, _json=d, ) end diff --git a/src/utils.jl b/src/utils.jl index 3372ec7e9..e08561bdc 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -176,6 +176,34 @@ function _get_json_or( haskey(json, key) ? _get_json(json, key, T; msg) : default end +# _get_json_convert is a _get_json-type helper that also does some sort of type conversion +# parsing etc. The general signature is the following: +# +# function _get_json_convert( +# json::Dict, key::AbstractString, ::Type{T}; +# msg::Union{AbstractString, Nothing}=nothing +# )::T +# +# Although in practice we implement for each type separately, since the parsing/conversion logic +# can vary dramatically. +# +# A key point, though, is that it will throw a JuliaHubError if the server response is somehow +# invalid and we can't parse/convert it properly. +function _get_json_convert( + json::Dict, key::AbstractString, ::Type{UUIDs.UUID}; msg::Union{AbstractString, Nothing}=nothing +)::UUIDs.UUID + uuid_str = _get_json(json, key, String; msg) + uuid = tryparse(UUIDs.UUID, uuid_str) + if isnothing(uuid) + errormsg = """ + Invalid JSON returned by the server: `$key` not a valid UUID string. + Server returned '$(uuid_str)'.""" + isnothing(msg) || (errormsg = string(msg, '\n', errormsg)) + throw(JuliaHubError(errormsg)) + end + return uuid +end + """ mutable struct Secret diff --git a/test/datasets.jl b/test/datasets.jl index 2431baee6..301324cc0 100644 --- a/test/datasets.jl +++ b/test/datasets.jl @@ -50,15 +50,34 @@ end "id", "owner", "name", "type", "description", "tags", "downloadURL", "lastModified", "credentials_url", "storage", ) - d = Dict(d0()...) - delete!(d, pname) - # TODO: should not be a KeyError though.. - @test_throws KeyError JuliaHub.Dataset(d) + let d = Dict(d0()...) + delete!(d, pname) + e = @test_throws JuliaHub.JuliaHubError JuliaHub.Dataset(d) + @test startswith( + e.value.msg, + "Invalid JSON returned by the server: `$pname` missing in the response.", + ) + end + # Replace the value with a value that's of incorrect type + let d = Dict(d0()..., pname => missing) + e = @test_throws JuliaHub.JuliaHubError JuliaHub.Dataset(d) + @test startswith( + e.value.msg, + "Invalid JSON returned by the server: `$(pname)` of type `Missing`, expected", + ) + end end # We also need to be able to parse the UUID into UUIDs.UUID let d = Dict(d0()..., "id" => "1234") - # TODO: should not be a ArgumentError though.. - @test_throws ArgumentError JuliaHub.Dataset(d) + @test_throws JuliaHub.JuliaHubError( + "Invalid JSON returned by the server: `id` not a valid UUID string.\nServer returned '1234'." + ) JuliaHub.Dataset(d) + end + # And correctly throw for invalid owner.username + let d = Dict(d0()..., "owner" => nothing) + @test_throws JuliaHub.JuliaHubError( + "Invalid JSON returned by the server: `owner` of type `Nothing`, expected `<: Dict`." + ) JuliaHub.Dataset(d) end # Missing versions list is okay though. We assume that there are no diff --git a/test/mocking.jl b/test/mocking.jl index 1fb00df4d..7353cd9ff 100644 --- a/test/mocking.jl +++ b/test/mocking.jl @@ -725,7 +725,7 @@ function _dataset_json( version_sizes=[], ) zerotime = TimeZones.ZonedDateTime("2022-10-12T05:39:42.906+00:00") - username, dataset = split(dataset_name, '/'; limit=2) + username, dataset = string.(split(dataset_name, '/'; limit=2)) return Dict{String, Any}( "id" => string(uuidhash(dataset_name)), "name" => dataset, diff --git a/test/utils.jl b/test/utils.jl index ad08d96ae..8d004e69b 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -77,3 +77,25 @@ end @test_throws JuliaHub.JuliaHubError JuliaHub._parse_tz("") @test_throws JuliaHub.JuliaHubError JuliaHub._parse_tz("bad-string") end + +@testset "_get_json_convert" begin + @test JuliaHub._get_json_convert( + Dict("id" => "123e4567-e89b-12d3-a456-426614174000"), "id", UUIDs.UUID + ) == UUIDs.UUID("123e4567-e89b-12d3-a456-426614174000") + # Error cases: + @test_throws JuliaHub.JuliaHubError( + "Invalid JSON returned by the server: `id` not a valid UUID string.\nServer returned '123'." + ) JuliaHub._get_json_convert( + Dict("id" => "123"), "id", UUIDs.UUID + ) + @test_throws JuliaHub.JuliaHubError( + "Invalid JSON returned by the server: `id` of type `Int64`, expected `<: String`." + ) JuliaHub._get_json_convert( + Dict("id" => 123), "id", UUIDs.UUID + ) + @test_throws JuliaHub.JuliaHubError( + "Invalid JSON returned by the server: `id` missing in the response.\nKeys present: _id_missing\njson: Dict{String, String} with 1 entry:\n \"_id_missing\" => \"123e4567-e89b-12d3-a456-426614174000\"" + ) JuliaHub._get_json_convert( + Dict("_id_missing" => "123e4567-e89b-12d3-a456-426614174000"), "id", UUIDs.UUID + ) +end From 25c9f8fdf020cc4d18ad188ac6f59a966c8cd9c1 Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Wed, 12 Mar 2025 19:19:43 +1300 Subject: [PATCH 3/6] in mocking, .tags is sometimes Any[] --- src/datasets.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets.jl b/src/datasets.jl index c7146db99..28446c3ed 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -155,7 +155,7 @@ function Dataset(d::Dict) dtype=_get_json(d, "type", AbstractString), description=_get_json(d, "description", AbstractString), size=_get_json(d, "size", Integer), - tags=_get_json(d, "tags", Vector{<:AbstractString}), + tags=_get_json(d, "tags", Vector), _downloadURL=_get_json(d, "downloadURL", AbstractString), _last_modified=_nothing_or(_get_json(d, "lastModified", AbstractString)) do last_modified datetime_utc = Dates.DateTime( From 512d23002a2d4c95d222df3c8065ece627eee751 Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Wed, 12 Mar 2025 19:20:24 +1300 Subject: [PATCH 4/6] only silently capture JuliaHubErrors in datasets() --- src/datasets.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/datasets.jl b/src/datasets.jl index 28446c3ed..e5c353c2e 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -360,6 +360,10 @@ function datasets( end return Dataset(dataset) catch e + # If Dataset() fails due to some unexpected value in one of the dataset JSON objects that + # JuliaHub.jl can not handle, it should only throw a JuliaHubError. So we rethrow on other + # error types, as filtering all of them out could potentially hide JuliaHub.jl bugs. + isa(e, JuliaHubError) || rethrow() @debug "Invalid dataset in GET /datasets response" dataset exception = ( e, catch_backtrace() ) From 9a81327705dae09653d2cd8a5d03888c89f74d6d Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Wed, 12 Mar 2025 19:29:23 +1300 Subject: [PATCH 5/6] add a return --- src/datasets.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets.jl b/src/datasets.jl index e5c353c2e..cdf374358 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -149,7 +149,7 @@ function Dataset(d::Dict) prefix=_get_json(storage_json, "prefix", AbstractString), ) end - Dataset(; + return Dataset(; uuid=_get_json_convert(d, "id", UUIDs.UUID), name, owner, versions, dtype=_get_json(d, "type", AbstractString), From 344be60725a25f69afdcd2e59056cfb359d9716a Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Wed, 12 Mar 2025 19:57:59 +1300 Subject: [PATCH 6/6] don't rely on string rep of a parametric type --- test/datasets.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/datasets.jl b/test/datasets.jl index 301324cc0..f7a1672ef 100644 --- a/test/datasets.jl +++ b/test/datasets.jl @@ -89,9 +89,10 @@ end end # But a bad type is not okay let d = Dict(d0()..., "versions" => 0) - @test_throws JuliaHub.JuliaHubError( - "Invalid JSON returned by the server: `versions` of type `Int64`, expected `<: Vector`." - ) JuliaHub.Dataset(d) + e = @test_throws JuliaHub.JuliaHubError JuliaHub.Dataset(d) + @test startswith( + e.value.msg, "Invalid JSON returned by the server: `versions` of type `Int64`" + ) end end