Skip to content

Commit 9d2132f

Browse files
authored
Flexible string-based data addressing with URL-like "dataspec" (#33)
This parses the argument to dataset() with a URL-like scheme, enabling more flexible naming of resources within a given dataset. Like URLs, we use ? and # as the separators that introduce the query and fragment sections. Like URLs, this provides a compact string representation which can be easily communicated between users of a shared dataset to address subresources without the need to create an entirely new and largely duplicated dataset configuration. As in URLs, the query section is to be used during resource resolution; for example, a version number for the dataset can be specified. We choose the standard & and = separators for key-value pairs: name/of/dataset?version=v1. As in URLs, the fragment section refers to a subresource and its interpretation depends on the type of the dataset. (By analogy, the meaning of the fragment of a URL depends on the MIME type received from the server.) For example, a subtree of a BlobTree can be addressed using tree_name#path/to/subtree. The use of the fragment for indexing a BlobTree is implemented here as one handy use case of this syntax.
1 parent 9f1e961 commit 9d2132f

File tree

4 files changed

+124
-18
lines changed

4 files changed

+124
-18
lines changed

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "DataSets"
22
uuid = "c9661210-8a83-48f0-b833-72e62abce419"
33
authors = ["Chris Foster <[email protected]> and contributors"]
4-
version = "0.2.5"
4+
version = "0.2.6"
55

66
[deps]
77
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@@ -17,7 +17,7 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
1717
[compat]
1818
AbstractTrees = "0.3"
1919
ReplMaker = "0.2"
20-
ResourceContexts = "0.1"
20+
ResourceContexts = "0.1,0.2"
2121
TOML = "1"
2222
julia = "1.5"
2323

src/DataSets.jl

Lines changed: 91 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,20 @@ function Base.getproperty(d::DataSet, name::Symbol)
119119
end
120120
end
121121

122+
# Look up an entry of the dataset's configuration by its string key.
function Base.getindex(d::DataSet, name::AbstractString)
    return d.conf[name]
end
123+
# Report whether the dataset's configuration contains the string key `name`.
function Base.haskey(d::DataSet, name::AbstractString)
    return haskey(d.conf, name)
end
124+
125+
# Interpret the fragment section of the dataset's "dataspec" as a
# '/'-separated RelPath.
#
# Returns `nothing` when the dataset has no "dataspec" entry, or when that
# entry carries no "fragment".
function dataspec_fragment_as_path(d::DataSet)
    haskey(d, "dataspec") || return nothing
    frag = get(d.dataspec, "fragment", nothing)
    return isnothing(frag) ? nothing : RelPath(split(frag, '/'))
end
135+
122136
# Compact one-line display: only the name and uuid are shown, the rest of the
# configuration is elided.
function Base.show(io::IO, d::DataSet)
    summary_text = "(name=$(repr(d.name)), uuid=$(repr(d.uuid)), #= … =#)"
    print(io, DataSet, summary_text)
    return nothing
end
@@ -188,21 +202,83 @@ To open a directory as a browsable tree object,
188202
open(BlobTree, dataset("a_tree_example"))
189203
```
190204
"""
191-
function dataset(proj::AbstractDataProject, name::AbstractString)
192-
# Non-fancy search... for now :)
193-
# In the future, we can consider parsing `name` into a dataset prefix and a
194-
# data selector / resource section. Eg a path for BlobTree which gives us a
195-
# SubDataSet
196-
#
197-
# The URN RFC8141 has some good design inspiration here, in particular the
198-
# distinction between r-component and q-component seems relevant:
199-
# * Some parameters may need to be passed to the "resolver" (ie, the data
200-
# storage backend)
201-
# * Some parameters may need to be passed to the dataset itself (eg, a
202-
# relative path within the dataset)
203-
#
204-
# See https://datatracker.ietf.org/doc/html/rfc8141#page-12
205-
return proj[name]
205+
function dataset(proj::AbstractDataProject, spec::AbstractString)
    # `spec` is a URL-like string `name?query#fragment`; the query and
    # fragment sections are optional and come back as `nothing` when absent.
    namestr, query, fragmentstr = _split_dataspec(spec)

    if isnothing(namestr)
        throw(ArgumentError("Invalid dataset specification: $spec"))
    end

    # Named `base` rather than `dataset` so the local does not shadow this
    # function's own name.
    base = proj[namestr]

    # Plain name: hand back the project's dataset untouched.
    if isnothing(query) && isnothing(fragmentstr)
        return base
    end

    # Enhance dataset with "dataspec" holding URL-like fragment & query.
    # Only the sections actually present in `spec` are stored.
    dataspec = Dict{String,Any}()
    if !isnothing(query)
        dataspec["query"] = Dict{String,Any}(query)
    end
    if !isnothing(fragmentstr)
        dataspec["fragment"] = fragmentstr
    end

    # We need to take care here with copy() to avoid modifying the original
    # dataset configuration. Only a top-level key is added, so the copy made
    # here is enough to leave the original untouched.
    conf = copy(base.conf)
    conf["dataspec"] = dataspec

    return DataSet(conf)
end
234+
235+
236+
# Percent-decode a string according to the URI escaping rules.
#
# Each `%XX` escape (two hexadecimal digits, either case) is replaced by the
# byte it encodes; every other character is copied through unchanged. The
# decoded bytes are collected into a `String`, so a multi-byte UTF-8 sequence
# such as `%ce%b1` decodes to a single character. A string containing no `%`
# is returned as-is.
#
# Throws `ArgumentError` if a `%` is not followed by exactly two hexadecimal
# digits, including an escape truncated by the end of the string.
#
# Originally vendored from URIs.jl to avoid depending on that entire package
# for this one function.
function _unescapeuri(str)
    occursin("%", str) || return str
    out = IOBuffer()
    io = IOBuffer(str)
    while !eof(io)
        c = read(io, Char)
        if c == '%'
            # Guard each read: a bare `read` at end of input would surface as an
            # unhelpful EOFError.
            c1 = eof(io) ? nothing : read(io, Char)
            c2 = eof(io) ? nothing : read(io, Char)
            # Check the digits explicitly; `parse` on its own would also accept
            # a leading sign or whitespace (e.g. "%+1").
            if c1 === nothing || c2 === nothing || !isxdigit(c1) || !isxdigit(c2)
                throw(ArgumentError("Invalid percent-escape in URI string: $(repr(str))"))
            end
            write(out, parse(UInt8, string(c1, c2); base=16))
        else
            write(out, c)
        end
    end
    return String(take!(out))
end
256+
257+
# Split a URL-like dataspec string into its name, query and fragment sections.
#
# The accepted syntax is a suffix of URI syntax:
#
#     name/of/dataset?param1=value1&param2=value2#fragment
#
# Returns a tuple `(namestr, query, fragmentstr)` where
#   * `namestr` is the dataset name exactly as written in `spec`;
#   * `query` is a vector of percent-decoded `key => value` pairs, or `nothing`
#     if `spec` has no `?` section. An item without `=` gets the value "",
#     only the first `=` of an item separates key from value, and empty items
#     (as in `a=1&&b=2` or a bare trailing `?`) are skipped;
#   * `fragmentstr` is the percent-decoded fragment, or `nothing` if `spec`
#     has no `#` section.
# If `spec` does not match the syntax, `(nothing, nothing, nothing)` is
# returned.
function _split_dataspec(spec::AbstractString)
    # The name pattern is written so that each input can be matched in only
    # one way: '/'-joined components, each starting with a letter, plus an
    # optional trailing '/'. This accepts the same names as the looser
    # `(?:component/?)+` form while avoiding heavy backtracking on long
    # invalid names.
    m = match(r"
        ^
        ([[:alpha:]][[:alnum:]_]*(?:/[[:alpha:]][[:alnum:]_]*)*/?)  # name - a/b/c
        (?:\?([^#]*))?                                             # query - a=b&c=d
        (?:\#(.*))?                                                # fragment - ...
        $"x,
        spec)
    if isnothing(m)
        return nothing, nothing, nothing
    end
    namestr = m[1]
    rawquery = m[2]
    rawfragment = m[3]

    query = nothing
    if !isnothing(rawquery)
        params = Pair{String,String}[]
        for item in split(rawquery, '&'; keepempty=false)
            # limit=2 keeps any further '=' as part of the value.
            kv = split(item, '='; limit=2)
            key = _unescapeuri(kv[1])
            val = length(kv) == 2 ? _unescapeuri(kv[2]) : ""
            push!(params, String(key) => String(val))
        end
        query = params
    end

    fragmentstr = isnothing(rawfragment) ? nothing : _unescapeuri(rawfragment)

    return namestr, query, fragmentstr
end
207283

208284
function Base.haskey(proj::AbstractDataProject, name::AbstractString)

src/filesystem.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ end
271271
#--------------------------------------------------
272272

273273
# Filesystem storage driver
274-
function connect_filesystem(f, config, _)
274+
function connect_filesystem(f, config, dataset)
275275
path = config["path"]
276276
type = config["type"]
277277
if type == "Blob"
@@ -280,6 +280,10 @@ function connect_filesystem(f, config, _)
280280
elseif type == "BlobTree"
281281
isdir(path) || throw(ArgumentError("$(repr(path)) should be a directory"))
282282
storage = BlobTree(FileSystemRoot(path))
283+
path = dataspec_fragment_as_path(dataset)
284+
if !isnothing(path)
285+
storage = storage[path]
286+
end
283287
else
284288
throw(ArgumentError("DataSet type $type not supported on the filesystem"))
285289
end

test/runtests.jl

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@ end
5757
tree = @! open(tree_data)
5858
@test readdir(tree) == ["1.csv", "2.csv"]
5959
end
60+
61+
blob_in_tree_data = dataset(proj, "a_tree_example#1.csv")
62+
@test open(blob_in_tree_data) isa Blob
63+
@context begin
64+
@test @!(open(String, blob_in_tree_data)) == """Name,Age\n"Aaron",23\n"Harry",42\n"""
65+
end
6066
end
6167

6268
#-------------------------------------------------------------------------------
@@ -101,6 +107,26 @@ end
101107
@test_throws ErrorException DataSets.check_dataset_name("/a/b")
102108
end
103109

110+
@testset "URL-like dataspec parsing" begin
    proj = DataSets.load_project("Data.toml")

    # Shorthand for the "dataspec" entry attached to a resolved dataset.
    dataspec_of(spec) = dataset(proj, spec)["dataspec"]

    # A plain name must not grow a "dataspec" entry.
    @test !haskey(dataset(proj, "a_text_file"), "dataspec")

    # URL-like query, including percent-escaped keys, values and separators
    @test dataspec_of("a_text_file?x=1&yy=2")["query"] == Dict("x"=>"1", "yy"=>"2")
    @test dataspec_of("a_text_file?y%20y=x%20x")["query"] == Dict("y y"=>"x x")
    @test dataspec_of("a_text_file?x=%3d&y=%26")["query"] == Dict("x"=>"=", "y"=>"&")

    # URL-like fragment, including percent-escaped multi-byte UTF-8
    @test dataspec_of("a_text_file#a/b")["fragment"] == "a/b"
    @test dataspec_of("a_text_file#x%20x")["fragment"] == "x x"
    @test dataspec_of("a_text_file#x%ce%b1x")["fragment"] == "xαx"

    # Combined query and fragment
    @test dataspec_of("a_text_file?x=1&yy=2#frag")["query"] == Dict("x"=>"1", "yy"=>"2")
    @test dataspec_of("a_text_file?x=1&yy=2#frag")["fragment"] == "frag"
end
129+
104130
#-------------------------------------------------------------------------------
105131
# Trees
106132
@testset "Temporary trees" begin

0 commit comments

Comments
 (0)