diff --git a/Project.toml b/Project.toml
index 45047da..409bf9e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "DataSets"
 uuid = "c9661210-8a83-48f0-b833-72e62abce419"
 authors = ["Chris Foster and contributors"]
-version = "0.2.11"
+version = "0.2.12"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
diff --git a/src/DataSets.jl b/src/DataSets.jl
index 6e4ee01..22dacb9 100644
--- a/src/DataSets.jl
+++ b/src/DataSets.jl
@@ -87,18 +87,39 @@ end
 Check whether a dataset name is valid.
 
 Valid names must start with a letter or a number, the rest of the name can also contain `-`
-and `_` characters. The names can also be hieracicial, with segments separated by forward
-slashes (`/`). Each segment must also start with either a letter or a number. For example:
+and `_` characters. The names can also be hierarchical, with segments separated by forward
+slashes (`/`) or periods (`.`). Each segment must also start with a letter or a number.
+
+For example, the following dataset names are valid:
 
     my_data
     my_data_1
     username/data
     organization_name/project-name/data
     123user/456dataset--name
+    username/my_table.csv
+    dataset/v0.1.2
+
+whereas names like this are invalid:
+
+    __mydata__
+    username/.git
+    my...dataset
+
+!!! note "Segment separators"
+
+    In dataset names, both `/` and `.` are considered segment separators from a syntax
+    perspective. While DataSets.jl does not impose any specific interpretation on the
+    dataset name, it is recommended to use `/` to separate segments from a semantic
+    perspective, and to interpret each forward-slash-separated segment as a path separator.
+    Periods would conventionally be used to separate file extensions within a segment.
+
+    E.g. use `username/my-project-data/population.csv`, rather than
+    `username.my-project-data.population.csv` or something like that.
 """
 function check_dataset_name(name::AbstractString)
     if !occursin(DATASET_NAME_REGEX, name)
-        error("DataSet name \"$name\" is invalid. DataSet names must start with a letter and can contain only letters, numbers, `-`, `_` or `/`.")
+        error("DataSet name \"$name\" is invalid. DataSet names must start with a letter or a number, and can contain only letters, numbers, `-` and `_`, or `/` and `.` as segment separators.")
     end
 end
 # DataSet names disallow most punctuation for now, as it may be needed as
@@ -106,8 +127,9 @@ end
 const DATASET_NAME_REGEX_STRING = raw"""
 [[:alnum:]]
 (?:
-    [-[:alnum:]_]     |
-    / (?=[[:alnum:]])
+    [-[:alnum:]_]      |
+    \.(?=[[:alnum:]])  |
+    \/ (?=[[:alnum:]])
 )*
 """
 const DATASET_NAME_REGEX = Regex("^\n$(DATASET_NAME_REGEX_STRING)\n\$", "x")
diff --git a/test/runtests.jl b/test/runtests.jl
index 98a7966..4c48815 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -98,21 +98,56 @@ end
 end
 
 #-------------------------------------------------------------------------------
+# Read the non-empty, whitespace-stripped lines of a file located next to this test script.
+function load_list(filename)
+    lines = eachline(joinpath(@__DIR__, filename))
+    filter(!isempty, strip.(lines))
+end
 @testset "Data set name parsing" begin
-    @testset "Valid name: $name" for name in (
-            "a_b", "a-b", "a1", "δεδομένα", "a/b", "a/b/c", "a-", "b_",
-            "1", "a/1", "123", "12ab/34cd", "1/2/3", "1-2-3", "x_-__", "a---",
-        )
-        @test DataSets.check_dataset_name(name) === nothing
-        @test DataSets._split_dataspec(name) == (name, nothing, nothing)
+    @testset "Valid names" begin
+        valid_names = load_list("testnames-valid.txt")
+        @test !isempty(valid_names)
+        @testset "Valid name: $name" for name in valid_names
+            @test DataSets.check_dataset_name(name) === nothing
+            @test DataSets._split_dataspec(name) == (name, nothing, nothing)
+            # Also test that the name is still valid when it appears as part of
+            # a longer path.
+            let path_name = "foo/$(name)"
+                @test DataSets.check_dataset_name(path_name) === nothing
+                @test DataSets._split_dataspec(path_name) == (path_name, nothing, nothing)
+            end
+            let path_name = "$(name)/foo"
+                @test DataSets.check_dataset_name(path_name) === nothing
+                @test DataSets._split_dataspec(path_name) == (path_name, nothing, nothing)
+            end
+            let path_name = "foo/$(name)/bar"
+                @test DataSets.check_dataset_name(path_name) === nothing
+                @test DataSets._split_dataspec(path_name) == (path_name, nothing, nothing)
+            end
+        end
     end
 
-    @testset "Invalid name: $name" for name in (
-            "a b", "a.b", "a/b/", "a//b", "/a/b", "a/-", "a/ _/b",
-            "a/-a", "a/-1",
-        )
-        @test_throws ErrorException DataSets.check_dataset_name(name)
-        @test DataSets._split_dataspec(name) == (nothing, nothing, nothing)
+    @testset "Invalid names" begin
+        invalid_names = load_list("testnames-invalid.txt")
+        @test !isempty(invalid_names)
+        @testset "Invalid name: $name" for name in invalid_names
+            @test_throws ErrorException DataSets.check_dataset_name(name)
+            @test DataSets._split_dataspec(name) == (nothing, nothing, nothing)
+            # Also test that the name is still invalid when it appears as part of
+            # a longer path.
+            let path_name = "foo/$(name)"
+                @test_throws ErrorException DataSets.check_dataset_name(path_name)
+                @test DataSets._split_dataspec(path_name) == (nothing, nothing, nothing)
+            end
+            let path_name = "$(name)/foo"
+                @test_throws ErrorException DataSets.check_dataset_name(path_name)
+                @test DataSets._split_dataspec(path_name) == (nothing, nothing, nothing)
+            end
+            let path_name = "foo/$(name)/bar"
+                @test_throws ErrorException DataSets.check_dataset_name(path_name)
+                @test DataSets._split_dataspec(path_name) == (nothing, nothing, nothing)
+            end
+        end
     end
 end
 
diff --git a/test/testnames-invalid.txt b/test/testnames-invalid.txt
new file mode 100644
index 0000000..bf78155
--- /dev/null
+++ b/test/testnames-invalid.txt
@@ -0,0 +1,25 @@
+a b
+a/b/
+a//b
+/a/b
+a/-
+a/ _/b
+a/-a
+a/-1
+.a
+..a
+a.
+a..
+.a.
+a..b
+.abc
+abc.
+abc/.def
+abc/def.
+a./b
+a.-
+_._
+a._b
+a.-b
+./a
+b/../a
diff --git a/test/testnames-valid.txt b/test/testnames-valid.txt
new file mode 100644
index 0000000..53c7bbb
--- /dev/null
+++ b/test/testnames-valid.txt
@@ -0,0 +1,24 @@
+a_b
+a-b
+a1
+δεδομένα
+a/b
+a/b/c
+a-
+b_
+1
+a/1
+123
+12ab/34cd
+1/2/3
+1-2-3
+x_-__
+a---
+a.b
+dataset/v0.1.2
+abc.def
+abc/def.ghi
+abc-def.ghi_jkl
+a.b.c
+a_.c
+foo__-.csv