
Commit ea78bbb

Allow Regex keys for selecting columns in types/dateformat/pool keyword (#1016)
* Support `types` being `Dict{Regex}`
* Validate `types` getting `Regex`
* Support `types` being `Dict{Any}` with `Regex` key(s)
* Test `dateformat` now also understands Regex
* Fix typo in test
* Add test for when `Regex` and exact name both match
* Document that Regex can be used to identify columns by name
1 parent b4360cc commit ea78bbb

5 files changed: +60 -6 lines changed


docs/src/examples.md

Lines changed: 3 additions & 0 deletions
@@ -616,6 +616,9 @@ col1,col2,col3,col4,col5,col6,col7
"""
file = CSV.File(IOBuffer(data); types=(i, name) -> i == 1 ? Bool : Int8)
file = CSV.File(IOBuffer(data); types=(i, name) -> name == :col1 ? Bool : Int8)
+ # Alternatively, provide the exact name for the first column and a Regex to match the rest.
+ # Note that an exact column name always takes precedence over a regular expression.
+ file = CSV.File(IOBuffer(data); types=Dict(:col1 => Bool, r"^col\d" => Int8))
```

## [Typemap](@id typemap_example)
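
For reference, a minimal self-contained sketch of the new keyword usage, mirroring the tests added by this commit (column names and data are illustrative):

```julia
using CSV

# The Regex key types every column whose name ends in "_col" as Int16; the
# exact Symbol key :a_col takes precedence over the Regex for that one column.
data = "a_col,b_col,c\n1,2,3.14\n4,5,6.5\n"
f = CSV.File(IOBuffer(data); types=Dict(r"_col$" => Int16, :a_col => Int8))
eltype(f.a_col)  # Int8  -- exact name beats the Regex
eltype(f.b_col)  # Int16 -- matched only by the Regex
eltype(f.c)      # auto-detected (Float64 here)
```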

docs/src/reading.md

Lines changed: 7 additions & 3 deletions
@@ -150,7 +150,7 @@ An ASCII `Char` argument that parsing uses when parsing quoted cells and the `qu

## [`dateformat`](@id dateformat)

- A `String` or `AbstractDict` argument that controls how parsing detects datetime values in the data input. As a single `String` (or `DateFormat`) argument, the same format will be applied to _all_ columns in the file. For columns without type information provided otherwise, parsing will use the provided format string to check if the cell is parseable and if so, will attempt to parse the entire column as the datetime type (`Time`, `Date`, or `DateTime`). By default, if no `dateformat` argument is explicitly provided, parsing will try to detect any of `Time`, `Date`, or `DateTime` types following the standard `Dates.ISOTimeFormat`, `Dates.ISODateFormat`, or `Dates.ISODateTimeFormat` formats, respectively. If a datetime type is provided for a column, (see the [types](@ref types) argument), then the `dateformat` format string needs to match the format of values in that column, otherwise, a warning will be emitted and the value will be replaced with a `missing` value (this behavior is also configurable via the [strict](@ref) and [silencewarnings](@ref strict) arguments). If an `AbstractDict` is provided, different `dateformat` strings can be provided for specific columns; the provided dict can map either an `Integer` for column number, or a `String` or `Symbol` for column name to the dateformat string that should be used for that column. Columns not mapped in the dict argument will use the default format strings mentioned above.
+ A `String` or `AbstractDict` argument that controls how parsing detects datetime values in the data input. As a single `String` (or `DateFormat`) argument, the same format will be applied to _all_ columns in the file. For columns without type information provided otherwise, parsing will use the provided format string to check if the cell is parseable and if so, will attempt to parse the entire column as the datetime type (`Time`, `Date`, or `DateTime`). By default, if no `dateformat` argument is explicitly provided, parsing will try to detect any of `Time`, `Date`, or `DateTime` types following the standard `Dates.ISOTimeFormat`, `Dates.ISODateFormat`, or `Dates.ISODateTimeFormat` formats, respectively. If a datetime type is provided for a column (see the [types](@ref types) argument), then the `dateformat` format string needs to match the format of values in that column, otherwise, a warning will be emitted and the value will be replaced with a `missing` value (this behavior is also configurable via the [strict](@ref) and [silencewarnings](@ref strict) arguments). If an `AbstractDict` is provided, different `dateformat` strings can be provided for specific columns; the provided dict can map an `Integer` for column number, or a `String`, `Symbol`, or `Regex` for column name, to the dateformat string that should be used for that column. Columns not mapped in the dict argument will use the default format strings mentioned above.

### Examples
* [DateFormat](@ref dateformat_example)
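
A minimal sketch of the per-column `dateformat` dict with a `Regex` key, mirroring the test added by this commit (data is illustrative):

```julia
using CSV, Dates

# Both date1 and date2 match r"^date" and are parsed with the mm/dd/yyyy
# format; the time column falls back to the default ISO format detection.
csv = "time,date1,date2\n10:00:00.0,04/16/2020,04/17/2022\n"
f = CSV.File(IOBuffer(csv); dateformat=Dict(r"^date" => "mm/dd/yyyy"))
f[1].date1 == Date(2020, 4, 16)  # true
f[1].date2 == Date(2022, 4, 17)  # true
```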
@@ -171,9 +171,13 @@ These arguments can be provided as `Vector{String}` to specify custom values tha

## [`types`](@id types)

- Argument to control the types of columns that get parsed in the data input. Can be provided as a single `Type`, an `AbstractVector` of types, an `AbstractDict`, or a function. If a single type is provided, like `types=Float64`, then _all_ columns in the data input will be parsed as `Float64`. If a column's value isn't a valid `Float64` value, then a warning will be emitted, unless `silencewarnings=false` is passed, then no warning will be printed. However, if `strict=true` is passed, then an error will be thrown instead, regarldess of the `silencewarnings` argument. The `types` argument can also be provided as an `AbstractVector{Type}`, wherein the length of the vector should match the number of columns in the data input, and each element gives the type of the corresponding column in order. If provided as an `AbstractDict`, then specific columns can have their column type specified, with the key of the dict being an `Integer` for column number, or `String` or `Symbol` for column name, and the dict value being the column type. Unspecified columns will have their column type auto-detected while parsing. A function can also be provided, and should be of the form `(i, name) -> Union{T, Nothing}`, and will be applied to each detected column during initial parsing. Returning `nothing` from the function will result in the column's type being automatically detected during parsing.
+ Argument to control the types of columns that get parsed in the data input. Can be provided as a single `Type`, an `AbstractVector` of types, an `AbstractDict`, or a function.
+ - If a single type is provided, like `types=Float64`, then _all_ columns in the data input will be parsed as `Float64`. If a column's value isn't a valid `Float64` value, then a warning will be emitted, unless `silencewarnings=true` is passed, in which case no warning will be printed. However, if `strict=true` is passed, then an error will be thrown instead, regardless of the `silencewarnings` argument.
+ - If an `AbstractVector{Type}` is provided, then the length of the vector should match the number of columns in the data input, and each element gives the type of the corresponding column in order.
+ - If an `AbstractDict` is provided, then specific columns can have their column type specified, with the key of the dict being an `Integer` for column number, a `String` or `Symbol` for column name, or a `Regex` matching column names, and the dict value being the column type. Unspecified columns will have their column type auto-detected while parsing.
+ - If a function is provided, it should be of the form `(i, name) -> Union{T, Nothing}`, and will be applied to each detected column during initial parsing. Returning `nothing` from the function will result in the column's type being automatically detected during parsing.

- By default, `types=nothing`, which means all column types in the data input will be detected while parsing. Note that it isn't necessary to pass `types=Union{Float64, Missing}` if the data input contains `missing` values. Parsing will detect `missing` values if present, and promote any manually provided column types from the singular (`Float64`) to the missing equivalent (`Union{Float64, Missing}`) automatically. Standard types will be auto-detected in the following order when not otherwise specified: `Int64`, `Float64`, `Date`, `DateTime`, `Time`, `Bool`, `String`.
+ By default, `types=nothing`, which means all column types in the data input will be detected while parsing. Note that it isn't necessary to pass `types=Union{Float64, Missing}` if the data input contains `missing` values. Parsing will detect `missing` values if present, and promote any manually provided column types from the singular (`Float64`) to the missing equivalent (`Union{Float64, Missing}`) automatically. Standard types will be auto-detected in the following order when not otherwise specified: `Int64`, `Float64`, `Date`, `DateTime`, `Time`, `Bool`, `String`.

Non-standard types can be provided, like `Dec64` from the DecFP.jl package, but must support the `Base.tryparse(T, str)` function for parsing a value from a string. This allows, for example, easily defining a custom type, like `struct Float64Array; values::Vector{Float64}; end`, as long as a corresponding `Base.tryparse` definition is defined, like `Base.tryparse(::Type{Float64Array}, str) = Float64Array(map(x -> parse(Float64, x), split(str, ';')))`, where a single cell in the data input is like `1.23;4.56;7.89`.
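
A runnable sketch of the non-standard type hook described in the paragraph above (the struct and `Base.tryparse` method come from that paragraph; the column names, data, and explicit `delim=','` are illustrative, and exact behavior may vary by CSV.jl version):

```julia
using CSV

# A cell like "1.23;4.56;7.89" is parsed into a custom Float64Array via the
# Base.tryparse definition; delim=',' is passed explicitly so the ';' inside
# the cell isn't mistaken for the delimiter.
struct Float64Array
    values::Vector{Float64}
end
Base.tryparse(::Type{Float64Array}, str) =
    Float64Array(map(x -> parse(Float64, x), split(str, ';')))

f = CSV.File(IOBuffer("a,b\n1,1.23;4.56;7.89\n"); delim=',', types=Dict(:b => Float64Array))
f.b[1].values  # [1.23, 4.56, 7.89]
```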

src/context.jl

Lines changed: 3 additions & 2 deletions
@@ -6,7 +6,7 @@ Fields:
* `anymissing`: whether any missing values have been encountered while parsing; if a user provided a type like `Union{Int, Missing}`, we'll set this to `true`, or when `missing` values are encountered while parsing
* `userprovidedtype`: whether the column type was provided by the user or not; this affects whether we'll promote a column's type while parsing, or emit a warning/error depending on `strict` keyword arg
* `willdrop`: whether we'll drop this column from the final columnset; computed from select/drop keyword arguments; this will result in a column type of `HardMissing` while parsing, where an efficient parser is used to "skip" a field w/o allocating any parsed value
- * `pool`: computed from `pool` keyword argument; `true` is `1.0`, `false` is `0.0`, everything else is `Float64(pool)`; once computed, this field isn't mutated at all while parsing; it's used in type detection to determine whether a column will be pooled or not once a type is detected;
+ * `pool`: computed from `pool` keyword argument; `true` is `1.0`, `false` is `0.0`, everything else is `Float64(pool)`; once computed, this field isn't mutated at all while parsing; it's used in type detection to determine whether a column will be pooled or not once a type is detected;
* `columnspecificpool`: if `pool` was provided via Vector or Dict by user, then `true`, other `false`; if `false`, then only string column types will attempt pooling
* `column`: the actual column vector to hold parsed values; field is typed as `AbstractVector` and while parsing, we do switches on `col.type` to assert the column type to make code concretely typed
* `lock`: in multithreaded parsing, we have a top-level set of `Vector{Column}`, then each threaded parsing task makes its own copy to parse its own chunk; when synchronizing column types/pooled refs, the task-local `Column` will `lock(col.lock)` to make changes to the parent `Column`; each task-local `Column` shares the same `lock` of the top-level `Column`
@@ -84,7 +84,8 @@ function checkinvalidcolumns(dict, argname, ncols, names)
if k isa Integer
    (0 < k <= ncols) || throw(ArgumentError("invalid column number provided in `$argname` keyword argument: $k. Column number must be 0 < i <= $ncols as detected in the data. To ignore invalid columns numbers in `$argname`, pass `validate=false`"))
else
-     Symbol(k) in names || throw(ArgumentError("invalid column name provided in `$argname` keyword argument: $k. Valid column names detected in the data are: $names. To ignore invalid columns names in `$argname`, pass `validate=false`"))
+     isvalid = (k isa Regex && any(nm -> contains(string(nm), k), names)) || Symbol(k) in names
+     isvalid || throw(ArgumentError("invalid column name provided in `$argname` keyword argument: $k. Valid column names detected in the data are: $names. To ignore invalid columns names in `$argname`, pass `validate=false`"))
end
end
return
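
The effect of the new validation branch, sketched against a `Regex` key that matches nothing (illustrative data; the `validate=false` escape hatch is the one named in the error message above):

```julia
using CSV, Test

# A Regex key that matches no detected column name fails validation with an
# ArgumentError; passing validate=false skips this check.
data = "a_col,b_col\n1,2\n"
@test_throws ArgumentError CSV.File(IOBuffer(data); types=Dict(r"_column$" => Int16))
CSV.File(IOBuffer(data); types=Dict(r"_column$" => Int16), validate=false)  # no error
```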

src/utils.jl

Lines changed: 26 additions & 1 deletion
@@ -364,7 +364,32 @@ end
getordefault(x::AbstractDict{String}, nm, i, def) = haskey(x, string(nm)) ? x[string(nm)] : def
getordefault(x::AbstractDict{Symbol}, nm, i, def) = haskey(x, nm) ? x[nm] : def
getordefault(x::AbstractDict{Int}, nm, i, def) = haskey(x, i) ? x[i] : def
- getordefault(x::AbstractDict, nm, i, def) = haskey(x, i) ? x[i] : haskey(x, nm) ? x[nm] : haskey(x, string(nm)) ? x[string(nm)] : def
+ function getordefault(x::AbstractDict{Regex}, nm, i, def)
+     for (re, T) in x
+         contains(string(nm), re) && return T
+     end
+     return def
+ end
+ function getordefault(x::AbstractDict, nm, i, def)
+     return if haskey(x, i)
+         x[i]
+     elseif haskey(x, nm)
+         x[nm]
+     elseif haskey(x, string(nm))
+         x[string(nm)]
+     else
+         val = _firstmatch(x, string(nm))
+         val !== nothing ? val : def
+     end
+ end
+
+ # return the first value in `x` with a `key::Regex` that matches on `nm`
+ function _firstmatch(x::AbstractDict, nm::AbstractString)
+     for (k, T) in x
+         k isa Regex && contains(nm, k) && return T
+     end
+     return nothing
+ end

# given a DateFormat, is it meant for parsing Date, DateTime, or Time?
function timetype(df::Parsers.Format)::Union{Type{Date}, Type{Time}, Type{DateTime}}
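
A quick sketch of the lookup order the helpers above implement; `getordefault` is an internal helper, not public API, and the call signature is taken from this diff:

```julia
using CSV

# Exact Integer/Symbol/String keys win, then the first Regex key whose pattern
# matches the column name, then the supplied default.
d = Dict{Any, Any}(r"_col$" => Int16, :a_col => Int8)
CSV.getordefault(d, :a_col, 1, nothing)  # Int8    (exact Symbol key wins)
CSV.getordefault(d, :b_col, 2, nothing)  # Int16   (Regex key matches "b_col")
CSV.getordefault(d, :c, 3, nothing)      # nothing (falls back to the default)
```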

test/basics.jl

Lines changed: 21 additions & 0 deletions
@@ -777,4 +777,25 @@ f = CSV.File(IOBuffer(join((rand(("a,$(rand())", "b,$(rand())")) for _ = 1:10^6)
f = CSV.File(IOBuffer("a\nfalse\n"))
@test eltype(f.a) == Bool

+ # 1014
+ # types is Dict{Regex}
+ data = IOBuffer("a_col,b_col,c,d\n1,2,3.14,hey\n4,2,6.5,hey\n")
+ f = CSV.File(data; types=Dict(r"_col$" => Int16))
+ @test eltype(f.a_col) == Int16
+ @test eltype(f.b_col) == Int16
+ @test_throws ArgumentError CSV.File(data; types=Dict(r"_column$" => Int16))
+ # types is Dict{Any} including `Regex` key
+ f = CSV.File(data; types=Dict(r"_col$" => Int16, "c" => Float16))
+ @test eltype(f.a_col) == Int16
+ @test eltype(f.b_col) == Int16
+ @test eltype(f.c) == Float16
+ # Regex has lower precedence than exact column name/number match
+ f = CSV.File(data; types=Dict(r"_col$" => Int16, :a_col => Int8))
+ @test eltype(f.a_col) == Int8
+ @test eltype(f.b_col) == Int16
+ # dateformat supports Regex
+ f = CSV.File(IOBuffer("time,date1,date2\n10:00:00.0,04/16/2020,04/17/2022\n"); dateformat=Dict(r"^date"=>"mm/dd/yyyy"))
+ @test f[1].date1 == Dates.Date(2020, 4, 16)
+ @test f[1].date2 == Dates.Date(2022, 4, 17)
+
end
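
The commit title also mentions the `pool` keyword. No pool test appears in this diff, so the following is only a sketch assuming the same Regex-key pattern applies there (column names and data are illustrative):

```julia
using CSV

# Pool every column whose name ends in "_id"; other columns use the default
# pooling heuristics.
data = "a_id,b_id,val\nx,y,1.0\nx,z,2.0\n"
f = CSV.File(IOBuffer(data); pool=Dict(r"_id$" => true))
```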
