Skip to content

Commit 3d47734

Browse files
Check for user-given type when encountering extra columns during parsing (#1023)
* Pull out initial type-setting logic into function * Refactor function for initialising cols with types * Re-use type-setting logic when new column found * Split fallback case into specific methods * Add some internal docs/comments * Test `types isa Function` case * fixup! Re-use type-setting logic when new column found * Test `types isa AbstractDict` case * Log message if we expect parsing to fail - We expect parsing to fail in the case edge case where we unexpectedly find an extra column and its type was set via the `types` keyword to be some type `T` which is a type we don't know how to parse (i.e. `T` is not a standard/supported type nor a custom type which we have already compiled a specialised parse method for). - In such a case, log an error-level message noting to inform the user we are in this case and parsing is not expected to work. - In future we could considering trying to support this situation. * fixup! Test `types isa AbstractDict` case * fixup! Log message if we expect parsing to fail
1 parent ea78bbb commit 3d47734

File tree

4 files changed

+106
-55
lines changed

4 files changed

+106
-55
lines changed

src/chunks.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ function Base.iterate(x::Chunks, i=1)
9797
threaded = false
9898
ntasks = 1
9999
limit = typemax(Int)
100-
ctx = Context(x.ctx.transpose, x.ctx.name, names, rowsguess, x.ctx.cols, x.ctx.buf, datapos, len, 1, x.ctx.options, columns, x.ctx.pool, x.ctx.downcast, x.ctx.customtypes, x.ctx.typemap, x.ctx.stringtype, limit, threaded, ntasks, x.ctx.chunkpositions, x.ctx.strict, x.ctx.silencewarnings, x.ctx.maxwarnings, x.ctx.debug, x.ctx.tempfile, x.ctx.streaming)
100+
ctx = Context(x.ctx.transpose, x.ctx.name, names, rowsguess, x.ctx.cols, x.ctx.buf, datapos, len, 1, x.ctx.options, columns, x.ctx.pool, x.ctx.downcast, x.ctx.customtypes, x.ctx.typemap, x.ctx.stringtype, limit, threaded, ntasks, x.ctx.chunkpositions, x.ctx.strict, x.ctx.silencewarnings, x.ctx.maxwarnings, x.ctx.debug, x.ctx.tempfile, x.ctx.streaming, x.ctx.types)
101101
f = File(ctx, true)
102102
return f, i + 1
103103
end

src/context.jl

Lines changed: 74 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ function checkvaliddelim(delim)
7979
"the following delimiters are invalid: '\\r', '\\n', '\\0'"))
8080
end
8181

82-
function checkinvalidcolumns(dict, argname, ncols, names)
82+
function checkinvalidcolumns(dict::AbstractDict, argname, ncols, names)
8383
for (k, _) in dict
8484
if k isa Integer
8585
(0 < k <= ncols) || throw(ArgumentError("invalid column number provided in `$argname` keyword argument: $k. Column number must be 0 < i <= $ncols as detected in the data. To ignore invalid columns numbers in `$argname`, pass `validate=false`"))
@@ -88,11 +88,75 @@ function checkinvalidcolumns(dict, argname, ncols, names)
8888
isvalid || throw(ArgumentError("invalid column name provided in `$argname` keyword argument: $k. Valid column names detected in the data are: $names. To ignore invalid columns names in `$argname`, pass `validate=false`"))
8989
end
9090
end
91-
return
91+
return nothing
9292
end
93+
function checkinvalidcolumns(vec::AbstractVector, argname, ncols, names)
94+
# we generally expect `length(types) == ncols` but still want to support the case where
95+
# an additional column is found later in the file and e.g. has its type given in `types`
96+
length(vec) >= ncols || throw(ArgumentError("provided `$argname::AbstractVector` keyword argument doesn't match detected # of columns: `$(length(vec)) < $ncols`"))
97+
return nothing
98+
end
99+
# if the argument isn't given as an AbstractDict or an AbstractVector then
100+
# we have no way to check it against the number of cols or the names
101+
checkinvalidcolumns(arg::Any, argname, ncols, names) = nothing
93102

94103
@noinline nonconcretetypes(types) = throw(ArgumentError("Non-concrete types passed in `types` keyword argument, please provide concrete types for columns: $types"))
95104

105+
# Create all the `Column`s and keep track of any non-standard eltypes for which we will
106+
# later need to generate specialized parsing methods.
107+
# - `ncols` is the number of columns to create
108+
# - `types` is the user-given input
109+
function initialize_columns(ncols::Int, types, names, args...; validate)
110+
columns = Vector{Column}(undef, ncols)
111+
customtypes = Tuple{}
112+
validate && checkinvalidcolumns(types, "types", ncols, names)
113+
for i = 1:ncols
114+
col = initialize_column(i, types, names, args...)
115+
columns[i] = col
116+
if nonstandardtype(col.type) !== Union{}
117+
customtypes = tupcat(customtypes, nonstandardtype(col.type))
118+
end
119+
end
120+
return columns, customtypes
121+
end
122+
123+
# Create a `Column` with its eltype set using any user-provided types,
124+
# but without yet allocating a vector to hold the parsed results (see `allocate`)
125+
# - `i` is the column number e.g. i=1 for the 1st column.
126+
# - `types` is the user-given input
127+
function initialize_column(i, types::AbstractVector, names, stringtype, streaming::Bool, options)
128+
# we generally expect `length(types) == ncols` but we still want to support the case
129+
# where an additional column is found later in the file and wasn't in `types`
130+
T = i <= length(types) ? types[i] : NeedsTypeDetection
131+
return Column(T, options)
132+
end
133+
134+
function initialize_column(i, types::AbstractDict, names, stringtype, streaming::Bool, options)
135+
defaultT = streaming ? Union{stringtype, Missing} : NeedsTypeDetection
136+
# if an additional column is found while parsing, it will not have a name yet
137+
nm = i <= length(names) ? names[i] : ""
138+
T = getordefault(types, nm, i, defaultT)
139+
col = Column(T, options)
140+
return col
141+
end
142+
143+
function initialize_column(i, types::Function, names, stringtype, streaming::Bool, options)
144+
defaultT = streaming ? Union{stringtype, Missing} : NeedsTypeDetection
145+
# if an additional column is found while parsing, it will not have a name yet
146+
nm = i <= length(names) ? names[i] : ""
147+
T = something(types(i, nm), defaultT)
148+
return Column(T, options)
149+
end
150+
151+
function initialize_column(i, types::Nothing, names, stringtype, streaming::Bool, options)
152+
T = streaming ? Union{stringtype, Missing} : NeedsTypeDetection
153+
return Column(T, options)
154+
end
155+
156+
function initialize_column(i, types::Type, names, stringtype, streaming::Bool, options)
157+
return Column(types, options)
158+
end
159+
96160
struct Context
97161
transpose::Bool
98162
name::String
@@ -120,6 +184,11 @@ struct Context
120184
debug::Bool
121185
tempfile::Union{String, Nothing}
122186
streaming::Bool
187+
types::Union{Nothing, Type, AbstractVector, AbstractDict, Function}
188+
end
189+
190+
function initialize_column(i, ctx::Context)
191+
return initialize_column(i, ctx.types, ctx.names, ctx.stringtype, ctx.streaming, ctx.options)
123192
end
124193

125194
# user-facing function if just the context is desired
@@ -402,52 +471,7 @@ end
402471
debug && println("byte position of data computed at: $datapos")
403472

404473
# generate initial columns
405-
# deduce initial column types/flags for parsing based on whether any user-provided types were provided or not
406-
customtypes = Tuple{}
407-
if types isa AbstractVector
408-
length(types) == ncols || throw(ArgumentError("provided `types::AbstractVector` keyword argument doesn't match detected # of columns: `$(length(types)) != $ncols`"))
409-
columns = Vector{Column}(undef, ncols)
410-
for i = 1:ncols
411-
col = Column(types[i], options)
412-
columns[i] = col
413-
if nonstandardtype(col.type) !== Union{}
414-
customtypes = tupcat(customtypes, nonstandardtype(col.type))
415-
end
416-
end
417-
elseif types isa AbstractDict
418-
T = streaming ? Union{stringtype, Missing} : NeedsTypeDetection
419-
columns = Vector{Column}(undef, ncols)
420-
for i = 1:ncols
421-
S = getordefault(types, names[i], i, T)
422-
col = Column(S, options)
423-
columns[i] = col
424-
if nonstandardtype(col.type) !== Union{}
425-
customtypes = tupcat(customtypes, nonstandardtype(col.type))
426-
end
427-
end
428-
validate && checkinvalidcolumns(types, "types", ncols, names)
429-
elseif types isa Function
430-
defaultT = streaming ? Union{stringtype, Missing} : NeedsTypeDetection
431-
columns = Vector{Column}(undef, ncols)
432-
for i = 1:ncols
433-
T = something(types(i, names[i]), defaultT)
434-
col = Column(T, options)
435-
columns[i] = col
436-
if nonstandardtype(col.type) !== Union{}
437-
customtypes = tupcat(customtypes, nonstandardtype(col.type))
438-
end
439-
end
440-
else
441-
T = types === nothing ? (streaming ? Union{stringtype, Missing} : NeedsTypeDetection) : types
442-
if nonstandardtype(T) !== Union{}
443-
customtypes = tupcat(customtypes, nonstandardtype(T))
444-
end
445-
columns = Vector{Column}(undef, ncols)
446-
for i = 1:ncols
447-
col = Column(T, options)
448-
columns[i] = col
449-
end
450-
end
474+
columns, customtypes = initialize_columns(ncols, types, names, stringtype, streaming, options; validate=validate)
451475
if transpose
452476
# set column positions
453477
for i = 1:ncols
@@ -651,6 +675,7 @@ end
651675
maxwarnings,
652676
debug,
653677
tempfile,
654-
streaming
678+
streaming,
679+
types,
655680
)
656681
end

src/file.jl

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -644,7 +644,7 @@ Base.@propagate_inbounds function parserow(startpos, row, numwarnings, ctx::Cont
644644
if customtypes !== Tuple{}
645645
pos, code = parsecustom!(customtypes, buf, pos, len, row, rowoffset, i, col, ctx)
646646
else
647-
error("bad column type: $(type))")
647+
error("Column $i bad column type: `$(type)`")
648648
end
649649
end
650650
if promote_to_string(code)
@@ -684,17 +684,22 @@ Base.@propagate_inbounds function parserow(startpos, row, numwarnings, ctx::Cont
684684
# extra columns on this row, let's widen
685685
ctx.silencewarnings || toomanycolumns(ncols, rowoffset + row)
686686
j = i + 1
687-
T = ctx.streaming ? Union{ctx.stringtype, Missing} : NeedsTypeDetection
688687
while pos <= len && !Parsers.newline(code)
689-
col = Column(T, ctx.options)
688+
col = initialize_column(j, ctx)
690689
col.anymissing = ctx.streaming || rowoffset == 0 && row > 1 # assume all previous rows were missing
691690
col.pool = ctx.pool
691+
T = col.type
692+
# TODO: Support edge case where a custom type was provided for the new column?
693+
# Right now if `T` is a `nonstandardtype` not already in `customtypes`, then
694+
# we won't have a specialised parse method for it, so parsing is expected to fail.
695+
# Only log the error, rather than throw, in case parsing somehow works.
696+
nonstandardtype(T) === Union{} || T in ctx.customtypes.parameters || @error "Parsing extra column with unknown type `$T`. Parsing may fail!"
692697
if T === NeedsTypeDetection
693698
pos, code = detectcell(buf, pos, len, row, rowoffset, j, col, ctx, rowsguess)
694699
else
695700
# need to allocate
696-
col.column = allocate(ctx.stringtype, ctx.rowsguess)
697-
pos, code = parsevalue!(ctx.stringtype, buf, pos, len, row, rowoffset, j, col, ctx)
701+
col.column = allocate(T, ctx.rowsguess)
702+
pos, code = parsevalue!(T, buf, pos, len, row, rowoffset, j, col, ctx)
698703
end
699704
j += 1
700705
push!(columns, col)

test/basics.jl

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -798,4 +798,25 @@ f = CSV.File(IOBuffer("time,date1,date2\n10:00:00.0,04/16/2020,04/17/2022\n"); d
798798
@test f[1].date1 == Dates.Date(2020, 4, 16)
799799
@test f[1].date2 == Dates.Date(2022, 4, 17)
800800

801+
# 1021 - https://github.com/JuliaData/CSV.jl/issues/1021
802+
# user-given types for columns only found later in file
803+
str = """
804+
1 2 3
805+
1 2
806+
1 2 3 4
807+
1
808+
1 2 3 4 5
809+
"""
810+
f = CSV.File(IOBuffer(str); delim=" ", header=false, types=String)
811+
@test String <: eltype(f.Column5)
812+
# case where `types isa AbstractVector`
813+
f = CSV.File(IOBuffer(str); delim=" ", header=false, types=[Int8, Int16, Int32, Int64, Int128])
814+
@test Int128 <: eltype(f.Column5)
815+
# case where `types isa Function`
816+
f = CSV.File(IOBuffer(str); delim=" ", header=false, types=(i,nm) -> (i == 5 ? Int8 : String))
817+
@test Int8 <: eltype(f.Column5)
818+
# case where `types isa AbstractDict`
819+
f = CSV.File(IOBuffer(str); delim=" ", header=false, types=Dict(r".*" => Float16))
820+
@test Float16 <: eltype(f.Column5)
821+
801822
end

0 commit comments

Comments
 (0)