Skip to content

Commit 3d47734

Browse files
Check for user-given type when encountering extra columns during parsing (#1023)
* Pull out initial type-setting logic into function * Refactor function for initialising cols with types * Re-use type-setting logic when new column found * Split fallback case into specific methods * Add some internal docs/comments * Test `types isa Function` case * fixup! Re-use type-setting logic when new column found * Test `types isa AbstractDict` case * Log message if we expect parsing to fail - We expect parsing to fail in the case edge case where we unexpectedly find an extra column and its type was set via the `types` keyword to be some type `T` which is a type we don't know how to parse (i.e. `T` is not a standard/supported type nor a custom type which we have already compiled a specialised parse method for). - In such a case, log an error-level message noting to inform the user we are in this case and parsing is not expected to work. - In future we could considering trying to support this situation. * fixup! Test `types isa AbstractDict` case * fixup! Log message if we expect parsing to fail
1 parent ea78bbb commit 3d47734

File tree

4 files changed

+106
-55
lines changed

4 files changed

+106
-55
lines changed

src/chunks.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ function Base.iterate(x::Chunks, i=1)
9797
threaded = false
9898
ntasks = 1
9999
limit = typemax(Int)
100-
ctx = Context(x.ctx.transpose, x.ctx.name, names, rowsguess, x.ctx.cols, x.ctx.buf, datapos, len, 1, x.ctx.options, columns, x.ctx.pool, x.ctx.downcast, x.ctx.customtypes, x.ctx.typemap, x.ctx.stringtype, limit, threaded, ntasks, x.ctx.chunkpositions, x.ctx.strict, x.ctx.silencewarnings, x.ctx.maxwarnings, x.ctx.debug, x.ctx.tempfile, x.ctx.streaming)
100+
ctx = Context(x.ctx.transpose, x.ctx.name, names, rowsguess, x.ctx.cols, x.ctx.buf, datapos, len, 1, x.ctx.options, columns, x.ctx.pool, x.ctx.downcast, x.ctx.customtypes, x.ctx.typemap, x.ctx.stringtype, limit, threaded, ntasks, x.ctx.chunkpositions, x.ctx.strict, x.ctx.silencewarnings, x.ctx.maxwarnings, x.ctx.debug, x.ctx.tempfile, x.ctx.streaming, x.ctx.types)
101101
f = File(ctx, true)
102102
return f, i + 1
103103
end

src/context.jl

Lines changed: 74 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ function checkvaliddelim(delim)
7979
"the following delimiters are invalid: '\\r', '\\n', '\\0'"))
8080
end
8181

82-
function checkinvalidcolumns(dict, argname, ncols, names)
82+
function checkinvalidcolumns(dict::AbstractDict, argname, ncols, names)
8383
for (k, _) in dict
8484
if k isa Integer
8585
(0 < k <= ncols) || throw(ArgumentError("invalid column number provided in `$argname` keyword argument: $k. Column number must be 0 < i <= $ncols as detected in the data. To ignore invalid columns numbers in `$argname`, pass `validate=false`"))
@@ -88,11 +88,75 @@ function checkinvalidcolumns(dict, argname, ncols, names)
8888
isvalid || throw(ArgumentError("invalid column name provided in `$argname` keyword argument: $k. Valid column names detected in the data are: $names. To ignore invalid columns names in `$argname`, pass `validate=false`"))
8989
end
9090
end
91-
return
91+
return nothing
9292
end
93+
function checkinvalidcolumns(vec::AbstractVector, argname, ncols, names)
94+
# we generally expect `length(types) == ncols` but still want to support the case where
95+
# an additional column is found later in the file and e.g. has its type given in `types`
96+
length(vec) >= ncols || throw(ArgumentError("provided `$argname::AbstractVector` keyword argument doesn't match detected # of columns: `$(length(vec)) < $ncols`"))
97+
return nothing
98+
end
99+
# if the argument isn't given as an AbstractDict or an AbstractVector then
100+
# we have no way to check it against the number of cols or the names
101+
checkinvalidcolumns(arg::Any, argname, ncols, names) = nothing
93102

94103
@noinline nonconcretetypes(types) = throw(ArgumentError("Non-concrete types passed in `types` keyword argument, please provide concrete types for columns: $types"))
95104

105+
# Create all the `Column`s and keep track of any non-standard eltypes for which we will
106+
# later need to generate specialized parsing methods.
107+
# - `ncols` is the number of columns to create
108+
# - `types` is the user-given input
109+
function initialize_columns(ncols::Int, types, names, args...; validate)
110+
columns = Vector{Column}(undef, ncols)
111+
customtypes = Tuple{}
112+
validate && checkinvalidcolumns(types, "types", ncols, names)
113+
for i = 1:ncols
114+
col = initialize_column(i, types, names, args...)
115+
columns[i] = col
116+
if nonstandardtype(col.type) !== Union{}
117+
customtypes = tupcat(customtypes, nonstandardtype(col.type))
118+
end
119+
end
120+
return columns, customtypes
121+
end
122+
123+
# Create a `Column` with its eltype set using any user-provided types,
124+
# but without yet allocating a vector to hold the parsed results (see `allocate`)
125+
# - `i` is the column number e.g. i=1 for the 1st column.
126+
# - `types` is the user-given input
127+
function initialize_column(i, types::AbstractVector, names, stringtype, streaming::Bool, options)
128+
# we generally expect `length(types) == ncols` but we still want to support the case
129+
# where an additional column is found later in the file and wasn't in `types`
130+
T = i <= length(types) ? types[i] : NeedsTypeDetection
131+
return Column(T, options)
132+
end
133+
134+
function initialize_column(i, types::AbstractDict, names, stringtype, streaming::Bool, options)
135+
defaultT = streaming ? Union{stringtype, Missing} : NeedsTypeDetection
136+
# if an additional column is found while parsing, it will not have a name yet
137+
nm = i <= length(names) ? names[i] : ""
138+
T = getordefault(types, nm, i, defaultT)
139+
col = Column(T, options)
140+
return col
141+
end
142+
143+
function initialize_column(i, types::Function, names, stringtype, streaming::Bool, options)
144+
defaultT = streaming ? Union{stringtype, Missing} : NeedsTypeDetection
145+
# if an additional column is found while parsing, it will not have a name yet
146+
nm = i <= length(names) ? names[i] : ""
147+
T = something(types(i, nm), defaultT)
148+
return Column(T, options)
149+
end
150+
151+
function initialize_column(i, types::Nothing, names, stringtype, streaming::Bool, options)
152+
T = streaming ? Union{stringtype, Missing} : NeedsTypeDetection
153+
return Column(T, options)
154+
end
155+
156+
function initialize_column(i, types::Type, names, stringtype, streaming::Bool, options)
157+
return Column(types, options)
158+
end
159+
96160
struct Context
97161
transpose::Bool
98162
name::String
@@ -120,6 +184,11 @@ struct Context
120184
debug::Bool
121185
tempfile::Union{String, Nothing}
122186
streaming::Bool
187+
types::Union{Nothing, Type, AbstractVector, AbstractDict, Function}
188+
end
189+
190+
function initialize_column(i, ctx::Context)
191+
return initialize_column(i, ctx.types, ctx.names, ctx.stringtype, ctx.streaming, ctx.options)
123192
end
124193

125194
# user-facing function if just the context is desired
@@ -402,52 +471,7 @@ end
402471
debug && println("byte position of data computed at: $datapos")
403472

404473
# generate initial columns
405-
# deduce initial column types/flags for parsing based on whether any user-provided types were provided or not
406-
customtypes = Tuple{}
407-
if types isa AbstractVector
408-
length(types) == ncols || throw(ArgumentError("provided `types::AbstractVector` keyword argument doesn't match detected # of columns: `$(length(types)) != $ncols`"))
409-
columns = Vector{Column}(undef, ncols)
410-
for i = 1:ncols
411-
col = Column(types[i], options)
412-
columns[i] = col
413-
if nonstandardtype(col.type) !== Union{}
414-
customtypes = tupcat(customtypes, nonstandardtype(col.type))
415-
end
416-
end
417-
elseif types isa AbstractDict
418-
T = streaming ? Union{stringtype, Missing} : NeedsTypeDetection
419-
columns = Vector{Column}(undef, ncols)
420-
for i = 1:ncols
421-
S = getordefault(types, names[i], i, T)
422-
col = Column(S, options)
423-
columns[i] = col
424-
if nonstandardtype(col.type) !== Union{}
425-
customtypes = tupcat(customtypes, nonstandardtype(col.type))
426-
end
427-
end
428-
validate && checkinvalidcolumns(types, "types", ncols, names)
429-
elseif types isa Function
430-
defaultT = streaming ? Union{stringtype, Missing} : NeedsTypeDetection
431-
columns = Vector{Column}(undef, ncols)
432-
for i = 1:ncols
433-
T = something(types(i, names[i]), defaultT)
434-
col = Column(T, options)
435-
columns[i] = col
436-
if nonstandardtype(col.type) !== Union{}
437-
customtypes = tupcat(customtypes, nonstandardtype(col.type))
438-
end
439-
end
440-
else
441-
T = types === nothing ? (streaming ? Union{stringtype, Missing} : NeedsTypeDetection) : types
442-
if nonstandardtype(T) !== Union{}
443-
customtypes = tupcat(customtypes, nonstandardtype(T))
444-
end
445-
columns = Vector{Column}(undef, ncols)
446-
for i = 1:ncols
447-
col = Column(T, options)
448-
columns[i] = col
449-
end
450-
end
474+
columns, customtypes = initialize_columns(ncols, types, names, stringtype, streaming, options; validate=validate)
451475
if transpose
452476
# set column positions
453477
for i = 1:ncols
@@ -651,6 +675,7 @@ end
651675
maxwarnings,
652676
debug,
653677
tempfile,
654-
streaming
678+
streaming,
679+
types,
655680
)
656681
end

src/file.jl

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -644,7 +644,7 @@ Base.@propagate_inbounds function parserow(startpos, row, numwarnings, ctx::Cont
644644
if customtypes !== Tuple{}
645645
pos, code = parsecustom!(customtypes, buf, pos, len, row, rowoffset, i, col, ctx)
646646
else
647-
error("bad column type: $(type))")
647+
error("Column $i bad column type: `$(type)`")
648648
end
649649
end
650650
if promote_to_string(code)
@@ -684,17 +684,22 @@ Base.@propagate_inbounds function parserow(startpos, row, numwarnings, ctx::Cont
684684
# extra columns on this row, let's widen
685685
ctx.silencewarnings || toomanycolumns(ncols, rowoffset + row)
686686
j = i + 1
687-
T = ctx.streaming ? Union{ctx.stringtype, Missing} : NeedsTypeDetection
688687
while pos <= len && !Parsers.newline(code)
689-
col = Column(T, ctx.options)
688+
col = initialize_column(j, ctx)
690689
col.anymissing = ctx.streaming || rowoffset == 0 && row > 1 # assume all previous rows were missing
691690
col.pool = ctx.pool
691+
T = col.type
692+
# TODO: Support edge case where a custom type was provided for the new column?
693+
# Right now if `T` is a `nonstandardtype` not already in `customtypes`, then
694+
# we won't have a specialised parse method for it, so parsing is expected to fail.
695+
# Only log the error, rather than throw, in case parsing somehow works.
696+
nonstandardtype(T) === Union{} || T in ctx.customtypes.parameters || @error "Parsing extra column with unknown type `$T`. Parsing may fail!"
692697
if T === NeedsTypeDetection
693698
pos, code = detectcell(buf, pos, len, row, rowoffset, j, col, ctx, rowsguess)
694699
else
695700
# need to allocate
696-
col.column = allocate(ctx.stringtype, ctx.rowsguess)
697-
pos, code = parsevalue!(ctx.stringtype, buf, pos, len, row, rowoffset, j, col, ctx)
701+
col.column = allocate(T, ctx.rowsguess)
702+
pos, code = parsevalue!(T, buf, pos, len, row, rowoffset, j, col, ctx)
698703
end
699704
j += 1
700705
push!(columns, col)

test/basics.jl

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -798,4 +798,25 @@ f = CSV.File(IOBuffer("time,date1,date2\n10:00:00.0,04/16/2020,04/17/2022\n"); d
798798
@test f[1].date1 == Dates.Date(2020, 4, 16)
799799
@test f[1].date2 == Dates.Date(2022, 4, 17)
800800

801+
# 1021 - https://github.com/JuliaData/CSV.jl/issues/1021
802+
# user-given types for columns only found later in file
803+
str = """
804+
1 2 3
805+
1 2
806+
1 2 3 4
807+
1
808+
1 2 3 4 5
809+
"""
810+
f = CSV.File(IOBuffer(str); delim=" ", header=false, types=String)
811+
@test String <: eltype(f.Column5)
812+
# case where `types isa AbstractVector`
813+
f = CSV.File(IOBuffer(str); delim=" ", header=false, types=[Int8, Int16, Int32, Int64, Int128])
814+
@test Int128 <: eltype(f.Column5)
815+
# case where `types isa Function`
816+
f = CSV.File(IOBuffer(str); delim=" ", header=false, types=(i,nm) -> (i == 5 ? Int8 : String))
817+
@test Int8 <: eltype(f.Column5)
818+
# case where `types isa AbstractDict`
819+
f = CSV.File(IOBuffer(str); delim=" ", header=false, types=Dict(r".*" => Float16))
820+
@test Float16 <: eltype(f.Column5)
821+
801822
end

0 commit comments

Comments
 (0)