Skip to content

Commit e5b5eba

Browse files
tp2750 and quinnj authored
951 whitespace (#953)
* add option stripwhitespace
* add stripwhitespace to Contexts
* Add tests for stripwhitespace (issue 951)
* Update src/keyworddocs.jl
* Update test/basics.jl

Co-authored-by: Thomas Poulsen <[email protected]>
Co-authored-by: Jacob Quinn <[email protected]>
1 parent cde550c commit e5b5eba

File tree

6 files changed

+26
-9
lines changed

6 files changed

+26
-9
lines changed

src/chunks.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ function Chunks(source::ValidSources;
6060
decimal::Union{UInt8, Char}=UInt8('.'),
6161
truestrings::Union{Vector{String}, Nothing}=TRUE_STRINGS,
6262
falsestrings::Union{Vector{String}, Nothing}=FALSE_STRINGS,
63+
stripwhitespace::Bool=false,
6364
# type options
6465
type=nothing,
6566
types=nothing,
@@ -76,7 +77,7 @@ function Chunks(source::ValidSources;
7677
validate=true,
7778
)
7879

79-
ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, nothing, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, false)
80+
ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, nothing, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, stripwhitespace, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, false)
8081
!ctx.threaded && throw(ArgumentError("unable to iterate chunks from input file source"))
8182
foreach(col -> col.lock = ReentrantLock(), ctx.columns)
8283
return Chunks(ctx)

src/context.jl

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ function Context(source::ValidSources;
159159
decimal::Union{UInt8, Char}=UInt8('.'),
160160
truestrings::Union{Vector{String}, Nothing}=TRUE_STRINGS,
161161
falsestrings::Union{Vector{String}, Nothing}=FALSE_STRINGS,
162+
stripwhitespace::Bool=false,
162163
# type options
163164
type=nothing,
164165
types=nothing,
@@ -174,7 +175,7 @@ function Context(source::ValidSources;
174175
parsingdebug::Bool=false,
175176
validate::Bool=true,
176177
)
177-
return @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, threaded, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, false)
178+
return @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, threaded, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, stripwhitespace, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, false)
178179
end
179180

180181
@refargs function Context(source::ValidSources,
@@ -213,6 +214,7 @@ end
213214
decimal::Union{UInt8, Char},
214215
truestrings::Union{Nothing, Vector{String}},
215216
falsestrings::Union{Nothing, Vector{String}},
217+
stripwhitespace::Bool,
216218
# type options
217219
type::Union{Nothing, Type},
218220
types::Union{Nothing, Type, AbstractVector, AbstractDict, Function},
@@ -361,14 +363,14 @@ end
361363
d, rowsguess = detectdelimandguessrows(buf, headerpos, datapos, len, oq, eq, cq, cmt, ignoreemptyrows, del)
362364
wh1 = d == UInt(' ') ? 0x00 : wh1
363365
wh2 = d == UInt8('\t') ? 0x00 : wh2
364-
options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, d, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, quoted, parsingdebug)
366+
options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, d, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, quoted, parsingdebug, stripwhitespace)
365367
elseif del isa Char
366368
_, rowsguess = detectdelimandguessrows(buf, headerpos, datapos, len, oq, eq, cq, cmt, ignoreemptyrows)
367-
options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, del, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, quoted, parsingdebug)
369+
options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, del, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, quoted, parsingdebug, stripwhitespace)
368370
d = del
369371
elseif del isa String
370372
_, rowsguess = detectdelimandguessrows(buf, headerpos, datapos, len, oq, eq, cq, cmt, ignoreemptyrows)
371-
options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, del, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, quoted, parsingdebug)
373+
options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, del, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, quoted, parsingdebug, stripwhitespace)
372374
d = del
373375
else
374376
error("invalid delim type")
@@ -467,7 +469,7 @@ end
467469
# devdoc: if we want to add any other column-specific parsing options, this is where we'd add the logic
468470
# e.g. per-column sentinel, decimal, trues, falses, openquotechar, closequotechar, escapechar, etc.
469471
if df !== nothing
470-
columns[i].options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, d, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, true, parsingdebug)
472+
columns[i].options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, d, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, true, parsingdebug, stripwhitespace)
471473
end
472474
end
473475
validate && checkinvalidcolumns(dateformat, "dateformat", ncols, names)

src/file.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ function File(source::ValidSources;
196196
decimal::Union{UInt8, Char}=UInt8('.'),
197197
truestrings::Union{Vector{String}, Nothing}=TRUE_STRINGS,
198198
falsestrings::Union{Vector{String}, Nothing}=FALSE_STRINGS,
199+
stripwhitespace::Bool=false,
199200
# type options
200201
type=nothing,
201202
types=nothing,
@@ -217,7 +218,7 @@ function File(source::ValidSources;
217218
# dateformats=nothing;decimal=UInt8('.');truestrings=nothing;falsestrings=nothing;type=nothing;types=nothing;typemap=Dict{Type,Type}();
218219
# pool=CSV.DEFAULT_POOL;downcast=false;lazystrings=false;stringtype=String;strict=false;silencewarnings=false;maxwarnings=100;debug=false;parsingdebug=false;buffer_in_memory=false
219220
# @descend CSV.Context(CSV.Arg(source), CSV.Arg(header), CSV.Arg(normalizenames), CSV.Arg(datarow), CSV.Arg(skipto), CSV.Arg(footerskip), CSV.Arg(transpose), CSV.Arg(comment), CSV.Arg(ignoreemptyrows), CSV.Arg(ignoreemptylines), CSV.Arg(select), CSV.Arg(drop), CSV.Arg(limit), CSV.Arg(buffer_in_memory), CSV.Arg(threaded), CSV.Arg(ntasks), CSV.Arg(tasks), CSV.Arg(rows_to_check), CSV.Arg(lines_to_check), CSV.Arg(missingstrings), CSV.Arg(missingstring), CSV.Arg(delim), CSV.Arg(ignorerepeated), CSV.Arg(quoted), CSV.Arg(quotechar), CSV.Arg(openquotechar), CSV.Arg(closequotechar), CSV.Arg(escapechar), CSV.Arg(dateformat), CSV.Arg(dateformats), CSV.Arg(decimal), CSV.Arg(truestrings), CSV.Arg(falsestrings), CSV.Arg(type), CSV.Arg(types), CSV.Arg(typemap), CSV.Arg(pool), CSV.Arg(downcast), CSV.Arg(lazystrings), CSV.Arg(stringtype), CSV.Arg(strict), CSV.Arg(silencewarnings), CSV.Arg(maxwarnings), CSV.Arg(debug), CSV.Arg(parsingdebug), CSV.Arg(false))
220-
ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, threaded, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, false)
221+
ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, threaded, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, stripwhitespace, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, false)
221222
return File(ctx)
222223
end
223224

src/keyworddocs.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ const KEYWORD_DOCS = """
2929
* `dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}`: a date format string to indicate how Date/DateTime columns are formatted for the entire file; if given as an `AbstractDict`, date format strings to indicate how the Date/DateTime columns corresponding to the keys are formatted. The Dict can map column index `Int`, or name `Symbol` or `String` to the format string for that column.
3030
* `decimal='.'`: a `Char` indicating how decimals are separated in floats, i.e. `3.14` uses `'.'`, or `3,14` uses a comma `','`
3131
* `truestrings`, `falsestrings`: `Vector{String}`s that indicate how `true` or `false` values are represented; by default `"true", "True", "TRUE", "T", "1"` are used to detect `true` and `"false", "False", "FALSE", "F", "0"` are used to detect `false`; note that columns with only `1` and `0` values will default to `Int64` column type unless explicitly requested to be `Bool` via `types` keyword argument
32+
* `stripwhitespace=false`: if `true`, leading and trailing whitespace is stripped from string values, including column names
3233
3334
## Column Type Options:
3435
@@ -46,4 +47,4 @@ const KEYWORD_DOCS = """
4647
## Iteration options:
4748
4849
* `reusebuffer=false`: [only supported by `CSV.Rows`] while iterating, whether a single row buffer should be allocated and reused on each iteration; only use if each row will be iterated once and not re-used (e.g. it's not safe to use this option if doing `collect(CSV.Rows(file))` because only current iterated row is "valid")
49-
"""
50+
"""

src/rows.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ function Rows(source::ValidSources;
104104
decimal::Union{UInt8, Char}=UInt8('.'),
105105
truestrings::Union{Vector{String}, Nothing}=TRUE_STRINGS,
106106
falsestrings::Union{Vector{String}, Nothing}=FALSE_STRINGS,
107+
stripwhitespace::Bool=false,
107108
# type options
108109
type=nothing,
109110
types=nothing,
@@ -120,7 +121,7 @@ function Rows(source::ValidSources;
120121
validate::Bool=true,
121122
reusebuffer::Bool=false,
122123
)
123-
ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, nothing, nothing, nothing, 0, nothing, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, true)
124+
ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, nothing, nothing, nothing, 0, nothing, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, stripwhitespace, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, true)
124125
foreach(col -> col.pool = 0.0, ctx.columns)
125126
allocate!(ctx.columns, 1)
126127
values = all(x->x.type === ctx.stringtype && x.anymissing, ctx.columns) && ctx.stringtype === PosLenString ? Vector{PosLen}(undef, ctx.cols) : Vector{Any}(undef, ctx.cols)

test/basics.jl

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,4 +758,15 @@ f = CSV.File(IOBuffer("a,a_1,a\n"))
758758
f = CSV.File(IOBuffer("a,a,a_1\n")) # this case is not covered in test_duplicate_columnnames.csv
759759
@test f.names == [:a, :a_2, :a_1]
760760

761+
# issue 951: `stripwhitespace` keyword
762+
data = """
763+
| Name | Zip |
764+
| Joe | 123 |
765+
| Mary Anne | 1234 |
766+
"""
767+
f = CSV.File(IOBuffer(data); delim='|', normalizenames=true, stripwhitespace=false)
768+
@test f.Name[1] == " Joe "
769+
f = CSV.File(IOBuffer(data); delim='|', stripwhitespace=true)
770+
@test f.Name[2] == "Mary Anne"
771+
761772
end

0 commit comments

Comments
 (0)