Skip to content

Commit e5a69cb

Browse files
committed
Add the option to set the quote_char setting when opening a CSV file.
1 parent 0f915d1 commit e5a69cb

File tree

7 files changed, with 49 additions and 19 deletions.

lib/explorer/backend/data_frame.ex

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ defmodule Explorer.Backend.DataFrame do
6060
columns :: columns_for_io(),
6161
infer_schema_length :: option(integer()),
6262
parse_dates :: boolean(),
63-
eol_delimiter :: option(String.t())
63+
eol_delimiter :: option(String.t()),
64+
quote_char :: option(String.t())
6465
) :: io_result(df)
6566
@callback to_csv(
6667
df,
@@ -91,7 +92,8 @@ defmodule Explorer.Backend.DataFrame do
9192
columns :: columns_for_io(),
9293
infer_schema_length :: option(integer()),
9394
parse_dates :: boolean(),
94-
eol_delimiter :: option(String.t())
95+
eol_delimiter :: option(String.t()),
96+
quote_char :: option(String.t())
9597
) :: io_result(df)
9698

9799
# IO: Parquet

lib/explorer/data_frame.ex

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,9 @@ defmodule Explorer.DataFrame do
601601
* `:encoding` - Encoding to use when reading the file. For now, the only possible values are `utf8` and `utf8-lossy`.
602602
The utf8-lossy option means that invalid utf8 values are replaced with � characters. (default: `"utf8"`)
603603
604+
* `:quote_char` - A single character used for CSV quoting. Set to `nil` to turn off special handling and escaping
605+
of quotes. (default: `"\""`)
606+
604607
"""
605608
@doc type: :io
606609
@spec from_csv(filename :: String.t() | fs_entry(), opts :: Keyword.t()) ::
@@ -622,7 +625,8 @@ defmodule Explorer.DataFrame do
622625
columns: nil,
623626
infer_schema_length: @default_infer_schema_length,
624627
parse_dates: false,
625-
eol_delimiter: nil
628+
eol_delimiter: nil,
629+
quote_char: "\""
626630
)
627631

628632
backend = backend_from_options!(backend_opts)
@@ -641,7 +645,8 @@ defmodule Explorer.DataFrame do
641645
to_columns_for_io(opts[:columns]),
642646
opts[:infer_schema_length],
643647
opts[:parse_dates],
644-
opts[:eol_delimiter]
648+
opts[:eol_delimiter],
649+
opts[:quote_char]
645650
]
646651

647652
Shared.apply_init(backend, :from_csv, args, backend_opts)
@@ -803,7 +808,8 @@ defmodule Explorer.DataFrame do
803808
columns: nil,
804809
infer_schema_length: @default_infer_schema_length,
805810
parse_dates: false,
806-
eol_delimiter: nil
811+
eol_delimiter: nil,
812+
quote_char: "\""
807813
)
808814

809815
backend = backend_from_options!(backend_opts)
@@ -821,7 +827,8 @@ defmodule Explorer.DataFrame do
821827
to_columns_for_io(opts[:columns]),
822828
opts[:infer_schema_length],
823829
opts[:parse_dates],
824-
opts[:eol_delimiter]
830+
opts[:eol_delimiter],
831+
opts[:quote_char]
825832
]
826833

827834
Shared.apply_init(backend, :load_csv, args, backend_opts)

lib/explorer/polars_backend/data_frame.ex

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ defmodule Explorer.PolarsBackend.DataFrame do
4949
columns,
5050
infer_schema_length,
5151
parse_dates,
52-
eol_delimiter
52+
eol_delimiter,
53+
quote_char
5354
)
5455
when module in [S3.Entry, HTTP.Entry] do
5556
path = Shared.build_path_for_entry(entry)
@@ -71,7 +72,8 @@ defmodule Explorer.PolarsBackend.DataFrame do
7172
columns,
7273
infer_schema_length,
7374
parse_dates,
74-
eol_delimiter
75+
eol_delimiter,
76+
quote_char
7577
)
7678

7779
File.rm(path)
@@ -93,7 +95,8 @@ defmodule Explorer.PolarsBackend.DataFrame do
9395
columns,
9496
infer_schema_length,
9597
parse_dates,
96-
eol_delimiter
98+
eol_delimiter,
99+
quote_char
97100
) do
98101
infer_schema_length =
99102
if infer_schema_length == nil,
@@ -118,7 +121,8 @@ defmodule Explorer.PolarsBackend.DataFrame do
118121
encoding,
119122
nil_values,
120123
parse_dates,
121-
char_byte(eol_delimiter)
124+
char_byte(eol_delimiter),
125+
char_byte(quote_char)
122126
)
123127

124128
case df do
@@ -200,7 +204,8 @@ defmodule Explorer.PolarsBackend.DataFrame do
200204
columns,
201205
infer_schema_length,
202206
parse_dates,
203-
eol_delimiter
207+
eol_delimiter,
208+
quote_char
204209
) do
205210
infer_schema_length =
206211
if infer_schema_length == nil,
@@ -225,7 +230,8 @@ defmodule Explorer.PolarsBackend.DataFrame do
225230
encoding,
226231
nil_values,
227232
parse_dates,
228-
char_byte(eol_delimiter)
233+
char_byte(eol_delimiter),
234+
char_byte(quote_char)
229235
)
230236

231237
case df do

lib/explorer/polars_backend/lazy_frame.ex

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
136136
_,
137137
_,
138138
_,
139+
_,
139140
_
140141
) do
141142
{:error,
@@ -156,7 +157,8 @@ defmodule Explorer.PolarsBackend.LazyFrame do
156157
columns,
157158
infer_schema_length,
158159
parse_dates,
159-
eol_delimiter
160+
eol_delimiter,
161+
quote_char
160162
)
161163
when is_nil(columns) do
162164
infer_schema_length =
@@ -178,7 +180,8 @@ defmodule Explorer.PolarsBackend.LazyFrame do
178180
encoding,
179181
nil_values,
180182
parse_dates,
181-
char_byte(eol_delimiter)
183+
char_byte(eol_delimiter),
184+
char_byte(quote_char)
182185
)
183186

184187
case result do
@@ -201,6 +204,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
201204
_,
202205
_,
203206
_,
207+
_,
204208
_
205209
) do
206210
{:error,
@@ -311,7 +315,8 @@ defmodule Explorer.PolarsBackend.LazyFrame do
311315
columns,
312316
infer_schema_length,
313317
parse_dates,
314-
eol_delimiter
318+
eol_delimiter,
319+
quote_char
315320
) do
316321
with {:ok, df} <-
317322
Eager.load_csv(
@@ -327,7 +332,8 @@ defmodule Explorer.PolarsBackend.LazyFrame do
327332
columns,
328333
infer_schema_length,
329334
parse_dates,
330-
eol_delimiter
335+
eol_delimiter,
336+
quote_char
331337
) do
332338
{:ok, Eager.lazy(df)}
333339
end

lib/explorer/polars_backend/native.ex

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,8 @@ defmodule Explorer.PolarsBackend.Native do
109109
_encoding,
110110
_nil_vals,
111111
_parse_dates,
112-
_eol_delimiter
112+
_eol_delimiter,
113+
_quote_char
113114
),
114115
do: err()
115116

@@ -145,7 +146,8 @@ defmodule Explorer.PolarsBackend.Native do
145146
_encoding,
146147
_nil_vals,
147148
_parse_dates,
148-
_eol_delimiter
149+
_eol_delimiter,
150+
_quote_char
149151
),
150152
do: err()
151153

@@ -255,7 +257,8 @@ defmodule Explorer.PolarsBackend.Native do
255257
_encoding,
256258
_nil_vals,
257259
_parse_dates,
258-
_eol_delimiter
260+
_eol_delimiter,
261+
_quote_char
259262
),
260263
do: err()
261264

native/explorer/src/dataframe/io.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ pub fn df_from_csv(
4747
null_vals: Vec<String>,
4848
parse_dates: bool,
4949
eol_delimiter: Option<u8>,
50+
quote_char: Option<u8>,
5051
) -> Result<ExDataFrame, ExplorerError> {
5152
let encoding = match encoding {
5253
"utf8-lossy" => CsvEncoding::LossyUtf8,
@@ -71,6 +72,7 @@ pub fn df_from_csv(
7172
.with_parse_options(
7273
CsvParseOptions::default()
7374
.with_encoding(encoding)
75+
.with_quote_char(quote_char)
7476
.with_truncate_ragged_lines(true)
7577
.with_try_parse_dates(parse_dates)
7678
.with_separator(delimiter_as_byte)
@@ -180,6 +182,7 @@ pub fn df_load_csv(
180182
null_vals: Vec<String>,
181183
parse_dates: bool,
182184
eol_delimiter: Option<u8>,
185+
quote_char: Option<u8>,
183186
) -> Result<ExDataFrame, ExplorerError> {
184187
let encoding = match encoding {
185188
"utf8-lossy" => CsvEncoding::LossyUtf8,
@@ -211,6 +214,7 @@ pub fn df_load_csv(
211214
null_vals.iter().map(|x| x.into()).collect(),
212215
)))
213216
.with_try_parse_dates(parse_dates)
217+
.with_quote_char(quote_char)
214218
.with_eol_char(eol_delimiter.unwrap_or(b'\n')),
215219
)
216220
.into_reader_with_file_handle(cursor)

native/explorer/src/lazyframe/io.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ pub fn lf_from_csv(
262262
null_vals: Vec<String>,
263263
parse_dates: bool,
264264
eol_delimiter: Option<u8>,
265+
quote_char: Option<u8>,
265266
) -> Result<ExLazyFrame, ExplorerError> {
266267
let encoding = match encoding {
267268
"utf8-lossy" => CsvEncoding::LossyUtf8,
@@ -283,6 +284,7 @@ pub fn lf_from_csv(
283284
null_vals.iter().map(|x| x.into()).collect(),
284285
)))
285286
.with_eol_char(eol_delimiter.unwrap_or(b'\n'))
287+
.with_quote_char(quote_char)
286288
.finish()?;
287289

288290
Ok(ExLazyFrame::new(df))

0 commit comments

Comments
 (0)