Skip to content

Commit e3c3cdf

Browse files
authored
Add the option to set the quote_delimiter setting when opening a CSV file. (#1121)
* Add the option to set the `quote_char` setting when opening a CSV file.
* Add tests for load_csv/2 with custom quote_char options
* Rename `:quote_char` option to `:quote_delimiter` for CSV functions
1 parent 188050e commit e3c3cdf

File tree

8 files changed

+86
-19
lines changed

8 files changed

+86
-19
lines changed

lib/explorer/backend/data_frame.ex

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ defmodule Explorer.Backend.DataFrame do
6060
columns :: columns_for_io(),
6161
infer_schema_length :: option(integer()),
6262
parse_dates :: boolean(),
63-
eol_delimiter :: option(String.t())
63+
eol_delimiter :: option(String.t()),
64+
quote_delimiter :: option(String.t())
6465
) :: io_result(df)
6566
@callback to_csv(
6667
df,
@@ -91,7 +92,8 @@ defmodule Explorer.Backend.DataFrame do
9192
columns :: columns_for_io(),
9293
infer_schema_length :: option(integer()),
9394
parse_dates :: boolean(),
94-
eol_delimiter :: option(String.t())
95+
eol_delimiter :: option(String.t()),
96+
quote_delimiter :: option(String.t())
9597
) :: io_result(df)
9698

9799
# IO: Parquet

lib/explorer/data_frame.ex

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,9 @@ defmodule Explorer.DataFrame do
602602
* `:encoding` - Encoding to use when reading the file. For now, the only possible values are `utf8` and `utf8-lossy`.
603603
The utf8-lossy option means that invalid utf8 values are replaced with � characters. (default: `"utf8"`)
604604
605+
* `:quote_delimiter` - A single character used for csv quoting. Set to `nil` to turn off special handling and escaping
606+
of quotes. (default: `"\""`)
607+
605608
"""
606609
@doc type: :io
607610
@spec from_csv(filename :: String.t() | fs_entry(), opts :: Keyword.t()) ::
@@ -623,7 +626,8 @@ defmodule Explorer.DataFrame do
623626
columns: nil,
624627
infer_schema_length: @default_infer_schema_length,
625628
parse_dates: false,
626-
eol_delimiter: nil
629+
eol_delimiter: nil,
630+
quote_delimiter: "\""
627631
)
628632

629633
backend = backend_from_options!(backend_opts)
@@ -642,7 +646,8 @@ defmodule Explorer.DataFrame do
642646
to_columns_for_io(opts[:columns]),
643647
opts[:infer_schema_length],
644648
opts[:parse_dates],
645-
opts[:eol_delimiter]
649+
opts[:eol_delimiter],
650+
opts[:quote_delimiter]
646651
]
647652

648653
Shared.apply_init(backend, :from_csv, args, backend_opts)
@@ -804,7 +809,8 @@ defmodule Explorer.DataFrame do
804809
columns: nil,
805810
infer_schema_length: @default_infer_schema_length,
806811
parse_dates: false,
807-
eol_delimiter: nil
812+
eol_delimiter: nil,
813+
quote_delimiter: "\""
808814
)
809815

810816
backend = backend_from_options!(backend_opts)
@@ -822,7 +828,8 @@ defmodule Explorer.DataFrame do
822828
to_columns_for_io(opts[:columns]),
823829
opts[:infer_schema_length],
824830
opts[:parse_dates],
825-
opts[:eol_delimiter]
831+
opts[:eol_delimiter],
832+
opts[:quote_delimiter]
826833
]
827834

828835
Shared.apply_init(backend, :load_csv, args, backend_opts)

lib/explorer/polars_backend/data_frame.ex

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ defmodule Explorer.PolarsBackend.DataFrame do
4949
columns,
5050
infer_schema_length,
5151
parse_dates,
52-
eol_delimiter
52+
eol_delimiter,
53+
quote_delimiter
5354
)
5455
when module in [S3.Entry, HTTP.Entry] do
5556
path = Shared.build_path_for_entry(entry)
@@ -71,7 +72,8 @@ defmodule Explorer.PolarsBackend.DataFrame do
7172
columns,
7273
infer_schema_length,
7374
parse_dates,
74-
eol_delimiter
75+
eol_delimiter,
76+
quote_delimiter
7577
)
7678

7779
File.rm(path)
@@ -93,7 +95,8 @@ defmodule Explorer.PolarsBackend.DataFrame do
9395
columns,
9496
infer_schema_length,
9597
parse_dates,
96-
eol_delimiter
98+
eol_delimiter,
99+
quote_delimiter
97100
) do
98101
infer_schema_length =
99102
if infer_schema_length == nil,
@@ -118,7 +121,8 @@ defmodule Explorer.PolarsBackend.DataFrame do
118121
encoding,
119122
nil_values,
120123
parse_dates,
121-
char_byte(eol_delimiter)
124+
char_byte(eol_delimiter),
125+
char_byte(quote_delimiter)
122126
)
123127

124128
case df do
@@ -200,7 +204,8 @@ defmodule Explorer.PolarsBackend.DataFrame do
200204
columns,
201205
infer_schema_length,
202206
parse_dates,
203-
eol_delimiter
207+
eol_delimiter,
208+
quote_delimiter
204209
) do
205210
infer_schema_length =
206211
if infer_schema_length == nil,
@@ -225,7 +230,8 @@ defmodule Explorer.PolarsBackend.DataFrame do
225230
encoding,
226231
nil_values,
227232
parse_dates,
228-
char_byte(eol_delimiter)
233+
char_byte(eol_delimiter),
234+
char_byte(quote_delimiter)
229235
)
230236

231237
case df do

lib/explorer/polars_backend/lazy_frame.ex

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
136136
_,
137137
_,
138138
_,
139+
_,
139140
_
140141
) do
141142
{:error,
@@ -156,7 +157,8 @@ defmodule Explorer.PolarsBackend.LazyFrame do
156157
columns,
157158
infer_schema_length,
158159
parse_dates,
159-
eol_delimiter
160+
eol_delimiter,
161+
quote_delimiter
160162
)
161163
when is_nil(columns) do
162164
infer_schema_length =
@@ -178,7 +180,8 @@ defmodule Explorer.PolarsBackend.LazyFrame do
178180
encoding,
179181
nil_values,
180182
parse_dates,
181-
char_byte(eol_delimiter)
183+
char_byte(eol_delimiter),
184+
char_byte(quote_delimiter)
182185
)
183186

184187
case result do
@@ -201,6 +204,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
201204
_,
202205
_,
203206
_,
207+
_,
204208
_
205209
) do
206210
{:error,
@@ -311,7 +315,8 @@ defmodule Explorer.PolarsBackend.LazyFrame do
311315
columns,
312316
infer_schema_length,
313317
parse_dates,
314-
eol_delimiter
318+
eol_delimiter,
319+
quote_delimiter
315320
) do
316321
with {:ok, df} <-
317322
Eager.load_csv(
@@ -327,7 +332,8 @@ defmodule Explorer.PolarsBackend.LazyFrame do
327332
columns,
328333
infer_schema_length,
329334
parse_dates,
330-
eol_delimiter
335+
eol_delimiter,
336+
quote_delimiter
331337
) do
332338
{:ok, Eager.lazy(df)}
333339
end

lib/explorer/polars_backend/native.ex

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,8 @@ defmodule Explorer.PolarsBackend.Native do
109109
_encoding,
110110
_nil_vals,
111111
_parse_dates,
112-
_eol_delimiter
112+
_eol_delimiter,
113+
_quote_delimiter
113114
),
114115
do: err()
115116

@@ -145,7 +146,8 @@ defmodule Explorer.PolarsBackend.Native do
145146
_encoding,
146147
_nil_vals,
147148
_parse_dates,
148-
_eol_delimiter
149+
_eol_delimiter,
150+
_quote_delimiter
149151
),
150152
do: err()
151153

@@ -255,7 +257,8 @@ defmodule Explorer.PolarsBackend.Native do
255257
_encoding,
256258
_nil_vals,
257259
_parse_dates,
258-
_eol_delimiter
260+
_eol_delimiter,
261+
_quote_delimiter
259262
),
260263
do: err()
261264

native/explorer/src/dataframe/io.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ pub fn df_from_csv(
4747
null_vals: Vec<String>,
4848
parse_dates: bool,
4949
eol_delimiter: Option<u8>,
50+
quote_delimiter: Option<u8>,
5051
) -> Result<ExDataFrame, ExplorerError> {
5152
let encoding = match encoding {
5253
"utf8-lossy" => CsvEncoding::LossyUtf8,
@@ -71,6 +72,7 @@ pub fn df_from_csv(
7172
.with_parse_options(
7273
CsvParseOptions::default()
7374
.with_encoding(encoding)
75+
.with_quote_char(quote_delimiter)
7476
.with_truncate_ragged_lines(true)
7577
.with_try_parse_dates(parse_dates)
7678
.with_separator(delimiter_as_byte)
@@ -180,6 +182,7 @@ pub fn df_load_csv(
180182
null_vals: Vec<String>,
181183
parse_dates: bool,
182184
eol_delimiter: Option<u8>,
185+
quote_delimiter: Option<u8>,
183186
) -> Result<ExDataFrame, ExplorerError> {
184187
let encoding = match encoding {
185188
"utf8-lossy" => CsvEncoding::LossyUtf8,
@@ -211,6 +214,7 @@ pub fn df_load_csv(
211214
null_vals.iter().map(|x| x.into()).collect(),
212215
)))
213216
.with_try_parse_dates(parse_dates)
217+
.with_quote_char(quote_delimiter)
214218
.with_eol_char(eol_delimiter.unwrap_or(b'\n')),
215219
)
216220
.into_reader_with_file_handle(cursor)

native/explorer/src/lazyframe/io.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ pub fn lf_from_csv(
262262
null_vals: Vec<String>,
263263
parse_dates: bool,
264264
eol_delimiter: Option<u8>,
265+
quote_delimiter: Option<u8>,
265266
) -> Result<ExLazyFrame, ExplorerError> {
266267
let encoding = match encoding {
267268
"utf8-lossy" => CsvEncoding::LossyUtf8,
@@ -283,6 +284,7 @@ pub fn lf_from_csv(
283284
null_vals.iter().map(|x| x.into()).collect(),
284285
)))
285286
.with_eol_char(eol_delimiter.unwrap_or(b'\n'))
287+
.with_quote_char(quote_delimiter)
286288
.finish()?;
287289

288290
Ok(ExLazyFrame::new(df))

test/explorer/data_frame/csv_test.exs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,43 @@ defmodule Explorer.DataFrame.CSVTest do
224224
}
225225
end
226226

227+
test "load_csv/2 quote_delimiter - different quote char" do
228+
data = """
229+
city,lat,lng
230+
'Elgin, Scotland, the UK',57.653484,-3.335724
231+
'Stoke-on-Trent, Staffordshire, the UK',53.002666,-2.179404
232+
'Solihull, Birmingham, UK',52.412811,-1.778197
233+
"""
234+
235+
frame = DF.load_csv!(data, quote_delimiter: "'")
236+
237+
assert DF.n_rows(frame) == 3
238+
assert DF.n_columns(frame) == 3
239+
240+
assert frame["city"][0] == "Elgin, Scotland, the UK"
241+
assert frame["city"][2] == "Solihull, Birmingham, UK"
242+
end
243+
244+
test "load_csv/2 quote_delimiter - no quote char" do
245+
data = """
246+
city;nickname;lat;lng
247+
Elgin, Scotland, the UK;"Little Ireland";57.653484;-3.335724
248+
Stoke-on-Trent, Staffordshire, the UK;nil;53.002666;-2.179404
249+
Solihull, Birmingham, UK;nil;52.412811;-1.778197
250+
"""
251+
252+
frame = DF.load_csv!(data, quote_delimiter: nil, delimiter: ";", nil_values: ["nil"])
253+
254+
assert DF.n_rows(frame) == 3
255+
assert DF.n_columns(frame) == 4
256+
257+
assert frame["city"][0] == "Elgin, Scotland, the UK"
258+
assert frame["city"][2] == "Solihull, Birmingham, UK"
259+
260+
assert frame["nickname"][0] == "\"Little Ireland\""
261+
assert frame["nickname"][1] == nil
262+
end
263+
227264
def assert_csv(type, csv_value, parsed_value, from_csv_options) do
228265
data = "column\n#{csv_value}\n"
229266
# parsing should work as expected

0 commit comments

Comments
 (0)