Skip to content

Commit f5a0caf

Browse files
thisisnicjonkeane
authored andcommitted
ARROW-15743: [R] skip not connected up to skip_rows on open_dataset despite error messages indicating otherwise
Closes #12523 from thisisnic/ARROW-15743_skip_rows Authored-by: Nic Crane <[email protected]> Signed-off-by: Jonathan Keane <[email protected]>
1 parent 28b7725 commit f5a0caf

File tree

2 files changed

+69
-15
lines changed

2 files changed

+69
-15
lines changed

r/R/dataset-format.R

Lines changed: 54 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ CsvFileFormat$create <- function(...,
122122
opts = csv_file_format_parse_options(...),
123123
convert_options = csv_file_format_convert_opts(...),
124124
read_options = csv_file_format_read_opts(...)) {
125-
125+
check_csv_file_format_args(...)
126126
# Evaluate opts first to catch any unsupported arguments
127127
force(opts)
128128

@@ -170,23 +170,30 @@ CsvFileFormat$create <- function(...,
170170
dataset___CsvFileFormat__Make(opts, convert_options, read_options)
171171
}
172172

173-
# Support both readr-style option names and Arrow C++ option names
174-
csv_file_format_parse_options <- function(...) {
173+
# Check all arguments are valid
174+
check_csv_file_format_args <- function(...) {
175175
opts <- list(...)
176176
# Filter out arguments meant for CsvConvertOptions/CsvReadOptions
177-
convert_opts <- names(formals(CsvConvertOptions$create))
178-
read_opts <- names(formals(CsvReadOptions$create))
179-
opts[convert_opts] <- NULL
180-
opts[read_opts] <- NULL
181-
opts[["schema"]] <- NULL
177+
convert_opts <- c(names(formals(CsvConvertOptions$create)))
178+
179+
read_opts <- c(names(formals(CsvReadOptions$create)), "skip")
180+
181+
# We only currently support all of the readr options for parseoptions
182+
parse_opts <- c(
183+
names(formals(CsvParseOptions$create)),
184+
names(formals(readr_to_csv_parse_options))
185+
)
186+
182187
opt_names <- names(opts)
188+
183189
# Catch any readr-style options specified with full option names that are
184190
# supported by read_delim_arrow() (and its wrappers) but are not yet
185191
# supported here
186192
unsup_readr_opts <- setdiff(
187193
names(formals(read_delim_arrow)),
188-
names(formals(readr_to_csv_parse_options))
194+
c(convert_opts, read_opts, parse_opts, "schema")
189195
)
196+
190197
is_unsup_opt <- opt_names %in% unsup_readr_opts
191198
unsup_opts <- opt_names[is_unsup_opt]
192199
if (length(unsup_opts)) {
@@ -199,10 +206,20 @@ csv_file_format_parse_options <- function(...) {
199206
call. = FALSE
200207
)
201208
}
209+
202210
# Catch any options with full or partial names that do not match any of the
203211
# recognized Arrow C++ option names or readr-style option names
204-
arrow_opts <- names(formals(CsvParseOptions$create))
205-
readr_opts <- names(formals(readr_to_csv_parse_options))
212+
arrow_opts <- c(
213+
names(formals(CsvParseOptions$create)),
214+
names(formals(CsvReadOptions$create)),
215+
names(formals(CsvConvertOptions$create)),
216+
"schema"
217+
)
218+
219+
readr_opts <- c(
220+
names(formals(readr_to_csv_parse_options))
221+
)
222+
206223
is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
207224
is_readr_opt <- !is.na(pmatch(opt_names, readr_opts))
208225
unrec_opts <- opt_names[!is_arrow_opt & !is_readr_opt]
@@ -215,6 +232,25 @@ csv_file_format_parse_options <- function(...) {
215232
call. = FALSE
216233
)
217234
}
235+
}
236+
237+
# Support both readr-style option names and Arrow C++ option names
238+
csv_file_format_parse_options <- function(...) {
239+
opts <- list(...)
240+
# Filter out arguments meant for CsvConvertOptions/CsvReadOptions
241+
convert_opts <- names(formals(CsvConvertOptions$create))
242+
read_opts <- c(names(formals(CsvReadOptions$create)), "skip")
243+
opts[convert_opts] <- NULL
244+
opts[read_opts] <- NULL
245+
opts[["schema"]] <- NULL
246+
opt_names <- names(opts)
247+
248+
arrow_opts <- c(names(formals(CsvParseOptions$create)))
249+
readr_opts <- c(names(formals(readr_to_csv_parse_options)))
250+
251+
is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
252+
is_readr_opt <- !is.na(pmatch(opt_names, readr_opts))
253+
218254
# Catch options with ambiguous partial names (such as "del") that make it
219255
# unclear whether the user is specifying Arrow C++ options ("delimiter") or
220256
# readr-style options ("delim")
@@ -229,6 +265,7 @@ csv_file_format_parse_options <- function(...) {
229265
call. = FALSE
230266
)
231267
}
268+
232269
if (any(is_readr_opt)) {
233270
# Catch cases when the user specifies a mix of Arrow C++ options and
234271
# readr-style options
@@ -248,7 +285,7 @@ csv_file_format_convert_opts <- function(...) {
248285
# Filter out arguments meant for CsvParseOptions/CsvReadOptions
249286
arrow_opts <- names(formals(CsvParseOptions$create))
250287
readr_opts <- names(formals(readr_to_csv_parse_options))
251-
read_opts <- names(formals(CsvReadOptions$create))
288+
read_opts <- c(names(formals(CsvReadOptions$create)), "skip")
252289
opts[arrow_opts] <- NULL
253290
opts[readr_opts] <- NULL
254291
opts[read_opts] <- NULL
@@ -374,9 +411,11 @@ FileWriteOptions <- R6Class("FileWriteOptions",
374411
if (length(unsupported_passed_args) > 0) {
375412
err_header <- paste0(
376413
oxford_paste(unsupported_passed_args, quote_symbol = "`"),
377-
ngettext(length(unsupported_passed_args),
378-
" is not a valid argument ",
379-
" are not valid arguments "),
414+
ngettext(
415+
length(unsupported_passed_args),
416+
" is not a valid argument ",
417+
" are not valid arguments "
418+
),
380419
"for your chosen `format`."
381420
)
382421
err_info <- NULL

r/tests/testthat/test-dataset-csv.R

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,3 +367,18 @@ test_that("Error if read_options$column_names and schema-names differ (ARROW-147
367367
"`column_names` and `schema` field names match but are not in the same order"
368368
)
369369
})
370+
371+
test_that("skip argument in open_dataset", {
372+
tbl <- df1[, c("int", "dbl")]
373+
374+
header_csv_dir <- make_temp_dir()
375+
write.table(tbl, file.path(header_csv_dir, "file1.csv"), sep = ",", row.names = FALSE)
376+
377+
ds <- open_dataset(
378+
header_csv_dir,
379+
format = "csv",
380+
schema = schema(int = int32(), dbl = float64()),
381+
skip = 1
382+
)
383+
expect_equal(collect(ds), tbl)
384+
})

0 commit comments

Comments
 (0)