Skip to content

Support schema.fieldsMatch #216

@peterdesmet

Description

@peterdesmet

CHANGELOG: https://datapackage.org/overview/changelog/#schemafieldsmatch-new

Design decisions

  • The df returns columns in the order of the schema, not the csv file. This helps align files; the schema is the canonical truth.
  • The df returns columns only from the csv file, not all columns defined in the schema. Full schema dfs might be useful in some cases, but I think that should be handled after a df is loaded (e.g. bind_rows()), to avoid additional complexity in read_resource().
  • Column order and selection should be done via dplyr::select() on the returned df. While some of this could be done via vroom(col_select=), that doesn't allow comparing schema fields with csv fields before selecting, resulting in a vroom_select() error that is difficult to capture. By not using vroom(col_select=), that parameter can continue to be reserved for user input.

Example CSV

b_text a_int c_date
<string> <integer> <date>
text 1 2000-01-01

exact match

Note that when not provided, schema.fieldsMatch defaults to exact, which is the v1 behaviour.

Expected:

b_text a_int c_date
<string> <integer> <date>
text 1 2000-01-01
library(vroom)
# Inline csv: one header row and one data row
csv_text <- I("b_text,a_int,c_date\ntext,1,2000-01-01")
# exact: column types carry no names, so they are matched to the csv
# columns by position
col_spec <- cols(
  col_character(),
  col_big_integer(),
  col_date(),
  .default = col_character()
)
vroom(file = csv_text, col_types = col_spec)
#> # A tibble: 1 × 3
#>   b_text   a_int c_date    
#>   <chr>  <int64> <date>    
#> 1 text         1 2000-01-01

Created on 2025-09-22 with reprex v2.1.1

equal match

Expected:

a_int b_text c_date
<integer> <string> <date>
1 text 2000-01-01
library(vroom)
library(dplyr, warn.conflicts = FALSE)
csv <- I("b_text,a_int,c_date\ntext,1,2000-01-01")
schema <- list(
  # equal: match on names, schema has the same number of fields as the csv
  "a_int" = col_big_integer(),
  "b_text" = col_character(),
  "c_date" = col_date(), # Parse as date, in line with the expected <date> type
  .default = col_character()
)
df <- vroom(csv, col_types = schema)
# Return cols in schema order (the csv order is b_text, a_int, c_date)
schema_cols <- c("a_int", "b_text", "c_date")
select(df, all_of(schema_cols))
#> # A tibble: 1 × 3
#>     a_int b_text c_date    
#>   <int64> <chr>  <date>    
#> 1       1 text   2000-01-01

Created on 2025-09-22 with reprex v2.1.1

subset match (csv may have more)

Expected:

a_int b_text c_date
<integer> <string> <string>!
1 text 2000-01-01
library(vroom)
library(dplyr, warn.conflicts = FALSE)
csv <- I("b_text,a_int,c_date\ntext,1,2000-01-01")
# subset: match on names, the schema defines fewer fields than the csv has
schema <- list(
  "a_int" = col_big_integer(),
  "b_text" = col_character(),
  .default = col_character() # Type used for csv columns not in the schema
)
df <- vroom(csv, col_types = schema)
schema_cols <- c("a_int", "b_text")
# Schema cols first (in schema order), then the undefined csv cols in their
# original order
select(df, all_of(schema_cols), everything())
#> # A tibble: 1 × 3
#>     a_int b_text c_date    
#>   <int64> <chr>  <chr>     
#> 1       1 text   2000-01-01

Created on 2025-09-22 with reprex v2.1.1

superset match (schema may have more)

Expected:

a_int b_text
<integer> <string>
1 text
library(vroom)
library(dplyr, warn.conflicts = FALSE)
csv <- I("b_text,a_int,c_date\ntext,1,2000-01-01")
# superset: match on names, the schema defines more fields than the csv has
schema <- list(
  "a_int" = col_big_integer(),
  "b_text" = col_character(),
  "d_double" = col_double(),
  .default = col_character() # Undefined csv cols are dropped again below
)
# Suppress the warning about d_double not matching any csv column
df <- suppressWarnings(vroom(csv, col_types = schema))
schema_cols <- c("a_int", "b_text", "d_double")
# Keep only the schema cols that are present in df, in schema order;
# csv cols not defined in the schema are not returned
select(df, any_of(schema_cols))
#> # A tibble: 1 × 2
#>     a_int b_text
#>   <int64> <chr> 
#> 1       1 text

Created on 2025-09-22 with reprex v2.1.1

partial (schema and csv may have more)

Expected:

a_int b_text c_date
<integer> <string> <string>!
1 text 2000-01-01
library(vroom)
library(dplyr, warn.conflicts = FALSE)
csv <- I("b_text,a_int,c_date\ntext,1,2000-01-01")
# partial: match on names, both the schema and the csv may have extra fields
schema <- list(
  "a_int" = col_big_integer(),
  "b_text" = col_character(),
  "d_double" = col_double(),
  .default = col_character()
)
# Suppress the warning about d_double not matching any csv column
df <- suppressWarnings(vroom(csv, col_types = schema))
schema_cols <- c("a_int", "b_text", "d_double")
# Schema cols present in df first (in schema order), then the undefined csv
# cols in their original order
select(df, any_of(schema_cols), everything())
#> # A tibble: 1 × 3
#>     a_int b_text c_date    
#>   <int64> <chr>  <chr>     
#> 1       1 text   2000-01-01

Created on 2025-09-22 with reprex v2.1.1

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions