diff --git a/NAMESPACE b/NAMESPACE
index 4fd12df06..3f33057d5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -6,6 +6,7 @@ S3method(names,duckdb_relation)
 S3method(print,duckdb_explain)
 S3method(print,duckdb_expr)
 S3method(print,duckdb_relation)
+export(create_view)
 export(duckdb)
 export(duckdb_adbc)
 export(duckdb_fetch_arrow)
@@ -17,6 +18,7 @@ export(duckdb_register_arrow)
 export(duckdb_shutdown)
 export(duckdb_unregister)
 export(duckdb_unregister_arrow)
+export(export_parquet)
 export(read_csv_duckdb)
 export(simulate_duckdb)
 export(tbl_file)
@@ -54,5 +56,8 @@ exportMethods(dbWriteTable)
 exportMethods(show)
 import(DBI)
 import(methods)
+importFrom(DBI,dbExecute)
+importFrom(dbplyr,remote_con)
+importFrom(dbplyr,sql_render)
 importFrom(utils,head)
 useDynLib(duckdb, .registration = TRUE)
diff --git a/R/create_view.R b/R/create_view.R
new file mode 100644
index 000000000..2b638a001
--- /dev/null
+++ b/R/create_view.R
@@ -0,0 +1,37 @@
+#' Create or Replace a View from a `tbl` in DuckDB
+#'
+#' This function creates or replaces a view in DuckDB from a `dbplyr`-based `tbl` object.
+#' It converts the lazy query associated with the `tbl` into SQL and defines a named view in the database.
+#'
+#' @param data A `tbl_dbi` object, typically produced by `dplyr::tbl()` or `dbplyr` pipelines.
+#' @param view_name A character string specifying the name of the view to create.
+#'
+#' @return A `tbl` object pointing to the created view, returned invisibly.
+#'
+#' @details
+#' The function uses `CREATE OR REPLACE VIEW`, which means it will overwrite an existing view with the same name.
+#' The view is created in the same DuckDB connection used by the `tbl`; the underlying query is evaluated lazily, so the view stores the SQL definition rather than the data.
+#'
+#' @examples
+#' con <- DBI::dbConnect(duckdb::duckdb())
+#' dplyr::copy_to(con, data.frame(a = 1:3, b = letters[1:3]), "source_table", temporary = TRUE)
+#' data <- dplyr::tbl(con, "source_table") |> dplyr::filter(a > 1)
+#' create_view(data, "filtered_view")
+#' DBI::dbGetQuery(con, "SELECT * FROM filtered_view")
+#' DBI::dbDisconnect(con, shutdown = TRUE)
+#'
+#' @importFrom DBI dbExecute
+#' @importFrom dbplyr remote_con sql_render
+#' @export
+create_view <- function(data, view_name) {
+  if (!inherits(data, "tbl_dbi")) stop("'data' must be a 'tbl_dbi' object.")
+
+  con <- dbplyr::remote_con(data)
+  sql <- dbplyr::sql_render(data, con = con)
+
+  sql <- sprintf("CREATE OR REPLACE VIEW %s AS %s", DBI::dbQuoteIdentifier(con, view_name), sql)
+
+  DBI::dbExecute(con, sql)
+
+  invisible(dplyr::tbl(con, view_name))
+}
diff --git a/R/export_parquet.R b/R/export_parquet.R
new file mode 100644
index 000000000..7af2a5513
--- /dev/null
+++ b/R/export_parquet.R
@@ -0,0 +1,60 @@
+#' Export a DuckDB table to a Parquet file using COPY TO
+#'
+#' This function exports a `dbplyr`-based table or SQL query to a Parquet file
+#' using DuckDB's native `COPY TO` command.
+#'
+#' @param data A `tbl_dbi` object representing a DuckDB table or query.
+#' @param output Path to the output Parquet file (a single character string).
+#' @param options A named list of key-value COPY options. Values can be character,
+#' numeric, logical, or vectors (which will be converted to tuples).
+#' Examples include `compression = "zstd"` or `ROW_GROUP_SIZE = 1000000`.
+#' See https://duckdb.org/docs/sql/statements/copy.html#parquet-options for details.
+#' @param print_sql If `TRUE`, print the generated `COPY TO` statement before executing it.
+#'
+#' @return Returns the number of rows affected by the `COPY TO` command.
+#' The function will stop with an error if the input types are invalid.
+#'
+#' @details
+#' Option values of length >1 are wrapped in parentheses and comma-separated
+#' (e.g., for `partition_by = c("a", "b")`, DuckDB will receive `PARTITION_BY (a,b)`).
+#'
+#' @examples
+#' con <- DBI::dbConnect(duckdb::duckdb())
+#' DBI::dbWriteTable(con, "iris", iris)
+#' tbl <- dplyr::tbl(con, "iris")
+#' export_parquet(tbl, "iris.parquet", options = list(compression = "zstd"))
+#' export_parquet(tbl, "iris_ds", options = list(partition_by = "Species", row_group_size = 1000))
+#'
+#' @importFrom DBI dbExecute
+#' @importFrom dbplyr remote_con sql_render
+#' @export
+export_parquet <- function(data, output, options = NULL, print_sql = FALSE) {
+  if (!inherits(data, "tbl_dbi")) stop("'data' must be a 'tbl_dbi' object.")
+  if (!is.character(output) || length(output) != 1) stop("'output' must be a single character string.")
+  if (!is.null(options) && !is.list(options)) stop("'options' must be a list or NULL.")
+
+  con <- dbplyr::remote_con(data)
+  sql_query <- dbplyr::sql_render(data, con = con)
+
+  # Normalize and format options
+  if (is.null(options)) options <- list()
+  formatted_options <- format_copy_to_options(options)
+  formatted_options$FORMAT <- 'PARQUET'
+
+  parquet_opts <- paste(paste0(names(formatted_options), " ", formatted_options), collapse = ", ")
+  sql <- sprintf("COPY (%s) TO '%s' (%s)", sql_query, output, parquet_opts)
+  if (isTRUE(print_sql)) message(sql)
+  DBI::dbExecute(con, sql)
+}
+
+
+format_copy_to_options <- function(options) {
+  options <- lapply(options, function(x) {
+    if (!is.logical(x) && !is.character(x) && !is.numeric(x)) stop("All option values must be character, numeric, or logical.")
+    x <- as.character(x)
+
+    if (length(x) > 1) x <- paste0("(", paste0(x, collapse = ","), ")")
+    x
+  })
+  stats::setNames(options, toupper(names(options)))
+}
diff --git a/man/create_view.Rd b/man/create_view.Rd
new file mode 100644
index 000000000..202047b4d
--- /dev/null
+++ b/man/create_view.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_view.R
+\name{create_view}
+\alias{create_view}
+\title{Create or Replace a View from a \code{tbl} in DuckDB}
+\usage{
+create_view(data, view_name)
+}
+\arguments{
+\item{data}{A \code{tbl_dbi} object, typically produced by \code{dplyr::tbl()} or \code{dbplyr} pipelines.}
+
+\item{view_name}{A character string specifying the name of the view to create.}
+}
+\value{
+A \code{tbl} object pointing to the created view, returned invisibly.
+}
+\description{
+This function creates or replaces a view in DuckDB from a \code{dbplyr}-based \code{tbl} object.
+It converts the lazy query associated with the \code{tbl} into SQL and defines a named view in the database.
+}
+\details{
+The function uses \verb{CREATE OR REPLACE VIEW}, which means it will overwrite an existing view with the same name.
+The view is created in the same DuckDB connection used by the \code{tbl}; the underlying query is evaluated lazily, so the view stores the SQL definition rather than the data.
+} +\examples{ + con <- DBI::dbConnect(duckdb::duckdb()) + copy_to(con, tibble(a = 1:3, b = letters[1:3]), "source_table", temporary = TRUE) + data <- dplyr::tbl(con, "source_table") \%>\% dplyr::filter(a > 1) + create_view(data, "filtered_view") + DBI::dbGetQuery(con, "SELECT * FROM filtered_view") + DBI::dbDisconnect(con, shutdown = TRUE) + +} diff --git a/man/export_parquet.Rd b/man/export_parquet.Rd new file mode 100644 index 000000000..19d97e211 --- /dev/null +++ b/man/export_parquet.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/export_parquet.R +\name{export_parquet} +\alias{export_parquet} +\title{Export a DuckDB table to a Parquet file using COPY TO} +\usage{ +export_parquet(data, output, options = NULL, print_sql = FALSE) +} +\arguments{ +\item{data}{A \code{tbl_dbi} object representing a DuckDB table or query.} + +\item{output}{Path to the output Parquet file (a single character string).} + +\item{options}{A named list of key-value COPY options. Values can be character, +numeric, logical, or vectors (which will be converted to tuples). +Examples include \code{compression = "zstd"} or \code{ROW_GROUP_SIZE = 1000000}. +see https://duckdb.org/docs/sql/statements/copy.html#parquet-options for details.} +} +\value{ +Invisibly returns the number of rows affected by the \verb{COPY TO} command. +The function will stop with an error if the input types are invalid. +} +\description{ +This function exports a \code{dbplyr}-based table or SQL query to a Parquet file +using DuckDB's native \verb{COPY TO} command. +} +\details{ +Option values of length >1 are wrapped in parentheses and comma-separated +(e.g., for \code{columns = c("a", "b")}, DuckDB will receive \code{COLUMNS (a,b)}). +} +\examples{ +con <- DBI::dbConnect(duckdb::duckdb()) +DBI::dbWriteTable(con, "iris", iris) +tbl <- dplyr::tbl(con, "iris") +export_parquet(tbl, "iris.parquet", options = list(compression = "zstd")) +export_parquet(tbl, "iris_ds", options = list(partition_by = "Species", row_group_size = 1000)) + +} diff --git a/tests/testthat/test-create_view.R b/tests/testthat/test-create_view.R new file mode 100644 index 000000000..03e19ac1e --- /dev/null +++ b/tests/testthat/test-create_view.R @@ -0,0 +1,52 @@ +test_that("create_view creates a view with expected content", { + con <- dbConnect(duckdb::duckdb()) + on.exit(dbDisconnect(con, shutdown = TRUE), add = TRUE) + + df <- data.frame(x = 1:5, y = letters[1:5]) + copy_to(con, df, "original_table", temporary = TRUE) + + data <- tbl(con, "original_table") %>% filter(x > 3) + create_view(data, "view_test") + + result <- dbReadTable(con, "view_test") + expect_equal(nrow(result), 2) + expect_equal(result$x, c(4, 5)) + + result <- tbl(con, "view_test") |> dplyr::collect() + expect_equal(nrow(result), 2) + expect_equal(result$x, c(4, 5)) +}) + +test_that("create_view replaces an existing view", { + con <- dbConnect(duckdb::duckdb()) + on.exit(dbDisconnect(con, shutdown = TRUE), add = TRUE) + + df <- data.frame(a = 1:2) + copy_to(con, df, "table1", temporary = TRUE) + data1 <- tbl(con, "table1") + create_view(data1, "replace_view") + + df2 <- tibble(a = 10:12) + copy_to(con, df2, "table2", temporary = TRUE) + data2 <- tbl(con, "table2") + create_view(data2, "replace_view") # Should replace + + result <- dbReadTable(con, "replace_view") + expect_equal(nrow(result), 3) + expect_equal(result$a, 10:12) +}) + +test_that("create_view works with quoted view names", { + con <- dbConnect(duckdb::duckdb()) + on.exit(dbDisconnect(con, 
+
+  df <- data.frame(id = 1:3)
+  copy_to(con, df, "quoted_table", temporary = TRUE)
+  data <- tbl(con, "quoted_table")
+
+  create_view(data, "weird-Name With Space")
+
+  result <- dbGetQuery(con, 'SELECT * FROM "weird-Name With Space"')
+  expect_equal(nrow(result), 3)
+})
+
diff --git a/tests/testthat/test-export_parquet.R b/tests/testthat/test-export_parquet.R
new file mode 100644
index 000000000..e69dd2c09
--- /dev/null
+++ b/tests/testthat/test-export_parquet.R
@@ -0,0 +1,44 @@
+test_that("export_parquet writes a valid Parquet file", {
+  withr::with_tempfile("parquet_file", fileext = ".parquet", {
+    con <- dbConnect(duckdb::duckdb())
+    on.exit(dbDisconnect(con, shutdown = TRUE), add = TRUE)
+
+    df <- data.frame(x = 1:3, y = letters[1:3])
+    copy_to(con, df, "test_table", temporary = TRUE)
+
+    data <- tbl(con, "test_table")
+    export_parquet(data, parquet_file)
+
+    expect_true(file.exists(parquet_file))
+  })
+})
+
+test_that("export_parquet allows options", {
+  withr::with_tempfile("parquet_file", fileext = ".parquet", {
+    con <- dbConnect(duckdb::duckdb())
+    on.exit(dbDisconnect(con, shutdown = TRUE), add = TRUE)
+
+    df <- data.frame(a = 1:5, b = 1:5)
+    copy_to(con, df, "table_opt", temporary = TRUE)
+    data <- tbl(con, "table_opt")
+
+    expect_silent(export_parquet(data, parquet_file, list(compression = "zstd", row_group_size = 1000)))
+    expect_true(file.exists(parquet_file))
+
+  })
+})
+
+test_that("export_parquet fails cleanly when the output path is invalid", {
+  con <- dbConnect(duckdb::duckdb())
+  on.exit(dbDisconnect(con, shutdown = TRUE), add = TRUE)
+
+  df <- data.frame(z = 1:2)
+  copy_to(con, df, "bad_path_table", temporary = TRUE)
+  data <- tbl(con, "bad_path_table")
+
+  expect_error(
+    export_parquet(data, "/nonexistent/path/file.parquet"),
+    "IO Error|Failed to open"
+  )
+})
+
diff --git a/vignettes/.gitignore b/vignettes/.gitignore
new file mode 100644
index 000000000..097b24163
--- /dev/null
+++ b/vignettes/.gitignore
@@ -0,0 +1,2 @@
+*.html
+*.R
diff --git a/vignettes/exporting-and-materializing-data.Rmd b/vignettes/exporting-and-materializing-data.Rmd
new file mode 100644
index 000000000..a7d9a3996
--- /dev/null
+++ b/vignettes/exporting-and-materializing-data.Rmd
@@ -0,0 +1,156 @@
+---
+title: "Exporting and materializing data"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Exporting and materializing data}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+library(DBI)
+library(dplyr)
+library(duckdb)
+```
+
+## Overview
+
+This vignette demonstrates how to use two utility functions from this package, `export_parquet()` and `create_view()`, together with `compute()` from `dbplyr`, to interact efficiently with **DuckDB** using `dplyr`-style workflows:
+
+- `export_parquet()` exports a DuckDB-backed table or query to a Parquet file.
+- `create_view()` creates or replaces a DuckDB **view** from a `dbplyr` lazy table.
+- `compute()` materializes a `dbplyr` lazy table into a DuckDB table.
+
+These functions are useful when working with in-database analytics or for building reproducible pipelines that persist intermediate results.
+
+---
+
+## Creating a DuckDB connection
+
+Let’s begin by creating an in-memory DuckDB database and loading some example data.
+
+```{r}
+con <- dbConnect(duckdb())
+dbWriteTable(con, "mtcars", mtcars)
+
+# Reference it with dplyr
+tbl_mtcars <- tbl(con, "mtcars")
+```
+
+---
+
+## Creating a view with `create_view()`
+
+You can define a new **view** using a filtered or transformed version of a table:
+
+```{r}
+tbl_mpg <- tbl_mtcars %>%
+  filter(mpg > 25) %>%
+  select(mpg, cyl, gear)
+
+my_view <- create_view(tbl_mpg, "mtcars_high_mpg")
+my_view
+```
+
+Let’s check that the view now exists:
+
+```{r}
+dbListObjects(con)
+```
+
+You can query it directly as a new `tbl`:
+
+```{r}
+tbl(con, "mtcars_high_mpg")
+```
+
+---
+
+## Exporting to a Parquet file
+
+You can use `export_parquet()` to persist the result of any DuckDB query or table to disk:
+
+```{r}
+parquet_file <- tempfile(fileext = ".parquet")
+
+export_parquet(
+  tbl_mtcars %>% filter(gear == 4),
+  output = parquet_file,
+  options = list(compression = "zstd", ROW_GROUP_SIZE = 10000),
+  print_sql = TRUE
+)
+
+file.exists(parquet_file)
+```
+
+This file can now be reused elsewhere, even outside of R (e.g., in Python, Apache Arrow, or cloud environments).
+
+---
+
+## Materializing queries in a DuckDB table with `compute()`
+
+The [`compute()`](https://dbplyr.tidyverse.org/reference/collapse.tbl_sql.html) function from `dbplyr` can also be used to materialize a lazy query result into a temporary table. Unlike `create_view()`, which creates a named view, `compute()` generates a temporary table with an auto-generated name in the DuckDB backend.
+
+This is useful when you want to persist intermediate results **without naming the object**, or when your workflow involves chained operations that would benefit from reducing query plan complexity.
+
+```{r}
+materialized_tbl <- tbl_mtcars %>%
+  filter(mpg > 25, gear == 4) %>%
+  compute()
+
+materialized_tbl
+```
+
+By materializing the data, you ensure that heavy transformations (e.g., joins or filters) are computed once and reused efficiently in subsequent steps.
+
+**Note:** The resulting temporary table only lives for the duration of the session, and its auto-generated name is not meant to be referenced directly.
+
+If you need persistence, you can also create a named, non-temporary table:
+
+```{r}
+materialized_tbl <- tbl_mtcars %>%
+  filter(mpg > 25, gear == 4) %>%
+  compute(name = "my_table", temporary = FALSE)
+
+materialized_tbl
+```
+
+You can query it directly as a new `tbl`:
+
+```{r}
+tbl(con, "my_table")
+```
+
+## Cleanup
+
+```{r}
+dbDisconnect(con, shutdown = TRUE)
+```
+
+---
+
+## Summary
+
+These functions allow you to:
+
+- Materialize intermediate SQL logic as DuckDB views with `create_view()`.
+- Export results to efficient Parquet files using `export_parquet()`.
+- Materialize intermediate data as DuckDB tables with `compute()`.
+
+They are particularly useful in workflows that rely on lazy evaluation, large datasets, or integration with downstream systems.
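+
+As a final, optional check, the Parquet file exported above can be queried again from a
+fresh connection using DuckDB's built-in `read_parquet()` table function. This is only a
+sketch (not evaluated here) and assumes `parquet_file`, the temporary path created
+earlier, still exists:
+
+```{r, eval=FALSE}
+con2 <- dbConnect(duckdb())
+# Query the Parquet file in place, without importing it into a table first
+dbGetQuery(con2, sprintf("SELECT cyl, COUNT(*) AS n FROM read_parquet('%s') GROUP BY cyl", parquet_file))
+dbDisconnect(con2, shutdown = TRUE)
+```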