diff --git a/DESCRIPTION b/DESCRIPTION
index 3616a59e5..7fad34ec9 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -53,7 +53,7 @@ VignetteBuilder:
 Config/Needs/dev: devtools, lifecycle, readr, glue, RcppTOML, smvr
 Config/Needs/lint: fs, lintr
 Config/Needs/website: etiennebacher/altdoc, future.apply
-Config/polars/lib-version: 1.7.1-rc.1
+Config/polars/lib-version: 1.9.0-rc.1
 Config/testthat/edition: 3
 Config/testthat/parallel: true
 Config/testthat/start-first: lazyframe-frame, *-s3-base, polars_options,
diff --git a/NEWS.md b/NEWS.md
index c91b7b110..21775f9af 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,6 +2,16 @@
 
 ## polars (development version)
 
+### New features
+
+- `as_polars_series()` on R factors now has an option `factor_as_enum` to convert
+  to a Polars Enum type. This can be set globally (for instance to convert all
+  factors in an R `data.frame` to enums with `as_polars_df()`) with
+  `options(polars.factor_as_enum = TRUE)` (#1723).
+
+- Converting a Polars enum to an R factor now preserves all levels, even if they
+  don't appear in the data (#1723).
+
 ## polars 1.8.0
 
 This is an update that corresponds to Python Polars 1.37.1.
diff --git a/R/as_polars_series.R b/R/as_polars_series.R
index 916872f05..efe3ae3c1 100644
--- a/R/as_polars_series.R
+++ b/R/as_polars_series.R
@@ -253,11 +253,19 @@ as_polars_series.raw <- function(x, name = NULL, ...) {
     wrap()
 }
 
+#' @param factor_as_enum `r lifecycle::badge("experimental")` if `FALSE`
+#' (default), R factors are converted to Polars [Categorical][pl__Categorical].
+#' If `TRUE`, they are converted to Polars [Enum][pl__Enum].
 #' @rdname as_polars_series
 #' @export
-as_polars_series.factor <- function(x, name = NULL, ...) {
+as_polars_series.factor <- function(x, name = NULL, ..., factor_as_enum = FALSE) {
+  # An explicit argument wins; otherwise use the global option.
+  if (missing(factor_as_enum)) {
+    factor_as_enum <- getOption("polars.factor_as_enum", FALSE)
+  }
+
   PlRSeries$new_str(name %||% "", as.character(x))$cast(
-    pl$Categorical()$`_dt`,
+    if (isTRUE(factor_as_enum)) pl$Enum(levels(x))$`_dt` else pl$Categorical()$`_dt`,
     strict = TRUE
   ) |>
     wrap()
diff --git a/R/polars_options.R b/R/polars_options.R
index 18e70e01e..8816b4885 100644
--- a/R/polars_options.R
+++ b/R/polars_options.R
@@ -20,6 +20,9 @@
 #'   `compat_level` argument. See the documentation of those functions for details.
 #' * for all `to_r_vector.*` options, see arguments of [to_r_vector()][series__to_r_vector].
 #' * `df_knitr_print` (TODO: possible values??)
+#' * `factor_as_enum`: if `FALSE` (default), R factors are converted to Polars'
+#'   [Categorical][pl__Categorical]. If `TRUE`, they are converted to Polars'
+#'   [Enum][pl__Enum].
#' #' @return #' `polars_options()` returns a named list where the names are option names and @@ -47,7 +50,8 @@ polars_options <- function() { to_r_vector.decimal = getOption("polars.to_r_vector.decimal", "double"), to_r_vector.as_clock_class = getOption("polars.to_r_vector.as_clock_class", FALSE), to_r_vector.ambiguous = getOption("polars.to_r_vector.ambiguous", "raise"), - to_r_vector.non_existent = getOption("polars.to_r_vector.non_existent", "raise") + to_r_vector.non_existent = getOption("polars.to_r_vector.non_existent", "raise"), + factor_as_enum = getOption("polars.factor_as_enum", FALSE) ) arg_match_compat_level(out[["compat_level"]], arg_nm = "compat_level") @@ -74,6 +78,7 @@ polars_options <- function() { c("raise", "null"), arg_nm = "to_r_vector.non_existent" ) + check_bool(out[["factor_as_enum"]], arg = "factor_as_enum") structure(out, class = "polars_options_list") } diff --git a/R/series-to_r_vector.R b/R/series-to_r_vector.R index f60442feb..a6c727b4f 100644 --- a/R/series-to_r_vector.R +++ b/R/series-to_r_vector.R @@ -15,6 +15,8 @@ #' - Decimal: [double]. #' - String: [character]. #' - Categorical: [factor]. +#' - Enum: [factor], all categories (levels) are preserved, even if unobserved in +#' the data. #' - Date: [Date] or [data.table::IDate][data.table::IDateTime], #' depending on the `date` argument. #' - Time: [hms::hms] or [data.table::ITime][data.table::IDateTime], diff --git a/man/as_polars_series.Rd b/man/as_polars_series.Rd index 56b60c4aa..b2efa375c 100644 --- a/man/as_polars_series.Rd +++ b/man/as_polars_series.Rd @@ -58,7 +58,7 @@ as_polars_series(x, name = NULL, ...) \method{as_polars_series}{raw}(x, name = NULL, ...) -\method{as_polars_series}{factor}(x, name = NULL, ...) +\method{as_polars_series}{factor}(x, name = NULL, ..., factor_as_enum = FALSE) \method{as_polars_series}{Date}(x, name = NULL, ...) 
@@ -117,6 +117,10 @@ When not specified, name is set to an empty string.} \item{...}{Additional arguments passed to the methods.} +\item{factor_as_enum}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} if \code{FALSE} +(default), R factors are converted to Polars \link[=pl__Categorical]{Categorical}. +If \code{TRUE}, they are converted to Polars \link[=pl__Enum]{Enum}.} + \item{strict}{A logical value to indicate whether throwing an error when the input \link{list}'s elements have different data types. If \code{FALSE} (default), all elements are automatically cast to the super type, or, diff --git a/man/polars_options.Rd b/man/polars_options.Rd index b2036d25d..acebf17f6 100644 --- a/man/polars_options.Rd +++ b/man/polars_options.Rd @@ -36,6 +36,9 @@ default value in parenthesis): \code{compat_level} argument. See the documentation of those functions for details. \item for all \verb{to_r_vector.*} options, see arguments of \link[=series__to_r_vector]{to_r_vector()}. \item \code{df_knitr_print} (TODO: possible values??) +\item \code{factor_as_enum}: if \code{FALSE} (default), R factors are converted to Polars' +\link[=pl__Categorical]{Categorical}. If \code{TRUE}, they are converted to Polars' +\link[=pl__Enum]{Enum}. } } \examples{ diff --git a/man/series__to_r_vector.Rd b/man/series__to_r_vector.Rd index 2037eec44..76d2c270a 100644 --- a/man/series__to_r_vector.Rd +++ b/man/series__to_r_vector.Rd @@ -132,6 +132,8 @@ depending on the \code{int64} argument. \item Decimal: \link{double}. \item String: \link{character}. \item Categorical: \link{factor}. +\item Enum: \link{factor}, all categories (levels) are preserved, even if unobserved in +the data. \item Date: \link{Date} or \link[data.table:IDateTime]{data.table::IDate}, depending on the \code{date} argument. 
\item Time: \link[hms:hms]{hms::hms} or \link[data.table:IDateTime]{data.table::ITime}, @@ -143,7 +145,7 @@ depending on the \code{as_clock_class} argument. \item Duration: \link{difftime} or \link[clock:duration-helper]{clock_duration}, depending on the \code{as_clock_class} argument. \item Binary: \link[blob:blob]{blob::blob}. -\item Null: \link[vctrs:unspecified]{vctrs::unspecified}. +\item Null: \link[vctrs:vctrs-unspecified]{vctrs::unspecified}. \item List, Array: \link[vctrs:list_of]{vctrs::list_of}. \item Struct: \link{data.frame} or \link[tibble:tbl_df-class]{tibble}, depending on the \code{struct} argument. } diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index 33678295d..53ddc52a7 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -2277,7 +2277,7 @@ checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" [[package]] name = "r-polars" -version = "1.7.1-rc.1" +version = "1.9.0-rc.1" dependencies = [ "ciborium", "either", diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index c31b5e8e5..2f94b5cfd 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "r-polars" -version = "1.7.1-rc.1" +version = "1.9.0-rc.1" edition = "2024" rust-version = "1.89.0" publish = false diff --git a/src/rust/src/series/export.rs b/src/rust/src/series/export.rs index eaed458bc..8fbc185c0 100644 --- a/src/rust/src/series/export.rs +++ b/src/rust/src/series/export.rs @@ -150,7 +150,7 @@ impl PlRSeries { series.cast(&DataType::Float64).unwrap().f64().unwrap(), ))), DataType::Float64 => Ok(::from(Wrap(series.f64().unwrap()))), - DataType::Categorical(_, _) | DataType::Enum(_, _) => { + DataType::Categorical(_, _) => { let r_func: FunctionSexp = ::from(savvy::eval_parse_text("as.factor")?).try_into()?; let chr_vec = @@ -159,6 +159,36 @@ impl PlRSeries { let _ = args.add("x", chr_vec); Ok(r_func.call(args)?.into()) } + DataType::Enum(_, mapping) => { + // Build factor manually to preserve Enum's level 
order + // Convert to physical representation and cast to u32 + let phys = series.to_physical_repr(); + let phys_u32 = phys.cast(&DataType::UInt32).map_err(RPolarsErr::from)?; + let phys_ca = phys_u32.u32().map_err(RPolarsErr::from)?; + let len = phys_ca.len(); + + // Get physical codes (0-indexed) and convert to R's 1-indexed + let mut codes = OwnedIntegerSexp::new(len)?; + for (i, opt_v) in phys_ca.into_iter().enumerate() { + match opt_v { + Some(v) => codes.set_elt(i, (v + 1) as i32)?, + None => codes.set_na(i)?, + } + } + + // Get categories from Enum mapping and set as levels + let categories = unsafe { + StringChunked::from_chunks( + PlSmallStr::from_static("category"), + vec![mapping.to_arrow(true)], + ) + }; + let levels: Sexp = Wrap(&categories).into(); + codes.set_attrib("levels", levels)?; + codes.set_class(["factor"])?; + + Ok(codes.into()) + } DataType::List(inner) => unsafe { let len = series.len(); let mut list = OwnedListSexp::new(len, false)?; diff --git a/tests/testthat/_snaps/as_polars_series.md b/tests/testthat/_snaps/as_polars_series.md index ef72a9970..d05fb5b39 100644 --- a/tests/testthat/_snaps/as_polars_series.md +++ b/tests/testthat/_snaps/as_polars_series.md @@ -426,6 +426,30 @@ null ] +# use option to convert factor to enum + + Code + as_polars_series(factor(1:2)) + Output + shape: (2,) + Series: '' [enum] + [ + "1" + "2" + ] + +--- + + Code + as_polars_series(factor(1:2), factor_as_enum = FALSE) + Output + shape: (2,) + Series: '' [cat] + [ + "1" + "2" + ] + # as_polars_series.default throws an error Code diff --git a/tests/testthat/_snaps/polars_options.md b/tests/testthat/_snaps/polars_options.md index 88c90871c..3f2370e59 100644 --- a/tests/testthat/_snaps/polars_options.md +++ b/tests/testthat/_snaps/polars_options.md @@ -86,6 +86,14 @@ Error in `polars_options()`: ! `to_r_vector.non_existent` must be one of "raise" or "null", not "foo". 
+# options are validated by polars_options() polars.factor_as_enum + + Code + print(polars_options()) + Condition + Error in `polars_options()`: + ! `factor_as_enum` must be `TRUE` or `FALSE`, not the string "foo". + # options for to_r_vector() works: polars.to_r_vector.uint8 = integer Code diff --git a/tests/testthat/test-as_polars_series.R b/tests/testthat/test-as_polars_series.R index e7bfaf598..a5ab22aa6 100644 --- a/tests/testthat/test-as_polars_series.R +++ b/tests/testthat/test-as_polars_series.R @@ -68,6 +68,16 @@ patrick::with_parameters_test_that( } ) +test_that("use option to convert factor to enum", { + withr::with_options( + list(polars.factor_as_enum = TRUE), + { + expect_snapshot(as_polars_series(factor(1:2))) + expect_snapshot(as_polars_series(factor(1:2), factor_as_enum = FALSE)) + } + ) +}) + test_that("as_polars_series.default throws an error", { x <- 1 class(x) <- "foo" diff --git a/tests/testthat/test-dataframe-s3-base.R b/tests/testthat/test-dataframe-s3-base.R index ffc127ef6..7c3d691b8 100644 --- a/tests/testthat/test-dataframe-s3-base.R +++ b/tests/testthat/test-dataframe-s3-base.R @@ -337,3 +337,36 @@ patrick::with_parameters_test_that( expect_snapshot(dat[[value]], error = TRUE) } ) + +test_that("as.data.frame() and as.list() keep all levels of Enums", { + lev <- c("b", "a", "c") + dat <- pl$DataFrame( + x = factor(c("a", "b"), levels = lev) + )$cast(x = pl$Enum(lev)) + + expect_equal( + dat |> + as.data.frame() |> + getElement("x") |> + levels(), + lev + ) + + # Same with as.list() + expect_equal( + dat |> + as.list(as_series = FALSE) |> + getElement("x") |> + levels(), + lev + ) + + # Changing the levels works + expect_equal( + dat$cast(x = pl$Enum(c("c", "b", "a"))) |> + as.data.frame() |> + getElement("x") |> + levels(), + c("c", "b", "a") + ) +}) diff --git a/tests/testthat/test-polars_options.R b/tests/testthat/test-polars_options.R index 9b72a8cd8..810a367e8 100644 --- a/tests/testthat/test-polars_options.R +++ 
b/tests/testthat/test-polars_options.R @@ -13,6 +13,7 @@ patrick::with_parameters_test_that( "polars.to_r_vector.as_clock_class", "polars.to_r_vector.ambiguous", "polars.to_r_vector.non_existent", + "polars.factor_as_enum", ), { withr::with_options( diff --git a/tools/lib-sums.tsv b/tools/lib-sums.tsv deleted file mode 100644 index d9e47a114..000000000 --- a/tools/lib-sums.tsv +++ /dev/null @@ -1,9 +0,0 @@ -url sha256sum -https://github.com/pola-rs/r-polars/releases/download/lib-v1.7.1-rc.1/libr_polars-1.7.1-rc.1-aarch64-apple-darwin.tar.gz ee64b59078c50a840a6733acdd89dce0f77461640934a0411010b5bb4633eb6e -https://github.com/pola-rs/r-polars/releases/download/lib-v1.7.1-rc.1/libr_polars-1.7.1-rc.1-aarch64-pc-windows-gnullvm.tar.gz 59c155f79fb60ed340609c75e94d557c83aef9dc8f560434205746ad7b14ee5b -https://github.com/pola-rs/r-polars/releases/download/lib-v1.7.1-rc.1/libr_polars-1.7.1-rc.1-aarch64-unknown-linux-gnu.tar.gz 2a62077ef33fad74ed9a209cd877cf72bbf687c2c1ab766598d1726f28467100 -https://github.com/pola-rs/r-polars/releases/download/lib-v1.7.1-rc.1/libr_polars-1.7.1-rc.1-aarch64-unknown-linux-musl.tar.gz 8e95d759baf271e0aa95d9aea2fce7989ebe9ebf60176997ccc0da0c4d382dea -https://github.com/pola-rs/r-polars/releases/download/lib-v1.7.1-rc.1/libr_polars-1.7.1-rc.1-x86_64-apple-darwin.tar.gz a224783bea798004eb0300960e6db60e5e1d63e901a4ca513c50cdf4eef03326 -https://github.com/pola-rs/r-polars/releases/download/lib-v1.7.1-rc.1/libr_polars-1.7.1-rc.1-x86_64-pc-windows-gnu.tar.gz a4dfdf9876023704cb4acaa8d1c7522cbceadab48ca945e2cf5a1e070edffc99 -https://github.com/pola-rs/r-polars/releases/download/lib-v1.7.1-rc.1/libr_polars-1.7.1-rc.1-x86_64-unknown-linux-gnu.tar.gz ecf22bf86965a34433e103f2a57ce60cdc16981d5ea452adfc6c21be96d2a815 -https://github.com/pola-rs/r-polars/releases/download/lib-v1.7.1-rc.1/libr_polars-1.7.1-rc.1-x86_64-unknown-linux-musl.tar.gz 2c50c044ee4513cf7b3c02caa01237f2aada433acac2a9cfc838ef9dc615e729