fix: Fix sample() reporting identical values in the entire column (#338)

Yousa-Mirage · web-flow · commit 8ec5e1c2566b · 2026-02-27T20:41:11.000+01:00
diff --git a/NEWS.md b/NEWS.md
@@ -20,6 +20,11 @@
 * In `arrange()`, if the data was grouped, the order was never maintained even if
   `maintain_order = TRUE` was passed in `group_by()`. This is now fixed (#332).
 
+* When exporting to CSV, `null_values` alone did not apply and could override explicitly
+  provided `null_value`. This is now fixed (@Yousa-Mirage, #334).
+
+* `sample()` no longer fills the entire column with one value when `size` is omitted (@Yousa-Mirage, #338).
+
 # tidypolars 0.17.0
 
 `tidypolars` requires `polars` >= 1.9.0 and `dplyr` >= 1.2.0.
diff --git a/R/funs-default.R b/R/funs-default.R
@@ -355,9 +355,20 @@ pl_round <- function(x, digits = 0, ...) {
 
 pl_sample <- function(x, size = NULL, replace = FALSE, ...) {
   check_empty_dots(...)
-  # TODO: how should I handle seed, given that R sample() doesn't have this arg
+  # WARNING: `set.seed()` has no effect here: Polars does the sampling, not R's RNG.
+  if (missing(size)) {
+    size <- x$len()
+  }
+  if (!is_polars_expr(size)) {
+    if (!is.numeric(size) || size <= 0 || size %% 1 != 0) {
+      cli_abort("{.code size} must be a positive integer.")
+    }
+    size <- as.integer(size)
+  }
+
   out <- x$sample(n = size, with_replacement = replace, shuffle = TRUE)
-  if (is.null(size) || size == 1) {
+
+  if (!is_polars_expr(size) && size == 1L) {
     out <- out$first()
   }
   out
diff --git a/tests/testthat/_snaps/funs_default-lazy.md b/tests/testthat/_snaps/funs_default-lazy.md
@@ -8,6 +8,15 @@
       Caused by error:
       ! lengths don't match: unable to add a column of length 4 to a DataFrame of height 5
 
+# sample() validates size
+
+    Code
+      current$collect()
+    Condition
+      Error in `mutate()`:
+      ! Error while running function `sample()` in Polars.
+      x `size` must be a positive integer.
+
 # seq_len() works
 
     Code
diff --git a/tests/testthat/_snaps/funs_default.md b/tests/testthat/_snaps/funs_default.md
@@ -10,6 +10,15 @@
       Caused by error:
       ! lengths don't match: unable to add a column of length 4 to a DataFrame of height 5
 
+# sample() validates size
+
+    Code
+      mutate(test_pl, y = sample(x, size = 1.5))
+    Condition
+      Error in `mutate()`:
+      ! Error while running function `sample()` in Polars.
+      x `size` must be a positive integer.
+
 # seq_len() works
 
     Code
diff --git a/tests/testthat/test-funs_default-lazy.R b/tests/testthat/test-funs_default-lazy.R
@@ -175,6 +175,96 @@ test_that("round() works", {
   )
 })
 
+test_that("sample() works with default size and n() size", {
+  test_df <- tibble(x = 1:5)
+  test_pl <- as_polars_lf(test_df)
+
+  foo <- test_pl |>
+    mutate(y = sample(x)) |>
+    pull(y)
+  res <- test_df |>
+    mutate(y = sample(x)) |>
+    pull(y)
+
+  expect_equal_lazy(sort(foo), sort(res))
+
+  foo_replace <- test_pl |>
+    mutate(y = sample(x, replace = TRUE)) |>
+    pull(y)
+  res_replace <- test_df |>
+    mutate(y = sample(x, replace = TRUE)) |>
+    pull(y)
+
+  expect_true(all(foo_replace %in% 1:5))
+  expect_true(all(res_replace %in% 1:5))
+
+  foo_1 <- test_pl |>
+    mutate(y = sample(x, size = 1)) |>
+    pull(y)
+  res_1 <- test_df |>
+    mutate(y = sample(x, size = 1)) |>
+    pull(y)
+
+  expect_true(unique(foo_1) %in% 1:5)
+  expect_true(unique(res_1) %in% 1:5)
+
+  foo_n <- test_pl |>
+    mutate(y = sample(x, size = n())) |>
+    pull(y)
+  res_n <- test_df |>
+    mutate(y = sample(x, size = n())) |>
+    pull(y)
+
+  expect_equal_lazy(sort(foo_n), sort(res_n))
+})
+
+test_that("sample() warns on unsupported args", {
+  test_df <- tibble(x = 1:5)
+  test_pl <- as_polars_lf(test_df)
+
+  expect_warning(
+    mutate(test_pl, y = sample(x, prob = 0.5)),
+    "doesn't know how to use some arguments"
+  )
+})
+
+test_that("sample() validates size", {
+  test_df <- tibble(x = 1:5)
+  test_pl <- as_polars_lf(test_df)
+
+  expect_both_error(
+    mutate(test_pl, y = sample(x, size = -1)),
+    mutate(test_df, y = sample(x, size = -1))
+  )
+
+  expect_both_error(
+    mutate(test_pl, y = sample(x, size = 0)),
+    mutate(test_df, y = sample(x, size = 0))
+  )
+
+  expect_both_error(
+    mutate(test_pl, y = sample(x, size = NULL)),
+    mutate(test_df, y = sample(x, size = NULL))
+  )
+
+  expect_both_error(
+    mutate(test_pl, y = sample(x, size = 3)),
+    mutate(test_df, y = sample(x, size = 3))
+  )
+
+  expect_both_error(
+    mutate(test_pl, y = sample(x, size = 100, replace = FALSE)),
+    mutate(test_df, y = sample(x, size = 100, replace = FALSE))
+  )
+
+  # `mutate(test_df, y = sample(x, size = 1.5))` has a weird behavior
+  # when size is a double in [1, 2)
+  expect_snapshot_lazy(
+    mutate(test_pl, y = sample(x, size = 1.5)),
+    error = TRUE
+  )
+})
+
 test_that("stats::lag() is not supported", {
   test_df <- tibble(x = c(10, 20, 30, 40, 10, 20, 30, 40))
   test_pl <- as_polars_lf(test_df)
diff --git a/tests/testthat/test-funs_default.R b/tests/testthat/test-funs_default.R
@@ -171,6 +171,96 @@ test_that("round() works", {
   )
 })
 
+test_that("sample() works with default size and n() size", {
+  test_df <- tibble(x = 1:5)
+  test_pl <- as_polars_df(test_df)
+
+  foo <- test_pl |>
+    mutate(y = sample(x)) |>
+    pull(y)
+  res <- test_df |>
+    mutate(y = sample(x)) |>
+    pull(y)
+
+  expect_equal(sort(foo), sort(res))
+
+  foo_replace <- test_pl |>
+    mutate(y = sample(x, replace = TRUE)) |>
+    pull(y)
+  res_replace <- test_df |>
+    mutate(y = sample(x, replace = TRUE)) |>
+    pull(y)
+
+  expect_true(all(foo_replace %in% 1:5))
+  expect_true(all(res_replace %in% 1:5))
+
+  foo_1 <- test_pl |>
+    mutate(y = sample(x, size = 1)) |>
+    pull(y)
+  res_1 <- test_df |>
+    mutate(y = sample(x, size = 1)) |>
+    pull(y)
+
+  expect_true(unique(foo_1) %in% 1:5)
+  expect_true(unique(res_1) %in% 1:5)
+
+  foo_n <- test_pl |>
+    mutate(y = sample(x, size = n())) |>
+    pull(y)
+  res_n <- test_df |>
+    mutate(y = sample(x, size = n())) |>
+    pull(y)
+
+  expect_equal(sort(foo_n), sort(res_n))
+})
+
+test_that("sample() warns on unsupported args", {
+  test_df <- tibble(x = 1:5)
+  test_pl <- as_polars_df(test_df)
+
+  expect_warning(
+    mutate(test_pl, y = sample(x, prob = 0.5)),
+    "doesn't know how to use some arguments"
+  )
+})
+
+test_that("sample() validates size", {
+  test_df <- tibble(x = 1:5)
+  test_pl <- as_polars_df(test_df)
+
+  expect_both_error(
+    mutate(test_pl, y = sample(x, size = -1)),
+    mutate(test_df, y = sample(x, size = -1))
+  )
+
+  expect_both_error(
+    mutate(test_pl, y = sample(x, size = 0)),
+    mutate(test_df, y = sample(x, size = 0))
+  )
+
+  expect_both_error(
+    mutate(test_pl, y = sample(x, size = NULL)),
+    mutate(test_df, y = sample(x, size = NULL))
+  )
+
+  expect_both_error(
+    mutate(test_pl, y = sample(x, size = 3)),
+    mutate(test_df, y = sample(x, size = 3))
+  )
+
+  expect_both_error(
+    mutate(test_pl, y = sample(x, size = 100, replace = FALSE)),
+    mutate(test_df, y = sample(x, size = 100, replace = FALSE))
+  )
+
+  # `mutate(test_df, y = sample(x, size = 1.5))` has a weird behavior
+  # when size is a double in [1, 2)
+  expect_snapshot(
+    mutate(test_pl, y = sample(x, size = 1.5)),
+    error = TRUE
+  )
+})
+
 test_that("stats::lag() is not supported", {
   test_df <- tibble(x = c(10, 20, 30, 40, 10, 20, 30, 40))
   test_pl <- as_polars_df(test_df)
diff --git a/tests/testthat/test-funs_math-lazy.R b/tests/testthat/test-funs_math-lazy.R
@@ -296,28 +296,6 @@ test_that("rank() works on various input types", {
   )
 })
 
-test_that("warns if unknown args", {
-  test_df <- tibble(
-    x1 = c("a", "a", "b", "a", "c"),
-    x2 = c(2, 1, 5, 3, 1),
-    value = sample(11:15),
-    value_trigo = seq(0, 0.4, 0.1),
-    value_mix = -2:2,
-    value_with_NA = c(-2, -1, NA, 1, 2)
-  )
-  test_pl <- as_polars_lf(test_df)
-  foo <- test_pl |>
-    mutate(x = sample(x2)) |>
-    pull(x)
-
-  expect_true(all(foo %in% c(1, 2, 3, 5)))
-
-  expect_warning(
-    test_pl |> mutate(x = sample(x2, prob = 0.5)),
-    "doesn't know how to use some arguments"
-  )
-})
-
 test_that("%% and %/% work", {
   test_df <- tibble(
     x1 = c("a", "a", "b", "a", "c"),
diff --git a/tests/testthat/test-funs_math.R b/tests/testthat/test-funs_math.R
@@ -292,28 +292,6 @@ test_that("rank() works on various input types", {
   )
 })
 
-test_that("warns if unknown args", {
-  test_df <- tibble(
-    x1 = c("a", "a", "b", "a", "c"),
-    x2 = c(2, 1, 5, 3, 1),
-    value = sample(11:15),
-    value_trigo = seq(0, 0.4, 0.1),
-    value_mix = -2:2,
-    value_with_NA = c(-2, -1, NA, 1, 2)
-  )
-  test_pl <- as_polars_df(test_df)
-  foo <- test_pl |>
-    mutate(x = sample(x2)) |>
-    pull(x)
-
-  expect_true(all(foo %in% c(1, 2, 3, 5)))
-
-  expect_warning(
-    test_pl |> mutate(x = sample(x2, prob = 0.5)),
-    "doesn't know how to use some arguments"
-  )
-})
-
 test_that("%% and %/% work", {
   test_df <- tibble(
     x1 = c("a", "a", "b", "a", "c"),
diff --git a/vignettes/supported-functions.Rmd b/vignettes/supported-functions.Rmd
@@ -68,6 +68,7 @@ out <- tribble(
   "`base`", "`rank`",
   "`base`", "`rev`",
   "`base`", "`round`",
+  "`base`", "`sample`",
   "`base`", "`seq`",
   "`base`", "`seq_len`",
   "`base`", "`sin`",
@@ -187,6 +188,8 @@ out <- tribble(
         "In `tidypolars`, `na.last = NA` is not supported.",
       Package == "`base`" & Function == "`sort`" ~
         "In `tidypolars`, `na.last` must be explicitly supplied as `TRUE` or `FALSE`.",
+      Package == "`base`" & Function == "`sample`" ~
+        "`set.seed()` is not supported. Randomness is handled by Polars and does not use R's RNG state.",
       Package == "`lubridate`" & Function %in% c("`rollbackward`", "`rollback`", "`rollforward`") ~
         "While time zone handling should mimick the behaviour of `lubridate` in most cases, it is possible that Polars errors if rolling back/forward leads to am ambiguous datetime. It is also possible to have some differences in hours/minutes/seconds when converting between Polars and R.",
       Package == "`lubridate`" & Function == "`wday`" ~