kgoldfeld · kgoldfeld · Jun 27, 2025 · Jun 26, 2025 · Jun 27, 2025 · Jun 27, 2025
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -27,9 +27,9 @@ jobs:
           - {os: windows-latest, r: 'oldrel'}
           - {os: macos-latest, r: 'release'}
           - {os: macos-latest, r: 'oldrel'}
-          - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
-          - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
-
+          - {os: ubuntu-22.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest"}
+          - {os: ubuntu-22.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest"}
+          
     env:
       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
       RSPM: ${{ matrix.config.rspm }}

diff --git a/.github/workflows/touchstone-receive.yaml b/.github/workflows/touchstone-receive.yaml
@@ -7,19 +7,17 @@ jobs:
       config: ${{ steps.read_touchstone_config.outputs.config }}
     steps:
       - name: Checkout repo
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
         with:
-           fetch-depth: 0
-
+          fetch-depth: 0
       - id: read_touchstone_config
         run: |
-          content=`cat ./touchstone/config.json`
-          # the following lines are only required for multi line json
-          content="${content//'%'/'%25'}"
-          content="${content//$'\n'/'%0A'}"
-          content="${content//$'\r'/'%0D'}"
-          # end of optional handling for multi line json
-          echo "::set-output name=config::$content"
+          {
+            echo "config<<EOF"
+            cat ./touchstone/config.json
+            echo "EOF"
+          } >> "$GITHUB_OUTPUT"
+
   build:
     needs: prepare
     runs-on: ${{ matrix.config.os }}
@@ -34,7 +32,7 @@ jobs:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
     steps:
       - name: Checkout repo
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
         with:
            fetch-depth: 0
       - name: Set up git user
@@ -66,7 +64,7 @@ jobs:
         shell: Rscript {0}
       - name: Checkout benchmarking repo
         if: ${{ matrix.config.benchmarking_repo != ''}}
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
         with:
           repository: ${{ matrix.config.benchmarking_repo }}
           ref: ${{ matrix.config.benchmarking_ref }}
@@ -76,20 +74,20 @@ jobs:
       - name: Save PR number
         run: |
           echo ${{ github.event.number }} > ./touchstone/pr-comment/NR
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v4
         with:
           name: visual-benchmarks
           path: touchstone/plots/
-      - uses: actions/upload-artifact@v1
+      - uses: actions/upload-artifact@v4
         with:
           name: results
           path: touchstone/pr-comment
-      - uses: actions/download-artifact@v1
+      - uses: actions/download-artifact@v4
         with:
           name: results
       - name: comment PR
         run: cat touchstone/pr-comment/info.txt
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v4
         with:
           name: pr
           path: touchstone/pr-comment/
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # simstudy (development version)
 
+## New features
+
+* Added an argument `na.rm` to the functions `genDataDensity` and `addDataDensity`. If set to `FALSE`,
+  the generated data will include missing values at the same rate as the distribution data.
+
 # simstudy 0.8.1
 
 ## New features

diff --git a/R/add_data.R b/R/add_data.R
@@ -527,11 +527,13 @@ addSynthetic <- function(dtOld, dtFrom,
 #' @title Add data from a density defined by a vector of integers
 #' @description Data are generated from an a density defined by a vector of integers.
 #' @param dtOld Name of data table that is to be updated.
-#' @param dataDist Vector that defines the desired density.
-#' @param varname Name of variable name.
-#' @param uselimits Indicator to use minimum and maximum of input data vector as 
-#' limits for sampling. Defaults to FALSE, in which case a smoothed density that
-#' extends beyond the limits is used.
+#' @param dataDist Numeric vector. Defines the desired density.
+#' @param varname Character. Name of the variable.
+#' @param uselimits Logical. If TRUE, the minimum and maximum of the input data 
+#' vector are used as limits for sampling. Defaults to FALSE, in which case a 
+#' smoothed density that extends beyond these limits is used.
+#' @param na.rm Logical. If TRUE (default), missing values in `dataDist` are 
+#' removed. If FALSE, the data will retain the same proportion of missing values.
 #' @return A data table with the generated data.
 #' @examples
 #' def <- defData(varname = "x1", formula = 5, dist = "poisson")
@@ -545,7 +547,7 @@ addSynthetic <- function(dtOld, dtFrom,
 #' @concept generate_data
 #' 
 #' 
-addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE) {
+addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE, na.rm = TRUE) {
 
   assertNotMissing(dtOld = missing(dtOld), dataDist = missing(dataDist), varname = missing(varname))
   assertClass(dtOld = dtOld, class = "data.table")
@@ -555,9 +557,12 @@ addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE) {
   dataDist <- round(dataDist, 0)
 
   if (uselimits) {
-    density_est <- stats::density(dataDist, n = 10000, from = min(dataDist), to = max(dataDist))
+    density_est <- 
+      stats::density(dataDist, n = 10000, from = min(dataDist, na.rm = TRUE), 
+                     to = max(dataDist, na.rm = TRUE), na.rm = TRUE)
   } else {
-    density_est <- stats::density(dataDist, n = 10000)
+    density_est <- 
+      stats::density(dataDist, n = 10000, na.rm = TRUE)
   }
 
   x <- density_est$x
@@ -571,6 +576,15 @@ addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE) {
 
   .x <- sample(x, size = nrow(dtOld), replace = TRUE, prob = probabilities)
 
+  # If na.rm is not TRUE, then generate missing values based on proportion
+  # observed in dataDist
+
+  if (na.rm == FALSE) {
+    prop.missing <- sum(is.na(dataDist)) / length(dataDist)
+    is.missing <- stats::rbinom(length(.x), 1, prop.missing)
+    .x[is.missing == 1] <- NA
+  }
+
   dtOld[, (varname) := .x]
   dtOld[]
 }
diff --git a/R/generate_data.R b/R/generate_data.R
@@ -1178,31 +1178,33 @@ genSynthetic <- function(dtFrom, n = nrow(dtFrom),
 
 #' @title Generate data from a density defined by a vector of integers
 #' @description Data are generated from an a density defined by a vector of integers
-#' @param n Number of samples to draw from the density.
-#' @param dataDist Vector that defines the desired density
-#' @param varname Name of variable name
-#' @param uselimits Indicator to use minimum and maximum of input data vector as 
-#' limits for sampling. Defaults to FALSE, in which case a smoothed density that
-#' extends beyond the limits is used.
-#' @param id A string specifying the field that serves as the record id. The
+#' @param n Integer. Number of samples to draw from the density.
+#' @param dataDist Numeric vector. Defines the desired density.
+#' @param varname Character. Name of the variable.
+#' @param uselimits Logical. If TRUE, the minimum and maximum of the input data 
+#' vector are used as limits for sampling. Defaults to FALSE, in which case a 
+#' smoothed density that extends beyond these limits is used.
+#' @param id Character. A string specifying the field that serves as the record ID. The
 #' default field is "id".
+#' @param na.rm Logical. If TRUE (default), missing values in `dataDist` are 
+#' removed. If FALSE, the data will retain the same proportion of missing values.
 #' @return A data table with the generated data
 #' @examples
-#' data_dist <- data_dist <- c(1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8, 9, 10, 10)
+#' data_dist <- c(1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8, 9, 10, 10)
 #' 
 #' genDataDensity(500, data_dist, varname = "x1", id = "id")
 #' genDataDensity(500, data_dist, varname = "x1", uselimits = TRUE, id = "id")
 #' @export
 #' @concept generate_data
 
-genDataDensity <- function(n, dataDist, varname, uselimits = FALSE, id = "id") {
+genDataDensity <- function(n, dataDist, varname, uselimits = FALSE, id = "id", na.rm = TRUE) {
 
   assertNotMissing(n = missing(n), dataDist = missing(dataDist), varname = missing(varname))
 
   dataDist <- round(dataDist, 0)
 
   .dd <- genData(n, id = id)
-  addDataDensity(.dd, dataDist, varname, uselimits)[]
+  addDataDensity(.dd, dataDist, varname, uselimits, na.rm = na.rm)[]
 
 }
 

diff --git a/man/addDataDensity.Rd b/man/addDataDensity.Rd
diff --git a/man/genDataDensity.Rd b/man/genDataDensity.Rd
diff --git a/tests/testthat/test-generate_data.R b/tests/testthat/test-generate_data.R
@@ -919,7 +919,6 @@ test_that("addDataDensity works", {
 
     def <- defData(varname = "x1", formula = 5, dist = "poisson")
 
-    dd <- genData(10000, def)
     dd <- genData(10000, def)
     dd <- addDataDensity(dd, data_dist, varname = "x2")
 
@@ -942,9 +941,41 @@ test_that("addDataDensity works", {
 
   expect_lte(dp[, round(mean(p.tails), 2)], 0.05)
   expect_lt(dp[, mean(p.value <= .05)], 0.05)
-
 
 
+  ###
+
+  f.na <- function(data_dist, narm) {
+
+    def <- defData(varname = "x1", formula = 5, dist = "poisson")
+
+    dd <- genData(10000, def)
+    dd <- addDataDensity(dd, data_dist, varname = "x2", uselimits = TRUE, na.rm = narm)
+
+    dd[]
+
+  }
+
+  compare3 <- function() {
+    ints <- rpois(50, rpois(1, 8))
+    pmiss <- rbeta(1, 1, 9)
+    ints[rbinom(50, 1, pmiss) == 1] <- NA
+    dx <- f.na(ints, narm = FALSE)
+    dx[, mean(is.na(x2))]
+  }
+
+  expect_equal(mean(sapply(1:100, function(x) compare3())), 0.10, tolerance = .04)
+
+  compare4 <- function() {
+    ints <- rpois(50, rpois(1, 8))
+    pmiss <- rbeta(1, 1, 9)
+    ints[rbinom(50, 1, pmiss) == 1] <- NA
+    dx <- f.na(ints, narm = TRUE)
+    dx[, mean(is.na(x2))]
+  }
+
+  expect_equal(mean(sapply(1:100, function(x) compare4())), 0.0, tolerance = 0)
+
 })
 
 test_that("genDataDensity works", {
@@ -969,6 +1000,36 @@ test_that("genDataDensity works", {
   kstest <- mean(sapply(1:200, function(x) compare()) < .05)
   expect_lt(kstest, 0.05)
 
+  ### Testing na.rm
+
+  f.na <- function(data_dist, narm) {
+
+    dd <- genDataDensity(10000, data_dist, varname = "x1", uselimits = TRUE, na.rm = narm)
+    dd[]
+
+  }
+
+  compare3 <- function() {
+    ints <- rpois(50, rpois(1, 8))
+    pmiss <- rbeta(1, 1, 9)
+    ints[rbinom(50, 1, pmiss) == 1] <- NA
+    dx <- f.na(ints, narm = FALSE)
+    dx[, mean(is.na(x1))]
+  }
+
+  expect_equal(mean(sapply(1:100, function(x) compare3())), 0.10, tolerance = .04)
+
+  compare4 <- function() {
+    ints <- rpois(50, rpois(1, 8))
+    pmiss <- rbeta(1, 1, 9)
+    ints[rbinom(50, 1, pmiss) == 1] <- NA
+    dx <- f.na(ints, narm = TRUE)
+    dx[, mean(is.na(x1))]
+  }
+
+  expect_equal(mean(sapply(1:100, function(x) compare4())), 0, tolerance = 0)
+
+
 })
 
 test_that("genDataDensity and addDataDensity throws errors", {

diff --git a/touchstone/config.json b/touchstone/config.json
@@ -1,5 +1,5 @@
 {
-    "os": "ubuntu-20.04",
-    "r": "4.1.1",
-    "rspm": "https://packagemanager.rstudio.com/all/__linux__/focal/2912022-01-07+MTo3NDQwNTcyLDI6NDUyNjIxNTs0QzU3NUZBRQ"
-}
+    "os": "ubuntu-22.04",
+    "r": "4.4.3",
+    "rspm": "https://packagemanager.rstudio.com/all/__linux__/jammy/latest"
+}