Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ jobs:
- {os: windows-latest, r: 'oldrel'}
- {os: macos-latest, r: 'release'}
- {os: macos-latest, r: 'oldrel'}
- {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
- {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}

- {os: ubuntu-22.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest"}
- {os: ubuntu-22.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest"}
env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
RSPM: ${{ matrix.config.rspm }}
Expand Down
30 changes: 14 additions & 16 deletions .github/workflows/touchstone-receive.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,17 @@ jobs:
config: ${{ steps.read_touchstone_config.outputs.config }}
steps:
- name: Checkout repo
uses: actions/checkout@v2
uses: actions/checkout@v4
with:
fetch-depth: 0

fetch-depth: 0
- id: read_touchstone_config
run: |
content=`cat ./touchstone/config.json`
# the following lines are only required for multi line json
content="${content//'%'/'%25'}"
content="${content//$'\n'/'%0A'}"
content="${content//$'\r'/'%0D'}"
# end of optional handling for multi line json
echo "::set-output name=config::$content"
{
echo "config<<EOF"
cat ./touchstone/config.json
echo "EOF"
} >> "$GITHUB_OUTPUT"
Comment on lines +15 to +19
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice!

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's ChatGPT, not me. :)


build:
needs: prepare
runs-on: ${{ matrix.config.os }}
Expand All @@ -34,7 +32,7 @@ jobs:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
steps:
- name: Checkout repo
uses: actions/checkout@v2
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up git user
Expand Down Expand Up @@ -66,7 +64,7 @@ jobs:
shell: Rscript {0}
- name: Checkout benchmarking repo
if: ${{ matrix.config.benchmarking_repo != ''}}
uses: actions/checkout@v2
uses: actions/checkout@v4
with:
repository: ${{ matrix.config.benchmarking_repo }}
ref: ${{ matrix.config.benchmarking_ref }}
Expand All @@ -76,20 +74,20 @@ jobs:
- name: Save PR number
run: |
echo ${{ github.event.number }} > ./touchstone/pr-comment/NR
- uses: actions/upload-artifact@v2
- uses: actions/upload-artifact@v4
with:
name: visual-benchmarks
path: touchstone/plots/
- uses: actions/upload-artifact@v1
- uses: actions/upload-artifact@v4
with:
name: results
path: touchstone/pr-comment
- uses: actions/download-artifact@v1
- uses: actions/download-artifact@v4
with:
name: results
- name: comment PR
run: cat touchstone/pr-comment/info.txt
- uses: actions/upload-artifact@v2
- uses: actions/upload-artifact@v4
with:
name: pr
path: touchstone/pr-comment/
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# simstudy (development version)

## New features

* Added an argument `na.rm` to the functions `genDataDensity` and `addDataDensity`. If set to `FALSE`,
the generated data will include missing values at the same rate as the distribution data.

# simstudy 0.8.1

## New features
Expand Down
30 changes: 22 additions & 8 deletions R/add_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -527,11 +527,13 @@ addSynthetic <- function(dtOld, dtFrom,
#' @title Add data from a density defined by a vector of integers
#' @description Data are generated from an a density defined by a vector of integers.
#' @param dtOld Name of data table that is to be updated.
#' @param dataDist Vector that defines the desired density.
#' @param varname Name of variable name.
#' @param uselimits Indicator to use minimum and maximum of input data vector as
#' limits for sampling. Defaults to FALSE, in which case a smoothed density that
#' extends beyond the limits is used.
#' @param dataDist Numeric vector. Defines the desired density.
#' @param varname Character. Name of the variable.
#' @param uselimits Logical. If TRUE, the minimum and maximum of the input data
#' vector are used as limits for sampling. Defaults to FALSE, in which case a
#' smoothed density that extends beyond these limits is used.
#' @param na.rm Logical. If TRUE (default), missing values in `dataDist` are
#' removed. If FALSE, the data will retain the same proportion of missing values.
#' @return A data table with the generated data.
#' @examples
#' def <- defData(varname = "x1", formula = 5, dist = "poisson")
Expand All @@ -545,7 +547,7 @@ addSynthetic <- function(dtOld, dtFrom,
#' @concept generate_data
#'
#'
addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE) {
addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE, na.rm = TRUE) {

assertNotMissing(dtOld = missing(dtOld), dataDist = missing(dataDist), varname = missing(varname))
assertClass(dtOld = dtOld, class = "data.table")
Expand All @@ -555,9 +557,12 @@ addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE) {
dataDist <- round(dataDist, 0)

if (uselimits) {
density_est <- stats::density(dataDist, n = 10000, from = min(dataDist), to = max(dataDist))
density_est <-
stats::density(dataDist, n = 10000, from = min(dataDist, na.rm = TRUE),
to = max(dataDist, na.rm = TRUE), na.rm = TRUE)
} else {
density_est <- stats::density(dataDist, n = 10000)
density_est <-
stats::density(dataDist, n = 10000, na.rm = TRUE)
}

x <- density_est$x
Expand All @@ -571,6 +576,15 @@ addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE) {

.x <- sample(x, size = nrow(dtOld), replace = TRUE, prob = probabilities)

# If na.rm is not TRUE, then generate missing values based on proportion
# observed in dataDist

if (na.rm == FALSE) {
prop.missing <- sum(is.na(dataDist)) / length(dataDist)
is.missing <- stats::rbinom(length(.x), 1, prop.missing)
.x[is.missing == 1] <- NA
}

dtOld[, (varname) := .x]
dtOld[]
}
22 changes: 12 additions & 10 deletions R/generate_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -1178,31 +1178,33 @@ genSynthetic <- function(dtFrom, n = nrow(dtFrom),

#' @title Generate data from a density defined by a vector of integers
#' @description Data are generated from an a density defined by a vector of integers
#' @param n Number of samples to draw from the density.
#' @param dataDist Vector that defines the desired density
#' @param varname Name of variable name
#' @param uselimits Indicator to use minimum and maximum of input data vector as
#' limits for sampling. Defaults to FALSE, in which case a smoothed density that
#' extends beyond the limits is used.
#' @param id A string specifying the field that serves as the record id. The
#' @param n Integer. Number of samples to draw from the density.
#' @param dataDist Numeric vector. Defines the desired density.
#' @param varname Character. Name of the variable.
#' @param uselimits Logical. If TRUE, the minimum and maximum of the input data
#' vector are used as limits for sampling. Defaults to FALSE, in which case a
#' smoothed density that extends beyond these limits is used.
#' @param id Character. A string specifying the field that serves as the record ID. The
#' default field is "id".
#' @param na.rm Logical. If TRUE (default), missing values in `dataDist` are
#' removed. If FALSE, the data will retain the same proportion of missing values.
#' @return A data table with the generated data
#' @examples
#' data_dist <- data_dist <- c(1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8, 9, 10, 10)
#' data_dist <- c(1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8, 9, 10, 10)
#'
#' genDataDensity(500, data_dist, varname = "x1", id = "id")
#' genDataDensity(500, data_dist, varname = "x1", uselimits = TRUE, id = "id")
#' @export
#' @concept generate_data

genDataDensity <- function(n, dataDist, varname, uselimits = FALSE, id = "id") {
genDataDensity <- function(n, dataDist, varname, uselimits = FALSE, id = "id", na.rm = TRUE) {

assertNotMissing(n = missing(n), dataDist = missing(dataDist), varname = missing(varname))

dataDist <- round(dataDist, 0)

.dd <- genData(n, id = id)
addDataDensity(.dd, dataDist, varname, uselimits)[]
addDataDensity(.dd, dataDist, varname, uselimits, na.rm = na.rm)[]

}

Expand Down
15 changes: 9 additions & 6 deletions man/addDataDensity.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 19 additions & 9 deletions man/genDataDensity.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

65 changes: 63 additions & 2 deletions tests/testthat/test-generate_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -919,7 +919,6 @@ test_that("addDataDensity works", {

def <- defData(varname = "x1", formula = 5, dist = "poisson")

dd <- genData(10000, def)
dd <- genData(10000, def)
dd <- addDataDensity(dd, data_dist, varname = "x2")

Expand All @@ -942,9 +941,41 @@ test_that("addDataDensity works", {

expect_lte(dp[, round(mean(p.tails), 2)], 0.05)
expect_lt(dp[, mean(p.value <= .05)], 0.05)



###

f.na <- function(data_dist, narm) {

def <- defData(varname = "x1", formula = 5, dist = "poisson")

dd <- genData(10000, def)
dd <- addDataDensity(dd, data_dist, varname = "x2", uselimits = TRUE, na.rm = narm)

dd[]

}

compare3 <- function() {
ints <- rpois(50, rpois(1, 8))
pmiss <- rbeta(1, 1, 9)
ints[rbinom(50, 1, pmiss) == 1] <- NA
dx <- f.na(ints, narm = FALSE)
dx[, mean(is.na(x2))]
}

expect_equal(mean(sapply(1:100, function(x) compare3())), 0.10, tolerance = .04)

compare4 <- function() {
ints <- rpois(50, rpois(1, 8))
pmiss <- rbeta(1, 1, 9)
ints[rbinom(50, 1, pmiss) == 1] <- NA
dx <- f.na(ints, narm = TRUE)
dx[, mean(is.na(x2))]
}

expect_equal(mean(sapply(1:100, function(x) compare4())), 0.0, tolerance = 0)

})

test_that("genDataDensity works", {
Expand All @@ -969,6 +1000,36 @@ test_that("genDataDensity works", {
kstest <- mean(sapply(1:200, function(x) compare()) < .05)
expect_lt(kstest, 0.05)

### Testing na.rm

f.na <- function(data_dist, narm) {

dd <- genDataDensity(10000, data_dist, varname = "x1", uselimits = TRUE, na.rm = narm)
dd[]

}

compare3 <- function() {
ints <- rpois(50, rpois(1, 8))
pmiss <- rbeta(1, 1, 9)
ints[rbinom(50, 1, pmiss) == 1] <- NA
dx <- f.na(ints, narm = FALSE)
dx[, mean(is.na(x1))]
}

expect_equal(mean(sapply(1:100, function(x) compare3())), 0.10, tolerance = .04)

compare4 <- function() {
ints <- rpois(50, rpois(1, 8))
pmiss <- rbeta(1, 1, 9)
ints[rbinom(50, 1, pmiss) == 1] <- NA
dx <- f.na(ints, narm = TRUE)
dx[, mean(is.na(x1))]
}

expect_equal(mean(sapply(1:100, function(x) compare4())), 0, tolerance = 0)


})

test_that("genDataDensity and addDataDensity throws errors", {
Expand Down
8 changes: 4 additions & 4 deletions touchstone/config.json
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was a frozen snapshot and now it's a moving target so it might cause issues through updates? But I guess it would be good to see those in CI anyway :)

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"os": "ubuntu-20.04",
"r": "4.1.1",
"rspm": "https://packagemanager.rstudio.com/all/__linux__/focal/2912022-01-07+MTo3NDQwNTcyLDI6NDUyNjIxNTs0QzU3NUZBRQ"
}
"os": "ubuntu-22.04",
"r": "4.4.3",
"rspm": "https://packagemanager.rstudio.com/all/__linux__/jammy/latest"
}