Skip to content

Commit 0734ac6

Browse files
authored
Merge pull request #238 from kgoldfeld/237-any-possibility-of-adding-narmfalse-argument-to-the-gendatadensity-and-adddatadensity-functions
237 any possibility of adding narmfalse argument to the gendatadensity and adddatadensity functions
2 parents df88032 + 3f22600 commit 0734ac6

File tree

9 files changed

+151
-58
lines changed

9 files changed

+151
-58
lines changed

.github/workflows/R-CMD-check.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@ jobs:
2727
- {os: windows-latest, r: 'oldrel'}
2828
- {os: macos-latest, r: 'release'}
2929
- {os: macos-latest, r: 'oldrel'}
30-
- {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
31-
- {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
32-
30+
- {os: ubuntu-22.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest"}
31+
- {os: ubuntu-22.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest"}
32+
3333
env:
3434
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
3535
RSPM: ${{ matrix.config.rspm }}

.github/workflows/touchstone-receive.yaml

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,17 @@ jobs:
77
config: ${{ steps.read_touchstone_config.outputs.config }}
88
steps:
99
- name: Checkout repo
10-
uses: actions/checkout@v2
10+
uses: actions/checkout@v4
1111
with:
12-
fetch-depth: 0
13-
12+
fetch-depth: 0
1413
- id: read_touchstone_config
1514
run: |
16-
content=`cat ./touchstone/config.json`
17-
# the following lines are only required for multi line json
18-
content="${content//'%'/'%25'}"
19-
content="${content//$'\n'/'%0A'}"
20-
content="${content//$'\r'/'%0D'}"
21-
# end of optional handling for multi line json
22-
echo "::set-output name=config::$content"
15+
{
16+
echo "config<<EOF"
17+
cat ./touchstone/config.json
18+
echo "EOF"
19+
} >> "$GITHUB_OUTPUT"
20+
2321
build:
2422
needs: prepare
2523
runs-on: ${{ matrix.config.os }}
@@ -34,7 +32,7 @@ jobs:
3432
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
3533
steps:
3634
- name: Checkout repo
37-
uses: actions/checkout@v2
35+
uses: actions/checkout@v4
3836
with:
3937
fetch-depth: 0
4038
- name: Set up git user
@@ -66,7 +64,7 @@ jobs:
6664
shell: Rscript {0}
6765
- name: Checkout benchmarking repo
6866
if: ${{ matrix.config.benchmarking_repo != ''}}
69-
uses: actions/checkout@v2
67+
uses: actions/checkout@v4
7068
with:
7169
repository: ${{ matrix.config.benchmarking_repo }}
7270
ref: ${{ matrix.config.benchmarking_ref }}
@@ -76,20 +74,20 @@ jobs:
7674
- name: Save PR number
7775
run: |
7876
echo ${{ github.event.number }} > ./touchstone/pr-comment/NR
79-
- uses: actions/upload-artifact@v2
77+
- uses: actions/upload-artifact@v4
8078
with:
8179
name: visual-benchmarks
8280
path: touchstone/plots/
83-
- uses: actions/upload-artifact@v1
81+
- uses: actions/upload-artifact@v4
8482
with:
8583
name: results
8684
path: touchstone/pr-comment
87-
- uses: actions/download-artifact@v1
85+
- uses: actions/download-artifact@v4
8886
with:
8987
name: results
9088
- name: comment PR
9189
run: cat touchstone/pr-comment/info.txt
92-
- uses: actions/upload-artifact@v2
90+
- uses: actions/upload-artifact@v4
9391
with:
9492
name: pr
9593
path: touchstone/pr-comment/

NEWS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
# simstudy (development version)
22

3+
## New features
4+
5+
* Added an argument `na.rm` to the functions `genDataDensity` and `addDataDensity`. If set to `FALSE`,
6+
the generated data will include missing values at the same rate as the distribution data.
7+
38
# simstudy 0.8.1
49

510
## New features

R/add_data.R

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -527,11 +527,13 @@ addSynthetic <- function(dtOld, dtFrom,
527527
#' @title Add data from a density defined by a vector of integers
528528
#' @description Data are generated from an a density defined by a vector of integers.
529529
#' @param dtOld Name of data table that is to be updated.
530-
#' @param dataDist Vector that defines the desired density.
531-
#' @param varname Name of variable name.
532-
#' @param uselimits Indicator to use minimum and maximum of input data vector as
533-
#' limits for sampling. Defaults to FALSE, in which case a smoothed density that
534-
#' extends beyond the limits is used.
530+
#' @param dataDist Numeric vector. Defines the desired density.
531+
#' @param varname Character. Name of the variable.
532+
#' @param uselimits Logical. If TRUE, the minimum and maximum of the input data
533+
#' vector are used as limits for sampling. Defaults to FALSE, in which case a
534+
#' smoothed density that extends beyond these limits is used.
535+
#' @param na.rm Logical. If TRUE (default), missing values in `dataDist` are
536+
#' removed. If FALSE, the data will retain the same proportion of missing values.
535537
#' @return A data table with the generated data.
536538
#' @examples
537539
#' def <- defData(varname = "x1", formula = 5, dist = "poisson")
@@ -545,7 +547,7 @@ addSynthetic <- function(dtOld, dtFrom,
545547
#' @concept generate_data
546548
#'
547549
#'
548-
addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE) {
550+
addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE, na.rm = TRUE) {
549551

550552
assertNotMissing(dtOld = missing(dtOld), dataDist = missing(dataDist), varname = missing(varname))
551553
assertClass(dtOld = dtOld, class = "data.table")
@@ -555,9 +557,12 @@ addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE) {
555557
dataDist <- round(dataDist, 0)
556558

557559
if (uselimits) {
558-
density_est <- stats::density(dataDist, n = 10000, from = min(dataDist), to = max(dataDist))
560+
density_est <-
561+
stats::density(dataDist, n = 10000, from = min(dataDist, na.rm = TRUE),
562+
to = max(dataDist, na.rm = TRUE), na.rm = TRUE)
559563
} else {
560-
density_est <- stats::density(dataDist, n = 10000)
564+
density_est <-
565+
stats::density(dataDist, n = 10000, na.rm = TRUE)
561566
}
562567

563568
x <- density_est$x
@@ -571,6 +576,15 @@ addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE) {
571576

572577
.x <- sample(x, size = nrow(dtOld), replace = TRUE, prob = probabilities)
573578

579+
# If na.rm is not TRUE, then generate missing values based on proportion
580+
# observed in dataDist
581+
582+
if (na.rm == FALSE) {
583+
prop.missing <- sum(is.na(dataDist)) / length(dataDist)
584+
is.missing <- stats::rbinom(length(.x), 1, prop.missing)
585+
.x[is.missing == 1] <- NA
586+
}
587+
574588
dtOld[, (varname) := .x]
575589
dtOld[]
576590
}

R/generate_data.R

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1178,31 +1178,33 @@ genSynthetic <- function(dtFrom, n = nrow(dtFrom),
11781178

11791179
#' @title Generate data from a density defined by a vector of integers
11801180
#' @description Data are generated from an a density defined by a vector of integers
1181-
#' @param n Number of samples to draw from the density.
1182-
#' @param dataDist Vector that defines the desired density
1183-
#' @param varname Name of variable name
1184-
#' @param uselimits Indicator to use minimum and maximum of input data vector as
1185-
#' limits for sampling. Defaults to FALSE, in which case a smoothed density that
1186-
#' extends beyond the limits is used.
1187-
#' @param id A string specifying the field that serves as the record id. The
1181+
#' @param n Integer. Number of samples to draw from the density.
1182+
#' @param dataDist Numeric vector. Defines the desired density.
1183+
#' @param varname Character. Name of the variable.
1184+
#' @param uselimits Logical. If TRUE, the minimum and maximum of the input data
1185+
#' vector are used as limits for sampling. Defaults to FALSE, in which case a
1186+
#' smoothed density that extends beyond these limits is used.
1187+
#' @param id Character. A string specifying the field that serves as the record ID. The
11881188
#' default field is "id".
1189+
#' @param na.rm Logical. If TRUE (default), missing values in `dataDist` are
1190+
#' removed. If FALSE, the data will retain the same proportion of missing values.
11891191
#' @return A data table with the generated data
11901192
#' @examples
1191-
#' data_dist <- data_dist <- c(1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8, 9, 10, 10)
1193+
#' data_dist <- c(1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8, 9, 10, 10)
11921194
#'
11931195
#' genDataDensity(500, data_dist, varname = "x1", id = "id")
11941196
#' genDataDensity(500, data_dist, varname = "x1", uselimits = TRUE, id = "id")
11951197
#' @export
11961198
#' @concept generate_data
11971199

1198-
genDataDensity <- function(n, dataDist, varname, uselimits = FALSE, id = "id") {
1200+
genDataDensity <- function(n, dataDist, varname, uselimits = FALSE, id = "id", na.rm = TRUE) {
11991201

12001202
assertNotMissing(n = missing(n), dataDist = missing(dataDist), varname = missing(varname))
12011203

12021204
dataDist <- round(dataDist, 0)
12031205

12041206
.dd <- genData(n, id = id)
1205-
addDataDensity(.dd, dataDist, varname, uselimits)[]
1207+
addDataDensity(.dd, dataDist, varname, uselimits, na.rm = na.rm)[]
12061208

12071209
}
12081210

man/addDataDensity.Rd

Lines changed: 9 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/genDataDensity.Rd

Lines changed: 19 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-generate_data.R

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -919,7 +919,6 @@ test_that("addDataDensity works", {
919919

920920
def <- defData(varname = "x1", formula = 5, dist = "poisson")
921921

922-
dd <- genData(10000, def)
923922
dd <- genData(10000, def)
924923
dd <- addDataDensity(dd, data_dist, varname = "x2")
925924

@@ -942,9 +941,41 @@ test_that("addDataDensity works", {
942941

943942
expect_lte(dp[, round(mean(p.tails), 2)], 0.05)
944943
expect_lt(dp[, mean(p.value <= .05)], 0.05)
945-
946944

947945

946+
###
947+
948+
f.na <- function(data_dist, narm) {
949+
950+
def <- defData(varname = "x1", formula = 5, dist = "poisson")
951+
952+
dd <- genData(10000, def)
953+
dd <- addDataDensity(dd, data_dist, varname = "x2", uselimits = TRUE, na.rm = narm)
954+
955+
dd[]
956+
957+
}
958+
959+
compare3 <- function() {
960+
ints <- rpois(50, rpois(1, 8))
961+
pmiss <- rbeta(1, 1, 9)
962+
ints[rbinom(50, 1, pmiss) == 1] <- NA
963+
dx <- f.na(ints, narm = FALSE)
964+
dx[, mean(is.na(x2))]
965+
}
966+
967+
expect_equal(mean(sapply(1:100, function(x) compare3())), 0.10, tolerance = .04)
968+
969+
compare4 <- function() {
970+
ints <- rpois(50, rpois(1, 8))
971+
pmiss <- rbeta(1, 1, 9)
972+
ints[rbinom(50, 1, pmiss) == 1] <- NA
973+
dx <- f.na(ints, narm = TRUE)
974+
dx[, mean(is.na(x2))]
975+
}
976+
977+
expect_equal(mean(sapply(1:100, function(x) compare4())), 0.0, tolerance = 0)
978+
948979
})
949980

950981
test_that("genDataDensity works", {
@@ -969,6 +1000,36 @@ test_that("genDataDensity works", {
9691000
kstest <- mean(sapply(1:200, function(x) compare()) < .05)
9701001
expect_lt(kstest, 0.05)
9711002

1003+
### Testing na.rm
1004+
1005+
f.na <- function(data_dist, narm) {
1006+
1007+
dd <- genDataDensity(10000, data_dist, varname = "x1", uselimits = TRUE, na.rm = narm)
1008+
dd[]
1009+
1010+
}
1011+
1012+
compare3 <- function() {
1013+
ints <- rpois(50, rpois(1, 8))
1014+
pmiss <- rbeta(1, 1, 9)
1015+
ints[rbinom(50, 1, pmiss) == 1] <- NA
1016+
dx <- f.na(ints, narm = FALSE)
1017+
dx[, mean(is.na(x1))]
1018+
}
1019+
1020+
expect_equal(mean(sapply(1:100, function(x) compare3())), 0.10, tolerance = .04)
1021+
1022+
compare4 <- function() {
1023+
ints <- rpois(50, rpois(1, 8))
1024+
pmiss <- rbeta(1, 1, 9)
1025+
ints[rbinom(50, 1, pmiss) == 1] <- NA
1026+
dx <- f.na(ints, narm = TRUE)
1027+
dx[, mean(is.na(x1))]
1028+
}
1029+
1030+
expect_equal(mean(sapply(1:100, function(x) compare4())), 0, tolerance = 0)
1031+
1032+
9721033
})
9731034

9741035
test_that("genDataDensity and addDataDensity throws errors", {

touchstone/config.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"os": "ubuntu-20.04",
3-
"r": "4.1.1",
4-
"rspm": "https://packagemanager.rstudio.com/all/__linux__/focal/2912022-01-07+MTo3NDQwNTcyLDI6NDUyNjIxNTs0QzU3NUZBRQ"
5-
}
2+
"os": "ubuntu-22.04",
3+
"r": "4.4.3",
4+
"rspm": "https://packagemanager.rstudio.com/all/__linux__/jammy/latest"
5+
}

0 commit comments

Comments
 (0)