Add error checks and code comments to dset(); bump the version number to 0.2.0, since this should be the initial stable release

civilstat · civilstat · commit 03cfa9927a9c · 2022-03-22T11:33:10.000-04:00
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: CIPerm
 Type: Package
 Title: Computationally-Efficient CIs for Mean Shift from Permutation Methods
-Version: 0.1.0.9005
+Version: 0.2.0
 Date: 2022-03-22
 Authors@R: c(
     person("Emily", "Tupaj", role = "aut"),
diff --git a/R/cint.R b/R/cint.R
@@ -36,7 +36,7 @@
 
 cint <- function(dset, conf.level = .95, tail = c("Two", "Left", "Right")){
 
-  sig = 1 - conf.level
+  sig <- 1 - conf.level
   stopifnot(sig > 0 & sig < 1)
   tail <- match.arg(tail)
 
@@ -84,8 +84,8 @@ cint <- function(dset, conf.level = .95, tail = c("Two", "Left", "Right")){
     siglevel <- sig
     index <- roundOrCeiling(siglevel*num) - 1
     UB <- w.i[(num-index)]
-    LT = c(-Inf, UB)
-    conf.achieved = 1-((index+1)/num)
+    LT <- c(-Inf, UB)
+    conf.achieved <- 1-((index+1)/num)
     message(paste0("Achieved conf. level: 1-(", index+1, "/", num, ")"))
     return(list(conf.int = LT,
                 conf.level.achieved = conf.achieved))
@@ -94,8 +94,8 @@ cint <- function(dset, conf.level = .95, tail = c("Two", "Left", "Right")){
     index <- roundOrCeiling(siglevel*num) - 1
     LB <- w.i[1+nk0+index] # starts counting from the (1+nk0)'th element of w.i
     # (not the first (original) which will always be 'NaN')
-    RT = c(LB, Inf)
-    conf.achieved = 1-((index+1)/num)
+    RT <- c(LB, Inf)
+    conf.achieved <- 1-((index+1)/num)
     message(paste0("Achieved conf. level: 1-(", index+1, "/", num, ")"))
     return(list(conf.int = RT,
                 conf.level.achieved = conf.achieved))
@@ -107,8 +107,8 @@ cint <- function(dset, conf.level = .95, tail = c("Two", "Left", "Right")){
     # (not the first (original) which will always be 'NaN')
     Upper <- if(is.na(UB)) Inf else UB
     Lower <- if(is.na(LB)) -Inf else LB
-    CI = c(Lower, Upper)
-    conf.achieved = 1-(2*(index+1)/num)
+    CI <- c(Lower, Upper)
+    conf.achieved <- 1-(2*(index+1)/num)
     message(paste0("Achieved conf. level: 1-2*(", index+1, "/", num, ")"))
     return(list(conf.int = CI,
                 conf.level.achieved = conf.achieved))
diff --git a/R/dset.R b/R/dset.R
@@ -111,39 +111,50 @@
 
 dset <- function(group1, group2, nmc = 10000, returnData = FALSE){
   stopifnot(nmc >= 0)
-  # TODO: add more error checks:
-  #   nmc must be a non-neg integer, not 1, & not too small relative to n,m(?);
-  #   group1 and group2 must be numeric, vectors, and non-empty
+  stopifnot(nmc != 1)
+  stopifnot(is.numeric(group1) & is.numeric(group2))
+  stopifnot(length(group1) >= 1 & length(group2) >= 1)
+  stopifnot(!any(is.na(c(group1, group2))))
 
-  # creates the dataset referenced in pval and cint
   combined <- c(group1, group2)
 
   n <- length(group1)
   m <- length(group2)
   den <- (1/n + 1/m)
 
   N <- n + m
-  num <- choose(N, n)
-  if(nmc == 0 | num <= nmc) {
-    dcombn <- utils::combn(1:N, n)
+  num <- choose(N, n) # number of possible combinations
+
+  # Form a matrix where each column contains indices in new "group1" for that comb or perm
+  if(nmc == 0 | num <= nmc) { # take all possible combinations
+    dcombn1 <- utils::combn(1:N, n)
   } else { # use Monte Carlo sample of permutations, not all possible combinations
-    dcombn <- replicate(nmc, sample(N, n))
-    dcombn[,1] <- 1:n # force the 1st "combination" to be original data order
+    dcombn1 <- replicate(nmc, sample(N, n))
+    dcombn1[,1] <- 1:n # force the 1st "combination" to be original data order
     num <- nmc
   }
 
-  dcombn2 <- apply(dcombn, 2, function(x) setdiff(1:N, x))
-  group1_perm <- matrix(combined[dcombn], nrow = n)
+  # Form the equivalent matrix for indices in new "group2"
+  dcombn2 <- apply(dcombn1, 2, function(x) setdiff(1:N, x))
+
+  # Form the corresponding matrices of data values, not data indices
+  group1_perm <- matrix(combined[dcombn1], nrow = n)
   group2_perm <- matrix(combined[dcombn2], nrow = m)
 
-  k <- colSums(matrix(dcombn %in% ((n+1):N), nrow=n))
+  # For each comb or perm, compute:
+  #   difference in group means; sum in group1; difference in group medians;
+  #   and sum of *ranks* in group1 (the statistic for the Wilcoxon rank sum test)
   diffmean <- colMeans(group1_perm) - colMeans(group2_perm)
   sum1 <- colSums(group1_perm)
   diffmedian <- matrixStats::colMedians(group1_perm) - matrixStats::colMedians(group2_perm)
-
   r <- rank(combined, ties.method = "first")
-  wilsum <- colSums(matrix(r[dcombn], nrow = n))
-  wkd = (diffmean[1] - diffmean) / (k * den)
+  wilsum <- colSums(matrix(r[dcombn1], nrow = n))
+
+  # For each comb or perm, compute:
+  #   k = how many values swapped from group1 to group2?
+  #   wkd = Nguyen (2009) statistic whose quantiles are used for CI endpoints
+  k <- colSums(matrix(dcombn1 %in% ((n+1):N), nrow=n))
+  wkd <- (diffmean[1] - diffmean) / (k * den)
 
   dataframe <- data.frame(diffmean = diffmean,
                           sum1 = sum1,
diff --git a/R/pval.R b/R/pval.R
@@ -45,15 +45,15 @@ pval <- function(dset, tail = c("Two", "Left", "Right"),
   num <- nrow(dset)
 
   if (tail == "Left"){
-    pvalmean = sum(dset$diffmean <= dset$diffmean[1])/num
-    pvalsum = sum(dset$sum1 <= dset$sum1[1])/num
-    pvalmedian = sum(dset$diffmedian <= dset$diffmedian[1])/num
-    pvalwilsum = sum(dset$wilsum <= dset$wilsum[1])/num
+    pvalmean <- sum(dset$diffmean <= dset$diffmean[1])/num
+    pvalsum <- sum(dset$sum1 <= dset$sum1[1])/num
+    pvalmedian <- sum(dset$diffmedian <= dset$diffmedian[1])/num
+    pvalwilsum <- sum(dset$wilsum <= dset$wilsum[1])/num
   } else if (tail == "Right") {
-    pvalmean = sum(dset$diffmean >= dset$diffmean[1])/num
-    pvalsum = sum(dset$sum1 >= dset$sum1[1])/num
-    pvalmedian = sum(dset$diffmedian >= dset$diffmedian[1])/num
-    pvalwilsum = sum(dset$wilsum >= dset$wilsum[1])/num
+    pvalmean <- sum(dset$diffmean >= dset$diffmean[1])/num
+    pvalsum <- sum(dset$sum1 >= dset$sum1[1])/num
+    pvalmedian <- sum(dset$diffmedian >= dset$diffmedian[1])/num
+    pvalwilsum <- sum(dset$wilsum >= dset$wilsum[1])/num
   } else { # tail == "Two"
     pvalmean <- sum(abs(dset$diffmean - mean(dset$diffmean)) >= abs(dset$diffmean[1] - mean(dset$diffmean)))/num
     pvalsum <- sum(abs(dset$sum1 - mean(dset$sum1)) >= abs(dset$sum1[1] - mean(dset$sum1)))/num