Functions for simultaneous confidence bands of ECDF

TeemuSailynoja · TeemuSailynoja · commit 7e492aac60b9 · 2021-11-30T16:28:41.000+02:00
diff --git a/R/helpers-ppc.R b/R/helpers-ppc.R
@@ -65,6 +65,32 @@ validate_yrep <- function(yrep, y) {
 }
 
 
+#' Validate PIT
+#'
+#' Checks that `pit` is numeric, doesn't have any NAs, and is either a vector,
+#' or a 1-D array with values in [0,1].
+#'
+#' The NA check runs before the range check: comparisons such as `pit > 1`
+#' evaluate to `NA` for missing entries, which would otherwise make the range
+#' condition itself fail instead of reporting the missing values.
+#'
+#' @param pit The 'pit' object from the user.
+#' @return Either throws an error or returns `pit` with its names removed
+#'   (via `unname()`).
+#' @noRd
+validate_pit <- function(pit) {
+  stopifnot(is.numeric(pit))
+
+  if (!is_vector_or_1Darray(pit)) {
+    abort("'pit' must be a vector or 1D array.")
+  }
+
+  # Must precede the range check, see the note above.
+  if (anyNA(pit)) {
+    abort("NAs not allowed in 'pit'.")
+  }
+
+  if (any(pit > 1) || any(pit < 0)) {
+    abort("'pit' must only contain values between 0 and 1.")
+  }
+
+  unname(pit)
+}
+
 #' Validate group
 #'
 #' Checks that grouping variable has same length as `y` and is either a vector or
@@ -250,6 +276,143 @@ all_counts <- function(x, ...) {
   all_whole_number(x, ...) && min(x) >= 0
 }
 
+
+# Compute the adjusted coverage parameter 'gamma'.
+#
+# Validates the inputs and then dispatches on the number of samples: a single
+# sample (L == 1) is handled by adjust_gamma_optimize(), anything else by
+# adjust_gamma_simulate().
+# N - length of samples.
+# L - number of samples.
+# K - number of evaluation points (defaults to N).
+# prob - must lie strictly between 0 and 1; forwarded to the chosen method.
+# M - number of simulations; only forwarded to adjust_gamma_simulate().
+# adj_method - accepted by the signature but not used in this body.
+adjust_gamma <- function(N,
+                         L = 1,
+                         K = N,
+                         prob = 0.99,
+                         M = 1000,
+                         adj_method = "interpolate") {
+  sizes <- c(K, N, L)
+  if (any(sizes < 1)) {
+    abort("Parameters 'N', 'L' and 'K' must be positive integers.")
+  }
+  if (prob >= 1 || prob <= 0) {
+    abort("Value of 'prob' must be in (0,1).")
+  }
+  # Guard clause for the single-sample case; everything else is simulated.
+  if (L == 1) {
+    return(adjust_gamma_optimize(N, K, prob))
+  }
+  adjust_gamma_simulate(N, L, K, prob, M)
+}
+
+# Adjust coverage parameter to find simultaneous confidence intervals for the
+# ECDF of a sample from the uniform distribution.
+# N - length of samples
+# K - number of equally spaced evaluation points, i.e. the right ends of the
+# partition intervals.
+# prob - target value that the total interior probability is matched against.
+# Returns the minimiser found by optimize() over the interval (0, 1 - prob).
+adjust_gamma_optimize <- function(N, K, prob=0.99) {
+  target <- function(gamma, prob, N, K) {
+    # Interior evaluation points. seq_len() leaves 'z' empty when K == 1,
+    # whereas 1:(K - 1) would count down and yield c(1, 0).
+    z <- seq_len(K - 1) / K
+    z1 <- c(0, z)
+    z2 <- c(z, 1)
+
+    # pre-compute quantiles and use symmetry for increased efficiency.
+    x2_lower <- qbinom(gamma / 2, N, z2)
+    # Upper limits at the interior points are the mirrored lower limits.
+    # Negative indexing (instead of 2:K) stays empty for K == 1. In the last
+    # step z2 = 1, so p_interior() evaluates dbinom() with success
+    # probability 1 and only the count N carries mass; the last upper limit
+    # is therefore N.
+    x2_upper <- c(N - rev(x2_lower)[-1], N)
+
+    # Compute the total probability of trajectories inside the confidence
+    # intervals. Initialize the set and corresponding probabilities known
+    # to be 0 and 1 for the starting value z1 = 0.
+    x1 <- 0
+    p_int <- 1
+    for (i in seq_along(z1)) {
+      tmp <- p_interior(
+        p_int, x1 = x1, x2 = x2_lower[i]:x2_upper[i],
+        z1 = z1[i], z2 = z2[i], gamma = gamma, N = N
+      )
+      x1 <- tmp$x1
+      p_int <- tmp$p_int
+    }
+    abs(prob - sum(p_int))
+  }
+  # 'prob', 'N' and 'K' travel through optimize()'s '...' into target().
+  optimize(target, c(0, 1 - prob), prob = prob, N = N, K = K)$minimum
+}
+
+# Adjust coverage parameter to find simultaneous confidence intervals for the
+# ECDFs of multiple samples (chains) from the uniform distribution.
+# N - length of samples (chains).
+# L - number of samples (chains).
+# K - number of equally spaced evaluation points, i.e. the right ends of the
+# partition intervals.
+# prob - the (1 - prob) quantile of the simulated values is returned, as
+# computed by alpha_quantile().
+# M - number of simulations used to determine the 'prob' middle quantile.
+adjust_gamma_simulate <- function(N, L, K, prob = 0.99, M = 1000) {
+  eval_points <- (1:(K - 1)) / K
+  n_other <- N * (L - 1)
+  n_drawn <- floor(eval_points * N * L)
+
+  # One replication: draw L uniform samples of length N, rank-transform them
+  # jointly, count the values at or below each evaluation point per sample
+  # and take twice the smallest hypergeometric tail probability.
+  simulate_once <- function(iteration) {
+    ranks <- u_scale(replicate(L, runif(N)))
+    counts <- apply(outer(ranks, eval_points, "<="), c(2, 3), sum)
+    lower_tail <- apply(
+      counts, 1, phyper, m = N, n = n_other, k = n_drawn
+    )
+    upper_tail <- apply(
+      counts - 1, 1, phyper, m = N, n = n_other, k = n_drawn,
+      lower.tail = FALSE
+    )
+    2 * min(lower_tail, upper_tail)
+  }
+
+  simulated <- vapply(seq_len(M), simulate_once, numeric(1))
+  alpha_quantile(simulated, 1 - prob)
+}
+
+# Propagate interior probabilities from one evaluation point to the next.
+# p_int - probabilities attached to the counts in 'x1'.
+# x1 - counts at the left evaluation point 'z1'.
+# x2 - counts considered at the right evaluation point 'z2'.
+# gamma - part of the signature but not used in this body.
+# N - sample length.
+# Returns a list with 'p_int', the binomially weighted probabilities summed
+# over 'x1' for every element of 'x2', and 'x1', which is set to 'x2'.
+p_interior <- function(p_int, x1, x2, z1, z2, gamma, N) {
+  n_targets <- length(x2)
+  step_prob <- (z2 - z1) / (1 - z1)
+  # Rows of the matrices below index 'x2', columns index 'x1'.
+  remaining <- rep(N - x1, each = n_targets)
+  weights <- rep(p_int, each = n_targets)
+  increments <- outer(x2, x1, "-")
+  joint <- weights * dbinom(increments, remaining, step_prob)
+
+  list(p_int = rowSums(joint), x1 = x2)
+}
+
+# 100 * `alpha` percent of the trials are allowed to be rejected.
+# In case of ties, return the largest value dominating at most
+# 100 * (alpha + tol) percent of the values.
+# gamma - numeric vector of values to take the quantile of.
+# alpha - probability passed to quantile().
+# tol - width of the window above 'alpha' used to detect ties.
+alpha_quantile <- function(gamma, alpha, tol = 0.001) {
+  a <- unname(quantile(gamma, probs = alpha))
+  # Cap the shifted probability at 1: quantile() rejects 'probs' above 1,
+  # which previously made this function fail whenever alpha > 1 - tol.
+  a_tol <- unname(quantile(gamma, probs = min(alpha + tol, 1)))
+  # Equal quantiles indicate a tie at 'a'; step down if anything is smaller.
+  if (a == a_tol && min(gamma) < a) {
+    # take the largest value that doesn't exceed the tolerance.
+    a <- max(gamma[gamma < a])
+  }
+  a
+}
+
+# Compute simultaneous confidence intervals for one or more samples from the
+# standard uniform distribution.
+# N - sample length
+# L - number of samples
+# K - size of uniform partition defining the ECDF evaluation points.
+# gamma - coverage parameter for the marginal distribution (binomial for
+# one sample and hypergeometric for multiple rank transformed chains).
+# Returns a list with elements 'lower' and 'upper', each holding one limit
+# per point of the grid seq(0, 1, length.out = K + 1).
+#' @noRd
+ecdf_intervals <- function(N, L=1, K, gamma) {
+  grid <- seq(0, 1, length.out = K + 1)
+  tails <- list(lower = gamma / 2, upper = 1 - gamma / 2)
+
+  # Pick the marginal quantile function once, then apply it to both tails.
+  marginal_quantile <- if (L == 1) {
+    function(p) qbinom(p, N, grid)
+  } else {
+    n_other <- N * (L - 1)
+    n_drawn <- floor(grid * L * N)
+    function(p) qhyper(p, N, n_other, n_drawn)
+  }
+
+  lapply(tails, marginal_quantile)
+}
+
+#' Transform observations in 'x' into their corresponding fractional ranks.
+#'
+#' Ranks are computed jointly over all elements of `x` and divided by
+#' `length(x)`.
+#'
+#' @param x Values to rank; may or may not carry a `dim` attribute.
+#' @return The fractional ranks, shaped with the `dim` and `dimnames` of `x`,
+#'   or a plain vector when `x` has no `dim` attribute.
+#' @noRd
+u_scale <- function(x) {
+  scaled <- rank(x) / length(x)
+  if (is.null(dim(x))) {
+    # array() rejects `dim = NULL`, so dimensionless input is returned as is.
+    return(scaled)
+  }
+  array(scaled, dim = dim(x), dimnames = dimnames(x))
+}
+
 # labels ----------------------------------------------------------------
 create_yrep_ids <- function(ids) paste('italic(y)[rep] (', ids, ")")
 yrep_label <- function() expression(italic(y)[rep])