improved cvlar, but it doesn't give same results as glmnet

joftius · joftius · commit bf1fa5e76dcd · 2018-06-02T23:43:08.000-04:00
diff --git a/forLater/josh/selectiveInference/R/cv.R b/forLater/josh/selectiveInference/R/cv.R
@@ -19,7 +19,7 @@ cvMakeFolds <- function(x, nfolds = 5) {
 }
 
 # To interface with glmnet
-foldid <- function(folds) {
+foldidglmnet <- function(folds) {
   n <- sum(sapply(folds, length))
   glmnetfoldid <- rep(0, n)
   for (ind in 1:length(folds)) {
@@ -28,6 +28,18 @@ foldid <- function(folds) {
   glmnetfoldid
 }
 
+# cv.glmnet and estimateSigma mashup
+cvglmnetlar <- function(x, y, foldid) {
+  cvfit <- cv.glmnet(x, y, intercept = FALSE, foldid = foldid)
+  lamhat <- cvfit$lambda.min
+  yhat <- predict(cvfit, x, s = lamhat)
+  nz <- sum(coef(cvfit, s = lamhat) !=0)
+  cvfit$sigma <- sqrt(sum((y-yhat)^2)/(length(y)-nz-1))
+  cvfit$df <- nz
+  return(cvfit)
+}
+
+
 #--------------------------------------
 # Functions for computing quadratic form for cv-error
 #--------------------------------------
@@ -59,6 +71,8 @@ cvProductHat <- function(folds, inds, finds, ginds, hat_matrices) {
     return(Reduce('+', terms))
 }
 
+# This is too "clever," I can't easily understand it
+# simpler code is preferable for maintenance and forking etc
 cvRSSquad <- function(x, folds, active.sets) {
     hat_matrices <- cvHatMatrix(x, folds, active.sets)
     nfolds <- length(folds)
@@ -78,9 +92,28 @@ cvRSSquad <- function(x, folds, active.sets) {
     return(Q)
 }
 
+cvopt <- function(x, y, maxsteps, folds, active.sets) {
+  # Pick the step count minimizing the quadratic form y'Qy built by cvRSSquad.
+  RSSquads <- vector("list", maxsteps)
+  # Can this loop be optimized with smart updating of each model along each path?
+  for (s in seq_len(maxsteps)) {
+    initial.active <- lapply(active.sets, function(a) a[1:s])
+    RSSquads[[s]] <- cvRSSquad(x, folds, initial.active)
+  }
+  # One scalar y'Qy per candidate step count; the smallest one is selected.
+  RSSs <- vapply(RSSquads, function(Q) drop(t(y) %*% Q %*% y), numeric(1))
+  sstar <- which.min(RSSs)
+  quadstar <- RSSquads[[sstar]]
+  # Returned quads are differences from the selected one (sstar's is dropped).
+  RSSquads <- lapply(RSSquads, function(quad) quad - quadstar)
+  RSSquads[[sstar]] <- NULL # remove the all zeroes case
+  return(list(sstar = sstar, RSSquads = RSSquads))
+}
+
 
 #--------------------------------------
 # Functions for forward stepwise
+# broke this while making cvlar
 #--------------------------------------
 
 cvfs <- function(x, y, index = 1:ncol(x), maxsteps, sigma = NULL, intercept = TRUE, center = TRUE, normalize = TRUE, nfolds = 5) {
@@ -130,18 +163,7 @@ cvfs <- function(x, y, index = 1:ncol(x), maxsteps, sigma = NULL, intercept = TR
     }
     #projections <- do.call(c, projections)
 
-    RSSquads <- list()
-    for (s in 1:maxsteps) {
-        initial.active <- lapply(active.sets, function(a) a[1:s])
-        RSSquads[[s]] <- cvRSSquad(X, folds, initial.active)
-    }
-
-    RSSs <- lapply(RSSquads, function(Q) t(Y) %*% Q %*% Y)
-    sstar <- which.min(RSSs)
-    quadstar <- RSSquads[sstar][[1]]
 
-    RSSquads <- lapply(RSSquads, function(quad) quad - quadstar)
-    RSSquads[[sstar]] <- NULL # remove the all zeroes case
 
     fit <- groupfs(X, Y, index=index, maxsteps=sstar, sigma=sigma, intercept=intercept, center=center, normalize=normalize)
     fit$cvobj <- cvobj
@@ -157,54 +179,240 @@ cvfs <- function(x, y, index = 1:ncol(x), maxsteps, sigma = NULL, intercept = TR
 # Functions for lar
 #--------------------------------------
 
-cvlar <- function(x, y, maxsteps) { # other args
-    folds <- cvMakeFolds(x)
-    models <- lapply(folds, function(fold) {
-        x.train <- x
-        y.train <- y
-        x.train[fold,] <- 0
-        y.train[fold] <- 0
-        x.test <- x[fold,]
-        y.test <- y[fold]
-        larpath.train <- lar(x.train, y.train, maxsteps = maxsteps, intercept = F, normalize = F)
-        return(larpath.train)
-    })
+cvlar <- function(x, y, maxsteps, folds = NULL) { # other args
+  this.call = match.call()
+  if (is.null(folds)) folds <- cvMakeFolds(x)
+  models <- lapply(folds, function(fold) {
+    x.train <- x
+    y.train <- y
+    x.train[fold,] <- 0
+    y.train[fold] <- 0
+    x.test <- x[fold,]
+    y.test <- y[fold]
+    larpath.train <- lar(x.train, y.train, maxsteps = maxsteps, intercept = F, normalize = F)
+    return(larpath.train)
+  })
+  
+  active.sets <- lapply(models, function(model) model$action)
+  #lambdas <- lapply(models, function(model) model$lambda)
+  #lmin <- min(unlist(lambdas))
+  cvmin <- cvopt(x, y, maxsteps, folds, active.sets)
+  sstar <- cvmin$sstar
+  fit <- lar(x, y, maxsteps=sstar, intercept = F, normalize = F)
+  fit$ols <- lsfit(x[, fit$action, drop = F], y, intercept = F)
+  names(fit$ols$coefficients) <- fit$action
+  fit$sigma <- sqrt(sum((fit$ols$residuals)^2)/(length(y)-length(fit$action)-1))
+  fit$RSSquads <- cvmin$RSSquads
+  # tall Gamma encoding all cv-model paths
+  fit$tallGamma <- do.call(rbind, lapply(models, function(model) return(model$Gamma)))
+  fit$khat <- sstar
+  fit$folds <- folds
+  fit$call <- this.call
+  class(fit) <- "cvlar"
+  # more to do here?
+  return(fit)
+}
+
+# cvlarInf <- function(obj, ...) {
+#   pv.unadj <- larInf(obj, type = "all", k = obj$khat, verbose = T, ...)
+#   obj$Gamma <- rbind(obj$Gamma, obj$tallGamma)
+#   pv.adj <- larInf(obj, type = "all", k = obj$khat, verbose = T, ...)
+#   return(list(pv.unadj = pv.unadj, pv.adj = pv.adj))
+# }
+
+cvlarInf <- function (obj, sigma, alpha = 0.1,
+                    k = NULL,
+                    gridrange = c(-100, 100),
+                    bits = NULL, mult = 2,
+                    ntimes = 2, verbose = FALSE) {
+  # P-values for the first obj$khat LAR variables of a "cvlar" fit: pv uses the
+  # constraints rbind(obj$Gamma, obj$tallGamma); pv.unadj is plain larInf output.
+  this.call = match.call()
+  #checkargs.misc(sigma = sigma, alpha = alpha, k = k, gridrange = gridrange, mult = mult, ntimes = ntimes)
+  if (!inherits(obj, "cvlar"))
+    stop("obj must be an object of class cvlar")
+  if (!is.null(bits) && !requireNamespace("Rmpfr", quietly = TRUE)) {
+    warning("Package Rmpfr is not installed, reverting to standard precision")
+    bits = NULL
+  }
+  x = obj$x
+  y = obj$y
+  n = nrow(x)
+  #nk = obj$nk
+  sx = obj$sx
+  k = obj$khat        # NOTE: the k argument is always replaced by obj$khat
+  sigma = obj$sigma   # NOTE: the sigma argument is always replaced by obj$sigma
+  # may the gods of OOP have mercy on us
+  class(obj) <- "lar"
+  pv.unadj <- larInf(obj, type = "all", sigma = sigma, k = obj$khat)
+  class(obj) <- "cvlar"
+  #pv.spacing = pv.modspac = pv.covtest = khat = NULL
+  
+  G = rbind(obj$Gamma, obj$tallGamma) #G[1:nk[k], ]
+  u = rep(0, nrow(G))
+  kk = k
+  pv = vlo = vup = numeric(kk)
+  vmat = matrix(0, kk, n)
+  ci = tailarea = matrix(0, kk, 2)
+  sign = numeric(kk)
+  vars = obj$action[1:kk]
+  xa = x[, vars, drop = FALSE]  # stay a matrix even when only one variable is kept
+  M = pinv(crossprod(xa)) %*% t(xa)
+  for (j in seq_len(kk)) {
+    if (verbose) 
+      cat(sprintf("Inference for variable %i ...\n", 
+                  vars[j]))
+    vj = M[j, ]
+    mj = sqrt(sum(vj^2))
+    vj = vj/mj
+    sign[j] = sign(sum(vj * y))
+    vj = sign[j] * vj
+    Gj = rbind(G, vj)
+    uj = c(u, 0)
+    a = poly.pval(y, Gj, uj, vj, sigma, bits)
+    pv[j] = a$pv
+    sxj = sx[vars[j]]
+    vlo[j] = a$vlo * mj/sxj
+    vup[j] = a$vup * mj/sxj
+    vmat[j, ] = vj * mj/sxj
+    
+    #a = poly.int(y, Gj, uj, vj, sigma, alpha, gridrange = gridrange, flip = (sign[j] == -1), bits = bits)
+    #ci[j, ] = a$int * mj/sxj
+    #tailarea[j, ] = a$tailarea
+  }
+  out = list(type = "all", k = k, khat = obj$khat, pv = pv, 
+             pv.unadj = pv.unadj, vlo = vlo, vup = vup, vmat = vmat, 
+             y = y, vars = vars, sign = sign, sigma = sigma, 
+             alpha = alpha, call = this.call)
+  class(out) = "cvlarInf"
+  return(out)
+}
 
-    active.sets <- lapply(models, function(model) model$action)
-    lambdas <- lapply(models, function(model) model$lambda)
-    lmin <- min(unlist(lambdas))
 
-    # Interpolate lambda grid or parametrize by steps?
-    # interpolation probably requires re-writing cvRSSquads for
-    # penalized fits in order to make sense
 
-    # do steps for now just to have something that works?
+poly.pval <- function(y, G, u, v, sigma, bits=NULL) {
+  z = sum(v*y)
+  vv = sum(v^2)
+  sd = sigma*sqrt(vv)
+  
+  rho = G %*% v / vv
+  vec = (u - G %*% y + rho*z) / rho
+  vlo = suppressWarnings(max(vec[rho>0]))
+  vup = suppressWarnings(min(vec[rho<0]))
+  
+  pv = tnorm.surv(z,0,sd,vlo,vup,bits)
+  return(list(pv=pv,vlo=vlo,vup=vup))
+}
 
-    RSSquads <- list()
-    for (s in 1:maxsteps) {
-        initial.active <- lapply(active.sets, function(a) a[1:s])
-        RSSquads[[s]] <- cvRSSquad(x, folds, initial.active)
-    }
+pinv <- function(A, tol=.Machine$double.eps) {
+  # Pseudo-inverse via eigen(A): invert eigenvalues above tol, zero the rest.
+  e = eigen(A)
+  v = Re(e$vectors)
+  d = Re(e$values)
+  d = ifelse(d > tol, 1/d, 0)  # single pass, so a small 1/d is not re-zeroed
+  if (length(d)==1) return(v*d*v)
+  else return(v %*% diag(d) %*% t(v))
+}
 
-    RSSs <- lapply(RSSquads, function(Q) t(y) %*% Q %*% y)
-    sstar <- which.min(RSSs)
-    quadstar <- RSSquads[sstar][[1]]
+tnorm.surv <- function(z, mean, sd, a, b, bits=NULL) {
+  z = max(min(z,b),a)
+  
+  # Check silly boundary cases
+  p = numeric(length(mean))
+  p[mean==-Inf] = 0
+  p[mean==Inf] = 1
+  
+  # Try the multi precision floating point calculation first
+  o = is.finite(mean)
+  mm = mean[o]
+  pp = mpfr.tnorm.surv(z,mm,sd,a,b,bits) 
+  
+  # If there are any NAs, then settle for an approximation
+  oo = is.na(pp)
+  if (any(oo)) pp[oo] = bryc.tnorm.surv(z,mm[oo],sd,a,b)
+  
+  p[o] = pp
+  return(p)
+}
 
-    # Need to add these later?
-    #RSSquads <- lapply(RSSquads, function(quad) quad - quadstar)
-    #RSSquads[[sstar]] <- NULL # remove the all zeroes case
+mpfr.tnorm.surv <- function(z, mean=0, sd=1, a, b, bits=NULL) {
+  # Returns (Phi(b) - Phi(z)) / (Phi(b) - Phi(a)) after standardizing z, a, b
+  # by mean and sd. If bits is not NULL, we are supposed to be using Rmpfr
+  # (note that this will fail if Rmpfr is not installed; but by the time this
+  # function is executed, that should have been checked at a higher level,
+  # and bits would already have been set to NULL if Rmpfr is missing)
+  if (!is.null(bits)) {
+    z = Rmpfr::mpfr((z-mean)/sd, precBits=bits)
+    a = Rmpfr::mpfr((a-mean)/sd, precBits=bits)
+    b = Rmpfr::mpfr((b-mean)/sd, precBits=bits)
+    return(as.numeric((Rmpfr::pnorm(b)-Rmpfr::pnorm(z))/
+                        (Rmpfr::pnorm(b)-Rmpfr::pnorm(a))))
+  }
+  
+  # Else, just use standard floating point calculations
+  z = (z-mean)/sd
+  a = (a-mean)/sd
+  b = (b-mean)/sd
+  return((pnorm(b)-pnorm(z))/(pnorm(b)-pnorm(a)))
+}
 
-    fit <- lar(x, y, maxsteps=sstar, intercept = F, normalize = F)
+bryc.tnorm.surv <- function(z, mean=0, sd=1, a, b) {
+  z = (z-mean)/sd
+  a = (a-mean)/sd
+  b = (b-mean)/sd
+  n = length(mean)
+  
+  term1 = exp(z*z)
+  o = a > -Inf
+  term1[o] = ff(a[o])*exp(-(a[o]^2-z[o]^2)/2)
+  term2 = rep(0,n)
+  oo = b < Inf
+  term2[oo] = ff(b[oo])*exp(-(b[oo]^2-z[oo]^2)/2)
+  p = (ff(z)-term2)/(term1-term2)
+  
+  # Sometimes the approximation can give wacky p-values,
+  # outside of [0,1] ..
+  #p[p<0 | p>1] = NA
+  p = pmin(1,pmax(0,p))
+  return(p)
+}
 
-    # Very tall Gamma encoding all cv-model paths
-    Gamma <- do.call(rbind, lapply(models, function(model) return(model$Gamma)))
-    fit$Gamma <- rbind(fit$Gamma, Gamma)
-    fit$khat <- sstar
-    fit$folds <- folds
-    # more to do here
-    return(fit)
+ff <- function(z) {
+  return((z^2+5.575192695*z+12.7743632)/
+           (z^3*sqrt(2*pi)+14.38718147*z*z+31.53531977*z+2*12.77436324))
 }
 
-cvlarInf <- function(obj) {
-  larInf(obj, type = "all", k = obj$khat)
+
+
+print.cvlar <- function(x, ...) {
+  cat("\nCall:\n")
+  dput(x$call)
+  
+  cat("\nSequence of LAR moves:\n")
+  nsteps = length(x$action)
+  tab = cbind(1:nsteps,x$action,x$sign)
+  colnames(tab) = c("Step","Var","Sign")
+  rownames(tab) = rep("",nrow(tab))
+  print(tab)
+  invisible()
 }
+
+print.cvlarInf <- function(x, ...) {
+  cat("\nCall:\n")
+  dput(x$call)
+  
+  cat(sprintf("\nStandard deviation of noise (specified or estimated) sigma = %0.3f\n",
+              x$sigma))
+  
+  
+  cat(sprintf("\nTesting results at step = %i, with alpha = %0.3f\n",x$k,x$alpha))
+  cat("",fill=T)
+  tab = cbind(x$vars,
+              round(x$sign*x$vmat%*%x$y,3),
+              round(x$sign*x$vmat%*%x$y/(x$sigma*sqrt(rowSums(x$vmat^2))),3),
+              round(x$pv,3))
+  colnames(tab) = c("Var", "Coef", "Z-score", "P-value")
+  rownames(tab) = rep("",nrow(tab))
+  print(tab)
+  invisible()
+}