We don’t need to be clever if we ruthlessly apply conditional probability
Two commonplace applications of the assume-and-deduce strategy.
- the incorporation of measurement error into our models.
- estimation of missing data through Bayesian Imputation
library(rethinking)
data(WaffleDivorce)
d <- WaffleDivorce

# Plot divorce rate against median age at marriage, with a vertical
# +/- 1 standard-error segment for each state's measured divorce rate.
plot(d$Divorce ~ d$MedianAgeMarriage, ylim = c(4, 15),
     xlab = "Median age marriage", ylab = "Divorce rate")

# std errors: one segment per state
# FIX: seq_len(nrow(d)) instead of 1:nrow(d) (safe when the data frame is empty)
for (i in seq_len(nrow(d))) {
  ci <- d$Divorce[i] + c(-1, 1) * d$Divorce.SE[i]
  x <- d$MedianAgeMarriage[i]
  lines(c(x, x), ci)
}
Information flows among the measurements to provide improved estimates of the data itself.
To incorporate measurement error, recognize that we can replace the observed data for divorce rate with a distribution. When there is uncertainty about the true value, that uncertainty can be replaced by a distribution that represents the information we have.
If you wanted to simulate measurement error, you would assign a distribution to each observation and sample from it.
Example: Gaussian distribution with mean equal to the observed value and standard deviation equal to the measurement’s standard error.
# Measurement error on the OUTCOME variable.
# The true divorce rate (div_est, one per state) is a parameter; the
# observed value is modeled as a Gaussian draw centered on the truth
# with the state's known standard error.
dlist <- list(
div_obs = d$Divorce,
div_sd = d$Divorce.SE,
R = d$Marriage,
A = d$MedianAgeMarriage
)
m14.1 <- map2stan(
alist(
# regression on the latent true divorce rate
div_est ~ dnorm(mu, sigma),
mu <- a + bA*A + bR*R,
# measurement model: observed value ~ Normal(truth, known SE)
div_obs ~ dnorm(div_est, div_sd),
a ~ dnorm(0, 10),
bA ~ dnorm(0, 10),
bR ~ dnorm(0, 10),
sigma ~ dcauchy(0, 2.5)
),
data = dlist,
# initialize each latent true value at its observed value
start=list(div_est=dlist$div_obs),
WAIC=FALSE, iter=5000, warmup=1000, chains=2, cores=2,
control=list(adapt_delta=0.95))
precis(m14.1, depth=2)
What happens when there is measurement error on predictor variables as well?
# Measurement error on the outcome AND a predictor.
# Both the true divorce rate (div_est) and the true marriage rate
# (mar_est) are latent parameters with known observation SEs.
dlist <- list(
div_obs = d$Divorce,
div_sd = d$Divorce.SE,
mar_obs = d$Marriage,
mar_sd = d$Marriage.SE,
A = d$MedianAgeMarriage
)
m14.2 <- map2stan(
alist(
div_est ~ dnorm(mu, sigma),
# the regression uses the estimated (true) marriage rate, indexed per case
mu <- a + bA*A + bR*mar_est[i],
# measurement models for outcome and predictor
div_obs ~ dnorm(div_est, div_sd),
mar_obs ~ dnorm(mar_est, mar_sd),
a ~ dnorm(0, 10),
bA ~ dnorm(0, 10),
bR ~ dnorm(0, 10),
sigma ~ dcauchy(0, 2.5)
),
data=dlist,
# start both latent vectors at their observed values
start=list(div_est=dlist$div_obs, mar_est=dlist$mar_obs),
WAIC=FALSE, iter=5000, warmup=1000, chains=3, cores=3,
control=list(adapt_delta=0.95))
Error on the predictor did not change the major inference. It did provide updated estimates of marriage rate itself.
When you have a distribution of values don’t reduce it down to a single value to use in a regression. Use the entire distribution. Do not average, instead model.
Information can flow from present data to missing data, as long as we are willing to make a model of the whole variable.
Use Bayesian imputation to conserve and use information, and to produce estimates for missing values.
MCAR Missing Completely at Random imputation
Simultaneously model the predictor variable that has missing data together with the outcome variable. The present values will produce estimates that comprise a prior for each missing value. These priors will then be updated by the relationship between the predictor and the outcome. So there will be a posterior distribution for each missing value.
library(rethinking)
data(milk)
d <- milk
# rescale percent to a proportion in [0, 1]
d$neocortex.prop <- d$neocortex.perc / 100
d$logmass <- log(d$mass)
# prep data — neocortex contains NAs; because neocortex is given its own
# distribution in the model below, map2stan imputes the missing values
data_list <- list(
kcal = d$kcal.per.g,
neocortex = d$neocortex.prop,
logmass = d$logmass)
# fit model with Bayesian imputation of missing neocortex values
m14.3 <- map2stan(
alist(
kcal ~ dnorm(mu, sigma),
mu <- a + bN*neocortex + bM*logmass,
# prior / imputation distribution for the predictor with missing values
neocortex ~ dnorm(nu, sigma_N),
a ~ dnorm(0, 100),
c(bN, bM) ~ dnorm(0, 10),
nu ~ dnorm(0.5, 1),
sigma_N ~ dcauchy(0,1),
sigma ~ dcauchy(0,1)
),
data=data_list, iter=1e4, chains=2)
# depth=2 shows the imputed neocortex values as well
precis(m14.3, depth=2)
Make a model that accounts for the association among the predictors themselves.
# Like m14.3, but the imputation model for neocortex now conditions on
# logmass, exploiting the association among the predictors themselves.
m14.4 <- map2stan(
alist(
kcal ~ dnorm(mu, sigma),
mu <- a + bN*neocortex + bM*logmass,
# imputation distribution: neocortex regressed on logmass
neocortex ~ dnorm(nu, sigma_N),
nu <- a_N + gM*logmass,
a ~ dnorm(0, 100),
c(bN, bM, gM) ~ dnorm(0,10),
a_N ~ dnorm(0.5, 1),
sigma_N ~ dcauchy(0, 1),
sigma ~ dcauchy(0,1)
),
data=data_list, iter=1e4, chains=2)
precis(m14.4, depth=2)
In many cases, it is more plausible that missing values are not randomly distributed across cases. Certain values of outcomes or predictors are more likely to induce missingness.
\[
\begin{aligned}
T_i &\sim \text{Poisson}(\mu_i) \\
\log \mu_i &= \alpha + \beta \, \log P_{\text{est},i} \\
\log P_{\text{obs},i} &\sim \text{Normal}(\log P_{\text{est},i},\ P_{\text{SE},i}) \\
\alpha &\sim \text{Normal}(0, 1) \\
\beta &\sim \text{Normal}(0, 1)
\end{aligned}
\]
\[
\begin{aligned}
T_i &\sim \text{Poisson}(\mu_i) \\
\log \mu_i &= \alpha + \beta \, \log P_i \\
\log P_i &\sim \text{Normal}(\nu,\ \sigma_{\log P}) \\
\alpha &\sim \text{Normal}(0, 10) \\
\beta &\sim \text{Normal}(0, 1) \\
\nu &\sim \text{Normal}(0.5, 1) \\
\sigma_{\log P} &\sim \text{HalfCauchy}(0, 1)
\end{aligned}
\]
We assume that the location of the missing values is completely random with respect to those values and all other values in the data.
data(milk)
d <- milk
d$neocortex.prop <- d$neocortex.perc / 100
d$logmass <- log(d$mass)
# prep data — neocortex contains NAs, to be imputed by the model
data_list <- list(
kcal = d$kcal.per.g,
neocortex = d$neocortex.prop,
logmass = d$logmass)
# Imputation model (same structure as m14.3).
m14_imputation <- map2stan(
alist(
kcal ~ dnorm(mu, sigma),
# FIX: the neocortex term must use bN (was bM, which duplicated the
# logmass coefficient and left the declared bN prior unused)
mu <- a + bN*neocortex + bM*logmass,
neocortex ~ dnorm(nu, sigma_N),
a ~ dnorm(0, 100),
c(bN, bM) ~ dnorm(0,10),
nu ~ dnorm(0.5, 1),
sigma_N ~ dcauchy(0,1),
sigma ~ dcauchy(0, 1)
),
data=data_list, iter=1e4, chains=2)
# prep data — complete-case analysis: drop rows where neocortex is missing
d_complete <- d[ complete.cases(d$neocortex.prop),]
data_list_c <- list(
kcal = d_complete$kcal.per.g,
neocortex = d_complete$neocortex.prop,
logmass = d_complete$logmass)
# Same regression without imputation, fit only to complete cases.
m14_complete <- map2stan(
alist(
kcal ~ dnorm(mu, sigma),
mu <- a + bN*neocortex + bM*logmass,
a ~ dnorm(0, 100),
c(bN, bM) ~ dnorm(0, 10),
sigma ~ dcauchy(0,1)
),
data=data_list_c, iter=1e4, chains=2)
# NOTE(review): the models are fit to different numbers of rows, so this
# WAIC comparison should be interpreted with care.
compare(m14_imputation, m14_complete)
The WAIC score goes down for the model with imputation, and that model receives almost all of the Akaike weight.
library(rethinking)
data(WaffleDivorce)
d <- WaffleDivorce
# Same measurement-error model as m14.1, but with the standard errors
# doubled to see how noisier measurements affect inference.
dlist <- list(
div_obs=d$Divorce,
div_sd=d$Divorce.SE * 2,
R=d$Marriage,
A=d$MedianAgeMarriage
)
m14.1 <- map2stan(
# FIX: "alist(" was broken across two lines as "a" + "list(", which is a
# syntax/semantic error — alist must be a single call
alist(
div_est ~ dnorm(mu, sigma),
mu <- a + bA*A + bR*R,
div_obs ~ dnorm(div_est, div_sd),
a ~ dnorm(0,10),
bA ~ dnorm(0,10),
bR ~ dnorm(0, 10),
sigma ~ dcauchy(0, 2.5)
),
data=dlist,
start=list(div_est=dlist$div_obs),
WAIC=FALSE, iter=10000, warmup=2000, chains=2, cores=2,
control=list(adapt_delta=0.95))
precis(m14.1, depth=2)
With the doubled standard errors it was much more difficult to achieve convergence; Rhat stayed above 1.
library(rethinking)
data(elephants)
d <- elephants
#d$log_age <- d$AGE
# Baseline Poisson GLM: matings as a log-linear function of age,
# ignoring measurement error in AGE.
m14h1 <- map2stan(
alist(
MATINGS ~ dpois(lambda),
log(lambda) <- a + bA*AGE,
a ~ dnorm(0, 5),
bA ~ dnorm(0,1)
), data=d, iter=5000, warmup=1000, chains=2, cores=2)
precis(m14h1)
#plot(m14h1)
# Same Poisson model, but AGE is treated as measured with error:
# the observed AGE is a Gaussian draw around a latent true age
# (AGE_est) with an assumed standard error of 5.
m14h1.se <- map2stan(
alist(
MATINGS ~ dpois(lambda),
# regression uses the latent true age, indexed per case
log(lambda) <- a + bA*AGE_est[i],
# measurement model: observed age ~ Normal(true age, SE = 5)
AGE ~ dnorm(AGE_est, 5),
a ~ dnorm(0, 5),
bA ~ dnorm(0,1)
), data=d, start=list(AGE_est=d$AGE), iter=5000, warmup=1000, chains=2, cores=2)
precis(m14h1.se)
#plot(m14h1)
# Posterior-predictive matings under the measurement-error model,
# averaged per elephant.
matings_sample <- sim(m14h1.se)
matings_est <- apply(matings_sample, 2, mean)

# FIX: age_seq must be defined before its first use below (it was
# originally assigned only after lambda1 had already referenced it).
age_seq <- seq(25, 55, by=0.5)

# Trend from the no-error model.
# FIX: use post1$bA explicitly (post1$b only worked via $ partial matching).
post1 <- extract.samples(m14h1)
lambda1 <- sapply(age_seq, function(x) exp(post1$a + x*post1$bA))
lambda1.avg <- apply(lambda1, 2, mean)
lambda1.PI <- apply(lambda1, 2, PI)

# Trend and shrunken age estimates from the measurement-error model.
post <- extract.samples(m14h1.se)
age_estimated <- apply(post$AGE_est, 2, mean)

plot(1, 1, xlab='age', ylab='mating', xlim=c(25, 55), ylim=c(0, 10), type='n')
points(d$AGE, d$MATINGS, pch=16, col='blue')   # observed data
points(age_estimated, matings_est)             # shrunken estimates

lambda <- sapply(age_seq, function(x) exp(post$a + x*post$bA))
lambda.avg <- apply(lambda, 2, mean)
lambda.PI <- apply(lambda, 2, PI)

lines(age_seq, lambda1.avg, col=col.alpha(rangi2, 0.4))
shade(lambda1.PI, age_seq, col=col.alpha(rangi2, 0.4))
lines(age_seq, lambda.avg, type='l', lty=2)
shade(lambda.PI, age_seq)
Even though the confidence interval does not change much, the estimated measurements are pulled much closer to the inferred trend, due to shrinkage.
# Same measurement-error model, but with the assumed age SE inflated to
# 35 — a deliberately extreme value to see how inference degrades.
m14h2.se <- map2stan(
alist(
MATINGS ~ dpois(lambda),
log(lambda) <- a + bA*AGE_est[i],
# measurement model with a very large (SE = 35) observation error
AGE ~ dnorm(AGE_est, 35),
a ~ dnorm(0, 5),
bA ~ dnorm(0,1)
), data=d, start=list(AGE_est=d$AGE), iter=5000, warmup=1000, chains=2, cores=2)
precis(m14h2.se, depth=2)
#plot(m14h1)
# Simulated example: 10 clean (x, y) pairs plus one case with a missing
# x and an extreme outcome y = 100.
set.seed(100)
x <- c(rnorm(10), NA)
y <- c(rnorm(10, x), 100)
d <- list(x=x, y=y)
# Complete-case model: drop the observation with the missing predictor.
m14h3.filtered <- map2stan(
alist(
y ~ dnorm(mu, sigma),
mu <- a + b*x,
a ~ dnorm(0, 100),
b ~ dnorm(0, 100),
sigma ~ dcauchy(0, 1)
# FIX: iter/chains/cores were accidentally placed inside the data list
# by a misplaced parenthesis; they are map2stan arguments
), data=list(x=x[1:10], y=y[1:10]), iter=3000, chains=2, cores=2)
# Imputation model: keep all 11 cases; the missing x gets a standard
# normal prior and is imputed jointly with the regression.
m14h3 <- map2stan(
alist(
y ~ dnorm(mu, sigma),
mu <- a + b*x,
# imputation distribution for the predictor with the missing value
x ~ dnorm(0, 1),
a ~ dnorm(0, 100),
b ~ dnorm(0, 100),
sigma ~ dcauchy(0, 1)
), data=d, iter=3000, chains=2, cores=2)
precis(m14h3)
# Compare the posterior for the slope under imputation vs. complete cases.
post14h3 <- extract.samples(m14h3)
dens(post14h3$b)
precis(m14h3.filtered)
post14h3.filtered <- extract.samples(m14h3.filtered)
dens(post14h3.filtered$b)
The model cannot determine the direction of the relationship given the extreme y = 100 case with missing x — it could be positive or negative — hence the two modes in the posterior for b.