# Multilevel Modeling in Stan

There are a few different ways to model data that contains repeated observations for units over time, or that is nested within groups. First, we could simply pool all the data together and ignore the nested structure (pooling). Second, we could consider observations from each group as entirely independent from observations in other groups (no pooling). Finally, we could use information about the similarity of observations within groups to inform our individual estimates (partial pooling).

# Import Data

library(pacman)

register_knitr_engine(override = FALSE) # This registers cmdstanr with knitr so that we can use
# it with R Markdown.

data(radon)

# Pooling

Let’s start with the pooling estimates. As a point of reference, we’ll fit a frequentist model first.

## Frequentist Fit

# fit model

# print results
library(broom)
library(broom.mixed)
tidy(pooling.fit, conf.int = TRUE)
## # A tibble: 2 × 7
##   term       estimate std.error statistic   p.value conf.low conf.high
##   <chr>         <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 (Intercep…    1.36     0.0285     47.7  9.17e-251    1.31      1.42
## 2 floor        -0.586    0.0700     -8.38 1.95e- 16   -0.724    -0.449
# calculate fitted values
radon$pooling.fit <- fitted(pooling.fit) # plot the model results ggplot(data = radon, aes(x = as_factor(floor), y = log_radon, group = county)) + geom_line(aes(y = pooling.fit), color = "black") + geom_point(alpha = 0.3, size = 3, position = position_jitter(width = 0.1, height = 0.2)) + facet_wrap(~ county) + ggpubr::theme_pubr() ## Bayesian Fit Program the model in Stan. data { int n; // number of observations vector[n] log_radon; vector[n] vfloor; } parameters { real alpha; // intercept parameter real beta; // slope parameter real sigma; // variance parameter } model { // conditional mean vector[n] mu; // linear combination mu = alpha + beta * vfloor; // priors alpha ~ normal(0, 100); beta ~ normal(0, 100); sigma ~ uniform(0, 100); // likelihood function log_radon ~ normal(mu, sigma); } generated quantities { vector[n] log_lik; // calculate log-likelihood vector[n] y_rep; // replications from posterior predictive distribution for (i in 1:n) { // generate mpg predicted value real log_radon_hat = alpha + beta * vfloor[i]; // calculate log-likelihood log_lik[i] = normal_lpdf(log_radon[i] | log_radon_hat, sigma); // normal_lpdf is the log of the normal probability density function // generate replication values y_rep[i] = normal_rng(log_radon_hat, sigma); // normal_rng generates random numbers from a normal distribution } } Now we fit this model: # prepare data for stan library(tidybayes) model.data <- radon %>% rename(vfloor = floor) %>% dplyr::select(log_radon, vfloor) %>% compose_data() # fit stan model stan.pooling.fit <- pooling.model$sample(data = model.data, refresh = 0)
## Running MCMC with 4 sequential chains...
##
## Chain 1 finished in 0.6 seconds.
## Chain 2 finished in 0.6 seconds.
## Chain 3 finished in 0.6 seconds.
## Chain 4 finished in 0.7 seconds.
##
## All 4 chains finished successfully.
## Mean chain execution time: 0.6 seconds.
## Total execution time: 3.0 seconds.
library(rstan)
stan.pooling.fit <- read_stan_csv(stan.pooling.fit$output_files()) tidy(stan.pooling.fit) ## # A tibble: 1,841 × 3 ## term estimate std.error ## <chr> <dbl> <dbl> ## 1 alpha 1.36 0.0291 ## 2 beta -0.588 0.0721 ## 3 sigma 0.791 0.0188 ## 4 log_lik[1] -0.691 0.0247 ## 5 log_lik[2] -0.910 0.0281 ## 6 log_lik[3] -0.741 0.0246 ## 7 log_lik[4] -1.97 0.0688 ## 8 log_lik[5] -0.718 0.0243 ## 9 log_lik[6] -0.818 0.0260 ## 10 log_lik[7] -1.32 0.0418 ## # … with 1,831 more rows stan.pooling.values <- stan.pooling.fit %>% # use this model fit spread_draws(alpha, beta, sigma) %>% # extract samples in tidy format median_hdci() # calculate the median HDCI # plot the model results for the median estimates ggplot(data = radon, aes(x = floor, y = log_radon, group = county)) + geom_abline(data = stan.pooling.values, aes(intercept = alpha, slope = beta), color = "black") + geom_point(alpha = 0.3, size = 3, position = position_jitter(width = 0.1, height = 0.2)) + facet_wrap(~ county) + ggpubr::theme_pubr() # plot the model results with posterior samples stan.pooling.samples <- stan.pooling.fit %>% # use this model fit spread_draws(alpha, beta, sigma, n = 20) # extract 20 samples in tidy format ggplot() + geom_abline(data = stan.pooling.samples, aes(intercept = alpha, slope = beta), color = "black", alpha = 0.3) + geom_point(data = radon, aes(x = floor, y = log_radon, group = county), alpha = 0.3, size = 3, position = position_jitter(width = 0.1, height = 0.2)) + facet_wrap(~ county) + ggpubr::theme_pubr() # No Pooling Now lets move to the no pooling estimates. These are sometimes referred to as “fixed-effects” by economists because you control for grouped data structures by including indicator variables to the grouping units. To give you a reference point, we’ll fit this model in a frequentist framework first. ## Frequentist Fit # fit the model without an intercept to include all 85 counties no.pooling.fit <- lm(formula = log_radon ~ floor + as_factor(county) - 1, data = radon) # print results tidy(no.pooling.fit, conf.int = TRUE) ## # A tibble: 86 × 7 ## term estimate std.error statistic p.value conf.low conf.high ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 floor -0.689 0.0706 -9.76 2.22e-21 -0.828 -0.551 ## 2 as_factor… 0.887 0.364 2.44 1.49e- 2 0.173 1.60 ## 3 as_factor… 0.931 0.101 9.23 2.20e-19 0.733 1.13 ## 4 as_factor… 1.55 0.422 3.67 2.57e- 4 0.721 2.38 ## 5 as_factor… 1.59 0.278 5.72 1.50e- 8 1.04 2.13 ## 6 as_factor… 1.45 0.364 4.00 6.92e- 5 0.741 2.17 ## 7 as_factor… 1.54 0.419 3.66 2.64e- 4 0.713 2.36 ## 8 as_factor… 2.02 0.194 10.4 5.87e-24 1.64 2.41 ## 9 as_factor… 2.00 0.365 5.47 5.99e- 8 1.28 2.71 ## 10 as_factor… 1.05 0.230 4.55 6.19e- 6 0.595 1.50 ## # … with 76 more rows # calculate fitted values radon$no.pooling.fit <- fitted(no.pooling.fit)

# plot the model results
ggplot(data = radon, aes(x = as_factor(floor), y = log_radon, group = county)) +
geom_line(aes(y = pooling.fit), color = "black") +
geom_line(aes(y = no.pooling.fit), color = "red") + # add no pooling estimates
geom_point(alpha = 0.3,
size = 3,
position = position_jitter(width = 0.1, height = 0.2)) +
facet_wrap(~ county) +
ggpubr::theme_pubr()

## Bayesian Fit

Program the model in Stan.

data {
int n; // number of observations
int n_county; // number of counties
vector[n] vfloor;
int<lower = 0, upper = n_county> county[n];
}

parameters {
vector[n_county] alpha; // vector of county intercepts
real beta; // slope parameter
real<lower = 0> sigma; // variance parameter
}

model {
// conditional mean
vector[n] mu;

// linear combination
mu = alpha[county] + beta * vfloor;

// priors
alpha ~ normal(0, 100);
beta ~ normal(0, 100);
sigma ~ uniform(0, 100);

// likelihood function
}

generated quantities {
vector[n] log_lik; // calculate log-likelihood
vector[n] y_rep; // replications from posterior predictive distribution

for (i in 1:n) {
// generate mpg predicted value
real log_radon_hat = alpha[county[i]] + beta * vfloor[i];

// calculate log-likelihood
// normal_lpdf is the log of the normal probability density function

// generate replication values
// normal_rng generates random numbers from a normal distribution
}
}

Now we fit this model:

# prepare data for stan
model.data <-
rename(vfloor = floor) %>%
compose_data()

# fit stan model
stan.no.pooling.fit <- no.pooling.model$sample(data = model.data, refresh = 0) ## Running MCMC with 4 sequential chains... ## ## Chain 1 finished in 1.0 seconds. ## Chain 2 finished in 0.9 seconds. ## Chain 3 finished in 1.0 seconds. ## Chain 4 finished in 1.0 seconds. ## ## All 4 chains finished successfully. ## Mean chain execution time: 1.0 seconds. ## Total execution time: 4.4 seconds. rstan.stan.no.pooling.fit <- read_stan_csv(stan.no.pooling.fit$output_files())
tidy(rstan.stan.no.pooling.fit, conf.int = TRUE)
## # A tibble: 1,925 × 5
##    term      estimate std.error conf.low conf.high
##    <chr>        <dbl>     <dbl>    <dbl>     <dbl>
##  1 alpha[1]     0.889    0.370     0.157      1.61
##  2 alpha[2]     0.928    0.0981    0.732      1.12
##  3 alpha[3]     1.55     0.428     0.725      2.39
##  4 alpha[4]     1.58     0.282     1.04       2.15
##  5 alpha[5]     1.46     0.368     0.736      2.16
##  6 alpha[6]     1.53     0.417     0.740      2.33
##  7 alpha[7]     2.03     0.196     1.64       2.41
##  8 alpha[8]     2.00     0.381     1.27       2.75
##  9 alpha[9]     1.05     0.230     0.600      1.49
## 10 alpha[10]    1.57     0.297     0.984      2.18
## # … with 1,915 more rows
stan.nopool.values <-
stan.no.pooling.fit %>% # use this model fit
recover_types(radon) %>% # this matches indexes to original factor levels
spread_draws(alpha[county], beta, sigma) %>% # extract samples in tidy format
median_hdci() # calculate the median HDCI

# plot the model results for the median estimates
ggplot(data = radon, aes(x = floor, y = log_radon, group = county)) +
geom_abline(data = stan.pooling.values,
aes(intercept = alpha, slope = beta), color = "black") +
# add the new estimates in red
geom_abline(data = stan.nopool.values,
aes(intercept = alpha, slope = beta), color = "red") +
geom_point(alpha = 0.3,
size = 3,
position = position_jitter(width = 0.1, height = 0.2)) +
facet_wrap(~ county) +
scale_color_manual(values = c("No Pooling", "Pooling")) +
ggpubr::theme_pubr()

# plot the model results with posterior samples
stan.nopooling.samples <-
stan.no.pooling.fit %>% # use this model fit
spread_draws(alpha[county], beta, sigma, n = 20) # extract 20 samples in tidy format

ggplot() +
geom_abline(data = stan.pooling.samples,
aes(intercept = alpha, slope = beta),
color = "black",
alpha = 0.3) +
geom_abline(data = stan.nopooling.samples,
aes(intercept = alpha, slope = beta),
color = "red",
alpha = 0.3) +
aes(x = floor, y = log_radon, group = county),
alpha = 0.3,
size = 3,
position = position_jitter(width = 0.1, height = 0.2)) +
facet_wrap(~ county) +
ggpubr::theme_pubr()

# Partial Pooling

Now lets use a partial pooling model to estimate the relationship between a floor measurement and log radon level. As opposed to no pooling models, which essentially estimate a separate regression for each group, partial pooling models use information about the variance between groups get produce more accurate estimates for units within groups. So, for example, in counties were there are only 2 observations the partial pooling model will pull the floor estimates towards the mean of the full sample. This leads to less over fitting and more accurate predictions. Partial pooling models are more commonly known as multilevel or hierarchical models.

## Frequentist Fit

Let’s fit the partial pooling model in a frequentist framework:

library(lme4)

# fit the model
partial.pooling.fit <- lmer(formula = log_radon ~ floor + (1 | county),

# print results
tidy(partial.pooling.fit, conf.int = TRUE)
## # A tibble: 4 × 8
##   effect   group term  estimate std.error statistic conf.low conf.high
##   <chr>    <chr> <chr>    <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 fixed    <NA>  (Int…    1.49     0.0496     30.1     1.40      1.59
## 2 fixed    <NA>  floor   -0.663    0.0677     -9.80   -0.795    -0.530
## 3 ran_pars coun… sd__…    0.315   NA          NA      NA        NA
## 4 ran_pars Resi… sd__…    0.726   NA          NA      NA        NA
# calculate fitted values
radon$partial.pooling.fit <- fitted(partial.pooling.fit) # plot the model results ggplot(data = radon, aes(x = as_factor(floor), y = log_radon, group = county)) + geom_line(aes(y = pooling.fit), color = "black") + geom_line(aes(y = no.pooling.fit), color = "red") + # add no pooling estimates geom_line(aes(y = partial.pooling.fit), color = "blue") + # add partial pooling estimates geom_point(alpha = 0.3, size = 3, position = position_jitter(width = 0.1, height = 0.2)) + facet_wrap(~ county) + ggpubr::theme_pubr() ## Bayesian Fit Program the model in Stan. data { int n; // number of observations int n_county; // number of counties vector[n] log_radon; vector[n] vfloor; int<lower = 0, upper = n_county> county[n]; } parameters { vector[n_county] alpha; // vector of county intercepts real beta; // slope parameter real<lower = 0> sigma_a; // variance of counties real<lower = 0> sigma_y; // model residual variance real mu_a; // mean of counties } model { // conditional mean vector[n] mu; // linear combination mu = alpha[county] + beta * vfloor; // priors beta ~ normal(0, 1); // hyper-priors mu_a ~ normal(0, 1); sigma_a ~ cauchy(0, 2.5); sigma_y ~ cauchy(0, 2.5); // level-2 likelihood alpha ~ normal(mu_a, sigma_a); // level-1 likelihood log_radon ~ normal(mu, sigma_y); } generated quantities { vector[n] log_lik; // calculate log-likelihood vector[n] y_rep; // replications from posterior predictive distribution for (i in 1:n) { // generate mpg predicted value real log_radon_hat = alpha[county[i]] + beta * vfloor[i]; // calculate log-likelihood log_lik[i] = normal_lpdf(log_radon[i] | log_radon_hat, sigma_y); // normal_lpdf is the log of the normal probability density function // generate replication values y_rep[i] = normal_rng(log_radon_hat, sigma_y); // normal_rng generates random numbers from a normal distribution } } Now we fit this model: # prepare data for stan model.data <- radon %>% rename(vfloor = floor) %>% dplyr::select(log_radon, vfloor, county) %>% compose_data() # fit stan model stan.partial.pooling.fit <- partial.pooling.model$sample(data = model.data,
refresh = 0)
## Running MCMC with 4 sequential chains...
##
## Chain 1 finished in 1.1 seconds.
## Chain 2 finished in 1.1 seconds.
## Chain 3 finished in 1.1 seconds.
## Chain 4 finished in 1.1 seconds.
##
## All 4 chains finished successfully.
## Mean chain execution time: 1.1 seconds.
## Total execution time: 4.7 seconds.
rstan.stan.partial.pooling.fit <- read_stan_csv(stan.partial.pooling.fit$output_files()) tidy(rstan.stan.partial.pooling.fit, conf.int = TRUE) ## # A tibble: 1,927 × 5 ## term estimate std.error conf.low conf.high ## <chr> <dbl> <dbl> <dbl> <dbl> ## 1 alpha[1] 1.23 0.244 0.738 1.70 ## 2 alpha[2] 0.983 0.0954 0.802 1.17 ## 3 alpha[3] 1.50 0.249 1.02 1.99 ## 4 alpha[4] 1.53 0.206 1.13 1.94 ## 5 alpha[5] 1.47 0.240 0.992 1.94 ## 6 alpha[6] 1.51 0.256 0.982 2.00 ## 7 alpha[7] 1.87 0.164 1.56 2.19 ## 8 alpha[8] 1.70 0.239 1.25 2.19 ## 9 alpha[9] 1.20 0.184 0.830 1.56 ## 10 alpha[10] 1.52 0.215 1.09 1.93 ## # … with 1,917 more rows stan.partialpool.values <- stan.partial.pooling.fit %>% # use this model fit recover_types(radon) %>% # this matches indexes to original factor levels spread_draws(alpha[county], beta, sigma_a, sigma_y) %>% # extract samples in tidy format median_hdci() # calculate the median HDCI # plot the model results for the median estimates ggplot(data = radon, aes(x = floor, y = log_radon, group = county)) + geom_abline(data = stan.pooling.values, aes(intercept = alpha, slope = beta), color = "black") + # no pooling estimates in red geom_abline(data = stan.nopool.values, aes(intercept = alpha, slope = beta), color = "red") + # partial pooling estimates in blue geom_abline(data = stan.partialpool.values, aes(intercept = alpha, slope = beta), color = "blue") + geom_point(alpha = 0.3, size = 3, position = position_jitter(width = 0.1, height = 0.2)) + facet_wrap(~ county) + scale_color_manual(values = c("No Pooling", "Pooling")) + ggpubr::theme_pubr() # plot the model results with posterior samples stan.partialpool.samples <- stan.partial.pooling.fit %>% # use this model fit recover_types(radon) %>% spread_draws(alpha[county], beta, sigma_a, sigma_y, n = 20) # extract 20 samples in tidy format ggplot() + geom_abline(data = stan.pooling.samples, aes(intercept = alpha, slope = beta), color = "black", alpha = 0.3) + geom_abline(data = stan.nopooling.samples, aes(intercept = alpha, slope = beta), color = "red", alpha = 0.3) + geom_abline(data = stan.partialpool.samples, aes(intercept = alpha, slope = beta), color = "blue", alpha = 0.3) + geom_point(data = radon, aes(x = floor, y = log_radon, group = county), alpha = 0.3, size = 3, position = position_jitter(width = 0.1, height = 0.2)) + facet_wrap(~ county) + ggpubr::theme_pubr() # Adding Group-level Predictors Partial pooling also allow us to include group-level predictor variables which can dramatically improve the model’s predictions. Let’s add a group-level predictor to the model. For a group-level predictor, let’s add a measure of the county’s uranium level. ## Frequentist Fit In frequentist, we can fit this model as follows: # fit model partial.pooling.fit.02 <- lmer(formula = log_radon ~ floor + log_uranium + (1 | county), data = radon) # print model results tidy(partial.pooling.fit.02, conf.int = TRUE) ## # A tibble: 5 × 8 ## effect group term estimate std.error statistic conf.low conf.high ## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 fixed <NA> (Int… 1.50 0.0362 41.3 1.43 1.57 ## 2 fixed <NA> floor -0.639 0.0660 -9.67 -0.768 -0.509 ## 3 fixed <NA> log_… 0.697 0.0876 7.96 0.526 0.869 ## 4 ran_pars coun… sd__… 0.148 NA NA NA NA ## 5 ran_pars Resi… sd__… 0.728 NA NA NA NA # calculate fitted values radon$partial.pooling.fit.02 <- fitted(partial.pooling.fit.02)

# plot the model results
ggplot(data = radon, aes(x = as_factor(floor), y = log_radon, group = county)) +
geom_line(aes(y = pooling.fit), color = "black") +
geom_line(aes(y = no.pooling.fit), color = "red") + # add no pooling estimates
# partial pooling with group-level predictor in dashed purple
geom_line(aes(y = partial.pooling.fit.02), color = "purple", lty = 2) +
# partial pooling without group-level predictors in blue
geom_line(aes(y = partial.pooling.fit), color = "blue") + # add partial pooling estimates
geom_point(alpha = 0.3,
size = 3,
position = position_jitter(width = 0.1, height = 0.2)) +
facet_wrap(~ county) +
ggpubr::theme_pubr()

## Bayesian Fit

To program this model in Stan, we’ll use the transformed parameters block to create a conditional mean equation for the level 2 model.

data {
int n; // number of observations
int n_county; // number of counties
vector[n] vfloor;
vector[n] log_uranium;
int<lower = 0, upper = n_county> county[n];
}

parameters {
vector[n_county] alpha; // vector of county intercepts
real b_floor; // slope parameter
real b_uranium;
real<lower = 0> sigma_a; // variance of counties
real<lower = 0> sigma_y; // model residual variance
real mu_a; // mean of counties
}

model {
// conditional mean
vector[n] mu;

// linear combination
mu = alpha[county] + b_floor * vfloor + b_uranium * log_uranium;

// priors
b_floor ~ normal(0, 100);
b_uranium ~ normal(0, 100);

// hyper-priors
mu_a ~ normal(0, 10);
sigma_a ~ cauchy(0, 2.5);
sigma_y ~ cauchy(0, 2.5);

// level-2 likelihood
alpha ~ normal(mu_a, sigma_a);

// level-1 likelihood
}

generated quantities {
vector[n] log_lik; // calculate log-likelihood
vector[n] y_rep; // replications from posterior predictive distribution

for (i in 1:n) {
// generate mpg predicted value
real log_radon_hat = alpha[county[i]] + b_floor * vfloor[i] + b_uranium * log_uranium[i];

// calculate log-likelihood
// normal_lpdf is the log of the normal probability density function

// generate replication values
// normal_rng generates random numbers from a normal distribution
}
}

Now we fit this model:

# prepare data for stan
model.data <-
rename(vfloor = floor) %>%
compose_data()

# fit stan model
stan.partial.pooling.model.02 <- partial.pooling.model.02$sample(data = model.data, refresh = 0) ## Running MCMC with 4 sequential chains... ## ## Chain 1 finished in 1.6 seconds. ## Chain 2 finished in 1.6 seconds. ## Chain 3 finished in 1.7 seconds. ## Chain 4 finished in 1.6 seconds. ## ## All 4 chains finished successfully. ## Mean chain execution time: 1.6 seconds. ## Total execution time: 7.0 seconds. rstan.stan.partial.pooling.fit.02 <- read_stan_csv(stan.partial.pooling.model.02$output_files())
tidy(rstan.stan.partial.pooling.fit.02, conf.int = TRUE)
## # A tibble: 1,928 × 5
##    term      estimate std.error conf.low conf.high
##    <chr>        <dbl>     <dbl>    <dbl>     <dbl>
##  1 alpha[1]      1.47    0.145      1.17      1.76
##  2 alpha[2]      1.51    0.0976     1.32      1.71
##  3 alpha[3]      1.50    0.148      1.22      1.83
##  4 alpha[4]      1.58    0.150      1.36      1.93
##  5 alpha[5]      1.50    0.146      1.22      1.81
##  6 alpha[6]      1.47    0.142      1.17      1.75
##  7 alpha[7]      1.60    0.131      1.39      1.90
##  8 alpha[8]      1.52    0.145      1.27      1.84
##  9 alpha[9]      1.43    0.132      1.15      1.68
## 10 alpha[10]     1.49    0.141      1.21      1.78
## # … with 1,918 more rows
stan.partialpool.02.values <-
stan.partial.pooling.model.02 %>% # use this model fit
recover_types(radon) %>% # this matches indexes to original factor levels
spread_draws(alpha[county], b_floor, b_uranium, sigma_a, sigma_y) %>% # extract samples in tidy format
median_hdci() # calculate the median HDCI

# plot the model results for the median estimates
ggplot(data = radon, aes(x = floor, y = log_radon, group = county)) +
geom_abline(data = stan.pooling.values,
aes(intercept = alpha, slope = beta), color = "black") +
# no pooling estimates in red
geom_abline(data = stan.nopool.values,
aes(intercept = alpha, slope = beta), color = "red") +
# partial pooling estimates in blue
geom_abline(data = stan.partialpool.values,
aes(intercept = alpha, slope = beta), color = "blue") +
geom_abline(data = stan.partialpool.02.values,
aes(intercept = alpha, slope = b_floor), color = "purple", lty = 2) +
geom_point(alpha = 0.3,
size = 3,
position = position_jitter(width = 0.1, height = 0.2)) +
facet_wrap(~ county) +
scale_color_manual(values = c("No Pooling", "Pooling")) +
ggpubr::theme_pubr()

# plot the model results with posterior samples
stan.partialpool.02.samples <-
stan.partial.pooling.model.02 %>% # use this model fit
spread_draws(alpha[county], b_floor, b_uranium, sigma_a, sigma_y, n = 20) # extract 20 samples in tidy format

ggplot() +
geom_abline(data = stan.pooling.samples,
aes(intercept = alpha, slope = beta),
color = "black",
alpha = 0.3) +
geom_abline(data = stan.nopooling.samples,
aes(intercept = alpha, slope = beta),
color = "red",
alpha = 0.3) +
geom_abline(data = stan.partialpool.samples,
aes(intercept = alpha, slope = beta),
color = "blue",
alpha = 0.3) +
geom_abline(data = stan.partialpool.02.samples,
aes(intercept = alpha, slope = b_floor),
color = "purple",
lty = 2,
alpha = 0.3) +
aes(x = floor, y = log_radon, group = county),
alpha = 0.3,
size = 3,
position = position_jitter(width = 0.1, height = 0.2)) +
facet_wrap(~ county) +
ggpubr::theme_pubr()

# Varying Slope Model

There’s no reason that we need to estimate the same slope for every county, and partial pooling models allow us to let the slopes parameters vary between counties. Let’s estimate a final model where we let the effect of floor vary between counties.

## Frequentist Fit

# fit model
partial.pooling.fit.03 <- lmer(formula = log_radon ~ floor + log_uranium +
(1 + floor | county),

# print model results
tidy(partial.pooling.fit.03, conf.int = TRUE)
## # A tibble: 7 × 8
##   effect   group term  estimate std.error statistic conf.low conf.high
##   <chr>    <chr> <chr>    <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 fixed    <NA>  (Int…    1.49     0.0341     43.8     1.43      1.56
## 2 fixed    <NA>  floor   -0.615    0.0838     -7.34   -0.779    -0.451
## 3 fixed    <NA>  log_…    0.745    0.0849      8.77    0.578     0.911
## 4 ran_pars coun… sd__…    0.122   NA          NA      NA        NA
## 5 ran_pars coun… cor_…    0.204   NA          NA      NA        NA
## 6 ran_pars coun… sd__…    0.342   NA          NA      NA        NA
## 7 ran_pars Resi… sd__…    0.719   NA          NA      NA        NA
# calculate fitted values
radon$partial.pooling.fit.03 <- fitted(partial.pooling.fit.03) # plot the model results ggplot(data = radon, aes(x = as_factor(floor), y = log_radon, group = county)) + geom_line(aes(y = pooling.fit), color = "black") + geom_line(aes(y = no.pooling.fit), color = "red") + # add no pooling estimates # partial pooling with group-level predictor in dashed purple geom_line(aes(y = partial.pooling.fit.02), color = "purple", lty = 2) + # partial pooling without group-level predictors in blue geom_line(aes(y = partial.pooling.fit), color = "blue") + # add partial pooling estimates geom_line(aes(y = partial.pooling.fit.03), color = "green") + geom_point(alpha = 0.3, size = 3, position = position_jitter(width = 0.1, height = 0.2)) + facet_wrap(~ county) + ggpubr::theme_pubr() ## Bayesian Fit To program this model in Stan, we’ll need to include the variance covariance matrix for the varying intercept and slope parameters. data { int n; // number of observations int n_county; // number of counties int<lower = 0, upper = n_county> county[n]; // level 1 variables vector[n] log_radon; int vfloor[n]; // level 2 variables real log_uranium[n]; } parameters { vector[n_county] b_floor; vector[n_county] a_county; real b_uranium; real a; real b; vector<lower=0>[2] sigma_county; real<lower=0> sigma; corr_matrix[2] Rho; } model { // conditional mean vector[n] mu; // varying slopes and varying intercepts component { // vector for intercept mean and slope mean vector[2] YY[n_county]; vector[2] MU; MU = [a , b]'; for (j in 1:n_county) YY[j] = [a_county[j], b_floor[j]]'; YY ~ multi_normal(MU , quad_form_diag(Rho , sigma_county)); } // linear model for (i in 1:n) { mu[i] = a_county[county[i]] + b_floor[county[i]] * vfloor[i] + b_uranium * log_uranium[i]; } // priors sigma ~ exponential(3); b_uranium ~ normal(0 , 100); // hyper-priors Rho ~ lkj_corr(1); sigma_county ~ exponential(3); b ~ normal(0 , 100); a ~ normal(0 , 100); // likelihood log_radon ~ normal(mu , sigma); } generated quantities { vector[n] log_lik; // calculate log-likelihood vector[n] y_rep; // replications from posterior predictive distribution for (i in 1:n) { // generate mpg predicted value real log_radon_hat = a_county[county[i]] + b_floor[county[i]] * vfloor[i] + b_uranium * log_uranium[i]; // calculate log-likelihood log_lik[i] = normal_lpdf(log_radon[i] | log_radon_hat, sigma); // normal_lpdf is the log of the normal probability density function // generate replication values y_rep[i] = normal_rng(log_radon_hat, sigma); // normal_rng generates random numbers from a normal distribution } } Now we fit this model: # prepare data for stan model.data <- radon %>% rename(vfloor = floor) %>% dplyr::select(log_radon, vfloor, county, log_uranium) %>% compose_data() # fit stan model stan.partial.pooling.model.03 <- partial.pooling.model.03$sample(data = model.data,
refresh = 0)
## Running MCMC with 4 sequential chains...
##
## Chain 1 finished in 17.3 seconds.
## Chain 2 finished in 5.5 seconds.
## Chain 3 finished in 7.0 seconds.
## Chain 4 finished in 5.5 seconds.
##
## All 4 chains finished successfully.
## Mean chain execution time: 8.8 seconds.
## Total execution time: 35.8 seconds.
rstan.stan.partial.pooling.fit.03 <- read_stan_csv(stan.partial.pooling.model.03\$output_files())
tidy(rstan.stan.partial.pooling.fit.03, conf.int = TRUE)
## # A tibble: 2,018 × 5
##    term        estimate std.error conf.low conf.high
##    <chr>          <dbl>     <dbl>    <dbl>     <dbl>
##  1 b_floor[1]    -0.578     0.275   -1.07    0.0620
##  2 b_floor[2]    -0.696     0.246   -1.27   -0.266
##  3 b_floor[3]    -0.603     0.247   -1.09   -0.0606
##  4 b_floor[4]    -0.509     0.231   -0.939  -0.00409
##  5 b_floor[5]    -0.580     0.273   -1.09    0.0265
##  6 b_floor[6]    -0.636     0.299   -1.23    0.00619
##  7 b_floor[7]    -0.398     0.292   -0.828   0.309
##  8 b_floor[8]    -0.604     0.243   -1.07   -0.0826
##  9 b_floor[9]    -0.529     0.298   -0.974   0.236
## 10 b_floor[10]   -0.688     0.241   -1.22   -0.245
## # … with 2,008 more rows
stan.partialpool.03.values <-
stan.partial.pooling.model.03 %>% # use this model fit
recover_types(radon) %>% # this matches indexes to original factor levels
spread_draws(a_county[county], b_floor[county], b_uranium, sigma_county[], sigma) %>% # extract samples in tidy format
median_hdci() # calculate the median HDCI

# plot the model results for the median estimates
ggplot(data = radon, aes(x = floor, y = log_radon, group = county)) +
geom_abline(data = stan.pooling.values,
aes(intercept = alpha, slope = beta), color = "black") +
# no pooling estimates in red
geom_abline(data = stan.nopool.values,
aes(intercept = alpha, slope = beta), color = "red") +
# partial pooling estimates in blue
geom_abline(data = stan.partialpool.values,
aes(intercept = alpha, slope = beta), color = "blue") +
geom_abline(data = stan.partialpool.02.values,
aes(intercept = alpha, slope = b_floor), color = "purple", lty = 2) +
geom_abline(data = stan.partialpool.03.values,
aes(intercept = a_county, slope = b_floor), color = "green") +
geom_point(alpha = 0.3,
size = 3,
position = position_jitter(width = 0.1, height = 0.2)) +
facet_wrap(~ county) +
scale_color_manual(values = c("No Pooling", "Pooling")) +
ggpubr::theme_pubr()

# plot the model results with posterior samples
stan.partialpool.03.samples <-
stan.partial.pooling.model.03 %>% # use this model fit
spread_draws(a_county[county], b_floor[county], b_uranium, sigma_county[], sigma, n = 20) # extract 20 samples in tidy format

ggplot() +
geom_abline(data = stan.pooling.samples,
aes(intercept = alpha, slope = beta),
color = "black",
alpha = 0.3) +
geom_abline(data = stan.nopooling.samples,
aes(intercept = alpha, slope = beta),
color = "red",
alpha = 0.3) +
geom_abline(data = stan.partialpool.samples,
aes(intercept = alpha, slope = beta),
color = "blue",
alpha = 0.3) +
geom_abline(data = stan.partialpool.02.samples,
aes(intercept = alpha, slope = b_floor),
color = "purple",
lty = 2,
alpha = 0.3) +
geom_abline(data = stan.partialpool.03.samples,
aes(intercept = a_county, slope = b_floor),
color = "green",
alpha = 0.3) +
aes(x = floor, y = log_radon, group = county),
alpha = 0.3,
size = 3,
position = position_jitter(width = 0.1, height = 0.2)) +
facet_wrap(~ county) +
ggpubr::theme_pubr()

# Model Comparison

Now we can compute the widely applicable information criterion (WAIC) to determine which has the best fit to the data:

library(rethinking)

compare(stan.pooling.fit, rstan.stan.no.pooling.fit,
rstan.stan.partial.pooling.fit, rstan.stan.partial.pooling.fit.02,
rstan.stan.partial.pooling.fit.03)
##                                       WAIC       SE      dWAIC
## rstan.stan.partial.pooling.fit.02 2054.615 57.54352   0.000000
## rstan.stan.partial.pooling.fit.03 2057.063 59.36121   2.447496
## rstan.stan.partial.pooling.fit    2071.685 55.83546  17.069744
## rstan.stan.no.pooling.fit         2118.367 56.46214  63.751704
## stan.pooling.fit                  2179.970 49.92133 125.354529
##                                         dSE     pWAIC       weight
## rstan.stan.partial.pooling.fit.02        NA 25.974319 7.726052e-01
## rstan.stan.partial.pooling.fit.03  6.046955 43.977915 2.272430e-01
## rstan.stan.partial.pooling.fit    10.460089 48.051214 1.518133e-04
## rstan.stan.no.pooling.fit         17.726772 83.018908 1.107774e-14
## stan.pooling.fit                  23.387237  3.842781 4.651227e-28
plot(compare(stan.pooling.fit, rstan.stan.no.pooling.fit,
rstan.stan.partial.pooling.fit, rstan.stan.partial.pooling.fit.02,
rstan.stan.partial.pooling.fit.03))

These results show that the stan.partial.pooling.model.02, which is the varying intercept with a group-level predictor, has the lowest WAIC. In other words, this model provides the best out-of-sample predictions. The varying intercept, varying slope model is very closely behind with and increase in WAIC of only 3.2 points. The plot shows that although the varying intercept model with a group-level predictor has the best out-of-sample performance (open dots), it does not have the best in-sample performance (solid dots). The no-pooling model has the best in-sample performance, but this model also has the largest degree of over fitting.

Previous