In this tutorial, I will guide you through the steps to perform statistical inference on a compartmental model, using the readsdr package as a bridging tool to streamline the process. Specifically, I will use the Susceptible-Exposed-Infectious-Recovered (SEIR) model as an example. This tutorial assumes familiarity with System Dynamics (SD) software (e.g., Stella) and concepts related to ordinary differential equation (ODE) models and statistics.
library(cmdstanr)
library(dplyr)
library(ggplot2)
library(gt)
library(posterior)
library(readr)
library(readsdr) # Development version
library(stringr)
library(tidyr)
Previously, the SEIR model was built in Stella. You can download it here. Let's read the SEIR model into R with the read_xmile function and inspect its initial conditions (sd_stocks) and constant values (sd_constants).
model_path <- "https://jandraor.github.io/models/SEIR_standard.stmx"
mdl <- read_xmile(model_path)
stocks_df <- sd_stocks(mdl)
stocks_df
## name init_value
## 1 S 9999
## 2 E 0
## 3 I 1
## 4 R 0
## 5 C 0
constants_df <- sd_constants(mdl) |>
mutate(value = format(value, scientific = FALSE))
constants_df
## name value
## 1 par_beta 1.00
## 2 N 10000.00
## 3 par_sigma 0.50
## 4 par_gamma 0.50
## 5 par_rho 0.75
## 6 R0 0.00
## 7 I0 1.00
Briefly, a Data Generating Process can be thought of as the combination of a process component and a measurement component. The process component corresponds to the ODE model, whereas the measurement component accounts for the discrepancies between model predictions and actual data. In this case, the measurement component is formulated in terms of a Negative Binomial distribution. To generate synthetic data, we employ the function sd_measurements. Notice that the measurement component is defined using the Stan language. Furthermore, the measured quantity corresponds to the change in the stock C (cumulative incidence) between consecutive measurement times, i.e., the daily incidence.
meas_mdl <- list("y ~ neg_binomial_2(net_flow(C), 10)")
set.seed(789)
dat_df <- sd_measurements(n_meas = 1,
ds_inputs = mdl$deSolve_components,
meas_model = meas_mdl,
start_time = 0,
stop_time = 70,
timestep = 1/16,
integ_method = "rk4")
ggplot(dat_df, aes(time, measurement)) +
geom_point(colour = "#7B92DC") +
labs(x = "Day", y = "Incidence [People per day]") +
theme_classic()
We now move from data generation to inference. The measurement model is declared again, but this time the concentration parameter phi is left as an unknown to be estimated, and we specify priors for the estimated parameters with sd_prior.
meas_mdl <- list("y ~ neg_binomial_2(net_flow(C), phi)")
est_params <- list(sd_prior(par_name = "par_beta",
dist = "lognormal",
dist_pars = c(0, 1)),
sd_prior("par_rho", "beta", c(2, 2)),
# This parameter only affects initial conditions
sd_prior("I0", "lognormal", c(0, 1), type = "init"))
stan_file <- sd_Bayes(filepath = model_path,
meas_mdl = meas_mdl,
estimated_params = est_params)
cat(stan_file)
## // Code generated by the R package readsdr v0.3.0.9000
## // See more info at github https://github.com/jandraor/readsdr
## functions {
## vector X_model(real time, vector y, array[] real params) {
## vector[5] dydt;
## real S_to_E;
## real E_to_I;
## real I_to_R;
## real C_in;
## S_to_E = params[1]*y[1]*y[3]/10000;
## E_to_I = 0.5*y[2];
## I_to_R = 0.5*y[3];
## C_in = params[2]*E_to_I;
## dydt[1] = -S_to_E;
## dydt[2] = S_to_E-E_to_I;
## dydt[3] = E_to_I-I_to_R;
## dydt[4] = I_to_R;
## dydt[5] = C_in;
## return dydt;
## }
## }
## data {
## int<lower = 1> n_obs;
## array[n_obs] int y;
## array[n_obs] real ts;
## }
## parameters {
## real<lower = 0> par_beta;
## real<lower = 0, upper = 1> par_rho;
## real<lower = 0> I0;
## real<lower = 0> inv_phi;
## }
## transformed parameters{
## array[n_obs] vector[5] x; // Output from the ODE solver
## array[2] real params;
## vector[5] x0; // init values
## array[n_obs] real delta_x_1;
## real phi;
## phi = 1 / inv_phi;
## x0[1] = (10000) - I0 - (0); // S
## x0[2] = 0; // E
## x0[3] = I0; // I
## x0[4] = 0; // R
## x0[5] = 0; // C
## params[1] = par_beta;
## params[2] = par_rho;
## x = ode_rk45(X_model, x0, 0, ts, params);
## delta_x_1[1] = x[1, 5] - x0[5] + 1e-5;
## for (i in 1:n_obs-1) {
## delta_x_1[i + 1] = x[i + 1, 5] - x[i, 5] + 1e-5;
## }
## }
## model {
## par_beta ~ lognormal(0, 1);
## par_rho ~ beta(2, 2);
## I0 ~ lognormal(0, 1);
## inv_phi ~ exponential(5);
## y ~ neg_binomial_2(delta_x_1, phi);
## }
## generated quantities {
## real log_lik;
## array[n_obs] int sim_y;
## log_lik = neg_binomial_2_lpmf(y | delta_x_1, phi);
## sim_y = neg_binomial_2_rng(delta_x_1, phi);
## }
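Notice that sd_Bayes automatically treats the Negative Binomial concentration parameter as an unknown: it samples inv_phi with an exponential(5) prior and transforms it to phi = 1 / inv_phi. A quick prior simulation (my own check, not part of the generated code) shows which values of phi this default favours.
set.seed(123)
inv_phi_draws <- rexp(1e4, rate = 5) # default prior placed on inv_phi by sd_Bayes
phi_draws <- 1 / inv_phi_draws       # implied prior on phi
quantile(phi_draws, c(0.025, 0.5, 0.975))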
stan_path <- "./Stan_files/SEIR.stan"
write_file(stan_file, file = stan_path)
stan_d <- list(n_obs = nrow(dat_df),
y = dat_df$measurement,
ts = 1:max(dat_df$time))
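As a quick sanity check (my own addition), the list must match the generated data block: y holds integer counts and ts supplies exactly one solver time per observation.
# Hypothetical check that stan_d is consistent with the data block above
stopifnot(length(stan_d$ts) == stan_d$n_obs,
all(stan_d$y == round(stan_d$y)))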
mod <- cmdstan_model(stan_path)
fit <- mod$sample(data = stan_d,
chains = 4,
parallel_chains = 4,
iter_warmup = 1000,
iter_sampling = 1000,
refresh = 100,
save_warmup = FALSE)
## Running MCMC with 4 parallel chains...
##
## Chain 1 Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1 Iteration: 100 / 2000 [ 5%] (Warmup)
## Chain 1 Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1 Iteration: 300 / 2000 [ 15%] (Warmup)
## Chain 1 Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1 Iteration: 500 / 2000 [ 25%] (Warmup)
## Chain 1 Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1 Iteration: 700 / 2000 [ 35%] (Warmup)
## Chain 1 Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2 Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4 Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3 Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1 Iteration: 900 / 2000 [ 45%] (Warmup)
## Chain 1 Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1 Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1 Iteration: 1100 / 2000 [ 55%] (Sampling)
## Chain 1 Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1 Iteration: 1300 / 2000 [ 65%] (Sampling)
## Chain 3 Iteration: 100 / 2000 [ 5%] (Warmup)
## Chain 4 Iteration: 100 / 2000 [ 5%] (Warmup)
## Chain 2 Iteration: 100 / 2000 [ 5%] (Warmup)
## Chain 1 Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3 Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1 Iteration: 1500 / 2000 [ 75%] (Sampling)
## Chain 4 Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2 Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3 Iteration: 300 / 2000 [ 15%] (Warmup)
## Chain 4 Iteration: 300 / 2000 [ 15%] (Warmup)
## Chain 1 Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2 Iteration: 300 / 2000 [ 15%] (Warmup)
## Chain 3 Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4 Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1 Iteration: 1700 / 2000 [ 85%] (Sampling)
## Chain 2 Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3 Iteration: 500 / 2000 [ 25%] (Warmup)
## Chain 4 Iteration: 500 / 2000 [ 25%] (Warmup)
## Chain 1 Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2 Iteration: 500 / 2000 [ 25%] (Warmup)
## Chain 3 Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4 Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1 Iteration: 1900 / 2000 [ 95%] (Sampling)
## Chain 2 Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3 Iteration: 700 / 2000 [ 35%] (Warmup)
## Chain 4 Iteration: 700 / 2000 [ 35%] (Warmup)
## Chain 2 Iteration: 700 / 2000 [ 35%] (Warmup)
## Chain 3 Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1 Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1 finished in 14.7 seconds.
## Chain 4 Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3 Iteration: 900 / 2000 [ 45%] (Warmup)
## Chain 2 Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4 Iteration: 900 / 2000 [ 45%] (Warmup)
## Chain 2 Iteration: 900 / 2000 [ 45%] (Warmup)
## Chain 3 Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3 Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4 Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4 Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2 Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2 Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3 Iteration: 1100 / 2000 [ 55%] (Sampling)
## Chain 4 Iteration: 1100 / 2000 [ 55%] (Sampling)
## Chain 2 Iteration: 1100 / 2000 [ 55%] (Sampling)
## Chain 3 Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4 Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2 Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3 Iteration: 1300 / 2000 [ 65%] (Sampling)
## Chain 4 Iteration: 1300 / 2000 [ 65%] (Sampling)
## Chain 2 Iteration: 1300 / 2000 [ 65%] (Sampling)
## Chain 3 Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4 Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2 Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3 Iteration: 1500 / 2000 [ 75%] (Sampling)
## Chain 4 Iteration: 1500 / 2000 [ 75%] (Sampling)
## Chain 2 Iteration: 1500 / 2000 [ 75%] (Sampling)
## Chain 2 Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3 Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4 Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3 Iteration: 1700 / 2000 [ 85%] (Sampling)
## Chain 4 Iteration: 1700 / 2000 [ 85%] (Sampling)
## Chain 2 Iteration: 1700 / 2000 [ 85%] (Sampling)
## Chain 2 Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3 Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4 Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2 Iteration: 1900 / 2000 [ 95%] (Sampling)
## Chain 4 Iteration: 1900 / 2000 [ 95%] (Sampling)
## Chain 3 Iteration: 1900 / 2000 [ 95%] (Sampling)
## Chain 2 Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2 finished in 13.6 seconds.
## Chain 4 Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4 finished in 13.7 seconds.
## Chain 3 Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3 finished in 13.7 seconds.
##
## All 4 chains finished successfully.
## Mean chain execution time: 13.9 seconds.
## Total execution time: 21.5 seconds.
Before proceeding with any further analysis, we must verify that the results from the Hamiltonian Monte Carlo (HMC) sampler can be trusted. To that end, Stan provides a range of checks on the validity of the sampling process. Running the code below should produce the message 'Processing complete, no problems detected'. Any other result suggests reformulating the model or adjusting the algorithm's parameters.
fit$cmdstan_diagnose()
## Processing csv files: /var/folders/51/j15clly97bl0z25f9q1s9hdm0000gn/T/RtmpLxEEdc/SEIR-202404221731-1-35840e.csv, /var/folders/51/j15clly97bl0z25f9q1s9hdm0000gn/T/RtmpLxEEdc/SEIR-202404221731-2-35840e.csv, /var/folders/51/j15clly97bl0z25f9q1s9hdm0000gn/T/RtmpLxEEdc/SEIR-202404221731-3-35840e.csv, /var/folders/51/j15clly97bl0z25f9q1s9hdm0000gn/T/RtmpLxEEdc/SEIR-202404221731-4-35840e.csv
##
## Checking sampler transitions treedepth.
## Treedepth satisfactory for all transitions.
##
## Checking sampler transitions for divergences.
## No divergent transitions found.
##
## Checking E-BFMI - sampler transitions HMC potential energy.
## E-BFMI satisfactory.
##
## Effective sample size satisfactory.
##
## Split R-hat values satisfactory all parameters.
##
## Processing complete, no problems detected.
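Alternatively, cmdstanr exposes the same checks programmatically through the diagnostic_summary() method, which is convenient if you prefer to automate the decision rather than read the text report. A minimal sketch (my own addition):
diag_summary <- fit$diagnostic_summary() # divergences, tree depth and E-BFMI per chain
stopifnot(sum(diag_summary$num_divergent) == 0,
sum(diag_summary$num_max_treedepth) == 0)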
posterior_df <- as_draws_df(fit$draws())
summarise_predicted_observations <- function(posterior_df, var_name) {
y_tilde <- readsdr::extract_timeseries_var(var_name, posterior_df)
y_tilde |> group_by(time) |>
summarise(q2.5 = quantile(value, 0.025),
q25 = quantile(value, 0.25),
mean = mean(value),
q75 = quantile(value, 0.75),
q97.5 = quantile(value, 0.975))
}
summary_y <- summarise_predicted_observations(posterior_df, "sim_y")
ggplot(summary_y, aes(time, mean)) +
geom_line(colour = "grey60") +
geom_ribbon(aes(ymin = q25, ymax = q75), fill = "grey90", alpha = 0.5) +
geom_ribbon(aes(ymin = q2.5, ymax = q97.5), fill = "grey90", alpha = 0.25) +
geom_point(data = dat_df, aes(time, measurement), colour = "#7B92DC") +
labs(y = "Incidence", x = "Day") +
theme_classic() +
theme(axis.line = element_line(colour = "grey70"),
axis.text = element_text(colour = "grey70"),
axis.ticks = element_line(colour = "grey70"))
pars_df <- posterior_df[, c("par_beta", "I0", "par_rho", "inv_phi")] |>
mutate(par_phi = 1 / inv_phi) |>
dplyr::select(-inv_phi) |>
mutate(iter = row_number()) |>
pivot_longer(-iter, names_to = "par")
In the histograms below, vertical lines denote the true parameter values.
actual_vals <- data.frame(par = c("I0", "par_beta", "par_rho", "par_phi"),
value = c(1, 1, 0.75, 10))
ggplot(pars_df, aes(value)) +
geom_histogram(colour = "white", fill = "grey60", alpha = 0.75) +
facet_wrap(vars(par), scales = "free") +
geom_vline(data = actual_vals, aes(xintercept = value),
colour = "#7B92DC", linewidth = 1.5) +
theme_classic()
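Besides the visual check, a compact numerical summary of the estimated parameters can be pulled straight from the fit object (my own addition; phi is available because it is declared in the transformed parameters block).
fit$summary(variables = c("par_beta", "par_rho", "I0", "phi"))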
The readsdr package contains a data set of daily influenza cases detected by the US Public Health Service in Maryland during the 1918 influenza pandemic, from 22 September to 30 November 1918 (Frost and Sydenstricker 1919).
Maryland <- Maryland |>
mutate(time = row_number())
ggplot(Maryland, aes(time, Cumberland)) +
geom_col(fill = "#E35A43") +
labs(x = "Day", y = "Cases") +
theme_bw()
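For orientation (a quick look of my own, not in the original text), the only two columns used in the rest of this section are the day index we just created and the Cumberland case counts.
Maryland |>
dplyr::select(time, Cumberland) |>
head()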
This example replicates the work presented in Andrade and Duggan (2021). However, here I use the Negative Binomial distribution instead of the Poisson.
meas_mdl <- list("y ~ neg_binomial_2(net_flow(C), phi)")
est_params <- list(sd_prior(par_name = "par_beta",
dist = "lognormal",
dist_pars = c(0, 1)),
sd_prior("par_rho", "beta", c(2, 2)),
sd_prior("I0", "lognormal", c(0, 1), type = "init"))
stan_file <- sd_Bayes(filepath = model_path,
meas_mdl = meas_mdl,
estimated_params = est_params,
# Override values in the .stmx file
const_list = list(N = 5234,
# Initial number of recovered individuals
R0 = round(5234 * 0.3)))
cat(stan_file)
## // Code generated by the R package readsdr v0.3.0.9000
## // See more info at github https://github.com/jandraor/readsdr
## functions {
## vector X_model(real time, vector y, array[] real params) {
## vector[5] dydt;
## real S_to_E;
## real E_to_I;
## real I_to_R;
## real C_in;
## S_to_E = params[1]*y[1]*y[3]/5234;
## E_to_I = 0.5*y[2];
## I_to_R = 0.5*y[3];
## C_in = params[2]*E_to_I;
## dydt[1] = -S_to_E;
## dydt[2] = S_to_E-E_to_I;
## dydt[3] = E_to_I-I_to_R;
## dydt[4] = I_to_R;
## dydt[5] = C_in;
## return dydt;
## }
## }
## data {
## int<lower = 1> n_obs;
## array[n_obs] int y;
## array[n_obs] real ts;
## }
## parameters {
## real<lower = 0> par_beta;
## real<lower = 0, upper = 1> par_rho;
## real<lower = 0> I0;
## real<lower = 0> inv_phi;
## }
## transformed parameters{
## array[n_obs] vector[5] x; // Output from the ODE solver
## array[2] real params;
## vector[5] x0; // init values
## array[n_obs] real delta_x_1;
## real phi;
## phi = 1 / inv_phi;
## x0[1] = (5234) - I0 - (1570); // S
## x0[2] = 0; // E
## x0[3] = I0; // I
## x0[4] = 1570; // R
## x0[5] = 0; // C
## params[1] = par_beta;
## params[2] = par_rho;
## x = ode_rk45(X_model, x0, 0, ts, params);
## delta_x_1[1] = x[1, 5] - x0[5] + 1e-5;
## for (i in 1:n_obs-1) {
## delta_x_1[i + 1] = x[i + 1, 5] - x[i, 5] + 1e-5;
## }
## }
## model {
## par_beta ~ lognormal(0, 1);
## par_rho ~ beta(2, 2);
## I0 ~ lognormal(0, 1);
## inv_phi ~ exponential(5);
## y ~ neg_binomial_2(delta_x_1, phi);
## }
## generated quantities {
## real log_lik;
## array[n_obs] int sim_y;
## log_lik = neg_binomial_2_lpmf(y | delta_x_1, phi);
## sim_y = neg_binomial_2_rng(delta_x_1, phi);
## }
stan_path <- "./Stan_files/SEIR_Cumberland.stan"
write_file(stan_file, file = stan_path)
stan_d <- list(n_obs = nrow(Maryland),
y = Maryland$Cumberland,
ts = 1:max(Maryland$time))
mod <- cmdstan_model(stan_path)
fit <- mod$sample(data = stan_d,
chains = 4,
parallel_chains = 4,
iter_warmup = 1000,
iter_sampling = 1000,
refresh = 100,
save_warmup = FALSE)
Don’t forget the diagnostics.
fit$cmdstan_diagnose()
## Processing csv files: /var/folders/51/j15clly97bl0z25f9q1s9hdm0000gn/T/RtmpLxEEdc/SEIR_Cumberland-202404221732-1-6b3946.csv, /var/folders/51/j15clly97bl0z25f9q1s9hdm0000gn/T/RtmpLxEEdc/SEIR_Cumberland-202404221732-2-6b3946.csv, /var/folders/51/j15clly97bl0z25f9q1s9hdm0000gn/T/RtmpLxEEdc/SEIR_Cumberland-202404221732-3-6b3946.csv, /var/folders/51/j15clly97bl0z25f9q1s9hdm0000gn/T/RtmpLxEEdc/SEIR_Cumberland-202404221732-4-6b3946.csv
##
## Checking sampler transitions treedepth.
## Treedepth satisfactory for all transitions.
##
## Checking sampler transitions for divergences.
## No divergent transitions found.
##
## Checking E-BFMI - sampler transitions HMC potential energy.
## E-BFMI satisfactory.
##
## Effective sample size satisfactory.
##
## Split R-hat values satisfactory all parameters.
##
## Processing complete, no problems detected.
posterior_df <- as_draws_df(fit$draws())
summary_y <- summarise_predicted_observations(posterior_df, "sim_y")
plot_predictive_checks <- function(summary_y, Maryland) {
ggplot(summary_y, aes(time, mean)) +
geom_line(colour = "grey60") +
geom_ribbon(aes(ymin = q25, ymax = q75), fill = "grey90", alpha = 0.5) +
geom_ribbon(aes(ymin = q2.5, ymax = q97.5), fill = "grey90", alpha = 0.25) +
geom_point(data = Maryland, aes(time, Cumberland), colour = "#E35A43") +
labs(y = "Incidence", x = "Day") +
theme_classic() +
theme(axis.line = element_line(colour = "grey70"),
axis.text = element_text(colour = "grey70"),
axis.ticks = element_line(colour = "grey70"))
}
plot_predictive_checks(summary_y, Maryland)
We can estimate the 95% uncertainty interval of \(\Re_0\) using the posterior distribution.
# R0 = beta / gamma
gamma_val <- constants_df |> filter(name == "par_gamma") |>
pull(value) |> as.numeric()
R0_expected_value <- posterior_df$par_beta / gamma_val
quantile(R0_expected_value, c(0.025, 0.975)) |> round(2)
## 2.5% 97.5%
## 2.23 2.42
However, the estimation of the reproduction number is sensitive to the chosen mean generation time. To examine this sensitivity, we employ the parameterisation suggested by Andrade and Duggan (2023). In particular, we use the alternative SEI2R parameterisation, with one stage in the latent compartment and two stages in the infectious class. In this parameterisation, the mean generation time (\(\tau\)) is an exogenous parameter.
Recall that the mean generation time in the SEIR model can be expressed as a function (Eq (4.1)) of the number of stages in the infectious compartment (\(j\)), the latent period (\(\sigma^{-1}\)), and the infectious period (\(\gamma^{-1}\)). According to Eq (4.1), the mean generation time implicitly assumed in the model fitted in the previous section is \(2 + \frac{1+1}{2\times1}\times 2 = 4\) days.
\[\begin{equation} \tau = \sigma ^{-1} + \frac{j +1}{2j} \gamma^{-1} \tag{4.1} \end{equation}\]
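To make the arithmetic explicit, here is the same calculation in R (my own check of Eq. (4.1)), using the values from the model fitted above: a 2-day latent period (par_sigma = 0.5), a 2-day infectious period (par_gamma = 0.5), and a single infectious stage (j = 1).
sigma_inv <- 2 # latent period = 1 / par_sigma
gamma_inv <- 2 # infectious period = 1 / par_gamma
j <- 1         # infectious stages in the standard SEIR
tau <- sigma_inv + ((j + 1) / (2 * j)) * gamma_inv
tau # 4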
Then, I demonstrate that changes in the parameterisation or in the infectious period distribution do not lead to variations in the estimated reproduction number, provided that the mean generation time is held at 4. In this example, I also show how to change the value of fixed parameters without having to create a new Stan file. Notice the last line of the call to the sd_Bayes function, where I declare (data_params = "par_tau") that par_tau will be configured in Stan's data block.
model_path <- "https://jandraor.github.io/models/SE1I2R_alt.stmx"
meas_mdl <- list("y ~ neg_binomial_2(net_flow(C), phi)")
est_params <- list(sd_prior(par_name = "par_inv_R0",
dist = "beta",
dist_pars = c(2, 2)),
sd_prior("par_rho", "beta", c(2, 2)),
sd_prior("I0", "lognormal", c(0, 1), type = "init"))
stan_file <- sd_Bayes(filepath = model_path,
meas_mdl = meas_mdl,
estimated_params = est_params,
# Override values in the .stmx file
const_list = list(N = 5234,
# Initial number of recovered individuals
xi = 0.3),
data_params = "par_tau")
cat(stan_file)
## // Code generated by the R package readsdr v0.3.0.9000
## // See more info at github https://github.com/jandraor/readsdr
## functions {
## vector X_model(real time, vector y, array[] real params) {
## vector[6] dydt;
## real E1_to_I1;
## real C_in;
## real aux_j;
## real aux_tau;
## real var_beta;
## real var_gamma;
## real I2_to_R;
## real S_to_E;
## real I1_to_I2;
## E1_to_I1 = 0.5*y[2];
## C_in = params[2]*E1_to_I1;
## aux_j = (2+1)/(2.0*2);
## aux_tau = params[3]-(1/0.5);
## var_beta = (1/params[1])*(aux_j/aux_tau);
## var_gamma = var_beta*params[1];
## I2_to_R = 2*var_gamma*y[6];
## S_to_E = var_beta*y[1]*(y[3]+y[6])/5234;
## I1_to_I2 = 2*var_gamma*y[3];
## dydt[1] = -S_to_E;
## dydt[2] = S_to_E-E1_to_I1;
## dydt[3] = E1_to_I1-I1_to_I2;
## dydt[4] = I2_to_R;
## dydt[5] = C_in;
## dydt[6] = I1_to_I2-I2_to_R;
## return dydt;
## }
## }
## data {
## int<lower = 1> n_obs;
## array[n_obs] int y;
## array[n_obs] real ts;
## real par_tau;
## }
## parameters {
## real<lower = 0, upper = 1> par_inv_R0;
## real<lower = 0, upper = 1> par_rho;
## real<lower = 0> I0;
## real<lower = 0> inv_phi;
## }
## transformed parameters{
## array[n_obs] vector[6] x; // Output from the ODE solver
## array[3] real params;
## vector[6] x0; // init values
## array[n_obs] real delta_x_1;
## real phi;
## phi = 1 / inv_phi;
## x0[1] = (5234) * (1 - (0.3)) - I0; // S
## x0[2] = 0; // E1
## x0[3] = I0; // I1
## x0[4] = 1570.2; // R
## x0[5] = I0; // C
## x0[6] = 0; // I2
## params[1] = par_inv_R0;
## params[2] = par_rho;
## params[3] = par_tau;
## x = ode_rk45(X_model, x0, 0, ts, params);
## delta_x_1[1] = x[1, 5] - x0[5] + 1e-5;
## for (i in 1:n_obs-1) {
## delta_x_1[i + 1] = x[i + 1, 5] - x[i, 5] + 1e-5;
## }
## }
## model {
## par_inv_R0 ~ beta(2, 2);
## par_rho ~ beta(2, 2);
## I0 ~ lognormal(0, 1);
## inv_phi ~ exponential(5);
## y ~ neg_binomial_2(delta_x_1, phi);
## }
## generated quantities {
## real log_lik;
## array[n_obs] int sim_y;
## log_lik = neg_binomial_2_lpmf(y | delta_x_1, phi);
## sim_y = neg_binomial_2_rng(delta_x_1, phi);
## }
stan_path <- "./Stan_files/SEI2R_Cumberland.stan"
write_file(stan_file, file = stan_path)
mod <- cmdstan_model(stan_path)
stan_d <- list(n_obs = nrow(Maryland),
y = Maryland$Cumberland,
ts = 1:max(Maryland$time),
par_tau = 4)
fit <- mod$sample(data = stan_d,
chains = 4,
parallel_chains = 4,
iter_warmup = 1000,
iter_sampling = 1000,
refresh = 100,
save_warmup = FALSE)
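As with the previous fits, it is worth confirming that the sampler behaved well before interpreting the posterior; a quick programmatic check (my own addition):
fit$diagnostic_summary() # expect no divergences and satisfactory E-BFMI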
posterior_df <- as_draws_df(fit$draws())
summary_y <- summarise_predicted_observations(posterior_df, "sim_y")
plot_predictive_checks(summary_y, Maryland)
Although I changed the infectious period distribution (from exponential to Erlang), the 95% credible interval is nearly identical to that calculated in section 4.2.1. As stated above, the only quantity that matters is the mean generation time.
R0_expected_value <- 1/posterior_df$par_inv_R0
quantile(R0_expected_value, c(0.025, 0.975)) |> round(2)
## 2.5% 97.5%
## 2.22 2.41
However, if I run the sampler with a different mean generation time (2.85 days), a value obtained from the literature (Wallinga and Lipsitch 2007; Hirotsu et al. 2004),…
stan_d <- list(n_obs = nrow(Maryland),
y = Maryland$Cumberland,
ts = 1:max(Maryland$time),
par_tau = 2.85)
fit <- mod$sample(data = stan_d,
chains = 4,
parallel_chains = 4,
iter_warmup = 1000,
iter_sampling = 1000,
refresh = 100,
save_warmup = FALSE)
I obtain a similar fit…
posterior_df <- as_draws_df(fit$draws())
summary_y <- summarise_predicted_observations(posterior_df, "sim_y")
plot_predictive_checks(summary_y, Maryland)
But lower \(\Re_0\) estimates (first row in the table below).
pars_df <- posterior_df |> select(par_rho, I0, par_inv_R0, phi) |>
mutate(R0 = 1/ par_inv_R0, .before = everything()) |>
select(-par_inv_R0)
# Credible intervals
cr_I <- apply(pars_df, 2, function(par) quantile(par, c(0.025, 0.975))) |>
t() |> as.data.frame()
par_names <- c("\u211c", "\u03C1", "I", "\u03D5")
cr_I |> mutate(Parameter = par_names, .before = everything()) |>
gt() |> fmt_number(columns = 2:3, decimals = 2) |>
text_transform(
locations = cells_body(columns = c(Parameter), rows = c(1, 3)),
fn = function(x) str_glue("{x}<sub>0</sub>"))
| Parameter | 2.5% | 97.5% |
|---|---|---|
| ℜ0 | 1.97 | 2.10 |
| ρ | 0.82 | 0.98 |
| I0 | 1.66 | 3.89 |
| ϕ | 1.91 | 4.28 |
Now, imagine that you only had access to the first 30 measurements. How would that change the estimate, and would the model forecast the remaining data points accurately?
first_30 <- Maryland |> filter(time < 31)
To generate forecasts, we set the argument forecast to TRUE in the sd_Bayes function. This argument instructs Stan to simulate n_fcst days after the last measurement. The user specifies the value of n_fcst in the data block. In this case, we will forecast 61 days.
model_path <- "https://jandraor.github.io/models/SE1I2R_alt.stmx"
meas_mdl <- list("y ~ neg_binomial_2(net_flow(C), phi)")
est_params <- list(sd_prior(par_name = "par_inv_R0",
dist = "beta",
dist_pars = c(2, 2)),
sd_prior("par_rho", "beta", c(2, 2)),
sd_prior("I0", "lognormal", c(0, 1), type = "init"))
stan_file <- sd_Bayes(filepath = model_path,
meas_mdl = meas_mdl,
estimated_params = est_params,
# Override values in the .stmx file
const_list = list(N = 5234,
# Initial number of recovered individuals
xi = 0.3,
par_tau = 2.85),
forecast = TRUE)
cat(stan_file)
## // Code generated by the R package readsdr v0.3.0.9000
## // See more info at github https://github.com/jandraor/readsdr
## functions {
## vector X_model(real time, vector y, array[] real params) {
## vector[6] dydt;
## real E1_to_I1;
## real C_in;
## real aux_j;
## real aux_tau;
## real var_beta;
## real var_gamma;
## real I2_to_R;
## real S_to_E;
## real I1_to_I2;
## E1_to_I1 = 0.5*y[2];
## C_in = params[2]*E1_to_I1;
## aux_j = (2+1)/(2.0*2);
## aux_tau = 2.85-(1/0.5);
## var_beta = (1/params[1])*(aux_j/aux_tau);
## var_gamma = var_beta*params[1];
## I2_to_R = 2*var_gamma*y[6];
## S_to_E = var_beta*y[1]*(y[3]+y[6])/5234;
## I1_to_I2 = 2*var_gamma*y[3];
## dydt[1] = -S_to_E;
## dydt[2] = S_to_E-E1_to_I1;
## dydt[3] = E1_to_I1-I1_to_I2;
## dydt[4] = I2_to_R;
## dydt[5] = C_in;
## dydt[6] = I1_to_I2-I2_to_R;
## return dydt;
## }
## }
## data {
## int<lower = 1> n_obs;
## array[n_obs] int y;
## array[n_obs] real ts;
## int<lower = 1> n_fcst;
## }
## parameters {
## real<lower = 0, upper = 1> par_inv_R0;
## real<lower = 0, upper = 1> par_rho;
## real<lower = 0> I0;
## real<lower = 0> inv_phi;
## }
## transformed parameters{
## array[n_obs] vector[6] x; // Output from the ODE solver
## array[2] real params;
## vector[6] x0; // init values
## array[n_obs] real delta_x_1;
## real phi;
## phi = 1 / inv_phi;
## x0[1] = (5234) * (1 - (0.3)) - I0; // S
## x0[2] = 0; // E1
## x0[3] = I0; // I1
## x0[4] = 1570.2; // R
## x0[5] = I0; // C
## x0[6] = 0; // I2
## params[1] = par_inv_R0;
## params[2] = par_rho;
## x = ode_rk45(X_model, x0, 0, ts, params);
## delta_x_1[1] = x[1, 5] - x0[5] + 1e-5;
## for (i in 1:n_obs-1) {
## delta_x_1[i + 1] = x[i + 1, 5] - x[i, 5] + 1e-5;
## }
## }
## model {
## par_inv_R0 ~ beta(2, 2);
## par_rho ~ beta(2, 2);
## I0 ~ lognormal(0, 1);
## inv_phi ~ exponential(5);
## y ~ neg_binomial_2(delta_x_1, phi);
## }
## generated quantities {
## real log_lik;
## array[n_obs] int sim_y;
## array[n_fcst] int fcst_y;
## array[n_fcst] vector[6] x_fcst; // Forecast
## array[n_fcst] real t_fcst;
## vector[6] x_fcst0; // Forecast init values
## array[n_fcst] real delta_x_fcst_1;
## log_lik = neg_binomial_2_lpmf(y | delta_x_1, phi);
## // Simulate forecast
## x_fcst0 = x[n_obs, :];
## t_fcst = linspaced_array(n_fcst, 1, n_fcst);
## x_fcst = ode_rk45(X_model, x_fcst0, 0, t_fcst, params);
## delta_x_fcst_1[1] = x_fcst[1, 5] - x_fcst0[5] + 1e-5;
## for (i in 1:n_fcst-1) {
## delta_x_fcst_1[i + 1] = x_fcst[i + 1, 5] - x_fcst[i, 5] + 1e-5;
## }
## sim_y = neg_binomial_2_rng(delta_x_1, phi);
## fcst_y = neg_binomial_2_rng(delta_x_fcst_1, phi);
## }
stan_path <- "./Stan_files/SEI2R_forecast.stan"
write_file(stan_file, file = stan_path)
We run the inference process…
mod <- cmdstan_model(stan_path)
stan_d <- list(n_obs = nrow(first_30),
y = first_30$Cumberland,
ts = 1:max(first_30$time),
n_fcst = 61)
fit <- mod$sample(data = stan_d,
chains = 4,
parallel_chains = 4,
iter_warmup = 1000,
iter_sampling = 1000,
refresh = 100,
save_warmup = FALSE)
To understand the code, let's define two key variables. sim_y represents simulated measurements up to and including the last available data point; in other words, it's a hindcast (grey ribbon). Conversely, fcst_y represents simulated measurements after the last available data point, making it a forecast (blue ribbon).
posterior_df <- as_draws_df(fit$draws())
summary_y_tilde <- summarise_predicted_observations(posterior_df, "sim_y")
summary_y_tilde_fcst <- summarise_predicted_observations(posterior_df,
"fcst_y") |>
mutate(time = time + 30) |>
bind_rows(summary_y_tilde |> tail(1))
ggplot(summary_y_tilde, aes(time, mean)) +
geom_line(colour = "grey60") +
geom_ribbon(aes(ymin = q25, ymax = q75), fill = "grey90", alpha = 0.5) +
geom_ribbon(aes(ymin = q2.5, ymax = q97.5), fill = "grey90", alpha = 0.25) +
geom_line(data = summary_y_tilde_fcst, colour = "#7B92DC") +
geom_ribbon(data = summary_y_tilde_fcst,
aes(ymin = q25, ymax = q75), fill = "#7B92DC", alpha = 0.2) +
geom_ribbon(data = summary_y_tilde_fcst,
aes(x = time, ymin = q2.5, ymax = q97.5), fill = "#7B92DC", alpha = 0.1) +
geom_point(data = Maryland, aes(time, Cumberland), colour = "#E35A43") +
labs(y = "Incidence", x = "Day") +
theme_classic() +
theme(axis.line = element_line(colour = "grey70"),
axis.text = element_text(colour = "grey70"),
axis.ticks = element_line(colour = "grey70"))
While fitting only the first 30 data points captures the transmission dynamics less accurately than using all the data, the estimates of \(\Re_0\) remain fairly similar.
R0_expected_value <- 1/posterior_df$par_inv_R0
quantile(R0_expected_value, c(0.025, 0.975)) |> round(2)
## 2.5% 97.5%
## 1.93 2.23
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.4.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Dublin
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tidyr_1.3.0 stringr_1.5.0 readsdr_0.3.0.9000 readr_2.1.4
## [5] posterior_1.4.1 gt_0.9.0 ggplot2_3.4.4 dplyr_1.1.2
## [9] cmdstanr_0.6.1
##
## loaded via a namespace (and not attached):
## [1] tensorA_0.36.2 sass_0.4.7 utf8_1.2.4
## [4] generics_0.1.3 xml2_1.3.5 jpeg_0.1-10
## [7] stringi_1.7.12 hms_1.1.3 digest_0.6.33
## [10] magrittr_2.0.3 evaluate_0.21 grid_4.3.1
## [13] bookdown_0.35 fastmap_1.1.1 jsonlite_1.8.7
## [16] processx_3.8.2 backports_1.4.1 deSolve_1.36
## [19] ps_1.7.5 purrr_1.0.2 fansi_1.0.6
## [22] scales_1.3.0 jquerylib_0.1.4 abind_1.4-5
## [25] cli_3.6.1 rlang_1.1.2 munsell_0.5.0
## [28] withr_2.5.2 cachem_1.0.8 yaml_2.3.7
## [31] tools_4.3.1 tzdb_0.4.0 checkmate_2.2.0
## [34] colorspace_2.1-0 curl_5.0.2 vctrs_0.6.5
## [37] R6_2.5.1 lifecycle_1.0.4 pkgconfig_2.0.3
## [40] pillar_1.9.0 bslib_0.5.1 gtable_0.3.4
## [43] data.table_1.14.8 glue_1.6.2 highr_0.10
## [46] xfun_0.40 tibble_3.2.1 tidyselect_1.2.0
## [49] rstudioapi_0.15.0 knitr_1.43 farver_2.1.1
## [52] htmltools_0.5.6 labeling_0.4.3 rmarkdown_2.24
## [55] compiler_4.3.1 distributional_0.3.2