library(alien)
library(rstan)
#> Loading required package: StanHeaders
#>
#> rstan version 2.32.6 (Stan version 2.32.2)
#> For execution on a local, multicore CPU with excess RAM we recommend calling
#> options(mc.cores = parallel::detectCores()).
#> To avoid recompilation of unchanged Stan programs, we recommend calling
#> rstan_options(auto_write = TRUE)
#> For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
#> change `threads_per_chain` option:
#> rstan_options(threads_per_chain = 1)
#> Do not specify '-march=native' in 'LOCAL_CPPFLAGS' or a Makevars file
Here we show an example fitting of a Bayesian heirarchical model to data, using the proportion of undiscovered alien species to the total number of undiscovered species (natives and aliens) as the probability of detecting a new alien species. The model is described in full in Belmaker et al (2009), with some modifications described in Buba et al (2024).
We will demonstrate that using the code provided in the alien package:
model_file <- system.file("stan/modified_belmaker_et_al_2009_model.stan", package = "alien")
readLines(model_file)
#> [1] "data{"
#> [2] " int <lower = 1> N; // number of rows in the data"
#> [3] " int <lower = 1> native_total; // assumed size of native pool"
#> [4] " array[N] int <lower = 0> dI; // observed number of yearly records invasive"
#> [5] " array[N] int <lower = 0> dN; // observed number of yearly records native"
#> [6] " vector <lower = 0>[N] t; // time from start"
#> [7] " real b0_mu; // prior for betas"
#> [8] " real b1_mu; // prior for betas"
#> [9] " real b0_sd; // prior for betas"
#> [10] " real b1_sd; // prior for betas"
#> [11] "}"
#> [12] ""
#> [13] "transformed data {"
#> [14] " array[N] int <lower = 0> unrecorded_N;"
#> [15] " array[N] int <lower = 0> yearly_detections;"
#> [16] " array[N] int <lower = 0> recorded_I; // cumulative recorded invasives"
#> [17] " array[N] int <lower = 0> recorded_N; // cumulative recorded natives"
#> [18] ""
#> [19] " recorded_I = cumulative_sum(dI);"
#> [20] " recorded_N = cumulative_sum(dN);"
#> [21] ""
#> [22] " for (i in 1:N){"
#> [23] " unrecorded_N[i] = native_total - recorded_N[i];"
#> [24] " yearly_detections[i] = dI[i] + dN[i];"
#> [25] " }"
#> [26] ""
#> [27] "}"
#> [28] ""
#> [29] "parameters {"
#> [30] " real b0;"
#> [31] " real b1;"
#> [32] "}"
#> [33] ""
#> [34] "transformed parameters {"
#> [35] " vector<lower = 0>[N] mu_t;"
#> [36] " vector<lower = 0>[N] unrecorded_I;"
#> [37] ""
#> [38] " mu_t = exp(b0 + b1 .* t);"
#> [39] ""
#> [40] " for (i in 1:N){"
#> [41] " unrecorded_I[i] = cumulative_sum(mu_t)[i] - recorded_I[i];"
#> [42] " }"
#> [43] ""
#> [44] "}"
#> [45] ""
#> [46] "model{"
#> [47] ""
#> [48] " dI ~ beta_binomial(yearly_detections, unrecorded_I, unrecorded_N);"
#> [49] ""
#> [50] " //priors"
#> [51] " b0 ~ normal(b0_mu, b0_sd);"
#> [52] " b1 ~ normal(b1_mu , b1_sd);"
#> [53] "}"
#> [54] ""
#> [55] "generated quantities {"
#> [56] " array[N] int y = beta_binomial_rng(yearly_detections, unrecorded_I, unrecorded_N);"
#> [57] "}"
We will the medfish
data included in the alien
package:
data(medfish)
head(medfish)
#> year time natives aliens
#> 1 1927 0 52 5
#> 2 1930 3 0 1
#> 3 1931 4 12 0
#> 4 1934 7 18 0
#> 5 1935 8 17 1
#> 6 1937 10 1 1
The data has several columns: 1. year
- The year of the
observations. 2. time
- how much time has passed from the
first observation in the data until this point. 3. natives
- how many natives were newly described in this year. 4.
alien
- how many aliens were newly described in this
year.
We will use the rstan package to fit the model to our data, first we read the model script to define a stan model:
The next step is to create a data list to be used by rstan:
readLines(model_file)[1:11]
#> [1] "data{"
#> [2] " int <lower = 1> N; // number of rows in the data"
#> [3] " int <lower = 1> native_total; // assumed size of native pool"
#> [4] " array[N] int <lower = 0> dI; // observed number of yearly records invasive"
#> [5] " array[N] int <lower = 0> dN; // observed number of yearly records native"
#> [6] " vector <lower = 0>[N] t; // time from start"
#> [7] " real b0_mu; // prior for betas"
#> [8] " real b1_mu; // prior for betas"
#> [9] " real b0_sd; // prior for betas"
#> [10] " real b1_sd; // prior for betas"
#> [11] "}"
We can set it up using data from medfish. The only thing we need to assume is the native pool size (See Belmaker et al 2009 for more detail)
data_for_stan <- list(
N = nrow(medfish),
native_total = 600,
dI = medfish$aliens,
dN = medfish$natives,
t = medfish$t
)
An important aspect of Bayesian modelling is the ability to use prior for the model parameters. The model uses priors for both the mean and standard error of the parameters \(\beta_0\) and \(\beta_1\):
readLines(model_file)[7:10]
#> [1] " real b0_mu; // prior for betas" " real b1_mu; // prior for betas"
#> [3] " real b0_sd; // prior for betas" " real b1_sd; // prior for betas"
We can start with a naive model of alien introduction to get an idea of how rapid is the alien introduction rate:
naive_model <- stats::glm(aliens ~ time, data = medfish, family = "poisson")
stats::summary.glm(naive_model)
#>
#> Call:
#> stats::glm(formula = aliens ~ time, family = "poisson", data = medfish)
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -0.319591 0.280405 -1.140 0.254391
#> time 0.014949 0.004338 3.446 0.000568 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for poisson family taken to be 1)
#>
#> Null deviance: 123.07 on 59 degrees of freedom
#> Residual deviance: 110.33 on 58 degrees of freedom
#> AIC: 220.23
#>
#> Number of Fisher Scoring iterations: 6
We will use these estimates as priors for our model.
coef_table <- summary(naive_model)$coefficients
priors <- c(
b0_mu = coef_table[1,1],
b0_sd = coef_table[1,2],
b1_mu = coef_table[2,1],
b1_sd = coef_table[2,2]
)
priors
#> b0_mu b0_sd b1_mu b1_sd
#> -0.319590861 0.280404647 0.014949156 0.004337772
We join the priors with the rest of the data:
We can then call the sampling
function to fit the
model:
After the sampling, we can extract the model prediction for the
number of undiscovered alien species in each year -
unrecorded_I
:
stan_summary <- summary(model_output, pars = "unrecorded_I")$summary |>
tibble::as_tibble() |>
tibble::add_column(year = medfish$year)
ggplot2::ggplot(medfish)+
ggplot2::aes(x = year) +
ggplot2::geom_point(ggplot2::aes(y = cumsum(natives)), shape = 21, size = 2, fill = "#377EB8") +
ggplot2::geom_point(ggplot2::aes(y = cumsum(aliens)), shape = 21, size = 2, fill = "#E41A1C") +
ggplot2::geom_ribbon(data = stan_summary, ggplot2::aes(ymin = `2.5%`, ymax = `97.5%`), alpha = 0.2) +
ggplot2::geom_line(data = stan_summary, ggplot2::aes(y = mean)) +
ggplot2::labs(x = "Year", y = "Cumulative number\nof discovered species")
NOTE: This is just an example of the usage of such a model - additional tests should be performed to examine the validity of the results.