# model packages
library(MASS) # have to read this in before tidyverse
library(lme4)
library(glmmTMB) # ok if you don't have this - just comment it out
# diagnostics and model info
library(DHARMa)
library(MuMIn)
library(ggeffects)
library(lmtest)
library(broom)
# general usage
library(tidyverse)
library(here)
library(naniar)
library(skimr)
library(GGally)
library(flextable)
<- read_csv(here("data", "salamanders.csv")) salamanders
Data info from glmmTMB
:
site
: name of a location where repeated samples were taken
mined
: factor indicating whether the site was affected by mountain top removal coal mining
cover
: amount of cover objects in the stream (scaled)
sample
: repeated sample
DOP
: Days since precipitation (scaled)
Wtemp
: water temperature (scaled)
DOY
: day of year (scaled)
spp
: abbreviated species name, possibly also life stage
count
: number of salamanders observed
Explore the data set:
histogram of counts:
ggplot(salamanders, aes(x = count)) +
geom_histogram(bins = 17)
Missingness:
gg_miss_var(salamanders) # nothing missing!
Skim:
skim(salamanders)
Name | salamanders |
Number of rows | 644 |
Number of columns | 9 |
_______________________ | |
Column type frequency: | |
character | 3 |
numeric | 6 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
site | 0 | 1 | 3 | 5 | 0 | 23 | 0 |
mined | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
spp | 0 | 1 | 2 | 5 | 0 | 7 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
cover | 0 | 1 | 0.00 | 0.98 | -1.59 | -0.70 | -0.05 | 0.60 | 1.89 | ▅▇▇▅▃ |
sample | 0 | 1 | 2.50 | 1.12 | 1.00 | 1.75 | 2.50 | 3.25 | 4.00 | ▇▇▁▇▇ |
DOP | 0 | 1 | 0.00 | 0.98 | -2.20 | -0.30 | -0.09 | 0.00 | 3.17 | ▂▇▅▂▁ |
Wtemp | 0 | 1 | 0.00 | 0.98 | -3.02 | -0.61 | 0.04 | 0.60 | 2.21 | ▁▃▇▆▂ |
DOY | 0 | 1 | 0.00 | 1.00 | -2.71 | -0.57 | -0.06 | 0.97 | 1.46 | ▁▅▇▅▇ |
count | 0 | 1 | 1.32 | 2.64 | 0.00 | 0.00 | 0.00 | 2.00 | 36.00 | ▇▁▁▁▁ |
Pairs plot:
%>%
salamanders select(!site) %>%
ggpairs()
Question: How does salamander count vary with mined status, species, and stream cover?
Build models
# linear model, we know this is wrong
<- lm(count ~ cover + mined + spp, data = salamanders)
salmod1
# generalized linear model with Poisson distribution
<- glm(count ~ cover + mined + spp, data = salamanders, family = "poisson")
salmod2 <- glm(count ~ cover + mined + spp, data = salamanders, family = "poisson")
salmod2.a
# generalized linear model with negative binomial distribution
<- glm.nb(count ~ cover + mined + spp, data = salamanders)
salmod3 <- glmmTMB(count ~ cover + mined + spp, data = salamanders, family = "nbinom2")
salmod3.a
# generalized linear model with Poisson distribution and random effect of site
<- glmer(count ~ cover + mined + spp + (1|site), data = salamanders, family = "poisson")
salmod4 <- glmmTMB(count ~ cover + mined + spp + (1|site), data = salamanders, family = "poisson")
salmod4.a
# generalized linear model with negative binomial distribution and random effect of site
<- glmer.nb(count ~ cover + mined + spp + (1|site), data = salamanders)
salmod5 <- glmmTMB(count ~ cover + mined + spp + (1|site), data = salamanders, family = "nbinom2") salmod5.a
Look at residuals
# check diagnostics
plot(simulateResiduals(salmod1)) # bad
plot(simulateResiduals(salmod2)) # bad
plot(simulateResiduals(salmod3)) # ok?
plot(simulateResiduals(salmod4)) # bad
plot(simulateResiduals(salmod5)) # ok?
Which distribution to use?
::model.sel(salmod1, salmod2, salmod3, salmod4, salmod5) MuMIn
Model selection table
(Intrc) cover mined spp family class init.theta link random
salmod5 1.409 -0.1042 + + NB(0.9424,l) glmerMod s
salmod3 1.465 -0.1418 + + NB(0.8333,l) negbin 0.833 log
salmod4 1.385 -0.1205 + + p(l) glmerMod s
salmod2 1.486 -0.2309 + + p(l) glm
salmod1 3.447 -0.3426 + + g(i) lm
df logLik AICc delta weight
salmod5 11 -825.964 1674.3 0.00 1
salmod3 10 -836.039 1692.4 18.08 0
salmod4 10 -972.141 1964.6 290.28 0
salmod2 9 -1001.066 2020.4 346.07 0
salmod1 10 -1457.026 2934.4 1260.05 0
Abbreviations:
family: g(i) = 'gaussian(identity)',
NB(0.8333,l) = 'Negative Binomial(0.8333,log)',
NB(0.9424,l) = 'Negative Binomial(0.9424,log)', p(l) = 'poisson(log)'
Models ranked by AICc(x)
Random terms:
s: 1 | site
Model summary
# model object
salmod3
Call: glm.nb(formula = count ~ cover + mined + spp, data = salamanders,
init.theta = 0.8332590711, link = log)
Coefficients:
(Intercept) cover minedyes sppDF sppDM sppEC-A
1.4647 -0.1418 -2.1802 -0.4806 -0.3982 -1.4829
sppEC-L sppGP sppPR
-0.2155 -0.8030 -2.0647
Degrees of Freedom: 643 Total (i.e. Null); 635 Residual
Null Deviance: 885.6
Residual Deviance: 549.2 AIC: 1692
# summary
summary(salmod3)
Call:
glm.nb(formula = count ~ cover + mined + spp, data = salamanders,
init.theta = 0.8332590711, link = log)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.7966 -0.8511 -0.6062 0.0812 3.4038
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.46467 0.15733 9.309 < 2e-16 ***
cover -0.14180 0.07715 -1.838 0.066062 .
minedyes -2.18017 0.17186 -12.686 < 2e-16 ***
sppDF -0.48061 0.21439 -2.242 0.024977 *
sppDM -0.39819 0.21264 -1.873 0.061123 .
sppEC-A -1.48291 0.24619 -6.023 1.71e-09 ***
sppEC-L -0.21550 0.20909 -1.031 0.302713
sppGP -0.80299 0.22230 -3.612 0.000304 ***
sppPR -2.06472 0.27822 -7.421 1.16e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Negative Binomial(0.8333) family taken to be 1)
Null deviance: 885.60 on 643 degrees of freedom
Residual deviance: 549.16 on 635 degrees of freedom
AIC: 1692.1
Number of Fisher Scoring iterations: 1
Theta: 0.833
Std. Err.: 0.108
2 x log-likelihood: -1672.078
# confidence intervals
confint(salmod3)
2.5 % 97.5 %
(Intercept) 1.1619452 1.78160994
cover -0.2951066 0.01123174
minedyes -2.5140207 -1.85565670
sppDF -0.9135120 -0.04834588
sppDM -0.8214220 0.02428889
sppEC-A -1.9694420 -1.00469459
sppEC-L -0.6235862 0.19129668
sppGP -1.2339240 -0.37486002
sppPR -2.6278646 -1.52359253
# adjusted R2
r.squaredGLMM(salmod3)
R2m R2c
delta 0.4341864 0.4341864
lognormal 0.5806880 0.5806880
trigamma 0.2400616 0.2400616
# model object in table
%>%
salmod3 as_flextable()
Estimate | Standard Error | z value | Pr(>|z|) | ||
---|---|---|---|---|---|
(Intercept) | 1.465 | 0.157 | 9.309 | 0.0000 | *** |
cover | -0.142 | 0.077 | -1.838 | 0.0661 | . |
minedyes | -2.180 | 0.172 | -12.686 | 0.0000 | *** |
sppDF | -0.481 | 0.214 | -2.242 | 0.0250 | * |
sppDM | -0.398 | 0.213 | -1.873 | 0.0611 | . |
sppEC-A | -1.483 | 0.246 | -6.023 | 0.0000 | *** |
sppEC-L | -0.215 | 0.209 | -1.031 | 0.3027 |
|
sppGP | -0.803 | 0.222 | -3.612 | 0.0003 | *** |
sppPR | -2.065 | 0.278 | -7.421 | 0.0000 | *** |
Signif. codes: 0 <= '***' < 0.001 < '**' < 0.01 < '*' < 0.05 | |||||
| |||||
(Dispersion parameter for Negative Binomial(0.8333) family taken to be 1) | |||||
Null deviance: 885.6 on 643 degrees of freedom | |||||
Residual deviance: 549.2 on 635 degrees of freedom |
Visualizing:
<- ggpredict(salmod3, terms = c("cover", "mined", "spp")) %>%
predictions rename(mined = group,
spp = facet)
ggplot(salamanders, aes(x = cover, y = count, fill = mined)) +
geom_point(aes(color = mined), alpha = 0.5) +
facet_wrap(~spp, scales = "free_y") +
geom_line(data = predictions, aes(x = x, y = predicted, color = mined)) +
geom_ribbon(data = predictions, aes(x = x, y = predicted, ymin = conf.low, ymax = conf.high, fill = mined), alpha = 0.2) +
scale_fill_manual(values = c("yes" = "blue", "no" = "orange")) +
scale_color_manual(values = c("yes" = "blue", "no" = "orange")) +
theme_bw() +
facet_wrap(~spp, scales = "free_y") +
labs(fill = "Mined", color = "Mined")
Citation
@online{bui2023,
author = {Bui, An},
title = {Coding Workshop: {Week} 10},
date = {2023-06-07},
url = {https://an-bui.github.io/ES-193DS-W23/workshop/workshop-10_2023-06-07.html},
langid = {en}
}