### practice problem, Methods I F 2022, D Jenkins

library(tidyverse)
library(gamlss)
library(MASS)
library(performance)
library(bbmle)
library(ggplot2)

##### SO2 problem
data <- read_delim("https://sciences.ucf.edu/biology/d4lab/wp-content/uploads/sites/23/2019/12/sulphur.dioxide.txt", delim = "\t", escape_double = FALSE, trim_ws = TRUE)
View(data)

#1 Evaluate potential predictors of Pollution and justify your choices to be used in models below.
pairs(data, panel=panel.smooth)
# Industry & Popn seem to push up high, then weird patterns with weather data
# wind & rain should reduce, wet days? Try interactions of those 3
# but Industry and Population are very closely related - vifs? If too high, then use Pop - cars and stacks

#2 compute & select potential predictors based on collinearity
addmod <- lm(Pollution ~ Industry + Population + Temp + Wet.days + Wind + Rain, data=data) # full model
# check for collinearity
check_model(addmod) # yup - gonna use Popn not Industry - more inclusive(?) for cars, etc.
# and assumptions look a bit off, also when scaled;
addmods <- lm(Pollution ~ scale(Population) + scale(Temp) + scale(Wet.days) + scale(Wind) + scale(Rain), data=data) # full model
check_model(addmods) # could use a neg binom?

m1<-fitDist(residuals(fullmods))
m1$fits
m1$failed

# now work with distributions and trying a more complex weather schtick based on pairs plots
m2a <- gamlss(Pollution ~ scale(Population) + scale(Temp) + scale(Wet.days) + scale(Wind)*scale(Rain), data=data, family=SN2, control = gamlss.control(n.cyc = 200)) # don't know what RG does and not sure what SN2 does...
plot(m2a) # not bad
summary(m2a)

m2b <- gamlss(Pollution ~ scale(Population) + scale(Temp) + scale(Wet.days) + scale(Wind)*scale(Rain), data=data, family=NBI) # trying neg binom - Pollution is whole numbers >0, like counts
plot(m2b) # maybe a bit better or same and neg binom is familiar - going with that one

m2c <- glm.nb(Pollution ~ scale(Population) + scale(Temp) + scale(Wet.days) + scale(Wind)*scale(Rain), data=data)
check_model(m2c) # I'll stick with gamlss with NBI 
check_overdispersion(m2c)

# now try some alternatives NOTE: these are example models - you could run better ones
m3 <- gamlss(Pollution ~ scale(Temp) + scale(Wet.days)*scale(Wind)*scale(Rain), data=data, family=NBI) # weather only
m4 <- gamlss(Pollution ~ Population, data=data, family=NBI) # humans only
m5 <- gamlss(Pollution ~ scale(Population) + scale(Temp), data=data, family=NBI) # combo 1
m6 <- gamlss(Pollution ~ scale(Population) + scale(Wet.days)*scale(Wind)*scale(Rain), data=data, family=NBI) # complex weather interaction
m7 <- gamlss(Pollution ~ 1, data=data, family=NBI) # null

#3 Compute and select among models to ID a most-plausible version. How well does it work? Which variable(s) are most important?
AICctab(m2b,m3,m4,m5,m6,m7, base=T,delta=T,weights=T) # m2b looks solid
summary(m2b)
r2(m2b)

### Interpretation:
# Popn has strong + effect, as it should; more exhaust, etc.
# warmer temps strongly reduce SO2; photo-oxidation?
# and wind & rain interact, where wind dilutes but rain increases SO2

#4 Graph model, where Pollution is a function of strongest predictor(s) and color is a second predictor
preds <- predict(m2b)
ggplot(data=data) +
  geom_point(aes(Temp, exp(preds), col=Rain)) # I think this is more clear - related to coeffs? 
# or
ggplot(data=data) +
  geom_point(aes(Wind, exp(preds), col=Rain))

#5 Evaluate residuals of model to ascertain it's legit
plot(m2b) 
plot(exp(preds),data$Pollution)