### practice problem, Methods I F 2022, D Jenkins library(tidyverse) library(gamlss) library(MASS) library(performance) library(bbmle) library(ggplot2) ##### SO2 problem data <- read_delim("https://sciences.ucf.edu/biology/d4lab/wp-content/uploads/sites/23/2019/12/sulphur.dioxide.txt", delim = "\t", escape_double = FALSE, trim_ws = TRUE) View(data) #1 Evaluate potential predictors of Pollution and justify your choices to be used in models below. pairs(data, panel=panel.smooth) # Industry & Popn seem to push up high, then weird patterns with weather data # wind & rain should reduce, wet days? Try interactions of those 3 # but Industry and Population are very closely related - vifs? If too high, then use Pop - cars and stacks #2 compute & select potential predictors based on collinearity addmod <- lm(Pollution ~ Industry + Population + Temp + Wet.days + Wind + Rain, data=data) # full model # check for collinearity check_model(addmod) # yup - gonna use Popn not Industry - more inclusive(?) for cars, etc. # and assumptions look a bit off, also when scaled; addmods <- lm(Pollution ~ scale(Population) + scale(Temp) + scale(Wet.days) + scale(Wind) + scale(Rain), data=data) # full model check_model(addmods) # could use a neg binom? m1<-fitDist(residuals(fullmods)) m1$fits m1$failed # now work with distributions and trying a more complex weather schtick based on pairs plots m2a <- gamlss(Pollution ~ scale(Population) + scale(Temp) + scale(Wet.days) + scale(Wind)*scale(Rain), data=data, family=SN2, control = gamlss.control(n.cyc = 200)) # don't know what RG does and not sure what SN2 does... plot(m2a) # not bad summary(m2a) m2b <- gamlss(Pollution ~ scale(Population) + scale(Temp) + scale(Wet.days) + scale(Wind)*scale(Rain), data=data, family=NBI) # trying neg binom - Pollution is whole numbers >0, like counts plot(m2b) # maybe a bit better or same and neg binom is familiar - going with that one m2c <- glm.nb(Pollution ~ scale(Population) + scale(Temp) + scale(Wet.days) + scale(Wind)*scale(Rain), data=data) check_model(m2c) # I'll stick with gamlss with NBI check_overdispersion(m2c) # now try some alternatives NOTE: these are example models - you could run better ones m3 <- gamlss(Pollution ~ scale(Temp) + scale(Wet.days)*scale(Wind)*scale(Rain), data=data, family=NBI) # weather only m4 <- gamlss(Pollution ~ Population, data=data, family=NBI) # humans only m5 <- gamlss(Pollution ~ scale(Population) + scale(Temp), data=data, family=NBI) # combo 1 m6 <- gamlss(Pollution ~ scale(Population) + scale(Wet.days)*scale(Wind)*scale(Rain), data=data, family=NBI) # complex weather interaction m7 <- gamlss(Pollution ~ 1, data=data, family=NBI) # null #3 Compute and select among models to ID a most-plausible version. How well does it work? Which variable(s) are most important? AICctab(m2b,m3,m4,m5,m6,m7, base=T,delta=T,weights=T) # m2b looks solid summary(m2b) r2(m2b) ### Interpretation: # Popn has strong + effect, as it should; more exhaust, etc. # warmer temps strongly reduce SO2; photo-oxidation? # and wind & rain interact, where wind dilutes but rain increases SO2 #4 Graph model, where Pollution is a function of strongest predictor(s) and color is a second predictor preds <- predict(m2b) ggplot(data=data) + geom_point(aes(Temp, exp(preds), col=Rain)) # I think this is more clear - related to coeffs? # or ggplot(data=data) + geom_point(aes(Wind, exp(preds), col=Rain)) #5 Evaluate residuals of model to ascertain it's legit plot(m2b) plot(exp(preds),data$Pollution)