######  Ordinations in R, by Dave Jenkins & Leo Ohyama, using data collected by Leo! ##################

# Here we use Redundancy Analysis (RDA) and Non-metric Multi-dimensional Scaling (NMDS) 
# on the same data set (diplo2017.csv). RDA in this case is the same as Principal Components Analysis PCA)
# because no gradient (constraint) is specified - we seek only to summarize pattern.
# Data list ant species (columns) per site (rows).
# We rely on the awesome vegan package for both types of analyses, with the goal to answer:
# 1. Do ant communities differ between the two habitat types?
# 2. Are certain species more associated with specific habitat types?
# 3. How dissimilar are habitats in terms of beta diversity? (for NMDS)

##### Data import and set up
## Import the "diplo2017.csv". 
dim(diplo2017) # 16 species x 32 sites matrix, containing species abundances
# The first 16 rows are sites in pine flatwoods and the next 16 are in high pine sandhill 
habitats<-diplo2017[,1:2] # to make a dataframe of just habitat and site columns
df<-diplo2017[,3:17] #to make a dataframe of just species abundances
colSums(df) # Get total counts per species
# Notice that there are four species only found once
# These are called singletons and are often removed from the data because they are over-
# influential in the analysis

library(vegan)

##### Squint at data: correlations mean ordinations can simplify
# where correlations may be L-shaped graphs (where one goes high, the other goes low)

pairs(df, lower.panel=NULL)

##### Evaluate multivariate homogeneity of variance: an RDA is a linear method and not legit without it
## first calculate Bray-Curtis distances between samples (requires the numbers-only data)
dis <- vegdist(df)
groups <- habitats[[1]] # where the [[1]] returns a vector (not a df) of the first column 
### Calculate multivariate dispersions
homog <- betadisper(dis, groups)
homog

## Perform anova of group homogeneities
anova(homog)

## And a permutation test for F between groups
permutest(homog, pairwise = TRUE)

## And a Tukey's HSD for multi-group differences, with a plot
(homog.HSD <- TukeyHSD(homog))
plot(homog.HSD) # if 95% CI overlaps 0 there is not a sig dif in variance between groups

##### Run an RDA: here use rda in vegan. RDA can also work with an expected regression-like gradient, where
# you have both predictors and response variables. Here we use only response variables - ants - so it is 
# the same as PCA.

antspca <- rda(df)
summary(antspca) # You should report the eigenvalues and % explained for RDA plots
screeplot(antspca, bstick=T)# How many axes are enough to capture the story? Look for a break in the plot below the broken stick line

# Now plot the RDA
biplot(antspca) # black are sites, red are species vectors
ordihull(antspca, group=habitats$Habitat) # simple "hull: around points of groups

### Now the questions:
# 1. Do ant communities differ between the two habitat types?
# 2. Are certain species more associated with specific habitat types?
# Note that we cannot really address Question 3 yet

##### Run NMDS on the df data, again in vegan, and needing no assumptions tests
# We run the nmds analysis using the metaMDS() function

antsmds=metaMDS(df, k=2, trymax=1000, autotransform = F, distance = 'bray')
# Where:
#    k indicates the number of dimensions we choose to ordinate the data
#    trymax= indicates the number of runs the analysis is done with, it will
#       stop when the analysis successfully runs. Note the analysis ran till the 20th run
#    autotransform applies a transformation to the data if it's set to true
#    distance specifies which measure of dissimilarity we use to ordinate 
#    Bray-Curtis dissimilarity index is used in this case because we deal with abundance-based data
#       This is a strong measure of beta diversity

antsmds$stress # you must report the stress value of the result (how well the analysis was able to ordinate the points in multivariate space)
stressplot(antsmds) # a good visual of stress; ideally a linear step-wise increase
#acceptable stress is seen between 0.10 to 0.20 although this is debatable

# Try increasing k to reduce stress, which then calls for more axes in plots
# Is the plot more linear?

#For the sake of plotting let's go with the first analysis at k=2

plot(antsmds) #not really informative yet but can see what will be used next

habs=c(rep("Flatwoods",16),rep("Sandhills",16)) #create an object specifying habitat types
#used in the sampling. Note these correspond to each row, the first 16 rows are sites in 
#pine flatwoods and the next 16 were in high pine sandhill 
colors=c(rep("dodgerblue3",16),rep("grey",16)) #make another object specifying colors 

ordiplot(antsmds,type="n", main = "NMDS Between Habitats")
#code above gives us a blank nmds plot with a title, to see what next plot is using
ordispider(antsmds, groups = habs) 
# Plots lines from each habitat's centroid to its sites
points(antsmds, cex = 1, pch = c(rep(15,16), rep(16,16)), col = c(rep("dodgerblue3",16), rep("grey",16)))
# Greater distance in the plot indicates greater Bray-Curtis dissimilarity (beta diversity)

# For species position in this space:
orditorp(antsmds,display="species",col="black",air=0.6, cex=0.5)

### Now the questions again:
# 1. Do ant communities differ between the two habitat types?
# 2. Are certain species more associated with specific habitat types?
# 3. How dissimilar are habitats in terms of beta diversity?

# Let's get a littly fancy with the graphics

for(i in unique(habs)){
  ordihull(antsmds$point[grep(i,habs),],draw="polygon",
           groups=habs[habs==i], col=colors[grep(i,habs)], alpha = 0.3,label=F)}

orditorp(antsmds,display="species",col="black",air=0.6, cex=0.5)
legend("topright", legend = c("Flatwoods", "Sandhill"), col = c("dodgerblue3", "grey"), 
       pch = 15, bty = "l", cex = 1, text.col = c("black", "black"), xpd = T)

#the polygon colors may be slightly overboard because the separation of sites and habitats
#based on beta diversity is pretty clear
#note the ordispider function is better for analyses run at 2+ dimensions or k > 2