#################################################
# MULTIVARIATE STATISTICS #
#################################################
#################################################
# PART 1. DATA GROUPING #
#################################################
#################################################
# E1. CLUSTERING #
#################################################
###################################################################################
# a. Clustering (of sites according to species biomass) #
###################################################################################
# PREPARING THE DATASET FOR MULTIVARIATE ANALYSIS #################################
# Clean workspace and memory
rm(list=ls())
# Set the working directory:
setwd("C://multivar")
# Load the dataset
biomass <- read.table("fish_biomass.txt", h = T, sep = "\t")
# Cast the dataset so that species become columns
# (this fills in zeroes where a species was absent from a transect)
require(reshape) # (load necessary package)
biomass.wide <- cast(biomass, site + transect ~ species, sum)
# Melt back to long format, now carrying the zeroes along
biomass.melted <- melt(biomass.wide)
# Reset the odd row names produced by melt():
row.names(biomass.melted) <- 1:nrow(biomass.melted)
# Renaming "value" to "biomass"
names(biomass.melted)[3] <- "biomass"
# Repositioning biomass at the end:
biomass.melted <- subset(biomass.melted,select=c(1:2,4,3))
# Check that all is good
View(biomass.melted)
# Note the abundance of zeroes
# Calculate means per site (i.e. across transects)
mean.biomass <- aggregate(biomass ~ site + species, FUN = mean, data = biomass.melted)
# The dataset is ready for multivariate analysis; begin by casting the site means
# computed above to obtain a community biomass matrix
# (the starting point of all multivariate analyses).
# cast() picks up the last column ("biomass") as the value to spread across species
meanbiomass.wide <- cast(mean.biomass, site ~ species)
# turn the matrix into a dataframe
meanbiomass.wide.df <- data.frame(meanbiomass.wide[,-1], row.names=meanbiomass.wide[,1])
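# Quick sanity check (optional): sites should now be rows, species columns
dim(meanbiomass.wide.df)
head(row.names(meanbiomass.wide.df))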
# START OF WORKFLOW DESCRIBED IN CLASS####################################################
require(vegan)
# (1) Choose transformation and association coefficient
# (a) Do the raw data come from samples of equal size?
# YES: the data come from transects of fixed dimensions (30 x 4 m),
# so all samples are of equal size. Moreover, in our case we are using
# transect data aggregated into site means.
# Therefore >> NO NEED TO STANDARDISE
# (b) Are we concerned about common or rare species?
# We do not want to focus specifically on common species, as rare species
# may convey key ecological functions. However, we don't want to go to the
# other extreme of focusing only on rare species. We therefore apply a mild transformation.
# (c) Do we intend to continue asking more questions of the data with ordination
# or redundancy analysis?
# Yes, we do.
# (2) Therefore we need to transform the data using the Hellinger transformation
# Hellinger-transform the data
bio.hel <- decostand(meanbiomass.wide.df, "hellinger")
# (3) Compute the Hellinger distance (triangular) matrix
d<-vegdist(bio.hel, "euc")
d
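# Sanity check (optional sketch): the Euclidean distance on Hellinger-transformed
# data IS the Hellinger distance, so base dist() must agree with vegdist():
max(abs(d - dist(bio.hel))) # should be ~0, up to floating-point error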
# (4) RUN THE CLUSTERING ANALYSIS: As presented in class,
# the best choices are hierarchical clustering (average linkage)
# and non-hierarchical K-means clustering
# AVERAGE LINKAGE (RECOMMENDED AMONG HIERARCHICAL OPTIONS)
h.clust.av <- hclust(d, method = "average")
plot(h.clust.av)
# WARD
clust.ward <- hclust(d, method = "ward.D2")
plot(clust.ward)
# To compare and understand the philosophies of each linkage method:
# Run the clustering with other types of hierarchical clustering:
# SINGLE LINKAGE
h.clust.sim <- hclust(d, method = "single")
plot(h.clust.sim)
# COMPLETE LINKAGE
h.clust.com <- hclust(d, method = "complete")
plot(h.clust.com)
# (5) CHOOSING AND PLOTTING THE BEST DENDROGRAM AND INTERPRETING IT
# Load necessary packages
require(cluster)
require(stats)
# 5.1. How to decide which dendrogram retains the closest relationship
# to the distance matrix?
# Through Cophenetic correlation
# Compute the cophenetic correlation for each method
# Single
simple.coph <- cophenetic(h.clust.sim)
cor(d, simple.coph)
# Complete
complete.coph <- cophenetic(h.clust.com)
cor(d, complete.coph)
# average
average.coph <- cophenetic(h.clust.av)
cor(d, average.coph)
# WARD
ward.coph <- cophenetic(clust.ward)
cor(d, ward.coph)
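# The same comparison in one go (optional sketch):
sapply(list(single = h.clust.sim, complete = h.clust.com,
            average = h.clust.av, ward = clust.ward),
       function(hc) cor(d, cophenetic(hc)))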
# which one retains the closest relationship to the distance matrix?
# The average linkage method
# (5.2) Looking for interpretable clusters in the best dendrogram
# Visually, you could say that there are at least 2 groups
# and a single site (i.e. 3 groups)
# You could just add this info to the dendrogram
# (re-plot the average-linkage dendrogram first, so the boxes are drawn
# on it rather than on the last plot produced above):
plot(h.clust.av)
rect.hclust(h.clust.av, 3)
# Find out which site belongs to which cluster:
cl<-cutree(h.clust.av,3)
cl
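# Count how many sites fall into each cluster:
table(cl)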
# sdrop is out of the red box, but that's no problem,
# we will fix the aesthetics of the plot later.
# But what problem do you see with setting the number of clusters visually?
# How to judge and decide the optimal number of clusters
# i.e. at what level should the dendrogram be cut?
# Method: Average silhouette widths
# First create an empty vector in which to write the average silhouette widths;
# its length equals the number of rows (sites) in the matrix "bio.hel"
asw <- numeric(nrow(bio.hel))
# Retrieve the asw values into the created vector
for (k in 2:(nrow(bio.hel) - 1)) {
  sil <- silhouette(cutree(h.clust.av, k = k), d)
  asw[k] <- summary(sil)$avg.width
}
# Best (largest) silhouette width
k.best <- which.max(asw)
# Produce plot
plot(1:nrow(bio.hel), asw, type="h", main="silhouette-optimal number of clusters",
xlab="k (number of groups)", ylab="Average silhouette width")
axis(1, k.best, paste("optimum", k.best, sep="\n"),col="red",
font=2, col.axis="red")
points(k.best, max(asw), pch=16, col="red", cex=1.5)
# This says that you have 2 interpretable clusters
# Optimal number of clusters according to Mantel Statistic
grpdist<-function(X)
{
require(cluster)
gr<-as.data.frame(as.factor(X))
distgr<-daisy(gr,"gower")
distgr
}
# Run it based on the average linkage clustering
kt <-data.frame(k=1:nrow(bio.hel), r=0)
for(i in 2:(nrow(bio.hel)-1)) {
gr <-cutree(h.clust.av, i)
distgr<-grpdist(gr)
mt<-cor(d, distgr, method="pearson")
kt[i,2] <-mt
}
kt
k.best<-which.max(kt$r)
# plot
plot(kt$k, kt$r, type="h", main="Mantel-optimal number of clusters - average",
xlab="k (number of groups)", ylab="Pearson's correlation")
axis(1, k.best, paste ("optimum", k.best, sep="\n"), col="red",
font=2, col.axis="red")
points(k.best, max(kt$r), pch=16, col="red", cex=1.5)
# This analysis says that there are 4 clusters
# Silhouette plot of the final partition
# You choose the number of clusters;
# let's trust the second analysis and choose 4 clusters
k <- 4
# Silhouette plot
cutg <-cutree(h.clust.av, k=k)
sil<-silhouette(cutg,d)
silo<-sortSilhouette(sil)
rownames(silo)<-row.names(bio.hel)[attr(silo,"iOrd")]
plot(silo, main="silohouette plot - Clustering Average",
cex.names=0.8, col=cutg+1, nmax.lab = 100)
# All coloured horizontal bars represent positive values,
# therefore this means that all 4 groups are coherent, and there
# are no misclassified sites
# Lets take this option forward and plot the final dendrogram:
# First reorder the sites so that their order in the distance matrix is
# respected as much as possible
ordclust <- reorder(h.clust.av, d)
plot(ordclust, hang=-1, xlab="4 groups", sub="", ylab="height",
main= "Final Dendrogram", labels=cutree(ordclust, k=k))
rect.hclust(ordclust, k=k)
# (6) INTERPRET THE ECOLOGICAL MEANING
# Find out what differentiates each group from the others
# Use the vegan function "tabasco"
tabasco(bio.hel, ordclust)
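# Optional sketch: mean Hellinger-transformed biomass per cluster, to see which
# species drive the separation (uses 'cutg' from the 4-group cutree above)
cl.means <- aggregate(bio.hel, by = list(cluster = cutg), FUN = mean)
round(cl.means[, -1], 2) # rows correspond to clusters 1 to 4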
###################################################################################
# b. Clustering (of sites according to environmental data) #
###################################################################################
library(vegan)
# Set the working directory:
setwd("C:/multivar")
# (1) Load the raw dataset
env <- read.table("environmental_quant.txt", h = T, sep = "\t")
# Note that this file is different from the one you have worked with
# as wave exposure is quantitative rather than categorical
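# Minimal sketch of the parallel workflow (ASSUMES the first column of 'env'
# holds the site names and all remaining columns are quantitative).
# Environmental variables are on different scales, so standardise them
# (z-scores) instead of Hellinger-transforming, then proceed as above:
# env.df <- data.frame(env[, -1], row.names = env[, 1])
# env.z  <- decostand(env.df, "standardize")
# d.env  <- vegdist(env.z, "euc")
# h.env  <- hclust(d.env, method = "average")
# plot(h.env)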
# THE END ################################################################################
##########################################################################################
# Report tips
##########################################################################################
# - Report in the METHODS the type of clustering analysis used, the algorithm used
#   to compute the association coefficients, the clustering method used,
#   and the strategy used to identify the number of significant clusters
# - Describe in RESULTS:
#   - Whether there is any grouping of the sites apparent according to their community
#     structure.
#   - Whether there is any grouping of the sites apparent according to their environmental
#     characteristics.
#   - What species and common environmental characteristics appear to be separating the
#     groups?
##########################################################################################
#Library files for courses provided by: Highland Statistics Ltd.
#To cite these functions, use:
#Mixed effects models and extensions in ecology with R. (2009).
#Zuur, AF, Ieno, EN, Walker, N, Saveliev, AA, and Smith, GM. Springer.
#Copyright Highland Statistics LTD.
#####################################################################
#VIF FUNCTION.
#To use: corvif(YourDataFile)
corvif <- function(dataz) {
dataz <- as.data.frame(dataz)
#correlation part
#cat("Correlations of the variables\n\n")
#tmp_cor <- cor(dataz,use="complete.obs")
#print(tmp_cor)
#vif part
form <- formula(paste("fooy ~ ",paste(strsplit(names(dataz)," "),collapse=" + ")))
dataz <- data.frame(fooy=1,dataz)
lm_mod <- lm(form,dataz)
cat("\n\nVariance inflation factors\n\n")
print(myvif(lm_mod))
}
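#Usage sketch (hypothetical variable names, not from the course data):
# x1 <- rnorm(50); x2 <- x1 + rnorm(50, sd = 0.1); x3 <- rnorm(50)
# corvif(data.frame(x1, x2, x3)) #x1 and x2 are collinear, so expect large VIFs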
#Support function for corvif. Will not be called by the user
myvif <- function(mod) {
v <- vcov(mod)
assign <- attributes(model.matrix(mod))$assign
if (names(coefficients(mod)[1]) == "(Intercept)") {
v <- v[-1, -1]
assign <- assign[-1]
} else warning("No intercept: vifs may not be sensible.")
terms <- labels(terms(mod))
n.terms <- length(terms)
if (n.terms < 2) stop("The model contains fewer than 2 terms")
if (length(assign) > dim(v)[1] ) {
diag(tmp_cor)<-0
if (any(tmp_cor==1.0)){
return("Sample size is too small, 100% collinearity is present")
} else {
return("Sample size is too small")
}
}
R <- cov2cor(v)
detR <- det(R)
result <- matrix(0, n.terms, 3)
rownames(result) <- terms
colnames(result) <- c("GVIF", "Df", "GVIF^(1/2Df)")
for (term in 1:n.terms) {
subs <- which(assign == term)
result[term, 1] <- det(as.matrix(R[subs, subs])) * det(as.matrix(R[-subs, -subs])) / detR
result[term, 2] <- length(subs)
}
if (all(result[, 2] == 1)) {
result <- data.frame(GVIF=result[, 1])
} else {
result[, 3] <- result[, 1]^(1/(2 * result[, 2]))
}
invisible(result)
}
#END VIF FUNCTIONS
##################################################################
##################################################################
#Here are some functions that we took from the pairs help file and
#modified, or wrote ourselves. To cite these, use the r citation: citation()
panel.cor <- function(x, y, digits=1, prefix="", cex.cor = 6)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r1=cor(x,y,use="pairwise.complete.obs")
r <- abs(cor(x, y,use="pairwise.complete.obs"))
txt <- format(c(r1, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) { cex <- 0.9/strwidth(txt) } else {
cex = cex.cor}
text(0.5, 0.5, txt, cex = cex * r)
}
##################################################################
panel.smooth2 = function (x, y, col = par("col"), bg = NA, pch = par("pch"),
                          cex = 1, col.smooth = "black", span = 2/3, iter = 3, ...)
{
  points(x, y, pch = pch, col = col, bg = bg, cex = cex)
  ok <- is.finite(x) & is.finite(y)
  if (any(ok))
    lines(stats::lowess(x[ok], y[ok], f = span, iter = iter),
          col = 1, ...)
}
##################################################################
panel.lines2 = function (x, y, col = par("col"), bg = NA, pch = par("pch"),
                         cex = 1, ...)
{
  points(x, y, pch = pch, col = col, bg = bg, cex = cex)
  ok <- is.finite(x) & is.finite(y)
  if (any(ok)) {
    tmp = lm(y[ok] ~ x[ok])
    abline(tmp)
  }
}
##################################################################
panel.hist <- function(x, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col="white", ...)
}
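#Usage sketch for the panel functions above (assumes a numeric data frame 'Z'):
# pairs(Z, upper.panel = panel.cor, diag.panel = panel.hist,
#       lower.panel = panel.smooth2)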
##################################################################
##################################################################
##################################################################
##################################################################
#Functions for variograms
#To cite these functions, use:
#Mixed effects models and extensions in ecology with R. (2009).
#Zuur, AF, Ieno, EN, Walker, N, Saveliev, AA, and Smith, GM. Springer.
#Make a variogram for one variable
#To use, type: MyVariogram(XUTM, YUTM, E , MyDistance=10)
# XUTM is x coordinates
# YUTM is y coordinates
# E is variable used in sample variogram
# MyDistance is the cutoff value for the distances
MyVariogram <- function(x,y,z, MyDistance) {
library(gstat)
mydata <- data.frame(z, x, y)
coordinates(mydata) <- c("x", "y")
Var <- variogram(z ~ 1, mydata, cutoff = MyDistance)
data.frame(Var$np, Var$dist, Var$gamma)
}
#Function for making multiple variograms in an xyplot
#To use, type: MultiVariogram(Z, MyVar,XUTM, YUTM, MyDistance=10)
# Z is a data frame with all the data
# MyVar is a character string with variable names that will be used in the xyplot
# XUTM is x coordinates
# YUTM is y coordinates
# MyDistance is the cutoff value for the distances
MultiVariogram <- function(Z, MyVar, x, y, MyDistance) {
#Z is the data frame with data
#MyVar is a list of variables for which variograms are calculated
#x, y: spatial coordinates
#MyDistance: limit for distances in the variogram
library(lattice)
VarAll<- c(NA,NA,NA,NA)
for (i in MyVar){
vi <- MyVariogram(x,y,Z[,i], MyDistance)
vii <- cbind(vi, i)
VarAll <- rbind(VarAll,vii)
}
VarAll <- VarAll[-1,]
P <- xyplot(Var.gamma ~ Var.dist | factor(i), col = 1, type = "p", pch = 16,
data = VarAll,
xlab = "Distance",
ylab = "Semi-variogram",
strip = function(bg='white', ...)
strip.default(bg='white', ...),
scales = list(alternating = T,
x = list(relation = "same"),
y = list(relation = "same"))
)
print(P)
}
#End variogram code
##########################################################
#Function for multi-panel Cleveland dotplot.
#The input file must contain no categorical variables
Mydotplot <- function(DataSelected){
P <- dotplot(as.matrix(as.matrix(DataSelected)),
groups=FALSE,
strip = strip.custom(bg = 'white',
par.strip.text = list(cex = 1.2)),
scales = list(x = list(relation = "free", draw = TRUE),
y = list(relation = "free", draw = FALSE)),
col=1, cex = 0.5, pch = 16,
xlab = list(label = "Value of the variable", cex = 1.5),
ylab = list(label = "Order of the data from text file", cex = 1.5))
print(P)
}
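#Usage sketch (hypothetical names): Mydotplot(Z[, c("x1", "x2")])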
#Add more code here:
Mybwplot <- function(Z, MyVar, TargetVar){
#Multipanel boxplots
#Z: data set
#MyVar: character string
#TargetVar: variable for the x-axis..must be a factor
AllY <- as.vector(as.matrix(Z[,MyVar]))
AllX <- rep(Z[,TargetVar], length(MyVar))
ID <- rep(MyVar, each = nrow(Z))
P <- bwplot(AllY ~ factor(AllX) | ID, horizontal = FALSE,
ylab = "", xlab = "",
scales = list(alternating = T,cex.lab = 1.5,
x = list(relation = "same",rot =90, abbreviate = TRUE, cex = 1.5),
y = list(relation = "free", draw = FALSE)),
strip = strip.custom(bg = 'white',
par.strip.text = list(cex = 1.2)),
cex = .5,
par.settings = list(
box.rectangle = list(col = 1),
box.umbrella = list(col = 1),
plot.symbol = list(cex = .5, col = 1)))
print(P)
}
#######################################################
MyxyplotBin <- function(Z, MyV, NameY1) {
AllX <- as.vector(as.matrix(Z[,MyV]))
AllY <- rep(Z[,NameY1] , length(MyV))
AllID <- rep(MyV, each = nrow(Z))
library(mgcv)
library(lattice)
P <- xyplot(AllY ~ AllX | factor(AllID), col = 1,
strip = function(bg='white', ...) strip.default(bg='white', ...),
scales = list(alternating = T,
x = list(relation = "free"),
y = list(relation = "same")),
xlab = "Covariate",
ylab = "Probability of presence",
panel=function(x,y){
panel.grid(h=-1, v= 2)
panel.points(x,y,col=1)
tmp<-gam(y~s(x, k = 4), family = binomial)
MyData <- data.frame(x = seq(min(x), max(x), length = 25))
p1 <- predict(tmp, newdata = MyData, type ="response")
panel.lines(MyData$x,p1, col = 1, lwd = 3)
})
print(P)
}
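#Usage sketch (hypothetical names): MyxyplotBin(Z, MyV = c("x1", "x2"), NameY1 = "presence"),
#where 'presence' is a 0/1 response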
#######################################################
#######################################################
Myxyplot <- function(Z, MyV, NameY1,MyYlab="") {
AllX <- as.vector(as.matrix(Z[,MyV]))
AllY <- rep(Z[,NameY1] , length(MyV))
AllID <- rep(MyV, each = nrow(Z))
library(mgcv)
library(lattice)
P <- xyplot(AllY ~ AllX|factor(AllID), col = 1,
xlab = list("Explanatory variables", cex = 1.5),
#ylab = list("Response variable", cex = 1.5),
#ylab = list("Pearson residuals", cex = 1.5),
ylab = list(MyYlab, cex = 1.5),
#layout = c(2,2), #Modify
strip = function(bg='white', ...)
strip.default(bg='white', ...),
scales = list(alternating = T,
x = list(relation = "free"),
y = list(relation = "same")),
panel=function(x, y){
panel.grid(h=-1, v= 2)
panel.points(x, y, col = 1)
panel.loess(x, y, span = 0.8,col = 1, lwd = 2)})
print(P)
}
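#Usage sketch (hypothetical names):
# Myxyplot(Z, MyV = c("x1", "x2"), NameY1 = "y", MyYlab = "Response")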
#######################################################
MyxyplotPolygon <- function(Z, MyV, NameY1) {
AllX <- as.vector(as.matrix(Z[,MyV]))
AllY <- rep(Z[,NameY1] , length(MyV))
AllID <- rep(MyV, each = nrow(Z))
library(mgcv)
library(lattice)
Z <- xyplot(AllY ~ AllX|factor(AllID), col = 1,
xlab = list(label = "Explanatory variables", cex = 1.5),
ylab = "",
strip = function(bg='white',cex.lab = 1.5,...)
strip.default(bg='white', ...),
scales = list(alternating = T,
x = list(relation = "free"),
y = list(relation = "same")),
panel=function(x, y){
t1 <- gam(y~s(x))
MD1 <- data.frame(x=seq(from = min(x, na.rm = TRUE),
to = max(x, na.rm = TRUE),
length = 100))
P1 <- predict(t1, se.fit = TRUE)
I1 <- order(x)
xs <- sort(x)
panel.lines(xs, P1$fit[I1], col = 1)
panel.polygon(c(xs, rev(xs)),
              c(P1$fit[I1] - 2 * P1$se.fit[I1],
                rev(P1$fit[I1] + 2 * P1$se.fit[I1])),
              col = grey(0.5), density = 10) #+/- 2 SE band (shading settings assumed)
})
print(Z)
}