############################################################################################
#                                                                                          #
# Metabolomics analysis - PCA/PLSDA/SPLSDA                                                 #
#                                                                                          #
#                                                                                          #
# Victor J. Carrion                                                                        #
# Netherlands Institute of Ecology                                                         #
# v.carrion@nioo.knaw.nl                                                                   #  
# victorcarrionbravo@hotmail.com                                                           #
#                                                                                          #
############################################################################################

library(mixOmics)
library(ggplot2)
library(reshape2)
library(ggdendro)
library(grid)


#Figure 2####
#Figure 2B
# Boxplot of the DI column, one box per Treatment level.
# The input is a tab-separated table with a header row; the plot uses its
# Treatment and DI columns.
inputVOCs <- read.table("1_input_DI.txt", header = TRUE, sep = "\t")

# Base plot: theme_bw() with grid lines and panel background removed and
# black axis lines added.
p <- ggplot(inputVOCs, aes(x = Treatment, y = DI)) +
  geom_boxplot(aes(fill = Treatment), width = 0.8) +
  theme_bw() +
  theme(
    panel.background = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.line = element_line(colour = "black")
  )

# Manual fill colours (nine values: six white, then gray, black, white) and
# larger axis text; the expression is auto-printed to draw the figure.
p +
  scale_fill_manual(values = c(rep("white", 6), "gray", "black", "white")) +
  theme(axis.text = element_text(size = 15))

#Figure 3####
#Figure 3B
# Boxplot of the day4 column, one box per Treatment level.
# The input is a tab-separated table with a header row; the plot uses its
# Treatment and day4 columns.
inputVOCs <- read.table("2_input_in_vitro_VOCs.txt", header = TRUE, sep = "\t")

# Base plot: theme_bw() with grid lines and panel background removed and
# black axis lines added.
p <- ggplot(inputVOCs, aes(x = Treatment, y = day4)) +
  geom_boxplot(aes(fill = Treatment), width = 0.8) +
  theme_bw() +
  theme(
    panel.background = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.line = element_line(colour = "black")
  )

# Six manual fill colours and larger axis text; auto-printed to draw the figure.
p +
  scale_fill_manual(
    values = c("green4", "royalblue", "sienna2", "grey", "palevioletred3", "white")
  ) +
  theme(axis.text = element_text(size = 15))


#Figure 3C
# PCA followed by PLS-DA (mixOmics) on the table in "3_input_pareto_norm.txt".

wild <- read.table("3_input_pareto_norm.txt", sep = "\t", header = TRUE)

head(wild)
tail(wild)
# Columns 3 to 503 form the predictor matrix; column 2 is kept as the group label.
X_wild <- wild[, 3:503]
head(X_wild)
tail(X_wild)
group_wild <- wild[, 2]
head(group_wild)

#Preliminary analysis with PCA: PCA analysis on the volatomic data.
# Three components, with centring and unit-variance scaling of every column.
pca.wild <- pca(X_wild, ncomp = 3, center = TRUE, scale = TRUE)
pca.wild
plot(pca.wild)

#PCA sample plot with group ellipses (ellipse = TRUE)
plotIndiv(pca.wild, group = wild$Group, ind.names = FALSE,
          ellipse = TRUE, legend = TRUE, title = 'VOCs Burkholderia spp.',
          size.xlabel = 20, size.ylabel = 20, size.axis = 25, pch = 15, cex = 5)

#PCA sample plot without group ellipses (ellipse = FALSE)
plotIndiv(pca.wild, group = wild$Group, ind.names = FALSE,
          ellipse = FALSE, legend = TRUE, title = 'VOCs Burkholderia spp.',
          size.xlabel = 20, size.ylabel = 20, size.axis = 25, pch = 15, cex = 5)


#PLS-DA analysis
#The PLS-DA and sPLS-DA analyses below will help refine the clusters of samples in a supervised fashion.
# Coerce the class labels to a factor explicitly: since R 4.0.0 read.table()
# returns character columns by default, and summary() of a character vector
# prints no per-class counts.
Y_wild <- as.factor(wild$Group)
summary(Y_wild)


#this chunk takes ~ 5 min to run
set.seed(32) # for reproducibility of the outputs of this code that performs random cross-validation sampling.
wild.plsda.perf <- plsda(X_wild, Y_wild, ncomp = 6)
# Performance assessment: 6-fold cross-validation repeated 10 times.
wild.perf.plsda <- perf(wild.plsda.perf, validation = 'Mfold', folds = 6,
                        progressBar = FALSE, nrepeat = 10)

head(wild.perf.plsda$error.rate)

plot(wild.perf.plsda, overlay = 'measure', sd = TRUE)

# Model used for the sample plots. It is fitted once and reused by both
# plotIndiv() calls below (the original refitted the identical model twice).
# NOTE(review): perf() above was run on a 6-component model while this one has
# 10 components; only components 1 and 2 are plotted - confirm this is intended.
wild.plsda <- plsda(X_wild, Y_wild, ncomp = 10)

#PLS-DA sample plot with group ellipses, default styling
plotIndiv(wild.plsda, comp = c(1, 2),
          group = wild$Group, ind.names = FALSE,
          ellipse = TRUE, legend = TRUE, title = 'wild, PLSDA comp 1 - 2')

plot(wild.perf.plsda, overlay = 'measure', sd = TRUE)

#PLS-DA sample plot with group ellipses, ggplot2 style and enlarged labels
# NOTE(review): this plot was originally labelled "without centroids" although
# ellipse = TRUE; the code is kept as it was - confirm which one was intended.
plotIndiv(wild.plsda, comp = c(1, 2),
          group = wild$Group, style = 'ggplot2', ind.names = FALSE,
          ellipse = TRUE, legend = TRUE, title = 'VOCs Burkholderia spp.',
          size.xlabel = 20, size.ylabel = 20, size.axis = 25, pch = 15, cex = 5)

#Figure 3D
#differential abundance wild
# Dot plot of Pgave_ratio against RT, one point per row of the input table.

df <- read.delim("4_stats_KW_BH_dot_plot.txt", header = TRUE, sep = "\t")
head(df, 10)

# Point colours come straight from the Class_ratio_stats column. They are set
# outside aes(), so the values are used as-is and no legend is produced.
g <- ggplot(df, aes(x = RT, y = Pgave_ratio)) +
  geom_point(size = 6, colour = df$Class_ratio_stats, pch = 20) +
  theme_bw()

# Remove grid lines and enlarge/embolden the axis text and titles.
# NOTE(review): linetype = 0.5 is not one of the documented line types
# (integers 0-6 or names such as "solid"); confirm whether a solid border was
# intended before changing it, since that would alter the rendered frame.
g + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
          panel.background = element_rect(fill = "white", colour = "black",
                                          linetype = 0.5, size = 1.5),
          axis.ticks = element_line(colour = "black"),
          axis.title.x = element_text(face = "bold", colour = "black", size = 10),
          axis.title.y = element_text(face = "bold", colour = "black", size = 10),
          axis.text.x = element_text(face = "bold", colour = "black", angle = 360,
                                     vjust = 0.5, size = 25),
          axis.text.y = element_text(face = "bold", colour = "black", angle = 360,
                                     vjust = 0.5, size = 25))

#Figure 3E
#heatmap only sulf
# Clustered heatmap with a dendrogram on top and another on the right,
# assembled from three ggplot objects placed in grid viewports.
FW.data <- read.table("5_heatmap_KW_BH_sulf_values.txt", header = TRUE, sep = "\t")
head(FW.data)
tail(FW.data)

Sample.names <- read.table("6_heatmap_ID.txt", header = TRUE, sep = "\t")
head(Sample.names)
tail(Sample.names)

Metabolite.names <- read.table("7_heatmap_sulf_metabolites.txt", header = TRUE, sep = "\t")
head(Metabolite.names)
tail(Metabolite.names)



# Standardise columns 3-37 (centre and scale each column), then label rows and
# columns from the first column of the two name tables.
x <- as.matrix(scale(subset(FW.data, select = c(3:37))))
row.names(x) <- Sample.names[, 1]
colnames(x) <- Metabolite.names[, 1]

# Clustering of the ROWS of x: dist() computes distances between rows.
# The object names dd.col / col.ord are kept from the original script; they
# hold the dendrogram and ordering used for the heatmap's x axis.
dd.col <- as.dendrogram(hclust(dist(x), method = "ward.D"))
col.ord <- order.dendrogram(dd.col)

# Clustering of the COLUMNS of x (rows of t(x)); used for the heatmap's y axis.
dd.row <- as.dendrogram(hclust(dist(t(x)), method = "ward.D"))
row.ord <- order.dendrogram(dd.row)

# Reorder rows and columns by the two dendrograms. x is already standardised
# above, so it is not passed through scale() a second time.
xx <- x[col.ord, row.ord]
head(xx)
xx_names <- attr(xx, "dimnames")
df <- as.data.frame(xx)
head(df)
colnames(df) <- xx_names[[2]]
df$samples <- xx_names[[1]]
# Fix the factor levels in dendrogram order so ggplot keeps that order on the axis.
df$samples <- with(df, factor(samples, levels = samples, ordered = TRUE))

# Long format: one row per (sample, variable) pair for geom_tile().
mdf <- melt(df, id.vars = "samples")

ddata_x <- dendro_data(dd.row)
ddata_y <- dendro_data(dd.col)

# Set up a blank theme
# The original call contained an unnamed element_blank() argument after
# axis.text.x. In ggplot2 releases where `line` is the first formal of theme()
# it was matched positionally to `line`; it is named here so the result does
# not depend on argument position.
theme_none <- theme(
  line = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank(),
  axis.title.x = element_text(colour = NA),
  axis.title.y = element_blank(),
  axis.text.x = element_text(size = 1),
  axis.text.y = element_blank(),
  axis.line = element_blank()
  #axis.ticks.length = element_blank()
)

# Create plot components #
# Heatmap
p1 <- ggplot(mdf, aes(x = samples, y = variable)) +
  geom_tile(aes(fill = value)) +
  scale_fill_gradient2()
p1


# Dendrogram 1 (from dd.col; drawn above the heatmap)
p2 <- ggplot(segment(ddata_y)) +
  geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
  theme_none + theme(axis.title.x = element_blank())
p2



# Dendrogram 2 (from dd.row; flipped and drawn to the right of the heatmap)
p3 <- ggplot(segment(ddata_x)) +
  geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
  coord_flip() + theme_none
p3

### Draw graphic

# Each component is printed into its own viewport on a fresh page; the viewport
# arguments are named (same values as the original positional width/height).
grid.newpage()
print(p1, vp = viewport(x = 0.4, y = 0.4, width = 0.8, height = 0.8))
print(p2, vp = viewport(x = 0.42, y = 0.9, width = 0.74, height = 0.2))
print(p3, vp = viewport(x = 0.9, y = 0.4, width = 0.2, height = 0.8))





#Figure 5####
#Figure 5A, mutants in vitro
# Boxplot of the day4 column, one box per Treatment level.
indexes <- read.table("8_input_in_vitro_VOCs_mutants.txt", header = TRUE, sep = "\t")

# Base plot: theme_bw() without grid lines or panel background, black axis
# lines, large axis text and bold axis titles (all set in a single theme() call).
p <- ggplot(indexes, aes(x = Treatment, y = day4)) +
  geom_boxplot(aes(fill = Treatment), width = 0.8) +
  theme_bw() +
  theme(
    panel.background = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.line = element_line(colour = "black"),
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 25)
  )

# Five manual fill colours; auto-printed to draw the figure.
p +
  scale_fill_manual(values = c("orange", "red1", "red4", rep("white", 2)))



#Figure 5B, disease incidence mutants in vivo
# Boxplot of the DIS.POT column, one box per Treatment level.
indexes <- read.table("9_input_DI_mutants.txt", header = TRUE, sep = "\t")

# Base plot: theme_bw() without grid lines or panel background, black axis
# lines, large axis text and bold axis titles (all set in a single theme() call).
p <- ggplot(indexes, aes(x = Treatment, y = DIS.POT)) +
  geom_boxplot(aes(fill = Treatment), width = 0.8) +
  theme_bw() +
  theme(
    panel.background = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.line = element_line(colour = "black"),
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 25)
  )

# Five manual fill colours; auto-printed to draw the figure.
p +
  scale_fill_manual(values = c("orange", "red1", "red4", rep("white", 2)))

#VOCs collected from Burkholderia mutants#
#Figure 5C, PLS-DA
# PCA followed by PLS-DA (mixOmics) on "10_input_pareto_norm_mutants.txt".
noPDA <- read.table("10_input_pareto_norm_mutants.txt", sep = "\t", header = TRUE)

head(noPDA)
tail(noPDA)
# Columns 3 to 503 form the predictor matrix; column 2 is kept as the group label.
X_noPDA <- noPDA[, 3:503]
head(X_noPDA)
tail(X_noPDA)
group_noPDA <- noPDA[, 2]
head(group_noPDA)

#Preliminary analysis with PCA
#We start a preliminary investigation with PCA analysis on the volatomic data.
# Three components, with centring and unit-variance scaling of every column.
pca.noPDA <- pca(X_noPDA, ncomp = 3, center = TRUE, scale = TRUE)
pca.noPDA
plot(pca.noPDA)

#The printed PCA object reports the variance explained by the 3 principal components, and the barplot above shows the variance explained per component.
# NOTE(review): the original note quoted 28% of total variance for 3 components;
# that figure cannot be checked from the script - confirm it against the output.

#Note that it is preferable to first run a PCA with a large number of components (e.g. 10), then look on the barplot for where the 'elbow' (sudden drop) appears to choose the final number of PCs.


#PCA sample plot with group ellipses (ellipse = TRUE)
plotIndiv(pca.noPDA, group = noPDA$Group, ind.names = FALSE,
          ellipse = TRUE, legend = TRUE, title = 'B. graminis and mutants',
          size.xlabel = 20, size.ylabel = 20, size.axis = 25, pch = 15, cex = 5)


#PCA sample plot without group ellipses (ellipse = FALSE)
plotIndiv(pca.noPDA, group = noPDA$Group, ind.names = FALSE,
          ellipse = FALSE, legend = TRUE, title = 'B. graminis and mutants',
          size.xlabel = 20, size.ylabel = 20, size.axis = 25, pch = 15, cex = 5)


#PLS-DA analysis
#The PLS-DA and sPLS-DA analyses below will help refine the clusters of samples in a supervised fashion. For a supervised analysis, we set up the Y as a factor indicating the class membership of each sample.
# Coerce the class labels to a factor explicitly: since R 4.0.0 read.table()
# returns character columns by default, and summary() of a character vector
# prints no per-class counts.
Y_noPDA <- as.factor(noPDA$Group)
summary(Y_noPDA)


#this chunk takes ~ 5 min to run
set.seed(32) # for reproducibility of the outputs of this code that performs random cross-validation sampling. To be removed in proper analysis
noPDA.plsda.perf <- plsda(X_noPDA, Y_noPDA, ncomp = 3)
# Performance assessment: 3-fold cross-validation repeated 3 times.
noPDA.perf.plsda <- perf(noPDA.plsda.perf, validation = 'Mfold', folds = 3,
                         progressBar = FALSE, nrepeat = 3)



head(noPDA.perf.plsda$error.rate)

plot(noPDA.perf.plsda, overlay = 'measure', sd = TRUE)

# Model used for the sample plots. It is fitted once and reused by both
# plotIndiv() calls below (the original refitted the identical model twice).
noPDA.plsda <- plsda(X_noPDA, Y_noPDA, ncomp = 3)

#PLS-DA sample plot with group ellipses, default styling
plotIndiv(noPDA.plsda, comp = c(1, 2),
          group = noPDA$Group, ind.names = FALSE,
          ellipse = TRUE, legend = TRUE, title = 'noPDA, PLSDA comp 1 - 2')

plot(noPDA.perf.plsda, overlay = 'measure', sd = TRUE)

#PLS-DA sample plot with group ellipses, ggplot2 style and enlarged labels
plotIndiv(noPDA.plsda, comp = c(1, 2),
          group = noPDA$Group, style = 'ggplot2', ind.names = FALSE,
          ellipse = TRUE, legend = TRUE, title = 'B. graminis and mutants',
          size.xlabel = 20, size.ylabel = 20, size.axis = 25, pch = 15, cex = 5)






#Figure 5D, differential abundance Pg vs dsr
# Both dot plots in this section (5D and 5E) read from the same table `df`.
df <- read.delim("11_stats_KW_BH_dot_plot_mutants.txt", header = TRUE, sep = "\t")
head(df, 10)

# Theme shared by the 5D and 5E dot plots: no grid lines, bold enlarged axis
# text and titles.
# NOTE(review): linetype = 0.5 is not one of the documented line types
# (integers 0-6 or names such as "solid"); confirm whether a solid border was
# intended before changing it, since that would alter the rendered frame.
dot_plot_theme <- theme(
  panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
  panel.background = element_rect(fill = "white", colour = "black",
                                  linetype = 0.5, size = 1.5),
  axis.ticks = element_line(colour = "black"),
  axis.title.x = element_text(face = "bold", colour = "black", size = 10),
  axis.title.y = element_text(face = "bold", colour = "black", size = 10),
  axis.text.x = element_text(face = "bold", colour = "black", angle = 360,
                             vjust = 0.5, size = 25),
  axis.text.y = element_text(face = "bold", colour = "black", angle = 360,
                             vjust = 0.5, size = 25)
)

# Ratio_Pg_dsr against Ret.min.; point colours are taken as-is from the
# Class_ratio_stats_dsr column (set outside aes(), so no legend is produced).
g <- ggplot(df, aes(x = Ret.min., y = Ratio_Pg_dsr)) +
  geom_point(size = 6, colour = df$Class_ratio_stats_dsr, pch = 20) +
  theme_bw()
g + dot_plot_theme

#to add labels
#+ geom_text(aes(label=df$label),hjust=2, vjust=3)

#Figure 5E, differential abundance Pg vs cs
# NOTE(review): the columns used below are suffixed "_cd" while the heading
# says "cs" - confirm which label is correct.

# Ratio_Pg_cd against Ret.min.; point colours are taken as-is from the
# Class_ratio_stats_cd column.
g <- ggplot(df, aes(x = Ret.min., y = Ratio_Pg_cd)) +
  geom_point(size = 6, colour = df$Class_ratio_stats_cd, pch = 20) +
  theme_bw()
g + dot_plot_theme



#Figure 5F
#heatmap only sulf
# Clustered heatmap with a dendrogram on top and another on the right,
# assembled from three ggplot objects placed in grid viewports.

FW.data <- read.table("12_heatmap_KW_BH_sulf_values_mutants.txt", header = TRUE, sep = "\t")
head(FW.data)
tail(FW.data)

Sample.names <- read.table("13_heatmap_ID_mutants.txt", header = TRUE, sep = "\t")
head(Sample.names)
tail(Sample.names)

Metabolite.names <- read.table("14_heatmap_sulf_metabolites_mutants.txt", header = TRUE, sep = "\t")
head(Metabolite.names)
tail(Metabolite.names)



# Standardise columns 3-60 (centre and scale each column), then label rows and
# columns from the first column of the two name tables.
x <- as.matrix(scale(subset(FW.data, select = c(3:60))))
row.names(x) <- Sample.names[, 1]
colnames(x) <- Metabolite.names[, 1]

# Clustering of the ROWS of x: dist() computes distances between rows.
# The object names dd.col / col.ord are kept from the original script; they
# hold the dendrogram and ordering used for the heatmap's x axis.
dd.col <- as.dendrogram(hclust(dist(x), method = "ward.D"))
col.ord <- order.dendrogram(dd.col)

# Clustering of the COLUMNS of x (rows of t(x)); used for the heatmap's y axis.
dd.row <- as.dendrogram(hclust(dist(t(x)), method = "ward.D"))
row.ord <- order.dendrogram(dd.row)

# Reorder rows and columns by the two dendrograms. x is already standardised
# above, so it is not passed through scale() a second time.
xx <- x[col.ord, row.ord]
head(xx)
xx_names <- attr(xx, "dimnames")
df <- as.data.frame(xx)
head(df)
colnames(df) <- xx_names[[2]]
df$samples <- xx_names[[1]]
# Fix the factor levels in dendrogram order so ggplot keeps that order on the axis.
df$samples <- with(df, factor(samples, levels = samples, ordered = TRUE))

# Long format: one row per (sample, variable) pair for geom_tile().
mdf <- melt(df, id.vars = "samples")

ddata_x <- dendro_data(dd.row)
ddata_y <- dendro_data(dd.col)

### Set up a blank theme
# The original call contained an unnamed element_blank() argument after
# axis.text.x. In ggplot2 releases where `line` is the first formal of theme()
# it was matched positionally to `line`; it is named here so the result does
# not depend on argument position.
theme_none <- theme(
  line = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank(),
  axis.title.x = element_text(colour = NA),
  axis.title.y = element_blank(),
  axis.text.x = element_text(size = 1),
  axis.text.y = element_blank(),
  axis.line = element_blank()
  #axis.ticks.length = element_blank()
)

### Create plot components ###
# Heatmap
p1 <- ggplot(mdf, aes(x = samples, y = variable)) +
  geom_tile(aes(fill = value)) +
  scale_fill_gradient2()
p1

# Dendrogram 1 (from dd.col; drawn above the heatmap)
p2 <- ggplot(segment(ddata_y)) +
  geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
  theme_none + theme(axis.title.x = element_blank())
p2

# Dendrogram 2 (from dd.row; flipped and drawn to the right of the heatmap)
p3 <- ggplot(segment(ddata_x)) +
  geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
  coord_flip() + theme_none
p3

### Draw graphic

# Each component is printed into its own viewport on a fresh page; the viewport
# arguments are named (same values as the original positional width/height).
grid.newpage()
print(p1, vp = viewport(x = 0.4, y = 0.4, width = 0.8, height = 0.8))
print(p2, vp = viewport(x = 0.42, y = 0.9, width = 0.74, height = 0.2))
print(p3, vp = viewport(x = 0.9, y = 0.4, width = 0.2, height = 0.8))





















