# Start from a clean workspace: remove every object that ls() reports in the
# current environment before the analysis runs.
# NOTE(review): this also deletes anything the caller had loaded before running
# the script; starting a fresh R session is a safer way to get a clean state.
rm(list=ls())
library(kmed)
library(ape)
library(ggplot2)

# Read the FASTA file as DNA, then compute the pairwise distance object that
# every clustering step below operates on. dist.dna() is called without a
# model argument, so the package default applies.
x <- read.FASTA(
  file = "H7_HA_IRDB_2019_1_14_aligned_muscle_cleaned.fas",
  type = "DNA"
)
y <- dist.dna(x)


# Plotting the WSS (elbow plot)
# For each candidate number of clusters, run k-means on `y` and record the
# total within-cluster sum of squares, then plot WSS against k.
# NOTE(review): `y` is the object returned by dist.dna(); kmeans() is given it
# directly, so it is presumably coerced to a full numeric matrix and each row
# is clustered as a feature vector. Confirm that this is the intended input.

# Single source of truth for the candidate cluster counts; the fits, the data
# frame and the axis breaks below are all derived from it.
Clusters <- 1:10

# One fit per k, evaluated in increasing order of k. `tot.withinss` is already
# a single number, so no further summation is needed, and vapply() returns a
# preallocated numeric vector instead of growing one inside a loop.
wss <- vapply(
  Clusters,
  function(k) {
    kmeans(y, centers = k, iter.max = 1000, nstart = 100)$tot.withinss
  },
  numeric(1)
)

wss_data <- data.frame(Clusters = Clusters, wss = wss)
ggplot(data = wss_data, aes(x = Clusters, y = wss)) +
  geom_line(size = 1) +
  geom_point(size = 1.5) +
  labs(
    x = "Number of Clusters",
    y = "Total of the Within Clusters Sum of the Squares"
  ) +
  scale_x_continuous(breaks = Clusters)


# K-means cluster analysis
# Fit solutions with 7 down to 3 clusters on `y`, each using 100 random starts
# and at most 1000 iterations. The fits are run in this order (7 first) so the
# random number stream is consumed in the same sequence on every run.
fit7 <- kmeans(y, centers = 7, iter.max = 1000, nstart = 100)
fit6 <- kmeans(y, centers = 6, iter.max = 1000, nstart = 100)
fit5 <- kmeans(y, centers = 5, iter.max = 1000, nstart = 100)
fit4 <- kmeans(y, centers = 4, iter.max = 1000, nstart = 100)
fit3 <- kmeans(y, centers = 3, iter.max = 1000, nstart = 100)

# Collect the cluster assignments side by side, one column per solution in the
# order k = 3, 4, 5, 6, 7, and save them. The vectors are passed to cbind()
# unnamed, so the columns receive auto-generated names in the CSV.
mydata <- data.frame(
  cbind(
    fit3$cluster,
    fit4$cluster,
    fit5$cluster,
    fit6$cluster,
    fit7$cluster
  )
)
write.csv(mydata, file = "clustered_HA_kmeans_RNA.csv")

# Summary of the 3-cluster solution: cluster sizes, iterations used,
# between-cluster sum of squares and per-cluster within sum of squares.
fit3$size
fit3$iter
fit3$betweenss
fit3$withinss

#Diagnostics

# Clustering callback handed to clustboot() in this script.
#   x      - data to cluster; passed unchanged to kmeans().
#   nclust - number of clusters requested from kmeans().
# Returns the vector of cluster memberships from the k-means fit.
# Note: kmeans() runs with its default iteration and start settings here,
# unlike the main fits in this script, which set iter.max and nstart.
kmboot <- function(x, nclust) {
  kmeans(x, centers = nclust)$cluster
}
# Bootstrap the k-means clustering: 1000 bootstrap replicates of a 6-cluster
# solution on `y`, using kmboot() as the clustering algorithm. diss = FALSE
# tells clustboot() not to treat the input as a dissimilarity matrix.
kmeansboot <- clustboot(
  y,
  nclust = 6,
  algorithm = kmboot,
  nboot = 1000,
  diss = FALSE
)
#write.csv(kmeansboot, "bootstrap_clustered_HA_4_kmeans_RNA.csv")
# Ordering callback handed to consensusmatrix() in this script.
#   x      - passed unchanged to hclust(), so it must be an input hclust()
#            accepts (presumably a dist object - confirm against the caller).
#   nclust - number of groups the tree is cut into.
# Returns the membership vector obtained by cutting a Ward (ward.D2)
# hierarchical clustering of `x` into `nclust` groups.
wardorder <- function(x, nclust) {
  cutree(hclust(x, method = "ward.D2"), k = nclust)
}
# Build the consensus matrix from the bootstrap replicates for 6 clusters,
# ordered with wardorder(), and draw it as a heatmap titled "HA Clustering".
consensuskmeans <- consensusmatrix(
  kmeansboot,
  nclust = 6,
  reorder = wardorder
)
clustheatmap(consensuskmeans, title = "HA Clustering")

# Plot the usearch clustering results: Clusters against Identity, with one
# line per Alignment. Colour is mapped on the line layer and point shape on
# the point layer, both by Alignment. The CSV must provide the Identity,
# Clusters and Alignment columns referenced below.
usearch <- read.csv(file = "usearch_clustering.csv", header = TRUE)

ggplot(usearch, aes(Identity, Clusters, group = Alignment)) +
  geom_line(aes(colour = Alignment), size = 1.5) +
  geom_point(aes(shape = Alignment), size = 2)

