# Introduction ----
# UT_PCA_script.R
# Started by Libby Natola on 10 Feb 2021.
# Based on R code written by Darren Irwin for Greenish Warbler analysis, and then the NA warbler analyses, and then the "TOWA_GBS_withMaddie_R_analysis_script.R", and finally "TOWA_BTNW_HEWA_8plates_R_analysis_script.R". Made using UTexas ddRAD Reads demultiplexed in Stacks.

# Initial setup ----

#I think he used this because there are so many data points and we want to see them all
options(max.print = 10000)

source("genomics_R_functions.R")

# For pcaMethods, need to do these commands:
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install(version = "3.12")
BiocManager::install(c("pcaMethods"))
library(pcaMethods) 

# PCA whole-genome ----
# Load vcf file containing only variable sites throughout genome;
# construct a PCA based on all sites passing an Fst threshold between the "groups" below;
# and all individuals in "groups.to.plot.PCA" according to colors in "group.colors.PCA"

groups_and_colors <- rbind(c("RNSA", "black"),
                           c("HYSA", "grey"),
                           c("YBSA", "white")
)

groups.to.plot.PCA <- groups_and_colors[,1]
group.colors.PCA <- groups_and_colors[,2]
groups <- c("RNSA", "YBSA", "HYSA")  # for purpose of calculating pairwise Fst and Fst_group (to determine SNPs)

base.file.name <- "UofLsnps/utexas4010FINAL.tab"
pos <- read.table(paste0(base.file.name, ".012_whut.pos"), col.names = c("chrom", "position"))
column_names <- c("null", paste("c", pos$chrom, pos$position, sep="."))
geno <- read.table(paste0(base.file.name, ".012NA"), colClasses = "integer", col.names = column_names)
### could not get this to run with ashley's vcf output. I used the .pos file from my stacks filtering protocol bc it was the same number of loci and i don't think it matters what theyre called

SNPnum <- length(geno[1,]) -1   # because the first column is not a SNP (just a count from zero)
ind <- read.table(paste0(base.file.name, ".012.indv"))
locations <- read.table("UT.Fst_groups4010.txt", header=TRUE)
num_loc_cols <- ncol(locations)
ind_with_locations <- cbind(ind,locations)
combo <- cbind(ind_with_locations[,2:(num_loc_cols+1)],geno[,2:length(geno[1,])])

# determine number of missing SNPs per bird, and filter out those with more than X% missing SNPs
X <- 85  # this is the percentage threshold 
threshold_NA <- SNPnum * X/100
numNAs <- rowSums(is.na(combo[(num_loc_cols+1):ncol(combo)]))
numNAs_by_ID <- data.frame(combo$ID, numNAs)  # useful to see numNAs per sample: numNAs_by_ID
selection <- (numNAs < threshold_NA)
if(any(is.na(selection))) cat("selection contains NA values\n")  # this is a check for noticing errors / bugs
combo.NApass.all <- combo[selection,]
combo$ID[which(selection==F)]
#[1] "RNSA_RN_CAB006"

###starting SNPnum:36(60_60), 132(40_60), 3192(20_60)
# filter out SNPs with too many missing genotypes:
SNP_NAs <- colSums(is.na(combo.NApass.all[,(num_loc_cols+1):ncol(combo.NApass.all)]))
X <- 85  # this is the percentage threshold
threshold_SNP_NAs <- length(combo.NApass.all[,1]) * X/100
selection <- (SNP_NAs <= threshold_SNP_NAs)
if(any(is.na(selection))) cat("selection contains NA values\n")  # this is a check for noticing errors / bugs
combo.NApass.subset <- combo.NApass.all[, c(rep(TRUE, times=num_loc_cols),selection)]
pos.subset <- pos[selection,]
### finished 31(60_60), 126 (40_60), 3187 (20_60)

# option to filter out all but selected chromosome (or set of them):
choose.chrom <- F
if (choose.chrom == TRUE) {
  chrom <- "Z"
  # selection <- (pos.subset$chrom == chrom)
  selection <- (pos.subset$chrom == chrom)
  if(any(is.na(selection))) cat("selection contains NA values\n")  # this is a check for noticing errors / bugs
  #pos.subset.one.chr <- pos.subset[selection,]
  #loci.selection <- c(rep(TRUE, times=num_loc_cols), selection)  # add placeholders for info columns
  # which(loci.selection == T)    # to check which entries are TRUE
  combo.NApass <- combo.NApass.subset[, c(rep(TRUE, times=num_loc_cols), selection)]
  pos.NApass <- pos.subset[selection,]
  region.text <- paste0("chr", chrom)	
}	else {
  region.text <- "whole_genome"
  combo.NApass <- combo.NApass.subset
  pos.NApass <- pos.subset
}

# Calculate allele freqs and sample sizes (use column Fst_group)
temp.list <- getFreqsAndSampleSizes(combo.NApass, num_loc_cols, groups)
freqs <- temp.list$freqs
sample_size <- temp.list$sample_size
rm(temp.list)
# calculate WC84_Fst 
temp.list <- getWC84Fst(freqs, sample_size, groups, among=TRUE)  # set among to FALSE if no among Fst wanted (some things won't work without it)
WC84_Fst <- temp.list$WC84_Fst
rm(temp.list)


# make the figure:
Fst.filter <- F   # option to filter to high-Fst markers only, using cutoff below
Fst.cutoff <- 0.25  # has no effect if Fst.filter is FALSE
# choose whether to filter by Fst between pair of populations, or by Fst_among (as defined above)
groups.to.compare <- "Fst_among"
axes <- 3
PCA_results <- plotPCA(Fst.filter, Fst.cutoff, groups.to.compare, WC84_Fst, combo.NApass, num_loc_cols, region.text,
                             groups.to.plot.PCA, group.colors.PCA, axes, flip1=T, flip2=F)

PCA_table <- data.frame(PCA_results$scores, row.names = PCA_results$data$ID)

PCA_results$var_explained
# [1] 0.5046197 0.6188376 0.6927196
0.5046197 - 0.6188376
# PC2 is [1] -0.1142179
PCA_results$loadings
### Make it purty

PCAs <- read.csv("UT.PCtable.csv")

library(ggplot2)
library(tidyr)
library(dplyr)


theme_set(theme_bw())
theme_update(plot.title = element_text(hjust = 0.5))

ggsave(file = "~/Documents/Lethbridge/Sapsuckers/Chapter 3/Publication/PubImages/UTexasPCA.pdf", dpi =500, dev=pdf)
UT_PCA <- ggplot(PCAs, aes(x=PC1, y=PC2, color = Species, fill=Species, shape = Species)) + geom_point(size=4) + xlab("PC1 50.4%") + ylab("PC2 11.4%") + ggtitle("ddRAD PCA") + scale_fill_manual(values=c("grey", "black", "white")) + scale_color_manual(values=c("black", "black", "black")) + scale_shape_manual(values = c(23,24,21)) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
UT_PCA
dev.off()

ggsave(file = "~/Documents/Lethbridge/Sapsuckers/Chapter 3/Publication/PubImages/UTexasPCAsympallo.pdf", dpi = 500, dev=pdf)
UT_PCA_sympallo <- ggplot(PCAs, aes(x=PC1, y=PC2, color = Location, fill = Species, shape = Species)) + geom_point(size=4) + xlab("PC1 50.4%") + ylab("PC2 11.4%") + ggtitle("ddRAD PCA") + scale_fill_manual(values=c("grey", "black", "white")) + scale_color_manual(values=c("red", "black", "grey")) + scale_shape_manual(values = c(23,24,21)) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
UT_PCA_sympallo
dev.off()
