######################################################
## Jessi Rick and Lucia Combrink
## Makes a PCA from VCF SNPs  
## Updated by Lucia Combrink 3/19/2022
## This script loads the VCF (SNP data) and metadata needed to make a vcf. 
######################################################

library(vcfR)
library(tidyverse)
library(reshape2)
library(broman)
library(ggthemes)
library(adegenet)
library(ggpubr)
library(dartR)
#library(alpha)
library(ggrepel)
library(patchwork)
library(ggrepel)

setwd("~/Desktop/WRProjectData/")

##Steps: 
#1) Define a function to calculate PCA 
#2) Import and clean data. 
#3) Do a PCA and combine with metadata
#4) Make PCA plots. 

### 1) Function to calculate PCA from genetic variances ####################################

do.pca<-function(gmat){  ### inds in columns, loci in rows
  gmn<-apply(gmat,1,mean, na.rm=T)  
  gmnmat<-matrix(gmn,nrow=nrow(gmat),ncol=ncol(gmat))
  gprime<-gmat-gmnmat ## remove mean
  gcovarmat<-matrix(NA,nrow=ncol(gmat),ncol=ncol(gmat))
  for(i in 1:ncol(gmat)){
    for(j in i:ncol(gmat)){
      if (i==j){
        gcovarmat[i,j]<-cov(gprime[,i],gprime[,j], use="pairwise.complete.obs")
      }
      else{
        gcovarmat[i,j]<-cov(gprime[,i],gprime[,j], use="pairwise.complete.obs")
        gcovarmat[j,i]<-gcovarmat[i,j]
      }
    }
  }
  prcomp(~.,data=as.data.frame(gcovarmat),center=TRUE,scale=FALSE,na.action=na.exclude)
}

### 2) Import and Clean VCF and metadata #####################################################

vcfCUT <- read.vcfR("forPCACUT_all_miss0.5_maf0.03.recode.vcf") #This vcf has not yet been filtered for missing data within individuals. 
metadata <- read.csv("forPCACUT_metadata.csv", header=TRUE, stringsAsFactors = FALSE) # The individuals in this metadata match to the individuals in the VCF

wrcut <- vcfR2genlight(vcfCUT) #Converting vcf to a genlight object
ploidy(wrcut) <- 2 #Specifying that ploidy = 2
wrcut <- gl.compliance.check(wrcut) #Check for monomorphic loci, found none so that's good.  

##Calculate proportion of missing data: 
#test <- apply(vcfCUT@gt,2, function(x) length(grep("^[.]/[.]",x))/length(x) )
#hist(test, labels = T) #Shows that I do have individuals with 50% missing data. 

wrcut_filt <- gl.filter.callrate(wrcut,method="ind",threshold=0.5,mono.rm=T,recalc=FALSE,v=2) # Removing individuals with < 50% data.

wrcut_filt_info <- left_join(data.frame(longID = indNames(wrcut_filt)),metadata, by="longID") #Combine filtered genotype data with metadata. 
pop(wrcut_filt) <- wrcut_filt_info$location #Adding population info to the filtered genotype data.

wrcut_filt_alleles <- t(as.matrix(wrcut_filt)) ## pull matrix of SNPs


### 3) Do a PCA and combine with metadata ##################################################################
#This PCA is done with Liz Mandeville & Alex Buerkle's code (modified by Jessi Rick and Lucia Combrink): 

wrcut_filt_pca <- do.pca(wrcut_filt_alleles) #Do PCA with do.pca() function. 
pcSummary_wrcut_filt <- summary(wrcut_filt_pca) #Summary of PCA

scree <- plot(wrcut_filt_pca, type="lines") #Scree plot to see which PC axes explain most of the variation. 

#Combine PCA with metadata. 
pcaAll_filt <- data.frame(names = wrcut_filt_info$longID,
                     location = as.factor(wrcut_filt_info$location),
                     strain = as.factor(wrcut_filt_info$strain),
                     lake_catagory = as.factor(wrcut_filt_info$lake_catagory), 
                     EV1 = wrcut_filt_pca$x[,1],    # the first eigenvector
                     EV2 = wrcut_filt_pca$x[,2],    # the second eigenvector
                     EV3 = wrcut_filt_pca$x[,3],    # the third eigenvector
                     EV4 = wrcut_filt_pca$x[,4],
                     EV5 = wrcut_filt_pca$x[,5],
                     stringsAsFactors = FALSE)


### 4) Make PCA plots #####################################################

#This is a "summary dataframe" for assigning easy-to-read labels below: 
pcaALL_EV_summary <- pcaAll_filt %>% group_by(location) %>% 
  summarize(meanEV1 = mean(EV1), #This identifies the mean location of each EV for printing labels. 
            meanEV2 = mean(EV2))
pcaALL_EV_summary$location <- dplyr::recode(pcaALL_EV_summary$location, 
                AuburnHatchery = "HatcheryYSC", #Change names. 
                TensleepHatchery = "HatcherySRC")


#First rename lakes for plot: 
pcaAll_filt$location <- dplyr::recode(pcaAll_filt$location, 
                                      HatcheryYSC = "HatcheryYSC", 
                                      HatcherySRC = "HatcherySRC", 
                                      SFOwlCreek = "SFOwlCrk", 
                                      `No Name(W)` = "NoNameW", 
                                      `Cutthroat(NFK)` = "CuttNFK", 
                                      `L. Black Joe` = "LBlackJoe", 
                                      Windy = "Windy", 
                                      `U. Jean` = "UJean", 
                                      `L. Jean` = "LJean")

#Finally, plot the pca! 
#pdf(file = "~/Desktop/final_figures/PCA.pdf", width = 7, height = 9) 
PCAplot <- ggplot() + 
  geom_point(pcaAll_filt, mapping=aes(x=EV1, y=EV2, color = as.factor(lake_catagory)), size = 6, alpha=0.6) + 
  #coord_trans(x = magnify_trans(intercept = -0.03, reducer = 10)) +
  geom_point(pcaAll_filt, mapping=aes(x=EV1, y=EV2, shape=as.factor(location)), size=2, alpha=0.8, color="black") + 
  scale_color_manual(values = c("#caaf92", "#6c704d", "#adbb6b"), 
                     labels = c("Hatchery/Reference", "Historic", "Recent/Historic")) + 
  theme_few() + scale_shape_manual(values=c(1,2,3,4,5,6,7,8,9)) + 
  scale_x_continuous(breaks=seq(-0.3, 1.3, 0.07)) + 
  scale_y_continuous(breaks=seq(-0.3, 1.2, 0.07)) + 
  theme(axis.text.x = element_text(angle=90)) + 
  geom_label_repel(data=pcaALL_EV_summary, mapping=aes(x=meanEV1,y=meanEV2, label=location),nudge_x = 0.15, nudge_y = 0.10, label.size=0.1, size=3.4) + 
  #box.padding = 1,
  #min.segment.length = 0) + 
  #scale_x_break(c(-0.06, 0.54), scales=0.4) + 
  #geom_label(aes(y=meanEV2, x=meanEV1, label=location), data=pcaALL_EV_summary, 
             #hjust="inward", vjust="inward", nudge_x = 0.01, nudge_y = 0.01, label.size=0.1, size=3.4) + 
  labs(x="PC1 (59.28%)", y="PC2 (30.40%)", shape="Lake", color="Population Category") + 
  theme(legend.position = c(0.5, 0.7)) + 
  #theme(legend.position = "none") +
  theme(legend.key.size = unit(0.4, 'cm'), #change legend key size
        legend.key.height = unit(0.4, 'cm'), #change legend key height
        legend.key.width = unit(0.4, 'cm'), #change legend key width
        legend.title = element_text(size=8), #change legend title font size
        legend.text = element_text(size=8)) #change legend text font size
#dev.off()


#Putting together a patchwork plot: 
#jpeg("~/Desktop/final figures/PCA_FST_noLabel.jpg",units="in", width=13, height=7, res=1000)
#pdf(file = "~/Desktop/final_figures/PCA_Legend.pdf", width = 13, height = 7) 
PCAplot + Fst_Plot + plot_annotation(tag_levels = 'A') + plot_layout(widths = c(3,2))
dev.off()








