# The following is a list of commands used to generate the data and run analyses in O'Donnell and Sullivan:
# "Low coverage whole genome sequencing reveals molecular markers for spawning season and sex identification
# in Gulf of Maine Atlantic cod (Gadus morhua, Linnaeus 1758)"

#### SEQUENCE TRIMMING ####
# All of these commands loop over a Samples.txt file containing one sample name per line
while read -r line; do

# Trim reads with Trimmomatic
java -jar trimmomatic-0.38.jar PE -phred33 \
  /data/prj/Cod/R1_R2_concat/$line\_concat_L1234_R1_001.fastq \
  /data/prj/Cod/R1_R2_concat/$line\_concat_L1234_R2_001.fastq \
  /data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_PR1_001.fastq.gz \
  /data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_UR1_001.fastq.gz \
  /data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_PR2_001.fastq.gz \
  /data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_UR2_001.fastq.gz \
  LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36

#### MAPPING ####
# Align reads to the reference genome using bowtie2
# Align unpaired reads
./bowtie2 --very-sensitive-local -p 8 -x /data/prj/Cod/Assembly/gadMor2ref \
  -U /data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_UR1_001.fastq,/data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_UR2_001.fastq \
  -S /data/prj/Cod/Assembly/codGenomes/$line\_U.sam

# Align paired reads
./bowtie2 --very-sensitive-local -p 8 -x /data/prj/Cod/Assembly/gadMor2ref \
  -1 /data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_PR1_001.fastq \
  -2 /data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_PR2_001.fastq \
  -S /data/prj/Cod/Assembly/codGenomes/$line\_P.sam

#### SEQUENCE FILTERING ####
# Convert and organize files using SAMtools
# Convert unpaired and paired SAM to BAM, keeping only reads with mapping quality greater than 20
samtools view -S -b -q 20 -o /data/prj/Cod/Assembly/codGenomesBAM/$line\_U.bam /data/prj/Cod/Assembly/codGenomes/$line\_U.sam
samtools view -S -b -q 20 -o /data/prj/Cod/Assembly/codGenomesBAM/$line\_P.bam /data/prj/Cod/Assembly/codGenomes/$line\_P.sam

# Sort the BAM files
samtools sort -l 0 /data/prj/Cod/Assembly/codGenomesBAM/$line\_P.bam /data/prj/Cod/Assembly/codGenomesBAM/$line\_Psorted
samtools sort -l 0 /data/prj/Cod/Assembly/codGenomesBAM/$line\_U.bam /data/prj/Cod/Assembly/codGenomesBAM/$line\_Usorted

# Merge the paired and unpaired BAM files
samtools merge /data/prj/Cod/Assembly/codGenomesBAM/$line\_merged.bam \
  /data/prj/Cod/Assembly/codGenomesBAM/$line\_Psorted.bam \
  /data/prj/Cod/Assembly/codGenomesBAM/$line\_Usorted.bam

# Soft clip read ends so that only the read with the highest quality score is retained in overlapping regions, using clipOverlap in bamUtil
./bam clipOverlap --in /data/prj/Cod/Assembly/codGenomesBAM/$line\_merged.bam \
  --out /data/prj/Cod/Assembly/codGenomesBAM/$line\_clipped.bam --stats --unmapped

# Remove duplicate reads using the MarkDuplicates module of Picard
java -jar picard.jar MarkDuplicates \
  I=/data/prj/Cod/Assembly/codGenomesBAM/$line\_clipped.bam \
  O=/data/prj/Cod/Assembly/codGenomes_rmdups/$line\_rm_dups.bam \
  M=/data/prj/Cod/Assembly/codGenomes_rmdups/$line\_Dup_Metrics.txt \
  REMOVE_DUPLICATES=true

#### COVERAGE ESTIMATION ####
# Estimate the coverage and mapping depth for each individual using samtools mpileup
samtools mpileup -A -B -Q 0 -f /data/prj/Cod/Assembly/gadMor2.fasta -a \
  /data/prj/Cod/Assembly/codGenomes_rmdups/$line\_rm_dups.bam > /data/prj/Cod/Assembly/Coverage/mpileup/$line\.pileup

# Custom script to estimate mapping depth after excluding positions with >4x the mean mapping depth
# Count the reads, looping over each sample
echo count reads $line; awk -F'\t' '{sum+=$4}END{print sum;}' /data/prj/Cod/Assembly/Coverage/mpileup/$line\.pileup >> /data/prj/Cod/Assembly/Coverage/mpileup/Summary/countreads.txt

# Estimate the coverage without filtering out any positions
echo estimate coverage $line; awk '{print $1,$1/643000000,4*($1/643000000)}' /data/prj/Cod/Assembly/Coverage/mpileup/Summary/countreads.txt | tail -1 >> CodCoverage.txt

# Exclude positions with >4x the mean mapping depth
echo filter by four times coverage $line; awk -v fourXreads="$(tail -1 CodCoverage.txt | awk '{ print $3 }')" '{ if ($4 < fourXreads) print }' /data/prj/Cod/Assembly/Coverage/mpileup/$line.pileup > /data/prj/Cod/Assembly/Coverage/mpileup/$line\_filtered.pileup

# Count the reads in the filtered pileup file
echo count reads on filtered pileup $line; awk -F'\t' '{sum+=$4}END{print sum;}' /data/prj/Cod/Assembly/Coverage/mpileup/$line\_filtered.pileup >> /data/prj/Cod/Assembly/Coverage/mpileup/Summary/countreads_filtered.txt

# Estimate filtered coverage. NOTE: the loop over 'Samples.txt' ends here
echo estimate filtered coverage $line; awk '{print $1,$1/643000000}' /data/prj/Cod/Assembly/Coverage/mpileup/Summary/countreads_filtered.txt | tail -1 >> CodCoverage_filtered.txt; done < Samples.txt

# Add sample name labels to the coverage estimates you just generated
echo add labels to CodCoverage files
paste Samples.txt CodCoverage.txt | column -s $'\t' -t > /data/prj/Cod/Assembly/Coverage/mpileup/Summary/CodCoverage_labeled.txt
paste Samples.txt CodCoverage_filtered.txt | column -s $'\t' -t > /data/prj/Cod/Assembly/Coverage/mpileup/Summary/CodCoverage_filtered_labeled.txt

#### REALIGN AROUND INDELS ####
# Once all individuals had been sequenced to ~1x depth, use Picard to add read groups and create index files for each individual
# Add read groups, looping over the list of file names in 'AddReadGroupsFiles.txt'
while read -r line; do java -jar picard.jar AddOrReplaceReadGroups \
  I=/data/prj/Cod/Assembly/codGenomes_rmdups/$line\.bam \
  O=/data/prj/Cod/Assembly/codGenomes_rmdups/$line\_wRG.bam \
  RGID=1 \
  RGLB=lib1 \
  RGPL=illumina \
  RGPU=unit1 \
  RGSM=20; done < AddReadGroupsFiles.txt

# Create index files
while read -r line; do java -jar picard.jar BuildBamIndex \
  I=/data/prj/Cod/Assembly/codGenomes_rmdups/$line\_wRG.bam; done < AddReadGroupsFiles.txt

# Realign around indels using IndelRealigner within GATK v3.8
# First, run RealignerTargetCreator to identify all of the indels
java -Xmx48g -jar GenomeAnalysisTK.jar \
  -T RealignerTargetCreator \
  -R /data/prj/Cod/Assembly/gadMor2.fasta \
  -I CodFinalBamFiles.list \
  -o CodFinalBamFiles_forIndelRealigner.intervals \
  -drf BadMate

# Then use IndelRealigner to realign around all of the target indels identified in the previous step
java -Xmx48g -jar GenomeAnalysisTK.jar \
  -T IndelRealigner \
  -R /data/prj/Cod/Assembly/gadMor2.fasta \
  -I CodFinalBamFiles.list \
  -targetIntervals CodFinalBamFiles_forIndelRealigner.intervals \
  --consensusDeterminationModel USE_READS \
  --nWayOut realigned.bam

#### SNP CALLING AND GENOTYPE LIKELIHOOD ESTIMATION WITH ANGSD ####
# 'FinalCodRealignedList.txt' contains the full paths to all of the BAM files after realigning around indels
# The code below calls SNPs and estimates genotype likelihoods for all individuals, including GB
./angsd -b FinalCodRealignedList.txt -ref /data/prj/Cod/Assembly/gadMor2.fasta -out N222_HWE -P 32 \
  -GL 2 -doGlf 2 -doMajorMinor 1 \
  -doCounts 1 -doDepth 1 -dumpCounts 2 \
  -setMinDepth 50 -setMaxDepth 500 -minInd 150 -minQ 20 \
  -remove_bads 0 -only_proper_pairs 0 \
  -doGeno 3 -doPost 2 -doMaf 1 \
  -HWE_pval 1e-6 -SNP_pval 1e-6 -minMaf 0.05

# The same analyses, but for GOM individuals only
./angsd -b FinalCodRealignedListGOMOnly.txt -ref /data/prj/Cod/Assembly/gadMor2.fasta -out Beagle_GL_GOM -P 32 \
  -GL 2 -doGlf 2 -doMajorMinor 1 \
  -doCounts 1 -doDepth 1 -dumpCounts 2 \
  -setMinDepth 50 -setMaxDepth 500 -minInd 150 -minQ 20 \
  -remove_bads 0 -only_proper_pairs 0 \
  -doGeno 3 -doPost 2 -doMaf 1 -HWE_pval 1e-6 -SNP_pval 1e-6 -minMaf 0.05

#### Fst Calculations in ANGSD ####
# All Fst calculations were performed for cod separated by season and also by sex. Only the seasonal analyses are shown.
# Calculate a folded minor allele frequency spectrum by specifying the reference genome as the ancestral genome
for POP in GB Winter Spring
do
  echo $POP
  ./angsd -b $POP'_BAMs.txt' -ref /data/prj/Cod/Assembly/gadMor2.fasta -anc /data/prj/Cod/Assembly/gadMor2.fasta -out ReanalysisResults/$POP.folded \
    -GL 2 -doSaf 1 -doMajorMinor 1 \
    -doCounts 1 -doDepth 1 -dumpCounts 2 \
    -setMinDepth 5 -setMaxDepth 500 -minInd 5 -minQ 20 \
    -remove_bads 0 -only_proper_pairs 0 \
    -doPost 2 -doMaf 1 -SNP_pval 1e-6 -minMaf 0.05
done

# Compute the overall folded SFS by specifying the -fold 1 option as part of the realSFS command
for POP in GB Winter Spring
do
  echo $POP
  ./realSFS /data/app/angsd/ReanalysisResults/$POP.folded.saf.idx -fold 1 > /data/app/angsd/ReanalysisResults/$POP.folded.sfs
done

# Compute the 2D SFS for each pairwise comparison
./realSFS /data/app/angsd/ReanalysisResults/GB.folded.saf.idx /data/app/angsd/ReanalysisResults/Winter.folded.saf.idx -P 32 -fold 1 > /data/app/angsd/ReanalysisResults/GB.Winter.folded.sfs
./realSFS /data/app/angsd/ReanalysisResults/GB.folded.saf.idx /data/app/angsd/ReanalysisResults/Spring.folded.saf.idx -P 32 -fold 1 > /data/app/angsd/ReanalysisResults/GB.Spring.folded.sfs
./realSFS /data/app/angsd/ReanalysisResults/Winter.folded.saf.idx /data/app/angsd/ReanalysisResults/Spring.folded.saf.idx -P 32 -fold 1 > /data/app/angsd/ReanalysisResults/Winter.Spring.folded.sfs

# Start of Fst estimation
DIR=/data/app/angsd/ReanalysisResults/
export DIR
./realSFS fst index "$DIR"GB.folded.saf.idx "$DIR"Winter.folded.saf.idx "$DIR"Spring.folded.saf.idx \
  -sfs "$DIR"GB.Winter.folded.sfs -sfs "$DIR"GB.Spring.folded.sfs -sfs "$DIR"Winter.Spring.folded.sfs \
  -fstout "$DIR"Spring.folded.pbs -whichFst 1

# Convert the Fst results from binary to text. This is helpful for calculating global Fst.
./realSFS fst print /data/app/angsd/ReanalysisResults/Spring.folded.pbs.fst.idx > /data/app/angsd/ReanalysisResults/SeasonFstList.txt
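# The global Fst calculation itself is not shown in the original workflow; the following
# is a minimal sketch of one way to do it, under the assumption that 'realSFS fst print'
# writes chr, pos, and then the per-site variance components (alpha, beta) per population
# pair, so columns 3 and 4 hold the first pair and the global weighted Fst for that pair
# is sum(alpha)/sum(beta). realSFS can also report global estimates directly via 'fst stats'.
awk '{a+=$3; b+=$4} END{print a/b}' /data/app/angsd/ReanalysisResults/SeasonFstList.txt
./realSFS fst stats /data/app/angsd/ReanalysisResults/Spring.folded.pbs.fst.idx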
# Sliding window Fst: win = 1,000 bp and step = 100 bp
./realSFS fst stats2 /data/app/angsd/ReanalysisResults/Spring.folded.pbs.fst.idx -win 1000 -step 100 > /data/app/angsd/ReanalysisResults/Season.folded.win1000step100.pbs.fst.txt

# Plot locus-by-locus Fst results in R as a GWAS-style Manhattan plot
#####################################################################################
setwd("C:/Users/todonnell/Documents/R.directory/Cod/Fst/AngsdToMan")
library(ggplot2)
library(dplyr)

# Pull in the Fst data
pbs = read.table("Season.folded.win1000step100.pbs.fst.txt", header = T)

# Assign more informative Fst column names
colnames(pbs)[5:7]<-c("Fst_GB.Winter","Fst_GB.Spring","Fst_Winter.Spring")

# Remove scaffolds
pbssub<-pbs[grep("LG",pbs$chr),]

# Calculate overall Fst values
mean(pbssub$Fst_GB.Spring, na.rm=TRUE)
mean(pbssub$Fst_GB.Winter, na.rm=TRUE)
mean(pbssub$Fst_Winter.Spring, na.rm=TRUE)

# Set any negative Fst values to 0
pbssub$Fst_GB.Spring[which(pbssub$Fst_GB.Spring<0)]=0
pbssub$Fst_GB.Winter[which(pbssub$Fst_GB.Winter<0)]=0
pbssub$Fst_Winter.Spring[which(pbssub$Fst_Winter.Spring<0)]=0

# Recalculate overall Fst values with 0s for negatives and no scaffolds
mean(pbssub$Fst_GB.Spring, na.rm=TRUE)
mean(pbssub$Fst_GB.Winter, na.rm=TRUE)
mean(pbssub$Fst_Winter.Spring, na.rm=TRUE)

# Build a cumulative position axis and per-linkage-group axis labels
pbssub$BP<-cumsum(as.numeric(pbssub$midPos))
axisdf = pbssub %>% group_by(chr) %>% summarize(center=( max(BP) + min(BP) ) / 2 )
axisdf$chr<-c(1:23)

# Open one graphics device at a time (jpeg or pdf), then run the plot and dev.off()
jpeg("C:/Users/todonnell/Documents/R.directory/Cod/Fst/AngsdToMan/FinalPlots/FstBySeasonWin1000Step100.jpeg", res=600, width=8, height=3, units="in")
pdf("C:/Users/todonnell/Documents/R.directory/Cod/Fst/AngsdToMan/FinalPlots/FstBySeasonWin1000Step100.pdf", paper="special", width=8, height=3)
ggplot(pbssub, aes(x=BP, y=Fst_Winter.Spring)) +
  # Show all points
  geom_point(aes(color=as.factor(chr)), alpha=0.8, size=1.3) +
  #geom_segment(aes(x=min(Mandatasub$BP),xend=max(Mandatasub$BP),y=7.3,yend=7.3), color="red") +
  scale_color_manual(values = rep(c("black", "grey"), 22 )) +
  # Custom X axis:
  scale_x_continuous( labels = axisdf$chr, breaks= axisdf$center ) +
  scale_y_continuous(expand = c(0, 0) ) + # remove space between plot area and x axis
  xlab("Linkage Group") +
  ylab(bquote(italic("F")[ST])) +
  #ylim(0.0001,0.6) + # adjust ylim after seeing the first plot
  expand_limits(y=0.6) +
  # Customize the theme:
  theme_bw() +
  theme(
    axis.text.x = element_text(size=8),
    legend.position="none",
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )
dev.off()
#####################################################################################

#### PCA in PCAngsd ####
# PCA was performed for cod separated by spawning season and also by sex. Only the seasonal analyses are shown here.
# Use PCAngsd to generate a covariance matrix from the beagle GL file produced by angsd
python3 pcangsd.py -beagle /data/app/angsd/ReanalysisResults/Beagle_GL_GOM.beagle.gz -out PCAResults/CovMatGOM3101296SNPs -threads 32

# Plot PCA in R
#####################################################################################
# The following code will make a PCA plot from the output of PCAngsd
library(ggplot2)
setwd("C:/Users/todonnell/Documents/R.directory/Cod/PCAngsd")

# Read in collection information
Coll.info<-read.csv("VcfIndOrderGOMonly.csv")
#Coll.info$SS<-paste(Coll.info$Stock, Coll.info$Season, sep="")

# Load the covariance matrix
cov <- as.matrix(read.table("CovMatGOM3101296SNPs.cov", header = F))
mme.pca <- eigen(cov) # perform the PCA using the eigen function
eigenvectors = mme.pca$vectors # extract eigenvectors
pca.vectors = as.data.frame(cbind(Coll.info, eigenvectors)) # combine with our population assignments
pca.eigenval.sum = sum(mme.pca$values) # sum of eigenvalues
varPC1 <- (mme.pca$values[1]/pca.eigenval.sum)*100 # variance explained by PC1
varPC2 <- (mme.pca$values[2]/pca.eigenval.sum)*100 # variance explained by PC2

cols<-c("#fdb863", "#5e3c99")
# Open one graphics device at a time (pdf or jpeg), then run the plot and dev.off()
pdf("C:/Users/todonnell/Documents/R.directory/Cod/PCAngsd/Plots/PCA.GOMSeason3101296.pdf", paper="special", width=8, height=6)
jpeg("C:/Users/todonnell/Documents/R.directory/Cod/PCAngsd/Plots/PCA.GOMSeason3101296.jpeg", res=600, width=8, height=6, units="in")

# Produce the plot
ggplot() +
  geom_point(data=pca.vectors, colour="black", shape=21,size=2.5,aes(fill=factor(Season),x=`1`,y=`2`)) +
  scale_fill_manual(values=cols) +
  #scale_fill_manual(values=dichromat(cols)) + # use this to see how the figure looks to someone who is colorblind
  #scale_fill_manual(values=ColToGrey(colsRGB)) + # use this to see how the figure looks in greyscale
  theme_bw() +
  labs(fill='Spawning Season') +
  xlab(paste0("PC1 (",round(varPC1,2),"% Variance Explained)")) +
  ylab(paste0("PC2 (",round(varPC2,2),"% Variance Explained)")) +
  theme(legend.position=c(0.094,0.079)) +
  theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) +
  theme(
    axis.text=element_text(size=11),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())
dev.off()
######################################################################################

#### Selection scan in PCAngsd ####
# Use the pcadapt selection scan tool in PCAngsd to find loci under selection
python3 pcangsd.py -beagle /data/app/angsd/ReanalysisResults/BeagleGL_OGFiltersGOM.beagle.gz -out PCAResults/GOMpcadapt -pcadapt -sites_save -threads 128

#### Informative Locus Selection ####
# Use GRRF in R to select loci
###################################################################################
library(randomForest)
library(RRF)
setwd("/data/usr/timo/R.directory/Cod/RandomForest")

print("Start map file recoding")
# Use the Linux code in the BIOINFORMATICS WORKFLOW word doc to reformat the plink .ped
# file so that ACGT is replaced with 1234 and 0s with NAs (a minimal sketch of that
# recode step is given just below).
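# The recode itself lives in the external BIOINFORMATICS WORKFLOW word doc and is not
# reproduced here; the commented one-liner below is a sketch of what it could look like,
# not the original code. The input name BySeasonPED.ped is an assumption based on the
# BySeasonPED.map file used below; the output name matches the file the R code reads.
# It only touches the genotype fields (columns 7 onward), leaving the first six .ped
# columns alone:
#   awk '{for(i=7;i<=NF;i++){sub(/^A$/,"1",$i); sub(/^C$/,"2",$i); sub(/^G$/,"3",$i); sub(/^T$/,"4",$i); sub(/^0$/,"NA",$i)} print}' BySeasonPED.ped > BySeasonPEDforRRF.ped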
# Then use the next chunk of code to take the associated .map file and use it to create a header of locus names
first<-c("FID", "IID", "IDF", "IDM", "Sex", "Phenotype")
map<-read.table("BySeasonPED.map", sep="",header=FALSE)
order<-as.data.frame(c(1:nrow(map)))
df1<-as.data.frame(cbind(order,map$V2))
df2<-as.data.frame(cbind(order,map$V2))
rbind(df1,df2)->locusnames
colnames(locusnames)<-c("order","SNP")
locusnames<-locusnames[order(locusnames$order),]
LNvec<-as.vector(t(locusnames$SNP))
header<-c(first,LNvec)

print("Start read in .ped file")
ped<-read.table("BySeasonPEDforRRF.ped",sep=" ",header=FALSE)
colnames(ped)<-header
rm(df1,df2,locusnames,map,order,first,LNvec,header)
print("Finish .ped file read in")

# Use the Linux code in the BIOINFORMATICS WORKFLOW word doc to reformat the plink .bim
# file to export the minor and major alleles and recode ACGT as 1234. The following
# chunk of code takes those minor and major allele columns for each locus, doubles
# them, and transposes them to work with the loop in the next section.
print("Read in and edit MinMaj")
MinMaj<-read.table("MinMaj.txt")
order<-as.data.frame(c(1:nrow(MinMaj)))
df1<-as.data.frame(cbind(order,MinMaj))
df2<-as.data.frame(cbind(order,MinMaj))
rbind(df1,df2)->MinMajcombo
colnames(MinMajcombo)<-c("order","Min","Maj")
MinMajcombo<-MinMajcombo[order(MinMajcombo$order),]
MMvec<-t(MinMajcombo[,2:3])
rm(df1,df2,MinMaj,MinMajcombo,order)

# Indicate the pop and individual IDs from the ped file and isolate the genotype data in 'ready'
POPID<-ped[,6]
IID<-ped[,2]
ready<-ped[,7:ncol(ped)]
fmat<-as.matrix(sapply(ready, as.numeric))
rm(ready,ped)

print("Start recoding of .ped file")
# Take the genotype data and rewrite it so that 0 codes for the minor allele and 1 codes
# for the major allele at each locus
rewrite<-matrix(nrow=nrow(fmat), ncol=ncol(fmat))
for (i in 1:ncol(fmat)){
  result<-ifelse(fmat[1:nrow(fmat),i]==MMvec[1,i],0,ifelse(fmat[1:nrow(fmat),i]==MMvec[2,i],1,FALSE))
  rewrite[,i]<-result
}
colnames(rewrite)<-colnames(fmat)

# Combine the two allele columns into a single feature so 0/0=0, 0/1=0.5, and 1/1=1
oddindex<-c(((1:(ncol(rewrite)/2))*2-1))
evenindex<-c(((1:(ncol(rewrite)/2))*2))
even<-rewrite[,evenindex]
odd<-rewrite[,oddindex]
new<-(odd+even)/2
rm(even,odd,MMvec,fmat,rewrite,evenindex,oddindex,i,result)

print("Start impute step")
# Impute missing data using randomForest
imputed<-rfImpute(new, POPID, iter=10, ntree=2000)
print("Finish impute step")
imputedfeatures<-imputed[,-1]
done<-cbind(as.character(POPID), as.character(IID), imputedfeatures)
rm(new,imputed)

# Optionally write a table to save the data in this format
print("write RFdata table")
write.table(done, file="RFdatafile.txt", row.names=FALSE, col.names=TRUE, quote=FALSE)

# You now have a data set suitable for RF!
# Object 'POPID' can be used as 'y' for RF classification; object 'imputedfeatures' is the predictor data frame ('x').
# Example run of RF
print("start randomforest step")
rf<-randomForest(imputedfeatures, POPID, ntree=5000, mtry=(sqrt(ncol(imputedfeatures))), importance=TRUE, do.trace=500)
print("finish randomforest step")

print("write results")
# Code used to obtain the ranked list of loci with MDA > 0
select<-rf$importance
MDA<-as.data.frame(select[,1])
MDA$loci<-rownames(MDA)
above0<-as.data.frame(MDA[which(MDA[,1] > 0),])
selected<-above0[order(-above0[,1]),] # order the loci by decreasing importance
colnames(selected)<-c("%IncMSE","loci")
print("write results to table")
write.table(selected, file="CodAllLociRFrank.txt", row.names=FALSE, col.names=TRUE, quote=FALSE)
###################################################################################

#### Evaluating population assignment accuracy ####
# Use assignPOP in R to evaluate locus suites
###################################################################################
setwd("C:/Users/todonnell/Documents/R.directory/Cod/assignPOP/N262GRRF")
library(assignPOP)
library(diveRsity)
library(randomForest)
library(klaR)

print("Start map file recoding")
# Use the Linux code in the BIOINFORMATICS WORKFLOW word doc to reformat the plink .ped
# file so that ACGT is replaced with 1234 and 0s with NAs (see the sketch of this recode
# earlier in this file). Then use the next chunk of code to take the associated .map file
# and use it to create a header of locus names
first<-c("FID", "IID", "IDF", "IDM", "Sex", "Phenotype")
map<-read.table("GRRFN262Final.map", sep="",header=FALSE)
order<-as.data.frame(c(1:nrow(map)))
df1<-as.data.frame(cbind(order,map$V2))
df2<-as.data.frame(cbind(order,map$V2))
rbind(df1,df2)->locusnames
colnames(locusnames)<-c("order","SNP")
locusnames<-locusnames[order(locusnames$order),]
LNvec<-as.vector(t(locusnames$SNP))
header<-c(first,LNvec)

print("Start read in .ped file")
ped<-read.table("N262ForRRF.ped",sep=" ",header=FALSE)
colnames(ped)<-header
rm(df1,df2,locusnames,map,order,first,LNvec,header)
print("Finish .ped file read in")

# Use the Linux code in the BIOINFORMATICS WORKFLOW word doc to reformat the plink .bim
# file to export the minor and major alleles and recode ACGT as 1234 (a minimal sketch
# of that step is given just below). The following chunk of code takes those minor and
# major allele columns for each locus, doubles them, and transposes them to work with
# the loop in the next section.
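# The .bim recode also lives in the external word doc; the commented one-liner below is
# a sketch of what it could look like, not the original code. It assumes the standard
# plink .bim layout (chr, SNP, cM, pos, A1=minor, A2=major), takes the input name from
# the GRRFN262Final.map prefix (an assumption), and writes the file the R code reads:
#   awk '{print $5"\t"$6}' GRRFN262Final.bim | sed -e 's/A/1/g' -e 's/C/2/g' -e 's/G/3/g' -e 's/T/4/g' > N262forRRFMinMaj.txt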
print("Read in and edit MinMaj") MinMaj<-read.table("N262forRRFMinMaj.txt") order<-as.data.frame(c(1:nrow(MinMaj))) df1<-as.data.frame(cbind(order,MinMaj)) df2<-as.data.frame(cbind(order,MinMaj)) rbind(df1,df2)->MinMajcombo colnames(MinMajcombo)<-c("order","Min","Maj") MinMajcombo<-MinMajcombo[order(MinMajcombo$order),] MMvec<-t(MinMajcombo[,2:3]) rm(df1,df2,MinMajcombo,order) #indicate the pop and ind IDs from the ped file and isolate the genotype data in 'ready' POPID<-as.factor(ped[,6]) IID<-ped[,2] ready<-ped[,7:ncol(ped)] fmat<-as.matrix(sapply(ready, as.numeric)) rm(ready,ped) print("Start recoding of .ped file") #take your genoytpe data and rewrite it so that 0 codes for minor allele and 1 codes #for the major allele at each locus rewrite<-matrix(nrow=nrow(fmat), ncol=ncol(fmat)) for (i in 1:ncol(fmat)){ result<-ifelse(fmat[1:nrow(fmat),i]==MMvec[1,i],0,ifelse(fmat[1:nrow(fmat),i]==MMvec[2,i],1,FALSE)) rewrite[,i]<-result } colnames(rewrite)<-colnames(fmat) #combining columns into single feature so 0/0=0 0/1=0.5 and 1/1=2 oddindex<-c(((1:(ncol(rewrite)/2))*2-1)) evenindex<-c(((1:(ncol(rewrite)/2))*2)) even<-rewrite[,evenindex] odd<-rewrite[,oddindex] new<-(odd+even)/2 rm(even,odd,MMvec,fmat,rewrite,evenindex,oddindex,i,result) print("Start impute step") #impute missing data using RF imputed<-rfImpute(new, POPID, iter=10, ntree=2000) imputedfeatures<-imputed[,-1] #Round imputed data frame so that all values equal 0, 0.5 or 1. imputedfeatures[imputedfeatures>=0.75]<-1 imputedfeatures[imputedfeatures<=0.7499999999 & imputedfeatures>=0.5000001]<-0.5 imputedfeatures[imputedfeatures<=0.249999999]<-0 imputedfeatures[imputedfeatures>=0.2499999999 & imputedfeatures<=0.5]<-0.5 #transpose MinMaj df and replace 1,2,3,4 with A,C,G,T tMinMaj<-t(MinMaj) tMinMaj[tMinMaj==1]<-"A" tMinMaj[tMinMaj==2]<-"C" tMinMaj[tMinMaj==3]<-"G" tMinMaj[tMinMaj==4]<-"T" #revert the transposed dataframe back to A,C,G,T rewrite<-matrix(nrow=nrow(imputedfeatures), ncol=ncol(imputedfeatures)) for (i in 1:ncol(imputedfeatures)){ result<-ifelse(imputedfeatures[1:nrow(imputedfeatures),i]==0,paste(tMinMaj[1,i],tMinMaj[1,i],sep=""),ifelse(imputedfeatures[1:nrow(imputedfeatures),i]==1,paste(tMinMaj[2,i],tMinMaj[2,i],sep=""),ifelse(imputedfeatures[1:nrow(imputedfeatures),i]==0.5,paste(tMinMaj[1,i],tMinMaj[2,i],sep=""),FALSE))) rewrite[,i]<-result } read.csv("VcfIndOrdernew.csv")->Coll.info rbind(colnames(imputedfeatures),rewrite)->rewrite c("SNP_ID",as.character(Coll.info$pop))->firstcol cbind(firstcol,rewrite)->rewrite GRRFsnps<-t(rewrite) GRRFsnps<-as.data.frame(GRRFsnps) write.table(GRRFsnps,file="GRRFsnpstest.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE) #Open GRRFsnpstest.txt in Excel and sort the loci according to GRRF Rank read.table(file="GRRFsnpstest.txt",sep="\t")->GRRFsnps snp2gen(infile=GRRFsnps,prefix_length = 6) read.Genepop("snp2gen_converted.gen",pop.names=c("winter","spring"),haploid=FALSE)->GRRFgp kfold=c(2:10) seq(0.02,1,0.02)->train assign.kfold( GRRFgp, k.fold=kfold, train.loci=train, loci.sample="fst", model="lda", dir="GRRFResults_lda_curve/") accuKF<-accuracy.kfold(dir = "GRRFResults_lda_curve/") membership.plot(dir="GRRFResults_lda_cure/") #accuSummarytrain<-aggregate(accuKF[,4:6],list(accuKF$train.loci),mean) #numloci<-train*262 #numloci<-round(numloci) #accuSummarytrain$numloci<-numloci #plot(accuSummarytrain$numloci,accuSummarytrain$assign.rate.all) accuSummary<-aggregate(accuKF[,4:6],list(accuKF$KF),mean) accuSummary$Group.1<-as.numeric(levels(accuSummary$Group.1))[accuSummary$Group.1] 
# Read the (now rank-sorted) file back in and convert it to Genepop format for assignPOP
read.table(file="GRRFsnpstest.txt",sep="\t")->GRRFsnps
snp2gen(infile=GRRFsnps,prefix_length = 6)
read.Genepop("snp2gen_converted.gen",pop.names=c("winter","spring"),haploid=FALSE)->GRRFgp

# Run k-fold cross-validation over a range of k values and training-loci proportions
kfold=c(2:10)
seq(0.02,1,0.02)->train
assign.kfold( GRRFgp, k.fold=kfold, train.loci=train, loci.sample="fst", model="lda", dir="GRRFResults_lda_curve/")
accuKF<-accuracy.kfold(dir = "GRRFResults_lda_curve/")
membership.plot(dir="GRRFResults_lda_curve/")

#accuSummarytrain<-aggregate(accuKF[,4:6],list(accuKF$train.loci),mean)
#numloci<-train*262
#numloci<-round(numloci)
#accuSummarytrain$numloci<-numloci
#plot(accuSummarytrain$numloci,accuSummarytrain$assign.rate.all)

# Summarize assignment accuracy by k and write the summary table
accuSummary<-aggregate(accuKF[,4:6],list(accuKF$KF),mean)
accuSummary$Group.1<-as.numeric(levels(accuSummary$Group.1))[accuSummary$Group.1]
accuSummary<-accuSummary[order(accuSummary$Group.1),]
write.table(accuSummary,file="GRRFN262AccuracySummaryTable.txt",sep="\t",quote=FALSE,row.names=FALSE,col.names=TRUE)
#plot(accuSummary$Group.1,accuSummary$assign.rate.all)
###################################################################################

#### Redo GL analysis using only selected SNPs ####
# Start with a .txt file of LG and position (no header), sorted by LG and position
# Use angsd to make a binary (indexed) version of the file
./angsd sites index SeasonSNPlist_sorted.txt

# Make a chromosome list
cut -f1 SeasonSNPlist_sorted.txt | sort | uniq > SeasonChrs.txt

# Then redo the Beagle file generation, filtering by the indexed season SNP list (-sites) and the selected chromosomes (-rf)
./angsd -b FinalCodRealignedListGOMOnly.txt -ref /data/prj/Cod/Assembly/gadMor2.fasta -out ReanalysisResults/BeagleGL_OGFiltersGOM_SeasonSNPs -P 32 \
  -GL 2 -doGlf 2 -doMajorMinor 1 \
  -doCounts 1 -doDepth 1 -dumpCounts 2 \
  -minQ 20 -sites SeasonSNPlist_sorted.txt -rf SeasonChrs.txt \
  -remove_bads 0 -only_proper_pairs 0 \
  -doGeno 3 -doPost 2 -doMaf 1 -SNP_pval 1e-6 -minMaf 0.05

# Do PCA using PCAngsd for season
python3 pcangsd.py -beagle /data/app/angsd/ReanalysisResults/BeagleGL_OGFiltersGOM_SeasonSNPs.beagle.gz -e 2 -out PCAResults/CovMatSeason25SNPs -threads 32

# Plot PCA in R for the selected SNPs
###################################################################################
# The following code will make a PCA plot from the output of PCAngsd
library(ggplot2)
library(devtools)
library(ggbiplot)
setwd("C:/Users/todonnell/Documents/R.directory/Cod/PCAngsd")

# Read in collection information
Coll.info<-read.csv("VcfIndOrderGOMonly.csv")
#Coll.info$SS<-paste(Coll.info$Stock, Coll.info$Season, sep="")

# Load the covariance matrix
cov <- as.matrix(read.table("CovMatSeason25SNPs.cov", header = F))
mme.pca <- eigen(cov) # perform the PCA using the eigen function
eigenvectors = mme.pca$vectors # extract eigenvectors
pca.vectors = as.data.frame(cbind(Coll.info, eigenvectors)) # combine with our population assignments
pca.eigenval.sum = sum(mme.pca$values) # sum of eigenvalues
varPC1 <- (mme.pca$values[1]/pca.eigenval.sum)*100 # variance explained by PC1
varPC2 <- (mme.pca$values[2]/pca.eigenval.sum)*100 # variance explained by PC2
Season.pca<-prcomp(cov, scale.=TRUE)

cols<-c("#fdb863", "#5e3c99")
# Open one graphics device at a time (pdf or jpeg), then run the plot and dev.off()
pdf("C:/Users/todonnell/Documents/R.directory/Cod/PCAngsd/Plots/PCA.GOMSeason25_ggbiplot.pdf", paper="special", width=8, height=6)
jpeg("C:/Users/todonnell/Documents/R.directory/Cod/PCAngsd/Plots/PCA.GOMSeason25_ggbiplot.jpeg", res=600, width=8, height=6, units="in")
ggbiplot(Season.pca, ellipse=TRUE,groups=Coll.info$Season, var.axes=FALSE) +
  geom_point(colour="black", shape=21,size=2.5,aes(fill=factor(Coll.info$Season))) +
  scale_fill_manual(name="Spawning Season", labels=c("Spring", "Winter"), values=cols) +
  scale_color_manual(name="Spawning Season", labels=c("Spring", "Winter"), values=cols) +
  #scale_fill_manual(name="Spawning Season", labels=c("Spring", "Winter"), values=dichromat(cols)) +
  #scale_color_manual(name="Spawning Season", labels=c("Spring", "Winter"), values=dichromat(cols)) +
  theme_bw() +
  xlab(paste0("PC1 (",round(varPC1,2),"% Variance Explained)")) +
  ylab(paste0("PC2 (",round(varPC2,2),"% Variance Explained)")) +
  theme(legend.position=c(0.182,0.079)) +
  theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) +
  theme(
    axis.text=element_text(size=11),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())
dev.off()
###################################################################################