# The following is a list of commands used to generate the data and run analyses in O'Donnell and Sullivan:
# "Low coverage whole genome sequencing reveals molecular markers for spawning season and sex identification
# in Gulf of Maine Atlantic cod (Gadus morhua, Linnaeus 1758)"

#### SEQUENCE TRIMMING ####
# All of these commands loop over a Samples.txt file containing one sample name per line
while read -r line; do

# Trim reads with Trimmomatic
java -jar trimmomatic-0.38.jar PE -phred33 \
  /data/prj/Cod/R1_R2_concat/$line\_concat_L1234_R1_001.fastq \
  /data/prj/Cod/R1_R2_concat/$line\_concat_L1234_R2_001.fastq \
  /data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_PR1_001.fastq.gz \
  /data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_UR1_001.fastq.gz \
  /data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_PR2_001.fastq.gz \
  /data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_UR2_001.fastq.gz \
  LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36

#### MAPPING ####
# Align reads to the reference genome using bowtie2
# Align unpaired reads
./bowtie2 --very-sensitive-local -p 8 -x /data/prj/Cod/Assembly/gadMor2ref \
  -U /data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_UR1_001.fastq,/data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_UR2_001.fastq \
  -S /data/prj/Cod/Assembly/codGenomes/$line\_U.sam

# Align paired reads
./bowtie2 --very-sensitive-local -p 8 -x /data/prj/Cod/Assembly/gadMor2ref \
  -1 /data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_PR1_001.fastq \
  -2 /data/prj/Cod/R1_R2_concat/Trimmed/$line\_concat_L1234_PR2_001.fastq \
  -S /data/prj/Cod/Assembly/codGenomes/$line\_P.sam

#### SEQUENCE FILTERING ####
# Convert and organize files using SAMtools
# Convert unpaired and paired SAM to BAM, keeping only reads with mapping quality greater than 20
samtools view -S -b -q 20 -o /data/prj/Cod/Assembly/codGenomesBAM/$line\_U.bam /data/prj/Cod/Assembly/codGenomes/$line\_U.sam
samtools view -S -b -q 20 -o /data/prj/Cod/Assembly/codGenomesBAM/$line\_P.bam /data/prj/Cod/Assembly/codGenomes/$line\_P.sam

# Sort the BAM files
samtools sort -l 0 /data/prj/Cod/Assembly/codGenomesBAM/$line\_P.bam /data/prj/Cod/Assembly/codGenomesBAM/$line\_Psorted
samtools sort -l 0 /data/prj/Cod/Assembly/codGenomesBAM/$line\_U.bam /data/prj/Cod/Assembly/codGenomesBAM/$line\_Usorted

# Merge the paired and unpaired BAM files
samtools merge /data/prj/Cod/Assembly/codGenomesBAM/$line\_merged.bam \
  /data/prj/Cod/Assembly/codGenomesBAM/$line\_Psorted.bam \
  /data/prj/Cod/Assembly/codGenomesBAM/$line\_Usorted.bam

# Soft clip read ends so that only the read with the highest quality score is retained in overlapping regions, using clipOverlap in bamUtil
./bam clipOverlap --in /data/prj/Cod/Assembly/codGenomesBAM/$line\_merged.bam \
  --out /data/prj/Cod/Assembly/codGenomesBAM/$line\_clipped.bam --stats --unmapped

# Remove duplicate reads using the MarkDuplicates module of Picard
java -jar picard.jar MarkDuplicates \
  I=/data/prj/Cod/Assembly/codGenomesBAM/$line\_clipped.bam \
  O=/data/prj/Cod/Assembly/codGenomes_rmdups/$line\_rm_dups.bam \
  M=/data/prj/Cod/Assembly/codGenomes_rmdups/$line\_Dup_Metrics.txt \
  REMOVE_DUPLICATES=true

#### COVERAGE ESTIMATION ####
# Estimate the coverage and mapping depth for each individual using samtools mpileup
samtools mpileup -A -B -Q 0 -f /data/prj/Cod/Assembly/gadMor2.fasta -a \
  /data/prj/Cod/Assembly/codGenomes_rmdups/$line\_rm_dups.bam > /data/prj/Cod/Assembly/Coverage/mpileup/$line\.pileup

# Custom script to estimate mapping depth after excluding positions with >4x the mean mapping depth
# Count the reads, looping over each sample
echo count reads $line; awk -F'\t' '{sum+=$4}END{print sum;}' /data/prj/Cod/Assembly/Coverage/mpileup/$line\.pileup >> /data/prj/Cod/Assembly/Coverage/mpileup/Summary/countreads.txt

# Estimate the coverage without filtering out any positions
echo estimate coverage $line; awk '{print $1,$1/643000000,4*($1/643000000)}' /data/prj/Cod/Assembly/Coverage/mpileup/Summary/countreads.txt | tail -1 >> CodCoverage.txt

# Exclude positions with >4x the mean mapping depth
echo filter by four times coverage $line; awk -v fourXreads="$(tail -1 CodCoverage.txt | awk '{ print $3 }')" '{ if ($4 < fourXreads) print }' /data/prj/Cod/Assembly/Coverage/mpileup/$line.pileup > /data/prj/Cod/Assembly/Coverage/mpileup/$line\_filtered.pileup

# Count the reads in the filtered pileup file
echo count reads on filtered pileup $line; awk -F'\t' '{sum+=$4}END{print sum;}' /data/prj/Cod/Assembly/Coverage/mpileup/$line\_filtered.pileup >> /data/prj/Cod/Assembly/Coverage/mpileup/Summary/countreads_filtered.txt

# Estimate filtered coverage. NOTE: the loop over 'Samples.txt' ends here
echo estimate filtered coverage $line; awk '{print $1,$1/643000000}' /data/prj/Cod/Assembly/Coverage/mpileup/Summary/countreads_filtered.txt | tail -1 >> CodCoverage_filtered.txt; done < Samples.txt

# Add sample name labels to the coverage estimates you just generated
echo add labels to CodCoverage files
paste Samples.txt CodCoverage.txt | column -s $'\t' -t > /data/prj/Cod/Assembly/Coverage/mpileup/Summary/CodCoverage_labeled.txt
paste Samples.txt CodCoverage_filtered.txt | column -s $'\t' -t > /data/prj/Cod/Assembly/Coverage/mpileup/Summary/CodCoverage_filtered_labeled.txt

#### REALIGN AROUND INDELS ####
# Once all individuals had been sequenced to ~1x depth, use Picard to add read groups and create index files for each individual
# Add read groups, looping over the list of file names in 'AddReadGroupsFiles.txt'
while read -r line; do java -jar picard.jar AddOrReplaceReadGroups \
  I=/data/prj/Cod/Assembly/codGenomes_rmdups/$line\.bam \
  O=/data/prj/Cod/Assembly/codGenomes_rmdups/$line\_wRG.bam \
  RGID=1 \
  RGLB=lib1 \
  RGPL=illumina \
  RGPU=unit1 \
  RGSM=20; done < AddReadGroupsFiles.txt

# Create index files
while read -r line; do java -jar picard.jar BuildBamIndex \
  I=/data/prj/Cod/Assembly/codGenomes_rmdups/$line\_wRG.bam; done < AddReadGroupsFiles.txt

# Realign around indels using IndelRealigner within GATK v3.8
# First, run RealignerTargetCreator to identify all of the indels
java -Xmx48g -jar GenomeAnalysisTK.jar \
  -T RealignerTargetCreator \
  -R /data/prj/Cod/Assembly/gadMor2.fasta \
  -I CodFinalBamFiles.list \
  -o CodFinalBamFiles_forIndelRealigner.intervals \
  -drf BadMate

# Then use IndelRealigner to realign around all of the target indels identified in the previous step
java -Xmx48g -jar GenomeAnalysisTK.jar \
  -T IndelRealigner \
  -R /data/prj/Cod/Assembly/gadMor2.fasta \
  -I CodFinalBamFiles.list \
  -targetIntervals CodFinalBamFiles_forIndelRealigner.intervals \
  --consensusDeterminationModel USE_READS \
  --nWayOut realigned.bam

#### SNP CALLING AND GENOTYPE LIKELIHOOD ESTIMATION WITH ANGSD ####
# 'FinalCodRealignedList.txt' contains the full paths to all of the BAM files after realigning around indels
# The code below calls SNPs and estimates genotype likelihoods for all individuals, including GB
./angsd -b FinalCodRealignedList.txt -ref /data/prj/Cod/Assembly/gadMor2.fasta -out N222_HWE -P 32 \
  -GL 2 -doGlf 2 -doMajorMinor 1 \
  -doCounts 1 -doDepth 1 -dumpCounts 2 \
  -setMinDepth 50 -setMaxDepth 500 -minInd 150 -minQ 20 \
  -remove_bads 0 -only_proper_pairs 0 \
  -doGeno 3 -doPost 2 -doMaf 1 \
  -HWE_pval 1e-6 -SNP_pval 1e-6 -minMaf 0.05

# The same analyses, but for GOM individuals only
./angsd -b FinalCodRealignedListGOMOnly.txt -ref /data/prj/Cod/Assembly/gadMor2.fasta -out Beagle_GL_GOM -P 32 \
  -GL 2 -doGlf 2 -doMajorMinor 1 \
  -doCounts 1 -doDepth 1 -dumpCounts 2 \
  -setMinDepth 50 -setMaxDepth 500 -minInd 150 -minQ 20 \
  -remove_bads 0 -only_proper_pairs 0 \
  -doGeno 3 -doPost 2 -doMaf 1 -HWE_pval 1e-6 -SNP_pval 1e-6 -minMaf 0.05

#### Fst Calculations in ANGSD ####
# All Fst calculations were performed for cod separated by season and also by sex. Only the seasonal analyses are shown.
# Calculate a folded minor allele frequency spectrum by specifying the reference genome as the ancestral genome
for POP in GB Winter Spring
do
  echo $POP
  ./angsd -b $POP'_BAMs.txt' -ref /data/prj/Cod/Assembly/gadMor2.fasta -anc /data/prj/Cod/Assembly/gadMor2.fasta -out ReanalysisResults/$POP.folded \
    -GL 2 -doSaf 1 -doMajorMinor 1 \
    -doCounts 1 -doDepth 1 -dumpCounts 2 \
    -setMinDepth 5 -setMaxDepth 500 -minInd 5 -minQ 20 \
    -remove_bads 0 -only_proper_pairs 0 \
    -doPost 2 -doMaf 1 -SNP_pval 1e-6 -minMaf 0.05
done

# Compute the overall folded SFS by specifying the -fold 1 option as part of the realSFS command
for POP in GB Winter Spring
do
  echo $POP
  ./realSFS /data/app/angsd/ReanalysisResults/$POP.folded.saf.idx -fold 1 > /data/app/angsd/ReanalysisResults/$POP.folded.sfs
done

# Compute the 2D SFS for each pairwise comparison
./realSFS /data/app/angsd/ReanalysisResults/GB.folded.saf.idx /data/app/angsd/ReanalysisResults/Winter.folded.saf.idx -P 32 -fold 1 > /data/app/angsd/ReanalysisResults/GB.Winter.folded.sfs
./realSFS /data/app/angsd/ReanalysisResults/GB.folded.saf.idx /data/app/angsd/ReanalysisResults/Spring.folded.saf.idx -P 32 -fold 1 > /data/app/angsd/ReanalysisResults/GB.Spring.folded.sfs
./realSFS /data/app/angsd/ReanalysisResults/Winter.folded.saf.idx /data/app/angsd/ReanalysisResults/Spring.folded.saf.idx -P 32 -fold 1 > /data/app/angsd/ReanalysisResults/Winter.Spring.folded.sfs

# Start of Fst estimation
DIR=/data/app/angsd/ReanalysisResults/
export DIR
./realSFS fst index "$DIR"GB.folded.saf.idx "$DIR"Winter.folded.saf.idx "$DIR"Spring.folded.saf.idx \
  -sfs "$DIR"GB.Winter.folded.sfs -sfs "$DIR"GB.Spring.folded.sfs -sfs "$DIR"Winter.Spring.folded.sfs \
  -fstout "$DIR"Spring.folded.pbs -whichFst 1

# Convert the Fst results from binary to text. This is helpful for calculating global Fst.
./realSFS fst print /data/app/angsd/ReanalysisResults/Spring.folded.pbs.fst.idx > /data/app/angsd/ReanalysisResults/SeasonFstList.txt
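# The global Fst calculation itself is not shown in the original workflow; the following
# is a minimal sketch of one way to do it, under the assumption that 'realSFS fst print'
# writes chr, pos, and then the per-site variance components (alpha, beta) per population
# pair, so columns 3 and 4 hold the first pair and the global weighted Fst for that pair
# is sum(alpha)/sum(beta). realSFS can also report global estimates directly via 'fst stats'.
awk '{a+=$3; b+=$4} END{print a/b}' /data/app/angsd/ReanalysisResults/SeasonFstList.txt
./realSFS fst stats /data/app/angsd/ReanalysisResults/Spring.folded.pbs.fst.idx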
# Sliding window Fst: win = 1,000 bp and step = 100 bp
./realSFS fst stats2 /data/app/angsd/ReanalysisResults/Spring.folded.pbs.fst.idx -win 1000 -step 100 > /data/app/angsd/ReanalysisResults/Season.folded.win1000step100.pbs.fst.txt

# Plot locus-by-locus Fst results in R as a GWAS-style Manhattan plot
#####################################################################################
setwd("C:/Users/todonnell/Documents/R.directory/Cod/Fst/AngsdToMan")
library(ggplot2)
library(dplyr)

# Pull in the Fst data
pbs = read.table("Season.folded.win1000step100.pbs.fst.txt", header = T)

# Assign more informative Fst column names
colnames(pbs)[5:7]<-c("Fst_GB.Winter","Fst_GB.Spring","Fst_Winter.Spring")

# Remove scaffolds
pbssub<-pbs[grep("LG",pbs$chr),]

# Calculate overall Fst values
mean(pbssub$Fst_GB.Spring, na.rm=TRUE)
mean(pbssub$Fst_GB.Winter, na.rm=TRUE)
mean(pbssub$Fst_Winter.Spring, na.rm=TRUE)

# Set any negative Fst values to 0
pbssub$Fst_GB.Spring[which(pbssub$Fst_GB.Spring<0)]=0
pbssub$Fst_GB.Winter[which(pbssub$Fst_GB.Winter<0)]=0
pbssub$Fst_Winter.Spring[which(pbssub$Fst_Winter.Spring<0)]=0

# Recalculate overall Fst values with 0s for negatives and no scaffolds
mean(pbssub$Fst_GB.Spring, na.rm=TRUE)
mean(pbssub$Fst_GB.Winter, na.rm=TRUE)
mean(pbssub$Fst_Winter.Spring, na.rm=TRUE)

# Build a cumulative position axis and per-linkage-group axis labels
pbssub$BP<-cumsum(as.numeric(pbssub$midPos))
axisdf = pbssub %>% group_by(chr) %>% summarize(center=( max(BP) + min(BP) ) / 2 )
axisdf$chr<-c(1:23)

# Open one graphics device at a time (jpeg or pdf), then run the plot and dev.off()
jpeg("C:/Users/todonnell/Documents/R.directory/Cod/Fst/AngsdToMan/FinalPlots/FstBySeasonWin1000Step100.jpeg", res=600, width=8, height=3, units="in")
pdf("C:/Users/todonnell/Documents/R.directory/Cod/Fst/AngsdToMan/FinalPlots/FstBySeasonWin1000Step100.pdf", paper="special", width=8, height=3)
ggplot(pbssub, aes(x=BP, y=Fst_Winter.Spring)) +
  # Show all points
  geom_point(aes(color=as.factor(chr)), alpha=0.8, size=1.3) +
  #geom_segment(aes(x=min(Mandatasub$BP),xend=max(Mandatasub$BP),y=7.3,yend=7.3), color="red") +
  scale_color_manual(values = rep(c("black", "grey"), 22 )) +
  # Custom X axis:
  scale_x_continuous( labels = axisdf$chr, breaks= axisdf$center ) +
  scale_y_continuous(expand = c(0, 0) ) + # remove space between plot area and x axis
  xlab("Linkage Group") +
  ylab(bquote(italic("F")[ST])) +
  #ylim(0.0001,0.6) + # adjust ylim after seeing the first plot
  expand_limits(y=0.6) +
  # Customize the theme:
  theme_bw() +
  theme(
    axis.text.x = element_text(size=8),
    legend.position="none",
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )
dev.off()
#####################################################################################

#### PCA in PCAngsd ####
# PCA was performed for cod separated by spawning season and also by sex. Only the seasonal analyses are shown here.
# Use PCAngsd to generate a covariance matrix from the beagle GL file produced by angsd
python3 pcangsd.py -beagle /data/app/angsd/ReanalysisResults/Beagle_GL_GOM.beagle.gz -out PCAResults/CovMatGOM3101296SNPs -threads 32

# Plot PCA in R
#####################################################################################
# The following code will make a PCA plot from the output of PCAngsd
library(ggplot2)
setwd("C:/Users/todonnell/Documents/R.directory/Cod/PCAngsd")

# Read in collection information
Coll.info<-read.csv("VcfIndOrderGOMonly.csv")
#Coll.info$SS<-paste(Coll.info$Stock, Coll.info$Season, sep="")

# Load the covariance matrix
cov <- as.matrix(read.table("CovMatGOM3101296SNPs.cov", header = F))
mme.pca <- eigen(cov) # perform the PCA using the eigen function
eigenvectors = mme.pca$vectors # extract eigenvectors
pca.vectors = as.data.frame(cbind(Coll.info, eigenvectors)) # combine with our population assignments
pca.eigenval.sum = sum(mme.pca$values) # sum of eigenvalues
varPC1 <- (mme.pca$values[1]/pca.eigenval.sum)*100 # variance explained by PC1
varPC2 <- (mme.pca$values[2]/pca.eigenval.sum)*100 # variance explained by PC2

cols<-c("#fdb863", "#5e3c99")
# Open one graphics device at a time (pdf or jpeg), then run the plot and dev.off()
pdf("C:/Users/todonnell/Documents/R.directory/Cod/PCAngsd/Plots/PCA.GOMSeason3101296.pdf", paper="special", width=8, height=6)
jpeg("C:/Users/todonnell/Documents/R.directory/Cod/PCAngsd/Plots/PCA.GOMSeason3101296.jpeg", res=600, width=8, height=6, units="in")

# Produce the plot
ggplot() +
  geom_point(data=pca.vectors, colour="black", shape=21,size=2.5,aes(fill=factor(Season),x=`1`,y=`2`)) +
  scale_fill_manual(values=cols) +
  #scale_fill_manual(values=dichromat(cols)) + # use this to see how the figure looks to someone who is colorblind
  #scale_fill_manual(values=ColToGrey(colsRGB)) + # use this to see how the figure looks in greyscale
  theme_bw() +
  labs(fill='Spawning Season') +
  xlab(paste0("PC1 (",round(varPC1,2),"% Variance Explained)")) +
  ylab(paste0("PC2 (",round(varPC2,2),"% Variance Explained)")) +
  theme(legend.position=c(0.094,0.079)) +
  theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) +
  theme(
    axis.text=element_text(size=11),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())
dev.off()
######################################################################################

#### Selection scan in PCAngsd ####
# Use the pcadapt selection scan tool in PCAngsd to find loci under selection
python3 pcangsd.py -beagle /data/app/angsd/ReanalysisResults/BeagleGL_OGFiltersGOM.beagle.gz -out PCAResults/GOMpcadapt -pcadapt -sites_save -threads 128

#### Informative Locus Selection ####
# Use GRRF in R to select loci
###################################################################################
library(randomForest)
library(RRF)
setwd("/data/usr/timo/R.directory/Cod/RandomForest")

print("Start map file recoding")
# Use the Linux code in the BIOINFORMATICS WORKFLOW word doc to reformat the plink .ped
# file so that ACGT is replaced with 1234 and 0s with NAs (a minimal sketch of that
# recode step is given just below).
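# The recode itself lives in the external BIOINFORMATICS WORKFLOW word doc and is not
# reproduced here; the commented one-liner below is a sketch of what it could look like,
# not the original code. The input name BySeasonPED.ped is an assumption based on the
# BySeasonPED.map file used below; the output name matches the file the R code reads.
# It only touches the genotype fields (columns 7 onward), leaving the first six .ped
# columns alone:
#   awk '{for(i=7;i<=NF;i++){sub(/^A$/,"1",$i); sub(/^C$/,"2",$i); sub(/^G$/,"3",$i); sub(/^T$/,"4",$i); sub(/^0$/,"NA",$i)} print}' BySeasonPED.ped > BySeasonPEDforRRF.ped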
# Then use the next chunk of code to take the associated .map file and use it to create a header of locus names
first<-c("FID", "IID", "IDF", "IDM", "Sex", "Phenotype")
map<-read.table("BySeasonPED.map", sep="",header=FALSE)
order<-as.data.frame(c(1:nrow(map)))
df1<-as.data.frame(cbind(order,map$V2))
df2<-as.data.frame(cbind(order,map$V2))
rbind(df1,df2)->locusnames
colnames(locusnames)<-c("order","SNP")
locusnames<-locusnames[order(locusnames$order),]
LNvec<-as.vector(t(locusnames$SNP))
header<-c(first,LNvec)

print("Start read in .ped file")
ped<-read.table("BySeasonPEDforRRF.ped",sep=" ",header=FALSE)
colnames(ped)<-header
rm(df1,df2,locusnames,map,order,first,LNvec,header)
print("Finish .ped file read in")

# Use the Linux code in the BIOINFORMATICS WORKFLOW word doc to reformat the plink .bim
# file to export the minor and major alleles and recode ACGT as 1234. The following
# chunk of code takes those minor and major allele columns for each locus, doubles
# them, and transposes them to work with the loop in the next section.
print("Read in and edit MinMaj")
MinMaj<-read.table("MinMaj.txt")
order<-as.data.frame(c(1:nrow(MinMaj)))
df1<-as.data.frame(cbind(order,MinMaj))
df2<-as.data.frame(cbind(order,MinMaj))
rbind(df1,df2)->MinMajcombo
colnames(MinMajcombo)<-c("order","Min","Maj")
MinMajcombo<-MinMajcombo[order(MinMajcombo$order),]
MMvec<-t(MinMajcombo[,2:3])
rm(df1,df2,MinMaj,MinMajcombo,order)

# Indicate the pop and individual IDs from the ped file and isolate the genotype data in 'ready'
POPID<-ped[,6]
IID<-ped[,2]
ready<-ped[,7:ncol(ped)]
fmat<-as.matrix(sapply(ready, as.numeric))
rm(ready,ped)

print("Start recoding of .ped file")
# Take the genotype data and rewrite it so that 0 codes for the minor allele and 1 codes
# for the major allele at each locus
rewrite<-matrix(nrow=nrow(fmat), ncol=ncol(fmat))
for (i in 1:ncol(fmat)){
  result<-ifelse(fmat[1:nrow(fmat),i]==MMvec[1,i],0,ifelse(fmat[1:nrow(fmat),i]==MMvec[2,i],1,FALSE))
  rewrite[,i]<-result
}
colnames(rewrite)<-colnames(fmat)

# Combine the two allele columns into a single feature so 0/0=0, 0/1=0.5, and 1/1=1
oddindex<-c(((1:(ncol(rewrite)/2))*2-1))
evenindex<-c(((1:(ncol(rewrite)/2))*2))
even<-rewrite[,evenindex]
odd<-rewrite[,oddindex]
new<-(odd+even)/2
rm(even,odd,MMvec,fmat,rewrite,evenindex,oddindex,i,result)

print("Start impute step")
# Impute missing data using randomForest
imputed<-rfImpute(new, POPID, iter=10, ntree=2000)
print("Finish impute step")
imputedfeatures<-imputed[,-1]
done<-cbind(as.character(POPID), as.character(IID), imputedfeatures)
rm(new,imputed)

# Optionally write a table to save the data in this format
print("write RFdata table")
write.table(done, file="RFdatafile.txt", row.names=FALSE, col.names=TRUE, quote=FALSE)

# You now have a data set suitable for RF!
# Object 'POPID' can be used as 'y' for RF classification; object 'imputedfeatures' is the predictor data frame ('x').
# Example run of RF
print("start randomforest step")
rf<-randomForest(imputedfeatures, POPID, ntree=5000, mtry=(sqrt(ncol(imputedfeatures))), importance=TRUE, do.trace=500)
print("finish randomforest step")

print("write results")
# Code used to obtain the ranked list of loci with MDA > 0
select<-rf$importance
MDA<-as.data.frame(select[,1])
MDA$loci<-rownames(MDA)
above0<-as.data.frame(MDA[which(MDA[,1] > 0),])
selected<-above0[order(-above0[,1]),] # order the loci by decreasing importance
colnames(selected)<-c("%IncMSE","loci")
print("write results to table")
write.table(selected, file="CodAllLociRFrank.txt", row.names=FALSE, col.names=TRUE, quote=FALSE)
###################################################################################

#### Evaluating population assignment accuracy ####
# Use assignPOP in R to evaluate locus suites
###################################################################################
setwd("C:/Users/todonnell/Documents/R.directory/Cod/assignPOP/N262GRRF")
library(assignPOP)
library(diveRsity)
library(randomForest)
library(klaR)

print("Start map file recoding")
# Use the Linux code in the BIOINFORMATICS WORKFLOW word doc to reformat the plink .ped
# file so that ACGT is replaced with 1234 and 0s with NAs (see the sketch of this recode
# earlier in this file). Then use the next chunk of code to take the associated .map file
# and use it to create a header of locus names
first<-c("FID", "IID", "IDF", "IDM", "Sex", "Phenotype")
map<-read.table("GRRFN262Final.map", sep="",header=FALSE)
order<-as.data.frame(c(1:nrow(map)))
df1<-as.data.frame(cbind(order,map$V2))
df2<-as.data.frame(cbind(order,map$V2))
rbind(df1,df2)->locusnames
colnames(locusnames)<-c("order","SNP")
locusnames<-locusnames[order(locusnames$order),]
LNvec<-as.vector(t(locusnames$SNP))
header<-c(first,LNvec)

print("Start read in .ped file")
ped<-read.table("N262ForRRF.ped",sep=" ",header=FALSE)
colnames(ped)<-header
rm(df1,df2,locusnames,map,order,first,LNvec,header)
print("Finish .ped file read in")

# Use the Linux code in the BIOINFORMATICS WORKFLOW word doc to reformat the plink .bim
# file to export the minor and major alleles and recode ACGT as 1234 (a minimal sketch
# of that step is given just below). The following chunk of code takes those minor and
# major allele columns for each locus, doubles them, and transposes them to work with
# the loop in the next section.
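# The .bim recode also lives in the external word doc; the commented one-liner below is
# a sketch of what it could look like, not the original code. It assumes the standard
# plink .bim layout (chr, SNP, cM, pos, A1=minor, A2=major), takes the input name from
# the GRRFN262Final.map prefix (an assumption), and writes the file the R code reads:
#   awk '{print $5"\t"$6}' GRRFN262Final.bim | sed -e 's/A/1/g' -e 's/C/2/g' -e 's/G/3/g' -e 's/T/4/g' > N262forRRFMinMaj.txt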
print("Read in and edit MinMaj") MinMaj<-read.table("N262forRRFMinMaj.txt") order<-as.data.frame(c(1:nrow(MinMaj))) df1<-as.data.frame(cbind(order,MinMaj)) df2<-as.data.frame(cbind(order,MinMaj)) rbind(df1,df2)->MinMajcombo colnames(MinMajcombo)<-c("order","Min","Maj") MinMajcombo<-MinMajcombo[order(MinMajcombo$order),] MMvec<-t(MinMajcombo[,2:3]) rm(df1,df2,MinMajcombo,order) #indicate the pop and ind IDs from the ped file and isolate the genotype data in 'ready' POPID<-as.factor(ped[,6]) IID<-ped[,2] ready<-ped[,7:ncol(ped)] fmat<-as.matrix(sapply(ready, as.numeric)) rm(ready,ped) print("Start recoding of .ped file") #take your genoytpe data and rewrite it so that 0 codes for minor allele and 1 codes #for the major allele at each locus rewrite<-matrix(nrow=nrow(fmat), ncol=ncol(fmat)) for (i in 1:ncol(fmat)){ result<-ifelse(fmat[1:nrow(fmat),i]==MMvec[1,i],0,ifelse(fmat[1:nrow(fmat),i]==MMvec[2,i],1,FALSE)) rewrite[,i]<-result } colnames(rewrite)<-colnames(fmat) #combining columns into single feature so 0/0=0 0/1=0.5 and 1/1=2 oddindex<-c(((1:(ncol(rewrite)/2))*2-1)) evenindex<-c(((1:(ncol(rewrite)/2))*2)) even<-rewrite[,evenindex] odd<-rewrite[,oddindex] new<-(odd+even)/2 rm(even,odd,MMvec,fmat,rewrite,evenindex,oddindex,i,result) print("Start impute step") #impute missing data using RF imputed<-rfImpute(new, POPID, iter=10, ntree=2000) imputedfeatures<-imputed[,-1] #Round imputed data frame so that all values equal 0, 0.5 or 1. imputedfeatures[imputedfeatures>=0.75]<-1 imputedfeatures[imputedfeatures<=0.7499999999 & imputedfeatures>=0.5000001]<-0.5 imputedfeatures[imputedfeatures<=0.249999999]<-0 imputedfeatures[imputedfeatures>=0.2499999999 & imputedfeatures<=0.5]<-0.5 #transpose MinMaj df and replace 1,2,3,4 with A,C,G,T tMinMaj<-t(MinMaj) tMinMaj[tMinMaj==1]<-"A" tMinMaj[tMinMaj==2]<-"C" tMinMaj[tMinMaj==3]<-"G" tMinMaj[tMinMaj==4]<-"T" #revert the transposed dataframe back to A,C,G,T rewrite<-matrix(nrow=nrow(imputedfeatures), ncol=ncol(imputedfeatures)) for (i in 1:ncol(imputedfeatures)){ result<-ifelse(imputedfeatures[1:nrow(imputedfeatures),i]==0,paste(tMinMaj[1,i],tMinMaj[1,i],sep=""),ifelse(imputedfeatures[1:nrow(imputedfeatures),i]==1,paste(tMinMaj[2,i],tMinMaj[2,i],sep=""),ifelse(imputedfeatures[1:nrow(imputedfeatures),i]==0.5,paste(tMinMaj[1,i],tMinMaj[2,i],sep=""),FALSE))) rewrite[,i]<-result } read.csv("VcfIndOrdernew.csv")->Coll.info rbind(colnames(imputedfeatures),rewrite)->rewrite c("SNP_ID",as.character(Coll.info$pop))->firstcol cbind(firstcol,rewrite)->rewrite GRRFsnps<-t(rewrite) GRRFsnps<-as.data.frame(GRRFsnps) write.table(GRRFsnps,file="GRRFsnpstest.txt",quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE) #Open GRRFsnpstest.txt in Excel and sort the loci according to GRRF Rank read.table(file="GRRFsnpstest.txt",sep="\t")->GRRFsnps snp2gen(infile=GRRFsnps,prefix_length = 6) read.Genepop("snp2gen_converted.gen",pop.names=c("winter","spring"),haploid=FALSE)->GRRFgp kfold=c(2:10) seq(0.02,1,0.02)->train assign.kfold( GRRFgp, k.fold=kfold, train.loci=train, loci.sample="fst", model="lda", dir="GRRFResults_lda_curve/") accuKF<-accuracy.kfold(dir = "GRRFResults_lda_curve/") membership.plot(dir="GRRFResults_lda_cure/") #accuSummarytrain<-aggregate(accuKF[,4:6],list(accuKF$train.loci),mean) #numloci<-train*262 #numloci<-round(numloci) #accuSummarytrain$numloci<-numloci #plot(accuSummarytrain$numloci,accuSummarytrain$assign.rate.all) accuSummary<-aggregate(accuKF[,4:6],list(accuKF$KF),mean) accuSummary$Group.1<-as.numeric(levels(accuSummary$Group.1))[accuSummary$Group.1] 
# Read the (now rank-sorted) file back in and convert it to Genepop format for assignPOP
read.table(file="GRRFsnpstest.txt",sep="\t")->GRRFsnps
snp2gen(infile=GRRFsnps,prefix_length = 6)
read.Genepop("snp2gen_converted.gen",pop.names=c("winter","spring"),haploid=FALSE)->GRRFgp

# Run k-fold cross-validation over a range of k values and training-loci proportions
kfold=c(2:10)
seq(0.02,1,0.02)->train
assign.kfold( GRRFgp, k.fold=kfold, train.loci=train, loci.sample="fst", model="lda", dir="GRRFResults_lda_curve/")
accuKF<-accuracy.kfold(dir = "GRRFResults_lda_curve/")
membership.plot(dir="GRRFResults_lda_curve/")

#accuSummarytrain<-aggregate(accuKF[,4:6],list(accuKF$train.loci),mean)
#numloci<-train*262
#numloci<-round(numloci)
#accuSummarytrain$numloci<-numloci
#plot(accuSummarytrain$numloci,accuSummarytrain$assign.rate.all)

# Summarize assignment accuracy by k and write the summary table
accuSummary<-aggregate(accuKF[,4:6],list(accuKF$KF),mean)
accuSummary$Group.1<-as.numeric(levels(accuSummary$Group.1))[accuSummary$Group.1]
accuSummary<-accuSummary[order(accuSummary$Group.1),]
write.table(accuSummary,file="GRRFN262AccuracySummaryTable.txt",sep="\t",quote=FALSE,row.names=FALSE,col.names=TRUE)
#plot(accuSummary$Group.1,accuSummary$assign.rate.all)
###################################################################################

#### Redo GL analysis using only selected SNPs ####
# Start with a .txt file of LG and position (no header), sorted by LG and position
# Use angsd to make a binary (indexed) version of the file
./angsd sites index SeasonSNPlist_sorted.txt

# Make a chromosome list
cut -f1 SeasonSNPlist_sorted.txt | sort | uniq > SeasonChrs.txt

# Then redo the Beagle file generation, filtering by the indexed season SNP list (-sites) and the selected chromosomes (-rf)
./angsd -b FinalCodRealignedListGOMOnly.txt -ref /data/prj/Cod/Assembly/gadMor2.fasta -out ReanalysisResults/BeagleGL_OGFiltersGOM_SeasonSNPs -P 32 \
  -GL 2 -doGlf 2 -doMajorMinor 1 \
  -doCounts 1 -doDepth 1 -dumpCounts 2 \
  -minQ 20 -sites SeasonSNPlist_sorted.txt -rf SeasonChrs.txt \
  -remove_bads 0 -only_proper_pairs 0 \
  -doGeno 3 -doPost 2 -doMaf 1 -SNP_pval 1e-6 -minMaf 0.05

# Do PCA using PCAngsd for season
python3 pcangsd.py -beagle /data/app/angsd/ReanalysisResults/BeagleGL_OGFiltersGOM_SeasonSNPs.beagle.gz -e 2 -out PCAResults/CovMatSeason25SNPs -threads 32

# Plot PCA in R for the selected SNPs
###################################################################################
# The following code will make a PCA plot from the output of PCAngsd
library(ggplot2)
library(devtools)
library(ggbiplot)
setwd("C:/Users/todonnell/Documents/R.directory/Cod/PCAngsd")

# Read in collection information
Coll.info<-read.csv("VcfIndOrderGOMonly.csv")
#Coll.info$SS<-paste(Coll.info$Stock, Coll.info$Season, sep="")

# Load the covariance matrix
cov <- as.matrix(read.table("CovMatSeason25SNPs.cov", header = F))
mme.pca <- eigen(cov) # perform the PCA using the eigen function
eigenvectors = mme.pca$vectors # extract eigenvectors
pca.vectors = as.data.frame(cbind(Coll.info, eigenvectors)) # combine with our population assignments
pca.eigenval.sum = sum(mme.pca$values) # sum of eigenvalues
varPC1 <- (mme.pca$values[1]/pca.eigenval.sum)*100 # variance explained by PC1
varPC2 <- (mme.pca$values[2]/pca.eigenval.sum)*100 # variance explained by PC2
Season.pca<-prcomp(cov, scale.=TRUE)

cols<-c("#fdb863", "#5e3c99")
# Open one graphics device at a time (pdf or jpeg), then run the plot and dev.off()
pdf("C:/Users/todonnell/Documents/R.directory/Cod/PCAngsd/Plots/PCA.GOMSeason25_ggbiplot.pdf", paper="special", width=8, height=6)
jpeg("C:/Users/todonnell/Documents/R.directory/Cod/PCAngsd/Plots/PCA.GOMSeason25_ggbiplot.jpeg", res=600, width=8, height=6, units="in")
ggbiplot(Season.pca, ellipse=TRUE,groups=Coll.info$Season, var.axes=FALSE) +
  geom_point(colour="black", shape=21,size=2.5,aes(fill=factor(Coll.info$Season))) +
  scale_fill_manual(name="Spawning Season", labels=c("Spring", "Winter"), values=cols) +
  scale_color_manual(name="Spawning Season", labels=c("Spring", "Winter"), values=cols) +
  #scale_fill_manual(name="Spawning Season", labels=c("Spring", "Winter"), values=dichromat(cols)) +
  #scale_color_manual(name="Spawning Season", labels=c("Spring", "Winter"), values=dichromat(cols)) +
  theme_bw() +
  xlab(paste0("PC1 (",round(varPC1,2),"% Variance Explained)")) +
  ylab(paste0("PC2 (",round(varPC2,2),"% Variance Explained)")) +
  theme(legend.position=c(0.182,0.079)) +
  theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) +
  theme(
    axis.text=element_text(size=11),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())
dev.off()
###################################################################################