setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

library(tidyr)
library(dplyr)
library(stringr)
library(adegenet)
library(plyr)
library(hierfstat)
library(ggplot2)
library(reshape2)
library(gridExtra)
library(car)
library(lme4)
library(data.table)
library(sjmisc)
library(phylotools)
library(ggpubr)
library(ape)
library(Biostrings)
library(ggtree)
library(phangorn)
library(cowplot)
library(colorspace)
library(RColorBrewer)
library(seqinr)
library(grid)
library(LDheatmap)
library(genetics)
library(varhandle)
library(VariantAnnotation)
library(snpStats)
library(rehh)
library(gggenes)
library(pBrackets)
library(pheatmap)
library(ggplotify)

### Aim is to study molecular evolution of lectin-24A in DPGP samples

#### 1. Grouping DPGP populations by region  ####

### helper: read the file ../Input/<pop_code>_POP and return the values in its
### first column as a character vector
read_pop <- function(pop_code) {
  pop_path <- paste0("../Input/", pop_code, "_POP")
  as.character(read.table(pop_path)[[1]])
}

### one character vector per population file
B_POP   <- read_pop("B")
CO_POP  <- read_pop("CO")
EA_POP  <- read_pop("EA")
EB_POP  <- read_pop("EB")
ED_POP  <- read_pop("ED")
EF_POP  <- read_pop("EF")
EG_POP  <- read_pop("EG")
ER_POP  <- read_pop("ER")
FR_POP  <- read_pop("FR")
GA_POP  <- read_pop("GA")
GU_POP  <- read_pop("GU")
I_POP   <- read_pop("I")
N_POP   <- read_pop("N")
NG_POP  <- read_pop("NG")
RAL_POP <- read_pop("RAL")
RG_POP  <- read_pop("RG")
SB_POP  <- read_pop("SB")
SD_POP  <- read_pop("SD")
SF_POP  <- read_pop("SF")
SP_POP  <- read_pop("SP")
T_POP   <- read_pop("T")
UG_POP  <- read_pop("UG")
W_POP   <- read_pop("W")
ZI_POP  <- read_pop("ZI")
ZS_POP  <- read_pop("ZS")
ZW_POP  <- read_pop("ZW")


### pool the populations into broader regions
### (the variable name `ocenia` is kept as spelled so later references keep working)
ocenia <- T_POP
asia <- B_POP
america <- c(I_POP, RAL_POP, W_POP)
europe_north_africa <- c(N_POP, FR_POP, EG_POP)
southern_africa <- c(SB_POP, SD_POP, SF_POP, SP_POP, ZI_POP, ZS_POP, ZW_POP)
central_africa <- c(CO_POP, RG_POP, GA_POP)
west_africa <- c(GU_POP, NG_POP)
east_africa <- c(UG_POP, EA_POP, EB_POP, ED_POP, EF_POP, ER_POP)

### named list of regions, display name -> sample IDs
cont <- list(
  "Oceania" = ocenia,
  "Asia" = asia,
  "North America" = america,
  "Europe & North Africa" = europe_north_africa,
  "Southern Africa" = southern_africa,
  "Central Africa" = central_africa,
  "West Africa" = west_africa,
  "East Africa" = east_africa
)


#### 2. Extract genotypes for lectin-24A upstream indels in DPGP samples from vcf    ####
## Reads were mapped to a modified reference that carries the three lectin indel
## sequences, because dm6 itself does not contain any of them.

### VCF generated by CRISP after read mapping to the modified reference
dataChunk <- read.table("../Input/all_realign_lectin.vcf")
### drop variants whose FORMAT column (V9) shows no genotype was called in any sample
no_genotypes <- dataChunk$V9 == "GT:GQ:DP:ADf:ADr"
dataChunk <- dataChunk[!no_genotypes,]

gt <- as.data.frame(dataChunk[,10:ncol(dataChunk)]) ### per-sample genotype columns
fix <- as.data.frame(dataChunk[,1:8]) ### fixed VCF columns (meta data)
### variant length = |nchar(REF) - nchar(ALT)|
fix$V9 <- abs(nchar(as.character(fix$V4)) - nchar(as.character(fix$V5)))

### sample labels: one BAM file name per genotype column, with the pipeline
### suffixes stripped
names_files <- read.table("../Input/allbams",stringsAsFactors = FALSE)
bam_suffixes <- c(
  "_paired_wasp_mapped_mq40_reorder_readgroup_sort_marked_realign.bam",
  "_trimmed_mapped_mq40_reorder_readgroup_sort_marked_realign.bam"
)
for (suffix in bam_suffixes) {
  names_files$V1 <- gsub(suffix,"",names_files$V1)
}
names_files <- as.character(names_files$V1)
colnames(gt) <- names_files

### recode every sample to reference (0) / alternate (1) calls; not accurate for
### all variants, but only the biallelic indels are used from this VCF
sample_calls <- vector("list", ncol(gt))
for (i in seq_len(ncol(gt))) {
  gt_sub <- gt[i]
  ### split the colon-separated sample field; MLAC holds the CRISP genotype
  new <- separate(gt_sub, colnames(gt_sub), into =c("MLAC","GQ","DP","ADf","ADr","ADb"), sep = ":")
  calls <- new$MLAC
  calls <- gsub("0/0","R",calls) ### homozygous reference (the three upstream insertions are ref)
  calls <- gsub("[0123456789]/[123456789]","A",calls) ### any call with a non-zero second allele (deletions are alt)
  calls <- gsub("[^RA]+", "NA", calls) ### everything else becomes the string "NA"
  calls <- gsub("^R$", "0", calls) ### ref -> 0
  calls <- gsub("^A$", "1", calls) ### alt -> 1
  genosample <- as.data.frame(calls)
  colnames(genosample) <- names_files[i] ### label the column with the sample name
  ### blank calls whose genotype quality is below 10 or not numeric
  gq <- as.numeric(new$GQ)
  genosample[gq < 10 | is.na(gq),] <- NA
  sample_calls[[i]] <- genosample
}
new2 <- do.call(cbind, sample_calls)

full <- cbind(fix,new2) ### meta data followed by recoded genotypes

### keep the three upstream indels of interest (positions on the modified reference)
indel_positions <- c(2035,2141,2311)
full <- full[full$V2 %in% indel_positions,]
### drops the second remaining row; presumably a second record at one of these
### positions -- TODO confirm against the VCF
full <- full[-2,]
full <- full[-(1:9)] ### remove the nine meta data columns

rownames(full) <- c("Indel7bp_X3718140","Indel8bp_X3718040","Indel21bp_X3717878") ### indel names
full <- as.data.frame(t(full)) ### samples in rows, indels in columns

### upstream indel haplotype per sample; any missing call makes the whole haplotype "NA"
full$indelhaplotype <- paste(full$Indel7bp_X3718140,full$Indel8bp_X3718040,full$Indel21bp_X3717878,sep=",")
full$indelhaplotype <- gsub(".*NA.*","NA",full$indelhaplotype)

### haplotype string -> single-digit code
### NOTE(review): seven of the eight possible 0/1 triples are mapped; "1,0,1"
### is left as-is -- confirm that this is intended
haplotype_codes <- c(
  "0,0,1" = "0",
  "0,1,1" = "1",
  "0,0,0" = "2",
  "0,1,0" = "3",
  "1,0,0" = "4",
  "1,1,0" = "5",
  "1,1,1" = "6"
)
for (haplotype in names(haplotype_codes)) {
  full$indelhaplotype <- gsub(haplotype,haplotype_codes[[haplotype]],full$indelhaplotype)
}

full$Run <- rownames(full) ### sample labels (row names) as a Run column

### now convert from sra accession to sample names. Get meta data for samples.
sranames <- data.frame(Run = full$Run)

### SRA run tables and, for each, the column that holds the sample name
run_table_files <- c(
  "../Input/SraRunTable_dpgp2.txt",
  "../Input/SraRunTable_dpgp3.txt",
  "../Input/SraRunTable_dgrp_sra2.txt",
  "../Input/SraRunTable_bergman.txt",
  "../Input/SraRunTable_pool.txt",
  "../Input/SraRunTable_nuzhdin1.txt",
  "../Input/SraRunTable_clark.txt",
  "../Input/SraRunTable_ages.txt"
)
run_table_name_cols <- c(
  "Sample.Name",
  "Sample.Name",
  "strain",
  "Library.Name",
  "Sample.Name",
  "Sample.Name",
  "Sample.Name",
  "Sample.Name"
)

### stack the Run -> Sample pairs of all run tables
sranames_meta_all <- NULL
for (k in seq_along(run_table_files)) {
  sranames_meta <- read.csv(run_table_files[k])
  name_col <- run_table_name_cols[k]
  sranames_meta_sub <- as.data.frame(cbind(
    as.character(sranames_meta$Run),
    as.character(sranames_meta[[name_col, exact = FALSE]])
  ))
  if (name_col == "strain") {
    ### this table labels strains "DGRP-"; rename the prefix to "RAL-"
    sranames_meta_sub$V2 <- gsub("DGRP-","RAL-",sranames_meta_sub$V2)
  }
  colnames(sranames_meta_sub) <- c("Run","Sample")
  sranames_meta_all <- rbind(sranames_meta_all,sranames_meta_sub)
}
sranames_meta_all <- na.omit(sranames_meta_all)

### add meta data to sra accessions and modify sample names for compatibility
sranames_meta <- left_join(sranames,sranames_meta_all, by =c("Run"="Run"))
sample_name_fixes <- c(
  "-HE" = "",
  "_HE" = "",
  "diTAG-" = "",
  "_new" = "",
  "CO8-3" = "CO8N",
  "CO10-3" = "CO10N",
  "CO13-3" = "CO13N"
)
for (pattern in names(sample_name_fixes)) {
  sranames_meta$Sample <- gsub(pattern,sample_name_fixes[[pattern]],sranames_meta$Sample)
}

### per-individual table (four selected columns) joined to the population descriptions
individuals <- read.csv(file = "../Input/TableS1_individuals.csv")
individuals <- individuals[,c(1,4,10,11)]
datades <- read.csv(file = "../Input/TableS2_populations.csv")
individuals_des <- left_join(individuals,datades, by =c("Population"="Population.ID") )
### keep only runs whose sample name is present in the individuals table
sranames_meta_des <- inner_join(sranames_meta,individuals_des, by = c("Sample"="Stock.ID"))

### Indel calls for three upstream indels and sample names and meta data
full3 <- inner_join(full,sranames_meta_des, by =c("Run"))
full3 <- full3[c(1:5,7)] ### keep columns 1-5 and column 7 of the joined table

#### 3. Extract genotypes for lectin-24A SNPs in DPGP samples from vcf  ####

### variants in DPGP were called jointly for all samples genome-wide using CRISP;
### this VCF was previously subsetted to the lectin region
dataChunk <- read.table("../Input/CRISP_dpgp_lectin24a_annot.vcf")
in_region <- dataChunk$V2 < 3718142 ### restrict to the lectin region up to the upstream boundary
dataChunk <- dataChunk[in_region,]
dataChunk <- dataChunk[!duplicated(dataChunk$V2),] ### keep the first record per position
no_genotypes <- dataChunk$V9 == "GT:GQ:DP:ADf:ADr"
dataChunk <- dataChunk[!no_genotypes,] ### remove variants without any genotypes in samples
gt <- as.data.frame(dataChunk[,10:ncol(dataChunk)])  ### per-sample genotype columns
fix <- as.data.frame(dataChunk[,1:8]) ### fixed VCF columns (meta data)

### sample labels: BAM file names with the pipeline suffix stripped
names_files <- read.table("../Input/file.list",stringsAsFactors = FALSE)
names_files$V1 <- gsub("_paired_wasp_mapped_reorder_readgroup_mq20_sort_marked_realign.bam","",names_files$V1)
names_files <- as.character(names_files$V1)
colnames(gt) <- names_files

### extract one allele call per sample and site
snp_calls <- vector("list", ncol(gt))
for (i in seq_len(ncol(gt))) {
  gt_sub <- gt[i] ### unprocessed genotype field of one sample
  ### split the colon-separated sample field; MLAC holds the CRISP genotype
  new <- separate(gt_sub, colnames(gt_sub), into =c("MLAC","GQ","DP","ADf","ADr","ADb"), sep = ":")
  ### the allele after "/" is used as the sample's call (homozygous genomes expected)
  new <- separate(new, MLAC, into = c("ref","gt"),sep="/")
  ### calls whose first allele is not 0 are set to missing
  first_not_ref <- new$ref != 0
  if (nrow(new[first_not_ref,]) > 0) {
    new[first_not_ref,]$gt <- NA
  }
  genosample <- as.data.frame(new$gt)
  colnames(genosample) <- names_files[i] ### label the column with the sample name
  ### blank calls whose genotype quality is below 10 or not numeric
  gq <- as.numeric(new$GQ)
  genosample[gq < 10 | is.na(gq),] <- NA
  snp_calls[[i]] <- genosample
}
new2 <- do.call(cbind, snp_calls)
full <- cbind(fix,new2) ### meta data followed by genotype calls
full <- full[full$V8 %like% "VT=SNV;",]  ### only keep records whose INFO marks them as SNVs

### convert gt calls to numeric (via character)
geno_cols <- 9:ncol(full)
full[,geno_cols] <- apply(full[,geno_cols],2,as.character)
full[,geno_cols] <- apply(full[,geno_cols],2,as.numeric)


#### 4. Get variants annotations and variant names  ####

### Get the SnpEff annotation of every site from the INFO column (V8) of the vcf.
### Also collect the DNA-level and protein-level variant names reported by SnpEff
### (pipe-separated INFO fields 10 and 11).
###
### Outputs, all of length nrow(full) and aligned with its rows:
###   ann              - effect annotation, NA when no single annotation applies
###   ann_dna_snpname  - DNA based name
###   ann_prot_snpname - protein based name
###   ann_snpname      - protein name if present, else DNA name; named chr_pos_ann
### ann_allsnps is a character matrix with one seed row of "" followed by the
### per-site annotation rows.

n_sites <- nrow(full)
### preallocate so every vector has one slot per site even when the last sites
### receive no name
ann <- rep(NA_character_, n_sites)
ann_dna_snpname <- rep(NA_character_, n_sites)
ann_prot_snpname <- rep(NA_character_, n_sites)
ann_allsnps <- ""
for (i in seq_len(n_sites)) {
  altalleles <- str_split(full[i,5],",")[[1]]
  info_fields <- strsplit(full$V8[i], split ="\\|")[[1]]
  if (length(altalleles) < 2) {
    ### biallelic site: take the first annotation record
    ann[i] <- info_fields[2]
    ann_dna_snpname[i] <- info_fields[10]
    ann_prot_snpname[i] <- info_fields[11]
    ann_allsnps <- rbind(ann_allsnps,c(full$V2[i],altalleles,ann[i]))
  } else {
    ### multiallelic site: one annotation record per alternate allele
    annwrk2 <- strsplit(full$V8[i], split ="=|\\|,")[[1]]
    annwrk2 <- annwrk2[14:length(annwrk2)]
    annwrk2 <- as.data.frame(str_split_fixed(annwrk2,"\\|",12))
    annwrk2 <- annwrk2[1:11]
    annwrk2 <- annwrk2[!duplicated(annwrk2$V1),] ### first record per allele
    annwrk2 <- annwrk2[annwrk2$V1 %in% na.omit(altalleles),]
    if (length(table(annwrk2$V2)) < 2) {
      ### all alternate alleles share one annotation (or none was found): use it.
      ### The threshold was "< 1", which only matched an empty table and so never
      ### covered the shared-annotation case this branch is written for.
      ann[i] <- annwrk2$V2[1]
      ann_dna_snpname[i] <- info_fields[10]
      ann_prot_snpname[i] <- info_fields[11]
      ann_allsnps <- rbind(ann_allsnps,c(full$V2[i],annwrk2$V2[1:2]))
    } else {
      ### annotations differ: use the annotation of the alternate allele that
      ### clearly dominates the non-reference calls, if there is one
      gencount <- table(as.numeric(full[i,9:ncol(full)]))
      ### drop reference calls; a logical mask is used because x[-which(...)]
      ### returns an empty table when there is no "0" entry to drop
      gencount <- gencount[!(names(gencount) %in% "0")]
      if (length(gencount) > 0 && max(prop.table(gencount)) > 0.79) {
        ### assumes row k of annwrk2 belongs to alternate allele number k -- TODO confirm
        top_allele <- as.numeric(names(gencount)[gencount == max(gencount)])
        ann[i] <- annwrk2[top_allele,]$V2
        ann_dna_snpname[i] <- info_fields[10]
        ann_prot_snpname[i] <- info_fields[11]
      } else {
        ### several alleles at similar frequency with different annotations (or no
        ### non-reference calls at all): no annotation for this site
        ann[i] <- NA
      }
      ann_allsnps <- rbind(ann_allsnps,as.matrix(unname(cbind(full$V2[i],annwrk2[1:2]))))
    }
  }
}

### use protein based names if it exists, else use dna based names for variants
ann_snpname <- ann_prot_snpname
for (n in seq_along(ann_snpname)) {
  if (sjmisc::is_empty(ann_snpname[n])) {
    ann_snpname[n] <- ann_dna_snpname[n]
  }
}
names(ann_snpname) <- paste(full$V1,full$V2,ann,sep="_")


#### 5. Add variants annotations and sample and population metadata to SNP matrix  ####

### remove duplicated sites if any exist
### NOTE(review): `ann` was built from `full` before this filter; if rows were
### dropped here the two would no longer line up. dataChunk was already
### de-duplicated by position when it was read, so this looks like a no-op.
full <- full[!duplicated(full$V2),]
full_snps <- full[9:ncol(full)] ### genotype calls only
rownames(full_snps) <- paste(full$V1,full$V2,ann,sep="_") ## row id = chr_pos_annotation
full_snps$ann <- ann ## annotation column
full_snps$Pos <- full$V2 ## position column
### remove SNPs without any annotation, mainly multiallelic SNPs with different effects
has_annotation <- !is.na(full_snps$ann)
full_snps <- full_snps[has_annotation,]

### drop the two helper columns again, transpose so samples are rows, add run names
helper_cols <- (ncol(full_snps)-1):ncol(full_snps)
full_snps <- full_snps[-helper_cols]
full_snps <- as.data.frame(t(full_snps))
full_snps$Run <- rownames(full_snps)
sranames <- data.frame(Run = full_snps$Run) ### run names as a one-column table

### convert sra accessions to sample names and add meta data
### SRA run tables and, for each, the column that holds the sample name
run_table_files <- c(
  "../Input/SraRunTable_dpgp2.txt",
  "../Input/SraRunTable_dpgp3.txt",
  "../Input/SraRunTable_dgrp_sra2.txt",
  "../Input/SraRunTable_bergman.txt",
  "../Input/SraRunTable_pool.txt",
  "../Input/SraRunTable_nuzhdin1.txt",
  "../Input/SraRunTable_clark.txt",
  "../Input/SraRunTable_ages.txt"
)
run_table_name_cols <- c(
  "Sample.Name",
  "Sample.Name",
  "strain",
  "Library.Name",
  "Sample.Name",
  "Sample.Name",
  "Sample.Name",
  "Sample.Name"
)

### stack the Run -> Sample pairs of all run tables
sranames_meta_all <- NULL
for (k in seq_along(run_table_files)) {
  sranames_meta <- read.csv(run_table_files[k])
  name_col <- run_table_name_cols[k]
  sranames_meta_sub <- as.data.frame(cbind(
    as.character(sranames_meta$Run),
    as.character(sranames_meta[[name_col, exact = FALSE]])
  ))
  if (name_col == "strain") {
    ### this table labels strains "DGRP-"; rename the prefix to "RAL-"
    sranames_meta_sub$V2 <- gsub("DGRP-","RAL-",sranames_meta_sub$V2)
  }
  colnames(sranames_meta_sub) <- c("Run","Sample")
  sranames_meta_all <- rbind(sranames_meta_all,sranames_meta_sub)
}
sranames_meta_all <- na.omit(sranames_meta_all)

### sra names with sample names
sranames_meta <- left_join(sranames,sranames_meta_all, by =c("Run"="Run"))

### modify sample names for compatibility
sample_name_fixes <- c(
  "-HE" = "",
  "_HE" = "",
  "diTAG-" = "",
  "_new" = "",
  "CO8-3" = "CO8N",
  "CO10-3" = "CO10N",
  "CO13-3" = "CO13N"
)
for (pattern in names(sample_name_fixes)) {
  sranames_meta$Sample <- gsub(pattern,sample_name_fixes[[pattern]],sranames_meta$Sample)
}

### per-individual table (four selected columns) joined to the population descriptions
individuals <- read.csv(file = "../Input/TableS1_individuals.csv")
individuals <- individuals[,c(1,4,10,11)]
datades <- read.csv(file = "../Input/TableS2_populations.csv")
individuals_des <- left_join(individuals,datades, by =c("Population"="Population.ID") )
### keep only runs whose sample name is present in the individuals table
sranames_meta_des <- inner_join(sranames_meta,individuals_des, by = c("Sample"="Stock.ID"))

### add sample names and meta data with population descriptions to genotype matrix
full_snps <- inner_join(sranames_meta_des,full_snps, by =c("Run"))


#### 6. Combine SNPs and upstream indels  ####

### upstream indel calls for all samples, without the last column of the table
full3 <- full3[-ncol(full3)]
### attach the indel columns to the SNP dataset, matching on run
dpgpv <- inner_join(full_snps,full3, by ="Run")
### everything from column 17 onwards is treated as variant data and made numeric
site_cols <- 17:ncol(dpgpv)
dpgpv[,site_cols] <- apply(dpgpv[,site_cols],2,as.numeric)

#### 7. Combine duplicated samples  ####
### Some samples have multiple sra accessions as they were sequenced multiple
### times. Collapse them to one row per sample:
###   - columns 1-16 (meta data) are taken from the first accession
###   - a variant call is kept when all non-missing accessions agree
###   - a variant call is set to missing (NA) when accessions disagree or none
###     of them has a call
strains_unique <- unique(dpgpv$Sample) ### unique sample names
### seed row of "": rbind() recycles it across all columns, which also turns
### every column into character; the seed row is removed after the loop
fullsnp <- ""
### loop over unique samples
for (s in seq_along(strains_unique)) {
  strains_unique_data <- dpgpv[dpgpv$Sample %in% strains_unique[s],]
  if (nrow(strains_unique_data) == 1) {
    ### nothing to merge if the sample only has one sra accession
    fullsnp <- rbind(fullsnp,strains_unique_data)
  } else {
    strains_unique_data_mod <- strains_unique_data[1,1:16]
    ### looping over each site
    for (site in 17:ncol(strains_unique_data)) {
      observed <- unique(na.omit(strains_unique_data[,site]))
      if (length(observed) == 1) {
        ### all called accessions agree
        geno <- observed
      } else {
        ### no call at all, or conflicting calls: missing. A real NA is used
        ### (not the string "NA") so that is.na() and table() downstream treat
        ### the call as missing instead of as an extra allele.
        geno <- NA
      }
      strains_unique_data_mod <- cbind(strains_unique_data_mod,geno)
    }
    colnames(strains_unique_data_mod)[17:ncol(strains_unique_data_mod)] <- colnames(strains_unique_data)[17:ncol(strains_unique_data)]
    fullsnp <- rbind(fullsnp,strains_unique_data_mod)
  }
}
dpgpv <- fullsnp[-c(1),] ### remove the seed row



#### 8. Sample filtering  ####

### first remove sample designated as outlier in global diversity lines (GDL)
is_outlier <- dpgpv$Sample %in% "ZW184"
dpgpv <- dpgpv[!is_outlier,]

### working copy with sample names as row names; snapshots of the unfiltered
### sample set and of the variant names are kept
dpgpv_new <- dpgpv
rownames(dpgpv_new) <- dpgpv_new$Sample
dpgpv_new_allsamples <- dpgpv_new
ann_snpname_allsamples <- ann_snpname

### remove samples with more than 50% of data missing for lectin-24A
### (fraction of NA over the variant columns, 17 onwards)
missing_frac <- rowMeans(is.na(dpgpv_new[,17:ncol(dpgpv_new)]))
too_sparse <- names(which(missing_frac > 0.5))
dpgpv_new <- dpgpv_new[!dpgpv_new$Sample %in% too_sparse,]

### only retain populations with at least four samples, retain 35 populations in the end
pop_sizes <- table(dpgpv_new$Population)
large_pops <- names(which(pop_sizes > 3))
dpgpv_new <- dpgpv_new[dpgpv_new$Population %in% large_pops,]


#### 9. SNP allele frequency based filtering  ####

## Only retain variants for which at least two populations have a frequency
## between 0.05 and 0.95 (inclusive) for the allele in the second column of the
## population x genotype table.
## Populations without any called genotype at a site have an undefined (NaN)
## frequency and do not count towards the two populations. The earlier form,
## length(populations) - sum(outside range, na.rm = T), counted them as passing.
kept_cols <- integer(0)
for (i in 17:ncol(dpgpv_new)) {
  counts <- table(dpgpv_new$Population,dpgpv_new[,i])
  ### only working with sites with two or three observed alleles, including reference
  if (ncol(counts) > 1 && ncol(counts) < 4) {
    freq <- counts[,2]/rowSums(counts)
    if (sum(freq >= 0.05 & freq <= 0.95, na.rm = TRUE) > 1) {
      kept_cols <- c(kept_cols,i)
    }
  }
}
tmpnew <- dpgpv_new[c(1:16,kept_cols)] ### meta data plus the variants that passed
### always carry the last column along, whether or not it passed the filter
### (presumably the indel haplotype code -- TODO confirm)
dpgpv_new <- cbind(tmpnew,dpgpv_new[ncol(dpgpv_new)])

#### process dataframe to add back chr and pos
### variants in rows, samples in columns
dpgpvt <- as.data.frame(t(dpgpv_new[,17:ncol(dpgpv_new)]))
### row names are split on "_" into chromosome / position / remainder
dpgpvt$Chr <- colsplit(rownames(dpgpvt),"_",names=c("Chr","Pos","c"))[1]$Chr
dpgpvt$Chr <- gsub(".*ndel.*","2L",dpgpvt$Chr) ### indel rows carry no chromosome in their name
dpgpvt$Pos <- as.numeric(gsub("X","",colsplit(rownames(dpgpvt),"_",names=c("Chr","Pos","c"))[2]$Pos))
### rows whose name holds no numeric position get position 3718040
### (original note: "remove 8bp indel that was called using unmodified reference")
dpgpvt$Pos[which(is.na(dpgpvt$Pos))] <- 3718040

### Rebuild the variant-name lookup (ann_snpname) for the variants that passed filtering.
### Positions of upstream SNPs occurring before the 21 BP indel are renumbered to account for the 21 BP indel.
### Not necessary for 8BP since insertion is derived state and 7BP is the most upstream variant.
ann_snpname <- ann_snpname[names(ann_snpname) %in% colnames(dpgpv_new)] ### modify variant names to only include filtered SNPs
### offset of each variant from 3717728, taken from the position part of its "2L_<pos>_<ann>" id
### (presumably 3717728 is the coding start used for c. numbering -- TODO confirm)
ann_snpname_num <- 3717728-as.numeric(gsub("_.*","",gsub("2L_","",names(ann_snpname))))
### NOTE(review): a variant with offset exactly -171 falls into neither subset below and is dropped from the list
ann_snpname2 <- ann_snpname[ann_snpname_num>(-171)] ### those downstream of 21BP indel, names kept as they are
ann_snpname3 <- ann_snpname[ann_snpname_num<(-171)] ### those upstream of 21BP indel, renamed below
ann_snpname_num <- ann_snpname_num[ann_snpname_num<(-171)]
ann_snpname_num <- ann_snpname_num-21 ### change numbering of variants upstream of the 21 Bp indel
### new name = "c." + shifted offset + whatever followed the last digit of the old name
ann_snpname4 <- paste("c.",ann_snpname_num,gsub(".*[0-9]","",ann_snpname3),sep="")
names(ann_snpname4) <- names(ann_snpname3)

### add indel names to naming list, 3 upstream indels and 171 bp coding indel
ann_snpname <- c(ann_snpname2,ann_snpname4)
extranames <- c("c.-439_-433del","c.-334_-333insACATTCAT","c.-171_-151del","p.Phe217_Glu273del*")
names(extranames) <- c("Indel7bp_X3718140","Indel8bp_X3718040","Indel21bp_X3717878","Indel_X3716932")
ann_snpname <- c(ann_snpname,extranames) ### snp names by id
### second copy of the lookup, keyed by position instead of id: the regex strips the
### "2L_" prefix, everything up to "_X" (indel ids) and everything from the next "_" on
ann_snpname_pos <- ann_snpname
names(ann_snpname_pos) <- as.numeric(gsub("2L_||.*_X||_.*","",names(ann_snpname)))
ann_snpname_pos <- rev(ann_snpname_pos[order(as.numeric(names(ann_snpname_pos)))])  ### snp names by position, highest position first


#### 10. Call 171 BP coding deletion in DPGP samples  ####

### A sample is designated as carrying the deletion if every retained variant
### between 3716910 and 3717081 is NA in that sample.
### Author notes: 93 samples are designated as having the deletion this way and
### pindel calls the deletion in all of them. pindel also calls the deletion in
### an extra ~25 samples that have confident non-NA calls here; manual inspection
### of the BAM files does not support a deletion in those, so only the samples
### confirmed by pindel and manual inspection are used.
### NOTE(review): the counts quoted in this section differ from each other
### (93 / 97 / 88 samples, ~25 / 20 extra pindel calls) -- confirm against the
### table printed below.

### helper: `region` has variants in rows, samples in all but its last two
### columns, and missing calls coded as the string "NA". Returns one value per
### sample column: "1" if every row is "NA", otherwise "0".
call_all_missing <- function(region) {
  sample_cols <- seq_len(ncol(region) - 2)
  all_missing <- vapply(
    sample_cols,
    function(n) all(region[,n] == "NA"),
    logical(1)
  )
  as.character(as.integer(all_missing))
}

pindel_calls <- read.csv(file="../Input/pindel_calls.csv") ## samples where pindel has called a deletion
pindel_calls$Sample <- gsub("-HE","",pindel_calls$Sample) ### modify sample names for compatibility
pindel_calls$Sample <- gsub("_HE","",pindel_calls$Sample)
pindel_calls$GT <- 1 ### since file only contains samples where deletion was called
pindel_calls <- pindel_calls[!duplicated(pindel_calls$Sample),] ### remove duplicated samples, samples with multiple SRA accessions

### variants within the region where the deletion occurs (author note: 9 SNPs total)
dpgpvt_deleteion <- dpgpvt[dpgpvt$Pos < 3717081 & dpgpvt$Pos > 3716910,]
dpgpvt_deleteion[is.na(dpgpvt_deleteion)] <- "NA"
### "1" = all variants in the region missing (deletion), "0" = at least one called
Indel_X3716932 <- call_all_missing(dpgpvt_deleteion)
### append the Chr and Pos entries so the vector can be added as a row of dpgpvt
Indel_X3716932 <- c(Indel_X3716932,"2L","3716932") ### the position where the deletion begins is actually 3716904, this is corrected below
dpgpvt <- rbind(dpgpvt,Indel_X3716932)
rownames(dpgpvt)[nrow(dpgpvt)] <- "Indel_X3716932"
names(Indel_X3716932) <- colnames(dpgpvt)

### create lectin coding deletion dataframe for comparison
Indel_X3716932_df <- as.data.frame(Indel_X3716932)
Indel_X3716932_df$Sample <- rownames(Indel_X3716932_df)

Indel_X3716932_df <- left_join(Indel_X3716932_df,pindel_calls,by="Sample") ### combine pindel calls with approach used here
### cross-tabulate the calls made here against the pindel calls. Author note: all
### samples called here as having the coding deletion are also called by pindel;
### pindel calls additional samples, but manual BAM inspection does not support
### them and CRISP has called variants in that region, so those are ignored.
table(Indel_X3716932_df$Indel_X3716932,Indel_X3716932_df$GT)


### testing if this method is precise by getting equivalent sized chunks on either side of the deletion
dpgpvt_test1 <- dpgpvt[dpgpvt$Pos < 3716910 & dpgpvt$Pos > 3716739,] ### variants downstream of where deletion occurs
dpgpvt_test1[is.na(dpgpvt_test1)] <- "NA"
test1 <- call_all_missing(dpgpvt_test1)
names(test1) <- colnames(dpgpvt_test1)[1:(ncol(dpgpvt_test1)-2)]
table(test1) ### author note: no samples called as having a deletion

dpgpvt_test2 <- dpgpvt[dpgpvt$Pos < 3717252 & dpgpvt$Pos > 3717081,] ### variants upstream of where deletion occurs
dpgpvt_test2[is.na(dpgpvt_test2)] <- "NA"
test2 <- call_all_missing(dpgpvt_test2)
names(test2) <- colnames(dpgpvt_test2)[1:(ncol(dpgpvt_test2)-2)]
table(test2) ### author note: again no sample called as containing a deletion, method is quite specific.

#### 11. Mask regions with ibd, admixture and heterozygosity, from DPGP analyses  ####

### regions for masking obtained from DPGP paper
### columns as used below: V1 = sample, V2 = chromosome, V3 = start, V4 = end
ibd <- read.table(file = "../Input/ibd_filter_tracts.txt",sep="\t")
admix <- read.table(file = "../Input/admixture_filter_tracts.txt",sep="\t")
het <- read.table(file = "../Input/het_filter_tracts.txt",sep="\t")
fil <- rbind(ibd,admix,het)
fil$V2 <- gsub("Chr","",fil$V2) ### chromosome names without the "Chr" prefix

### For every sample, set to NA each variant that lies inside one of that
### sample's filter tracts (same chromosome, start <= position <= end).
### Calls are blanked in place, so row order and row count never change. The
### earlier per-chromosome rebuild produced an empty replacement (an error) for
### samples whose tracts were all on other chromosomes.
fulldata <- dpgpvt[1:(ncol(dpgpvt)-2)] ### genotypes only, without the Chr and Pos columns
site_chr <- dpgpvt$Chr
site_pos <- as.numeric(dpgpvt$Pos)
### loop over samples
for (s in seq_len(ncol(fulldata))) {
  mask <- fil[fil$V1 %in% colnames(fulldata)[s],] ### regions to mask for this sample
  if (nrow(mask) == 0) {
    ### nothing to mask for this sample
    next
  }
  tract_start <- as.numeric(mask$V3)
  tract_end <- as.numeric(mask$V4)
  ### TRUE for every variant that falls inside at least one tract
  in_tract <- vapply(
    seq_along(site_pos),
    function(k) {
      any(mask$V2 %in% site_chr[k] & tract_start <= site_pos[k] & site_pos[k] <= tract_end)
    },
    logical(1)
  )
  fulldata[which(in_tract),s] <- NA
}

#### 12. Create new lectin-24A genotype matrix with masked regions  ####

### process genotype matrix and add back Chr, Pos and annotation
### dpgpvt is replaced by the transposed masked matrix (samples in rows)
dpgpvt <- as.data.frame(t(fulldata))
### fulldata2: masked matrix with Chr / Pos / ann columns split out of the row names
fulldata2 <- fulldata
fulldata2 <- cbind(colsplit(rownames(fulldata2),"_",names=c("Chr","Pos","ann")), fulldata2)
fulldata2$Pos <- as.numeric(fulldata2$Pos)

### more informative ids for indels
rownames(fulldata) <- gsub("Indel7bp_X3718140","2L_3718140_upstream_7BP_indel",rownames(fulldata))
rownames(fulldata) <- gsub("Indel8bp_X3718040","2L_3718040_upstream_8BP_indel",rownames(fulldata))
rownames(fulldata) <- gsub("Indel21bp_X3717878","2L_3717878_upstream_21BP_indel",rownames(fulldata))
rownames(fulldata) <- gsub("Indel_X3716932","2L_3716932_coding_indel",rownames(fulldata))
### fulldata_gt: CHROM / POS / ID split out of the row names, followed by the genotypes
fulldata_gt <- cbind(colsplit(rownames(fulldata),"_",names=c("CHROM","POS","ID")), fulldata)
### drop the second-to-last row (presumably the indel haplotype row, which is not a variant -- TODO confirm)
fulldata_gt <- fulldata_gt[-(nrow(fulldata_gt)-1),]
fulldata_gt$comb <- paste(fulldata_gt$CHROM,fulldata_gt$POS) ### join key "chrom pos"
metanew <- dataChunk[1:9] ### get meta data from original SNP VCF for each site
metanew$V9 <- "GT" ### vcf specification that data only has genotype (GT)
metanew$V8 <- "." ### vcf specification that will be later used for ancestry status
metanew$comb <- paste(metanew$V1,metanew$V2)
fulldata_gt <- right_join(metanew,fulldata_gt,by="comb") ### add meta data to filtered sites
fulldata_gt$V3 <- fulldata_gt$ID ### vcf specification that will be later used for ID
### drop columns 10-13 (the join key and the CHROM / POS / ID helper columns)
fulldata_gt <- fulldata_gt[-c(10:13)]
colnames(fulldata_gt)[1:9] <- c("#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT")
fulldata_gt$FORMAT <- "GT"
fulldata_gt$`#CHROM` <- "2L" ### all have same chromosome

### manually add indels to vcf
### NOTE(review): the assignments below assume the last four rows are the 7bp, 8bp, 21bp and coding
### indels, in that order, i.e. that none of them matched a site in metanew -- confirm
fulldata_gt$POS[(length(fulldata_gt$POS)-3):length(fulldata_gt$POS)] <- c(3718140,3718040,3717878,3716932)
fulldata_gt$REF[(length(fulldata_gt$POS)-3):length(fulldata_gt$POS)] <- c("A","T","T","G") ### random placeholder annotations for indels
fulldata_gt$ALT[(length(fulldata_gt$POS)-3):length(fulldata_gt$POS)] <- c("GAGAACG","ATGAATGT","TGATCCCAGATAGCCTTATTT","TTTTTT") ### random placeholder annotations for indels, size is correct for upstream indels
fulldata_gt$FILTER[(length(fulldata_gt$POS)-3):length(fulldata_gt$POS)] <- "PASS" ### vcf specification arbitrary filter
fulldata_gt$QUAL[(length(fulldata_gt$POS)-3):length(fulldata_gt$POS)] <- "10000" ### vcf specification arbitrary quality
fulldata_gt[is.na(fulldata_gt)] <- "." ### replace NA with .
fulldata_gt <- fulldata_gt[order(fulldata_gt$POS),] ### order vcf by position

#### 13. Coding ancestral state for variants  ####

### Polarise each variant with two outgroups: an allele (REF or ALT) that is
### also seen in D. sechellia or D. simulans is taken as the ancestral allele.
### Sites where both or neither allele is seen in the outgroups are dropped.

fa <- read.FASTA(file = "../Input/dmel_dsech_dsim_mafft_aln.fasta",type="DNA") ### MSA for 200kb surrounding lectin-24a in three drosophila species
fa_dmel <- as.matrix(fa)[1,] ## D. melanogaster
fa_dsech <- as.matrix(fa)[2,] ## D. sechellia
fa_dsim <- as.matrix(fa)[3,] ## D. simulans
vcfpos <- fulldata_gt$POS-3616811 ### alignment columns of the sites of interest; 3616811 is where the alignment starts
### make a df containing the VCF alleles and the base of each species at every site
dmel_dsech_dsim <- as.data.frame(cbind(
  fulldata_gt$POS,
  fulldata_gt$REF,
  fulldata_gt$ALT,
  unname(as.character(as.matrix(fa_dmel)[c(vcfpos)])),
  unname(as.character(as.matrix(fa_dsech)[c(vcfpos)])),
  unname(as.character(as.matrix(fa_dsim)[c(vcfpos)]))
))
colnames(dmel_dsech_dsim) <- c("Pos","Ref","Alt","dmel","dsech","dsim")
### convert calls to uppercase so they compare equal to the VCF alleles
dmel_dsech_dsim$dmel <- toupper(dmel_dsech_dsim$dmel)
dmel_dsech_dsim$dsech <- toupper(dmel_dsech_dsim$dsech)
dmel_dsech_dsim$dsim <- toupper(dmel_dsech_dsim$dsim)
dmel_dsech_dsim$Alt <- gsub(",.*","", dmel_dsech_dsim$Alt) ### multiallelic sites: only the first listed alternate allele is analysed

### loop over each site and see if an allele is found in either of the other two drosophila species; that allele is designated as the ancestral allele

dmel_dsech_dsim$ancestral <- ""
for (i in seq_len(nrow(dmel_dsech_dsim))) {
  outgroup_alleles <- c(dmel_dsech_dsim$dsech[i], dmel_dsech_dsim$dsim[i])
  ref_in_outgroup <- dmel_dsech_dsim$Ref[i] %in% outgroup_alleles
  alt_in_outgroup <- dmel_dsech_dsim$Alt[i] %in% outgroup_alleles
  if (ref_in_outgroup && alt_in_outgroup) {
    ### both reference and alternate call are found in the outgroups: ambiguous, code as NA
    dmel_dsech_dsim$ancestral[i] <- NA
  } else if (ref_in_outgroup) {
    ### reference call is the ancestral state
    dmel_dsech_dsim$ancestral[i] <- paste0("AA=", dmel_dsech_dsim$Ref[i])
  } else if (alt_in_outgroup) {
    ### alternate call is the ancestral state
    dmel_dsech_dsim$ancestral[i] <- paste0("AA=", dmel_dsech_dsim$Alt[i])
  } else {
    ### neither reference nor alternate call is found in the outgroups: code as NA
    dmel_dsech_dsim$ancestral[i] <- NA
  }
}
fulldata_gt$INFO <- dmel_dsech_dsim$ancestral ### adding ancestral status to info column in vcf
### recode the alternate calls of the indels as single bases, as some programs don't handle indel calls
fulldata_gt$ALT <- gsub("TTTTTT","T",fulldata_gt$ALT)
fulldata_gt$ALT <- gsub("TGATCCCAGATAGCCTTATTT","A",fulldata_gt$ALT)
fulldata_gt$ALT <- gsub("ATGAATGT","A",fulldata_gt$ALT)
fulldata_gt$ALT <- gsub("GAGAACG","G",fulldata_gt$ALT)
### manually code in the ancestral states for indels
### (element-wise assignment, so a missing indel is a no-op rather than an error)
fulldata_gt$INFO[fulldata_gt$ID %in% "coding_indel"] <- "AA=G"
fulldata_gt$INFO[fulldata_gt$ID %in% "upstream_21BP_indel"] <- "AA=T"
fulldata_gt$INFO[fulldata_gt$ID %in% "upstream_8BP_indel"] <- "AA=A"
fulldata_gt$INFO[fulldata_gt$ID %in% "upstream_7BP_indel"] <- "AA=A"
fulldata_gt <- fulldata_gt[!is.na(fulldata_gt$INFO),] ### remove sites without ancestral state classification

#### 14. Post-mask repeat SNP and individual filtering ####

### only keep the indels and the SNPs whose ancestral state is known, i.e. the sites retained in the processed lectin vcf
dpgpvt <- dpgpvt[colnames(dpgpvt) %in% c("Indel7bp_X3718140","Indel8bp_X3718040","Indel21bp_X3717878","indelhaplotype","Indel_X3716932",paste(fulldata_gt$`#CHROM`,fulldata_gt$POS,fulldata_gt$ID,sep="_"))]
### add the 16 meta data columns (population information) back to the genotype matrix
### NOTE(review): cbind pairs rows by position - assumes dpgpv and dpgpvt list samples in the same order
dpgpv2 <- cbind(dpgpv[dpgpv$Sample %in% rownames(dpgpvt),1:16],dpgpvt)
fullfst2 <- dpgpv2
rownames(fullfst2) <- fullfst2$Sample

### remove samples with more than 50% of the lectin-24A genotypes missing after masking
missing_frac <- rowMeans(is.na(fullfst2[, 17:ncol(fullfst2), drop = FALSE]))
fullfst2 <- fullfst2[!fullfst2$Sample %in% names(which(missing_frac > 0.5)),]
### only retain populations with at least four samples, retain 26 populations in the end
fullfst2 <- fullfst2[fullfst2$Population %in% names(which(table(fullfst2$Population) > 3)),]

### comparing 165BP coding deletion with pindel
pindel_calls <- read.csv(file="../Input/pindel_calls.csv") ## read in file containing samples where pindel has called a deletion
pindel_calls$Sample <- gsub("-HE","",pindel_calls$Sample) ### modify sample names for compatibility
pindel_calls$Sample <- gsub("_HE","",pindel_calls$Sample)
pindel_calls$GT <- 1 ### since file only contains samples where deletion was called
pindel_calls <- pindel_calls[!duplicated(pindel_calls$Sample),] ### remove duplicated samples, samples with multiple SRA accessions
fullfst3 <- left_join(fullfst2,pindel_calls,by="Sample") ### combine with genotype matrix for comparison
table(fullfst3$Indel_X3716932,fullfst3$GT) ### 1 sample has SNPs called within deletion called from Pindel
fullfst2 <- fullfst2[!fullfst2$Sample %in% fullfst3[fullfst3$Indel_X3716932 %in% 0 & fullfst3$GT %in% 1,]$Sample,] ### remove that sample from analyses

### only keep variants whose second-allele frequency lies within [0.05, 0.95]
### (i.e. is not near fixation for either allele) in at least two populations
n_populations <- length(table(fullfst2$Population))
keep_column <- rep(FALSE, ncol(fullfst2))
keep_column[1:16] <- TRUE ### meta data columns are always kept
for (i in 17:ncol(fullfst2)) {
  pop_gt_tab <- table(fullfst2$Population, fullfst2[, i]) ### population x allele counts
  ### only working with SNPs with maximum of three alleles, including reference call
  if (ncol(pop_gt_tab) > 1 && ncol(pop_gt_tab) < 4) {
    alt_freq <- pop_gt_tab[, 2] / rowSums(pop_gt_tab)
    ### NOTE(review): a population with no calls at this site gives NaN, is dropped by
    ### na.rm and therefore counts as polymorphic here - confirm this is intended
    n_near_fixed <- sum(alt_freq < 0.05 | alt_freq > 0.95, na.rm = TRUE)
    if (n_populations - n_near_fixed > 1) {
      keep_column[i] <- TRUE
    }
  }
}
tmpnew <- fullfst2[keep_column]
fullfst2 <- tmpnew

### output file list containing samples in each population retained after filtering
pop_names <- unique(fullfst2$Population)
for (p in seq_along(pop_names)) {
  write.table(fullfst2[fullfst2$Population %in% pop_names[p],]$Sample,file=paste0(pop_names[p],"_POP"),quote = FALSE,col.names = FALSE,row.names = FALSE)
}

write.table(fullfst2$Sample, file="Popover3",row.names = FALSE,col.names = FALSE, quote = FALSE) ### a list containing all samples used for analysis
write.table(sranames_meta_des[sranames_meta_des$Sample %in% fullfst2$Sample,]$Run,file="Popover3_run",row.names = FALSE,col.names = FALSE, quote = FALSE) ### the Run ids of all samples used for analysis
### filter and write vcf file
calls <- fulldata_gt[10:ncol(fulldata_gt)] ## another df for genotype calls (columns 1-9 are the VCF meta data)
calls <- calls[colnames(calls) %in% fullfst2$Sample] ### only keep filtered samples
fulldata_gt <- cbind(fulldata_gt[1:9],calls) ### add back meta data
### filter by variant: positions are parsed out of the retained genotype column names
fulldata_gt <- fulldata_gt[fulldata_gt$POS %in% as.numeric(gsub("_.*","",gsub(".*_X","",gsub("2L_","",colnames(fullfst2[17:ncol(fullfst2)]))))),]

write.table(fulldata_gt, file="lectin.vcf",sep="\t",quote = FALSE, row.names = FALSE)
write.table(fulldata_gt[10:ncol(fulldata_gt)], file="lectin_gt",sep=" ",quote = FALSE, row.names = FALSE, col.names = FALSE)
write.table(fulldata_gt[1:9], file="lectin_meta",sep=" ",quote = FALSE, row.names = FALSE, col.names = FALSE)

ann_snpname_pos <- ann_snpname_pos[names(ann_snpname_pos) %in% fulldata_gt$POS] ### restrict lectin variant names to sites that were retained after filtering

#### 15. Investigate frequency of null variants  ####

### null variants are the 21 BP indel, coding indel that results in a premature stop and three stop gained mutations
nullvariants <- cbind(fullfst2["Locality"],fullfst2["Country"],fullfst2["Indel21bp_X3717878"],rev(fullfst2[grep("stop",colnames(fullfst2))]),fullfst2["Indel_X3716932"])
null_cols <- 3:7 ### the five null-variant genotype columns
nullvariants[null_cols] <- apply(nullvariants[null_cols],2,as.numeric)
colnames(nullvariants)[null_cols] <- c("c.-171_-151del","p.Leu81*","p.Glu202*","p.Gln254*","p.Phe217_Glu273del*") ### names for null variants
### count number of null variants per sample; a call of 1 is always the loss of function (derived) state
null_count <- rowSums(nullvariants[null_cols],na.rm=TRUE)
nullvariants$Null <- null_count
### for samples with only one null mutation, state which variant it is in the "Null" column.
### The lookup is restricted to the five variant columns so that the Null column
### itself (which also equals 1 at this point) is never matched.
for (r in seq_len(nrow(nullvariants))) {
  if (null_count[r] == 1) {
    nullvariants$Null[r] <- colnames(nullvariants)[null_cols][which(nullvariants[r, null_cols] == 1)]
  }
}
### samples with two null mutations are labelled "Multiple". Note there are no samples with more than two null variants.
### (element-wise assignment, so having no such sample is a no-op rather than an error)
nullvariants$Null[null_count == 2] <- "Multiple"
nullvariants$Locality <- paste(nullvariants$Locality,nullvariants$Country,sep=", ") ### id populations by locality and country
### frequency of each Null category within each population
pop_null_tab <- table(nullvariants$Locality,nullvariants$Null)
nulltable <- pop_null_tab/rowSums(pop_null_tab)
nulltable <- nulltable[,-c(1)] ### drop the first category column (expected to be "0", i.e. no null variant)
nulltable <- melt(nulltable)

### add region info for grouping samples
cont2 <- melt(cont) ### convert list containing region info to dataframe
fullfst2_meta <- cbind(fullfst2["Sample"],paste(fullfst2$Locality,fullfst2$Country,sep=", ")) ### get meta data for samples
cont2 <- right_join(cont2,fullfst2_meta,by=c("value"="Sample")) ### combine with region info
cont3 <- cont2[-c(1)] ### drop the sample id, leaving region (L1) and population
cont3 <- unique(cont3)
cont3 <- cont3[complete.cases(cont3),]
colnames(cont3)[2] <- "Region" ### NOTE: "Region" holds the "Locality, Country" population label; L1 holds the geographic region
nulltable <- left_join(nulltable,cont3,by=c("Var1"="Region"))
nulltable$L1 <- factor(nulltable$L1,levels=c("North America","Oceania","Asia","Europe & North Africa","East Africa","Central Africa","West Africa","Southern Africa"))
nulltable$L2 <- gsub(".*, ","",nulltable$Var1) ### L2 is the text after the last ", " of the population label, i.e. the country
nulltable$L2 <- factor(nulltable$L2)
nulltable <- nulltable[order(nulltable$L1,nulltable$L2),] ### order by region then country

### plot of frequency of null variants
### Stacked bars: one bar per population (in the row order of nulltable), filled by
### null-variant category. The plot object is built first and printed explicitly so
### that the PDF is also written when the script is run via source(), where
### top-level ggplot objects are not auto-printed.
null_plot <- ggplot(data=nulltable,aes(x=factor(Var1,levels=unique(Var1)),y=value,fill=factor(Var2,levels=c("c.-171_-151del","p.Leu81*","p.Glu202*","p.Gln254*","p.Phe217_Glu273del*","Multiple"))))+
  geom_bar(stat="identity")+
  xlab("Population")+
  ylab("Frequency")+
  scale_fill_brewer(palette="Dark2")+
  theme_classic()+
  guides(fill=guide_legend(title="Null mutation"))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  ### hand-placed brackets and labels marking the region of each run of bars;
  ### the x ranges are bar indices and are tied to the current population order
  geom_segment(aes(x = 0.7, xend = 3.3, y = 0.18, yend = 0.18),size=1.2)+
  geom_text(aes(x = 2, y = 0.30, label = "North \nAmerica"))+
  geom_segment(aes(x = 3.7, xend = 4.3, y = 0.29, yend = 0.29),size=1.2)+
  geom_text(aes(x = 4, y = 0.47, label = "Oceania",angle=90))+
  geom_segment(aes(x = 4.7, xend = 5.3, y = 0.31, yend = 0.31),size=1.2)+
  geom_text(aes(x = 5, y = 0.42, label = "Asia",angle=90))+
  geom_segment(aes(x = 5.7, xend = 8.3, y = 0.29, yend = 0.29),size=1.2)+
  geom_text(aes(x = 7, y = 0.45, label = "Europe & \nNorth Africa"))+
  geom_segment(aes(x = 8.7, xend = 14.3, y = 0.46, yend = 0.46),size=1.2)+
  geom_text(aes(x = 11.5, y = 0.52, label = "East Africa"))+
  geom_segment(aes(x = 14.7, xend = 17.3, y = 0.26, yend = 0.26),size=1.2)+
  geom_text(aes(x = 16, y = 0.38, label = "Central \nAfrica"))+
  geom_segment(aes(x = 17.7, xend = 19.3, y = 0.04, yend = 0.04),size=1.2)+
  geom_text(aes(x = 18.5, y = 0.17, label = "West \nAfrica"))+
  geom_segment(aes(x = 19.7, xend = 26.3, y = 1.02, yend = 1.02),size=1.2)+
  geom_text(aes(x = 23, y = 1.09, label = "Southern Africa"))+
  ### short baseline ticks at bars 1, 9, 13, 16, 18 and 19
  ### (presumably populations without any null variant - TODO confirm)
  geom_segment(aes(x = 0.6, xend = 1.4, y = 0, yend = 0),size=0.7)+
  geom_segment(aes(x = 8.6, xend = 9.4, y = 0, yend = 0),size=0.7)+
  geom_segment(aes(x = 12.6, xend = 13.4, y = 0, yend = 0),size=0.7)+
  geom_segment(aes(x = 15.6, xend = 16.4, y = 0, yend = 0),size=0.7)+
  geom_segment(aes(x = 17.6, xend = 18.4, y = 0, yend = 0),size=0.7)+
  geom_segment(aes(x = 18.6, xend = 19.4, y = 0, yend = 0),size=0.7)+
  ylim(0,1.09)
pdf("null.pdf", width= 9.5, height= 4)
print(null_plot)
dev.off()

#### 16. Investigate derived allele frequency for lectin-24a variants  ####

### Compute per-population allele frequencies with hierfstat, pick for every
### variant the frequency of the derived allele (ALT or REF depending on the
### ancestral state stored in the vcf INFO column), and reshape the result into
### a population x variant table (derallele) and its long form (derallele2).

locus2 <- fullfst2[,17:ncol(fullfst2)] ### genotype columns only; columns 1-16 are sample meta data
locus3 <- locus2
locus3 <- locus3[!colnames(locus3) %in% "indelhaplotype"] ### remove haplotype from analysis
locus <- locus3
colnames(locus) <- seq(1,ncol(locus)) ### loci are handed over under numeric names; the real names are put back on popaf below
ind <- as.character(fullfst2$Sample) # labels of the individuals
population <- as.character(paste(fullfst2$Locality,fullfst2$Country,sep=", ")) ### population name is a combination of locality and country
Mydata1 <- df2genind(locus, ploidy = 1, NA.char = "NA", ind.names = ind, pop = population, sep = "")
Mydata2 <- genind2hierfstat(Mydata1) # Create hierfstat object
Mydata2$pop <- as.character(Mydata2$pop)

popaf <- pop.freq(Mydata2, diploid = F) ### get allele frequency per population
names(popaf) <- colnames(locus3)
### split frequency list by reference and alternate allele frequency
### NOTE(review): takes row 1 of each per-locus table as the reference allele and row 2
### as the alternate allele - assumes alleles are ordered 0, 1 at every locus; confirm
### The "" seed row turns the accumulated matrices into character; it is dropped and
### the values are converted back to numeric just below.
refcall <- ""
altcall <- ""
for (i in 1:length(popaf)){
  refcall <- rbind(refcall,popaf[[i]][1,])
  altcall <- rbind(altcall,popaf[[i]][2,])
}
refcall <- as.data.frame(refcall[-c(1),])
altcall <- as.data.frame(altcall[-c(1),])
refcall <- as.data.frame(apply(refcall,2,as.numeric)) ### make reference allele frequency into dataframe
altcall <- as.data.frame(apply(altcall,2,as.numeric))  ### make alternate allele frequency into dataframe
rownames(refcall) <- colnames(locus3)[1:length(popaf)] ### variant names as rownames
rownames(altcall) <- colnames(locus3)[1:length(popaf)]  ### variant names as rownames

fulldata_gt_alt <- fulldata_gt[fulldata_gt$ALT == gsub("AA=","",fulldata_gt$INFO) | fulldata_gt$POS %in% 3717950,1:8]  ## df where alternate allele is ancestral allele, hand code in for multiallelic site
fulldata_gt_ref <- fulldata_gt[fulldata_gt$REF == gsub("AA=","",fulldata_gt$INFO),1:8] ## df where reference allele is ancestral allele

### extract the derived allele frequency from the reference and alternate frequency dataframes:
### alternate frequency where the reference is ancestral, reference frequency where the alternate is ancestral
derallele <- altcall[rownames(altcall) %in% paste(fulldata_gt_ref$`#CHROM`,fulldata_gt_ref$POS,fulldata_gt_ref$ID,sep="_"),]
derallele <- rbind(derallele,refcall[rownames(refcall) %in% paste(fulldata_gt_alt$`#CHROM`,fulldata_gt_alt$POS,fulldata_gt_alt$ID,sep="_"),] )
### the indel columns keep their original "Indel..._X<pos>" names and therefore never match above
derallele <- rbind(derallele,altcall["Indel7bp_X3718140",],refcall["Indel8bp_X3718040",],altcall["Indel21bp_X3717878",],altcall["Indel_X3716932",])  ### add derived af for indels manually


##### change name for proteins with reference as derived alleles, i.e. switch the amino acids before and after the substitution
ref_der_prot <- rownames(refcall[rownames(refcall) %in% paste(fulldata_gt_alt$`#CHROM`,fulldata_gt_alt$POS,fulldata_gt_alt$ID,sep="_"),])
ref_der_prot <- gsub("_.*","",gsub("2L_","",ref_der_prot[grep("missense",ref_der_prot)])) ### positions of the affected missense variants
for (n in 1:length(ann_snpname_pos)){
  if (names(ann_snpname_pos)[n] %in% ref_der_prot){
    ### rebuild "p." + last three characters + middle part + characters 3-5, e.g. p.AaaNNBbb -> p.BbbNNAaa
    ann_snpname_pos[n] <- paste("p.",str_sub(ann_snpname_pos[n], start= -3),str_sub(ann_snpname_pos[n],6,-4),str_sub(ann_snpname_pos[n],3,5),sep="")
  }
}

derallele$SNP <- rownames(derallele) ### variant ID column
derallele <- separate(derallele,col=SNP,into=c("Chr","POS","Ann"),sep="_",extra="merge") ### make chr, pos and ann columns from variant ID
derallele$POS <- gsub("X","",derallele$POS) ### indel ids carry the position as "X<pos>"
derallele$POS <- as.numeric(derallele$POS) ### make position numeric
derallele <- derallele[order(-derallele$POS),] ### order by descending position
derallele <- derallele[derallele$POS %in% fulldata_gt$POS,]  ### only keep filtered sites
### NOTE(review): names are assigned by position in the vector - assumes ann_snpname_pos
### has the same length and the same descending-position order as derallele; confirm
rownames(derallele) <- ann_snpname_pos ### use variant names
derallele <- as.data.frame(t(derallele[1:(ncol(derallele)-3)])) ### drop Chr/POS/Ann and transpose to populations x variants
derallele$Region <- rownames(derallele) ### "Region" holds the "Locality, Country" population label
derallele <- left_join(derallele,cont3,by="Region") ### add region data
derallele$L1 <- factor(derallele$L1,levels=c("North America","Oceania","Asia","Europe & North Africa","East Africa","Central Africa","West Africa","Southern Africa"))
derallele$L2 <- gsub(".*, ","",derallele$Region) ### L2 is the text after the last ", " of the population label, i.e. the country
derallele$L2 <- factor(derallele$L2)
derallele <- derallele[order(derallele$L1,derallele$L2),] ### order by region then country
rownames(derallele) <- derallele$Region

### long format for plotting: one row per population x variant
derallele2 <- melt(derallele)
derallele2$L1 <- factor(derallele2$L1,levels=c("North America","Oceania","Asia","Europe & North Africa","East Africa","Central Africa","West Africa","Southern Africa"))
derallele2$variable <- factor(derallele2$variable,levels = colnames(derallele)[1:(ncol(derallele)-3)]) ### make variant names as factor and keep order by position on chr
derallele2 <- derallele2[order(derallele2$L1,derallele2$L2,derallele2$variable),] ### order by region then country and by variant

### plot of derived allele frequency for lectin-24A
### Heatmap with variants on the x axis (ordered by position) and populations on
### the y axis, faceted by region. The plot object is built first and printed
### explicitly so that the PDF is also written when the script is run via
### source(), where top-level ggplot objects are not auto-printed.
derived_af_plot <- ggplot(data=derallele2,aes(x=factor(variable,levels = colnames(derallele)[1:(ncol(derallele)-3)]),y= factor(Region,levels = rev(derallele$Region)),fill=value))+
  geom_tile() +
  scale_fill_viridis_c(option = "inferno", name="Derived \nallele \nfrequency") +
  ylab("")+
  theme_bw() +
  theme(panel.border = element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), axis.line = element_blank()) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5,hjust = 1))+
  xlab("")+
  guides(fill = guide_legend(title.hjust = 0.5)) +
  facet_grid(L1 ~ .,
             scales = "free_y", # each region facet only shows its own populations on the y axis
             space = "free_y"   # facet height scales with the number of populations, so all tiles have the same height
  )+
  theme(panel.spacing = unit(0, "lines"),strip.background = element_blank())+
  theme(strip.text.y = element_text(angle = 0))+
  scale_y_discrete(position = "left")
pdf("derived af.pdf", width= 10.1, height= 6.62)
print(derived_af_plot)
dev.off()

#### 17. Validation of lectin-24a variant calls  ####

### first validating SNPs calls from CRISP with those from DGRP Freeze2.0
### Output: dgrp_compare.csv, two rows per shared site (our reference / alternate
### calls) cross-tabulated against the Freeze 2.0 reference / alternate calls.

fulldata_dgrp <- fulldata_gt[grep("RAL-",colnames(fulldata_gt))] ### get dgrp specific calls
fulldata_dgrp <- apply(fulldata_dgrp,2,as.numeric) ## convert calls to numeric; "." (not called) becomes NA
ncol(fulldata_dgrp) ### 152 DGRP samples analysed here
sum(rowSums(fulldata_dgrp, na.rm=T)>0) ### only 30 polymorphic sites  in identified DGRPs

dgrp_snps <- read.csv(file = "../Input/lectin_promoter_and_coding.csv") ### Read in Freeze 2.0 DGRP calls
dgrp_snps <- dgrp_snps[!dgrp_snps$pos %in% 3718040,] ### remove 8bp indel, analysed separately
ncol(dgrp_snps)-9 ### freeze 2.0 calls has 205 samples, some may have been excluded in lectin analyses due to masking
nrow(dgrp_snps[dgrp_snps$pos > 3716812 & dgrp_snps$pos < 3718140,])-1 ## 42 snps found in lectin region from DGRP freeze 2.0 calls, some might only occur in DGRPs. NOTE(review): reason for the -1 is not visible here
dgrp_snps <- dgrp_snps[dgrp_snps$pos %in% fulldata_gt$POS,] ### only select snps analysed for lectin analyses
colnames(dgrp_snps) <- gsub("line_","RAL-",colnames(dgrp_snps)) ### convert to have similar naming
nrow(dgrp_snps) ### 24 SNPs also found in freeze 2.0 calls
dgrp_snps <- dgrp_snps[colnames(dgrp_snps) %in% c(colnames(fulldata_dgrp),"pos")] ### only keep DGRP samples analysed for lectin region analyses

### for each site, compare calls from lectin region analyses with DGRP Freeze2.0 calls
dgrp_compare <- "" ### seed row, removed after the loop
for (s in 1:nrow(dgrp_snps)){
  fulldata_gt_c <- fulldata_gt[fulldata_gt$POS %in% dgrp_snps$pos[s],grep("RAL-",colnames(fulldata_gt))] ### isolate each site from lectin region analyses
  fulldata_gt_c <- fulldata_gt_c[order(colnames(fulldata_gt_c))] ### order by sample name
  fulldata_gt_c <- factor(fulldata_gt_c,levels = c(0,1)) ### factor by ref or alt alleles; any other value (e.g. ".") becomes NA
  dgrp_snps_c <- dgrp_snps[s,2:ncol(dgrp_snps)] ### only keep sample columns from Freeze 2.0 calls (assumes "pos" is the first remaining column - TODO confirm)
  dgrp_snps_c <- dgrp_snps_c[order(colnames(dgrp_snps_c))] ### order samples by name
  dgrp_snps_c <- factor(dgrp_snps_c,levels = c(0,2))  ### factor by ref (0) or alt (2) alleles; other codes become NA
  ### NOTE(review): the variant name is looked up by index s in the reversed name vector -
  ### assumes ann_snpname_pos is in descending and dgrp_snps in ascending position order; confirm
  dgrp_compare <- rbind(dgrp_compare,cbind(rev(ann_snpname_pos[names(ann_snpname_pos) %in% dgrp_snps$pos])[s],table(fulldata_gt_c,dgrp_snps_c))) ### compare lectin region and freeze 2.0 calls
}
dgrp_compare <- dgrp_compare[-c(1),]
dgrp_compare <- as.data.frame(dgrp_compare)
colnames(dgrp_compare) <- c("Variant","Reference_allele_DGRP_Freeze2.0","Alternate_allele_DGRP_Freeze2.0")
dgrp_compare$Allele_DGN <- rep(c("Reference","Alternate"),nrow(dgrp_snps)) ### change 0,1 to ref and alt for lectin region analyses calls
write.csv(dgrp_compare,file="dgrp_compare.csv",row.names=F)

### now validating SNPs calls from CRISP with those from GDL
### Output: gdl_compare.csv, two rows per shared site (our reference / alternate
### calls) cross-tabulated against the GDL alternate / reference calls.

gdl <- read.table(file = "../Input/GDL_lectin.ann2.vcf", header = T) ###  GDL data for lectin coding region
gdl <- gdl[!names(gdl) %in% "ZW184"] ### remove recent migrant ZW184

### process VCF to only retain genotype for each sample at each site
gdl_gt <- "" ### seed column, removed after the loop
for (i in 10:ncol(gdl)){ ### sample columns start after the nine fixed VCF columns
  gt_sub <- gdl[i]
  new <- gt_sub %>% separate(colnames(gt_sub), into =c("GT","DP","GQ"), sep = ":") ### split the GT:DP:GQ sample field
  new$GT <- gsub("0/0","R",new$GT) ### 0 is ref
  new$GT <- gsub("[0123456789]/[123456789]","A",new$GT) ### any genotype whose second allele is non-zero is alt; only biallelic sites are studied for the lectin region
  new$GT <- gsub("[^RA]+", "NA", new$GT) ### anything left over becomes the string "NA", which is in neither factor level used below
  genosample <- as.data.frame(new$GT)
  colnames(genosample) <- colnames(gt_sub)
  gdl_gt <- cbind(gdl_gt,genosample)
  
}
gdl_gt <- gdl_gt[,-1]
rownames(gdl_gt) <- gdl$POS
ncol(gdl_gt) ### 84 GDL samples in total
gdl_gt$Pos <- gdl$POS
nrow(gdl_gt[gdl_gt$Pos > 3716812 & gdl_gt$Pos < 3718140,]) ## 39 snps found in lectin coding region from GDL

fulldata_gdl <- fulldata_gt[colnames(fulldata_gt) %in% colnames(gdl_gt)] ### get GDL specific calls
fulldata_gdl <- apply(fulldata_gdl,2,as.numeric) ## convert calls to numeric; "." (not called) becomes NA
ncol(fulldata_gdl) ### 56 GDL samples analysed for lectin region analyses
sum(rowSums(fulldata_gdl, na.rm=T)>0) ### only 42 polymorphic sites in identified GDL samples

gdl_gt <- gdl_gt[gdl_gt$Pos %in% fulldata_gt$POS,] ### only select snps analysed for lectin analyses
nrow(gdl_gt) ### 29 SNPs also found in GDL calls
gdl_gt <- gdl_gt[colnames(gdl_gt) %in% c(colnames(fulldata_gdl),"Pos")] ### only keep GDL samples analysed for lectin region analyses

### for each site, compare calls from lectin region analyses with GDL calls
gdl_compare <- "" ### seed row, removed after the loop
for (s in 1:nrow(gdl_gt)){
  fulldata_gt_c <- fulldata_gt[fulldata_gt$POS %in% gdl_gt$Pos[s],colnames(fulldata_gt) %in% colnames(gdl_gt)] ### isolate each site from lectin region analyses
  fulldata_gt_c <- fulldata_gt_c[order(colnames(fulldata_gt_c))] ### order by sample name
  fulldata_gt_c <- factor(fulldata_gt_c,levels = c(0,1)) ### factor by ref or alt alleles; any other value (e.g. ".") becomes NA
  gdl_gt_c <- gdl_gt[s,1:(ncol(gdl_gt)-1)] ### only keep sample columns from GDL (the last column is Pos)
  gdl_gt_c <- gdl_gt_c[order(colnames(gdl_gt_c))] ### order samples by name
  gdl_gt_c <- factor(gdl_gt_c,levels = c("A","R"))  ### factor by alt (A) or ref (R) alleles
  ### NOTE(review): the variant name is looked up by index s in the reversed name vector -
  ### assumes ann_snpname_pos is in descending and gdl_gt in ascending position order; confirm
  gdl_compare <- rbind(gdl_compare,cbind(rev(ann_snpname_pos[names(ann_snpname_pos) %in% gdl_gt$Pos])[s],table(fulldata_gt_c,gdl_gt_c))) ### compare lectin region and GDL calls
}
gdl_compare <- gdl_compare[-c(1),]
gdl_compare <- as.data.frame(gdl_compare)
colnames(gdl_compare) <- c("Variant","Alternate_allele_GDL","Reference_allele_GDL") ### column order follows the factor levels "A", "R"
gdl_compare$Allele_DGN <- rep(c("Reference","Alternate"),nrow(gdl_gt)) ### change 0,1 to ref and alt for lectin region analyses calls
write.csv(gdl_compare,file="gdl_compare.csv",row.names=F)

### here validating upstream indel calls from CRISP with those from Sanger sequencing of DGRP lines
### Output: dgrp_indel_compare.csv, a 2 x 2 cross-tabulation per upstream indel.

DGRP_lectin_indels <- read.csv(file = "../Input/DGRP_lectin_indels.csv", header = T) ###  Lectin upstream Indel calls for DGRP from Sanger seq
colnames(DGRP_lectin_indels)[3:5] <- c("Indel7bp_X3718140","Indel8bp_X3718040","Indel21bp_X3717878") ### rename columns 3-5 to the indel ids used in fullfst2
DGRP_lectin_indels$line <- gsub("^","RAL-",DGRP_lectin_indels$line) ### convert sample name for consistency (prefix "RAL-")
colnames(DGRP_lectin_indels)[1] <- "Sample" 
fullfst2_indel <- fullfst2[colnames(fullfst2) %in% c("Sample","Indel7bp_X3718140","Indel8bp_X3718040","Indel21bp_X3717878")] ### extract indels from main lectin region calls
fullfst2_indel <- fullfst2_indel[fullfst2_indel$Sample %in% DGRP_lectin_indels$Sample,] ####  only keep samples that were genotyped by Sanger seq
DGRP_lectin_indels <- DGRP_lectin_indels[DGRP_lectin_indels$Sample %in% fullfst2_indel$Sample,] ### only keep samples analysed for lectin region analyses
dim(DGRP_lectin_indels) ### 126 samples in total
dim(fullfst2_indel)

fullfst2_indel <- fullfst2_indel[order(fullfst2_indel$Sample),] ### order lectin region called samples by name
DGRP_lectin_indels <- DGRP_lectin_indels[order(DGRP_lectin_indels$Sample),] ### order sanger sequenced samples by name
DGRP_lectin_indels <- DGRP_lectin_indels[colnames(DGRP_lectin_indels) %in% c("Sample","Indel7bp_X3718140","Indel8bp_X3718040","Indel21bp_X3717878")] ### isolate just indels of interest
indelnames <- c("name","c.-439_-433del","c.-334_-333insACATTCAT","c.-171_-151del") ### element 1 is a placeholder so that the indices line up with columns 2-4

### NOTE(review): columns 2-4 of both data frames are compared by index - assumes the
### indel columns appear in the same order (7bp, 8bp, 21bp) in fullfst2 as well; confirm
dgrp_indel_compare <- "" ### seed row, removed after the loop
for (i in 2:4){
  dgrp_indel_compare <- rbind(dgrp_indel_compare,(cbind(indelnames[i],table(fullfst2_indel[,i],DGRP_lectin_indels[,i]))))
}
dgrp_indel_compare <- dgrp_indel_compare[-c(1),]
dgrp_indel_compare <- as.data.frame(dgrp_indel_compare)
colnames(dgrp_indel_compare) <- c("Upstream_Indel","Ancestral_allele_Sanger","Derived_allele_Sanger")
### hand-written labels for the two rows of each table, in indel order;
### the order is flipped for the second indel (8bp)
dgrp_indel_compare$Allele_DGN <- c("Ancestral","Derived","Derived","Ancestral","Ancestral","Derived")
write.csv(dgrp_indel_compare,file="dgrp_indel_compare.csv",row.names=F)

### validating upstream indels, coding sequence deletion and stop codons in South African samples using Sanger sequencing calls
### Output: SD_SP_compare.csv, a 2 x 2 cross-tabulation per variant
### (our 0/1 calls in rows, Sanger "A"/"D" calls in columns).
SP_SD_Sanger_calls <- read.csv(file="../Input/SP_SD_Sanger_calls.csv")
SD_SP_lines <- as.data.frame(SP_SD_Sanger_calls$Sample)
colnames(SD_SP_lines) <- "Sample"
SD_SP_lines <- left_join(SD_SP_lines,fullfst2,by="Sample") ### our calls for the Sanger-sequenced samples, in the Sanger file's sample order
SD_SP_lines <- SD_SP_lines[grep("Sample|Indel|stop|cod",colnames(SD_SP_lines))] ### keep the sample id plus columns whose name contains Indel, stop or cod
### order columns by descending position parsed from the column names;
### names that do not reduce to a number (e.g. "Sample") give NA and are placed first by rev(order())
SD_SP_lines <- SD_SP_lines[rev(order(as.numeric(gsub("2L_|Indel.*_X|_stop_gained","",colnames(SD_SP_lines)))))]
newnames <-  ann_snpname[names(ann_snpname) %in% names(SD_SP_lines)] ### display names for the selected columns
newnames <-  newnames[rev(order(as.numeric(gsub("2L_|Indel.*_X|_stop_gained","",names(newnames)))))] ### same descending-position order as the columns
colnames(SD_SP_lines)[2:ncol(SD_SP_lines)] <- newnames

### NOTE(review): column i of SD_SP_lines is compared with column i of SP_SD_Sanger_calls -
### assumes the Sanger file lists the variants in the same order; confirm
SD_SP_compare <- "" ### seed row, removed after the loop
for (i in 2:ncol(SD_SP_lines)){
  SD_SP_compare <- rbind(SD_SP_compare,(cbind(colnames(SD_SP_lines[i]),table(factor(SD_SP_lines[,i],levels=c(0,1)),factor(SP_SD_Sanger_calls[,i],levels=c("A","D"))))))
}
SD_SP_compare <- SD_SP_compare[-c(1),]
SD_SP_compare <- as.data.frame(SD_SP_compare)

colnames(SD_SP_compare) <- c("Variant","Ancestral_allele_SD_SP_Sanger","Derived_allele_SD_SP_Sanger")
write.csv(SD_SP_compare,file="SD_SP_compare.csv",row.names=F)

#### 18. Genotype matrix for lectin-24a variants  ####

### This part builds the tree-ordered colour strip (npp) and the region legend (p2)
### that are combined with the genotype matrix further down.

tree <- read.tree("../Input/lectin_masked_rev.nwk") ###tree file based on neighbor joining of lectin coding region

cont <- list(america,ocenia,asia,europe_north_africa,east_africa,central_africa,west_africa,southern_africa) ### remake region list
names(cont) <- c("North America","Oceania","Asia","Europe & North Africa","East Africa","Central Africa","West Africa","Southern Africa") ### name region list
treecols <- brewer.pal("Set1",n=8) ### colours for regions

tree2<- groupOTU(tree, cont) ### group lectin-24a coding sequence tree by region
### tree without branch lengths
branchtree <- ggtree(tree2, aes(color=group),size=0.2, branch.length="none")+
  scale_color_manual(values=treecols)+
  guides(colour=FALSE)+
  theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) 

### tree with branch lengths
branchtreelen <- ggtree(tree2,aes(color=group),size=0.2)+
  scale_color_manual(values=treecols)+
  geom_treescale(x=0,y=700,width = 0.001)+
  guides(colour=FALSE)+
  theme(plot.margin = unit(c(0, 0, 0, 0), "cm"))
### in order to assign custom colours to tree, each sample/tip needs to be associated with a specific region colour
heatmapdata3 <- branchtree[["data"]] ### get tree data
heatmapdata3 <- cbind(heatmapdata3["label"],heatmapdata3["y"]) ## get labels and y positions (tip numbers)
heatmapdata3 <- heatmapdata3[!is.na(heatmapdata3$label),] ### drop rows without a label
### unlist() names each sample "<region><index>"; stripping the digits leaves the region name.
### Result: one row per sample (rownames) with a single column holding its region
heatmapdata <- as.data.frame(gsub("[0-9]","",names(unlist(cont))))
rownames(heatmapdata) <- unlist(cont) ##rownames as sample names
colnames(heatmapdata) <- "heatmapdata"
treecols2 <- as.data.frame(treecols) ## make tree colours into dataframe
heatmapdata2 <- heatmapdata
treecols2$heatmapdata <- sort(unique(heatmapdata2$heatmapdata)) ### assign unique colour to each region, regions taken in alphabetical order
heatmapdata2 <- left_join(heatmapdata2,treecols2,"heatmapdata") ### assign colours to df containing samples based on regions they were collected from
heatmapdata2$label <- rownames(heatmapdata)  ##label as sample names
heatmapdata3 <- left_join(heatmapdata3,heatmapdata2,"label") ## now df contains sample id, tip number, region and colour associated for each sample
heatmapdata3 <- heatmapdata3[order(heatmapdata3$y),] ### order by tip number

newcols <- as.data.frame(cbind(treecols,sort(c("North America","Oceania","Asia","Europe & North Africa","East Africa","Central Africa","West Africa","Southern Africa")))) ##also a dataframe for color bar for fill scale, grouping by region (same alphabetical pairing as above)
newcols$V2 <- factor(newcols$V2,levels=c("North America","Oceania","Asia","Europe & North Africa","East Africa","Central Africa","West Africa","Southern Africa"))
newcols <- newcols[order(newcols$V2),] ### order by region, in the display order used for the legend

### plot just to extract the fill legend
p2 <- gheatmap(branchtree, heatmapdata, width = 0, offset = 0, colnames=F, color = heatmapdata3$treecols) +
  scale_fill_manual(name = "Region",values=newcols$treecols,labels=c("North America","Oceania","Asia","Europe & North Africa","East Africa","Central Africa","West Africa","Southern Africa"))+
  guides(colour=FALSE)+
  theme(legend.position="left")

### plot just to extract the sample groupings along tree tips, colored by region
p <- gheatmap(branchtree, heatmapdata, width = 0.1, offset = 0, colnames=F, color = heatmapdata3$treecols) +
  scale_fill_manual(values=treecols)+ 
  guides(colour=FALSE)+
  theme(legend.position="left") +
  theme(plot.margin = unit(c(0, 0, 0, 0), "cm"))

### extract sample groupings along tree tips
### NOTE(review): the grobs are picked by fixed index (grobs[[6]], children[[5]]), which
### depends on the internal layout produced by the installed ggplot2/ggtree versions; confirm after upgrades
np <- grid.arrange(ggplot_gtable(ggplot_build(p))$grobs[[6]])
seg <- grid.arrange(np[["grobs"]][[1]][["children"]][[5]])
npp <- plot_grid(seg)
npp <- npp + theme(plot.margin=grid::unit(c(0,0,0,-15), "cm")) ### large negative left margin to position the extracted strip

### generating genotype matrix plotting ancestral and derived alleles for each variant
### fulldata_gt is laid out like a VCF: columns 1-9 are the fixed meta data columns
### (#CHROM ... FORMAT) and the sample genotypes start at column 10. (Column 17 is the
### first genotype column of fullfst2, not of fulldata_gt; using it here skipped the
### first seven samples.)
first_gt_col <- 10

fulldata_gt_alt2 <- fulldata_gt[fulldata_gt$ALT == gsub("AA=","",fulldata_gt$INFO),] ### extract variants where alternate allele is ancestral
fulldata_gt_alt2 <- fulldata_gt_alt2[(fulldata_gt_alt2$POS < 3717729 & fulldata_gt_alt2$POS > 3716879) | (fulldata_gt_alt2$POS %in% c("3718040")),] ### select only coding region and add upstream 8BP indel
fulldata_gt_alt2_dat <- fulldata_gt_alt2[first_gt_col:ncol(fulldata_gt_alt2)] ## remove metadata
### recode so that 0 = ancestral and 1 = derived: swap 0 and 1, using 2 as a temporary value
fulldata_gt_alt2_dat[fulldata_gt_alt2_dat == 1] <- 2
fulldata_gt_alt2_dat[fulldata_gt_alt2_dat == 0] <- 1
fulldata_gt_alt2_dat[fulldata_gt_alt2_dat == 2] <- 0
fulldata_gt_alt2[first_gt_col:ncol(fulldata_gt_alt2)] <- fulldata_gt_alt2_dat ## add back new assignments to original df
## repeat where reference allele is ancestral (no recoding needed)
fulldata_gt_ref2 <- fulldata_gt[fulldata_gt$REF == gsub("AA=","",fulldata_gt$INFO),]
fulldata_gt_ref2 <- fulldata_gt_ref2[(fulldata_gt_ref2$POS < 3717729 & fulldata_gt_ref2$POS > 3716879) | (fulldata_gt_ref2$POS %in% c(3717878,3718140)),] ### select only coding region and remaining  indels
fulldata_gt_ref2 <- fulldata_gt_ref2[fulldata_gt_ref2$POS != 3716932,] ## remove coding region indel, will be in plot based on NA values
fulldata_gt_der <- rbind(fulldata_gt_ref2,fulldata_gt_alt2) ## combine all variants
fulldata_gt_der <- fulldata_gt_der[rev(order(fulldata_gt_der$POS)),] ## order by descending position
### NOTE(review): names are attached by row position below - assumes ann_snpname_pos is
### also in descending position order; confirm
snpname <- ann_snpname_pos[names(ann_snpname_pos) %in% fulldata_gt_der$POS] ### get list of relevant variant names
fulldata_gt_der <- fulldata_gt_der[first_gt_col:ncol(fulldata_gt_der)] ## genotype columns only
fulldata_gt_der <- fulldata_gt_der[colnames(fulldata_gt_der) %in% tree2$tip.label] ## only samples that are tips of the tree
fulldata_gt_der$snpname <- snpname ## add column with variant names
fulldata_gt_der <- reshape2::melt(fulldata_gt_der,id.var="snpname")
fulldata_gt_der$variable <- factor(fulldata_gt_der$variable, levels = rev(tree2$tip.label)) ## samples ordered by reversed tree tip labels
fulldata_gt_der$snpname <- factor(fulldata_gt_der$snpname, levels = snpname)
### convert genotype calls into meaningful values
fulldata_gt_der$value <- gsub("\\.","Not called",fulldata_gt_der$value)
fulldata_gt_der$value <- gsub("NA","Not called",fulldata_gt_der$value)
fulldata_gt_der$value <- gsub("0","Ancestral",fulldata_gt_der$value)
fulldata_gt_der$value <- gsub("1","Derived",fulldata_gt_der$value)
###plot genotype matrix: samples (tree order) on y, variants on x
genotype <- ggplot(fulldata_gt_der, aes(y=variable,x=snpname,fill=value)) +
  geom_tile()+
  ylab("")+
  xlab("Variable sites")+
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(),axis.text.x = element_text(angle = 90,vjust=0.5,hjust=1))+
  scale_fill_manual(values = c("#d8b365","#2ca25f","black"))+ ### colours follow the alphabetical order Ancestral, Derived, Not called
  guides(fill=guide_legend(title=NULL)) +
  theme(plot.margin = unit(c(0, 0, 0, 0), "cm"))

##add sample colouring and genotype matrix together (aligned once, then both panels reused)
aligned_panels <- align_plots(npp, genotype, align = "hv",axis= "tb")
alignedplots3 <- plot_grid(aligned_panels[[1]],aligned_panels[[2]],ncol=2,rel_widths = c(0.1,1))
alignedplots3 <- alignedplots3+
  theme(plot.margin = unit(c(0, 0, 0, 1), "cm"))

##add fill legend, sample colouring and genotype matrix together
### printed explicitly so the PDF is also written when the script is run via source()
pdf("genotype.pdf",height=6.5,width=8, onefile=FALSE)
print(plot_grid(get_legend(p2),alignedplots3,rel_widths = c(3,10)))
dev.off()

### ASIDE: code used for tree generation, not run here
## no need to run this again; the model test indicated JC as the best-fitting model
#lectin_dpgp_reve <- read.dna(file = "lectin_dpgp_reve2.fas",format="fasta")
#lectin_dpgp_reve_phyDat <- phyDat(lectin_dpgp_reve, type = "DNA", levels = NULL)
#mt <- modelTest(lectin_dpgp_reve_phyDat)
#print(mt)
#dna_dist <- dist.ml(lectin_dpgp_reve_phyDat, model="JC69")


#### 19. Pairwise linkage disequilibrium between lectin-24a variants  ####

### Sample list (first column of the Popover3 file); original note: only samples from populations with more than three lines
Popover3 <- read.table(file = "../Input/Popover3")[, 1]

### VCF prepared for LD analysis; original note: indels are not accommodated, so indel calls were recoded as SNPs
vcf <- readVcf("../Input/lectin_masked.vcf", "dm3")
lectin_snpmartix <- genotypeToSnpMatrix(vcf) ### variant matrix
gen <- lectin_snpmartix$genotypes ### genotype component of the variant matrix

### name the variants by position only (strip the "2L:" prefix) and keep the numeric positions
colnames(gen) <- gsub("2L:", "", as.character(rowRanges(vcf)))
ind <- as.numeric(colnames(gen))

### replace the position ids with variant names
### NOTE(review): this is a positional assignment - it assumes the VCF holds exactly the variants of
### ann_snpname_pos, in the reverse of its order; confirm
colnames(gen) <- rev(ann_snpname_pos)

### sample sets to analyse: two single populations (also used for the iHS analysis) and all samples together
all <- Popover3
zi <- as.character(read.table("../Input/ZI_POP")[[1]])
ral <- as.character(read.table("../Input/RAL_POP")[[1]])

pop <- list("Siavonga, Zambia" = zi, "Raleigh, USA" = ral, "All Samples" = all)

### Generate one LD tile plot per sample set in pop
### `a` and `LDmatrix` are deliberately left as plain loop variables: the legend plot built after the
### loop reuses their values from the final iteration
ldplots2 <- list()
for (a in seq_along(pop)){
  pop_samples <- as.character(pop[[a]])
  ### pairwise LD for the rows of gen that belong to this sample set
  pstLDHP2 <- LDheatmap(gen[rownames(gen) %in% pop_samples,], distances = "physical", add.map = TRUE, color = heat.colors(20), flip = FALSE)
  ### long format (Var1, Var2, value); reshape2::melt is called explicitly because data.table, attached
  ### later than reshape2, masks melt() and only handles a matrix through a deprecated redirect
  LDmatrix <- reshape2::melt(as.matrix(pstLDHP2$LDmatrix))
  LDmatrix <- na.omit(LDmatrix) ### drop the cells LDheatmap leaves as NA
  ### tile plot, axes ordered by ann_snpname_pos; n in the title counts set members present in Popover3
  ldplots2[[a]] <- ggplot(data = LDmatrix, aes(factor(Var1, levels = ann_snpname_pos), factor(Var2, levels = ann_snpname_pos), fill = value)) +
    geom_tile(color = "white") +
    scale_fill_gradient2(low = "blue", high = "red", mid = "yellow",
                         midpoint = 0.5, limits = c(0, 1), name = "Linkage Disequilibrium") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, vjust = 1,
                                     size = 10, hjust = 1)) +
    ylab("") +
    xlab("") +
    theme(legend.position = "none") +
    ggtitle(paste0(names(pop)[a], ", n=", length(which(pop_samples %in% Popover3))))

}

### Same plot once more, this time keeping the fill legend so it can be extracted and placed next to
### the per-population plots. Uses LDmatrix and `a` as left by the last iteration of the loop above.
legend_title <- paste(names(pop)[a], ", n=", length(which(as.character(pop[[a]]) %in% Popover3)), sep = "")
legend_fill_scale <- scale_fill_gradient2(
  low = "blue", high = "red", mid = "yellow",
  midpoint = 0.5, limits = c(0, 1), name = "LD"
)
forlegend <- ggplot(
  data = LDmatrix,
  aes(factor(Var1, levels = ann_snpname_pos), factor(Var2, levels = ann_snpname_pos), fill = value)
) +
  geom_tile(color = "white") +
  legend_fill_scale +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 1, size = 9, hjust = 1)) +
  coord_fixed() +
  ylab("") +
  xlab("") +
  theme(plot.margin = unit(c(0, 0, -2, 0), "cm")) +
  ggtitle(legend_title)

### Stack the per-population LD plots in one column and write them, with the shared legend, to ldplot.pdf
ldplot <- do.call("plot_grid", c(ldplots2, ncol = 1))
pdf("ldplot.pdf", height = 30, width = 12)
### print() is explicit so the page is drawn even when the script is run through source()
print(plot_grid(ldplot, get_legend(forlegend), ncol = 2, rel_widths = c(1, 0.1)))
dev.off()


#### 20. Pairwise Fst across lectin-24a coding region  ####

### Pairwise weighted Fst (calculated with vcftools): V1 and V2 hold the two regions, V3 the Fst value
pairwise_weighted_fst <- read.table(file="../Input/pairwise_weighted_fst")

### region codes as they appear in the input file, their display names, and the plotting order
wfst_codes <- c("asia","ocenia","america","europe_north_africa","west_africa","central_africa","east_africa","southern_africa")
wfst_names <- c("Asia","Oceania","North America","Europe & North Africa","West Africa","Central Africa","East Africa","Southern Africa")
wfst_order <- c("North America","Oceania","Asia","Europe & North Africa","East Africa","Central Africa","West Africa","Southern Africa")
for (wfst_col in c("V1","V2")) {
  wfst_renamed <- mapvalues(pairwise_weighted_fst[[wfst_col]], from=wfst_codes, to=wfst_names)
  pairwise_weighted_fst[[wfst_col]] <- factor(wfst_renamed, levels=wfst_order)
}
pairwise_weighted_fst2 <- pairwise_weighted_fst ### untouched copy; swaps below are decided and filled from it

### Manually swap V1 and V2 for selected comparisons so that regions are grouped by differentiation:
### every row whose V1 is Southern Africa, plus the listed (V1, V2) pairs
wfst_swap_pairs <- list(
  c("Central Africa","East Africa"),
  c("West Africa","East Africa"),
  c("Asia","North America"),
  c("Oceania","North America"),
  c("Asia","Oceania")
)
wfst_swap <- pairwise_weighted_fst2$V1 %in% "Southern Africa"
for (wfst_pair in wfst_swap_pairs) {
  wfst_swap <- wfst_swap | (pairwise_weighted_fst2$V1 %in% wfst_pair[1] & pairwise_weighted_fst2$V2 %in% wfst_pair[2])
}
pairwise_weighted_fst$V1[wfst_swap] <- pairwise_weighted_fst2$V2[wfst_swap]
pairwise_weighted_fst$V2[wfst_swap] <- pairwise_weighted_fst2$V1[wfst_swap]

### Multi-line axis labels, listed in plotting order
wfst_multiline <- c("North \nAmerica","Oceania","Asia","Europe & \nNorth Africa","East \nAfrica","Central \nAfrica","West \nAfrica","Southern \nAfrica")
pairwise_weighted_fst$V1 <- mapvalues(pairwise_weighted_fst$V1, from=wfst_order, to=wfst_multiline)
pairwise_weighted_fst$V2 <- mapvalues(pairwise_weighted_fst$V2, from=wfst_order, to=wfst_multiline)
### the V1 axis omits the Southern Africa level (the last label); the V2 axis keeps all eight
pairwise_weighted_fst$V1 <- factor(pairwise_weighted_fst$V1, levels=wfst_multiline[1:7])
pairwise_weighted_fst$V2 <- factor(pairwise_weighted_fst$V2, levels=wfst_multiline)

pairwise_weighted_fst$V3 <- pmax(pairwise_weighted_fst$V3, 0) ## negative fst values are set to 0

### Pairwise weighted Fst heatmap with the rounded value printed in each tile, written to "pairwise fst.pdf"
pdf("pairwise fst.pdf",height=3.2,width=6.7)
### print() is explicit: a ggplot is only drawn by top-level auto-printing, which does not happen when
### the script is run through source(), and the PDF would then contain no page
print(
  ggplot(data= pairwise_weighted_fst,aes(x=V2,y=V1,fill=V3))+
    geom_tile()+
    geom_text(aes(label=round(V3,3)))+
    scale_fill_viridis_c(name="Weighted Fst",option="plasma")+ ### legend title passed by name rather than by position
    ylab("")+
    xlab("")+
    theme_minimal()+
    theme(axis.text.y = element_text(hjust=0.5))
)
dev.off()

### Same processing for the unweighted (mean) Fst: V1 and V2 hold the two regions, V3 the Fst value
pairwise_mean_fst <- read.table(file="../Input/pairwise_mean_fst")

### region codes as they appear in the input file and their display names (also the plotting order here)
mfst_codes <- c("asia","ocenia","america","europe_north_africa","west_africa","central_africa","east_africa","southern_africa")
mfst_names <- c("Asia","Oceania","North America","Europe & North Africa","West Africa","Central Africa","East Africa","Southern Africa")
for (mfst_col in c("V1","V2")) {
  mfst_renamed <- mapvalues(pairwise_mean_fst[[mfst_col]], from=mfst_codes, to=mfst_names)
  pairwise_mean_fst[[mfst_col]] <- factor(mfst_renamed, levels=mfst_names)
}
pairwise_mean_fst2 <- pairwise_mean_fst ### untouched copy; swaps below are decided and filled from it

### swap V1 and V2 where V1 is Southern Africa, and for the Central Africa / West Africa comparison
mfst_swap <- pairwise_mean_fst2$V1 %in% "Southern Africa" |
  (pairwise_mean_fst2$V1 %in% "Central Africa" & pairwise_mean_fst2$V2 %in% "West Africa")
pairwise_mean_fst$V1[mfst_swap] <- pairwise_mean_fst2$V2[mfst_swap]
pairwise_mean_fst$V2[mfst_swap] <- pairwise_mean_fst2$V1[mfst_swap]

### multi-line axis labels; the V1 axis omits the Southern Africa level (the last label)
mfst_multiline <- c("Asia","Oceania","North \nAmerica","Europe & \nNorth Africa","West \nAfrica","Central \nAfrica","East \nAfrica","Southern \nAfrica")
pairwise_mean_fst$V1 <- mapvalues(pairwise_mean_fst$V1, from=mfst_names, to=mfst_multiline)
pairwise_mean_fst$V2 <- mapvalues(pairwise_mean_fst$V2, from=mfst_names, to=mfst_multiline)
pairwise_mean_fst$V1 <- factor(pairwise_mean_fst$V1, levels=mfst_multiline[1:7])
pairwise_mean_fst$V2 <- factor(pairwise_mean_fst$V2, levels=mfst_multiline)

pairwise_mean_fst$V3 <- pmax(pairwise_mean_fst$V3, 0) ## negative fst values are set to 0

### heatmap object for the mean Fst (stored, not drawn here)
mean_fst_plot <- ggplot(data=pairwise_mean_fst, aes(x=V2, y=V1, fill=V3)) +
  geom_tile() +
  scale_fill_viridis_c(name="Mean Fst", option="plasma") +
  ylab("") +
  xlab("") +
  theme_minimal() +
  theme(axis.text.y = element_text(hjust=0.5))


#### 21. Ancestral state identification for 200kb surrounding lectin-24a ####

### MSA for 200kb surrounding lectin-24a in three drosophila species; rows 1-3 are used as
### D. melanogaster, D. sechellia and D. simulans respectively
fa <- read.FASTA(file = "../Input/dmel_dsech_dsim_mafft_aln.fasta",type="DNA")
fa_dmel <- as.matrix(fa)[1,]
fa_dsech <- as.matrix(fa)[2,]
fa_dsim <- as.matrix(fa)[3,]
### SNPs in the 200kb region; original note: this file contains no indels, including the ones in lectin
lectin_200kb_region_id_hap <- read.table(file="../Input/lectin_200kb_region_id_hap.vcf")
### alignment column of each SNP; original note: 3616811 is where the alignment starts
### NOTE(review): this treats position 3616811 + k as alignment column k and assumes gaps in the
### alignment do not shift the coordinates - confirm against the alignment
vcfpos <- lectin_200kb_region_id_hap$V2-3616811
### one row per SNP: position, reference and alternate allele, and the base of each species at that column
dmel_dsech_dsim <- as.data.frame(cbind(
  lectin_200kb_region_id_hap$V2,
  lectin_200kb_region_id_hap$V4,
  lectin_200kb_region_id_hap$V5,
  unname(as.character(as.matrix(fa_dmel)[c(vcfpos)])),
  unname(as.character(as.matrix(fa_dsech)[c(vcfpos)])),
  unname(as.character(as.matrix(fa_dsim)[c(vcfpos)]))
), stringsAsFactors = FALSE) ### explicit, so the columns are character in every R version
colnames(dmel_dsech_dsim) <- c("Pos","Ref","Alt","dmel","dsech","dsim")
### convert calls to uppercase
dmel_dsech_dsim$dmel <- toupper(dmel_dsech_dsim$dmel)
dmel_dsech_dsim$dsech <- toupper(dmel_dsech_dsim$dsech)
dmel_dsech_dsim$dsim <- toupper(dmel_dsech_dsim$dsim)
### keep only the first listed alternate allele
dmel_dsech_dsim$Alt <- gsub(",.*","", dmel_dsech_dsim$Alt)

### For each site, the allele (reference or alternate) that is seen in D. sechellia or D. simulans is
### recorded as ancestral ("AA=<allele>"). Sites where both alleles, or neither, are seen stay NA.
### seq_len() keeps the loop from running on an empty table; && is the scalar operator an if() needs.
dmel_dsech_dsim$ancestral <- rep(NA_character_, nrow(dmel_dsech_dsim))
for (i in seq_len(nrow(dmel_dsech_dsim))){
  outgroup_calls <- c(dmel_dsech_dsim$dsech[i],dmel_dsech_dsim$dsim[i])
  ref_in_outgroup <- dmel_dsech_dsim$Ref[i] %in% outgroup_calls
  alt_in_outgroup <- dmel_dsech_dsim$Alt[i] %in% outgroup_calls
  if (ref_in_outgroup && !alt_in_outgroup){
    ### only the reference call is found in the other species
    dmel_dsech_dsim$ancestral[i] <- paste0("AA=",dmel_dsech_dsim$Ref[i])
  } else if (alt_in_outgroup && !ref_in_outgroup){
    ### only the alternate call is found in the other species
    dmel_dsech_dsim$ancestral[i] <- paste0("AA=",dmel_dsech_dsim$Alt[i])
  }
}
lectin_200kb_region_id_hap$V8 <- dmel_dsech_dsim$ancestral ### ancestral assignment goes into column V8
lectin_200kb_region_id_hap_anc <- lectin_200kb_region_id_hap[!is.na(lectin_200kb_region_id_hap$V8),] ### remove variants without ancestral state assignments

### Lectin region VCF; it is read only to pick up the indel records, which are then merged into the 200kb SNP set
lectin_200kb_region_id_hap_withindel <- read.table(file="../Input/lectin_masked_hap.vcf")
is_indel_record <- grepl("indel", lectin_200kb_region_id_hap_withindel[,3]) ### third column contains "indel"
indels <- lectin_200kb_region_id_hap_withindel[is_indel_record,]
indels$V2[1] <- 3717081 ### change to correct indel position for coding indel

### append the indels to the SNPs that have an ancestral state, then sort by position
lectin_200kb_region_id_hap_withindel_anc <- rbind(lectin_200kb_region_id_hap_anc, indels)
position_order <- order(lectin_200kb_region_id_hap_withindel_anc$V2)
lectin_200kb_region_id_hap_withindel_anc <- lectin_200kb_region_id_hap_withindel_anc[position_order,]

### any position that occurs more than once is dropped entirely (every record at that position)
repeated_positions <- lectin_200kb_region_id_hap_withindel_anc$V2[duplicated(lectin_200kb_region_id_hap_withindel_anc$V2)]
is_repeated <- lectin_200kb_region_id_hap_withindel_anc$V2 %in% repeated_positions
lectin_200kb_region_id_hap_withindel_anc <- lectin_200kb_region_id_hap_withindel_anc[!is_repeated,]

### output vcf file, will be used for iHS analyses after subsetting populations with vcftools
write.table(
  lectin_200kb_region_id_hap_withindel_anc,
  file = "lectin_200kb_region_id_hap_withindel_anc",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE,
  sep = "\t"
)


#### 22. Lectin vcf without region based filtering ####

### Original note: this part builds a lectin VCF from all samples, without lectin-region based sample
### filtering, for getting Fst estimates

dpgpv_new <- dpgpv_new_allsamples
ann_snpname <- ann_snpname_allsamples

### Drop samples with more than 50% missing calls over columns 17 onwards
### (the fractions are named by row name, so row names are expected to match the Sample column)
genotype_block <- dpgpv_new[,17:ncol(dpgpv_new)]
missing_fraction <- rowSums(is.na(genotype_block))/ncol(genotype_block)
mostly_missing <- names(which(missing_fraction > 0.5))
dpgpv_new <- dpgpv_new[!dpgpv_new$Sample %in% mostly_missing,]

### Keep populations with more than three samples, i.e. at least four (original note: 35 populations retained)
population_sizes <- table(dpgpv_new$Population)
large_populations <- names(which(population_sizes > 3))
dpgpv_new <- dpgpv_new[dpgpv_new$Population %in% large_populations,]

#### SNP allele frequency based filtering

### A variant column is retained when it has 2 or 3 observed alleles and, in at least two populations,
### the frequency of the allele in the second column of the population x allele table is not
### below 0.05 or above 0.95.
### NOTE(review): a population with no calls at a site gives 0/0 = NaN, which na.rm drops from the
### near-fixed count, so such a population counts towards the "at least two" - confirm this is intended.
population_count <- length(table(dpgpv_new$Population))
retain_column <- logical(ncol(dpgpv_new))
for (i in 17:ncol(dpgpv_new)){
  allele_counts <- table(dpgpv_new$Population,dpgpv_new[,i]) ### computed once per variant
  allele_number <- ncol(allele_counts)
  ### only working with SNPs with maximum of three alleles, including reference call
  if (allele_number > 1 && allele_number < 4){
    second_allele_freq <- allele_counts[,2]/rowSums(allele_counts)
    near_fixed <- sum(second_allele_freq < 0.05 | second_allele_freq > 0.95, na.rm=TRUE)
    if (population_count - near_fixed > 1){
      retain_column[i] <- TRUE
    }
  }
}
### first 16 columns followed by the retained variants
tmpnew <- cbind(dpgpv_new[1:16],dpgpv_new[which(retain_column)])
### the last column of the unfiltered table is always appended
dpgpv_new <- cbind(tmpnew,dpgpv_new[ncol(dpgpv_new)])
#### Transpose to variants x samples and recover chromosome and position from the variant ids
dpgpvt <- as.data.frame(t(dpgpv_new[,17:ncol(dpgpv_new)]))
### ids are split on "_" into three parts; only the first two are used
id_parts <- colsplit(rownames(dpgpvt),"_",names=c("Chr","Pos","c"))
### ids containing "ndel" (the indel ids) are assigned to chromosome 2L
dpgpvt$Chr <- gsub(".*ndel.*","2L",id_parts$Chr)
### positions are numeric after removing the "X" that prefixes indel positions
dpgpvt$Pos <- as.numeric(gsub("X","",id_parts$Pos))
### ids that yield no position are given 3718040
### NOTE(review): the original comment here read "remove 8bp indel that was called using unmodified
### reference", but this line only fills in a position - confirm the intent
dpgpvt$Pos[is.na(dpgpvt$Pos)] <- 3718040

### Rebuild the variant name lookup for the filtered variants
### Original notes: positions of SNPs upstream of the 21 BP indel are shifted to account for that indel;
### this is not needed for the 8BP indel since the insertion is the derived state and the 7BP indel is
### the most upstream variant
ann_snpname <- ann_snpname[names(ann_snpname) %in% colnames(dpgpv_new)] ### only variants that survived filtering
variant_position <- as.numeric(gsub("_.*","",gsub("2L_","",names(ann_snpname))))
ann_snpname_num <- 3717728-variant_position ### coordinate relative to 3717728
### NOTE(review): a variant at exactly -171 falls into neither group below - confirm none exists
beyond_indel <- ann_snpname_num<(-171)
ann_snpname2 <- ann_snpname[ann_snpname_num>(-171)] ### those downstream of 21BP indel
ann_snpname3 <- ann_snpname[beyond_indel] ### those upstream of 21BP indel
ann_snpname_num <- ann_snpname_num[beyond_indel]-21 ### renumber the variants upstream of the 21 BP indel
### new name = "c." + shifted coordinate + whatever follows the last digit of the old name
name_suffix <- gsub(".*[0-9]","",ann_snpname3)
ann_snpname4 <- paste("c.",ann_snpname_num,name_suffix,sep="")
names(ann_snpname4) <- names(ann_snpname3)

### add indel names to naming list, 3 upstream indels and the coding indel
extranames <- setNames(
  c("c.-439_-433del","c.-334_-333insACATTCAT","c.-171_-151del","p.Phe217_Glu273del*"),
  c("Indel7bp_X3718140","Indel8bp_X3718040","Indel21bp_X3717878","Indel_X3716932")
)
ann_snpname <- c(ann_snpname2,ann_snpname4,extranames) ### snp names by id

### same names keyed by position, highest position first
### NOTE(review): the pattern contains empty alternatives ("||"); it is kept exactly as written
ann_snpname_pos <- ann_snpname
names(ann_snpname_pos) <- as.numeric(gsub("2L_||.*_X||_.*","",names(ann_snpname)))
ascending_by_position <- order(as.numeric(names(ann_snpname_pos)))
ann_snpname_pos <- rev(ann_snpname_pos[ascending_by_position]) ### snp names by position


#### Call 171 BP coding deletion in DPGP samples

### A sample is called as carrying the deletion when every SNP inside the deleted region is uncalled (NA).
### Author's notes, kept from the original:
###  - pindel also calls the deletion in every sample called this way
###  - pindel additionally calls deletions in ~20-25 samples that have confident non-NA calls in the region;
###    manual inspection of the BAM files does not support those, so they are ignored
### NOTE(review): the original comments quote 93, 97 and 88 samples for the same set - confirm the number

### pindel calls: one row per sample in which pindel called a deletion
pindel_calls <- read.csv(file="../Input/pindel_calls.csv")
pindel_calls$Sample <- gsub("_HE","",gsub("-HE","",pindel_calls$Sample)) ### strip "-HE" then "_HE" so names are compatible
pindel_calls$GT <- 1 ### since file only contains samples where deletion was called
pindel_calls <- pindel_calls[!duplicated(pindel_calls$Sample),] ### first entry per sample (original note: samples with multiple SRA accessions)

### SNPs located inside the deleted region (original note: 9 SNPs total)
inside_deletion <- dpgpvt$Pos < 3717081 & dpgpvt$Pos > 3716910
dpgpvt_deleteion <- dpgpvt[inside_deletion,]
dpgpvt_deleteion[is.na(dpgpvt_deleteion)] <- "NA" ### missing calls become the string "NA" so they can be counted
region_snp_count <- nrow(dpgpvt_deleteion)
sample_count <- ncol(dpgpvt_deleteion)-2 ### the last two columns are Chr and Pos
Indel_X3716932 <- character(sample_count)
for (n in seq_len(sample_count)){
  uncalled <- sum(dpgpvt_deleteion[,n]=="NA")
  if (uncalled==region_snp_count){
    ### every SNP in the region is uncalled: deletion
    Indel_X3716932[n] <- 1
  } else if (uncalled<region_snp_count){
    ### at least one SNP in the region is called: no deletion
    Indel_X3716932[n] <- 0
  } else {
    Indel_X3716932[n] <- NA
  }
}
### append Chr and Pos for the new variant row
### original note: the position where the deletion begins is actually 3716904, this is corrected below
Indel_X3716932 <- c(Indel_X3716932,"2L","3716932")
dpgpvt <- rbind(dpgpvt,Indel_X3716932)
rownames(dpgpvt)[nrow(dpgpvt)] <- "Indel_X3716932"
names(Indel_X3716932) <- colnames(dpgpvt)

### deletion calls as a data frame keyed by sample, joined to the pindel calls for comparison
Indel_X3716932_df <- as.data.frame(Indel_X3716932)
Indel_X3716932_df$Sample <- rownames(Indel_X3716932_df)
Indel_X3716932_df <- left_join(Indel_X3716932_df,pindel_calls,by="Sample")
### cross-tabulation of the two approaches (GT is 1 where pindel called a deletion)
table(Indel_X3716932_df$Indel_X3716932,Indel_X3716932_df$GT)


####  Mask regions with ibd, admixture and heterozygosity, from DPGP analyses

### regions for masking obtained from DPGP paper
### columns as used below: V1 sample, V2 chromosome, V3 tract start, V4 tract end
ibd <- read.table(file = "../Input/ibd_filter_tracts.txt",sep="\t")
admix <- read.table(file = "../Input/admixture_filter_tracts.txt",sep="\t")
het <- read.table(file = "../Input/het_filter_tracts.txt",sep="\t")
fil <- rbind(ibd,admix,het)
fil$V2 <- gsub("Chr","",fil$V2) ### chromosome names without the "Chr" prefix, as in dpgpvt$Chr

### For every sample, genotypes at sites that fall inside one of that sample's filter tracts
### (same chromosome, start <= position <= end) are replaced with NA.
### Sites are flagged directly against each tract, so a sample whose tracts all lie on chromosomes
### that have no sites here is simply left unchanged instead of producing an empty replacement.
sample_count <- ncol(dpgpvt)-2 ### the last two columns are Chr and Pos
fulldata <- dpgpvt[seq_len(sample_count)] ### genotypes only; masked in place below
site_chr <- dpgpvt$Chr
site_pos <- as.numeric(dpgpvt$Pos)
for (s in seq_len(sample_count)){
  mask <- fil[fil$V1 %in% colnames(dpgpvt)[s],] ### tracts to mask for this sample
  if (nrow(mask) == 0){
    next ### nothing to mask: the genotypes stay as copied from dpgpvt
  }
  in_tract <- rep(FALSE,nrow(dpgpvt))
  for (j in seq_len(nrow(mask))){
    tract_start <- as.numeric(mask$V3[j])
    tract_end <- as.numeric(mask$V4[j])
    in_tract <- in_tract | (site_chr %in% mask$V2[j] & site_pos >= tract_start & site_pos <= tract_end)
  }
  fulldata[which(in_tract),s] <- NA
}

#### Create new lectin-24A genotype matrix with masked regions

### samples x variants version of the masked genotypes
dpgpvt <- as.data.frame(t(fulldata))
### masked genotypes with Chr, Pos and annotation split out of the variant ids
fulldata2 <- cbind(colsplit(rownames(fulldata),"_",names=c("Chr","Pos","ann")), fulldata)
fulldata2$Pos <- as.numeric(fulldata2$Pos)

### more informative ids for indels, substituted in this order
indel_id_map <- c(
  "Indel7bp_X3718140" = "2L_3718140_upstream_7BP_indel",
  "Indel8bp_X3718040" = "2L_3718040_upstream_8BP_indel",
  "Indel21bp_X3717878" = "2L_3717878_upstream_21BP_indel",
  "Indel_X3716932" = "2L_3716932_coding_indel"
)
for (old_indel_id in names(indel_id_map)){
  rownames(fulldata) <- gsub(old_indel_id,indel_id_map[[old_indel_id]],rownames(fulldata))
}

### VCF-like table: CHROM, POS and ID from the ids, followed by one column per sample
fulldata_gt <- cbind(colsplit(rownames(fulldata),"_",names=c("CHROM","POS","ID")), fulldata)
second_to_last <- nrow(fulldata_gt)-1
fulldata_gt <- fulldata_gt[-second_to_last,] ### the second-to-last row is dropped
fulldata_gt$comb <- paste(fulldata_gt$CHROM,fulldata_gt$POS) ### join key: chromosome and position

### per-site meta data: first nine columns of the original SNP VCF
metanew <- dataChunk[1:9]
metanew$V9 <- "GT" ### vcf specification that data only has genotype (GT)
metanew$V8 <- "." ### vcf specification that will be later used for ancestry status
metanew$comb <- paste(metanew$V1,metanew$V2)

### keep every row of fulldata_gt and attach the meta data where the key matches
fulldata_gt <- right_join(metanew,fulldata_gt,by="comb")
fulldata_gt$V3 <- fulldata_gt$ID ### vcf specification that will be later used for ID
fulldata_gt <- fulldata_gt[-c(10:13)] ### drop the key and the CHROM/POS/ID helper columns
colnames(fulldata_gt)[1:9] <- c("#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT")
fulldata_gt$FORMAT <- "GT"
fulldata_gt$`#CHROM` <- "2L" ### all have same chromosome

### manually add indels to vcf: the last four rows are filled in as the four indels, in this order
last_row <- length(fulldata_gt$POS)
indel_rows <- (last_row-3):last_row
fulldata_gt$POS[indel_rows] <- c(3718140,3718040,3717878,3716932)
fulldata_gt$REF[indel_rows] <- c("A","T","T","G") ### random placeholder annotations for indels
fulldata_gt$ALT[indel_rows] <- c("GAGAACG","ATGAATGT","TGATCCCAGATAGCCTTATTT","TTTTTT") ### random placeholder annotations for indels, size is correct for upstream indels
fulldata_gt$FILTER[indel_rows] <- "PASS" ### vcf specification arbitrary filter
fulldata_gt$QUAL[indel_rows] <- "10000" ### vcf specification arbitrary quality
fulldata_gt[is.na(fulldata_gt)] <- "." ### replace NA with .
fulldata_gt <- fulldata_gt[order(fulldata_gt$POS),] ### order vcf by position
####  Coding ancestral state for variants

### MSA for 200kb surrounding lectin-24a in three drosophila species
fa <- read.FASTA(file = "../Input/dmel_dsech_dsim_mafft_aln.fasta",type="DNA")
fa_dmel <- as.matrix(fa)[1,] ## D. melanogaster
fa_dsech <- as.matrix(fa)[2,] ## D. sechellia
fa_dsim <- as.matrix(fa)[3,] ## D. simulans
fulldata_gt$POS <- as.numeric(fulldata_gt$POS)
### alignment column of each variant; original note: 3616811 is where the alignment starts
### NOTE(review): this treats position 3616811 + k as alignment column k and assumes gaps in the
### alignment do not shift the coordinates - confirm against the alignment
vcfpos <- fulldata_gt$POS-3616811
### one row per variant: position, reference and alternate allele, and the base of each species at that column
dmel_dsech_dsim <- as.data.frame(cbind(
  fulldata_gt$POS,
  fulldata_gt$REF,
  fulldata_gt$ALT,
  unname(as.character(as.matrix(fa_dmel)[c(vcfpos)])),
  unname(as.character(as.matrix(fa_dsech)[c(vcfpos)])),
  unname(as.character(as.matrix(fa_dsim)[c(vcfpos)]))
), stringsAsFactors = FALSE) ### explicit, so the columns are character in every R version
colnames(dmel_dsech_dsim) <- c("Pos","Ref","Alt","dmel","dsech","dsim")
### convert calls to uppercase
dmel_dsech_dsim$dmel <- toupper(dmel_dsech_dsim$dmel)
dmel_dsech_dsim$dsech <- toupper(dmel_dsech_dsim$dsech)
dmel_dsech_dsim$dsim <- toupper(dmel_dsech_dsim$dsim)
### for multiallelic sites only the first listed alternate allele is analysed
dmel_dsech_dsim$Alt <- gsub(",.*","", dmel_dsech_dsim$Alt)

### For each site, the allele (reference or alternate) that is seen in D. sechellia or D. simulans is
### recorded as ancestral ("AA=<allele>"). Sites where both alleles, or neither, are seen stay NA.
### seq_len() keeps the loop from running on an empty table; && is the scalar operator an if() needs.
dmel_dsech_dsim$ancestral <- rep(NA_character_, nrow(dmel_dsech_dsim))
for (i in seq_len(nrow(dmel_dsech_dsim))){
  outgroup_calls <- c(dmel_dsech_dsim$dsech[i],dmel_dsech_dsim$dsim[i])
  ref_in_outgroup <- dmel_dsech_dsim$Ref[i] %in% outgroup_calls
  alt_in_outgroup <- dmel_dsech_dsim$Alt[i] %in% outgroup_calls
  if (ref_in_outgroup && !alt_in_outgroup){
    ### only the reference call is found in the other species
    dmel_dsech_dsim$ancestral[i] <- paste0("AA=",dmel_dsech_dsim$Ref[i])
  } else if (alt_in_outgroup && !ref_in_outgroup){
    ### only the alternate call is found in the other species
    dmel_dsech_dsim$ancestral[i] <- paste0("AA=",dmel_dsech_dsim$Alt[i])
  }
}
fulldata_gt$INFO <- dmel_dsech_dsim$ancestral ### adding ancestral status to info column in vcf

### turn the placeholder indel alternates into single bases, as some programs don't handle indel calls
fulldata_gt$ALT <- gsub("TTTTTT","T",fulldata_gt$ALT)
fulldata_gt$ALT <- gsub("TGATCCCAGATAGCCTTATTT","A",fulldata_gt$ALT)
fulldata_gt$ALT <- gsub("ATGAATGT","A",fulldata_gt$ALT)
fulldata_gt$ALT <- gsub("GAGAACG","G",fulldata_gt$ALT)

### manually code in the ancestral states for indels
### indexing the INFO column directly means an indel id that is absent is skipped rather than raising an error
indel_ancestral <- c(
  coding_indel = "AA=G",
  upstream_21BP_indel = "AA=T",
  upstream_8BP_indel = "AA=A",
  upstream_7BP_indel = "AA=A"
)
for (indel_id in names(indel_ancestral)){
  fulldata_gt$INFO[fulldata_gt$ID %in% indel_id] <- indel_ancestral[[indel_id]]
}
fulldata_gt <- fulldata_gt[!is.na(fulldata_gt$INFO),] ### remove sites without ancestral state classification

####  Post-mask repeat SNP and individual filtering

### Keep only the indel columns and the SNP columns still present in the processed lectin VCF table
### (i.e. those with an ancestral state)
retained_variant_ids <- c(
  "Indel7bp_X3718140","Indel8bp_X3718040","Indel21bp_X3717878","indelhaplotype","Indel_X3716932",
  paste(fulldata_gt$`#CHROM`,fulldata_gt$POS,fulldata_gt$ID,sep="_")
)
dpgpvt <- dpgpvt[colnames(dpgpvt) %in% retained_variant_ids]

### Put the first 16 columns of dpgpv (meta data with the population information) back in front
### NOTE(review): cbind pairs rows by position, so the matching dpgpv rows must be in the same
### order as the rows of dpgpvt - confirm
sample_meta <- dpgpv[dpgpv$Sample %in% rownames(dpgpvt),1:16]
dpgpv2 <- cbind(sample_meta,dpgpvt)
fullfst2 <- dpgpv2
rownames(fullfst2) <- fullfst2$Sample

### Keep populations with more than three samples, i.e. at least four (original note: 26 populations retained)
population_sizes <- table(fullfst2$Population)
fullfst2 <- fullfst2[fullfst2$Population %in% names(which(population_sizes > 3)),]

### Compare the coding deletion calls with pindel
### NOTE(review): the original comment here said "165BP" while the calling section says 171 BP - confirm
pindel_calls <- read.csv(file="../Input/pindel_calls.csv") ### samples in which pindel called a deletion
pindel_calls$Sample <- gsub("_HE","",gsub("-HE","",pindel_calls$Sample)) ### strip "-HE" then "_HE" so names are compatible
pindel_calls$GT <- 1 ### since file only contains samples where deletion was called
pindel_calls <- pindel_calls[!duplicated(pindel_calls$Sample),] ### first entry per sample (original note: samples with multiple SRA accessions)
fullfst3 <- left_join(fullfst2,pindel_calls,by="Sample") ### genotype matrix with the pindel call attached
table(fullfst3$Indel_X3716932,fullfst3$GT) ### original note: 1 sample has SNPs called within a deletion called by pindel

### Samples called as no-deletion here but as deletion by pindel are removed from the analyses
discordant_samples <- fullfst3[fullfst3$Indel_X3716932 %in% 0 & fullfst3$GT %in% 1,]$Sample
fullfst2 <- fullfst2[!fullfst2$Sample %in% discordant_samples,]

### Repeat the allele frequency filter after masking: a variant column is retained when it has 2 or 3
### observed alleles and, in at least two populations, the frequency of the allele in the second column
### of the population x allele table is not below 0.05 or above 0.95
### NOTE(review): a population with no calls at a site gives 0/0 = NaN, which na.rm drops from the
### near-fixed count, so such a population counts towards the "at least two" - confirm this is intended.
population_count <- length(table(fullfst2$Population))
retain_column <- logical(ncol(fullfst2))
for (i in 17:ncol(fullfst2)){
  allele_counts <- table(fullfst2$Population,fullfst2[,i]) ### computed once per variant
  allele_number <- ncol(allele_counts)
  ### only working with SNPs with maximum of three alleles, including reference call
  if (allele_number > 1 && allele_number < 4){
    second_allele_freq <- allele_counts[,2]/rowSums(allele_counts)
    near_fixed <- sum(second_allele_freq < 0.05 | second_allele_freq > 0.95, na.rm=TRUE)
    if (population_count - near_fixed > 1){
      retain_column[i] <- TRUE
    }
  }
}
### first 16 columns followed by the retained variants
tmpnew <- cbind(fullfst2[1:16],fullfst2[which(retain_column)])
fullfst2 <- tmpnew

### output file list containing samples in each population retained after filtering
#for (p in 1:length(unique(fullfst2$Population))){
#  write.table(fullfst2[fullfst2$Population %in% unique(fullfst2$Population)[p],]$Sample,file=paste(unique(fullfst2$Population)[p],"_POP",sep=""),quote = F,col.names = F,row.names = F)
#}

#write.table(fullfst2$Sample, file="Popover3",row.names = F,col.names = F, quote = F) ### a list containing all samples used for analysis
#write.table(sranames_meta_des[sranames_meta_des$Sample %in% fullfst2$Sample,]$Run,file="Popover3_run",row.names = F,col.names = F, quote = F) ### a list containing all samples used for analysis
### Restrict the VCF-style table to the retained samples and variants, then write the output files
calls <- fulldata_gt[10:ncol(fulldata_gt)] ## genotype call columns (everything after the 9 leading columns)
calls <- calls[colnames(calls) %in% fullfst2$Sample] ## only keep filtered samples
fulldata_gt <- cbind(fulldata_gt[1:9], calls) ## add the 9 leading columns back

## recover the numeric position from each retained variant column name:
## drop "2L_", then everything up to and including "_X", then everything from the next "_" onwards
kept_variant_pos <- colnames(fullfst2[17:ncol(fullfst2)])
kept_variant_pos <- gsub("2L_", "", kept_variant_pos)
kept_variant_pos <- gsub(".*_X", "", kept_variant_pos)
kept_variant_pos <- gsub("_.*", "", kept_variant_pos)
fulldata_gt <- fulldata_gt[fulldata_gt$POS %in% as.numeric(kept_variant_pos), ] ## filter by variant

## columns 2 and 5 of the metadata
write.table(cbind(fullfst2[2], fullfst2[5]), file = "pop_sample", quote = FALSE, row.names = FALSE)
## names of the retained sample columns
write.table(names(fulldata_gt)[10:ncol(fulldata_gt)],
            file = "lectin_samples_for_fst", row.names = FALSE, col.names = FALSE, quote = FALSE)
## full table, tab separated, with header
write.table(fulldata_gt, file = "lectin_all.vcf", sep = "\t", quote = FALSE, row.names = FALSE)
## genotype calls only, no header
write.table(fulldata_gt[10:ncol(fulldata_gt)],
            file = "lectin_gt_all", sep = " ", quote = FALSE, row.names = FALSE, col.names = FALSE)
## 9 leading columns only, no header
write.table(fulldata_gt[1:9],
            file = "lectin_meta_all", sep = " ", quote = FALSE, row.names = FALSE, col.names = FALSE)

#### 23. Per site Fst for lectin-24a variants  ####

## per site Fst for the lectin region with 1 kb arms, calculated using vcftools
lectin_1kb_arms <- read.csv(file = "../Input/lectin_1kb_arms.csv")

### Per site Fst plot; the dotted red lines are significance thresholds obtained with 7,260,944 silent mutations as background
### (when using all 8,440,512 SNPs found in autosomes, the 1% threshold is 0.412153 and the 0.1% threshold is 0.6445234)
fst_axis_top <- max(lectin_1kb_arms$WEIR_AND_COCKERHAM_FST, na.rm = TRUE) + 0.07 ## headroom above the highest point
pos_axis_rev <- c(max(lectin_1kb_arms$POS), min(lectin_1kb_arms$POS)) ## x axis runs from high to low coordinates
p1 <- ggplot(data = lectin_1kb_arms, aes(x = POS, y = WEIR_AND_COCKERHAM_FST)) +
  geom_hline(aes(yintercept = 0.4120281), linetype = "dotted", color = "red", size = 1) +
  geom_hline(aes(yintercept = 0.6417723), linetype = "dotted", color = "red", size = 1) +
  geom_point(size = 2) +
  scale_y_continuous(
    expand = expansion(mult = c(0, 0)),
    name = "Per site Fst",
    limits = c(0, fst_axis_top)
  ) +
  scale_x_reverse(limits = pos_axis_rev) +
  ## threshold labels sit 0.05 above their lines
  geom_text(aes(x = 3718650, y = 0.4120281 + 0.05, label = "1% threshold"), color = "red", size = 5) +
  geom_text(aes(x = 3718580, y = 0.6417723 + 0.05, label = "0.1% threshold"), color = "red", size = 5) +
  theme_classic() +
  ## the x axis is hidden here
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

### Coordinates of the plain grey boxes of the gene model (start, end, dummy label).
### rbind() of mixed vectors yields text columns, hence the as.numeric(as.character()) conversions below.
df2 <- as.data.frame(rbind(c(3716157, 3716302, "a"), c(3715513, 3716104, "a"), c(3718185, 3718485, "a")))
### plot gene model, red inverted arrows are null mutations found in DPGP

#3716883,3717261 carb
p2 <- ggplot() +
  ## grey boxes without arrowheads (coordinates from df2)
  geom_gene_arrow(
    data = df2,
    aes(xmin = as.numeric(as.character(V1)), xmax = as.numeric(as.character(V2)), y = 0.275),
    colour = NA, fill = "light grey",
    arrow_body_height = grid::unit(10, "mm"),
    arrowhead_height = grid::unit(0, "mm"),
    arrowhead_width = grid::unit(0, "mm")
  ) +
  ## white arrow with black outline, labelled Lectin-24A below
  geom_gene_arrow(
    aes(xmin = 3717773, xmax = 3716800, y = 0.275),
    colour = "black", fill = "white",
    arrow_body_height = grid::unit(10, "mm"),
    arrowhead_height = grid::unit(13, "mm"),
    arrowhead_width = grid::unit(4, "mm")
  ) +
  ## grey arrow directly adjacent to the white one
  geom_gene_arrow(
    aes(xmin = 3716389, xmax = 3716799, y = 0.275),
    colour = NA, fill = "light grey",
    arrow_body_height = grid::unit(10, "mm"),
    arrowhead_height = grid::unit(13, "mm"),
    arrowhead_width = grid::unit(4, "mm")
  ) +
  ## thin grey connecting lines between the boxes
  geom_segment(aes(x = 3716301, xend = 3716388, y = 0.275, yend = 0.275), color = "light grey", size = 1) +
  geom_segment(aes(x = 3716104, xend = 3716156, y = 0.275, yend = 0.275), color = "light grey", size = 1) +
  geom_segment(aes(x = 3717773, xend = 3718184, y = 0.275, yend = 0.275), color = "light grey", size = 1) +
  geom_segment(aes(x = 3718186, xend = 3719050, y = 0.275, yend = 0.275), color = "light grey", size = 1) +
  scale_x_reverse(limits = c(3719050, 3715513)) +
  ylab("Fst") +
  ylim(0.15, 0.31) +
  ## "none" replaces the deprecated guides(<aes> = FALSE) form
  guides(fill = "none") +
  guides(colour = "none") +
  ## label typo fixed: "Carbohydate" -> "Carbohydrate"
  geom_text(aes(x = 3717072, y = 0.186, label = "Carbohydrate \nbinding domain"), size = 4) +
  geom_text(aes(x = 3718380, y = 0.23, label = "Shaw"), fontface = "italic", size = 5) +
  geom_text(aes(x = 3716199, y = 0.23, label = "CG2818"), fontface = "italic", size = 5) +
  geom_text(aes(x = 3717304, y = 0.275, label = "Lectin-24A"), fontface = "italic", size = 5) +
  ## red inverted triangles: stop_gained / coding deletion variants (upper row) and the 21BP variant (lower row)
  geom_point(
    data = lectin_1kb_arms[lectin_1kb_arms$Ann %in% c("stop_gained", "Coding Deletion"), ],
    aes(x = POS, y = 0.305), shape = 25, fill = "red", color = "red", size = 2
  ) +
  geom_point(
    data = lectin_1kb_arms[lectin_1kb_arms$Ann %in% c("21BP"), ],
    aes(x = POS, y = 0.285), shape = 25, fill = "red", color = "red", size = 2
  ) +
  theme_void()

### Stack the Fst plot on top of the gene model (aligned vertically, heights 2:1) and write the figure to a PDF
pdf(file = "per_site_fst.pdf", width = 5.6, height = 4)
## explicit print(): top-level auto-printing does not happen under source() or Rscript,
## which would leave the PDF without the two panels
print(plot_grid(p1, p2, ncol = 1, align = "v", rel_heights = c(2, 1)))
## red bracket drawn on top of the figure; coordinates are tied to the 5.6 x 4 page layout
grid.brackets(253, 53, 219, 53, lwd = 2, col = "red")
dev.off()

#### 24. Population branch statistic for premature stop codons ####
## First part: get the allele in D.simulans and D.sechellia that corresponds to each D.melanogaster stop codon,
## using BLAST output where a 200bp sequence containing the D.melanogaster site was used as the query
## read in blast txt output file that contains alignments for stop codons and surrounding positions against D.simulans
stop_dsim <- read.table(file = "../Input/Dsim_alns_sub", sep = "")
## rows with "Query" in the 4th field open a new query block; a block ends on the row before the next one starts
linestart <- which(stop_dsim$V4 %in% "Query") ## query block start
lineend <- c(linestart[-c(1)] - 1, nrow(stop_dsim)) ## query block end (the last block runs to the end of the file)

## one 1 x 3 row per query with a usable alignment: query name, D.melanogaster base, D.simulans base.
## Rows are collected in a list and bound once, which also keeps a matrix when only one query yields a hit.
dsim_hits <- list()
for (b in seq_along(linestart)) {
  # proceed if query block exists, has to be more than one line since 200bp input
  if (nrow(stop_dsim[(linestart[b]):(lineend[b]), ]) > 1) {
    block <- stop_dsim[(linestart[b] + 1):(lineend[b]), ] ## alignment lines (query and subject) of this block
    ## query line whose coordinate range (V2..V4) spans position 101, i.e. the site of interest
    ## (100bp upstream and 100bp downstream of it were used for BLAST)
    query_row <- which(block$V1 %in% "Query" & as.numeric(block$V2) <= 101 & as.numeric(block$V4) >= 101)
    if (length(query_row) > 0) {
      query_row <- query_row[1] ## if several lines qualify, use the first
      block <- block[query_row:(query_row + 1), ] ## the query line and the subject line right below it
      dmel_s <- str_split(block[1, ]$V3, pattern = "")[[1]] # split query alignment line into vector
      dsim_s <- str_split(block[2, ]$V3, pattern = "")[[1]] # split subject alignment line into vector
      dmel_dsim_s <- as.data.frame(cbind(dmel_s, dsim_s))
      ## drop alignment columns that are gaps in the query (D.melanogaster), so that row numbers follow query coordinates
      dmel_dsim_s <- dmel_dsim_s[!dmel_dsim_s$dmel_s %in% "-", ]
      ## row of query position 101: the line starts at query position V2, so the offset is 101 - V2 + 1
      dmel_dsim_s <- dmel_dsim_s[102 - as.numeric(block[1, ]$V2), ]
      ## query name (3rd field of the block's first row) plus the D.melanogaster and D.simulans alleles
      dsim_hits[[length(dsim_hits) + 1]] <- cbind(stop_dsim[linestart[b], ]$V3, dmel_dsim_s$dmel_s, dmel_dsim_s$dsim_s)
    }
  }
}
dmel_dsim_stop <- as.data.frame(do.call(rbind, dsim_hits))
colnames(dmel_dsim_stop)[2:3] <- c("dmel_s", "dsim_s") ## D.melanogaster and D.simulans stop codon allele
## query names are parsed as "<chr>:<start>-<end>"; the site of interest sits 100bp after <start>
dmel_dsim_stop$CHR <- gsub(":.*", "", dmel_dsim_stop$V1) ## stop codon chr in D.melanogaster
dmel_dsim_stop$POS <- as.numeric(gsub("-.*", "", gsub(".*:", "", dmel_dsim_stop$V1))) + 100 ## stop codon position in D.melanogaster
dmel_dsim_stop$ID <- paste(dmel_dsim_stop$CHR, dmel_dsim_stop$POS, sep = "_") ## stop codon ID in D.melanogaster

## read in blast txt output file that contains alignments for stop codons and surrounding positions against D.sechellia
stop_dsech <- read.table(file = "../Input/Dsech_alns_sub", sep = "")
## rows with "Query" in the 4th field open a new query block; a block ends on the row before the next one starts
linestart <- which(stop_dsech$V4 %in% "Query") ## query block start
lineend <- c(linestart[-c(1)] - 1, nrow(stop_dsech)) ## query block end (the last block runs to the end of the file)

## one 1 x 3 row per query with a usable alignment: query name, D.melanogaster base, D.sechellia base.
## Rows are collected in a list and bound once, which also keeps a matrix when only one query yields a hit.
dsech_hits <- list()
for (b in seq_along(linestart)) {
  # proceed if query block exists, has to be more than one line since 200bp input
  if (nrow(stop_dsech[(linestart[b]):(lineend[b]), ]) > 1) {
    block <- stop_dsech[(linestart[b] + 1):(lineend[b]), ] ## alignment lines (query and subject) of this block
    ## query line whose coordinate range (V2..V4) spans position 101, i.e. the site of interest
    ## (100bp upstream and 100bp downstream of it were used for BLAST)
    query_row <- which(block$V1 %in% "Query" & as.numeric(block$V2) <= 101 & as.numeric(block$V4) >= 101)
    if (length(query_row) > 0) {
      query_row <- query_row[1] ## if several lines qualify, use the first
      block <- block[query_row:(query_row + 1), ] ## the query line and the subject line right below it
      dmel_s <- str_split(block[1, ]$V3, pattern = "")[[1]] # split query alignment line into vector
      dsech_s <- str_split(block[2, ]$V3, pattern = "")[[1]] # split subject alignment line into vector
      dmel_dsech_s <- as.data.frame(cbind(dmel_s, dsech_s))
      ## drop alignment columns that are gaps in the query (D.melanogaster), so that row numbers follow query coordinates
      dmel_dsech_s <- dmel_dsech_s[!dmel_dsech_s$dmel_s %in% "-", ]
      ## row of query position 101: the line starts at query position V2, so the offset is 101 - V2 + 1
      dmel_dsech_s <- dmel_dsech_s[102 - as.numeric(block[1, ]$V2), ]
      ## query name (3rd field of the block's first row) plus the D.melanogaster and D.sechellia alleles
      dsech_hits[[length(dsech_hits) + 1]] <- cbind(stop_dsech[linestart[b], ]$V3, dmel_dsech_s$dmel_s, dmel_dsech_s$dsech_s)
    }
  }
}
dmel_dsech_stop <- as.data.frame(do.call(rbind, dsech_hits))
colnames(dmel_dsech_stop)[2:3] <- c("dmel_s", "dsech_s") ## D.melanogaster and D.sechellia stop codon allele
## query names are parsed as "<chr>:<start>-<end>"; the site of interest sits 100bp after <start>
dmel_dsech_stop$CHR <- gsub(":.*", "", dmel_dsech_stop$V1) ## stop codon chr in D.melanogaster
dmel_dsech_stop$POS <- as.numeric(gsub("-.*", "", gsub(".*:", "", dmel_dsech_stop$V1))) + 100 ## stop codon position in D.melanogaster
dmel_dsech_stop$ID <- paste(dmel_dsech_stop$CHR, dmel_dsech_stop$POS, sep = "_") ## stop codon ID in D.melanogaster

## combine stop codon alleles from D.melanogaster, D.simulans and D.sechellia into a common dataframe
## (full join: sites aligned in only one of the two outgroups are kept, with NA for the other)
dmel_dsim_dsech_stop <- full_join(dmel_dsim_stop, dmel_dsech_stop, by = "ID")
## rebuild chromosome and position from the ID, since the join leaves suffixed copies of both
dmel_dsim_dsech_stop$CHR <- gsub("_.*", "", dmel_dsim_dsech_stop$ID)
dmel_dsim_dsech_stop$POS <- as.numeric(gsub(".*_", "", dmel_dsim_dsech_stop$ID))

dmel_dsim_dsech_stop$dsim_s <- toupper(dmel_dsim_dsech_stop$dsim_s) # convert base to uppercase
dmel_dsim_dsech_stop$dsech_s <- toupper(dmel_dsim_dsech_stop$dsech_s) # convert base to uppercase

## read in vcf file, subset from dpgp_masked.vcf, containing just stop codons
dpgp_stops_calls <- read.table(file = "../Input/dpgp_stops2.vcf", header = TRUE)
recombination_rates <- read.table(file = "../Input/dpgp_stops_r2.rrc") ## recombination rate estimates for stop codon locations from doi:10.1016/j.gene.2010.04.015
recombination_rates$V2 <- as.numeric(gsub("\\.\\..*", "", recombination_rates$V2)) ## keep the part before ".." as the position
dpgp_stops_calls$ANN <- dpgp_stops_calls$ID ## the ID column of this file actually contains the annotation
dpgp_stops_calls$ID <- paste(dpgp_stops_calls$CHROM, dpgp_stops_calls$POS, sep = "_") # proper snp id combining chromosome and position
## only keep snps where recombination rates were estimated. Match on chromosome AND position:
## matching on position alone would also keep a snp that shares its coordinate with an estimated
## site on a different chromosome. Both positions are pasted as doubles so they format identically.
rr_sites <- paste(recombination_rates$V1, recombination_rates$V2)
vcf_sites <- paste(dpgp_stops_calls$CHROM, as.numeric(dpgp_stops_calls$POS))
dpgp_stops_calls <- dpgp_stops_calls[vcf_sites %in% rr_sites, ]
dmel_dsim_dsech_stop <- left_join(dmel_dsim_dsech_stop, dpgp_stops_calls, by = "ID") ## add genotype data to the dataframe containing three drosophila stop codon calls

## identify ancestral state for stop codons by comparing the vcf REF/ALT alleles with the outgroup alleles:
##   "R" = reference allele is ancestral, "A" = alternate allele is ancestral, NA = cannot be assessed
n_stop_sites <- nrow(dmel_dsim_dsech_stop)
Anc <- rep(NA_character_, n_stop_sites) ## preallocated; stays NA unless a state is assigned below
for (i in seq_len(n_stop_sites)) {
  outgroup_alleles <- c(dmel_dsim_dsech_stop$dsim_s[i], dmel_dsim_dsech_stop$dsech_s[i])
  ref_in_outgroup <- dmel_dsim_dsech_stop$REF[i] %in% outgroup_alleles
  alt_in_outgroup <- dmel_dsim_dsech_stop$ALT[i] %in% outgroup_alleles
  if (ref_in_outgroup && alt_in_outgroup) {
    ## both reference and alternate allele are found among D.simulans and D.sechellia: unable to assess ancestral state
    Anc[i] <- NA
  } else if (ref_in_outgroup) {
    ## reference allele found in either D.simulans or D.sechellia: reference is the ancestral state
    Anc[i] <- "R"
  } else if (alt_in_outgroup) {
    ## alternate allele found in either D.simulans or D.sechellia: alternate is the ancestral state
    Anc[i] <- "A"
  } else {
    ## neither allele is found in the outgroups
    Anc[i] <- NA
  }
}
dmel_dsim_dsech_stop$Anc <- Anc

## Keep stop gained mutations whose derived allele is the premature stop codon,
## i.e. the reference allele is ancestral (Anc == "R") and the annotation is stop_gained
ref_ancestral <- dmel_dsim_dsech_stop$Anc %in% "R"
dmel_dsim_dsech_stop_r <- dmel_dsim_dsech_stop[ref_ancestral, ]
dmel_dsim_dsech_stop_r <- dmel_dsim_dsech_stop_r[dmel_dsim_dsech_stop_r$ANN %in% "stop_gained", ]
rid <- dmel_dsim_dsech_stop_r$ID
## genotype columns run from column 22 to the third-last column; transpose so that rows are samples and columns are snps
gt_cols <- 22:(ncol(dmel_dsim_dsech_stop_r) - 2)
dmel_dsim_dsech_stop_r <- as.data.frame(t(dmel_dsim_dsech_stop_r[gt_cols]))
colnames(dmel_dsim_dsech_stop_r) <- rid ## snp ids as column names
dmel_dsim_dsech_stop_r$Sample <- rownames(dmel_dsim_dsech_stop_r) ## sample names in a separate column

## add population/region descriptions for samples
pop_country <- read.table(file = "../Input/pop_country", header = TRUE) ## file with population and region designations
pop_country <- unique(pop_country) ## drop fully duplicated rows
dmel_dsim_dsech_stop_r$Sample <- gsub("\\.", "-", dmel_dsim_dsech_stop_r$Sample) ## change sample names for RAL (RAL. to RAL-)
dmel_dsim_dsech_stop_r <- left_join(dmel_dsim_dsech_stop_r, pop_country, by = "Sample") ## add location information for samples

### grouping populations by broader regions
region_members <- list(
  ocenia = "T",
  asia = "B",
  america = c("I", "RAL", "W"),
  europe_north_africa = c("N", "FR", "EG"),
  southern_africa = c("SB", "SD", "SF", "SP", "ZI", "ZS", "ZW"),
  central_africa = c("CO", "RG", "GA"),
  west_africa = c("GU", "NG"),
  east_africa = c("UG", "EA", "EB", "ED", "EF", "ER")
)
georegions <- data.frame(
  Region = rep(names(region_members), lengths(region_members)),
  Population = unlist(region_members, use.names = FALSE)
)
dmel_dsim_dsech_stop_r <- left_join(dmel_dsim_dsech_stop_r, georegions, by = "Population") ## add the broader region to the genotype matrix

## calculate the premature stop codon allele frequency per region, for sites where the alternate allele is the derived premature stop
## the last 5 columns of the genotype matrix are sample/population annotation, everything before is one column per snp
n_snp_cols <- ncol(dmel_dsim_dsech_stop_r) - 5
all_af_r <- "" ## dummy first row, removed after the loop
for (s in seq_len(n_snp_cols)) {
  ac <- table(dmel_dsim_dsech_stop_r[, s], dmel_dsim_dsech_stop_r$Region) ## genotype x region counts
  ## The branches are mutually exclusive and use scalar && tests: the original
  ## `(nrow(ac) < 2) & (rownames(ac) == "0")` produced a length > 1 condition for every
  ## polymorphic site, which is an error in R >= 4.2.
  if (nrow(ac) > 1) {
    # the site is polymorphic
    af <- ac[2, ] / colSums(ac) ## 1 in vcf file is the premature stop
  } else if (nrow(ac) == 1 && rownames(ac) == "0") {
    # the site is monomorphic for the non stop gained allele
    af <- ac
    af[af >= 0] <- 0 ## stop codon frequency is 0
  } else if (nrow(ac) == 1 && rownames(ac) == "1") {
    # the site is monomorphic for the stop gained allele
    af <- ac
    af[af >= 0] <- 1 ## stop codon frequency is 1, fixed
  } else {
    # no usable genotype class (no calls, or a single class other than 0/1):
    # record NA instead of silently reusing the frequencies of the previous snp
    af <- setNames(rep(NA_real_, ncol(ac)), colnames(ac))
  }
  af[colSums(ac) < 20] <- NA ## if fewer than 20 alleles called in a region, code as NA
  all_af_r <- rbind(all_af_r, af)
}
all_af_r <- all_af_r[-c(1), ]
rownames(all_af_r) <- colnames(dmel_dsim_dsech_stop_r[seq_len(n_snp_cols)])

## Next part: calculate the distance of each premature stop codon from the annotated stop codon of its transcript
## one row per premature stop codon where the reference allele is the ancestral state, identified by "<chr>_<pos>"
all_af_r_pos <- data.frame(CHR_POS = rownames(all_af_r))
## gtf file containing gene positions for coding sequence only (CDS), used to extract the annotated stop codon locations
dmel_CDS <- read.table(file = "../Input/dmel-all-r5.13_CDS.gtf", sep = "\t")

library(tidyr)
## split the id into chromosome and position (separate() splits on non-alphanumeric characters by default)
all_af_r_pos <- separate(all_af_r_pos, "CHR_POS", into = c("CHR", "POS"))
all_af_r_pos$POS <- as.numeric(all_af_r_pos$POS) # positions come out of separate() as text

## For every premature stop codon, find the CDS segment(s) containing it and compute the coding distance
## (introns removed) to the annotated stop codon of each transcript; the smallest distance across transcripts is kept.
## NOTE(review): the intron bookkeeping assumes the CDS rows of a transcript are listed in ascending
## coordinate order in the gtf - confirm.
## FIX: for minus-strand transcripts the "not in terminal exon" branch tested
## `premature_location > nrow(dmel_CDS_part2)`, which can never be true, so the distance of the
## previously processed transcript/site was silently reused. The test is now `premature_location > 1`.
## Site counts quoted in later comments were obtained before this fix and may shift.
actual_stop_positions <- "" ## dummy first row, removed after the loop
for (i in seq_len(nrow(all_af_r_pos))) {
  snp_pos <- as.numeric(all_af_r_pos$POS[i])
  ## gtf lines on the same chromosome whose interval contains the site (either coordinate orientation)
  dmel_CDS_part <- dmel_CDS[dmel_CDS$V1 %in% all_af_r_pos$CHR[i] &
    (((dmel_CDS$V4 <= snp_pos) & (dmel_CDS$V5 >= snp_pos)) |
      ((dmel_CDS$V4 >= snp_pos) & (dmel_CDS$V5 <= snp_pos))), ]
  print(i) ## progress
  # only analyse site if it is found within the gtf file
  if (nrow(dmel_CDS_part) > 0) {
    ## get gene and transcript names from the attribute column
    gene_name <- gsub("; gene_name.*", "", gsub(".*gene_id ", "", dmel_CDS_part$V9[1]))
    transcript_id <- gsub("; gene_id.*", "", gsub(".*transcript_id ", "", dmel_CDS_part$V9))

    ## if the site occurs within multiple transcripts, get the distance to the nearest annotated stop codon
    dis_from_stop_all <- ""
    actual_stop <- NA ## do not carry over the value of a previous site
    for (t in seq_along(transcript_id)) {
      dmel_CDS_part2 <- dmel_CDS[grepl(transcript_id[t], dmel_CDS$V9), ] ## all CDS segments of the transcript of interest
      dis_from_stop <- NA ## reset so that an unhandled case cannot reuse the previous distance
      ## in which CDS segment (row of dmel_CDS_part2) does the premature stop occur
      premature_location <- which(dmel_CDS_part$V4[t] == dmel_CDS_part2$V4 & dmel_CDS_part$V5[t] == dmel_CDS_part2$V5)
      ## intron lengths: gap between the end of each segment and the start of the next one
      intron_lts <- (dmel_CDS_part2$V4[2:length(dmel_CDS_part2$V4)] - dmel_CDS_part2$V5[1:(length(dmel_CDS_part2$V5) - 1)]) - 1

      if (dmel_CDS_part2$V7[1] %in% "+") {
        # transcript is in positive sense: the annotated stop lies at the end of the last segment
        actual_stop <- tail(dmel_CDS_part2, n = 1)$V5 - 2 ## location of actual stop codon
        dis_from_stop <- as.numeric(actual_stop) - snp_pos ## genomic distance from premature to actual stop codon
        ## if the premature stop is not in the terminal segment, subtract the introns lying between the two
        if (premature_location < nrow(dmel_CDS_part2)) {
          dis_from_stop <- dis_from_stop - sum(intron_lts[(premature_location):length(intron_lts)])
        }
      } else if (dmel_CDS_part2$V7[1] %in% "-") {
        # transcript is in negative sense: the annotated stop lies at the start of the first segment
        actual_stop <- head(dmel_CDS_part2, n = 1)$V4 + 2 ## location of actual stop codon
        dis_from_stop <- snp_pos - as.numeric(actual_stop) ## genomic distance from premature to actual stop codon
        ## if the premature stop is not in the terminal (first) segment, subtract the introns lying between the two
        if (premature_location > 1) {
          dis_from_stop <- dis_from_stop - sum(intron_lts[1:(premature_location - 1)])
        }
      }
      dis_from_stop_all <- c(dis_from_stop_all, dis_from_stop)
    }
    ## columns: gene, strand, annotated stop (of the last transcript processed), minimum distance, chr, position
    actual_stop_positions <- rbind(actual_stop_positions, cbind(gene_name, dmel_CDS_part$V7[1], actual_stop, min(as.numeric(dis_from_stop_all), na.rm = TRUE), all_af_r_pos$CHR[i], all_af_r_pos$POS[i]))
  }
}
actual_stop_positions <- as.data.frame(actual_stop_positions[-c(1), ])
actual_stop_positions$ID <- paste(actual_stop_positions$V5, actual_stop_positions$V6, sep = "_") ## id column combining chr and position
colnames(actual_stop_positions)[4] <- "dis_from_stop"
actual_stop_positions$dis_from_stop <- as.numeric(actual_stop_positions$dis_from_stop) # make distance from stop codon numeric

## Additional region and snp based filtering:
##   regions: keep those with a non-missing allele frequency at more than 10 snps
##   snps: keep those with af > 0.05 in at least one region and af < 1 in at least one region
## only keep variants where it was possible to calculate the distance of the premature stop codon from the actual stop codon
has_distance <- rownames(all_af_r) %in% actual_stop_positions$ID
all_af_r <- all_af_r[has_distance, ]
all_af_r_backup <- all_af_r # backup, the rownames are restored from it below
snps_with_af <- apply(as.data.frame(all_af_r), 2, function(af_col) sum(!is.na(af_col))) ## per region: number of snps with a frequency
all_af_r <- all_af_r[, snps_with_af > 10]
all_af_r <- apply(as.data.frame(all_af_r), 2, as.numeric) ## turn the frequencies into a numeric matrix (this drops the rownames)
rownames(all_af_r) <- rownames(all_af_r_backup) # add back rownames
above_5pct <- rowSums(all_af_r > 0.05, na.rm = TRUE) > 0 # snps with af > 0.05 in at least one region
all_af_r <- all_af_r[above_5pct, ]
not_fixed <- rowSums(all_af_r < 1, na.rm = TRUE) > 0 # snps with af < 1 in at least one region, i.e. not fixed everywhere
all_af_r <- all_af_r[not_fixed, ]

## Filtering based on recombination rates: sites in lowly recombining regions are removed, since they
## may not be the adaptive snps themselves but merely linked to an adaptive one
recombination_rates$ID <- paste(recombination_rates$V1, recombination_rates$V2, sep = "_") # "<chr>_<pos>" id for the recombination rate estimates
## long table (one row per snp and region) with the recombination rate columns attached
all_af_r_rr <- as.data.frame(all_af_r)
all_af_r_rr$ID <- rownames(all_af_r_rr)
all_af_r_rr <- melt(all_af_r_rr, id = "ID")
all_af_r_rr <- left_join(all_af_r_rr, recombination_rates, by = "ID")
# recombination rate against allele frequency; this plot is the justification for filtering snps < 1 cM/Mb
ggplot(all_af_r_rr, aes(V3, as.numeric(value), color = variable)) +
  geom_point() +
  labs(x = "Recombination rate cM/Mb", y = "AF")

high_rr_ids <- recombination_rates[recombination_rates$V3 > 1, ]$ID
all_af_r <- all_af_r[rownames(all_af_r) %in% high_rr_ids, ] # remove snps in regions < 1 cM/Mb

# Distance of the premature stop codon from the actual stop codon against allele frequency, no clear boundary
## long table (one row per snp and region) with the distance columns attached
all_af_r_dis <- as.data.frame(all_af_r)
all_af_r_dis$ID <- rownames(all_af_r_dis)
all_af_r_dis <- melt(all_af_r_dis, id = "ID")
all_af_r_dis <- left_join(all_af_r_dis, actual_stop_positions, by = "ID")
dist_af_plot <- ggplot(all_af_r_dis, aes(dis_from_stop, as.numeric(value), color = variable)) +
  geom_point() +
  xlab("Distance from stop codon (bp)") +
  ylab("AF")
dist_af_plot
## zoom into first 500bp
dist_af_plot + xlim(0, 500)
## remove premature stop codons lying within 10 bp of the actual stop codon (keep distance > 9)
far_from_stop_ids <- all_af_r_dis[all_af_r_dis$dis_from_stop > 9, ]$ID
all_af_r <- all_af_r[rownames(all_af_r) %in% far_from_stop_ids, ]

dim(all_af_r) # just 319 premature stop codons from 5 regions
pheatmap(apply(all_af_r, 2, as.numeric)) ## heatmap of allele frequencies, snps x regions

## remove central_africa and east_africa since they will not be used for PBS (below) and repeat snp filtering
## the regions are dropped by name rather than by column position, so the result does not depend on
## which regions survived the earlier filters or on their order
all_af_r <- all_af_r[, !colnames(all_af_r) %in% c("central_africa", "east_africa")]
all_af_r <- all_af_r[rowSums(all_af_r > 0.05, na.rm = TRUE) > 0, ] # only keep snps with af>0.05 in at least one population
all_af_r <- all_af_r[rowSums(all_af_r < 1, na.rm = TRUE) > 0, ] # only keep snps with af<1 in at least one population, i.e. remove fixed snps
dim(all_af_r) # just 211 premature stop codons from 3 regions, these three regions were chosen as they are the largest
pheatmap(apply(all_af_r, 2, as.numeric))

## next, plotting the distribution of allele frequencies of stop codons by regions and highlighting snps that occur in lectin-24A
all_af_r_df <- as.data.frame(apply(as.data.frame(all_af_r), 2, as.numeric))
all_af_r_df$id <- rownames(all_af_r)
all_af_r_df <- left_join(all_af_r_df, actual_stop_positions, by = c("id" = "ID")) # mainly to add gene name info to premature stop codons
## long format: the id column and the columns joined from actual_stop_positions are identifiers,
## the region columns become variable/value
all_af_r_df <- melt(all_af_r_df, id = (ncol(all_af_r_df) - ncol(actual_stop_positions) + 1):ncol(all_af_r_df))
# to identify 5% threshold for allele frequency histogram plot (95th percentile per region)
## dplyr:: is spelled out because plyr is attached after dplyr in this script, so a bare summarise()
## is plyr's version, which ignores the grouping and returns one overall quantile
sig_line <- all_af_r_df[c("variable", "value")] %>%
  dplyr::group_by(variable) %>%
  dplyr::summarise(sig = quantile(value, 0.95, na.rm = TRUE))
# histogram on premature stop codon allele frequencies, one lectin-24a stop codon is at very high frequency in southern africa
ggplot(data = all_af_r_df, aes(x = value)) +
  geom_histogram() +
  ## black lines: stop codons of gene FBgn0040104
  geom_vline(data = all_af_r_df[all_af_r_df$gene_name %in% "FBgn0040104", ], aes(xintercept = value)) +
  ## red line: per-region 5% threshold; the colour is set outside aes() so that it is an actual red
  ## rather than a mapped constant with its own legend
  geom_vline(data = sig_line, aes(xintercept = sig), color = "red") +
  facet_grid(~variable)

### now calculating the population branch statistic for premature stop codons from https://doi.org/10.1016/j.gene.2018.06.077
## first get the files where pairwise fst was calculated for premature stop codons
ENA_NA_stop <- read.table(file = "../Input/ENA_NA_stop.weir.fst", header = TRUE)
ENA_SA_stop <- read.table(file = "../Input/ENA_SA_stop.weir.fst", header = TRUE)
NA_SA_stop <- read.table(file = "../Input/NA_SA_stop.weir.fst", header = TRUE)
## the three tables are combined by row position, so they must list the same sites in the same order
stopifnot(
  nrow(ENA_NA_stop) == nrow(ENA_SA_stop),
  nrow(ENA_NA_stop) == nrow(NA_SA_stop),
  all(as.character(ENA_NA_stop$CHROM) == as.character(ENA_SA_stop$CHROM)),
  all(as.character(ENA_NA_stop$CHROM) == as.character(NA_SA_stop$CHROM)),
  all(ENA_NA_stop$POS == ENA_SA_stop$POS),
  all(ENA_NA_stop$POS == NA_SA_stop$POS)
)
stop_fst_all <- cbind(ENA_NA_stop, ENA_SA_stop[3], NA_SA_stop[3]) # add all pairwise measurements into a common dataframe
colnames(stop_fst_all)[3:5] <- c("ENA_NA", "ENA_SA", "NA_SA")
stop_fst_all$ID <- paste(stop_fst_all$CHROM, stop_fst_all$POS, sep = "_") # id column with chr and position
# undefined (NaN) fst values are replaced by a very small positive number; negative and zero values are
# left as they are (the clamping below is disabled)
#stop_fst_all$ENA_NA <- pmax(stop_fst_all$ENA_NA,0.0000000001)
#stop_fst_all$ENA_SA <- pmax(stop_fst_all$ENA_SA,0.0000000001)
#stop_fst_all$NA_SA <- pmax(stop_fst_all$NA_SA,0.0000000001)
for (fst_col in c("ENA_NA", "ENA_SA", "NA_SA")) {
  ## the column is indexed directly: the `df[rows, ]$col <- value` form fails when no row matches
  nan_rows <- stop_fst_all[[fst_col]] %in% "NaN"
  stop_fst_all[[fst_col]][nan_rows] <- 0.0000000001
  ## change fst values to log scale, for PBS: T = -log(1 - Fst)
  stop_fst_all[[fst_col]] <- -log(1 - stop_fst_all[[fst_col]])
}
## calculate PBS: for each population, half of (the two transformed distances involving it minus the third)
stop_fst_all$PBS_SA <- (stop_fst_all$ENA_SA + stop_fst_all$NA_SA - stop_fst_all$ENA_NA) / 2
stop_fst_all$PBS_NA <- (-stop_fst_all$ENA_SA + stop_fst_all$NA_SA + stop_fst_all$ENA_NA) / 2
stop_fst_all$PBS_ENA <- (stop_fst_all$ENA_SA - stop_fst_all$NA_SA + stop_fst_all$ENA_NA) / 2
## negative PBS is an artifact; the conversion to 0 below is currently disabled
#stop_fst_all$PBS_SA <- pmax(stop_fst_all$PBS_SA,0)
#stop_fst_all$PBS_NA <- pmax(stop_fst_all$PBS_NA,0)
#stop_fst_all$PBS_ENA <- pmax(stop_fst_all$PBS_ENA,0)
stop_fst_all2 <- stop_fst_all[-c(3:5)]
stop_fst_all2 <- stop_fst_all2[stop_fst_all2$ID %in% all_af_r_df$id,] # only keep previously filtered premature stop gained mutations
write.table(stop_fst_all2[1:2],file="stop_codon_list",row.names = F, col.names = F, quote = F)  ## file will be used to subset vcf in vcftools to obtain fst across all stop codons
stop_fst_all2 <- melt(stop_fst_all2,id=c(1:3))
stop_fst_all2 <- left_join(stop_fst_all2,actual_stop_positions,by="ID")
stop_fst_all2$variable <- gsub("PBS_SA","Southern Africa",stop_fst_all2$variable )
stop_fst_all2$variable <- gsub("PBS_NA","North America",stop_fst_all2$variable )
stop_fst_all2$variable <- gsub("PBS_ENA","Europe & North Africa",stop_fst_all2$variable )
# to identify 5% threshold for PBS plot
sig_line <- stop_fst_all2[4:5] %>%
  dplyr::group_by(variable) %>%
  dplyr::summarise(sig=quantile(value,0.95,na.rm=T))

## PBS values to highlight for the lectin stop codon at 2L:3716969 (labelled p.Gln254* in the plot):
## keep only the region name and PBS value (columns 4 and 5), sorted by region
vals_2L_3716969 <- stop_fst_all2[stop_fst_all2$ID %in% "2L_3716969", c(4:5)]
vals_2L_3716969 <- vals_2L_3716969[order(vals_2L_3716969$variable), ]

## same for the lectin stop codon at 2L:3717125 (labelled p.Glu202* in the plot)
vals_2L_3717125 <- stop_fst_all2[stop_fst_all2$ID %in% "2L_3717125", c(4:5)]
vals_2L_3717125 <- vals_2L_3717125[order(vals_2L_3717125$variable), ]
## value2 holds hand-tuned label positions for the first three (sorted) rows,
## so that the label in the plot below does not overlap with other text
vals_2L_3717125$value2 <- replace(vals_2L_3717125$value, 1:3, c(-0.04, 0.023, 0.13))

## PBS for the 165 bp coding-sequence deletion, from its three pairwise Fst files
ENA_NA_codingdel.weir.fst <- read.table(file = "../Input/ENA_NA_codingdel.weir.fst", header = TRUE)
ENA_SA_codingdel.weir.fst <- read.table(file = "../Input/ENA_SA_codingdel.weir.fst", header = TRUE)
NA_SA_codingdel.weir.fst <- read.table(file = "../Input/NA_SA_codingdel.weir.fst", header = TRUE)
## the ENA-NA Fst is overwritten unconditionally with the same tiny constant used for NaN Fst elsewhere
## NOTE(review): presumably the value in the file is undefined (NaN) - confirm
ENA_NA_codingdel.weir.fst[["WEIR_AND_COCKERHAM_FST"]] <- 0.0000000001

## Fst -> T = -log(1 - Fst), stored back in the same column
ENA_NA_codingdel.weir.fst <- transform(ENA_NA_codingdel.weir.fst, WEIR_AND_COCKERHAM_FST = -log(1 - WEIR_AND_COCKERHAM_FST))
ENA_SA_codingdel.weir.fst <- transform(ENA_SA_codingdel.weir.fst, WEIR_AND_COCKERHAM_FST = -log(1 - WEIR_AND_COCKERHAM_FST))
NA_SA_codingdel.weir.fst <- transform(NA_SA_codingdel.weir.fst, WEIR_AND_COCKERHAM_FST = -log(1 - WEIR_AND_COCKERHAM_FST))

## PBS per region from the three T values
PBS_SA_coding_del <- (
  ENA_SA_codingdel.weir.fst[["WEIR_AND_COCKERHAM_FST"]] +
    NA_SA_codingdel.weir.fst[["WEIR_AND_COCKERHAM_FST"]] -
    ENA_NA_codingdel.weir.fst[["WEIR_AND_COCKERHAM_FST"]]
) / 2
PBS_NA_coding_del <- (
  -ENA_SA_codingdel.weir.fst[["WEIR_AND_COCKERHAM_FST"]] +
    NA_SA_codingdel.weir.fst[["WEIR_AND_COCKERHAM_FST"]] +
    ENA_NA_codingdel.weir.fst[["WEIR_AND_COCKERHAM_FST"]]
) / 2
PBS_ENA_coding_del <- (
  ENA_SA_codingdel.weir.fst[["WEIR_AND_COCKERHAM_FST"]] -
    NA_SA_codingdel.weir.fst[["WEIR_AND_COCKERHAM_FST"]] +
    ENA_NA_codingdel.weir.fst[["WEIR_AND_COCKERHAM_FST"]]
) / 2

## append one row per region for the deletion (ID 2L_3716932) to the stop codon table;
## the appended rows are character vectors, so `value` is converted back to numeric afterwards
stop_fst_all2 <- rbind(
  stop_fst_all2,
  c("2L", "3716932", "2L_3716932", "Southern Africa", PBS_SA_coding_del, "FBgn0040104", "-", 10000, 10000, "2L", "3716932")
)
stop_fst_all2 <- rbind(
  stop_fst_all2,
  c("2L", "3716932", "2L_3716932", "North America", PBS_NA_coding_del, "FBgn0040104", "-", 10000, 10000, "2L", "3716932")
)
stop_fst_all2 <- rbind(
  stop_fst_all2,
  c("2L", "3716932", "2L_3716932", "Europe & North Africa", PBS_ENA_coding_del, "FBgn0040104", "-", 10000, 10000, "2L", "3716932")
)
stop_fst_all2[["value"]] <- as.numeric(stop_fst_all2[["value"]])

# histogram of the population branch statistic (PBS) of premature stop codons in Southern Africa;
# one lectin-24a stop codon is at very high PBS in southern africa and another has high value in North America
# The dashed red line is the 95th percentile (sig_line). The arrow and label for the coding deletion are
# added with annotate() so that they are drawn exactly once: a geom_*() layer with constant aesthetics
# would be repeated for every row of the data it inherits.
stop_hist <- ggplot(data = stop_fst_all2[stop_fst_all2$variable %in% "Southern Africa", ], aes(x = value)) +
  geom_histogram(bins = 100) +
  geom_vline(data = sig_line[sig_line$variable %in% "Southern Africa", ], aes(xintercept = sig), color = "Red", linetype = "dashed") +
  #facet_grid(~variable,scales = "free_x")+
  xlab("Population branch statistic") +
  ylab("Number of premature stop codons") +
  ## coding deletion: arrow at its PBS value, label shifted slightly to the left
  annotate("segment", x = PBS_SA_coding_del, xend = PBS_SA_coding_del, y = 3, yend = 6, arrow = arrow(length = unit(0.1, "cm"), ends = "first"), size = 1.2, colour = "black") +
  annotate("text", x = PBS_SA_coding_del - 0.02, y = 12, label = "p.Phe217_Glu273del*", angle = 90) +
  ## p.Gln254* (2L:3716969)
  geom_segment(data = vals_2L_3716969[vals_2L_3716969$variable %in% "Southern Africa", ], aes(x = value, xend = value, y = 3, yend = 6), arrow = arrow(length = unit(0.1, "cm"), ends = "first"), size = 1.2, colour = "black") +
  geom_text(data = vals_2L_3716969[vals_2L_3716969$variable %in% "Southern Africa", ], aes(label = "p.Gln254*", angle = 90, x = value, y = 9)) +
  ## p.Glu202* (2L:3717125); the label uses the hand-tuned position in value2
  geom_segment(data = vals_2L_3717125[vals_2L_3717125$variable %in% "Southern Africa", ], aes(x = value, xend = value, y = c(10), yend = c(13)), arrow = arrow(length = unit(0.1, "cm"), ends = "first"), size = 1.2, colour = "black") +
  geom_text(data = vals_2L_3717125[vals_2L_3717125$variable %in% "Southern Africa", ], aes(label = "p.Glu202*", angle = 90, x = value2 - 0.02, y = 16)) +
  theme_classic2()
stop_hist

## build the PBS tree for p.Gln254* (2L:3716969), the one with very long branch in southern africa
fortree <- vals_2L_3716969
## round PBS to three decimals and make region names Newick-safe (no spaces)
fortree$value <- as.numeric(sprintf("%#.3f", fortree$value))
fortree$variable <- gsub(" ", "_", fortree$variable)
## assemble "(region:length, region:length, ...);" and parse it as a Newick tree
pbs.tree <- read.tree(
  text = paste0(
    "(",
    paste(paste(fortree$variable, fortree$value, sep = ":"), collapse = ", "),
    ");"
  )
)
## replace the first three tip labels with multi-line labels, padded to adjust spacing in the plotted tree
pbs.tree$tip.label[1:3] <- c(
  "Europe_&      \nNorth_Africa       \n(ENA)",
  " North\n America (NA)",
  "Southern\nAfrica (SA)"
)
## give the first branch a small length so that a zero-length branch is still visible
pbs.tree$edge.length[1] <- 0.014

## genome-wide pbs tree using all 211 stop codons
all_stop_fst <- read.table(file = "../Input/all_stop_fst.txt") ## pairwise fst, one row per population pair
all_stop_fst$V2 <- -log(1 - all_stop_fst$V2) ## T = -log(1 - Fst)
## one row per tip of the p.Gln254* tree
all_stop_pbs <- as.data.frame(pbs.tree$tip.label)
names(all_stop_pbs) <- "Region"
## genome-wide pbs for tips 1, 2 and 3 from the three T values
## NOTE(review): the row order of all_stop_fst.txt determines which pair each V2 entry refers to - confirm against the file
all_stop_pbs$PBS <- with(
  all_stop_fst,
  c(
    (V2[1] + V2[2] - V2[3]) / 2,
    (V2[2] + V2[3] - V2[1]) / 2,
    (V2[1] + V2[3] - V2[2]) / 2
  )
)

## reuse the topology of the p.Gln254* tree, but with genome-wide branch lengths and short tip labels
pbs.all.tree <- pbs.tree
pbs.all.tree$edge.length <- all_stop_pbs$PBS
pbs.all.tree$tip.label <- c("ENA", " NA", "SA ")

## p1 is the pbs tree for p.Gln254*.
## The branch-length labels and the caption for the inset are placed at hand-tuned coordinates with
## annotate(), which draws each label exactly once (a geom_text() layer with constant aesthetics is
## repeated for every row of the plot data it inherits).
p1 <- as.ggplot(~plot.phylo(pbs.tree, type = "unrooted", edge.width = 2, font = 1, show.tip.label = TRUE, edge.color = c("blue", "purple", "orange"), x.lim = c(-0.05, 0.45), y.lim = c(0.22, 0.36))) +
  annotate("text", x = 0.48, y = 0.54, label = pbs.tree$edge.length[3], angle = 300) +
  annotate("text", x = 0.77, y = 0.13, label = pbs.tree$edge.length[2], angle = 0) +
  annotate("text", x = 0.25, y = 0.44, label = "Genome-wide PBS", angle = 0, fontface = "bold") +
  ggtitle("p.Gln254* PBS") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
p1
## p2 is the genome-wide pbs tree; it is important to keep it on the same scale as p1 by using the same x.lim and y.lim
p2 <- as.ggplot(~plot.phylo(pbs.all.tree, type = "unrooted", edge.width = 2, font = 1, show.tip.label = TRUE, edge.color = c("blue", "purple", "orange"), x.lim = c(-0.05, 0.45), y.lim = c(0.22, 0.36), lab4ut = "axial"))

## viewport used to place p2 within p1 when both are printed to the same device
vp <- viewport(width = 1, height = 1, x = 0.45, y = 0.75)

## write the stop codon histogram and the pbs trees to pdf
## ggplot objects are print()ed explicitly: auto-printing does not happen when the script
## is run through source(), which would leave the pdf empty
pdf(file = "pbs_hist.pdf", width = 3.1, height = 3.6)
print(stop_hist)
dev.off()

## p1 fills the page; p2 (genome-wide tree) is drawn on top of it inside viewport vp
pdf(file = "premature_stop_codon.pdf", width = 6, height = 6)
print(plot_grid(p1))
print(p2, vp = vp)
dev.off()


#### 25. Induction of Lectin-24A expression in lines with premature stop codons ####
## here, lectin-24A qPCR was done on Southern African lines (from populations SD and SP) polymorphic for functional and loss of function variants.
## Goal was to see if lectin-24A is still induced even if loss of function variant occurs in coding sequence.

SD_SP_lines <- read.csv(file = "../Input/SD_SP_lines.csv") ## contains the SP and SD genotype calls for null variants for lectin-24A
colnames(SD_SP_lines) <- c("Genotype", "c.-439-433del",	"c.-334-333insACATTCAT",	"c.-171-151del",	"p.Leu81*",	"p.Glu202*",	"p.Gln254*",	"p.Phe217_Glu273del*")  ## name variants
dct_sd_sp <- read.csv(file = "../Input/dct_sd_sp.csv") ## contains dct for lectin and tep1 (positive control)
## only keep genotypes that have been used for qpcr
SD_SP_lines <- SD_SP_lines[SD_SP_lines$Genotype %in% dct_sd_sp$Genotype, ]
## order genotype matrix and dct values in the same way (order of first appearance in dct_sd_sp) to compare later
SD_SP_lines$Genotype <- factor(SD_SP_lines$Genotype, levels = unique(dct_sd_sp$Genotype))
SD_SP_lines <- SD_SP_lines[order(SD_SP_lines$Genotype), ]
dct_sd_sp$Genotype <- factor(dct_sd_sp$Genotype, levels = unique(dct_sd_sp$Genotype))
dct_sd_sp <- dct_sd_sp[order(dct_sd_sp$Genotype), ]
## drop columns 2-5 (the three non-coding variants and p.Leu81*); keeps p.Glu202*, p.Gln254* and the coding sequence deletion
SD_SP_lines <- SD_SP_lines[-c(2:5)]
## long format for plotting: Genotype, variable (variant name), value (genotype call)
SD_SP_lines <- reshape2::melt(SD_SP_lines, id.vars = "Genotype")
full_dt <- full_join(SD_SP_lines, dct_sd_sp, by = "Genotype") ## combine the genotypes with the delta ct values
full_dt <- full_dt[!is.na(full_dt$value), ] ## remove rows without a genotype call
## recode the genotype call as functional wild type (WT) or loss of function (LOF);
## indexing the column vector is a no-op when a call is absent (the `df[cond, ]$col <- x` form errors when no row matches)
full_dt$value[full_dt$value %in% "Ancestral"] <- "WT"
full_dt$value[full_dt$value %in% "Derived"] <- "LOF"

## plot delta ct for the three retained coding variants in Southern Africa
## the plot is print()ed explicitly so the pdf is not left empty when the script is run through source()
pdf(file = "dct_sd_sp.pdf", width = 6, height = 3)
print(
  ggplot(full_dt, aes(x = factor(value, levels = c("WT", "LOF")), y = Lectin.24A, color = Treatment)) +
    geom_boxplot() +
    geom_jitter(position = position_jitterdodge(dodge.width = 1)) +
    ylab(expression(paste("log2(Relative ", italic("Lectin-24A"), " expression)"))) +
    facet_wrap(~variable) +
    xlab("Allele") +
    scale_color_manual(values = c("darkorange", "#276DC2")) +
    theme(panel.background = element_blank(), axis.line = element_line(colour = "black"))
)
dev.off()


## bar plot of the Lectin-24A delta ct per genotype, one bar per treatment
dctlectin <- ggplot(dct_sd_sp, aes(x = Genotype, y = Lectin.24A, fill = Treatment)) +
  geom_col(position = "dodge2") +
  labs(
    x = "",
    y = expression(paste(Delta, "CT ", italic("RpL32"), " - ", Delta, "CT ", italic("Lectin-24A")))
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

## same plot for Tep1, the control
dcttep1 <- ggplot(dct_sd_sp, aes(x = Genotype, y = Tep1, fill = Treatment)) +
  geom_col(position = "dodge2") +
  labs(
    y = expression(paste(Delta, "CT ", italic("RpL32"), " - ", Delta, "CT ", italic("Tep1")))
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

## create genotype matrix for SD and SP lines
## SD_SP_lines was already melted to long format (Genotype, variable, value) earlier in this section,
## so it is not melted again here: a second melt would stack the variant names and the genotype
## calls into a single column.
## Missing calls are labelled "Not called"; indexing the column vector is a no-op when nothing is missing.
SD_SP_lines$value[is.na(SD_SP_lines$value)] <- "Not called"
colnames(SD_SP_lines) <- c("Genotype", "variant", "Status")
## tile plot: genotypes on x, variants on y, fill colour by call status
sd_sp_gt_mat <- ggplot(SD_SP_lines, aes(x = Genotype, y = variant, fill = Status)) +
  geom_tile() +
  ylab("Null variant") +
  theme(panel.background = element_blank(), axis.line = element_line(colour = "black"), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_fill_manual(values = c("#d8b365", "#2ca25f", "black"))

## align qPCR data and genotype matrix
#pdf(file="dct_sd_sp.pdf",width=7,height = 8)
#plot_grid(align_plots(dctlectin,sd_sp_gt_mat,align = "hv",axis= "tb")[[1]],align_plots(dctlectin,sd_sp_gt_mat,align = "hv",axis= "tb")[[2]],ncol=1)
#dev.off()



