### STJA Phenotypic Analyses ###
# library() stops with an error when a package is missing; require() would
# only return FALSE and let the script fail later at the first use.
library("xlsx")
library("rentrez")
library("ape")

# Table of taxon pairs to compare; the columns Species1 and Species2 are used
# below to build the GenBank queries.
nd2pairs <- read.xlsx2(
	"../utility/Taxon_pair_comparisons_ND2_divergence_v2.xlsx",
	sheetName = "Species for ND2 div comparison"
)

# Keep only rows that actually name a first species.
nd2pairs <- nd2pairs[nd2pairs$Species1 != "", ]

# Gene symbol used in the "[Gene]" field of every GenBank search.
whichgene <- "ND2"

# Record which rentrez release produced the results.
citation("rentrez")
packageVersion("rentrez")

nd2pairs

# Quick look at one pair. `i` has to exist before it is used as an index
# (the original indexed with `i` one line before assigning it).
i <- 2
nd2pairs[i, ]

### Helpers for the per-pair divergence loop ###

# Split a multi-record FASTA string into a named list with one element per
# record. Names are the full header lines (without the leading ">"); each
# value is a character vector holding one base per element.
parse_fasta_records <- function(fasta_text) {
	records <- strsplit(fasta_text, ">")[[1]]
	records <- records[-1]  # text before the first ">" is not a record
	record_lines <- strsplit(records, "\\n")
	bases <- lapply(record_lines, function(lines) {
		seq_lines <- lines[-1]  # first line is the header
		# Drop blank lines only. Unconditionally dropping the last line would
		# throw away real sequence whenever a record has no trailing blank line.
		seq_lines <- seq_lines[nzchar(seq_lines)]
		strsplit(paste(seq_lines, collapse = ""), "")[[1]]
	})
	names(bases) <- vapply(record_lines, function(lines) lines[1], character(1))
	bases
}

# Search GenBank for `gene` sequences of `species`, download the hits as FASTA
# and return them as a DNAbin list named by FASTA header.
# NOTE(review): no retmax is passed to entrez_search, so only the service's
# default number of hits is fetched -- confirm this is intended.
fetch_species_dnabin <- function(species, gene) {
	hits <- entrez_search(db = "nucleotide", term = paste0(species, "[Organism] AND ", gene, "[Gene]"))
	if (length(hits$ids) == 0) {
		stop("No GenBank records found for ", species, " (", gene, ")", call. = FALSE)
	}
	fasta_text <- entrez_fetch(db = "nuccore", id = hits$ids, rettype = "fasta")
	as.DNAbin(parse_fasta_records(fasta_text))
}

# Remove every sequence whose name matches `pattern` (a regular expression).
# When nothing matches, `x` is returned unchanged: `x[-integer(0)]` would
# select zero elements and silently discard the whole set.
drop_by_name <- function(x, pattern) {
	hits <- grep(pattern, names(x))
	if (length(hits) > 0) {
		x <- x[-hits]
	}
	x
}

output_list <- list()

# Only pairs without a stored result are processed, so the loop can be re-run
# after an interruption without resetting output_list.
for (i in setdiff(seq_len(nrow(nd2pairs)), seq_along(output_list))) {
	cat("Processing Species Pair ", i, "\n", sep = "")

	### Species 1 ###
	sp1_DNAbin <- fetch_species_dnabin(nd2pairs$Species1[i], whichgene)

	### remove isolates from certain pairs ###
	if (i %in% c(2)) {
		sp1_DNAbin <- drop_by_name(sp1_DNAbin, "isolate")
	}

	### remove weird PICA sequence ###
	if (i %in% 1) {
		sp1_DNAbin <- drop_by_name(sp1_DNAbin, "NRM20016117")
	}

	### remove weird Setophaga sequences ###
	if (i == 17) {
		sp1_DNAbin <- drop_by_name(sp1_DNAbin, "FJ374103.1")
		sp1_DNAbin <- drop_by_name(sp1_DNAbin, "FJ374102.1")
	}

	### Remove complete genomes ###
	if (i %in% c(5, 6, 29)) {
		sp1_DNAbin <- drop_by_name(sp1_DNAbin, "complete")
	}

	### remove ND6 from certain pairs ###
	if (i %in% c(16, 17)) {
		sp1_DNAbin <- drop_by_name(sp1_DNAbin, "ND6")
	}

	# Positional removal: depends on the order in which GenBank returns records.
	if (i == 8) {
		sp1_DNAbin <- sp1_DNAbin[-c(2)]
	}

	if (i == 2) {
		sp1_DNAbin <- drop_by_name(sp1_DNAbin, "333996 ")
		sp1_DNAbin <- drop_by_name(sp1_DNAbin, "333981 ")
	}

	### Species 2 ###
	sp2_DNAbin <- fetch_species_dnabin(nd2pairs$Species2[i], whichgene)

	### remove isolates ###
	if (i %in% c(2)) {
		sp2_DNAbin <- drop_by_name(sp2_DNAbin, "isolate")
	}

	### Remove complete genomes ###
	if (i %in% c(5, 6, 29)) {
		sp2_DNAbin <- drop_by_name(sp2_DNAbin, "complete")
	}

	### remove ND6 from certain pairs ###
	if (i %in% c(16, 17)) {
		sp2_DNAbin <- drop_by_name(sp2_DNAbin, "ND6")
	}

	# Positional removal: depends on the order in which GenBank returns records.
	if (i == 8) {
		sp2_DNAbin <- sp2_DNAbin[-c(2, 3)]
	}

	if (i == 2) {
		sp2_DNAbin <- drop_by_name(sp2_DNAbin, "333989")
		sp2_DNAbin <- drop_by_name(sp2_DNAbin, "333982")

		## Switch AY030142.1 from california to woodhousei ##
		# i.e. move the record out of the Species 2 set and into the Species 1 set
		sp1_DNAbin <- c(sp1_DNAbin, sp2_DNAbin[grep("AY030142.1", names(sp2_DNAbin))])
		sp2_DNAbin <- drop_by_name(sp2_DNAbin, "AY030142.1")
	}

	# The between-species block below needs at least one sequence on each side.
	if (length(sp1_DNAbin) == 0 || length(sp2_DNAbin) == 0) {
		stop("Species pair ", i, " has no sequences left for one of the species", call. = FALSE)
	}

	### Combine and Align ###
	combined_DNAbin <- c(sp1_DNAbin, sp2_DNAbin)

	aligned_DNAbin <- muscle(combined_DNAbin)

	#png(file=paste0("~/Desktop/Manuscripts/StellersJays/Figures/",paste0(nd2pairs[i,],collapse="_vs_"),".png"))

	checkAlignment(aligned_DNAbin)

	# With the png() call above commented out, this closes whichever graphics
	# device checkAlignment() drew on.
	dev.off()

	### Calculate raw dna dist and average ###
	dist.mat <- dist.dna(aligned_DNAbin, model = "raw", as.matrix = TRUE)

	### Extract focal cells for this comparison ###
	# Rows = Species 1 sequences, columns = Species 2 sequences. This relies on
	# the alignment keeping the input order (Species 1 first, then Species 2).
	# drop = FALSE keeps a matrix (with dimnames) even when one species has a
	# single sequence.
	n_sp1 <- length(sp1_DNAbin)
	sp1_rows <- seq_len(n_sp1)
	sp2_cols <- seq(from = n_sp1 + 1, to = length(combined_DNAbin))
	focus.mat <- dist.mat[sp1_rows, sp2_cols, drop = FALSE]

	# png(file="~/Desktop/SetophagaND2Hist.png")
	# hist(focus.mat)
	# dev.off()

	# GenBank header lines of the sequences that entered the comparison,
	# Species 1 first, Species 2 second.
	genbank_names <- list(rownames(dist.mat)[sp1_rows], rownames(dist.mat)[sp2_cols])

	# tabout is the "mean (min-max)" string written to the output table.
	output_list[[i]] <- list(
		tabout = paste0(round(mean(focus.mat), 3), " (", round(min(focus.mat), 3), "-", round(max(focus.mat), 3), ")"),
		mean = mean(focus.mat),
		min = min(focus.mat),
		max = max(focus.mat),
		labels = genbank_names
	)
}
output_list[[i]]

### studying aphelocoma ###
# Interactive inspection of zero-distance cells in focus.mat, i.e. the matrix
# left behind by the last species pair the loop above processed.
# NOTE(review): presumably run right after processing the Aphelocoma pair --
# confirm which pair focus.mat holds before interpreting the output.
whichzero <- which(focus.mat == 0, arr.ind = TRUE)

# Row labels of at most the first four zero-distance cells (a fixed 1:4 index
# fails when fewer than four cells are zero).
cat(rownames(focus.mat)[head(whichzero[, 1], 4)], sep = "\n")

# Row index of every zero-distance cell, one per iteration. A separate loop
# variable is used so the global `i` is left alone.
for (k in seq_len(nrow(whichzero))) {
	cat(whichzero[k, 1])
}

# Labels of the first zero-distance cell, if there is one.
if (nrow(whichzero) > 0) {
	print(rownames(focus.mat)[whichzero[1, 1]])
	print(colnames(focus.mat)[whichzero[1, 2]])
}

str(whichzero)

# Row label of every zero-distance cell. Only the row-index column is looked
# up; iterating over the whole index matrix would also push the column indices
# through rownames().
rownames(focus.mat)[whichzero[, 1]]


# Append the formatted "mean (min-max)" divergence string of each species pair
# to the pair table. vapply() guarantees one character string per pair and
# fails loudly if a result is missing or malformed, where sapply() would
# silently return a list.
dist_df <- cbind(
	nd2pairs,
	perc_div = vapply(output_list, function(pair_result) pair_result$tabout, character(1))
)

write.xlsx(dist_df, file = "../output/SisterSpeciesComparisons_v4.xlsx")

# Write, for every processed species pair, the GenBank header lines of the
# sequences used for each species into a plain-text report.
sink(file = "../output/SisterSpeciesPairGenBankNumbers_v2.txt")
# finally = sink() restores console output even if the loop stops with an
# error; otherwise all later output would keep going into the report file.
tryCatch({
	for (i in seq_along(output_list)) {
		cat("Species pair", i, sep = " ")
		cat("\n")
		cat(unlist(nd2pairs[i, 1:2]), sep = " vs ")
		cat("\n\n")

		# side 1 = Species 1 (column 1 of nd2pairs), side 2 = Species 2 (column 2)
		for (side in 1:2) {
			cat(paste0("Species ", side, " -- "), nd2pairs[i, side])
			cat("\n")
			# Iterating over the labels themselves writes nothing for an empty set
			# (1:length(x) would visit indices 1 and 0).
			for (label in output_list[[i]]$labels[[side]]) {
				cat(label)
				cat("\n")
			}
			cat("\n\n")
		}

		#output_list[[i]]$labels
		cat("###############")
		cat("\n\n")
	}
}, finally = sink())

#### 

