resolve.gaps<-function(f,format="fasta") { #Helps users decide which sequences and loci to keep in the dataset in order to maximize the power of the analysis #f: the filepath for the aligned sequences #format: the format the alignement is in (interleaved, sequential, clustal, or fasta by default) require(ape) mism<-function(x,y) sum(x!=y) #count mismatches between two sequences segmat<-function(x) { #create a matrix of mismatches between all sequences dm=matrix(NA,nr=nrow(x),nc=nrow(x),dimnames=list(labels(x),labels(x))) for(i in 1:nrow(x)) dm[i,]=apply(x,1,mism,y=x[i,]) as.dist(dm) } seqs=read.dna(f,format) logmat=matrix(NA,nr=nrow(seqs),nc=ncol(seqs)) for(i in 1:nrow(logmat)) logmat[i,]=grepl("[^actg]",seqs[i,]) #mask everything ambigious logmat=logmat[(ix=rev(order(apply(logmat,1,sum)))),] #order sequences by # of missing bp seqs=seqs[ix,] goodseqs=rep(T,nrow(seqs)); curr=0 while(1) { goodlocs=apply(logmat[goodseqs,],2,function(x) all(x==F)) #image(seqs[goodseqs,goodlocs],xlab=paste(curr,"sequence removed")) if(sum(goodlocs)==length(goodlocs)) break if(curr==0) res=data.frame(loci=sum(goodlocs),seqs=nrow(seqs),TotBP=sum(goodlocs)*nrow(seqs),SegSites=length(seg.sites(seqs[,goodlocs])),TotMM=sum(segmat(seqs[,goodlocs]))) else res=rbind(res,c(sum(goodlocs),sum(goodseqs),sum(goodlocs)*sum(goodseqs),length(seg.sites(seqs[goodseqs,goodlocs])),sum(segmat(seqs[goodseqs,goodlocs])))) goodseqs[(curr=curr+1)]=F cat("-Removing",labels(seqs)[curr],"\n") } rownames(res)=c("(none)",labels(seqs)[2:curr-1])->vs layout(1:3); par(mar=c(1.5,3,.1,.1),mgp=c(1.8,.7,0)) plot(res[,3],ylab="Total BP",xlab="",pch=ifelse(res[,3]==max(res[,3]),19,1)) plot(res[,4],ylab="Segregating sites",xlab="",pch=ifelse(res[,4]==max(res[,4]),19,1)) plot(res[,5],ylab="Total Mismatches",xlab="",pch=ifelse(res[,5]==max(res[,5]),19,1)) res }