library(diversitree)
library(greenbrown)
library(phytools)
library(phangorn)

# Get list of data matrices with their full paths.
# `pattern` is a regular expression (not a shell glob), so it is anchored to
# match only file names that end in ".nex". The paths are made absolute so
# they remain valid after the working directory is changed below.
matrices <- normalizePath(
  list.files("../0_missing", pattern = "\\.nex$", full.names = TRUE)
)
if (length(matrices) == 0L) {
  stop("No .nex matrices found in ../0_missing")
}

# Set working directory to where you want to store newly generated matrices
setwd("../50_missing")

# Write a data matrix to a NEXUS file. Based on Liam Revell's wrapper around
# write.nexus.data(); modified to declare SYMBOLS="1 2" for standard data, for
# compatibility with software other than Mesquite.
#
# Arguments:
#   x            for "dna"/"protein": passed unchanged to write.nexus.data();
#                for "standard": a matrix with one row per taxon (row names are
#                used as taxon names), or an already named list with one
#                vector of character states per taxon.
#   file         path of the NEXUS file to write.
#   format       "dna", "protein" or "standard" (case-insensitive).
#   interleaved  forwarded to write.nexus.data().
#
# Stops with an error if `format` is none of the supported values.
# Called for its side effect (the file); returns NULL invisibly.
writeNexusData <- function(x, file, format = "dna", interleaved = FALSE) {
  fmt <- tolower(format)

  if (fmt %in% c("dna", "protein")) {
    write.nexus.data(x, file, format, interleaved = interleaved)
    return(invisible(NULL))
  }
  if (fmt != "standard") {
    stop("Unsupported format '", format,
         "': expected \"dna\", \"protein\" or \"standard\"", call. = FALSE)
  }

  # write.nexus.data() is given a list with one element per taxon.
  if (is.matrix(x)) {
    seqs <- lapply(seq_len(nrow(x)), function(i) x[i, ])
    names(seqs) <- rownames(x)
  } else {
    seqs <- x
  }

  # Write the states as if they were DNA, then patch the data type declaration.
  write.nexus.data(seqs, file, format = "dna", interleaved = interleaved)
  lines <- readLines(file)

  # Only touch the line that declares the data type, so that taxon names that
  # happen to contain "DNA" are left intact.
  is_format <- grepl("DATATYPE", lines, ignore.case = TRUE)
  if (!any(is_format)) {
    warning("No DATATYPE declaration found in ", file,
            "; file left as written by write.nexus.data()", call. = FALSE)
  }
  lines[is_format] <- sub("DNA", "STANDARD  SYMBOLS=\"1 2\" ",
                          lines[is_format], fixed = TRUE)
  writeLines(lines, file)
  invisible(NULL)
}

# Loop through all matrices
# Proportion of matrix cells to replace with missing data (e.g. 0.2 or 0.5).
# The same value is used to build the percentage label in the output file name.
missing_fraction <- 0.5

for (i in seq_along(matrices)) {
  taxa <- read.nexus.data(matrices[i])

  # Every taxon must be scored for the same number of characters; take the
  # matrix dimensions from the data rather than assuming a fixed shape.
  n_chars <- unique(lengths(taxa))
  if (length(n_chars) != 1L) {
    stop("Expected at least one taxon and equal-length character vectors in ",
         matrices[i])
  }

  # One row per taxon, one column per character, converted to numeric states.
  mtrx <- matrix(unlist(taxa, use.names = FALSE),
                 nrow = length(taxa), ncol = n_chars, byrow = TRUE,
                 dimnames = list(names(taxa), NULL))
  storage.mode(mtrx) <- "double"

  # Total number of entries to replace (20 or 50 percent of all cells).
  target <- round(missing_fraction * nrow(mtrx) * ncol(mtrx))
  n_missing <- sum(is.na(mtrx))

  # Visit the cells in a uniformly random order. Each cell needs to be tried
  # only once: if blanking it would make its character invariant now, it
  # would still do so after further entries of that character are removed.
  # This also guarantees termination when the target cannot be reached.
  for (cell in sample.int(length(mtrx))) {
    if (n_missing >= target) {
      break
    }
    if (is.na(mtrx[cell])) {
      next
    }
    column <- (cell - 1L) %/% nrow(mtrx) + 1L   # column of a column-major index
    previous <- mtrx[cell]
    mtrx[cell] <- NA                            # replace sampled entry with NA
    if (isTRUE(AllEqual(mtrx[, column]))) {
      # The character is no longer variable: put the original value back.
      mtrx[cell] <- previous
    } else {
      n_missing <- n_missing + 1L
    }
  }
  if (n_missing < target) {
    warning("Only ", n_missing, " of ", target,
            " entries could be replaced without making a character invariant: ",
            matrices[i])
  }

  mtrx[is.na(mtrx)] <- "?"                      # replace NA's with "?"

  # Write data matrix with missing data before next iteration. The name keeps
  # the input file's own extension in front of the suffix (x.nex_50%.nex).
  label <- paste0(round(100 * missing_fraction), "%")
  spec <- paste0(basename(matrices[i]), "_", label, ".nex")
  writeNexusData(mtrx, spec, format = "standard", interleaved = FALSE)
}
