####These are functions in support of filtering a vcf file based on missing data in loci
####or individuals, in order to write a structure file to use in adegenet. I have 
####generally used this workflow to pare down loci enough to be able to load the dataset
####into excel, which allows me to do further exploration/manipulation.

####written by Rob Massatti, massatti@umich.edu



##function to trim out loci given a specific cutoff (percentage) and dataset
##can use iteratively to keep trimming datasets more and more
loci_trim <- function(data, cutoff) {
	#make vector of the number of individuals that have data for each locus
	numIndivid <- gsub('NS=(\\d+).+','\\1', data[,8])
	numIndivid <- as.numeric(numIndivid)/length(10:(length(data[1,])))
	trim_vec <- numIndivid >= cutoff
	return(data[trim_vec,])

}

##function to trim out individuals given a specific amt of missing data
##can use iteratively to keep trimming datasets more and more
ind_trim_cutoff <- function(data, cutoff) {
	sum_SNP <- NULL
	for(i in 10:length(data[1,])) {
		current <- 0
		for(j in 1:length(data[,1])) {
			if(data[j,i] != "./.") {
				current <- current + 1
			}
		}
		sum_SNP <- c(sum_SNP, current/(length(data[,1])))
	}
	trim_vec <- sum_SNP >= cutoff
	trim_vec <- c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, trim_vec)
	#table(sum_SNP)
	return(data[,trim_vec])
}

##function to trim out specific individual (only 1 at a time)
##can use iteratively to keep trimming datasets more and more
ind_trim_name <- function(data, name) {
	names_vec <- names(data) != name
	return(data[,names_vec])
}


#function to view the distribution of missing data in loci
loci_hist <- function(data) {
	numIndivid <- gsub('NS=(\\d+).+','\\1', data[,8])
	numIndivid <- as.numeric(numIndivid)/length(10:(length(data[1,])))
	vec <- seq(from=0, to=1, by=0.05)
	hist(numIndivid, xlab = "Proportion of population in which loci present", ylab = "Number of loci")
	table(numIndivid)
}

#function to view the distribution of missing data across individuals
ind_hist <- function(data) {
	sum_SNP <- NULL
	for(i in 10:length(data[1,])) {
		current <- 0
		for(j in 1:length(data[,1])) {
			if(data[j,i] != "./.") {
				current <- current + 1
			}
		}
		sum_SNP <- c(sum_SNP, current/(length(data[,1])))
		cat(names(data[i]))
		cat('\t')
		cat(current/(length(data[,1])))
		cat('\n')
	}
	hist(sum_SNP)
	table(sum_SNP)
}

##writes data to structure file
write_stru <- function(data, file_name) {
	cat('\t', file = file_name, sep = '', append = T)
	cat('\t', file = file_name, sep = '', append = T)
	for(i in 1:length(data[,1])) {
		cat(data[i,1], file = file_name, sep = '', append = T)
		cat('\t', file = file_name, sep = '', append = T)
	}
	cat('\n', file = file_name, sep = '', append = T)
	for(j in 10:length(data[1,])) {
		cat(names(data[j]), file = file_name, sep = '', append = T)
		cat('\t', file = file_name, sep = '', append = T)
		cat('1', file = file_name, sep = '', append = T)
		cat('\t', file = file_name, sep = '', append = T)
		for(k in 1:length(data[,1])) {
			allele <- gsub('(.)\\|.', '\\1', data[k,j])
			if(is.na(as.numeric(allele)) == TRUE) {
				#cat(as.numeric('0'), file = file_name, sep = '', append = T)
				cat(as.numeric('-9'), file = file_name, sep = '', append = T)
				#cat(sample(0:1,1), file = file_name, sep = '', append = T)
				cat('\t', file = file_name, sep = '', append = T)
			} else {
				cat(as.numeric(allele), file = file_name, sep = '', append = T)
				cat('\t', file = file_name, sep = '', append = T)
				#if(as.numeric(allele) == 0) {
				#	cat(as.numeric("1"), file = file_name, sep = '', append = T)
				#	cat('\t', file = file_name, sep = '', append = T)
				#} else {
				#	cat(as.numeric("2"), file = file_name, sep = '', append = T)
				#	cat('\t', file = file_name, sep = '', append = T)
				#}
			}
		}
		cat('\n', file = file_name, sep = '', append = T)
		cat(names(data[j]), file = file_name, sep = '', append = T)
		cat('\t', file = file_name, sep = '', append = T)
		cat('1', file = file_name, sep = '', append = T)
		cat('\t', file = file_name, sep = '', append = T)
		for(l in 1:length(data[,1])) {
			allele <- gsub('.\\|(.)', '\\1', data[l,j])
			if(is.na(as.numeric(allele)) == TRUE) {
				#cat(as.numeric('0'), file = file_name, sep = '', append = T)
				cat(as.numeric('-9'), file = file_name, sep = '', append = T)
				#cat(sample(0:1,1), file = file_name, sep = '', append = T)
				cat('\t', file = file_name, sep = '', append = T)
			} else {
				#if(as.numeric(allele) == 0) {
				#	cat(as.numeric("1"), file = file_name, sep = '', append = T)
				#	cat('\t', file = file_name, sep = '', append = T)
				#} else {
				#	cat(as.numeric("2"), file = file_name, sep = '', append = T)
				#	cat('\t', file = file_name, sep = '', append = T)
				#}
				cat(as.numeric(allele), file = file_name, sep = '', append = T)
				cat('\t', file = file_name, sep = '', append = T)
			}
		}
		cat('\n', file = file_name, sep = '', append = T)
	}
}


