## Hannes Rathmann 2020 (c)
## R code for building a table of utility estimates for different dental traits and trait combinations
## Ref. Rathmann and Reyes-Centeno (Testing the utility of dental morphological trait combinations for inferring human neutral genetic variation) PNAS. DOI: xxx

##################### load microsatellite loci mean size data #####################

file_matrix=choose.files(default="", caption="Select a CSV file with microsatellite loci mean sizes", multi=F)
gen.dat <- read.csv(file=file_matrix)
rownames(gen.dat) <- gen.dat[,1]
gen.dat <- gen.dat[,-1]

##################### load dental trait frequency data #####################

file_matrix=choose.files(default="", caption="Select a CSV file with dental trait frequencies", multi=F)
dent.dat <- read.csv(file=file_matrix)
dent.dat <- dent.dat[,-1]
dent.vars <- colnames(dent.dat)

##################### z-score transformation of trait data #####################

dent.dat[dent.dat==1.00] <- 0.995
dent.dat[dent.dat==0.00] <- 0.005
dent.dat <- qnorm(as.matrix(dent.dat))

##################### START #####################

nr.run <- 1000 # number of resampling runs

##################### find all possible dental trait combinations and save as list #####################

# This will do the calculations for individual traits only, not trait combinations:

comb.list <- as.list(dent.vars) 

# This will do the calculations for individual traits and all possible trait combinations.
# WARNING: Only do this when your machine has ~ 100 GB RAM.
# Depending on your machine, this calculation may run for several weeks.
# Consider to run this in parallel on several cores or on a cluster to reduce computational time.

# comb.list <- unlist(Map(combn,list(dent.vars),seq_along(dent.vars),simplify=F),recursive=F)

##################### set some indices and define empty vectors that will be filled in a loop #####################

nr.pops <- nrow(gen.dat)
nr.pairs <- nr.pops*(nr.pops-1)/2
nr.loci <- length(gen.dat)
ut <- upper.tri(matrix(NA,nr.pops,nr.pops))

trait.combination <- vector(mode="character",length(comb.list))
r.median <- vector(mode="integer",length(comb.list))
r.lower <- vector(mode="integer",length(comb.list))
r.upper <- vector(mode="integer",length(comb.list))
p.value <- vector(mode="integer",length(comb.list))

for (i in 1:length(comb.list)){

##################### for each trait combination, calculate D2 distances #####################

	trait.set <- comb.list[[i]]
	trait.combination[i] <- paste(trait.set,collapse="/")
	trait.nr <- length(trait.set)
	trait.set.data <- dent.dat[,trait.set]
	D2.mat <- as.matrix(dist(trait.set.data),method="euclidean")^2
	D2.pair <- D2.mat[ut]

##################### and calculate trait-adjusted loci-resampled Dm2 distances #####################

	Dm2.pair <- matrix(NA,nr.run,nr.pairs)
	for (j in 1:nr.run){
		loci.sample <- sample.int(nr.loci,trait.nr)
		Dm2.mat <- as.matrix(dist(gen.dat[,loci.sample],method="euclidean"))^2/trait.nr
		Dm2.pair[j,] <- Dm2.mat[ut]
	}

##################### and compare D2 to trait-adjusted loci-resampled Dm2 #####################

	r.range <- vector(mode="integer",nr.run)
	for (k in 1:nr.run){
		r.range[k] <- cor(D2.pair,Dm2.pair[k,],method="pearson")
	}
	qs <- quantile(r.range,probs=c(0.025,0.5,0.975),names=F)
	r.median[i] <- qs[2]
	r.lower[i] <- qs[1]
	r.upper[i] <- qs[3]

##################### and calculate p-values via permutation of D2 #####################

	perm.r.range <- vector(mode="integer",nr.run)
	for (l in 1:nr.run){
		perm.sample <- sample.int(nr.pops)
		perm.D2.mat <- D2.mat[perm.sample,perm.sample]
		perm.D2.pair <- perm.D2.mat[ut]
		perm.r.range[l] <- cor(perm.D2.pair,Dm2.pair[l,],method="pearson")
	}
	p.value[i] <- sum(perm.r.range>=qs[2])/nr.run
}

##################### create table with results #####################

trait.number <- lengths(comb.list)
r.median <- round(r.median, digits=5)
r.lower <- round(r.lower, digits=5)
r.upper <- round(r.upper, digits=5)
p.value <- round(p.adjust(p.value,method="BH"),digits=3)

df <- data.frame(trait.combination,trait.number,r.median,r.lower,r.upper,p.value)

write.table(df,file="combinat_table.txt",row.names=F) # save results as text file

##################### inspecting results #####################

head(df,27)