#####################################################################################################################################################################
########################################################## Create linkage map for use in my own simulation ##########################################################
#####################################################################################################################################################################
# Written 19/12/2013
# This script takes as input a chromosome file and a linkage map file.
## The chromosome file should contain the following columns: "Tgu", "length" (Tgu gives the chromosome name, length is the chromosome length in bp)
## The linkage map file should contain the following columns: "SNP", "start_physisch", "cM", "chromosome" (SNP name, position (bp), position (Kosambi cM), chromosome name (same as "Tgu"))
# The following input parameters must be provided:
## (1) n chromosomes needed ('nn'),
## (2) the estimated genome size in bp ('gsize'),
## (3) the estimated genome size in cM ('cM.size'),
## (3) the window size in which loess is going to predict recombination events ('wndw.size').
## ------------------------------------------------------------------------------------------------------------------

# Then the script works as follows:
# (1) It creates additional n chromosomes by duplicating other chromosomes (in the end there are nn), all in all totaling the estimated genome size.
## The zebra finch genome is estimated to be 1.2 Gb in size and consists of 39 autosomes + TguZ.
## (1.1) Calculate the number of base pairs needed and remove chromosomes with two few (<10) marker.
## (1.2) For duplication: Remove chromosomes that are too long (macrochromosomes are definitively not missing in the assembly, thus remove all chromosomes > 20,000,000 bp).
## (1.3) Get all possible combinations (without repetition) of the remaining chromosomes and replace chromosome names with their length.
## (1.4) Sum up each combination.
## (1.5) Select those combinations that sum up closest to the genome size minus the assembled chromosomes.
## (1.6) Of those combinations that sum up closest to the needed size select those with the least duplications per chromosome.
## (1.7) Select one of the best combinations randomly.
## (1.8) Duplicate markers on these selected chromosomes (and give them a unique name).
## (1.9) Duplicate the selected chromosomes (and give them a unique name).
## (1.10) How long is the genome in total after duplicating chromosomes? 
# (2) It predicts genetic positions with the loess function in windows of specified size ('wndw.size').
## (2.1) Loop over all chromosomes (including the duplicated ones). Optically the loess smoothed curves fit quite well to the data points with the given settings.
## A SIM.map.txt file is created which can be used directly in the Monte Carlo gene dropping simulation.
# (2.2) Visualize the linkage map on each chromosome.

rm(list=ls())
# ------------------------------------------------------------------------------------------------------------------
# (0) inputs -----------------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------------------------
# define path to files here
path = "C:\\Users\\Data\\"
# prediction window size
wndw.size <- 100000
# name outfile
outfile <- paste("data_SIM.Humans.",wndw.size/1000,"kb.mapB37.txt",sep="")
# ---

# files
tgus <- read.table(paste(path, "data_HumanB37.chromosomes.txt", sep=""), sep="\t", header=TRUE)
# define function 'roundUp'
roundUp <- function(x,to) {to*(x%/%to + as.logical(x%%to))}
roundDown <- function(x,to) {to*(x%/%to - as.logical(x%%to))}

# combine linkage chromosome files
file_list <- list.files(paste(path,"smoothed_map_b37_4\\",sep=""))
for (ll in 1:length(file_list)) {
	# if the merged dataset doesn't exist, create it
	if (!exists("markers")) {
		markers <- read.table(paste(path,"smoothed_map_b37_4\\",file_list[ll],sep=""), header=TRUE, sep="\t")
	}
	# if the merged dataset does exist, append to it
	else if (exists("markers")) {
		temp_out <- read.table(paste(path,"smoothed_map_b37_4\\",file_list[ll],sep=""), header=TRUE, sep="\t")
		markers <- rbind(markers, temp_out)
		rm(temp_out)
	}
}
markers$Chromosome <- paste("chr",markers$Chromosome,sep="")


# ------------------------------------------------------------------------------------------------------------------
# (2) predict genetic positions with the loess function ----------------------------------------------------------------
# ------------------------------------------------------------------------------------------------------------------
# (2.1) Loop over all chromosomes (including the duplicated ones)
for(i in 1:nrow(tgus)) {
	# select chromosome
	tgu <- tgus[i,]
#	tgu <- tgus[which(tgus$Tgu=="chr1"),]
	# select marker on that chromosome
	marker <- subset(markers, Chromosome==as.character(tgu$Tgu))
	# check whether both genetic and physical positions of markers are decreasing or increasing 
	if(as.character(tgu$Tgu)!="chrX") {if(marker$Build37_map_physical_position[1]<marker$Build37_map_physical_position[nrow(marker)] & marker$Sex.averaged_map_position_smoothed[1]<marker$Sex.averaged_map_position_smoothed[nrow(marker)]) {decr <- FALSE} else {decr <- TRUE}}
	if(as.character(tgu$Tgu)=="chrX") {if(marker$Build37_map_physical_position[1]<marker$Build37_map_physical_position[nrow(marker)] & marker$Female_map_position_smoothed[1]<marker$Female_map_position_smoothed[nrow(marker)]) {decr <- FALSE} else {decr <- TRUE}}
	# count how many markers there are per chromosome. Adjust the span value in the loess function accordingly.
	n.marker <- nrow(marker)
	# Loess smoothing. Optically the smoothed curves fit quite well to the data points with these settings.
	if(as.character(tgu$Tgu)!="chrX") {x <- loess(marker$Sex.averaged_map_position_smoothed~marker$Build37_map_physical_position, span = 10/n.marker, degree=1, control = loess.control(surface = "direct"))}
	if(as.character(tgu$Tgu)=="chrX") {x <- loess(marker$Female_map_position_smoothed~marker$Build37_map_physical_position, span = 10/n.marker, degree=1, control = loess.control(surface = "direct"))}	
	# predict genetic position in windows of size wndw.size
	pos.phys <- seq(1,tgu$length, wndw.size)
	pos.genet <- predict(x,pos.phys)
	# make the smoothed curve monotone increasing or decreasing
	pos.genet <- sort(pos.genet, decreasing = decr)
	pos.phys <- sort(pos.phys, decreasing = decr)
	# create unique marker names at each predicted value
	nams <- matrix(NA,ncol=1,nrow=length(pos.genet))
	for(j in 1:length(pos.genet)) {
		nams[j] <- paste(as.character(tgu$Tgu),".",j,sep="")
	}
	# create the output file: first column chromosome name, second column marker name, third column position in bp, forth column genetic position (prediction from loess)
	out <- cbind(pos.phys,pos.genet)
	out <- data.frame(out)
	out <- out[with(out, order(pos.phys)), ]
	out <- cbind(tgu$Tgu,nams,out)
	colnames(out) <- c("Chromosome","Marker","Pos.Phys","Pos.Genet")
	# Let the the maps for each chromosome start at 1 bp and 0 cM
	if(out$Pos.Genet[1]<0) {
		out$Pos.Genet <- out$Pos.Genet+abs(out$Pos.Genet[1])
	}
	if(out$Pos.Genet[1]>0) {
		out$Pos.Genet <- out$Pos.Genet-abs(out$Pos.Genet[1])
	}
	# write to hard drive
	if(i==1) {
		write.table(out,paste(path, outfile, sep=""), append=FALSE, row.names=FALSE, col.names=TRUE, sep="\t", quote=FALSE, eol="\n")
	} else {
		write.table(out,paste(path, outfile, sep=""), append=TRUE, row.names=FALSE, col.names=FALSE, sep="\t", quote=FALSE, eol="\n")
	}
}

# (2.2) Visualize the linkage map on each chromosome
link.maps <- read.table(paste(path,outfile, sep=""), sep="\t", header=TRUE)
par(mfrow=c(6,4))
par(mar=c(3.5, 3.5, 0.9, 0.5))
par(mgp=c(2,0.5,0))
#par(mfrow=c(1,1))
for(i in 1:nrow(tgus)) {
	# select chromosome
	tgu <- tgus[i,]
	# select marker on chromosome
	marker <- subset(markers, Chromosome==as.character(tgu$Tgu))
	link.map <- subset(link.maps,Chromosome==as.character(tgu$Tgu))
	plot(link.map$Pos.Phys, link.map$Pos.Genet, col='red', lwd=2, type="n", xlab=paste(tgu$Tgu," - physical", sep=""), ylab=paste(tgu$Tgu," - genetic", sep=""), ylim=c(0,max(link.map$Pos.Genet,marker$cM)))
	if(marker$Build37_map_physical_position[1]>marker$Build37_map_physical_position[nrow(marker)]) {
		points(abs(marker$Build37_map_physical_position-max(marker$Build37_map_physical_position)),marker$Sex.averaged_map_position_smoothed)
	} else {
		points(marker$Build37_map_physical_position,marker$Sex.averaged_map_position_smoothed)
	}
	points(link.map$Pos.Phys, link.map$Pos.Genet, col='red', lwd=2, type="l")

}

#####################################################################################################################################################################
#####################################################################################################################################################################
#####################################################################################################################################################################

