#####################################################################################################################################################################
########################################################## Create linkage map for use in my own simulation ##########################################################
#####################################################################################################################################################################
# Written 19/12/2013
# This script takes as input a chromosome file and a linkage map file.
## The chromosome file should contain the following columns: "Tgu", "length" (Tgu gives the chromosome name, length is the chromosome length in bp)
## The linkage map file should contain the following columns: "SNP", "position", "cM", "chromosome" (SNP name, position (bp), position (Kosambi cM), chromosome name (same as "Tgu"))
# The following input parameters must be provided:
## (1) n chromosomes needed ('nn'),
## (2) the estimated genome size in bp ('gsize'),
## (3) the estimated genome size in cM ('cM.size'),
## (3) the window size in which loess is going to predict recombination events ('wndw.size').
## ------------------------------------------------------------------------------------------------------------------

# Then the script works as follows:
# (1) It creates additional n chromosomes by duplicating other chromosomes (in the end there are nn), all in all totaling the estimated genome size.
## The zebra finch genome is estimated to be 1.2 Gb in size and consists of 39 autosomes + TguZ.
## (1.1) Calculate the number of base pairs needed and remove chromosomes with two few (<10) marker.
## (1.2) For duplication: Remove chromosomes that are too long (macrochromosomes are definitively not missing in the assembly, thus remove all chromosomes > 20,000,000 bp).
## (1.3) Get all possible combinations (without repetition) of the remaining chromosomes and replace chromosome names with their length.
## (1.4) Sum up each combination.
## (1.5) Select those combinations that sum up closest to the genome size minus the assembled chromosomes.
## (1.6) Of those combinations that sum up closest to the needed size select those with the least duplications per chromosome.
## (1.7) Select one of the best combinations randomly.
## (1.8) Duplicate markers on these selected chromosomes (and give them a unique name).
## (1.9) Duplicate the selected chromosomes (and give them a unique name).
## (1.10) How long is the genome in total after duplicating chromosomes? 
# (2) It predicts genetic positions with the loess function in windows of specified size ('wndw.size').
## (2.1) Loop over all chromosomes (including the duplicated ones). Optically the loess smoothed curves fit quite well to the data points with the given settings.
## A SIM.map.txt file is created which can be used directly in the Monte Carlo gene dropping simulation.
# (2.2) Visualize the linkage map on each chromosome.

rm(list=ls())
# increase R's recursion limit for the combinations (you need to do it when n>45)
options(expressions=5e5)
#install.packages("gtools")
library(gtools)
require(sqldf)
# ------------------------------------------------------------------------------------------------------------------
# (0) inputs -----------------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------------------------
# define path to files here
path = "C:\\Users\\Data\\"
# n chromosomes needed
nn <- 39
# estimated genome size - physical
g.size <- 1222864721-72861351
# estimated genome size - genetic (average 45.45 rec events on all autosomes + 2.01 rec events on Z chromosome)
cM.size <- (45.45)*50
# prediction window size
wndw.size <- 100000
# name outfile
outfile <- paste("data_ZF324.SIM",wndw.size/1000,"kb.map.txt",sep="")
# ---

# files
tgus <- read.table(paste(path, "data_ZF324.chromosomes.txt", sep=""), sep="\t", header=TRUE)
tgus <- subset(tgus,Tgu!="chrZ")
markers <- read.table(paste(path, "data_ZF324.SNPinfoRANDOM_8_13.txt", sep=""), sep="\t", header=TRUE)
markers <- subset(markers,chromosome!="chrZ")
# remove marker on Tgu4A that screws up the whole linkage map
markers <- subset(markers, SNP!="A25817" & SNP!="A03165" & SNP!="A01215")
# define function 'roundUp'
roundUp <- function(x,to) {to*(x%/%to + as.logical(x%%to))}
roundDown <- function(x,to) {to*(x%/%to - as.logical(x%%to))}

# ------------------------------------------------------------------------------------------------------------------
# (1) create additional n chromosomes (through duplication of existing ones) ---------------------------------------------
# ------------------------------------------------------------------------------------------------------------------
# (1.1) Round up the physical size of each chromosome and the whole genome size to the nearest wndw.size
tgus$length <- roundUp(tgus$length,wndw.size)
g.size <- roundUp(g.size,wndw.size)


# (1.2) Calculate the number of base pairs needed and remove chromosomes with two few (<10) marker
n.marker <- table(markers$chromosome)
keep <- n.marker[n.marker>=10]
filtered <- tgus$Tgu %in% names(keep)
tgus <- tgus[filtered,]


# (1.3) For each remaining chromosome use the loess function (see below) to get an estimate of the chromosome size in cM
for(i in 1:nrow(tgus)) {
	tgu <- tgus[i,]
#	tgu <- tgus[which(tgus$Tgu=="chr1"),]
	# select marker on that chromosome
	marker <- subset(markers, chromosome==as.character(tgu$Tgu))
	# check whether both genetic and physical positions of markers are decreasing or increasing 
	if(marker$position[1]<marker$position[nrow(marker)] & marker$cM[1]<marker$cM[nrow(marker)]) {decr <- FALSE} else {decr <- TRUE}
	# count how many markers there are per chromosome. Adjust the span value in the loess function accordingly.
	n.marker <- nrow(marker)
	# Loess smoothing. Optically the smoothed curves fit quite well to the data points with these settings.
	x <- loess(marker$cM~marker$position, span = 10/n.marker, degree=1, control = loess.control(surface = "direct"))
	# predict genetic position in windows of size wndw.size
	pos.phys <- seq(1,tgu$length+1, wndw.size)
	pos.genet <- predict(x,pos.phys)
	# make the smoothed curve monotone increasing or decreasing
	pos.genet <- sort(pos.genet, decreasing = decr)
	pos.phys <- sort(pos.phys, decreasing = decr)
	out <- cbind(pos.phys,pos.genet)
	out <- data.frame(out)
	out <- out[with(out, order(pos.phys)), ]
	if(out$pos.genet[1]<0) {
		out$pos.genet <- out$pos.genet+abs(out$pos.genet[1])
	}
	if(out$pos.genet[1]>0) {
		out$pos.genet <- out$pos.genet-abs(out$pos.genet[1])
	}
	tgus$cM_predicted[i] <- max(out$pos.genet)
}
tgus$cM_predicted[which(tgus$cM_predicted<50)] <- 50

# (1.4) For duplication: Remove chromosomes that are too long (macrochromosomes are definitively not missing in the assembly, thus remove all chromosomes > 60,000,000 bp)
length.covered.bp <- sum(tgus$length)
length.covered.cM <- sum(tgus$cM_predicted)
length.needed.bp <- g.size-length.covered.bp
length.needed.cM <- cM.size-length.covered.cM
chromos.needed <- nn-nrow(tgus)

chr.add.length.bp <- length.needed.bp/chromos.needed
chr.add.length.bp <- roundUp(chr.add.length.bp,wndw.size)
chr.add.length.cM <- length.needed.cM/chromos.needed 
if(chr.add.length.cM<50) {chr.add.length.cM <- 50}
wndws <- chr.add.length.bp/wndw.size
tgus.dup <- data.frame(matrix(c(rep(NA,4*((wndws+1)*chromos.needed))),ncol=4))
colnames(tgus.dup) <- c("Chromosome","Marker","Pos.Phys","Pos.Genet")
k=1
for(i in 1:chromos.needed) {
	for(j in 0:wndws) {
		tgus.dup$Chromosome[k] <- paste("chr",i,".1",sep="")
		tgus.dup$Marker[k] <- paste(tgus.dup$Chromosome[k],".",j+1,sep="")
		tgus.dup$Pos.Phys[k] <- j*wndw.size+1
		tgus.dup$Pos.Genet[k] <- chr.add.length.cM/wndws*j
		k <- k+1
	}
}



# ------------------------------------------------------------------------------------------------------------------
# (2) predict genetic positions with the loess function ----------------------------------------------------------------
# ------------------------------------------------------------------------------------------------------------------
# (2.1) Loop over all chromosomes (including the duplicated ones)
for(i in 1:nrow(tgus)) {
	# select chromosome
	tgu <- tgus[i,]
#	tgu <- tgus[which(tgus$Tgu=="chr1"),]
	# select marker on that chromosome
	marker <- subset(markers, chromosome==as.character(tgu$Tgu))
	# check whether both genetic and physical positions of markers are decreasing or increasing 
	if(marker$position[1]<marker$position[nrow(marker)] & marker$cM[1]<marker$cM[nrow(marker)]) {decr <- FALSE} else {decr <- TRUE}
	# count how many markers there are per chromosome. Adjust the span value in the loess function accordingly.
	n.marker <- nrow(marker)
	# Loess smoothing. Optically the smoothed curves fit quite well to the data points with these settings.
	x <- loess(marker$cM~marker$position, span = 10/n.marker, degree=1, control = loess.control(surface = "direct"))
	# predict genetic position in windows of size wndw.size
	pos.phys <- seq(1,tgu$length+1, wndw.size)
	pos.genet <- predict(x,pos.phys)
	# make the smoothed curve monotone increasing or decreasing
	pos.genet <- sort(pos.genet, decreasing = decr)
	pos.phys <- sort(pos.phys, decreasing = decr)
	# create unique marker names at each predicted value
	nams <- matrix(NA,ncol=1,nrow=length(pos.genet))
	for(j in 1:length(pos.genet)) {
		nams[j] <- paste(as.character(tgu$Tgu),".",j,sep="")
	}
	# create the output file: first column chromosome name, second column marker name, third column position in bp, forth column genetic position (prediction from loess)
	out <- cbind(pos.phys,pos.genet)
	out <- data.frame(out)
	out <- out[with(out, order(pos.phys)), ]
	out <- cbind(tgu$Tgu,nams,out)
	colnames(out) <- c("Chromosome","Marker","Pos.Phys","Pos.Genet")
	# Let the the maps for each chromosome start at 1 bp and 0 cM
	if(out$Pos.Genet[1]<0) {
		out$Pos.Genet <- out$Pos.Genet+abs(out$Pos.Genet[1])
	}
	if(out$Pos.Genet[1]>0) {
		out$Pos.Genet <- out$Pos.Genet-abs(out$Pos.Genet[1])
	}
	# write to hard drive
	if(i==1) {
		write.table(out,paste(path, outfile, sep=""), append=FALSE, row.names=FALSE, col.names=TRUE, sep="\t", quote=FALSE, eol="\n")
	} else {
		write.table(out,paste(path, outfile, sep=""), append=TRUE, row.names=FALSE, col.names=FALSE, sep="\t", quote=FALSE, eol="\n")
	}
}
write.table(tgus.dup,paste(path, outfile, sep=""), append=TRUE, row.names=FALSE, col.names=FALSE, sep="\t", quote=FALSE, eol="\n")


# (1.13) How long is the genome in total after duplicating chromosomes physically and genetically?
link.maps <- read.table(paste(path,outfile, sep=""), sep="\t", header=TRUE)
bp <- 0
cM <- 0
for(i in 1:length(unique(link.maps$Chromosome))) {
	chr <- subset(link.maps,Chromosome==unique(link.maps$Chromosome)[i])
	bp <- max(chr$Pos.Phys) + bp
	if(max(chr$Pos.Genet)<50) {cM <- 50+cM} else {cM <- max(chr$Pos.Genet)+cM}
	}
bp # 1150100000
cM # 2272




# (2.2) Visualize the linkage map on each chromosome
link.maps <- read.table(paste(path,outfile, sep=""), sep="\t", header=TRUE)
par(mfrow=c(5,5))
par(mar=c(3.5, 3.5, 0.9, 0.5))
par(mgp=c(2,0.5,0))
#par(mfrow=c(1,1))
for(i in 1:nrow(tgus)) {
	# select chromosome
	tgu <- tgus[i,]
#	tgu <- tgus[which(tgus$Tgu=="chr13"),]
	# select marker on chromosome
	marker <- subset(markers, chromosome==as.character(tgu$Tgu))
	link.map <- subset(link.maps,Chromosome==as.character(tgu$Tgu))
	plot(link.map$Pos.Phys, link.map$Pos.Genet, col='red', lwd=2, type="n", xlab=paste(tgu$Tgu," - physical", sep=""), ylab=paste(tgu$Tgu," - genetic", sep=""), ylim=c(0,max(link.map$Pos.Genet,marker$cM)))
	if(marker$position[1]>marker$position[nrow(marker)]) {
		points(abs(marker$position-max(marker$position)),marker$cM)
	} else {
		points(marker$position,marker$cM)
	}
	points(link.map$Pos.Phys, link.map$Pos.Genet, col='red', lwd=2, type="l")

}

#####################################################################################################################################################################
#####################################################################################################################################################################
#####################################################################################################################################################################

