#------------------------------------------------------------------------
# Script to simulate a low-quality reference genome based on human genome
#------------------------------------------------------------------------


#------------------------------------------------------------------------
# Functions
#------------------------------------------------------------------------

getStart1 <- function(chromosome, telomere_len_autosomes, pseudo_autosomal, buffer){
  # get start: after first telomere or PAR
  if (chromosome$type == "autosome"){
    return(telomere_len_autosomes + buffer)
  } else if (chromosome$chr == "X"){
    return(pseudo_autosomal$stop[1] + buffer)
  } else if (chromosome$chr == "Y") {
    return(pseudo_autosomal$stop[3] + buffer)
  } else {
    stop("Unknown case")
  }
}

getStop1 <- function(centromere, buffer){
  # get stop: before begin of centromere
  return(centromere$start - buffer)
}

getStart2 <- function(centromere, buffer){
  # get start: after end of centromere
  return(centromere$stop + buffer)
}

getStop2 <- function(chromosome, telomere_len_autosomes, pseudo_autosomal, buffer){
  # get end: before second telomere or PAR
  if (chromosome$type == "autosome"){
    return(chromosome$length - telomere_len_autosomes - buffer)
  } else if (chromosome$chr == "X"){
    return(pseudo_autosomal$start[2] - buffer)
  } else if (chromosome$chr == "Y") {
    return(pseudo_autosomal$start[4] - buffer)
  } else {
    stop("Unknown case")
  }
}

#------------------------------------------------------------------------
# Read chromosome information and define usable regions
#------------------------------------------------------------------------

# read chromosome lengths and sort

chromosomes <- read.table("~/ownCloud - Madleina Caduff (unifr.ch)@drive.switch.ch/sexEstimation/lowQualityReference/Homo_sapiens.GRCh38.dna.primary_assembly_onlyChromosomes.fa.fai")[,1:2]
chromosomes <- chromosomes[chromosomes$V1 != "MT",]
tmp_autosomes <- chromosomes[1:22,]
tmp_autosomes <- tmp_autosomes[sort(as.numeric(tmp_autosomes$V1), index.return = T)$ix,]
chromosomes <- rbind(tmp_autosomes, chromosomes[23:24,])
names(chromosomes) <- c("chr", "length")
chromosomes$type <- c(rep("autosome", 22), rep("gonosome", 2))

# read centromeric regions
# (downloaded from https://www.ncbi.nlm.nih.gov/grc/human on 31.08.2023)

centromeres <- read.table("~/ownCloud - Madleina Caduff (unifr.ch)@drive.switch.ch/sexEstimation/lowQualityReference/Modeled_regions_for_GRCh38.tsv")
centromeres <- centromeres[centromeres$V1 != "HET7",] # remove heterochromatin region, we only want centromeres
names(centromeres) <- c("name", "chr", "start", "stop", "length")

if (any(chromosomes$chr != centromeres$chr)){ stop("Chromosomes don't match") }

# define telomeric regions
# autosomes: telomeres comprise 5–15 kb https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4027178/
# sex chromosomes: pseudo-autosomal regions are at the telomeres
# (copied from https://www.ncbi.nlm.nih.gov/grc/human on 31.08.2023)

telomere_len_autosomes <- 15000

pseudo_autosomal <- data.frame("name" = c("PAR_1", "PAR_2", "PAR_1", "PAR_2"), 
                               "chr" = c(rep("X", 2), rep("Y", 2)), 
                               "start" = c(10001, 155701383, 10001, 56887903),
                               "stop" = c(2781479, 156030895, 2781479, 57217415))

# define buffer: around centromeres, telomeres and PARs
buffer <- 100000

# create regions: per chromosome
regions <- as.data.frame(matrix(NA, nrow = nrow(chromosomes), ncol = 5))
names(regions) <- c("chr", "start_1", "stop_1", "start_2", "stop_2")
regions$chr <- chromosomes$chr

for (c in 1:nrow(chromosomes)){
  regions$start_1[c] <- getStart1(chromosome = chromosomes[c,], telomere_len_autosomes, pseudo_autosomal, buffer)
  regions$stop_1[c] <- getStop1(centromere = centromeres[c,], buffer)
  
  regions$start_2[c] <- getStart2(centromere = centromeres[c,], buffer)
  regions$stop_2[c] <- getStop2(chromosome = chromosomes[c,], telomere_len_autosomes, pseudo_autosomal, buffer)
}

if (any(regions$stop_1 < regions$start_1) | any(all(regions$stop_2 < regions$start_2))){ stop("something went wrong with regions")}

#------------------------------------------------------------------------
# Split regions into chunks
#------------------------------------------------------------------------

chunk_sizes <- c(10^6,
                 5*10^5, 2*10^5, 10^5,
                 5*10^4, 2*10^4, 10^4
                 )

chunks <- data.frame(matrix(NA, nrow = 0, ncol = 4))
names(chunks) <- c("chr", "chunk", "start", "stop")
for (c in 1:nrow(regions)){ # loop over chromosomes
  counter <- 1
  for (arm in 1:2){ # loop over the two arms of a chromosome
    start <- regions[c, 2*(arm-1) + 2]
    stop  <- regions[c, 2*(arm-1) + 3]

    cur_start <- start
    cur_junk_size_index <- 1
    finished <- FALSE
    while((stop - cur_start) > min(chunk_sizes)){
      chunk_start <- cur_start
      chunk_stop <- chunk_start + chunk_sizes[cur_junk_size_index]
      while (chunk_stop > stop){ # chunk exceeds region
        cur_junk_size_index <- cur_junk_size_index + 1 # try next-smaller chunk size
        if (cur_junk_size_index > length(chunk_sizes)){  # reached the end
          finished <- TRUE; 
          break;
        }
        chunk_stop <- chunk_start + chunk_sizes[cur_junk_size_index]
      }
      if (finished){ break; } # reached the end
      chunks[nrow(chunks) + 1, 1:2] <- c(regions$chr[c], counter)
      chunks[nrow(chunks), 3:4]   <- c(chunk_start, chunk_stop)
      
      # increment start
      counter <- counter + 1
      cur_start <- chunk_stop + 1
      cur_junk_size_index <- cur_junk_size_index + 1
      if (cur_junk_size_index > length(chunk_sizes)){ # re-cycle chunk sizes
        cur_junk_size_index <- 1
      }
    }
  }
}

# check

for (c in 1:nrow(regions)){
  chunk <- chunks[chunks$chr == regions$chr[c],3:4]
  if (any(chunk$stop <= chunk$start)){ stop ("invalid") }
  if (any(!(chunk$stop - chunk$start %in% chunk_sizes))){ stop ("invalid") }
  
  if (any(chunk$start < regions$start_1[c])){ stop("start_1 invalid"); }
  if (any(chunk$start > regions$stop_2[c])){ stop("stop_2 invalid"); }
  if (any(chunk$start > regions$stop_1[c] & chunk$stop < regions$start_2[c])){ stop("stop_2 invalid"); }
  
  print(regions$chr[c])
  chunk <- chunks[chunks$chr == regions$chr[c],]
  print(table(chunk$stop - chunk$start))
  
  plot(chunk$start, chunk$start - chunk$stop, type = "l")
}

#------------------------------------------------------------------------
# Write to file
#------------------------------------------------------------------------

write.table(chunks, file = "~/ownCloud - Madleina Caduff (unifr.ch)@drive.switch.ch/sexEstimation/lowQualityReference/chunks.txt",
            append = F, quote = F, row.names = F, col.names = F)






fai <- read.table("/data/sex_estimation/aDNA/lowQualityReference/Homo_sapiens.GRCh38_lowQuality.fa.fai")
dim(fai)

plot(fai$V2)

table(as.character(sapply(fai$V1, function(x) return(str_split(x, "_")[[1]][1])))) == table(chunks$chr)

