#----------------------------------------
# Script to downsample counts
#----------------------------------------

# get args from command line
# Expected positional arguments:
#   1: counts table (read with read.table, header expected)
#   2: prefix used for every output file name
#   3: state posterior table (read with read.table, header expected)
#   4: replicate number (integer; used as RNG seed and in output file names)
args <- commandArgs(trailingOnly=TRUE)

# Fail early with a usage message: a missing argument would otherwise become
# NA and only surface later as an unrelated read.table / set.seed error.
if (length(args) < 4) {
  stop("usage: Rscript <script> <counts file> <output prefix> <state posterior file> <replicate>")
}

filename <- args[1]
prefixOut <- args[2]
filenameStatePos <- args[3]

# as.integer() yields NA (with a warning) for non-numeric text; report that
# explicitly instead of letting the NA reach set.seed().
replicate <- suppressWarnings(as.integer(args[4]))
if (is.na(replicate)) {
  stop("replicate must be an integer, got: ", args[4])
}

#----------------------------------------

# Look up the multipliers applied to the X and Y read counts when simulating
# a given karyotype from a sample of a given sex.
#
# Arguments:
#   s          sex of the source sample: "XY" or "XX"
#   karyotype  karyotype to simulate; supported values are
#              "XY", "XXY", "XYY", "XXYY" for s == "XY" and
#              "XX", "X", "XXX" for s == "XX"
#
# Returns list(factorX=, factorY=) with the two numeric multipliers.
# Stops with "unknown sex - karyotype combination!" for any other input,
# including NA or non-scalar arguments.
getFactorXY <- function(s, karyotype){
  # c(factorX, factorY) for every supported "<sex>/<karyotype>" pair
  knownFactors <- list(
    "XY/XY"   = c(1,   1),
    "XY/XXY"  = c(2,   1),
    "XY/XYY"  = c(1,   2),
    "XY/XXYY" = c(2,   2),
    "XX/XX"   = c(1,   1),
    "XX/X"    = c(0.5, 1),
    "XX/XXX"  = c(1.5, 1)
  )

  fac <- NULL
  # Only scalar inputs can name a single combination; NA is pasted as "NA"
  # and therefore never matches a table entry.
  if (length(s) == 1L && length(karyotype) == 1L) {
    fac <- knownFactors[[paste(s, karyotype, sep = "/")]]
  }

  if (is.null(fac)) {
    stop("unknown sex - karyotype combination!")
  }
  list(factorX = fac[[1]], factorY = fac[[2]])
}

# Simulate a sex-chromosome karyotype from one sample's per-contig read counts
# and downsample the result to a fixed number of reads.
#
# Arguments:
#   x          per-contig counts of ONE sample: a one-row data frame, a list
#              or a named vector; it must contain elements named "X" and "Y"
#   s          sex of the sample, passed on to getFactorXY
#   numReads   number of reads to draw
#   karyotype  karyotype to simulate, passed on to getFactorXY
#
# Returns an unnamed vector with one count per contig, in the order of x.
# Reads are drawn with replacement; each contig's probability is its count
# (X and Y counts multiplied by their karyotype factors) divided by the sum
# of these adjusted counts.
simulateAneuploidsAndDownsample <- function(x, s, numReads, karyotype) {
  if (is.data.frame(x) && nrow(x) != 1L) {
    stop("x must hold the counts of exactly one sample")
  }

  # Work on a plain vector so both exits of this function return the same
  # type (callers rbind() the results row by row).
  contigCounts <- unlist(x)
  numContigs <- length(contigCounts)

  # Not more reads than requested: hand the counts back untouched.
  # NOTE(review): in this case the X/Y factors are NOT applied, so the
  # returned counts do not reflect the requested karyotype - confirm intended.
  if (sum(contigCounts) <= numReads) { return(unname(contigCounts)) }

  # Locate the sex chromosomes by name instead of assuming 22 autosomes
  # followed by X and Y.
  idxX <- match("X", names(contigCounts))
  idxY <- match("Y", names(contigCounts))
  if (is.na(idxX) || is.na(idxY)) {
    stop("counts must contain contigs named 'X' and 'Y'")
  }

  # Scale the sex-chromosome counts by the karyotype-specific factors
  fac <- getFactorXY(s, karyotype)
  adjusted <- as.numeric(contigCounts)
  adjusted[idxX] <- adjusted[idxX] * fac$factorX
  adjusted[idxY] <- adjusted[idxY] * fac$factorY

  # sum(adjusted) equals N + (factorX - 1) * X + (factorY - 1) * Y
  probs <- adjusted / sum(adjusted)

  # Sample contigs based on the probabilities
  sampledContigs <- sample.int(numContigs, size = numReads, replace = TRUE, prob = probs)
  # Count the number of occurrences for each sampled contig
  tabulate(sampledContigs, nbins = numContigs)
}

# Seed the RNG with the replicate number so a replicate can be reproduced
set.seed(replicate)

# Read counts; columns 3..ncol hold the per-contig counts, the columns
# `individual` and `sequencing_type` are used as sample metadata below
counts <- read.table(filename, header = TRUE, check.names = FALSE)
actual_counts <- counts[, 3:ncol(counts), drop = FALSE]
if (any(rowSums(actual_counts) < 20000)){ stop("Not enough counts") }

# Read state posterior s: for every column take the (0-based) index of the
# row with the highest value
statePos_s <- read.table(filenameStatePos, header = TRUE, check.names = FALSE)
max_s <- as.numeric(apply(statePos_s, 2, which.max) - 1)
if (any(max_s > 1)){ stop("Detected aneuploids!") }

# Every row of the counts table needs a state; a missing one would index the
# karyotype list with NA, which yields NULL and silently skips the individual.
n_samples <- nrow(actual_counts)
if (length(max_s) < n_samples) {
  stop("State posterior file has fewer individuals than the counts file")
}

# number of reads to downsample to in each round
reads_per_round <- c(200000, 100000, 50000, 20000, 10000, 5000, 2000, 1000, 500, 200, 100)

# karyotypes to simulate and sex label, both indexed by state + 1
# (state 0 -> first entry, state 1 -> second entry)
karyo <- list(male = c("XY", "XXY", "XYY", "XXYY"), female = c("XX", "X", "XXX"))
sexLabels <- c("XY", "XX")
state_idx <- max_s[seq_len(n_samples)] + 1

# one output row per (individual, karyotype) pair
n_out <- sum(lengths(karyo)[state_idx])

for (roundIdx in seq_along(reads_per_round)){
  # simulate aneuploids and downsample; results are written into a
  # preallocated matrix instead of growing it with rbind()
  aneuploid_counts <- matrix(NA, nrow = n_out, ncol = ncol(actual_counts))
  sampleNames <- character(n_out)
  outRow <- 0L
  for (i in seq_len(n_samples)){
    sex <- sexLabels[state_idx[i]]
    for (k in karyo[[state_idx[i]]]){
      outRow <- outRow + 1L
      simulated <- simulateAneuploidsAndDownsample(actual_counts[i, ], sex,
                                                   reads_per_round[roundIdx], k)
      # flatten so that a list-like (data frame) result and a plain vector
      # are stored the same way
      aneuploid_counts[outRow, ] <- unlist(simulated, use.names = FALSE)
      sampleNames[outRow] <- paste0(counts$individual[i], "_", k)
    }
  }

  # construct output table: sample name, sequencing type, counts
  # NOTE(review): the sequencing type of the FIRST individual is used for
  # all rows - confirm all individuals share one type
  full <- cbind(sampleNames,
                rep(as.character(counts$sequencing_type)[1], length(sampleNames)),
                aneuploid_counts)
  colnames(full) <- colnames(counts)

  # write file
  outname <- paste0(prefixOut, "_downsampled_rep_", replicate, "_round_", roundIdx, ".txt")
  write.table(x = full, file = outname, append = FALSE, quote = FALSE, sep = "\t",
              row.names = FALSE, col.names = TRUE)
}

# record the read depth used in each round (one value per line, round order)
outname <- paste0(prefixOut, "_rep_", replicate, "_reads_per_round.txt")
write.table(x = reads_per_round, file = outname, append = FALSE, quote = FALSE, sep = "\t",
            row.names = FALSE, col.names = FALSE)
