#----------------------------------------
# Script to downsample counts
#----------------------------------------

# Read the four positional command-line arguments:
#   1. filename  - path of the count table to read
#   2. prefixOut - prefix of every output file name
#   3. replicate - integer replicate number (also used as the RNG seed)
#   4. factor    - multiplier applied to the counts of contig "21"
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 4L) {
  stop(
    "Expected 4 arguments: <filename> <prefixOut> <replicate> <factor>",
    call. = FALSE
  )
}
filename <- args[1]
prefixOut <- args[2]
# Non-numeric text converts to NA; the coercion warning is silenced because
# the NA is turned into an explicit error right below.
replicate <- suppressWarnings(as.integer(args[3]))
factor <- suppressWarnings(as.numeric(args[4]))
if (is.na(replicate)) {
  stop("Argument <replicate> must be an integer, got: ", args[3],
       call. = FALSE)
}
# A negative or non-finite factor would yield invalid sampling probabilities.
if (!is.finite(factor) || factor < 0) {
  stop("Argument <factor> must be a finite, non-negative number, got: ",
       args[4], call. = FALSE)
}

#----------------------------------------

# Simulate a copy-number change of contig "21" and downsample to a fixed depth.
#
# The count of the contig named "21" is multiplied by `factor` (for example,
# 1.5 models a trisomy: three copies instead of two). The scaled counts are
# converted to sampling probabilities and `numReads` reads are then drawn with
# replacement from the contigs according to those probabilities.
#
# Arguments:
#   x        - counts per contig, named by contig: either a one-row data frame
#              (one column per contig) or a named numeric vector. Exactly one
#              entry must be named "21".
#   numReads - number of reads to draw.
#   factor   - multiplier applied to the "21" count before sampling.
#
# Returns:
#   A numeric vector holding one sampled count per contig, in the order of
#   `x`; the counts sum to `numReads`. If `x` contains `numReads` reads or
#   fewer, nothing is sampled and `x` is returned unchanged, i.e. WITHOUT the
#   factor applied.
simulateTrisomy21AndDownsample <- function(x, numReads, factor) {
  # Work on a plain numeric vector so data frames and vectors behave alike.
  contigCounts <- as.numeric(unlist(x, use.names = FALSE))
  total <- sum(contigCounts)

  # Too few reads to downsample: hand the input back untouched.
  if (total <= numReads) {
    return(x)
  }

  # %in% never yields NA, and gives logical(0) when `x` has no names.
  is21 <- names(x) %in% "21"
  if (length(contigCounts) != length(is21)) {
    stop("x must hold exactly one named count per contig", call. = FALSE)
  }
  if (sum(is21) != 1L) {
    stop("x must contain exactly one contig named \"21\"", call. = FALSE)
  }

  count21 <- contigCounts[is21]
  # Total weight after the "21" count has been scaled by `factor`.
  denom <- total + (factor - 1) * count21
  probs <- contigCounts / denom
  probs[is21] <- (count21 * factor) / denom

  # Draw one contig index per read, then count the draws per contig.
  numContigs <- length(contigCounts)
  sampledContigs <- sample.int(
    numContigs,
    size = numReads,
    replace = TRUE,
    prob = probs
  )
  as.numeric(tabulate(sampledContigs, nbins = numContigs))
}

# Seed the RNG with the replicate number so each replicate is reproducible.
set.seed(replicate)

# Read counts. Columns 3 onwards are treated as the per-contig counts; the
# `individual` and `sequencing_type` columns are copied to the output.
counts <- read.table(filename, header = TRUE, check.names = FALSE)
if (ncol(counts) < 3L) {
  stop("Count table must have at least one count column after column 2",
       call. = FALSE)
}
# drop = FALSE keeps a data frame even when there is a single count column.
actual_counts <- counts[, 3:ncol(counts), drop = FALSE]
if (any(rowSums(actual_counts) < 20000)) {
  stop("Not enough counts")
}

# Target read depth of each downsampling round. Stored as integers so they
# are written as plain digits (doubles would be written as e.g. "2e+05").
reads_per_round <- c(
  200000L, 100000L, 50000L, 20000L, 10000L, 5000L,
  2000L, 1000L, 500L, 200L, 100L
)

for (round_idx in seq_along(reads_per_round)) {
  # Simulate the chromosome 21 change and downsample every row (individual).
  trisomy_counts <- matrix(
    NA_real_,
    nrow = nrow(actual_counts),
    ncol = ncol(actual_counts)
  )
  for (i in seq_len(nrow(actual_counts))) {
    trisomy_counts[i, ] <- as.numeric(
      simulateTrisomy21AndDownsample(
        actual_counts[i, , drop = FALSE],
        reads_per_round[round_idx],
        factor
      )
    )
  }

  # Convert counts to text without scientific notation; as.character() on
  # doubles would turn a count such as 100000 into "1e+05".
  count_text <- format(
    trisomy_counts,
    scientific = FALSE,
    trim = TRUE,
    digits = 15
  )

  # Assemble the output table. as.character() guards against factor columns,
  # which cbind() would otherwise replace by their integer codes.
  full <- cbind(
    as.character(counts$individual),
    as.character(counts$sequencing_type),
    count_text
  )
  colnames(full) <- colnames(counts)

  # Write one tab-separated file per round.
  outname <- paste0(
    prefixOut, "_downsampled_rep_", replicate,
    "_round_", round_idx, "_factor21_", factor, ".txt"
  )
  write.table(
    x = full, file = outname, append = FALSE, quote = FALSE,
    sep = "\t", row.names = FALSE, col.names = TRUE
  )
}

# Record the read depth used in each round, one value per line.
outname <- paste0(
  prefixOut, "_rep_", replicate, "_factor21_", factor, "_reads_per_round.txt"
)
write.table(
  x = reads_per_round, file = outname, append = FALSE, quote = FALSE,
  sep = "\t", row.names = FALSE, col.names = FALSE
)

