#!/usr/bin/env Rscript

#----------------------------------------
# Script to downsample counts
#----------------------------------------

# Read the positional command-line arguments:
#   1. filename  - path of the counts table passed to read.table()
#   2. prefixOut - prefix used to build every output file name
#   3. replicate - integer replicate number; also used as the RNG seed
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 3) {
  stop("Usage: Rscript <script> <counts_file> <output_prefix> <replicate>",
       call. = FALSE)
}
filename <- args[1]
prefixOut <- args[2]
# Fail early with a clear message: a non-integer replicate is coerced to NA
# and would otherwise only surface later as an invalid seed in set.seed().
replicate <- suppressWarnings(as.integer(args[3]))
if (is.na(replicate)) {
  stop("replicate must be an integer, got: ", args[3], call. = FALSE)
}

#----------------------------------------

#' Downsample a vector of counts to a fixed number of reads
#'
#' Draws `numReads` reads with replacement, each contig being picked with
#' probability proportional to its count. Because the draw is with
#' replacement, a contig's downsampled count can exceed its original count.
#'
#' @param x Counts, one per contig; anything `as.integer()` can coerce
#'   element-wise (e.g. a numeric vector or a one-row data frame).
#' @param numReads Total number of reads to draw.
#' @return An integer vector with one entry per contig. If the total of `x`
#'   is already `<= numReads`, the integer-coerced counts are returned
#'   unchanged; otherwise the returned counts sum to `numReads`.
downsample <- function(x, numReads) {
  x <- as.integer(x)
  if (anyNA(x)) {
    stop("counts contain values that are missing or not representable as integers",
         call. = FALSE)
  }
  numContigs <- length(x)

  # Sum as double: sum() on an integer vector overflows to NA once the total
  # exceeds .Machine$integer.max, which would break the comparison below.
  totalReads <- sum(as.numeric(x))
  if (totalReads <= numReads) {
    return(x)
  }

  # Probability of drawing each contig is proportional to its count
  probs <- x / totalReads
  # Draw contig indices, one per read
  sampledContigs <- sample.int(numContigs, size = numReads, replace = TRUE, prob = probs)
  # Count how many times each contig was drawn (zeros kept for undrawn contigs)
  tabulate(sampledContigs, nbins = numContigs)
}

# Seed the RNG with the replicate number so each replicate is reproducible
set.seed(replicate)

# Read counts. Columns 1-2 are carried through to the output unchanged;
# columns 3 onward are the counts that get downsampled.
counts <- read.table(filename, header = TRUE, check.names = FALSE)
if (ncol(counts) < 3) {
  # 3:ncol(counts) would silently count backwards for narrower tables
  stop("Expected at least 3 columns (2 leading columns plus counts)", call. = FALSE)
}
if (nrow(counts) == 0) {
  stop("Input table has no rows", call. = FALSE)
}
count_cols <- 3:ncol(counts)

# drop = FALSE keeps a 2-D shape even when there is only one count column
actual_counts <- as.matrix(counts[, count_cols, drop = FALSE])
if (any(rowSums(actual_counts) < 20000)) { stop("Not enough counts") }

# downsample
# Note: rows whose total is at or below a round's target are returned
# un-downsampled by downsample(), so the rounds above the 20000 minimum only
# affect rows that actually have more reads than the target.
reads_per_round <- c(200000, 100000, 50000, 20000, 10000, 5000, 2000, 1000, 500, 200, 100)
for (round_idx in seq_along(reads_per_round)) {

  # downsample every row independently to this round's read target
  downsampled_counts <- matrix(NA_integer_,
                               nrow = nrow(actual_counts),
                               ncol = ncol(actual_counts))
  for (i in seq_len(nrow(actual_counts))) {
    downsampled_counts[i, ] <- downsample(actual_counts[i, ], reads_per_round[round_idx])
  }

  # construct data frame: original leading columns plus downsampled counts
  full <- counts
  full[, count_cols] <- downsampled_counts

  # write file
  outname <- paste0(prefixOut, "_downsampled_rep_", replicate, "_round_", round_idx, ".txt")
  write.table(x = full, file = outname, append = FALSE, quote = FALSE, sep = "\t",
              row.names = FALSE, col.names = TRUE)
}

# Record the read target of each round, one per line. The values are formatted
# explicitly because write.table() may render doubles such as 200000 in
# scientific notation (2e+05).
outname <- paste0(prefixOut, "_rep_", replicate, "_reads_per_round.txt")
write.table(x = format(reads_per_round, scientific = FALSE, trim = TRUE),
            file = outname, append = FALSE, quote = FALSE, sep = "\t",
            row.names = FALSE, col.names = FALSE)

