#!/usr/bin/env Rscript

#----------------------------------------
# Script to downsample counts
#----------------------------------------

# Parse and validate the positional command-line arguments:
#   1. filename             - path of the counts table to read
#   2. minimumNumberOfReads - read threshold and downsampling target (integer)
#   3. prefixOut            - prefix for the output file names
#   4. replicate            - replicate number (integer); also seeds the RNG
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 4L) {
  stop(
    "Expected 4 arguments: <filename> <minimumNumberOfReads> ",
    "<prefixOut> <replicate>, but got ", length(args),
    call. = FALSE
  )
}

filename <- args[1]
prefixOut <- args[3]

# as.integer() turns unparsable input into NA (with a warning); the warning is
# silenced here because the NA case is reported explicitly right below.
minimumNumberOfReads <- suppressWarnings(as.integer(args[2]))
if (is.na(minimumNumberOfReads)) {
  stop(
    "minimumNumberOfReads must be an integer, got: '", args[2], "'",
    call. = FALSE
  )
}

replicate <- suppressWarnings(as.integer(args[4]))
if (is.na(replicate)) {
  stop("replicate must be an integer, got: '", args[4], "'", call. = FALSE)
}

#----------------------------------------

# Downsample a vector of per-contig read counts to a fixed total.
#
# Arguments:
#   x        - per-contig counts; coerced with as.integer().
#   numReads - target total number of reads.
#
# Returns:
#   An integer vector with one entry per contig. When the total of x is
#   already <= numReads the (integer-coerced) input is returned unchanged.
#   Otherwise numReads contig indices are drawn with replacement, weighted by
#   each contig's share of the total, and the per-contig tally is returned.
#   Because the draws are made with replacement, a single contig's tally can
#   exceed its original count.
downsample <- function(x, numReads) {
  contigCounts <- as.integer(x)
  totalReads <- sum(contigCounts)

  # Already at or below the requested depth: nothing to sample
  if (totalReads <= numReads) {
    return(contigCounts)
  }

  nContigs <- length(contigCounts)

  # Each draw picks one contig index, weighted by its fraction of all reads
  draws <- sample.int(
    n = nContigs,
    size = numReads,
    replace = TRUE,
    prob = contigCounts / totalReads
  )

  # nbins keeps contigs that were never drawn in the result, as zeros
  tabulate(draws, nbins = nContigs)
}

# Seed the RNG with the replicate number so a given replicate is reproducible
set.seed(replicate)

# Read counts. The first two columns are carried through to the output
# unchanged; every column from the third onwards is treated as a count column.
counts <- read.table(filename, header = TRUE, check.names = FALSE)
if (ncol(counts) < 3) {
  # Without this check 3:ncol(counts) would count backwards (e.g. 3:2)
  stop(
    "Expected at least 3 columns in '", filename, "' but found ",
    ncol(counts),
    call. = FALSE
  )
}
count_cols <- 3:ncol(counts)

# only keep individuals that have minimum number of reads
has_enough_reads <- rowSums(counts[count_cols]) >= minimumNumberOfReads
filtered_counts <- counts[has_enough_reads, ]
if (nrow(filtered_counts) == 0) {
  stop("No individuals have at least ", minimumNumberOfReads, " reads!")
}

# downsample
reads_per_round <- minimumNumberOfReads

# drop = FALSE keeps a data frame even when there is only a single count
# column; without it the selection collapses to a plain vector and nrow()
# returns NULL.
actual_counts <- filtered_counts[, count_cols, drop = FALSE]

for (r in seq_along(reads_per_round)) {

  # Downsample row by row, in file order, so that the sequence of random draws
  # (and therefore the output for a given seed) stays the same.
  downsampled_counts <- matrix(
    NA_integer_,
    nrow = nrow(actual_counts),
    ncol = ncol(actual_counts)
  )
  for (i in seq_len(nrow(actual_counts))) {
    downsampled_counts[i, ] <- downsample(
      actual_counts[i, ],
      reads_per_round[r]
    )
  }

  # construct data frame: same layout as the input, counts replaced
  full <- filtered_counts
  full[, count_cols] <- downsampled_counts

  # write file
  outname <- paste0(
    prefixOut, "_downsampled_rep_", replicate, "_round_", r, ".txt"
  )
  write.table(
    x = full, file = outname, append = FALSE, quote = FALSE, sep = "\t",
    row.names = FALSE, col.names = TRUE
  )
}

# Record the number of reads used in each round (one value per line)
outname <- paste0(prefixOut, "_rep_", replicate, "_reads_per_round.txt")
write.table(
  x = reads_per_round, file = outname, append = FALSE, quote = FALSE,
  sep = "\t", row.names = FALSE, col.names = FALSE
)

