###########################################################
# This script combines separate rna-seq samples in a folder into one file
# with one column for each sample.
# If the dataset uses PA14 strain instead of PAO1, then PA14 genes will be
# mapped to PAO1 genes using the Pseudomonas_aeruginosa_PA14-PAO1_orthologs.txt
# file included in the repository.
#
# Usage:
#       Rscript preprocess_separate_rnaseq.R rnaseq_folder output_file is_PA14
#
#       rnaseq_folder: the folder that stores all rna-seq samples in an
#                      experiment
#       output_file: a tab-delimited file with one sample per column
#       is_PA14: TRUE or FALSE, whether the dataset uses the PA14 strain
#                (the value is parsed with as.logical)
###########################################################

# Parse the positional command-line arguments once and fail early, with a
# usage hint, if any of the three required values is missing.
cli_args <- commandArgs(trailingOnly = TRUE)
if (length(cli_args) < 3) {
  stop("Usage: Rscript preprocess_separate_rnaseq.R rnaseq_folder ",
       "output_file is_PA14", call. = FALSE)
}

# the folder that stores each individual RNAseq file
rnaseq_folder <- cli_args[1]
output_file <- cli_args[2] # the output file

# logical, is PA14 strain. as.logical() returns NA for text it cannot
# interpret, which would otherwise only surface later as an obscure error in
# the `if (is_PA14)` test, so reject it here.
is_PA14 <- as.logical(cli_args[3])
if (is.na(is_PA14)) {
  stop("is_PA14 must be TRUE or FALSE, got: ", cli_args[3], call. = FALSE)
}

# ortholog_map_file contains the PAO1-PA14 orthologs and is provided.
# The path is relative, so it is resolved against the current working
# directory when the script runs.
ortholog_map_file <- "./Pseudomonas_aeruginosa_PA14-PAO1_orthologs.txt"

# read in each sample in the folder and combine them together
# Every entry returned by list.files() is treated as one sample file.
rnaseq_files <- list.files(rnaseq_folder)
if (length(rnaseq_files) == 0) {
  stop("No files found in rnaseq_folder: ", rnaseq_folder, call. = FALSE)
}

# Each sample is read as a headerless, tab-delimited table. Its columns are
# named "gene" and the file name, so a file must have exactly two columns
# (otherwise the colnames assignment fails).
sample_tables <- lapply(rnaseq_files, function(file) {
  this_exp <- read.delim(file.path(rnaseq_folder, file), header = FALSE,
                         sep = "\t")
  colnames(this_exp) <- c("gene", file)
  this_exp
})

# Join the samples pairwise on "gene". merge() keeps only keys present in both
# inputs by default, so a gene missing from any sample is dropped.
rnaseq_data <- Reduce(function(combined, this_exp) {
  merge(combined, this_exp, by = "gene")
}, sample_tables)

# Keep only the part of each gene ID before the first comma. vapply() always
# returns a character vector (length zero for an empty table), so the gene
# column is preserved even when the merge leaves no rows; assigning the NULL
# produced by unlist(lapply(...)) on empty input would delete the column.
gene_ID <- as.character(rnaseq_data$gene)
rnaseq_data$gene <- vapply(
  strsplit(gene_ID, ",", fixed = TRUE),
  function(parts) parts[1],
  character(1),
  USE.NAMES = FALSE
)

# Choose the table to write: the combined samples as they are or, for PA14
# datasets, the samples re-keyed through the ortholog map.
output_data <- rnaseq_data
if (is_PA14) {
  if (!file.exists(ortholog_map_file)) {
    stop("Ortholog map not found: ", ortholog_map_file,
         " (path is relative to the working directory)", call. = FALSE)
  }
  # read in the PA14-PAO1 orthologs map
  ortholog_data <- read.delim(ortholog_map_file, header = TRUE, sep = "\t")
  # NOTE(review): "Locus.Tag" is matched against the dataset's gene IDs and
  # "Locus.Tag.1" is what remains in the output, so these are presumably the
  # PA14 and PAO1 locus tags respectively -- confirm against the map file.
  ortholog_data <- ortholog_data[, c("Locus.Tag", "Locus.Tag.1")]
  # convert PA14 ID to PAO1 ID
  # Only genes present in both the map and the data survive this merge.
  rnaseq_data_merged <- merge(ortholog_data, rnaseq_data, by.x = "Locus.Tag",
    by.y = "gene")
  # merge() puts the key column first; drop it so that "Locus.Tag.1" becomes
  # the leading ID column of the output.
  output_data <- rnaseq_data_merged[, -1, drop = FALSE]
}

# Write a tab-delimited file with a header row, no row names and no quoting.
write.table(output_data, output_file, row.names = FALSE, col.names = TRUE,
  quote = FALSE, sep = "\t")