###########################################################
# This script converts an RNAseq test set, read from a file that stores the
# expression values of all genes, into TDM-normalized expression values. Only
# genes that are present in the reference file are kept.
# TDM (https://github.com/greenelab/TDM) is a normalization approach developed
# by Thompson et al. It normalizes RNAseq expression values to a range that is
# comparable with microarray expression values.
#
# Usage:
#     Rscript process_rnaseq_testset.R test_file ref_file out_file
#
#     test_file: a tab-delimited file that stores RNAseq data of a testset
#     ref_file: the reference microarray expression file that has genes to keep
#     out_file: the processed rnaseq data file
###########################################################

# Use pacman to install (when missing) and load the required packages.
pacman::p_load("data.table", "devtools")
# TDM normalizes RNAseq data to a range comparable with microarray data.
# Install it from GitHub only when it is not already available, so that
# repeated runs do not need network access or GitHub API quota just to
# re-check an existing installation.
if (!requireNamespace("TDM", quietly = TRUE)) {
    devtools::install_github("greenelab/TDM")
}
library(TDM)

# ---- Command-line arguments ----
# Positional arguments:
#   1. test_file: tab-delimited RNAseq expression file to normalize
#   2. ref_file:  tab-delimited reference expression file
#   3. out_file:  path the normalized table is written to
# The arguments are parsed once and checked up front, so a missing argument
# produces a usage message instead of a read.table() failure on an NA path.
# Extra trailing arguments are ignored.
cli_args <- commandArgs(trailingOnly = TRUE)
if (length(cli_args) < 3) {
    stop("Usage: Rscript process_rnaseq_testset.R test_file ref_file out_file",
        call. = FALSE)
}
test_file <- cli_args[1]
ref_file <- cli_args[2]
out_file <- cli_args[3]

# ---- Read the expression tables ----
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
target_data <- read.table(test_file, header = TRUE, sep = "\t")
ref_data <- read.table(ref_file, header = TRUE, sep = "\t")

# Give the first column of both tables the same name so they can be keyed on
# it (presumably this column holds the gene identifiers -- confirm against
# the input files).
colnames(target_data)[1] <- "gene"
colnames(ref_data)[1] <- "gene"

# Convert both tables to data.table objects keyed by the "gene" column.
target_data <- data.table(target_data)
setkey(target_data, gene)
ref_data <- data.table(ref_data)
setkey(ref_data, gene)

# ---- Normalize and write ----
# Arguments are passed positionally: the test set first, the reference second.
data_tdm <- tdm_transform(target_data, ref_data)

# Tab-delimited output with a header row, no row names and no quoting.
write.table(data_tdm, out_file, row.names = FALSE, col.names = TRUE,
    quote = FALSE, sep = "\t")