###########################################################
# This script processes a microarray testset from CEL files to one pcl file.
# The quantile normalization is done using the quantile distribution derived
# from processing the entire expression compendium.
#
# Usage:
#     Rscript process_microarray_testset.R test_dir quantile_ref output_file
#
#     test_dir: a testset folder that stores CEL files in one experiment
#     quantile_ref: the reference quantile distribution
#     output_file: the processed pcl file of the testset
###########################################################

# use pacman to install and load the required packages
pacman::p_load("affy", "affyio", "preprocessCore")

# Parse the three positional command-line arguments (see Usage above).
# Fail early with a usage message instead of letting missing arguments
# propagate as NA into later file operations.
cli_args <- commandArgs(trailingOnly = TRUE)
if (length(cli_args) < 3) {
  stop("Usage: Rscript process_microarray_testset.R test_dir quantile_ref ",
       "output_file", call. = FALSE)
}
test_dir <- cli_args[1]
quantile_ref <- cli_args[2]
output_file <- cli_args[3]

# Load the reference quantile distribution (file is read without a header
# row). Note that `quantile_ref` is rebound from the file path to the loaded
# data frame; its first column is used as the normalization target later.
quantile_ref <- read.table(quantile_ref, header = FALSE)

# List the CEL files in the testset folder. The names are joined with
# `test_dir` below to build the paths that are actually opened.
testfiles <- list.celfiles(test_dir)

# Read the first field of every CEL header; it is compared against the
# platform name below (presumably the CDF name -- see affyio docs).
# vapply() guarantees a character vector even when the folder is empty.
testptype <- vapply(testfiles, function(f) {
  as.character(read.celfile.header(file.path(test_dir, f))[[1]])
}, character(1))

# Keep only the arrays measured on the expected platform. %in% never returns
# NA, so an unreadable/missing header value simply counts as "no match".
target_platform <- "Pae_G1a"
selected_files <- testfiles[testptype %in% target_platform]
if (length(selected_files) == 0) {
  stop("No CEL files with platform '", target_platform, "' found in ",
       test_dir, call. = FALSE)
}
testpfiles <- file.path(test_dir, selected_files)
# read in the selected cel files and produce an AffyBatch object
testset <- ReadAffy(filenames = testpfiles)
# return perfect match probes
PMmat <- pm(testset, NULL)
# background correction
PMmat_bg <- rma.background.correct(PMmat)
# quantile normalization using the reference distribution (first column of
# the reference table) instead of a distribution derived from this testset
PMmat_normed <- normalize.quantiles.use.target(PMmat_bg,
                                               target = quantile_ref[, 1])
# get the probe set name of each perfect match probe
test_probe_list <- probeNames(testset, NULL)
# summarize probes into genes
PMmat_sumed <- subColSummarizeMedianpolishLog(PMmat_normed,
                                              group.labels = test_probe_list)
# Label the columns with the files that were actually read. `testfiles`
# holds every CEL file in the folder, including those dropped by the
# platform filter, so using it here fails (length mismatch) whenever a file
# from another platform is present.
colnames(PMmat_sumed) <- basename(testpfiles)
# prepend the gene identifiers as the first column of the pcl table
PMmat_sumed <- cbind(gene = rownames(PMmat_sumed), PMmat_sumed)
write.table(PMmat_sumed, output_file, row.names = FALSE, col.names = TRUE,
            sep = "\t", quote = FALSE)