###########################################################
# This script records the quantile distribution used in normalizing
# the samples in the compendium.
#
# Usage:
#       Rscript record_quantile_distribution.R compendium_dir output_file
#
#       compendium_dir: the folder that stores CEL files for all samples in
#                       the compendium
#       output_file: a plain-text file that stores the quantile distribution
#                    used in quantile normalization (one value per line,
#                    no header)
###########################################################

# use pacman to install (if missing) and load the required packages
# the affy libraries are for working with CEL files
pacman::p_load("affy", "affyio", "preprocessCore")

compendium_dir <- commandArgs(trailingOnly = TRUE)[1]
output_file <- commandArgs(trailingOnly = TRUE)[2]

# list all cel files in the directory
allfiles <- list.celfiles(compendium_dir)
# a vector with the type of each array
allptype <- sapply(allfiles, function(f) read.celfile.header(
  paste(compendium_dir, f, sep = "/"))[1])
# allpfiles vector only contains files of type "Pae_G1a"
allpfiles <- paste(compendium_dir, subset(allfiles, allptype == "Pae_G1a"),
  sep = "/")
compendium <- ReadAffy(filenames = allpfiles)  # read in cel files and produce
                                               # an AffyBatch object
compendium_PMmat <- pm(compendium, NULL) #return perfect match probes
# background correction
compendium_PMmat_bg <- rma.background.correct(compendium_PMmat)
# sort each array on the probe value
compendium_PMmat_bg_sorted <- data.frame(apply(compendium_PMmat_bg, 2, sort))
# calculate the quantile distribution
compendium_PMmat_bg_mean <- apply(compendium_PMmat_bg_sorted, 1, mean)
write.table(compendium_PMmat_bg_mean, output_file, row.names = F, col.names = F)
