###########################################################
# This script processes a folder of CEL files into a PCL file using the rma
# function provided in the affy package.
#
# Usage:
#       Rscript process_to_pcl.R process_dir output_file
#
#       process_dir: a folder of CEL files of one experiment
#       output_file: the pcl file with processed gene expression values
#
###########################################################

# pacman installs (if needed) and loads the required packages.
# affy provides list.celfiles/ReadAffy/rma/write.exprs; affyio provides
# read.celfile.header.
pacman::p_load("affy", "affyio")

# Only arrays of this type (the CDF name stored in the CEL header) are
# processed. Arrays of different types cannot be normalized together.
target_array_type <- "Pae_G1a"

# Command-line arguments: the directory to process and the output file.
# Fail early with a usage message instead of after the expensive RMA step.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
    stop("Usage: Rscript process_to_pcl.R process_dir output_file",
         call. = FALSE)
}
process_dir <- args[1]
output_file <- args[2]

# Full paths of all CEL files in the directory to be processed.
# file.path() yields an empty vector (not "dir/") when there are no files.
cel_paths <- file.path(process_dir, list.celfiles(process_dir))

# Array type of each CEL file, taken from the first element of its header.
# vapply() guarantees a character vector, even for an empty directory.
array_types <- vapply(
    cel_paths,
    function(path) as.character(read.celfile.header(path)[[1]]),
    character(1)
)

# Keep only the files of the target array type; which() drops any NA matches.
target_paths <- cel_paths[which(array_types == target_array_type)]

if (length(target_paths) == 0) {
    # Nothing to process: say so explicitly instead of exiting silently
    # without producing the output file.
    warning(
        sprintf("No '%s' CEL files found in '%s'; '%s' was not written.",
                target_array_type, process_dir, output_file),
        call. = FALSE
    )
} else {
    # ReadAffy loads the array data; no cdfname is given, so the CDF named in
    # the CEL headers is used.
    raw_data <- ReadAffy(filenames = target_paths)
    # rma computes the robust multi-array average expression measure.
    express <- rma(raw_data)
    # Write the expression matrix out as the PCL file.
    write.exprs(express, file = output_file)
}