################################################################################
# This script performs several analyses centered around medium annotations:
# plotting signature activities per medium type, calculating medium activation
# score per signature per medium, and identifying signatures that are active in
# multiple media.
#
# Usage:
#     Rscript medium_analysis.R activity_file annotation_file
#
#     activity_file: file path to the activity file that stores signature
#                    activity of each sample
#     annotation_file: file path to the annotation file that stores the medium
#                      type of each sample
#
################################################################################

####### load required packages

pacman::p_load("ggplot2", "plyr")

####### load in command arguments

# Read the two positional arguments described in the usage header: the
# activity file path followed by the annotation file path.
comArgs <- commandArgs(trailingOnly = TRUE)
# Fail early with a usage hint; without this check a missing argument becomes
# an NA path that only surfaces later as an obscure read.table() error.
if (length(comArgs) < 2) {
  stop("Usage: Rscript medium_analysis.R activity_file annotation_file",
       call. = FALSE)
}
activity_file <- comArgs[1]
annotation_file <- comArgs[2]

####### specify constants

# minimum activation score for a signature-medium pair to be reported;
# this cutoff is determined by the distribution of the activation scores
activation_cutoff <- 0.4
# output folder of the per-signature activity plots; the trailing slash is
# needed because plot file names are appended to it with paste0()
activity_plot_folder <- "medium_activity_plots/"
# do not warn when the folder is left over from a previous run
dir.create(activity_plot_folder, showWarnings = FALSE)

####### load in data

# read in the signature activities: row names are taken from the first column
# and every remaining column is counted as one signature
activity_compendium <- read.table(activity_file, header = TRUE,
                                  row.names = 1, sep = "\t")
signatureN <- ncol(activity_compendium)
sample_names <- rownames(activity_compendium)

# read in the medium annotations; quote = "" disables quote handling so quote
# characters inside fields are read literally, and fill = TRUE pads short rows
annotations <- read.table(annotation_file, sep = "\t", fill = TRUE,
                          quote = "", header = TRUE,
                          stringsAsFactors = FALSE)
# the rest of the script relies on these annotation columns, so report a
# missing one here instead of failing later with an unrelated error
required_columns <- c("cel_file", "experiment", "medium")
missing_columns <- setdiff(required_columns, colnames(annotations))
if (length(missing_columns) > 0) {
  stop("Annotation file lacks required column(s): ",
       paste(missing_columns, collapse = ", "), call. = FALSE)
}
# remove duplicates, keeping the first annotation row of each cel_file
annotations_noDup <- subset(annotations, !duplicated(annotations$cel_file))
# merge signature activities and medium annotations; only samples present in
# both inputs are kept, and the sample name becomes the first column so the
# signatures occupy columns 2 to signatureN + 1
merged <- merge(activity_compendium, annotations_noDup, by.x = 0,
                by.y = "cel_file", all.x = FALSE, all.y = FALSE)
if (nrow(merged) == 0) {
  stop("No sample is shared between the activity and annotation files",
       call. = FALSE)
}
# strip surrounding whitespace so differently padded labels form one level
merged$medium <- factor(trimws(merged$medium))

#######################################################
# count number of samples/experiments in each medium

# one row per (experiment, medium) pair, recording how many samples it holds
exp_medium <- ddply(.data = merged, .variables = .(experiment, medium),
                    .fun = summarize, freq = length(medium))
# first column: samples per medium; second column: number of distinct
# experiments that used the medium
count_table <- cbind("#sample" = table(merged$medium),
                     "#experiment" = table(exp_medium$medium))
# col.names = NA writes an empty header cell above the medium row names
write.table(count_table, file = "./medium_count.txt", quote = FALSE,
            sep = "\t", row.names = TRUE, col.names = NA)

#######################################################
# plot signature activities grouped by medium types

# the signatures occupy columns 2 to signatureN + 1 of the merged table
# (column 1 holds the sample names); seq_len() keeps the loop empty when
# there is no signature, whereas 1:0 + 1 would visit columns 2 and 1
for (i in seq_len(signatureN) + 1) {
  signature_name <- colnames(merged)[i]
  # copy the two plotted columns into a small data frame so that aes() refers
  # to columns by name instead of indexing the global table by loop variable
  plot_data <- data.frame(medium = merged[, "medium"],
                          activity = merged[, i])
  activity_plot <- ggplot(data = plot_data, aes(x = medium, y = activity)) +
    geom_boxplot() + geom_point(size = 0.5) + coord_flip() +
    labs(x = "medium", y = signature_name)
  # pass the plot explicitly rather than relying on the implicit last_plot();
  # limitsize = FALSE allows the very tall page
  ggsave(paste0(activity_plot_folder, signature_name, "_medium.pdf"),
         plot = activity_plot, height = 50, width = 8, limitsize = FALSE)
}

#######################################################
# calculate medium activation score per signature

# For every medium with more than one sample, score each signature by the
# absolute difference between its mean activity inside and outside the medium,
# divided by the signature's activity range over all merged samples.
all_medium <- levels(merged$medium)
# the signatures occupy columns 2 to signatureN + 1 of the merged table
signature_columns <- seq_len(signatureN) + 1
signature_names <- colnames(merged)[signature_columns]
# the activity range of a signature does not depend on the medium, so it is
# computed once instead of once per medium
activity_range <- vapply(signature_columns, function(signature) {
  max(merged[, signature]) - min(merged[, signature])
}, numeric(1))

all_medium_result <- lapply(all_medium, function(this_medium) {
  # which() leaves samples without a medium annotation out of both groups;
  # a plain logical index would turn them into NA rows
  in_medium <- which(merged$medium == this_medium)
  out_medium <- which(merged$medium != this_medium)

  # only analyze medium with more than 1 sample
  if (length(in_medium) <= 1) {
    return(NULL)
  }

  mean_difference <- vapply(signature_columns, function(signature) {
    abs(mean(merged[in_medium, signature]) -
          mean(merged[out_medium, signature]))
  }, numeric(1))
  activation_score <- mean_difference / activity_range
  # a signature that is constant over all samples has a zero range and a zero
  # mean difference; report 0 instead of the NaN produced by 0 / 0
  activation_score[which(activity_range == 0)] <- 0

  this_medium_result <- data.frame(
    medium = rep(this_medium, length(activation_score)),
    signature = signature_names,
    activation_score = activation_score)
  # order by activation score
  this_medium_result[
    order(this_medium_result$activation_score, decreasing = TRUE), ]
})

# skipped media are NULL entries, which rbind() ignores
all_medium_result <- do.call(rbind, all_medium_result)


#######################################################
# plot the distribution of activation scores

# density() stops on missing values, so plot only the finite scores and say
# so when some had to be left out
plotted_scores <- all_medium_result$activation_score
plotted_scores <- plotted_scores[is.finite(plotted_scores)]
if (length(plotted_scores) < length(all_medium_result$activation_score)) {
  warning("Non-finite activation scores were left out of the density plot",
          call. = FALSE)
}
pdf("activation_score_density.pdf", height = 5, width = 5)
plot(density(plotted_scores),
     main = "Distribution of medium activation scores")
dev.off()

#######################################################
# filter the result with the activation score cutoff

# keep the signature-medium pairs whose activation score reaches the cutoff;
# which() drops missing scores instead of turning them into all-NA rows
all_medium_result_selected <- all_medium_result[
  which(all_medium_result$activation_score >= activation_cutoff), ]
# NOTE(review): the file name says ">" although the filter above is inclusive
# (>=); the name is kept unchanged because other tools may expect it
write.table(all_medium_result_selected,
            paste0("signature_medium_activation>", activation_cutoff, ".txt"),
            sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)

#######################################################
# identify signatures active for multiple media

# summarize every selected signature: its mean activation score, the number
# of media in which it passed the cutoff, and the names of those media
signature_media <- ddply(
  all_medium_result_selected, .(signature), summarize,
  mean_activation = mean(activation_score),
  freq = length(medium),
  media = paste(medium, collapse = ", "))
# keep the signatures selected in more than one medium ...
is_multi_media <- signature_media$freq > 1
signature_media <- signature_media[is_multi_media, ]
# ... and list the ones with the highest mean activation first
by_mean_activation <- order(signature_media$mean_activation,
                            decreasing = TRUE)
signature_media <- signature_media[by_mean_activation, ]
write.table(signature_media,
            file = "Signature_MeanActivationScore_Media.txt",
            quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
