downloadProcessGEO <- function(series, remove.pattern=c("miRNA"), remove.cel= c("GSM750879_01_E3.CEL.gz"), normalize=FALSE )
 # series: a data.frame with at least 3 columns:
 #   "GSE.no."             : unique GEO series identifier (e.g. GSE15636) (character vector)
 #   "GPL.no."             : unique GEO platform identifier (e.g. GPL570) (character vector)
 #   "Annotation.package"  : corresponding Bioconductor annotation package (e.g. hgu133plus2.db) (character vecor)
 # remove.pattern: part of the name of files that should be removed from the downloaded series before analysis (e.g. miRNA) (character vector)
 # remove.cel: corrupt, or other CEL files that should be excluded before analyses (e.g. GSM750879_01_E3.CEL.gz) (character vector)
    {
    for (i in seq_along( series[,"GSE.no."]) ) {
      # download raw data archive, extract and cleanup files
      GEOquery::getGEOSuppFiles(series[i,"GSE.no."])
      cat("Extracting TAR archive containing raw data for series", series[i,"GSE.no."], "\n\n")
      untar( paste(series[i,"GSE.no."],"/", series[i,"GSE.no."],"_RAW.tar",sep=""),
          exdir=series[i,"GSE.no."] )
          
      # some GEO entries also contain miRNA arrays that have miRNA in their filename; delete these since these are not of interest
      remove.files <- dir(path=series[i,"GSE.no."], pattern=remove.pattern, recursive = TRUE)
      if (length(remove.files)!=0) {
          file.remove( as.character( paste(series[i,"GSE.no."], remove.files, sep="/") ) )
          cat("\nGEO data serie includes file(s) with pattern: '",remove.pattern,"' that are removed\nbefore processing:\n", remove.files, "\n\n\n")
          }  
          
      #download metadata from GEO (GSEMatrix), to be matched with samples, but if there are > 1 GEO platforms present in the series download,
      #only include relevant metadata matrix and remove metadata matrix from all other experiments (e.g. miRNA)
      gds <- GEOquery::getGEO(GEO=series[i,"GSE.no."], destdir=series[i,"GSE.no."], GSEMatrix = TRUE, getGPL = FALSE)
      if (length(gds) ==1) { cat("\nIdentified a single GEO metadata file (GSEMatrix)\n for the required platform entry:", series[i,"GPL.no."], ".\n")
	                         cat("This metadata file is:", names(gds), "\n\n") }
      if (length(gds)!=1) {
          matches = regexpr("(?<=^|-|_|/| )GPL.*?(?=$| |_|-)", names(gds), perl=T) #define what to match
          GSEmatrix2keep <- names(gds)[ regmatches(names(gds), matches) %in% series[i,"GPL.no."] ]
          GSEmatrix2remove <- names(gds)[ regmatches(names(gds), matches) %!in% series[i,"GPL.no."] ]
          #gds[which(names(gds) %!in% GSEmatrix2keep)] <- NULL
          gds <- gds[names(gds) == GSEmatrix2keep]
          if( length(GSEmatrix2remove) > 0) {
            file.remove( paste(series[i,"GSE.no."], GSEmatrix2remove, sep="/") ) 
            cat("\nIdentified one or more GEO metadata files (GSEMatrix)\n for non-relevant platform entries!\n")
            cat("\nRemoving the non-relevant GSEMatrix from the download:\n", GSEmatrix2remove, "\n")
            cat("because only GEO metadata for GEO platform", series[i,"GPL.no."], "should be retained.\n")
            cat("Retained metadata file is:", GSEmatrix2keep, "\n\n")
            }
        }

      # perform rma bg-correction and summerization, but no normalization!
      celfls <- oligoClasses::list.celfiles(series[i,"GSE.no."], full.names=TRUE, listGzipped=TRUE)
      # series GSE30292 contains a corrupt CEL file (GSM750879_01_E3.CEL.gz), so remove it before processing 
      if (sum( sub(".*/", "", celfls) %in% remove.cel ) !=0) {
          file.remove( celfls[ sub(".*/", "", celfls) %in% remove.cel ] ) 
          cat("\nData serie includes user-listed CEL file(s) that are removed:", celfls[ sub(".*/", "", celfls) %in% remove.cel ], "\n")
          }
      affy.data <- oligo::read.celfiles(filenames = list.celfiles(series[i,"GSE.no."],
          full.names=TRUE, listGzipped=TRUE) )
      data.bgexpr <- oligo::rma(affy.data, normalize=normalize)

      # add annotation data
      do.call(library, list(series[i,"Annotation.package"]) ) 
      data.bgexpr <- affycoretools::annotateEset(data.bgexpr, series[i,"Annotation.package"]) 

      # add metadata
      tmp <- strsplit(  sampleNames(data.bgexpr) , "[[:punct:]]")             #split on period in file names
      names.samples <- sapply(tmp, function(x) x[1])                          #only keep part before first period
      sampleNames(data.bgexpr) <- names.samples                               #rename samples BEFORE adding pData...
      pData(data.bgexpr) <- Biobase::pData(gds[[1]])[names.samples,]

      # save as r-object in series subdirectory
      objectName <- paste("data.bgexpr", series[i,"GSE.no."], sep=".")
      assign( objectName, data.bgexpr)
      fileName <- paste("./", series[i,"GSE.no."], "/data.rma.bgexpr.", series[i,"GSE.no."], ".Rdata", sep="")
      save(list=objectName, file=fileName )
      }
     rm(list = ls())
    }



`%!in%` = Negate(`%in%`)



filterDataset <- function(eset, ctrls="AFFX")
    {
    cat("\nArray design:", annotation(eset))
    controls <- grep(ctrls, rownames(eset))
    cat("\nNumber of features in dataset:", dim(eset)[1], "\n")
    cat("\nRemoved", length( controls), "control features.\n")
    if(length(controls) > 0) eset <- eset[-controls,]
    cat("\nContinued with", dim(eset)[1], "features.\n")
    cat("\nOf these", length(!is.na(fData(eset)$ENTREZID)), "features, " )
    cat(sum(!is.na(fData(eset)$ENTREZID)), "features have been annotated with an ENTREZID.\n" )
    eset <- eset[!is.na(fData(eset)$ENTREZID),]
    o <- order( rowMeans((exprs(eset))) , decreasing=TRUE)
    dup <- duplicated(fData(eset)$ENTREZID[o])
    eset <- eset[o,][!dup,]
    cat("\nSorted expression data from high to low average signal.\n" )
    cat("\nRemoved", sum(dup), "duplicate features (based on ENTREZID).\n")
    cat("\nKept", dim(eset)[1], "unique features.\n")
    featureNames(eset) <- fData(eset)$ENTREZID
    cat("\nRenamed features to ENTREZID.\n")
    annotation(eset) <- "none"
    cat("\nSet annotation slot to 'none'.\n")
    return(eset)
    }



combineDataset <- function(datasets, common.features=NULL)
    {
    library(data.table)
     if(class(datasets) != "character") stop("Input for dataset should be a character vector!")
     if(length(datasets) < 2) stop("Input for dataset should be at least 2 data sets!")
     if(class(common.features) != "character") stop("Input for common features should be a character vector!")
     if(length(common.features)==0) stop("Input for common features should be at least 1 feature!")
     
     for (i in seq_along(datasets)) {
     tmp.exprs <- exprs(get(datasets[i])[common.features])
     tmp.pData <- pData(get(datasets[i]))
     tmp.pData$EXPERIMENT <- rep( strsplit(datasets[i]  , "[[:punct:]]")[[1]][1]   , dim(tmp.pData)[1])
     fData.out <- fData(get(datasets[1]))[common.features, -1] #use fixed info based on 1st dataset!
        if(i==1)
            {
            expr.out <- tmp.exprs
            pData.out <- tmp.pData
            cat("Processed dataset", i,":", datasets[i], "\n" )
            } else {
            expr.out <- cbind( expr.out, tmp.exprs )
            pData.out <- rbind( setDT(pData.out), setDT(tmp.pData), fill=TRUE)
            cat("Processed dataset", i,":", datasets[i], "\n" )
            }
        }
    dups <- duplicated(colnames(expr.out))
    cat("\nRemoved", sum(dups), "duplicate samples (based on GSM identifier).\n")
    cat("\nKept", sum(!dups), "unique samples.\n")
    cat("\nIncluded", dim(expr.out)[1], "features.\n")
    expr.out <- expr.out[, !dups]
    pData.out <- as.data.frame(pData.out)
    pData.out <- pData.out[!dups,]
    rownames(pData.out) <- colnames(expr.out)
    eSet.out <- new("ExpressionSet", assayData=assayDataNew(exprs=expr.out),
                annotation="none", phenoData=AnnotatedDataFrame(pData.out), featureData=AnnotatedDataFrame(fData.out))
    return(eSet.out)
    }



intersectGenes <- function(dataset, common.features=NULL)
    {
    # datasets: a character vector with the names of the ExpressionSets that should be combined
    #           note that these ExpressionSets should already have been loaded in the environment.
    if(class(dataset) != "character") stop("Input should be a character vector!")
    all.genes <- list()
      for(i in seq_along(dataset)){
       all.genes[[i]] <- rownames( get(dataset[i]) )
       }
    common.genes <- Reduce(intersect, all.genes)
    return(common.genes)
    }



removeSingletExperiment <- function(dataset, common.features=NULL)
    {
    if(class(dataset) != "ExpressionSet") stop("Input should be an ExpressionSet!")
    if( dim(pData(dataset))[2] < 2) stop("Input does not contain any phenoData!")
    if( !"EXPERIMENT" %in% colnames(pData(dataset)) ) stop("phenoData input does not contain EXPERIMENT info.\n Please add this column!")
    singlet.experiments <- names(table(pData(dataset)$EXPERIMENT) [(table(pData(dataset)$EXPERIMENT)==1)])
      if(length(singlet.experiments) != 0) {
      cat("\nRemoved", length(singlet.experiments), "experiments consiting of only a single sample (based on EXPERIMENT):\n")
      cat("\nThese experiments are:", singlet.experiments)
      m3 <- match( singlet.experiments, pData(dataset)$EXPERIMENT)
      dataset.out <- dataset[, -m3]
            } else {
      cat("\nNote: no experiments consisting of only a single sample (based on EXPERIMENT) are present!\n")
      dataset.out <- dataset
            }
    return(dataset.out)
   }



withinVariation <- function(X, design){
    # borrowed from the package miXomics ( https://github.com/mixOmicsTeam/mixOmics/blob/master/R/withinVariation.R) 
    # need a matrix for matrix calculations
    X = as.matrix(X)
    rep.measures = factor(design[, 1])
    factors = design[, -1, drop = FALSE] 
    
    if(any(summary(as.factor(rep.measures)) == 1))
      stop("Multilevel analysis can only be performed when all samples are repeated at least once.", call. = FALSE)
    
    # calculate the variation
    # ---------------------------
    # added condition for the spls case where the condition is not needed
    # all we need is the rep.measures
    if ((ncol(factors) == 0) | (ncol(factors) == 1))
    {
      message("Splitting the variation for 1 level factor.")
      
      # save sample names for the output
      indiv.names = rownames(X)
      rownames(X) = as.character(rep.measures)
      
      # compute the mean for each unique individual
      # dealing with specific case with only one subject (leave one out case during prediction)
      X.mean.indiv = matrix(apply(X, 2, tapply, rep.measures, mean, na.rm = TRUE), # to deal with only one subject
                            nrow = length(unique(rep.measures)), ncol = dim(X)[2], dimnames = list(levels(as.factor(rep.measures)), colnames(X)))
      
      # fill the between matrix with those means 
      Xb = X.mean.indiv[as.character(rep.measures), ]
      
      # compute the within matrix as a difference between the original data 
      # and the between matrix
      Xw = X - Xb
      
      # put dimnames
      dimnames(Xw) = list(indiv.names, colnames(X))    
    } else {  # for 2 levels split
      message("Splitting the variation for 2 level factors.")
      
      ###### off set term
      Xm = colMeans(X)
      
      ###### compute the mean within each subject
      Xs = apply(X, 2, tapply, rep.measures, mean, na.rm = TRUE)
      Xs = Xs[rep.measures, ]
      
      # for the first factor
      xbfact1 = apply(X, 2, tapply, paste0(rep.measures, factors[, 1]), mean, na.rm = TRUE)
      xbfact1 = xbfact1[paste0(rep.measures, factors[, 1]), ]
      
      # for the second factor
      xbfact2 = apply(X, 2, tapply, paste0(rep.measures, factors[, 2]), mean, na.rm = TRUE)
      xbfact2 = xbfact2[paste0(rep.measures, factors[, 2]), ]
      
      #### fixed effect
      # for the first factor
      Xmfact1 = apply(X, 2, tapply, factors[, 1], mean, na.rm = TRUE)
      Xmfact1 = Xmfact1[factors[, 1], ]
      
      # for the second factor
      Xmfact2 = apply(X, 2, tapply, factors[, 2], mean, na.rm = TRUE)
      Xmfact2 = Xmfact2[factors[, 2], ]
      
      # compute the within matrix 
      Xw = X + Xs - xbfact1 - xbfact2 + Xmfact1 + Xmfact2
      Xw = sweep(Xw, 2, 2*Xm)  # see formula in refernece Liquet et al.
      
      # put dimnames
      dimnames(Xw) = dimnames(X)  
    }
    return(invisible(Xw))
  }



biplot.all <- function(z, rel_widths = c(4, 1), x=x, y=y, colby=NULL, colkey=NULL, title=NULL, subtitle=NULL, lab=NULL, ...) {
    pl = PCAtools::biplot(z,
    x=x, y=y,
    gridlines.major = FALSE,
    gridlines.minor = FALSE,
    colby = colby,
    colkey = colkey,
    legendPosition = 'right', legendLabSize = 10, legendIconSize = 5,
    drawConnectors = TRUE,
    maxoverlapsConnectors = Inf,
    title = title,
    subtitle = subtitle,
    lab=lab,
    borderWidth = 0.5
        )
    
    legend.pl <- get_legend( pl + theme(legend.box.margin = margin(0, 0, 0, 12) ) )
    pl <- plot_grid( pl + theme(legend.position="none"), align = 'vh')
    pl <- plot_grid( pl, legend.pl, rel_widths = rel_widths)
    return(pl)
}



run.all.biplots <- function(zzz, colby=colby, colvec=NULL, x="PC1", y="PC2", ...) {
 for (j in seq_along(colby) ) {
    if(j==1) {
      plotname <- paste("plot", colby[j], sep=".")
      if( is.null(colvec) ) {
         plot33 <- biplot.all(zzz, x=x, y=y, colby=colby[j] )
         outlist <- list(plot33 )
         names(outlist) <- plotname
            }
           else {
         c.vec <- colvec[ 1:length( levels( zzz$metadata[,colby[j] ] ) ) ]; names(c.vec) <- levels( zzz$metadata[,colby[j] ]) 
         plot33 <- biplot.all(zzz, x=x, y=y, colby=colby[j], colkey=c.vec)
         outlist <- list(plot33 )
         names(outlist) <- plotname
                 }
          } else {
         plotname <- paste("plot", colby[j], sep=".")
         if( is.null(colvec) ) {
         plot33 <- biplot.all(zzz, x=x, y=y, colby=colby[j] )
         outlist2 <- list(plot33 )
         names(outlist2) <- plotname
         outlist <- c(outlist, outlist2)
          } else {
         c.vec <- colvec[ 1:length( levels( zzz$metadata[,colby[j] ] ) ) ]; names(c.vec) <- levels( zzz$metadata[,colby[j] ])
         plot33 <- biplot.all(zzz, x=x, y=y, colby=colby[j], colkey=c.vec)
         outlist2 <- list(plot33 )
         names(outlist2) <- plotname
         outlist <- c(outlist, outlist2)
             }
          }
      }
   return(outlist)
}



# Function 'filterCategory' is a helper function for 'dotplotInclEmpty'
filterCategory <- function(df, N, pvalueCutoff, qvalueCutoff)
    {
    df_topn <- NULL
    for(i in levels(df$Cluster)){
       df_slice <- df %>%
	                   dplyr::filter(Cluster == i) %>% 
                       dplyr::arrange(p.adjust) %>% 
                       dplyr::filter(p.adjust < pvalueCutoff & qvalue < qvalueCutoff) %>% 
                        head(N)
      df_topn <- dplyr::bind_rows(df_topn, df_slice)
    }
    return(df_topn)
    }



# https://github.com/YuLab-SMU/enrichplot/blob/01dfdd27acc02ca57e7103fe5689a892c06eebd5/R/utilities.R#L340
default_labeller <- function(n) {
    function(str){
        str <- gsub("_", " ", str)
        yulab.utils::str_wrap(str, n)
    }
}



dotplotInclEmptyClusters <- function(geneClusters, fun = "enrichGO", data = "", ont = NULL, pool = FALSE, readable = TRUE, OrgDb = "org.Hs.eg.db",
                                 keyTypeGO = "ENTREZID", organism = "hsa", keyTypeKEGG = "kegg", universe = NULL, source_from = NULL,
                                 use_internal_data = FALSE, pvalueCutoff = 0.05, pAdjustMethod = "BH", minGSSize = 10, maxGSSize = 500, qvalueCutoff = 0.2,
                                 showCategory=5, title="", font.size=12, label_format= 30, n.dodge=1, angle=0)
# function is based on code posted here: https://github.com/YuLab-SMU/clusterProfiler/issues/157
    {
    if(fun != "enrichGO" & fun != "enrichKEGG") stop("Input function must either be 'enrichGO' or 'enrichKEGG'!\n")
    
    if(fun == "enrichGO")
        {
        do.call(library, list(as.character(OrgDb)))
        if(is.null(ont)) { ont= "BP" }
    
        # run compareCluster without any significance cutoff
        cc.res <- clusterProfiler::compareCluster(
                                   geneClusters = geneClusters,
                                   fun = fun,
                                   OrgDb = OrgDb,
                                   keyType = keyTypeGO,
                                   ont = ont,
                                   pool = pool,
                                   universe = universe,
                                   pvalueCutoff = 1,    #Set p value cutoff to 1 in order to show all results
                                   qvalueCutoff = 1,    #Set q value cutoff to 1 in order to show all results
                                   pAdjustMethod = pAdjustMethod,
                                   minGSSize = minGSSize,
                                   maxGSSize = maxGSSize)
        }
    if(fun == "enrichKEGG")
    {
        # run compareCluster without any significance cutoff
        cc.res <- clusterProfiler::compareCluster(
                                   geneClusters = geneClusters,
                                   fun = fun,
                                   organism = organism,
                                   keyType = keyTypeKEGG,
                                   universe = universe,
                                   pvalueCutoff = 1,    #Set p value cutoff to 1 in order to show all results
                                   qvalueCutoff = 1,    #Set q value cutoff to 1 in order to show all results
                                   pAdjustMethod = pAdjustMethod,
                                   minGSSize = minGSSize,
                                   maxGSSize = maxGSSize,
                                   use_internal_data = use_internal_data)
    }
    
    # process output
    cc.res <- enrichplot::pairwise_termsim(cc.res)
    if(readable){ cc.res <- DOSE::setReadable(cc.res, OrgDb = OrgDb, keyType=keyTypeGO) }
    # modify output by adding column GeneRatio
    df <- data.frame(cc.res) %>% 
            tidyr::separate(GeneRatio,c("count","total"),sep="/") %>% 
            dplyr::mutate(GeneRatio = as.double(count)/as.double(total))
    
    df_top.cats <- filterCategory(df=df,N=showCategory,pvalueCutoff=pvalueCutoff, qvalueCutoff=qvalueCutoff)    # Filter the results to show only top N GO terms less than p/q cutoff in all clusters
    
    orderVec <- NULL
    for(i in 1:length(df_top.cats$Description)){
        if(df_top.cats$Description[i] %in% orderVec){
          next
        } else {
          orderVec <- c(orderVec,df_top.cats$Description[i])
        }
    }
    
    df_top.cats <- df_top.cats %>% transform(Description = factor(Description, levels = rev(orderVec)))
    
    # labelDF: based on results from compareCluster OUTput; input clusters that contain no (0) genes are not plotted!
    labelDF <- data.frame(cc.res) %>% 
         tidyr::separate(GeneRatio,c("count","total"),sep="/") %>% 
         dplyr::mutate(GeneRatio = as.double(count)/as.double(total)) %>% 
         dplyr::select(Cluster, total) %>% 
         dplyr::distinct() %>% 
         dplyr::mutate(labels = sprintf("%s\n(%s)",Cluster,total))
    
    #labelDF2:  based on results from input list; clusters with no (0) genes are thus plotted!
    labelDF2 <- data.frame(Cluster=names(geneClusters),
                           total = rep.int(0, length(names(geneClusters) )) )
    labelDF2$labels <-  sprintf("%s\n(%s)",labelDF2$Cluster,labelDF2$total)
    #match with 'labelDF' (obtained from compareCuster output; since some genes are dropped [not annotated])
    m <- match(labelDF$Cluster, labelDF2$Cluster)
    labelDF2[m,] <- labelDF
    
    #if(is.null(label_format)) {label_format <- max(nchar (as.character((df_top.cats$Description )))) }
    
    # https://github.com/YuLab-SMU/enrichplot/blob/01dfdd27acc02ca57e7103fe5689a892c06eebd5/R/dotplot.R#L145
    label_func <- default_labeller(label_format)
    if(is.function(label_format)) { label_func <- label_format}
    
    p <- ggplot(df_top.cats, aes(x=Cluster, y=Description)) +
               geom_point(aes(size = GeneRatio, color = p.adjust)) +
                   labs(size="GeneRatio", col="Adj. P-value") +
               scale_x_discrete(drop=FALSE, labels=labelDF2$labels,   # labelDF2
                   guide = guide_axis(n.dodge=n.dodge, angle=angle) ) +   
               scale_y_discrete(labels = label_func) +
               ggtitle(title) +  
               DOSE::theme_dose(font.size) +  
               scale_colour_gradient(limits=c(0, max(df_top.cats$p.adjust)), low="red",high="blue") +
               #theme(axis.title.x = element_blank(), axis.title.y = element_blank())  
               theme(axis.title.y = element_blank()) 
    
    #return(p)
    outList <- list("compareClusterOut" = cc.res,
                    "dotplot.out" = p)
    invisible(return(outList))
}


