Materials & Methods

Materials

Methods

library(readr)
library(dplyr)
library(tidyr)
library(purrr)
library(broom)
library(ggplot2)
library(MASS)
library(org.Sc.sgd.db)
library(GOstats)


# import custom ggplot2 theme
source("theme.R")

Datasets

# import GEO expression dataset
input.file = "2014Chloe.ExpressionNorm.ID.txt"
prefix = input.file %>% basename() %>% gsub(pattern = ".txt", replacement = "")
prefix

## [1] "2014Chloe.ExpressionNorm.ID"

# import log expression values
data = read_tsv(input.file)

# import glycolytic process genes
glyco = readODS::read_ods("glycolytic_process_genes.ods", sheet = "manually_curated") %>% as_tibble()
colnames(glyco) = c("Gene","IdGene","GO.term","GO.id","Method","Source")
glyco

Results

Import data

#  compute log ratio: log(T/T0) = log(Tx) - log(T0)
#  transform to fold change relative to express at T0
data = data %>% 
    mutate(
        T0_S1_x  = 2^(T0_S1  - T0_S1), 
        T15_S1_x = 2^(T15_S1 - T0_S1), 
        T30_S1_x = 2^(T30_S1 - T0_S1), 
        T45_S1_x = 2^(T45_S1 - T0_S1), 
        T60_S1_x = 2^(T60_S1 - T0_S1),
        T0_S2_x  = 2^(T0_S2  - T0_S2), 
        T15_S2_x = 2^(T15_S2 - T0_S2), 
        T30_S2_x = 2^(T30_S2 - T0_S2), 
        T45_S2_x = 2^(T45_S2 - T0_S2), 
        T60_S2_x = 2^(T60_S2 - T0_S2),
        T0_S3_x  = 2^(T0_S3  - T0_S3), 
        T15_S3_x = 2^(T15_S3 - T0_S3), 
        T30_S3_x = 2^(T30_S3 - T0_S3), 
        T45_S3_x = 2^(T45_S3 - T0_S3), 
        T60_S3_x = 2^(T60_S3 - T0_S3)) %>% 
    dplyr::select(IdGene, contains("_x"))

names(data) = names(data) %>% sub(pattern = "_x", replacement = "")

# import SGD annotation
sgd = org.Sc.sgdGENENAME
annot.genes = as.data.frame(sgd)
names(annot.genes) = c("IdGene", "GeneName")

Time course analysis using step regression

Backward variable selection

# gather columns Tx_Sy in rows
model.data = data %>% 
    gather(key = "Time_Serie", value = Expression, 2:16) %>% 
    separate(Time_Serie, c("Time","Serie")) %>%
    transmute(
        IdGene,
        Time = as.double(substring(Time, 2)), 
        Time2 = Time^2,
        Expression)

# create a nested dataframe grouped by gene
model.data = model.data %>% group_by(IdGene) %>% nest()

# function to compute step regression 
model.fn = function(df) {
    full.model = lm(formula = Expression ~ Time + Time2, data = df)
    stepAIC(full.model, direction = "backward", trace = FALSE)
}

# apply step regression per gene
model.data = model.data %>% mutate(model = map(data, model.fn))

# function used to extract stats and coeffs 
get_results = function(model) {
    
    # output list
    result = list(
        r.squared = NA,
        p.value = NA,
        AIC = Inf,
        coefficients = NA,
        Time = 0,
        Time2 = 0
    )
    
    # extract model stats and coefficients
    stats = glance(model)
    coeff = coefficients(model)
    
    # store stats in list
    result[["r.squared"]] = stats[["r.squared"]]
    result[["p.value"]]   = stats[["p.value"]]
    result[["AIC"]]       = stats[["AIC"]]
    
    # store coeff if exist
    if ( "Time"  %in% names(coeff) ) {
        result[["Time"]] = coeff[["Time"]]
    } else {
        result[["Time"]] = 0
    }
    
    if ( "Time2" %in% names(coeff) ) {
        result[["Time2"]] = coeff[["Time2"]]
    } else {
        result[["Time2"]] = 0
    }

    return(result)
}

# extract model statistics and coefficients using map()
#   get the nested dataframe "result"
model.data = model.data %>%
    transmute(
        IdGene,
        model,
        result = map(model, get_results)
    )

# extract parameters from "result" column using map_dbl()
result.data = model.data %>%
    transmute(
        IdGene,
        Time      = map_dbl(result, "Time"),
        Time2     = map_dbl(result, "Time2"),
        r.squared = map_dbl(result, "r.squared"),
        AIC       = map_dbl(result, "AIC"),
        p.value   = map_dbl(result, "p.value")
    )

# correction for multiple testing
multi.test.method = "BH"
result.data = result.data %>% 
    bind_cols(
        adj.p.val = p.adjust(result.data$p.value, method = multi.test.method))

# filtered genes data
r2.thresthold = .7
p.val.threshold = 0.01

# filtered data on thresthold
# signif.gene = result.data %>% filter(r.squared < r2.threshold)
signif.gene = result.data %>% filter(adj.p.val < p.val.threshold)

# plot time ~ time2
ggplot(signif.gene, aes(Time, Time2)) +
    geom_point(alpha = .05) +
    #stat_smooth(method = "lm", formula = "y ~ x", size = .3) +
    theme_agile()

# zoom plot
ggplot(signif.gene, aes(Time, Time2)) +
    geom_point(alpha = .05) + 
    # scale_x_continuous(limits = c(-.25,.25)) +
    # scale_y_continuous(limits = c(-.002,.002)) +
    coord_cartesian(xlim = c(-.25,.25), ylim = c(-.002,.002)) +
    #stat_smooth(method = "lm", formula = "y ~ x", size = .3) +
    theme_agile()

# extract glycolytic enzyme encoding gene model coeff
glyc.gene.coeff = signif.gene %>% filter(IdGene %in% glyco[["IdGene"]])
glyc.gene.coeff

Manual clustering

# function used for the attribution to cluster 
extract.cluster = function(t, t2) {
    if(t > 0 && t2 < 0)  return("1")
    if(t < 0 && t2 > 0)  return("2")
    if(t > 0 && t2 == 0) return("3")
    if(t < 0 && t2 == 0) return("4")
    if(t == 0 && t2 > 0) return("5")
    if(t == 0 && t2 < 0) return("6")
    if(t > 0 && t2 > 0)  return("7")
    if(t < 0 && t2 < 0)  return("8")
}

# extract cluster value 
clust.res = signif.gene %>% 
    mutate(Cluster = map2(Time, Time2, extract.cluster)) %>% 
    mutate(Cluster = flatten_chr(Cluster)) %>% 
    left_join(annot.genes, by = "IdGene") %>%
    dplyr::select(IdGene, GeneName, Time:Cluster)

clust.res %>% tail(100) %>% head()

write_tsv(clust.res, path = "step-regression_results.tsv")

# summary per cluster
clust.summary = clust.res %>% group_by(Cluster) %>% summarise(n()) %>% mutate(cumsum = cumsum(`n()`))
clust.summary

write_tsv(clust.summary, path = "clustering_summary.tsv")

# average coeff values
clust.coeff = clust.res %>% group_by(Cluster) %>% summarise(avg.Time = mean(Time), avg.Time2 = mean(Time2))
clust.coeff

write_tsv(clust.coeff, path = "average_cluster-coeff.tsv")

# extract glycolytic enzyme encoding gene data
glyc.gene.cluster = clust.res %>% 
    filter(IdGene %in% glyco[["IdGene"]]) %>%
    dplyr::select(IdGene, GeneName, Time:Cluster)
glyc.gene.cluster

Color representation of coefficients

# plot time ~ time2
ggplot(clust.res, aes(Time, Time2)) +
    geom_point(aes(color = Cluster), alpha = .8, size = .3) +
    # stat_smooth(method = "lm", formula = "y ~ x", size = .3) +
    # scale_x_continuous(limits = c(-.2, +.2)) +
    # scale_y_continuous(limits = c(-.002, +.002)) +
    scale_color_brewer(type = "qual", palette = "Set2") +
    theme_agile()

# zoom plot
ggplot(clust.res, aes(Time, Time2)) +
    geom_point(aes(color = Cluster), alpha = .8, size = .3) +
    coord_cartesian(xlim = c(-.25,.25), ylim = c(-.002,.002)) +
    #stat_smooth(method = "lm", formula = "y ~ x", size = .3) +
    theme_agile()

Export gene lists by cluster

# add cluster id to data
clusters = clust.res %>% dplyr::select(IdGene, Cluster, GeneName)

# extract expression data
cluster.data = data %>%
    gather(key = "Time_Serie", value = Expression, 2:16) %>%
    separate(Time_Serie, c("Time","Serie")) %>%
    mutate(
        Time = as.double(substring(Time, 2)),
        Serie = substring(Serie, 2)) %>%
    left_join(clusters, by = "IdGene") %>% 
    filter(!is.na(Cluster))

# for each time point, compute average expression
express.res = cluster.data %>%
  group_by(IdGene, GeneName, Time, Cluster) %>% 
  summarise(Avg.express = mean(Expression)) %>%
  spread(key = Time, value = Avg.express, sep = "_") %>%
  dplyr::rename(t0 = Time_0, t15 = Time_15, t30 = Time_30, t45 = Time_45, t60 = Time_60)

# save results for up-regulated genes
express.res %>% 
        filter(Cluster %in% c(1,3,5,7)) %>% 
        write_tsv("cluster.up-regulated.genes.tsv", col_names = TRUE)

# save results for down-regulated genes
express.res %>% 
        filter(Cluster %in% c(2,4,6,8)) %>% 
        write_tsv("cluster.down-regulated.genes.tsv", col_names = TRUE)

# save results for each cluster
for (i in seq(1,8)) {
    out.file = paste("cluster.",i,".genes.tsv", sep = "")
    express.res %>% 
        filter(Cluster == i) %>% 
        write_tsv(out.file, col_names = TRUE)    
}

Select top up- and down-regulated genes and compute GO-term enrichment

# plot histo of fold changes for up-regulated genes 
fc.up =express.res %>% 
  group_by(IdGene) %>% 
  filter(Cluster %in% c(1,3,5,7)) %>% 
  mutate(FC = max(t0,t15,t30,t45,t60)) %>% 
  ungroup() %>% 
  dplyr::select(IdGene, FC)

# compute quantile of the distribution for up-genes
quantile(fc.up[["FC"]], probs = c(0, 10, 50, 90, 100)/100)

##        0%       10%       50%       90%      100% 
##  1.010253  1.285885  2.044742  5.813714 69.918925

# plot histo of fold changes for down-regulated genes 
#  compute -1/FC for down-regulated genes
fc.down = express.res %>% 
  group_by(IdGene) %>% 
  filter(Cluster %in% c(2,4,6,8)) %>% mutate(FC = min(-1/t0,-1/t15,-1/t30,-1/t45,-1/t60)) %>% 
  #mutate(FC = max(1/t0,1/t15,1/t30,1/t45,1/t60)) %>% 
  ungroup() %>% 
  dplyr::select(IdGene, FC)

# compute quantile of the distribution for down-genes
quantile(fc.down[["FC"]], probs = c(0, 10, 50, 90, 100)/100)

##         0%        10%        50%        90%       100% 
## -64.499744  -6.670903  -1.909342  -1.190043  -1.000000

fc.up %>% bind_rows(fc.down) %>% ggplot() + geom_histogram(aes(FC), binwidth = .25) + theme_agile()

# filter top FC genes (FC > 6, top 10% quantile)
top.gene.up = fc.up %>% filter(FC > 6)
top.gene.up %>% write_tsv("top-up-regulated-genes.tsv", col_names = TRUE)

top.gene.down = fc.down %>% filter(FC < -6) 
top.gene.down %>% write_tsv("top-down-regulated-genes.tsv", col_names = TRUE)

## GO-term enrichment

# universe gene list, remove EC1118 specific genes
universe.gene.id = signif.gene[["IdGene"]] %>% grep(pattern = "^EC", invert = TRUE, value = TRUE)

# top up/down gene list, remove EC1118 specific genes
top.gene.up.id = top.gene.up[["IdGene"]] %>% grep(pattern = "^EC", invert = TRUE, value = TRUE)
top.gene.down.id = top.gene.down[["IdGene"]] %>% grep(pattern = "^EC", invert = TRUE, value = TRUE)
top.list = list("up" = top.gene.up.id, "down" = top.gene.down.id)

# function to compute hyper geometric test
hyper.geom.test = function(selected.genes, p.val.cutoff) {

  # set hyperG test parameters
  params = new(
    "GOHyperGParams",
    geneIds = selected.genes,
    universeGeneIds = universe.gene.id,
    annotation = "org.Sc.sgd.db",
    ontology = "BP",
    pvalueCutoff = p.val.cutoff,
    conditional = FALSE,
    testDirection = "over")
  
  # hypergeometric test
  hgOver = hyperGTest(params)
  hgOver
  
  # extract p-values
  hyperG.pval = pvalues(hgOver) %>% 
    as.data.frame() %>% 
    as_tibble(rownames = "go_id") %>% 
    dplyr::rename(p.value = '.')
  
  adjust.p.value = p.adjust(hyperG.pval[["p.value"]], "BH")
  
  hyperG.pval = hyperG.pval %>%
    bind_cols(adj.p.val = adjust.p.value) %>%
    filter(adj.p.val < p.val.cutoff)
  
  # extract GO terms
  go.term = as.data.frame(GOTERM) %>% 
    as_tibble() %>% 
    filter(Ontology == "BP") %>% 
    dplyr::select(go_id, Term) %>% distinct()
  
  # join pval x go term
  hyperG.pval %>% left_join(go.term, by = "go_id")
}

# apply hyper geom test
p.value.cutoff = .001
hyper.geom.test.results = lapply(top.list, function(x) { hyper.geom.test(x, p.value.cutoff) })

# save GO-term enrichment results
hyper.geom.test.results$up   %>% write_tsv("top-up-regulated-genes.go-term.tsv")
hyper.geom.test.results$down %>% write_tsv("top-down-regulated-genes.go-term.tsv")

Generate plots of expression profiles

Plot per cluster

# generate average expression profile per cluster
cluster.profile = clust.coeff %>%  
    mutate(time = list(seq(0,60))) %>% 
    unnest() %>% 
    mutate(profile.express = 1.0 + avg.Time*time + avg.Time2*time^2)

# compute average expression per time point for each serie
avg.by.serie = cluster.data %>% 
    group_by(Cluster, Time, Serie) %>% 
    summarise(Avg.express = mean(Expression))

# plot graph
cluster.plot =  ggplot() +
    geom_line(
        aes(time, profile.express),
        data = cluster.profile,
        color = "black", alpha = 1, size = .5, linetype = 1) +
    geom_point(
        aes(Time, Avg.express, color = Serie),
        data = avg.by.serie, 
        shape = 21, stroke = .3) + 
    theme_agile() +
    scale_x_continuous(limits = c(0,60), breaks = seq(0,60,15)) +
    labs(x = "Time (min)", y = "relative expression") +
    facet_wrap(~Cluster, scales = "free", ncol = 2)

cluster.plot

# save plot
plot.file = paste(prefix, ".manual-clusters.plot.pdf", sep = "")
ggsave(
    plot.file, 
    plot = cluster.plot, 
    device = "pdf", 
    width = 5, 
    height = 9)

Plot for glycolysis associated genes

# extract expression data for glyco genes
glyco.data = data %>% dplyr::filter(IdGene %in% glyco[["IdGene"]]) %>% 
    gather(key = "Time_Serie", value = Expression, 2:16) %>% 
    separate(Time_Serie, c("Time","Serie")) %>%
    mutate(
        Time = as.double(substring(Time, 2)), 
        Serie = substring(Serie, 2)) %>% 
    left_join(glyco, by = "IdGene") %>%
    dplyr::select(Gene, Time, Serie, Expression)

# compute average expression level (over S1,2,3) 
avg.by.time = glyco.data %>% 
    group_by(Gene, Time) %>% 
    summarise(Avg.express = mean(Expression))

# plot glyco genes expression profiles
glyco.plot =  ggplot() + 
    geom_line(
        aes(Time, Avg.express), 
        data = avg.by.time, 
        color = "red", alpha = 1, size = .3, linetype = 2) +
    geom_point(
        aes(Time, Expression, color = Serie),
        data = glyco.data, 
        shape = 21, stroke = .3) + 
    theme_agile() +
    scale_x_continuous(limits = c(0,60), breaks = seq(0,60,15)) +
    labs(x = "Time (min)", y = "Average expression") +
    facet_wrap(~Gene, scales = "free_y", ncol = 4)

glyco.plot

# save plot
glyco.plot.file = paste(prefix, ".glycolytic.plot.pdf", sep = "")
ggsave(
    glyco.plot.file, 
    plot = glyco.plot, 
    device = "pdf", 
    width = 10, 
    height = 10)

Session

sessionInfo()

## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.1 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=fr_FR.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=fr_FR.UTF-8        LC_COLLATE=fr_FR.UTF-8    
##  [5] LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=fr_FR.UTF-8   
##  [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] GO.db_3.5.0          bindrcpp_0.2         GOstats_2.44.0      
##  [4] graph_1.56.0         Category_2.44.0      Matrix_1.2-12       
##  [7] org.Sc.sgd.db_3.5.0  AnnotationDbi_1.40.0 IRanges_2.12.0      
## [10] S4Vectors_0.16.0     Biobase_2.38.0       BiocGenerics_0.24.0 
## [13] MASS_7.3-49          ggplot2_2.2.1        broom_0.5.1         
## [16] purrr_0.2.4          tidyr_0.8.0          dplyr_0.7.4         
## [19] readr_1.1.1          knitr_1.20          
## 
## loaded via a namespace (and not attached):
##  [1] genefilter_1.60.0      tidyselect_0.2.4       splines_3.4.4         
##  [4] lattice_0.20-35        colorspace_1.3-2       generics_0.0.2        
##  [7] htmltools_0.3.6        yaml_2.2.0             survival_2.41-3       
## [10] RBGL_1.54.0            blob_1.1.1             XML_3.98-1.16         
## [13] rlang_0.2.0            pillar_1.0.1           glue_1.2.0            
## [16] DBI_1.0.0              Rgraphviz_2.22.0       bit64_0.9-7           
## [19] bindr_0.1              plyr_1.8.4             stringr_1.3.0         
## [22] munsell_0.4.3          gtable_0.2.0           codetools_0.2-15      
## [25] evaluate_0.12          memoise_1.1.0          GSEABase_1.40.1       
## [28] Rcpp_0.12.15           xtable_1.8-3           scales_0.5.0          
## [31] backports_1.1.2        annotate_1.56.2        bit_1.1-14            
## [34] hms_0.4.0              digest_0.6.15          stringi_1.1.6         
## [37] grid_3.4.4             rprojroot_1.3-2        tools_3.4.4           
## [40] bitops_1.0-6           magrittr_1.5           lazyeval_0.2.1        
## [43] RCurl_1.95-4.11        tibble_1.4.1           RSQLite_2.1.1         
## [46] pkgconfig_2.0.1        assertthat_0.2.0       rmarkdown_1.10        
## [49] AnnotationForge_1.20.0 R6_2.2.2               nlme_3.1-131          
## [52] compiler_3.4.4

Gene expression analysis by step regression and manual clustering

Frédéric Bigey

2018-12-20

Materials & Methods

Materials

Methods

Datasets

Results

Import data

Time course analysis using step regression

Backward variable selection

Manual clustering

Color representation of coefficients

Export gene lists by cluster

Select top up- and down-regulated genes and compute GO-term enrichment

Generate plots of expression profiles

Plot per cluster

Plot for glycolysis associated genes

Session