Import data
# compute log ratio: log(T/T0) = log(Tx) - log(T0)
# transform to fold change relative to express at T0
data = data %>%
mutate(
T0_S1_x = 2^(T0_S1 - T0_S1),
T15_S1_x = 2^(T15_S1 - T0_S1),
T30_S1_x = 2^(T30_S1 - T0_S1),
T45_S1_x = 2^(T45_S1 - T0_S1),
T60_S1_x = 2^(T60_S1 - T0_S1),
T0_S2_x = 2^(T0_S2 - T0_S2),
T15_S2_x = 2^(T15_S2 - T0_S2),
T30_S2_x = 2^(T30_S2 - T0_S2),
T45_S2_x = 2^(T45_S2 - T0_S2),
T60_S2_x = 2^(T60_S2 - T0_S2),
T0_S3_x = 2^(T0_S3 - T0_S3),
T15_S3_x = 2^(T15_S3 - T0_S3),
T30_S3_x = 2^(T30_S3 - T0_S3),
T45_S3_x = 2^(T45_S3 - T0_S3),
T60_S3_x = 2^(T60_S3 - T0_S3)) %>%
dplyr::select(IdGene, contains("_x"))
names(data) = names(data) %>% sub(pattern = "_x", replacement = "")
# import SGD annotation
sgd = org.Sc.sgdGENENAME
annot.genes = as.data.frame(sgd)
names(annot.genes) = c("IdGene", "GeneName")
Time course analysis using step regression
Backward variable selection
# gather columns Tx_Sy in rows
model.data = data %>%
gather(key = "Time_Serie", value = Expression, 2:16) %>%
separate(Time_Serie, c("Time","Serie")) %>%
transmute(
IdGene,
Time = as.double(substring(Time, 2)),
Time2 = Time^2,
Expression)
# create a nested dataframe grouped by gene
model.data = model.data %>% group_by(IdGene) %>% nest()
# function to compute step regression
model.fn = function(df) {
full.model = lm(formula = Expression ~ Time + Time2, data = df)
stepAIC(full.model, direction = "backward", trace = FALSE)
}
# apply step regression per gene
model.data = model.data %>% mutate(model = map(data, model.fn))
# function used to extract stats and coeffs
get_results = function(model) {
# output list
result = list(
r.squared = NA,
p.value = NA,
AIC = Inf,
coefficients = NA,
Time = 0,
Time2 = 0
)
# extract model stats and coefficients
stats = glance(model)
coeff = coefficients(model)
# store stats in list
result[["r.squared"]] = stats[["r.squared"]]
result[["p.value"]] = stats[["p.value"]]
result[["AIC"]] = stats[["AIC"]]
# store coeff if exist
if ( "Time" %in% names(coeff) ) {
result[["Time"]] = coeff[["Time"]]
} else {
result[["Time"]] = 0
}
if ( "Time2" %in% names(coeff) ) {
result[["Time2"]] = coeff[["Time2"]]
} else {
result[["Time2"]] = 0
}
return(result)
}
# extract model statistics and coefficients using map()
# get the nested dataframe "result"
model.data = model.data %>%
transmute(
IdGene,
model,
result = map(model, get_results)
)
# extract parameters from "result" column using map_dbl()
result.data = model.data %>%
transmute(
IdGene,
Time = map_dbl(result, "Time"),
Time2 = map_dbl(result, "Time2"),
r.squared = map_dbl(result, "r.squared"),
AIC = map_dbl(result, "AIC"),
p.value = map_dbl(result, "p.value")
)
# correction for multiple testing
multi.test.method = "BH"
result.data = result.data %>%
bind_cols(
adj.p.val = p.adjust(result.data$p.value, method = multi.test.method))
# filtered genes data
r2.thresthold = .7
p.val.threshold = 0.01
# filtered data on thresthold
# signif.gene = result.data %>% filter(r.squared < r2.threshold)
signif.gene = result.data %>% filter(adj.p.val < p.val.threshold)
# plot time ~ time2
ggplot(signif.gene, aes(Time, Time2)) +
geom_point(alpha = .05) +
#stat_smooth(method = "lm", formula = "y ~ x", size = .3) +
theme_agile()

# zoom plot
ggplot(signif.gene, aes(Time, Time2)) +
geom_point(alpha = .05) +
# scale_x_continuous(limits = c(-.25,.25)) +
# scale_y_continuous(limits = c(-.002,.002)) +
coord_cartesian(xlim = c(-.25,.25), ylim = c(-.002,.002)) +
#stat_smooth(method = "lm", formula = "y ~ x", size = .3) +
theme_agile()

# extract glycolytic enzyme encoding gene model coeff
glyc.gene.coeff = signif.gene %>% filter(IdGene %in% glyco[["IdGene"]])
glyc.gene.coeff
Manual clustering
# function used for the attribution to cluster
extract.cluster = function(t, t2) {
if(t > 0 && t2 < 0) return("1")
if(t < 0 && t2 > 0) return("2")
if(t > 0 && t2 == 0) return("3")
if(t < 0 && t2 == 0) return("4")
if(t == 0 && t2 > 0) return("5")
if(t == 0 && t2 < 0) return("6")
if(t > 0 && t2 > 0) return("7")
if(t < 0 && t2 < 0) return("8")
}
# extract cluster value
clust.res = signif.gene %>%
mutate(Cluster = map2(Time, Time2, extract.cluster)) %>%
mutate(Cluster = flatten_chr(Cluster)) %>%
left_join(annot.genes, by = "IdGene") %>%
dplyr::select(IdGene, GeneName, Time:Cluster)
clust.res %>% tail(100) %>% head()
write_tsv(clust.res, path = "step-regression_results.tsv")
# summary per cluster
clust.summary = clust.res %>% group_by(Cluster) %>% summarise(n()) %>% mutate(cumsum = cumsum(`n()`))
clust.summary
write_tsv(clust.summary, path = "clustering_summary.tsv")
# average coeff values
clust.coeff = clust.res %>% group_by(Cluster) %>% summarise(avg.Time = mean(Time), avg.Time2 = mean(Time2))
clust.coeff
write_tsv(clust.coeff, path = "average_cluster-coeff.tsv")
# extract glycolytic enzyme encoding gene data
glyc.gene.cluster = clust.res %>%
filter(IdGene %in% glyco[["IdGene"]]) %>%
dplyr::select(IdGene, GeneName, Time:Cluster)
glyc.gene.cluster
Color representation of coefficients
# plot time ~ time2
ggplot(clust.res, aes(Time, Time2)) +
geom_point(aes(color = Cluster), alpha = .8, size = .3) +
# stat_smooth(method = "lm", formula = "y ~ x", size = .3) +
# scale_x_continuous(limits = c(-.2, +.2)) +
# scale_y_continuous(limits = c(-.002, +.002)) +
scale_color_brewer(type = "qual", palette = "Set2") +
theme_agile()

# zoom plot
ggplot(clust.res, aes(Time, Time2)) +
geom_point(aes(color = Cluster), alpha = .8, size = .3) +
coord_cartesian(xlim = c(-.25,.25), ylim = c(-.002,.002)) +
#stat_smooth(method = "lm", formula = "y ~ x", size = .3) +
theme_agile()

Export gene lists by cluster
# add cluster id to data
clusters = clust.res %>% dplyr::select(IdGene, Cluster, GeneName)
# extract expression data
cluster.data = data %>%
gather(key = "Time_Serie", value = Expression, 2:16) %>%
separate(Time_Serie, c("Time","Serie")) %>%
mutate(
Time = as.double(substring(Time, 2)),
Serie = substring(Serie, 2)) %>%
left_join(clusters, by = "IdGene") %>%
filter(!is.na(Cluster))
# for each time point, compute average expression
express.res = cluster.data %>%
group_by(IdGene, GeneName, Time, Cluster) %>%
summarise(Avg.express = mean(Expression)) %>%
spread(key = Time, value = Avg.express, sep = "_") %>%
dplyr::rename(t0 = Time_0, t15 = Time_15, t30 = Time_30, t45 = Time_45, t60 = Time_60)
# save results for up-regulated genes
express.res %>%
filter(Cluster %in% c(1,3,5,7)) %>%
write_tsv("cluster.up-regulated.genes.tsv", col_names = TRUE)
# save results for down-regulated genes
express.res %>%
filter(Cluster %in% c(2,4,6,8)) %>%
write_tsv("cluster.down-regulated.genes.tsv", col_names = TRUE)
# save results for each cluster
for (i in seq(1,8)) {
out.file = paste("cluster.",i,".genes.tsv", sep = "")
express.res %>%
filter(Cluster == i) %>%
write_tsv(out.file, col_names = TRUE)
}
Select top up- and down-regulated genes and compute GO-term enrichment
# plot histo of fold changes for up-regulated genes
fc.up =express.res %>%
group_by(IdGene) %>%
filter(Cluster %in% c(1,3,5,7)) %>%
mutate(FC = max(t0,t15,t30,t45,t60)) %>%
ungroup() %>%
dplyr::select(IdGene, FC)
# compute quantile of the distribution for up-genes
quantile(fc.up[["FC"]], probs = c(0, 10, 50, 90, 100)/100)
## 0% 10% 50% 90% 100%
## 1.010253 1.285885 2.044742 5.813714 69.918925
# plot histo of fold changes for down-regulated genes
# compute -1/FC for down-regulated genes
fc.down = express.res %>%
group_by(IdGene) %>%
filter(Cluster %in% c(2,4,6,8)) %>% mutate(FC = min(-1/t0,-1/t15,-1/t30,-1/t45,-1/t60)) %>%
#mutate(FC = max(1/t0,1/t15,1/t30,1/t45,1/t60)) %>%
ungroup() %>%
dplyr::select(IdGene, FC)
# compute quantile of the distribution for down-genes
quantile(fc.down[["FC"]], probs = c(0, 10, 50, 90, 100)/100)
## 0% 10% 50% 90% 100%
## -64.499744 -6.670903 -1.909342 -1.190043 -1.000000
fc.up %>% bind_rows(fc.down) %>% ggplot() + geom_histogram(aes(FC), binwidth = .25) + theme_agile()

# filter top FC genes (FC > 6, top 10% quantile)
top.gene.up = fc.up %>% filter(FC > 6)
top.gene.up %>% write_tsv("top-up-regulated-genes.tsv", col_names = TRUE)
top.gene.down = fc.down %>% filter(FC < -6)
top.gene.down %>% write_tsv("top-down-regulated-genes.tsv", col_names = TRUE)
## GO-term enrichment
# universe gene list, remove EC1118 specific genes
universe.gene.id = signif.gene[["IdGene"]] %>% grep(pattern = "^EC", invert = TRUE, value = TRUE)
# top up/down gene list, remove EC1118 specific genes
top.gene.up.id = top.gene.up[["IdGene"]] %>% grep(pattern = "^EC", invert = TRUE, value = TRUE)
top.gene.down.id = top.gene.down[["IdGene"]] %>% grep(pattern = "^EC", invert = TRUE, value = TRUE)
top.list = list("up" = top.gene.up.id, "down" = top.gene.down.id)
# function to compute hyper geometric test
hyper.geom.test = function(selected.genes, p.val.cutoff) {
# set hyperG test parameters
params = new(
"GOHyperGParams",
geneIds = selected.genes,
universeGeneIds = universe.gene.id,
annotation = "org.Sc.sgd.db",
ontology = "BP",
pvalueCutoff = p.val.cutoff,
conditional = FALSE,
testDirection = "over")
# hypergeometric test
hgOver = hyperGTest(params)
hgOver
# extract p-values
hyperG.pval = pvalues(hgOver) %>%
as.data.frame() %>%
as_tibble(rownames = "go_id") %>%
dplyr::rename(p.value = '.')
adjust.p.value = p.adjust(hyperG.pval[["p.value"]], "BH")
hyperG.pval = hyperG.pval %>%
bind_cols(adj.p.val = adjust.p.value) %>%
filter(adj.p.val < p.val.cutoff)
# extract GO terms
go.term = as.data.frame(GOTERM) %>%
as_tibble() %>%
filter(Ontology == "BP") %>%
dplyr::select(go_id, Term) %>% distinct()
# join pval x go term
hyperG.pval %>% left_join(go.term, by = "go_id")
}
# apply hyper geom test
p.value.cutoff = .001
hyper.geom.test.results = lapply(top.list, function(x) { hyper.geom.test(x, p.value.cutoff) })
# save GO-term enrichment results
hyper.geom.test.results$up %>% write_tsv("top-up-regulated-genes.go-term.tsv")
hyper.geom.test.results$down %>% write_tsv("top-down-regulated-genes.go-term.tsv")
Generate plots of expression profiles
Plot per cluster
# generate average expression profile per cluster
cluster.profile = clust.coeff %>%
mutate(time = list(seq(0,60))) %>%
unnest() %>%
mutate(profile.express = 1.0 + avg.Time*time + avg.Time2*time^2)
# compute average expression per time point for each serie
avg.by.serie = cluster.data %>%
group_by(Cluster, Time, Serie) %>%
summarise(Avg.express = mean(Expression))
# plot graph
cluster.plot = ggplot() +
geom_line(
aes(time, profile.express),
data = cluster.profile,
color = "black", alpha = 1, size = .5, linetype = 1) +
geom_point(
aes(Time, Avg.express, color = Serie),
data = avg.by.serie,
shape = 21, stroke = .3) +
theme_agile() +
scale_x_continuous(limits = c(0,60), breaks = seq(0,60,15)) +
labs(x = "Time (min)", y = "relative expression") +
facet_wrap(~Cluster, scales = "free", ncol = 2)
cluster.plot

# save plot
plot.file = paste(prefix, ".manual-clusters.plot.pdf", sep = "")
ggsave(
plot.file,
plot = cluster.plot,
device = "pdf",
width = 5,
height = 9)
Plot for glycolysis associated genes
# extract expression data for glyco genes
glyco.data = data %>% dplyr::filter(IdGene %in% glyco[["IdGene"]]) %>%
gather(key = "Time_Serie", value = Expression, 2:16) %>%
separate(Time_Serie, c("Time","Serie")) %>%
mutate(
Time = as.double(substring(Time, 2)),
Serie = substring(Serie, 2)) %>%
left_join(glyco, by = "IdGene") %>%
dplyr::select(Gene, Time, Serie, Expression)
# compute average expression level (over S1,2,3)
avg.by.time = glyco.data %>%
group_by(Gene, Time) %>%
summarise(Avg.express = mean(Expression))
# plot glyco genes expression profiles
glyco.plot = ggplot() +
geom_line(
aes(Time, Avg.express),
data = avg.by.time,
color = "red", alpha = 1, size = .3, linetype = 2) +
geom_point(
aes(Time, Expression, color = Serie),
data = glyco.data,
shape = 21, stroke = .3) +
theme_agile() +
scale_x_continuous(limits = c(0,60), breaks = seq(0,60,15)) +
labs(x = "Time (min)", y = "Average expression") +
facet_wrap(~Gene, scales = "free_y", ncol = 4)
glyco.plot

# save plot
glyco.plot.file = paste(prefix, ".glycolytic.plot.pdf", sep = "")
ggsave(
glyco.plot.file,
plot = glyco.plot,
device = "pdf",
width = 10,
height = 10)