This is an analysis of differential expression at the gene level between treatment groups for control and infected mallard ileum samples.
Prep data for analysis
cnt <- cnt1 %>%
as_tibble() %>%
select(-gene)
newNames.ileum <- names(cnt) %>%
as_tibble() %>%
mutate(value = gsub("Iluem_", "", .$value)) %>%
separate(value, into = c(NA, "pt1", "pt2")) %>%
mutate(pt1 = na_if(pt1, "MALL"),
pt2 = na_if(pt2, "ileum")) %>%
mutate(bird = ifelse(is.na(pt1), pt2, pt1)) %>%
mutate(bird = as.numeric(bird),
bird = formatC(bird, width = 2, format = "d", flag = "0")) %>%
mutate(newName = paste0("ileum_", bird))
colnames(cnt) <- newNames.ileum$newName
cnt <- cnt[,order(colnames(cnt))] %>%
mutate(gene = cnt1$gene) %>%
column_to_rownames("gene") %>%
as_tibble(rownames = NA)
### Covars
covars <- covars %>%
mutate(bird = as.numeric(bird)) %>%
mutate(bird = formatC(bird, width = 2, format = "d", flag = "0")) %>%
arrange(bird) %>%
mutate(bird = as.factor(bird)) %>%
mutate(group = str_remove(group, "-"))
covars.ileum <- covars %>%
filter(bird %in% newNames.ileum$bird)
cnt <- cnt %>% select(-ileum_01, -ileum_72)
covars.ileum <- covars.ileum %>% filter(bird != "01",
bird != "72")
Prep data for analysis
Here we standardize expression levels across samples of varying depth by using the counts per million (CPM) and trimmed mean of M-values (TMM) methods. We also remove any genes that are expressed at few than 0.5 CPM in at least 25% of individuals.
#Convert to DGEList object
dge <- DGEList(counts=cnt)
dge$genes <- annot[match(rownames(dge$counts), annot$ensembl_gene_id),]
#CPM and log-CPM
cpm <- cpm(dge)
lcpm <- cpm(dge, log = TRUE)
#### Retain only genes expressed at >0.5 CPM in at least 25% of individuals
table(rowSums(dge$counts==0) == length(cnt[1,]))
##
## FALSE TRUE
## 17388 728
keep.exprs <- rowSums(cpm>0.5) >= length(cnt[1,])/4
sum(keep.exprs)
## [1] 14563
dge <- dge[keep.exprs,, keep.lib.sizes=FALSE]
table(rowSums(dge$counts==0) == length(cnt[1,]))
##
## FALSE
## 14563
dim(dge)
## [1] 14563 40
#TMM normalization
dge <- calcNormFactors(dge, method = "TMM")
Differential expression analysis
Establish design & contrast matrices
## Establish design matrix
design = model.matrix(~ 0 +
group +
age +
sex +
pool)
colnames(design) <- gsub("group", "", colnames(design))
contr.matrix = makeContrasts(
CtlvI1 = Ctl - I1,
CtlvI2 = Ctl - I2,
CtlvI5 = Ctl - I5,
I1vI2 = I1 - I2,
I1vI5 = I1 - I5,
I2vI5 = I2 - I5,
levels = colnames(design))
## Mean-variance trend and sample weight plots
v <- voomWithQualityWeights(dge, design, plot = TRUE)

## Fitting the model
vfit <- lmFit(v, design)
vfit <- contrasts.fit(vfit, contrasts = contr.matrix)
tfit <- treat(vfit, lfc = 0.5)
plotSA(tfit)

Count DE genes
For a gene to be considered differentially expressed, we require a p-value of 0.1 with a false discovery rate correction and a log fold count difference of 1.
dt <- decideTests(tfit, p.value = 0.1, adjust.method = "fdr")
summary(dt) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)
|
CtlvI1
|
CtlvI2
|
CtlvI5
|
I1vI2
|
I1vI5
|
I2vI5
|
Down
|
0
|
0
|
0
|
0
|
0
|
1
|
NotSig
|
14563
|
14563
|
14563
|
14563
|
14561
|
14559
|
Up
|
0
|
0
|
0
|
0
|
2
|
3
|
dt.tib <- as_tibble(dt, rownames = NA) %>%
rownames_to_column("gene") %>%
mutate_at(vars(starts_with("C")), as.numeric) %>%
mutate_at(vars(starts_with("I")), as.numeric) %>%
filter_at(vars(CtlvI1:I2vI5), any_vars(. != 0))
Volcano plot
tmp1 <- topTreat(tfit, coef = 1, n = Inf)
results1 <- mutate(tmp1, sig=ifelse(tmp1$adj.P.Val<0.1, "Sig", "Not Sig"))
p1 <- ggplot(results1, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Control vs. Infected, Day 1") +
ylab("-Log10(p-value)") +
xlab("Log2(Fold change)") +
theme_bw() +
theme(legend.position = "none")
tmp2 <- topTreat(tfit, coef = 2, n = Inf)
results2 <- mutate(tmp2, sig=ifelse(tmp2$adj.P.Val<0.1, "Sig", "Not Sig"))
p2 <- ggplot(results2, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Control vs. Infected, Day 2") +
ylab("-Log10(p-value)") +
xlab("Log2(Fold change)") +
theme_bw() +
theme(legend.position = "none")
tmp3 <- topTreat(tfit, coef = 3, n = Inf)
results3 <- mutate(tmp3, sig=ifelse(tmp3$adj.P.Val<0.1, "Sig", "Not Sig"))
p3 <- ggplot(results3, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Control vs. Infected, Day 5") +
ylab("-Log10(p-value)") +
xlab("Log2(Fold change)") +
theme_bw() +
theme(legend.position = "none")
tmp4 <- topTreat(tfit, coef = 4, n = Inf)
results4 <- mutate(tmp4, sig=ifelse(tmp4$adj.P.Val<0.1, "Sig", "Not Sig"))
p4 <- ggplot(results4, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Infected, Day 1 vs. Infected, Day 2") +
ylab("-Log10(p-value)") +
xlab("Log2(Fold change)") +
theme_bw() +
theme(legend.position = "none")
tmp5 <- topTreat(tfit, coef = 5, n = Inf)
results5 <- mutate(tmp5, sig=ifelse(tmp5$adj.P.Val<0.1, "Sig", "Not Sig"))
p5 <- ggplot(results5, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Infected, Day 1 vs. Infected, Day 5") +
ylab("-Log10(p-value)") +
xlab("Log2(Fold change)") +
theme_bw() +
theme(legend.position = "none")
tmp6 <- topTreat(tfit, coef = 6, n = Inf)
results6 <- mutate(tmp6, sig=ifelse(tmp6$adj.P.Val<0.1, "Sig", "Not Sig"))
p6 <- ggplot(results6, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Infected, Day 2 vs. Infected, Day 5") +
ylab("-Log10(p-value)") +
xlab("Log2(Fold change)") +
theme_bw() +
theme(legend.position = "none")
grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 3)

Heatmap
# Get the gene names for DE genes
deGeneCounts <- lcpm %>%
as_tibble(rownames = NA) %>%
rownames_to_column("gene") %>%
filter(gene %in% dt.tib$gene) %>%
column_to_rownames("gene")
group2 <- group %>%
as_tibble() %>%
mutate(group = sub("I2", "I02", value)) %>%
mutate(group = sub("I1", "I01", group)) %>%
mutate(group = sub("I5", "I05", group))
newNames1 <- colnames(deGeneCounts) %>%
as_tibble() %>%
separate(value, into = c(NA, "sample")) %>%
mutate(group = group2$group) %>%
unite("sample", 2:1, sep = "_")
colnames(deGeneCounts) <- newNames1$sample
deGeneCounts <- deGeneCounts %>% dplyr::select(order(colnames(.))) %>%
as.matrix()
heatmap.annot <- annot %>%
filter(ensembl_gene_id %in% dt.tib$gene) %>%
mutate(hgnc_symbol = replace_na(hgnc_symbol, ".")) %>%
unite(annot.hm, c(ensembl_gene_id, hgnc_symbol), sep = "-", remove = FALSE) %>%
select(annot.hm, ensembl_gene_id)
deGeneCounts <- deGeneCounts %>%
as_tibble(rownames = NA) %>%
mutate(gene = rownames(.)) %>%
rename(ensembl_gene_id = gene) %>%
left_join(heatmap.annot) %>%
column_to_rownames("annot.hm") %>%
select(-ensembl_gene_id) %>%
as.matrix()
## Set up palette
mypalette <- brewer.pal(11,"RdYlBu")
morecols <- colorRampPalette(mypalette)
hc <- hclust(as.dist(1-cor(t(deGeneCounts))))
# Plot the heatmap
heatmap.2(deGeneCounts,
Colv = FALSE,
Rowv = as.dendrogram(hc),
col = rev(morecols(50)),
trace = "none",
colsep = c(5, 17, 27),
dendrogram = "row",
density.info = "none",
key = TRUE,
margins = c(5, 22),
scale ="row")

Boxplots for differentially expressed genes
lcpm.DE <- lcpm %>%
as_tibble(rownames = NA) %>%
rownames_to_column("identifier") %>%
filter(identifier %in% dt.tib$gene) %>%
pivot_longer(cols = contains("_"),
names_to = "sample",
values_to = "lcpm") %>%
separate(sample, into = c("tissue", "bird")) %>%
mutate(bird = as.factor(bird)) %>%
left_join(covars, by = "bird") %>%
select(identifier, bird, lcpm, group) %>%
mutate(group = recode(group, C29 = "Control"),
group = recode(group, C1 = "Control"),
tmpID = ifelse(group == "Control", 'Control', 'Infected'))
aprioriPlotting <- function(target, ...) {
annot.target <- annot %>%
select(
ensembl_gene_id,
hgnc_symbol) %>%
filter(ensembl_gene_id == target) %>%
mutate(hgnc_symbol = replace_na(hgnc_symbol, "."))
plot <- lcpm.DE %>%
filter(identifier == target) %>%
mutate(group = fct_relevel(group, "Control", "I1", "I2", "I5", "I15", "I29")) %>%
ggplot(aes(x = group, y = lcpm)) +
facet_grid(. ~ tmpID, scales = "free", space = "free") +
ylab("Log2(Counts per million)") +
xlab("Group") +
geom_point(position = position_dodge(width=0.75), show.legend = FALSE) +
geom_boxplot(alpha = 0.5) +
geom_label_repel(aes(label = bird, fill = NULL), show.legend=FALSE) +
theme_classic() +
labs(title= paste0(target, " - ", annot.target[1,2])) +
theme(legend.title = element_blank())
print(plot)
}
for(target in sort(unique(lcpm.DE$identifier))) {
aprioriPlotting(target)
}





Annotations for differentially expressed genes
ns denotes non-significant genes for each comparison and numerical values are the Log2(Fold change) difference
ensembl_gene_id
|
hgnc_symbol
|
CtlvI1
|
CtlvI2
|
CtlvI5
|
I1vI2
|
I1vI5
|
I2vI5
|
ENSAPLG00020005696
|
CA7
|
ns
|
ns
|
ns
|
ns
|
3.49
|
ns
|
ENSAPLG00020005864
|
NA
|
ns
|
ns
|
ns
|
ns
|
ns
|
-3.25
|
ENSAPLG00020006372
|
CALB1
|
ns
|
ns
|
ns
|
ns
|
6.91
|
4.85
|
ENSAPLG00020004306
|
FGG
|
ns
|
ns
|
ns
|
ns
|
ns
|
3.28
|
ENSAPLG00020010772
|
SLC26A9
|
ns
|
ns
|
ns
|
ns
|
ns
|
4.54
|
Gene functions
bind_rows(tmp1a, tmp2a, tmp3a, tmp4a, tmp5a, tmp6a) %>%
select(-comp, -adj.P.Val, -logFC) %>%
filter(ensembl_gene_id %in% dt.tib$gene) %>%
distinct(.) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
ensembl_gene_id
|
gene_biotype
|
hgnc_symbol
|
description
|
ENSAPLG00020005696
|
protein_coding
|
CA7
|
carbonic anhydrase 7 [Source:NCBI gene;Acc:101798320]
|
ENSAPLG00020005864
|
protein_coding
|
NA
|
vasotocin-neurophysin VT-like [Source:NCBI gene;Acc:113843450]
|
ENSAPLG00020006372
|
protein_coding
|
CALB1
|
calbindin 1 [Source:NCBI gene;Acc:101802006]
|
ENSAPLG00020004306
|
protein_coding
|
FGG
|
fibrinogen gamma chain [Source:NCBI gene;Acc:101793661]
|
ENSAPLG00020010772
|
protein_coding
|
SLC26A9
|
solute carrier family 26 member 9 [Source:HGNC Symbol;Acc:HGNC:14469]
|
Pathway analysis
de.entrez <- annot %>% filter(ensembl_gene_id %in% dt.tib$gene) %>% na.omit(entrezgene_id)
entrez.universe <- annot %>% na.omit(entrezgene_id)
K <- kegga(de.entrez$entrezgene_id, universe = entrez.universe$entrezgene_id, species.KEGG = "apla") %>%
as_tibble(rownames = NA) %>%
rownames_to_column("PathwayID") %>%
filter(P.DE < 0.05) %>%
arrange(P.DE)
keggList <- getGeneKEGGLinks(species.KEGG = "apla") %>%
as_tibble() %>%
rename("entrezgene_id" = GeneID)
keggGenes = data.frame()
for (path in nrow(K)){
procPath <- filter(keggList, entrezgene_id %in% de.entrez$entrezgene_id)
df <- data.frame(procPath)
keggGenes <- rbind(keggGenes, df)
}
inner_join(keggGenes, K) %>%
as_tibble() %>%
mutate(entrezgene_id = as.numeric(entrezgene_id)) %>%
left_join(annot) %>%
select(Pathway, PathwayID, N, DE, P.DE, ensembl_gene_id, hgnc_symbol, entrezgene_id) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Pathway
|
PathwayID
|
N
|
DE
|
P.DE
|
ensembl_gene_id
|
hgnc_symbol
|
entrezgene_id
|
Nitrogen metabolism
|
path:apla00910
|
3
|
1
|
0.0025722
|
ENSAPLG00020005696
|
CA7
|
101798320
|