This is an analysis of differential expression at the gene level between shedding rate groupings based on average shedding rates for infected mallard ileum samples.
## Load packages and data
library(limma)
library(edgeR)
library(Glimma)
library(ggrepel)
library(gplots)
library(RColorBrewer)
library(gridExtra)
library(kableExtra)
library(tidyverse)
## Load data
cnt1 <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/ileumCounts.txt", delim = "\t") %>% distinct(gene, .keep_all = TRUE)
covars <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/MALL70_SSgroup_RAW_11.5.20.csv", delim = ",")
annot <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/annotationGenesIleum110120.txt", delim = "\t")
Prep data for analysis
cnt <- cnt1 %>%
as_tibble() %>%
select(-gene)
newNames.ileum <- names(cnt) %>%
as_tibble() %>%
mutate(value = gsub("Iluem_", "", .$value)) %>%
separate(value, into = c(NA, "pt1", "pt2")) %>%
mutate(pt1 = na_if(pt1, "MALL"),
pt2 = na_if(pt2, "ileum")) %>%
mutate(bird = ifelse(is.na(pt1), pt2, pt1)) %>%
mutate(bird = as.numeric(bird),
bird = formatC(bird, width = 2, format = "d", flag = "0")) %>%
mutate(newName = paste0("ileum_", bird))
colnames(cnt) <- newNames.ileum$newName
cnt <- cnt[,order(colnames(cnt))] %>%
mutate(gene = cnt1$gene) %>%
column_to_rownames("gene") %>%
as_tibble(rownames = NA)
### Covars
covars <- covars %>%
mutate(bird = as.numeric(bird)) %>%
mutate(bird = formatC(bird, width = 2, format = "d", flag = "0")) %>%
arrange(bird) %>%
mutate(bird = as.factor(bird)) %>%
mutate(group = str_remove(group, "-"))
covars.ileum <- covars %>%
filter(bird %in% newNames.ileum$bird)
cnt <- cnt %>% select(-ileum_01, -ileum_72)
covars.ileum <- covars.ileum %>% filter(bird != "01",
bird != "72")
Prep data for analysis
Here we standardize expression levels across samples of varying depth by using the counts per million (CPM) and trimmed mean of M-values (TMM) methods. We also remove any genes that are expressed at few than 0.5 CPM in at least 25% of individuals.
#Convert to DGEList object
dge <- DGEList(counts=cnt)
dge$genes <- annot[match(rownames(dge$counts), annot$ensembl_gene_id),]
#CPM and log-CPM
cpm <- cpm(dge)
lcpm <- cpm(dge, log = TRUE)
#### Retain only genes expressed at >0.5 CPM in at least 25% of individuals
table(rowSums(dge$counts==0) == length(cnt[1,]))
##
## FALSE TRUE
## 17388 728
keep.exprs <- rowSums(cpm>0.5) >= length(cnt[1,])/4
sum(keep.exprs)
## [1] 14563
dge <- dge[keep.exprs,, keep.lib.sizes=FALSE]
table(rowSums(dge$counts==0) == length(cnt[1,]))
##
## FALSE
## 14563
dim(dge)
## [1] 14563 40
Subset data
Following normalization, we subset the data based on infection groups.
#Assign groups in DGElist
group <- recode(covars.ileum$group, C1 = "Ctl", C29 = "Ctl") %>%
as.factor()
dge$samples$group <- group
### Divide out I1 and I3
dge.I5 <- dge[,grep('\\bI5\\b', dge$samples$group)]
dge.I5 <- calcNormFactors(dge.I5, method = "TMM")
covars.I5 <- covars.ileum %>% filter(group == "I5")
Differential expression analysis
## Establish design matrix: I5
design.I5 = model.matrix(~ 0 +
factor(covars.I5$SSgroup.virus.avg) +
covars.I5$age +
covars.I5$sex +
covars.I5$Bursa_pool)
colnames(design.I5) <- c("HIGH", "LOW", "MODERATE",
"age", "sexM", "poolMSU2",
"poolMSU3", "poolMSU4", "poolMSU5")
contr.matrix.I5 = makeContrasts(
LvM = LOW - MODERATE,
MvH = MODERATE - HIGH,
LvH = LOW - HIGH,
levels = colnames(design.I5))
## Mean-variance trend plots
v.I5 <- voomWithQualityWeights(dge.I5, design.I5, plot = FALSE)
#Fitting models
vfit.I5 <- lmFit(v.I5, design.I5)
vfit.I5 <- contrasts.fit(vfit.I5, contrasts = contr.matrix.I5)
tfit.I5 <- treat(vfit.I5, lfc = 0.5)
## Identify DE genes
dt.I5 <- decideTests(tfit.I5, p.value = 0.1, adjust.method = "fdr")
dt.tib.I5 <- as_tibble(dt.I5, rownames = NA) %>%
rownames_to_column("gene") %>%
mutate_at(vars(starts_with("L")), as.numeric) %>%
mutate_at(vars(starts_with("M")), as.numeric) %>%
rename(LvM.I5 = LvM,
MvH.I5 = MvH,
LvH.I5 = LvH)
dt.tib <- dt.tib.I5 %>%
mutate_at(vars(starts_with("L")), as.numeric) %>%
mutate_at(vars(starts_with("M")), as.numeric) %>%
filter_at(vars(LvM.I5:LvH.I5), any_vars(. != 0))
Numbers of DE genes
allResults <- as.data.frame(summary(dt.I5)) %>%
filter(Var1 != "NotSig")
kable(allResults) %>%
kable_styling("striped", full_width = F)
Var1
|
Var2
|
Freq
|
Down
|
LvM
|
0
|
Up
|
LvM
|
0
|
Down
|
MvH
|
2
|
Up
|
MvH
|
28
|
Down
|
LvH
|
0
|
Up
|
LvH
|
20
|
Volcano plot
tmp1 <- topTreat(tfit.I5, coef = 1, n = Inf)
results1 <- mutate(tmp1, sig=ifelse(tmp1$adj.P.Val<0.1, "Sig", "Not Sig"))
p1 <- ggplot(results1, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Low vs. Moderate shedding") +
ylab("-Log10(p-value)") +
xlab("Log(Fold count)") +
theme_bw() +
theme(legend.position = "none")
tmp2 <- topTreat(tfit.I5, coef = 2, n = Inf)
results2 <- mutate(tmp2, sig=ifelse(tmp2$adj.P.Val<0.1, "Sig", "Not Sig"))
p2 <- ggplot(results2, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Moderate vs. High shedding") +
ylab("-Log10(p-value)") +
xlab("Log(Fold count)") +
theme_bw() +
theme(legend.position = "none")
tmp3 <- topTreat(tfit.I5, coef = 3, n = Inf)
results3 <- mutate(tmp3, sig=ifelse(tmp3$adj.P.Val<0.1, "Sig", "Not Sig"))
p3 <- ggplot(results3, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Low vs. High shedding") +
ylab("-Log10(p-value)") +
xlab("Log(Fold count)") +
theme_bw() +
theme(legend.position = "none")
grid.arrange(p1, p2, p3, nrow = 1)

Heatmap
# Get the gene names for DE genes
lcpm2 <- as_tibble(dge.I5$counts, rownames = NA) %>%
cpm(., log = TRUE)
newNames <- colnames(lcpm2) %>%
as_tibble() %>%
separate(value, into = c(NA, "bird")) %>%
bind_cols(covars.I5) %>%
unite("newNames", SSgroup.virus.avg, group, bird)
colOrder <- covars.I5 %>%
mutate(SSgroup.virus.avg = fct_relevel(SSgroup.virus.avg, "LOW", "MODERATE", "HIGH")) %>%
arrange(SSgroup.virus.avg, group, bird) %>%
unite("newNames", SSgroup.virus.avg, group, bird)
colnames(lcpm2) <- newNames$newNames
lcpm2 <- lcpm2[, colOrder$newNames]
heatmap.annot <- annot %>%
filter(ensembl_gene_id %in% dt.tib$gene) %>%
mutate(hgnc_symbol = replace_na(hgnc_symbol, ".")) %>%
unite(annot.hm, c(ensembl_gene_id, hgnc_symbol), sep = "-", remove = FALSE) %>%
select(annot.hm, ensembl_gene_id) %>%
distinct(annot.hm, .keep_all = TRUE)
deGeneCounts <- lcpm2 %>%
as_tibble() %>%
mutate(ensembl_gene_id = rownames(dge.I5$counts)) %>%
filter(ensembl_gene_id %in% dt.tib$gene) %>%
left_join(heatmap.annot) %>%
column_to_rownames("annot.hm") %>%
select(-ensembl_gene_id) %>%
as.matrix()
## Set up palette
mypalette <- brewer.pal(11,"RdYlBu")
morecols <- colorRampPalette(mypalette)
hc <- hclust(as.dist(1-cor(t(deGeneCounts))))
# Plot the heatmap
heatmap.2(deGeneCounts,
Colv = FALSE,
Rowv = as.dendrogram(hc),
col = rev(morecols(50)),
trace = "none",
colsep = c(4, 10),
dendrogram = "row",
density.info = "none",
key = TRUE,
#main = "Ileum - Gene level",
margins = c(10, 15),
scale ="row")

Boxplots for differentially expressed genes
lcpm.DE <- lcpm %>%
as_tibble(rownames = NA) %>%
rownames_to_column("identifier") %>%
filter(identifier %in% dt.tib$gene) %>%
pivot_longer(cols = contains("_"),
names_to = "sample",
values_to = "lcpm") %>%
separate(sample, into = c("tissue", "bird")) %>%
mutate(bird = as.factor(bird)) %>%
left_join(covars, by = "bird") %>%
filter(group == "C1" | group == "C14" | group == "I5") %>%
select(identifier, bird, lcpm, SSgroup.virus.avg, group) %>%
mutate(group = recode(group, C29 = "Control"),
group = recode(group, C1 = "Control"),
tmpID = ifelse(group == "Control", 'Control', 'Infected'))
aprioriPlotting <- function(target, ...) {
annot.target <- annot %>%
select(
ensembl_gene_id,
hgnc_symbol) %>%
filter(ensembl_gene_id == target) %>%
mutate(hgnc_symbol = replace_na(hgnc_symbol, "."))
plot <- lcpm.DE %>%
filter(identifier == target) %>%
mutate(SSgroup.virus.avg = fct_relevel(SSgroup.virus.avg, "LOW", "MODERATE", "HIGH")) %>%
ggplot(aes(x = SSgroup.virus.avg, y = lcpm)) +
facet_grid(. ~ tmpID, scales = "free", space = "free") +
ylab("Log2(Counts per million)") +
xlab("Shedding Group") +
scale_fill_grey(start = 0.35, end = 1) +
geom_point(position = position_dodge(width=0.75), aes(group = group), show.legend = FALSE) +
geom_boxplot(alpha = 0.5) +
geom_label_repel(aes(label = bird, group = group, fill = NULL), position = position_dodge(width=0.75), show.legend=FALSE) +
theme_classic() +
labs(title= paste0(target, " - ", annot.target[1,2])) +
theme(legend.title = element_blank())
print(plot)
}
for(target in sort(unique(lcpm.DE$identifier))) {
aprioriPlotting(target)
}
































Annotations for differentially expressed genes
ns denotes non-significant genes for each comparison and numerical values are the log(fold count) difference
ensembl_gene_id
|
hgnc_symbol
|
LvM
|
MvH
|
LvH
|
ENSAPLG00020000564
|
MALRD1
|
ns
|
8.61
|
ns
|
ENSAPLG00020004469
|
NA
|
ns
|
-6.31
|
ns
|
ENSAPLG00020006175
|
NA
|
ns
|
4.1
|
ns
|
ENSAPLG00020017074
|
NA
|
ns
|
-4.39
|
ns
|
ENSAPLG00020016143
|
PCK1
|
ns
|
7.87
|
ns
|
ENSAPLG00020006372
|
CALB1
|
ns
|
9.75
|
5.82
|
ENSAPLG00020005323
|
PLB1
|
ns
|
7.25
|
ns
|
ENSAPLG00020001436
|
CPO
|
ns
|
7.34
|
ns
|
ENSAPLG00020002099
|
SUSD2
|
ns
|
7.97
|
ns
|
ENSAPLG00020005311
|
SLC3A1
|
ns
|
5.69
|
3.15
|
ENSAPLG00020017868
|
NA
|
ns
|
8.92
|
5.48
|
ENSAPLG00020016544
|
ACE2
|
ns
|
7.11
|
4
|
ENSAPLG00020002285
|
SELENOP
|
ns
|
6.1
|
ns
|
ENSAPLG00020012027
|
NA
|
ns
|
6.25
|
ns
|
ENSAPLG00020001209
|
APOB
|
ns
|
6.37
|
ns
|
ENSAPLG00020005067
|
NA
|
ns
|
8.37
|
4.98
|
ENSAPLG00020002557
|
NA
|
ns
|
12.35
|
8.7
|
ENSAPLG00020014491
|
NA
|
ns
|
10.06
|
6.93
|
ENSAPLG00020014852
|
NA
|
ns
|
4.17
|
ns
|
ENSAPLG00020005112
|
NA
|
ns
|
10.15
|
6.77
|
ENSAPLG00020014497
|
NA
|
ns
|
11.16
|
8.31
|
ENSAPLG00020010883
|
NA
|
ns
|
12.68
|
8.24
|
ENSAPLG00020010852
|
NA
|
ns
|
10.75
|
7.61
|
ENSAPLG00020015123
|
NA
|
ns
|
11.76
|
8.76
|
ENSAPLG00020002126
|
NA
|
ns
|
10.33
|
6.88
|
ENSAPLG00020010714
|
NA
|
ns
|
11.02
|
7.95
|
ENSAPLG00020014552
|
NA
|
ns
|
8.58
|
6.07
|
ENSAPLG00020011024
|
NA
|
ns
|
10.61
|
7.64
|
ENSAPLG00020010687
|
NA
|
ns
|
9.7
|
7.23
|
ENSAPLG00020014595
|
NA
|
ns
|
ns
|
6.46
|
ENSAPLG00020014586
|
NA
|
ns
|
8.38
|
6.85
|
ENSAPLG00020017448
|
LGALS3
|
ns
|
ns
|
2.81
|
Gene functions
bind_rows(tmp1a, tmp2a, tmp3a) %>%
select(-comp, -adj.P.Val, -logFC) %>%
filter(ensembl_gene_id %in% dt.tib$gene) %>%
distinct(.) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
ensembl_gene_id
|
gene_biotype
|
hgnc_symbol
|
description
|
ENSAPLG00020000564
|
protein_coding
|
MALRD1
|
MAM and LDL receptor class A domain containing 1 [Source:HGNC Symbol;Acc:HGNC:24331]
|
ENSAPLG00020004469
|
protein_coding
|
NA
|
histone H2B 1/2/3/4/6 [Source:NCBI gene;Acc:101796082]
|
ENSAPLG00020006175
|
protein_coding
|
NA
|
glycerol-3-phosphate dehydrogenase [NAD(+)], cytoplasmic [Source:NCBI gene;Acc:101798440]
|
ENSAPLG00020017074
|
protein_coding
|
NA
|
NA
|
ENSAPLG00020016143
|
protein_coding
|
PCK1
|
phosphoenolpyruvate carboxykinase 1 [Source:NCBI gene;Acc:101793637]
|
ENSAPLG00020006372
|
protein_coding
|
CALB1
|
calbindin 1 [Source:NCBI gene;Acc:101802006]
|
ENSAPLG00020005323
|
protein_coding
|
PLB1
|
phospholipase B1 [Source:HGNC Symbol;Acc:HGNC:30041]
|
ENSAPLG00020001436
|
protein_coding
|
CPO
|
carboxypeptidase O [Source:HGNC Symbol;Acc:HGNC:21011]
|
ENSAPLG00020002099
|
protein_coding
|
SUSD2
|
sushi domain containing 2 [Source:HGNC Symbol;Acc:HGNC:30667]
|
ENSAPLG00020005311
|
protein_coding
|
SLC3A1
|
solute carrier family 3 member 1 [Source:HGNC Symbol;Acc:HGNC:11025]
|
ENSAPLG00020017868
|
protein_coding
|
NA
|
NA
|
ENSAPLG00020016544
|
protein_coding
|
ACE2
|
angiotensin I converting enzyme 2 [Source:HGNC Symbol;Acc:HGNC:13557]
|
ENSAPLG00020002285
|
protein_coding
|
SELENOP
|
selenoprotein P [Source:HGNC Symbol;Acc:HGNC:10751]
|
ENSAPLG00020012027
|
protein_coding
|
NA
|
NA
|
ENSAPLG00020001209
|
protein_coding
|
APOB
|
apolipoprotein B [Source:HGNC Symbol;Acc:HGNC:603]
|
ENSAPLG00020005067
|
pseudogene
|
NA
|
NA
|
ENSAPLG00020002557
|
protein_coding
|
NA
|
chromodomain-helicase-DNA-binding protein 1 [Source:NCBI gene;Acc:101804420]
|
ENSAPLG00020014491
|
protein_coding
|
NA
|
NA
|
ENSAPLG00020014852
|
protein_coding
|
NA
|
NA
|
ENSAPLG00020005112
|
protein_coding
|
NA
|
NA
|
ENSAPLG00020014497
|
protein_coding
|
NA
|
ATP synthase subunit alpha, mitochondrial [Source:NCBI gene;Acc:101801944]
|
ENSAPLG00020010883
|
protein_coding
|
NA
|
NA
|
ENSAPLG00020010852
|
protein_coding
|
NA
|
guanine nucleotide-binding protein G(q) subunit alpha [Source:NCBI gene;Acc:101790479]
|
ENSAPLG00020015123
|
protein_coding
|
NA
|
ubiquilin-1-like [Source:NCBI gene;Acc:101793250]
|
ENSAPLG00020002126
|
protein_coding
|
NA
|
NA
|
ENSAPLG00020010714
|
lncRNA
|
NA
|
NA
|
ENSAPLG00020014552
|
protein_coding
|
NA
|
NA
|
ENSAPLG00020011024
|
protein_coding
|
NA
|
NA
|
ENSAPLG00020010687
|
protein_coding
|
NA
|
NA
|
ENSAPLG00020014595
|
protein_coding
|
NA
|
NA
|
ENSAPLG00020014586
|
protein_coding
|
NA
|
dnaJ homolog subfamily B member 5-like [Source:NCBI gene;Acc:101800405]
|
ENSAPLG00020017448
|
protein_coding
|
LGALS3
|
galectin 3 [Source:HGNC Symbol;Acc:HGNC:6563]
|
Pathway analysis
de.entrez <- annot %>% filter(ensembl_gene_id %in% dt.tib$gene) %>% na.omit(entrezgene_id)
entrez.universe <- annot %>% na.omit(entrezgene_id)
K <- kegga(de.entrez$entrezgene_id, universe = entrez.universe$entrezgene_id, species.KEGG = "apla") %>%
as_tibble(rownames = NA) %>%
rownames_to_column("PathwayID") %>%
filter(P.DE < 0.05) %>%
arrange(P.DE)
keggList <- getGeneKEGGLinks(species.KEGG = "apla") %>%
as_tibble() %>%
rename("entrezgene_id" = GeneID)
keggGenes = data.frame()
for (path in nrow(K)){
procPath <- filter(keggList, entrezgene_id %in% de.entrez$entrezgene_id)
df <- data.frame(procPath)
keggGenes <- rbind(keggGenes, df)
}
inner_join(keggGenes, K) %>%
as_tibble() %>%
mutate(entrezgene_id = as.numeric(entrezgene_id)) %>%
left_join(annot) %>%
select(Pathway, PathwayID, N, DE, P.DE, ensembl_gene_id, hgnc_symbol, entrezgene_id) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Pathway
|
PathwayID
|
N
|
DE
|
P.DE
|
ensembl_gene_id
|
hgnc_symbol
|
entrezgene_id
|
Glycolysis / Gluconeogenesis
|
path:apla00010
|
14
|
1
|
0.0079920
|
ENSAPLG00020016143
|
PCK1
|
101793637
|
Citrate cycle (TCA cycle)
|
path:apla00020
|
9
|
1
|
0.0051414
|
ENSAPLG00020016143
|
PCK1
|
101793637
|
Pyruvate metabolism
|
path:apla00620
|
8
|
1
|
0.0045708
|
ENSAPLG00020016143
|
PCK1
|
101793637
|
PPAR signaling pathway
|
path:apla03320
|
15
|
1
|
0.0085616
|
ENSAPLG00020016143
|
PCK1
|
101793637
|
FoxO signaling pathway
|
path:apla04068
|
31
|
1
|
0.0176534
|
ENSAPLG00020016143
|
PCK1
|
101793637
|
Insulin signaling pathway
|
path:apla04910
|
43
|
1
|
0.0244448
|
ENSAPLG00020016143
|
PCK1
|
101793637
|
Adipocytokine signaling pathway
|
path:apla04920
|
22
|
1
|
0.0125444
|
ENSAPLG00020016143
|
PCK1
|
101793637
|