This is an analysis of differential expression at the transcript level between supershedder groupings based on shedding rates measured on the day of sacrifice for infected mallard bursa samples.
## Load packages and data
library(limma)
library(edgeR)
library(Glimma)
library(gplots)
library(ggrepel)
library(RColorBrewer)
library(gridExtra)
library(kableExtra)
library(tidyverse)
## Load data
cnt <- read.table("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/all.isoforms.results.csv", header = TRUE) %>% as_tibble(rownames = NA)
covars <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/MALL70_SSgroup_RAW_11.5.20.csv", delim = ",")
annot <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/annotationTrans103120.txt", delim = "\t")
newNames <- names(cnt) %>%
as_tibble() %>%
separate(value, into = c("pt1", "pt2", "pt3")) %>%
select(pt2, pt3) %>%
mutate(pt2 = as.numeric(pt2),
pt2 = formatC(pt2, width = 2, format = "d", flag = "0")) %>%
mutate(newName = paste0("bursa_", pt2))
colnames(cnt) <- newNames$newName
cnt <- cnt %>%
select(order(colnames(cnt)))
covars <- covars %>%
mutate(bird = as.numeric(bird)) %>%
mutate(bird = formatC(bird, width = 2, format = "d", flag = "0")) %>%
arrange(bird) %>%
mutate(bird = as.factor(bird)) %>%
mutate(group = str_remove(group, "-"))
## Remove birds with issues
cnt <- cnt %>% select(-bursa_01, -bursa_72)
covars <- covars %>% filter(bird != "01", bird != "72")
Here we standardize expression levels across samples of varying depth by using the counts per million (CPM) and trimmed mean of M-values (TMM) methods. We also remove any transcripts that are expressed at few than 0.5 CPM in at least 25% of individuals.
#Convert to DGEList object
dge <- DGEList(counts=cnt)
dge$genes <- annot[match(rownames(dge$counts), annot$ensembl_transcript_id),]
#CPM and log-CPM
cpm <- cpm(dge)
lcpm <- cpm(dge, log = TRUE)
#### Retain only transcripts expressed at >0.5 CPM in at least 25% of individuals
table(rowSums(dge$counts==0) == length(cnt[1,]))
##
## FALSE TRUE
## 26095 3225
keep.exprs <- rowSums(cpm>0.5) >= length(cnt[1,])/4
sum(keep.exprs)
## [1] 17851
dge <- dge[keep.exprs,, keep.lib.sizes=FALSE]
table(rowSums(dge$counts==0) == length(cnt[1,]))
##
## FALSE
## 17851
dim(dge)
## [1] 17851 68
Following normalization, we subset the data based on infection groups.
#Assign groups in DGElist
group <- recode(covars$group, C1 = "Ctl", C29 = "Ctl") %>%
as.factor()
dge$samples$group <- group
### Divide out I1 and I2
dge.I1 <- dge[,grep('\\bI1\\b', dge$samples$group)]
dge.I1 <- calcNormFactors(dge.I1, method = "TMM")
dge.I2 <- dge[,grep('\\bI2\\b', dge$samples$group)]
dge.I2 <- calcNormFactors(dge.I2, method = "TMM")
covars.I1 <- covars %>% filter(group == "I1")
covars.I2 <- covars %>% filter(group == "I2")
## Establish design matrix: I1
design.I1 = model.matrix(~ 0 +
factor(covars.I1$SSgroup.virus.sac) +
covars.I1$age +
covars.I1$sex +
covars.I1$Bursa_pool)
colnames(design.I1) <- c("HIGH", "LOW", "MODERATE",
"age", "sexM", "poolMSU2",
"poolMSU3", "poolMSU4", "poolMSU5")
contr.matrix.I1 = makeContrasts(
LvM = LOW - MODERATE,
MvH = MODERATE - HIGH,
LvH = LOW - HIGH,
levels = colnames(design.I1))
## Mean-variance trend plots
v.I1 <- voomWithQualityWeights(dge.I1, design.I1, plot = FALSE)
#Fitting models
vfit.I1 <- lmFit(v.I1, design.I1)
vfit.I1 <- contrasts.fit(vfit.I1, contrasts = contr.matrix.I1)
tfit.I1 <- treat(vfit.I1, lfc = 0.5)
## Identify DE transcripts
dt.I1 <- decideTests(tfit.I1, p.value = 0.1, adjust.method = "fdr")
dt.tib.I1 <- as_tibble(dt.I1, rownames = NA) %>%
rownames_to_column("gene") %>%
mutate_at(vars(starts_with("L")), as.numeric) %>%
mutate_at(vars(starts_with("M")), as.numeric) %>%
rename(LvM.I1 = LvM,
MvH.I1 = MvH,
LvH.I1 = LvH) %>%
filter_at(vars(LvM.I1:LvH.I1), any_vars(. != 0))
## Establish design matrix: I2
design.I2 = model.matrix(
~ 0 +
factor(covars.I2$SSgroup.virus.sac) +
covars.I2$age +
covars.I2$sex +
covars.I2$Bursa_pool
)
colnames(design.I2) <- c(
"HIGH",
"LOW",
"MODERATE",
"age",
"sexM",
"poolMSU2",
"poolMSU3",
"poolMSU4",
"poolMSU5"
)
contr.matrix.I2 = makeContrasts(
LvM = LOW - MODERATE,
MvH = MODERATE - HIGH,
LvH = LOW - HIGH,
levels = colnames(design.I2))
## Mean-variance trend plots
v.I2 <- voomWithQualityWeights(dge.I2, design.I2, plot = FALSE)
#Fitting models
vfit.I2 <- lmFit(v.I2, design.I2)
vfit.I2 <- contrasts.fit(vfit.I2, contrasts = contr.matrix.I2)
tfit.I2 <- treat(vfit.I2, lfc = 0.5)
## Identify DE genes
dt.I2 <- decideTests(tfit.I2, p.value = 0.1, adjust.method = "fdr")
dt.tib.I2 <- as_tibble(dt.I2, rownames = NA) %>%
rownames_to_column("gene") %>%
mutate_at(vars(starts_with("L")), as.numeric) %>%
mutate_at(vars(starts_with("M")), as.numeric) %>%
rename(LvM.I2 = LvM,
MvH.I2 = MvH,
LvH.I2 = LvH) %>%
filter_at(vars(LvM.I2:LvH.I2), any_vars(. != 0))
## Bring it all together
dt.tib <- full_join(dt.tib.I1, dt.tib.I2) %>%
mutate_at(vars(starts_with("L")), as.numeric) %>%
mutate_at(vars(starts_with("M")), as.numeric) %>%
replace(is.na(.), 0)
## Joining, by = "gene"
For a gene to be considered differentially expressed, we require a p-value of 0.1 with a false discovery rate correction and a log fold count difference of 1.
allResults <- rbind(
as.data.frame(summary(dt.I1)),
as.data.frame(summary(dt.I2))) %>%
filter(Var1 != "NotSig")
kable(allResults) %>%
kable_styling("striped", full_width = F) %>%
pack_rows("1 Day Post Infection", 1, 6) %>%
pack_rows("2 Days Post Infection", 7, 12)
Var1 | Var2 | Freq |
---|---|---|
1 Day Post Infection | ||
Down | LvM | 0 |
Up | LvM | 0 |
Down | MvH | 0 |
Up | MvH | 0 |
Down | LvH | 0 |
Up | LvH | 0 |
2 Days Post Infection | ||
Down | LvM | 0 |
Up | LvM | 0 |
Down | MvH | 0 |
Up | MvH | 0 |
Down | LvH | 0 |
Up | LvH | 0 |
tmp1 <- topTreat(tfit.I1, coef = 1, n = Inf)
results1 <- mutate(tmp1, sig=ifelse(tmp1$adj.P.Val<0.1, "Sig", "Not Sig"))
p1 <- ggplot(results1, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Low vs. Moderate shedding, Day 1") +
ylab("-Log10(p-value)") +
xlab("Log(Fold count)") +
theme_bw() +
theme(legend.position = "none")
tmp2 <- topTreat(tfit.I1, coef = 2, n = Inf)
results2 <- mutate(tmp2, sig=ifelse(tmp2$adj.P.Val<0.1, "Sig", "Not Sig"))
p2 <- ggplot(results2, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Moderate vs. High shedding, Day 1") +
ylab("-Log10(p-value)") +
xlab("Log(Fold count)") +
theme_bw() +
theme(legend.position = "none")
tmp3 <- topTreat(tfit.I1, coef = 3, n = Inf)
results3 <- mutate(tmp3, sig=ifelse(tmp3$adj.P.Val<0.1, "Sig", "Not Sig"))
p3 <- ggplot(results3, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Low vs. High shedding, Day 1") +
ylab("-Log10(p-value)") +
xlab("Log(Fold count)") +
theme_bw() +
theme(legend.position = "none")
tmp4 <- topTreat(tfit.I2, coef = 1, n = Inf)
results4 <- mutate(tmp4, sig=ifelse(tmp4$adj.P.Val<0.1, "Sig", "Not Sig"))
p4 <- ggplot(results4, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Low vs. Moderate shedding, Day 2") +
ylab("-Log10(p-value)") +
xlab("Log(Fold count)") +
theme_bw() +
theme(legend.position = "none")
tmp5 <- topTreat(tfit.I2, coef = 2, n = Inf)
results5 <- mutate(tmp5, sig=ifelse(tmp5$adj.P.Val<0.1, "Sig", "Not Sig"))
p5 <- ggplot(results5, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Moderate vs. High shedding, Day 2") +
ylab("-Log10(p-value)") +
xlab("Log(Fold count)") +
theme_bw() +
theme(legend.position = "none")
tmp6 <- topTreat(tfit.I2, coef = 3, n = Inf)
results6 <- mutate(tmp6, sig=ifelse(tmp6$adj.P.Val<0.1, "Sig", "Not Sig"))
p6 <- ggplot(results6, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Low vs. High shedding, Day 2") +
ylab("-Log10(p-value)") +
xlab("Log(Fold count)") +
theme_bw() +
theme(legend.position = "none")
grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 3)