This is an analysis of differential expression at the transcript level between supershedder groupings based on shedding rates measured on the day of sacrifice for infected mallard bursa samples.

## Load packages and data
library(limma)
library(edgeR)
library(Glimma)
library(gplots)
library(ggrepel)
library(RColorBrewer)
library(gridExtra)
library(kableExtra)
library(tidyverse)

## Load data
cnt <- read.table("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/all.isoforms.results.csv", header = TRUE) %>% as_tibble(rownames = NA)
covars <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/MALL70_SSgroup_RAW_11.5.20.csv", delim = ",")
annot <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/annotationTrans103120.txt", delim = "\t")

Prep data for analysis

newNames <- names(cnt) %>%
  as_tibble() %>%
  separate(value, into = c("pt1", "pt2", "pt3")) %>%
  select(pt2, pt3) %>%
  mutate(pt2 = as.numeric(pt2),
         pt2 = formatC(pt2, width = 2, format = "d", flag = "0")) %>%
  mutate(newName = paste0("bursa_", pt2))

colnames(cnt) <- newNames$newName

cnt <- cnt %>% 
  select(order(colnames(cnt)))

covars <- covars %>%
  mutate(bird = as.numeric(bird)) %>%
  mutate(bird = formatC(bird, width = 2, format = "d", flag = "0")) %>%
  arrange(bird) %>%
  mutate(bird = as.factor(bird)) %>%
  mutate(group = str_remove(group, "-"))

## Remove birds with issues
cnt <- cnt %>% select(-bursa_01, -bursa_72)

covars <- covars %>% filter(bird != "01", bird != "72")

Prep data for analysis

Here we standardize expression levels across samples of varying depth by using the counts per million (CPM) and trimmed mean of M-values (TMM) methods. We also remove any transcripts that are expressed at few than 0.5 CPM in at least 25% of individuals.

#Convert to DGEList object
dge <- DGEList(counts=cnt)
dge$genes <- annot[match(rownames(dge$counts), annot$ensembl_transcript_id),]

#CPM and log-CPM
cpm <- cpm(dge)
lcpm <- cpm(dge, log = TRUE)

#### Retain only transcripts expressed at >0.5 CPM in at least 25% of individuals
table(rowSums(dge$counts==0) == length(cnt[1,]))
## 
## FALSE  TRUE 
## 26095  3225
keep.exprs <- rowSums(cpm>0.5) >= length(cnt[1,])/4
sum(keep.exprs)
## [1] 17851
dge <- dge[keep.exprs,, keep.lib.sizes=FALSE]
table(rowSums(dge$counts==0) == length(cnt[1,]))
## 
## FALSE 
## 17851
dim(dge)
## [1] 17851    68

Subset data

Following normalization, we subset the data based on infection groups.

#Assign groups in DGElist
group <- recode(covars$group, C1 = "Ctl", C29 = "Ctl") %>% 
  as.factor()
dge$samples$group <- group

### Divide out I1 and I2
dge.I1 <- dge[,grep('\\bI1\\b', dge$samples$group)]
dge.I1 <- calcNormFactors(dge.I1, method = "TMM")

dge.I2 <- dge[,grep('\\bI2\\b', dge$samples$group)]
dge.I2 <- calcNormFactors(dge.I2, method = "TMM")

covars.I1 <- covars %>% filter(group == "I1")
covars.I2 <- covars %>% filter(group == "I2")

Differential expression analysis

Establish design & contrast matrices

## Establish design matrix: I1
design.I1 = model.matrix(~ 0 +
                        factor(covars.I1$SSgroup.virus.sac) +
                        covars.I1$age +
                        covars.I1$sex + 
                        covars.I1$Bursa_pool)

colnames(design.I1) <- c("HIGH", "LOW", "MODERATE",
                         "age", "sexM", "poolMSU2",
                         "poolMSU3", "poolMSU4", "poolMSU5")


contr.matrix.I1 = makeContrasts(
  LvM = LOW - MODERATE,
  MvH = MODERATE - HIGH,
  LvH = LOW - HIGH,
  levels = colnames(design.I1))

## Mean-variance trend plots
v.I1 <- voomWithQualityWeights(dge.I1, design.I1, plot = FALSE)

#Fitting models
vfit.I1 <- lmFit(v.I1, design.I1) 
vfit.I1 <- contrasts.fit(vfit.I1, contrasts = contr.matrix.I1)
tfit.I1 <- treat(vfit.I1, lfc = 0.5)

## Identify DE transcripts
dt.I1 <- decideTests(tfit.I1, p.value = 0.1, adjust.method = "fdr")
dt.tib.I1 <- as_tibble(dt.I1, rownames = NA) %>% 
  rownames_to_column("gene") %>% 
  mutate_at(vars(starts_with("L")), as.numeric) %>% 
  mutate_at(vars(starts_with("M")), as.numeric) %>% 
  rename(LvM.I1 = LvM,
         MvH.I1 = MvH,
         LvH.I1 = LvH) %>% 
  filter_at(vars(LvM.I1:LvH.I1), any_vars(. != 0))


## Establish design matrix: I2
design.I2 = model.matrix(
  ~ 0 +
    factor(covars.I2$SSgroup.virus.sac) +
    covars.I2$age +
    covars.I2$sex +
    covars.I2$Bursa_pool
)

colnames(design.I2) <- c(
  "HIGH",
  "LOW",
  "MODERATE",
  "age",
  "sexM",
  "poolMSU2",
  "poolMSU3",
  "poolMSU4",
  "poolMSU5"
)

contr.matrix.I2 = makeContrasts(
  LvM = LOW - MODERATE,
  MvH = MODERATE - HIGH,
  LvH = LOW - HIGH,
  levels = colnames(design.I2))

## Mean-variance trend plots
v.I2 <- voomWithQualityWeights(dge.I2, design.I2, plot = FALSE)

#Fitting models
vfit.I2 <- lmFit(v.I2, design.I2) 
vfit.I2 <- contrasts.fit(vfit.I2, contrasts = contr.matrix.I2)
tfit.I2 <- treat(vfit.I2, lfc = 0.5)

## Identify DE genes
dt.I2 <- decideTests(tfit.I2, p.value = 0.1, adjust.method = "fdr")
dt.tib.I2 <- as_tibble(dt.I2, rownames = NA) %>% 
  rownames_to_column("gene") %>% 
  mutate_at(vars(starts_with("L")), as.numeric) %>% 
  mutate_at(vars(starts_with("M")), as.numeric) %>% 
  rename(LvM.I2 = LvM,
         MvH.I2 = MvH,
         LvH.I2 = LvH) %>% 
  filter_at(vars(LvM.I2:LvH.I2), any_vars(. != 0))


## Bring it all together
dt.tib <- full_join(dt.tib.I1, dt.tib.I2) %>% 
  mutate_at(vars(starts_with("L")), as.numeric) %>% 
  mutate_at(vars(starts_with("M")), as.numeric) %>% 
  replace(is.na(.), 0)
## Joining, by = "gene"

Count DE transcripts

For a gene to be considered differentially expressed, we require a p-value of 0.1 with a false discovery rate correction and a log fold count difference of 1.

allResults <- rbind(
  as.data.frame(summary(dt.I1)),
  as.data.frame(summary(dt.I2))) %>% 
  filter(Var1 != "NotSig")

kable(allResults) %>%
  kable_styling("striped", full_width = F) %>%
  pack_rows("1 Day Post Infection", 1, 6) %>%
  pack_rows("2 Days Post Infection", 7, 12)
Var1 Var2 Freq
1 Day Post Infection
Down LvM 0
Up LvM 0
Down MvH 0
Up MvH 0
Down LvH 0
Up LvH 0
2 Days Post Infection
Down LvM 0
Up LvM 0
Down MvH 0
Up MvH 0
Down LvH 0
Up LvH 0

Volcano plot

tmp1 <- topTreat(tfit.I1, coef = 1, n = Inf)
results1 <- mutate(tmp1, sig=ifelse(tmp1$adj.P.Val<0.1, "Sig", "Not Sig"))
p1 <- ggplot(results1, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Low vs. Moderate shedding, Day 1") +
  ylab("-Log10(p-value)") +
  xlab("Log(Fold count)") +
  theme_bw() +
  theme(legend.position = "none")


tmp2 <- topTreat(tfit.I1, coef = 2, n = Inf)
results2 <- mutate(tmp2, sig=ifelse(tmp2$adj.P.Val<0.1, "Sig", "Not Sig"))
p2 <- ggplot(results2, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Moderate vs. High shedding, Day 1") +
  ylab("-Log10(p-value)") +
  xlab("Log(Fold count)") +
  theme_bw() +
  theme(legend.position = "none")

tmp3 <- topTreat(tfit.I1, coef = 3, n = Inf)
results3 <- mutate(tmp3, sig=ifelse(tmp3$adj.P.Val<0.1, "Sig", "Not Sig"))
p3 <- ggplot(results3, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Low vs. High shedding, Day 1") +
  ylab("-Log10(p-value)") +
  xlab("Log(Fold count)") +
  theme_bw() +
  theme(legend.position = "none")

tmp4 <- topTreat(tfit.I2, coef = 1, n = Inf)
results4 <- mutate(tmp4, sig=ifelse(tmp4$adj.P.Val<0.1, "Sig", "Not Sig"))
p4 <- ggplot(results4, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Low vs. Moderate shedding, Day 2") +
  ylab("-Log10(p-value)") +
  xlab("Log(Fold count)") +
  theme_bw() +
  theme(legend.position = "none")

tmp5 <- topTreat(tfit.I2, coef = 2, n = Inf)
results5 <- mutate(tmp5, sig=ifelse(tmp5$adj.P.Val<0.1, "Sig", "Not Sig"))
p5 <- ggplot(results5, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Moderate vs. High shedding, Day 2") +
  ylab("-Log10(p-value)") +
  xlab("Log(Fold count)") +
  theme_bw() +
  theme(legend.position = "none")

tmp6 <- topTreat(tfit.I2, coef = 3, n = Inf)
results6 <- mutate(tmp6, sig=ifelse(tmp6$adj.P.Val<0.1, "Sig", "Not Sig"))
p6 <- ggplot(results6, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Low vs. High shedding, Day 2") +
  ylab("-Log10(p-value)") +
  xlab("Log(Fold count)") +
  theme_bw() +
  theme(legend.position = "none")

grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 3)