This is an analysis of differential expression at the transcript level between treatment groups for control and infected mallard bursa samples.

## Load packages and data
library(limma)
library(edgeR)
library(Glimma)
library(gplots)
library(ggrepel)
library(RColorBrewer)
library(gridExtra)
library(kableExtra)
library(tidyverse)

## Load data
cnt <- read.table("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/all.isoforms.results.csv", header = TRUE) %>% as_tibble(rownames = NA)
covars <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/MALL70_SSgroup_RAW_11.5.20.csv", delim = ",")
annot <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/annotationTrans103120.txt", delim = "\t")

Prep data for analysis

newNames <- names(cnt) %>%
  as_tibble() %>%
  separate(value, into = c("pt1", "pt2", "pt3")) %>%
  select(pt2, pt3) %>%
  mutate(pt2 = as.numeric(pt2),
         pt2 = formatC(pt2, width = 2, format = "d", flag = "0")) %>%
  mutate(newName = paste0("bursa_", pt2))

colnames(cnt) <- newNames$newName

cnt <- cnt %>% 
  select(order(colnames(cnt)))

covars <- covars %>%
  mutate(bird = as.numeric(bird)) %>%
  mutate(bird = formatC(bird, width = 2, format = "d", flag = "0")) %>%
  arrange(bird) %>%
  mutate(bird = as.factor(bird)) %>%
  mutate(group = str_remove(group, "-"))

## Remove birds with issues
cnt <- cnt %>% select(-bursa_01, -bursa_72)

covars <- covars %>% filter(bird != "01", bird != "72")

Set up model effects

bird <- as.factor(covars$bird)
sex <- as.factor(covars$sex)
age <- as.numeric(covars$age)
group <- recode(covars$group, C1 = "Ctl", C29 = "Ctl") %>% 
  as.factor()
pool <- as.factor(covars$Bursa_pool)

Prep data for analysis

Here we standardize expression levels across samples of varying depth by using the counts per million (CPM) and trimmed mean of M-values (TMM) methods. We also remove any genes that are expressed at few than 0.5 CPM in at least 25% of individuals.

#Convert to DGEList object
dge <- DGEList(counts=cnt)
dge$genes <- annot[match(rownames(dge$counts), annot$ensembl_transcript_id),]

#CPM and log-CPM
cpm <- cpm(dge)
lcpm <- cpm(dge, log = TRUE)

#### Retain only genes expressed at >0.5 CPM in at least 25% of individuals
table(rowSums(dge$counts==0) == length(cnt[1,]))
## 
## FALSE  TRUE 
## 26095  3225
keep.exprs <- rowSums(cpm>0.5) >= length(cnt[1,])/4
sum(keep.exprs)
## [1] 17851
dge <- dge[keep.exprs,, keep.lib.sizes=FALSE]
table(rowSums(dge$counts==0) == length(cnt[1,]))
## 
## FALSE 
## 17851
dim(dge)
## [1] 17851    68
#TMM normalization
dge <- calcNormFactors(dge, method = "TMM")

Differential expression analysis

Establish design & contrast matrices

## Establish design matrix
design = model.matrix(~ 0 +
                        group +
                        age +
                        sex + 
                        pool)

colnames(design) <- gsub("group", "", colnames(design))

contr.matrix = makeContrasts(
  CtlvI1 = Ctl - I1,
  CtlvI2 = Ctl - I2,
  CtlvI5 = Ctl - I5,
  CtlvI15 = Ctl - I15,
  CtlvI29 = Ctl - I29,
  I1vI2 = I1 - I2,
  I1vI5 = I1 - I5,
  I1vI15 = I1 - I15,
  I1vI29 = I1 - I29,
  I2vI5 = I2 - I5,
  I2vI15 = I2 - I15,
  I2vI29 = I2 - I29,
  I5vI15 = I5 - I15,
  I5vI29 = I5 - I29,
  I15vI29 = I15 - I29,
  levels = colnames(design))

## Mean-variance trend and sample weight plots
v <- voomWithQualityWeights(dge, design, plot = TRUE)

## Fitting the model
vfit <- lmFit(v, design) 
vfit <- contrasts.fit(vfit, contrasts = contr.matrix)
tfit <- treat(vfit, lfc = 0.5)
plotSA(tfit)

Count DE genes

For a gene to be considered differentially expressed, we require a p-value of 0.1 with a false discovery rate correction and a log fold count difference of 1.

dt <- decideTests(tfit, p.value = 0.1, adjust.method = "fdr")
summary(dt) %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)
CtlvI1 CtlvI2 CtlvI5 CtlvI15 CtlvI29 I1vI2 I1vI5 I1vI15 I1vI29 I2vI5 I2vI15 I2vI29 I5vI15 I5vI29 I15vI29
Down 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0
NotSig 17851 17850 17851 17851 17851 17850 17851 17851 17850 17850 17850 17850 17849 17851 17851
Up 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0
dt.tib <- as_tibble(dt, rownames = NA) %>% 
  rownames_to_column("gene") %>% 
  mutate_at(vars(starts_with("C")), as.numeric) %>% 
  mutate_at(vars(starts_with("I")), as.numeric) %>% 
  filter_at(vars(CtlvI1:I15vI29), any_vars(. != 0))

Volcano plot

tmp1 <- topTreat(tfit, coef = 1, n = Inf)
results1 <- mutate(tmp1, sig=ifelse(tmp1$adj.P.Val<0.1, "Sig", "Not Sig"))
p1 <- ggplot(results1, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Control vs. Infected, Day 1") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp2 <- topTreat(tfit, coef = 2, n = Inf)
results2 <- mutate(tmp2, sig=ifelse(tmp2$adj.P.Val<0.1, "Sig", "Not Sig"))
p2 <- ggplot(results2, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Control vs. Infected, Day 2") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp3 <- topTreat(tfit, coef = 3, n = Inf)
results3 <- mutate(tmp3, sig=ifelse(tmp3$adj.P.Val<0.1, "Sig", "Not Sig"))
p3 <- ggplot(results3, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Control vs. Infected, Day 5") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp4 <- topTreat(tfit, coef = 4, n = Inf)
results4 <- mutate(tmp4, sig=ifelse(tmp4$adj.P.Val<0.1, "Sig", "Not Sig"))
p4 <- ggplot(results4, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Control vs. Infected, Day 15") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp5 <- topTreat(tfit, coef = 5, n = Inf)
results5 <- mutate(tmp5, sig=ifelse(tmp5$adj.P.Val<0.1, "Sig", "Not Sig"))
p5 <- ggplot(results5, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Control vs. Infected, Day 29") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp6 <- topTreat(tfit, coef = 6, n = Inf)
results6 <- mutate(tmp6, sig=ifelse(tmp6$adj.P.Val<0.1, "Sig", "Not Sig"))
p6 <- ggplot(results6, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 1 vs. Infected, Day 2") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")


tmp7 <- topTreat(tfit, coef = 7, n = Inf)
results7 <- mutate(tmp7, sig=ifelse(tmp7$adj.P.Val<0.1, "Sig", "Not Sig"))
p7 <- ggplot(results7, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 1 vs. Infected, Day 5") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp8 <- topTreat(tfit, coef = 8, n = Inf)
results8 <- mutate(tmp8, sig=ifelse(tmp8$adj.P.Val<0.1, "Sig", "Not Sig"))
p8 <- ggplot(results8, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 1 vs. Infected, Day 15") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp9 <- topTreat(tfit, coef = 9, n = Inf)
results9 <- mutate(tmp9, sig=ifelse(tmp9$adj.P.Val<0.1, "Sig", "Not Sig"))
p9 <- ggplot(results9, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 1 vs. Infected, Day 29") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp10 <- topTreat(tfit, coef = 10, n = Inf)
results10 <- mutate(tmp10, sig=ifelse(tmp10$adj.P.Val<0.1, "Sig", "Not Sig"))
p10 <- ggplot(results10, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 2 vs. Infected, Day 5") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp11 <- topTreat(tfit, coef = 11, n = Inf)
results11 <- mutate(tmp11, sig=ifelse(tmp11$adj.P.Val<0.1, "Sig", "Not Sig"))
p11 <- ggplot(results11, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 2 vs. Infected, Day 15") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp12 <- topTreat(tfit, coef = 12, n = Inf)
results12 <- mutate(tmp12, sig=ifelse(tmp12$adj.P.Val<0.1, "Sig", "Not Sig"))
p12 <- ggplot(results12, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 2 vs. Infected, Day 29") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp13 <- topTreat(tfit, coef = 13, n = Inf)
results13 <- mutate(tmp13, sig=ifelse(tmp13$adj.P.Val<0.1, "Sig", "Not Sig"))
p13 <- ggplot(results13, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 5 vs. Infected, Day 15") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp14 <- topTreat(tfit, coef = 14, n = Inf)
results14 <- mutate(tmp14, sig=ifelse(tmp14$adj.P.Val<0.1, "Sig", "Not Sig"))
p14 <- ggplot(results14, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 5 vs. Infected, Day 29") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp15 <- topTreat(tfit, coef = 15, n = Inf)
results15 <- mutate(tmp15, sig=ifelse(tmp15$adj.P.Val<0.1, "Sig", "Not Sig"))
p15 <- ggplot(results15, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 15 vs. Infected, Day 29") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")


grid.arrange(p1, p2, p3, p4, p5, p6, p7,
             p8, p9, p10, p11, p12, p13, p14, p15, nrow = 8)

Heatmap

# Get the gene names for DE genes
lcpm2 <- as_tibble(dge$counts, rownames = NA) %>% 
  cpm(., log = TRUE)

newNames <- colnames(lcpm2) %>% 
  as_tibble() %>% 
  separate(value, into = c(NA, "bird")) %>% 
  bind_cols(covars) %>% 
  mutate(group = ifelse(group == "C1", "Ctl", group),
         group = ifelse(group == "C29", "Ctl", group),
         group = as.factor(group)) %>% 
  unite("newNames", group, bird)

colOrder <- covars %>% 
  mutate(group = ifelse(group == "C1", "Ctl", group),
         group = ifelse(group == "C29", "Ctl", group),
         group = as.factor(group)) %>% 
  mutate(group = fct_relevel(group, "Ctl", "I1", "I2", "I5", "I15", "I29")) %>%
  arrange(group, bird) %>% 
  unite("newNames", group, bird)

colnames(lcpm2) <- newNames$newNames
lcpm2 <- lcpm2[, colOrder$newNames]

heatmap.annot <- annot %>%
  filter(ensembl_transcript_id %in% dt.tib$gene) %>% 
  mutate(hgnc_symbol = replace_na(hgnc_symbol, ".")) %>% 
  unite(annot.hm, c(ensembl_transcript_id, hgnc_symbol), sep = "-", remove = FALSE) %>% 
  select(annot.hm, ensembl_transcript_id) %>% 
  distinct(annot.hm, .keep_all = TRUE)

deGeneCounts <- lcpm2 %>% 
  as_tibble() %>% 
  mutate(ensembl_transcript_id = rownames(dge$counts)) %>% 
  filter(ensembl_transcript_id %in% dt.tib$gene) %>% 
  left_join(heatmap.annot) %>% 
  column_to_rownames("annot.hm") %>%
  select(-ensembl_transcript_id) %>% 
  as.matrix()

## Set up palette
mypalette <- brewer.pal(11,"RdYlBu")
morecols <- colorRampPalette(mypalette)

hc <- hclust(as.dist(1-cor(t(deGeneCounts))))

# Plot the heatmap
heatmap.2(deGeneCounts,
          Colv = FALSE,
          Rowv = as.dendrogram(hc),
          col = rev(morecols(50)),
          trace = "none",
          colsep = c(8, 23, 33, 48, 58),
          dendrogram = "row",
          density.info = "none",
          key = TRUE,
          margins = c(3, 20),
          scale ="row")

Boxplots for differentially expressed genes

lcpm.DE <- lcpm %>%
  as_tibble(rownames = NA) %>% 
  rownames_to_column("identifier") %>% 
  filter(identifier %in% dt.tib$gene) %>%
  pivot_longer(cols = contains("_"),
               names_to = "sample",
               values_to = "lcpm") %>%
  separate(sample, into = c("tissue", "bird")) %>%
  mutate(bird = as.factor(bird)) %>%
  left_join(covars, by = "bird") %>%
  select(identifier, bird, lcpm, group) %>% 
  mutate(group = recode(group, C29 = "Control"),
         group = recode(group, C1 = "Control"),
         tmpID = ifelse(group == "Control", 'Control', 'Infected'))


aprioriPlotting <- function(target, ...) {
   annot.target <- annot %>%
    select(
      ensembl_transcript_id,
      hgnc_symbol) %>%
    filter(ensembl_transcript_id == target) %>% 
    mutate(hgnc_symbol = replace_na(hgnc_symbol, "."))


  plot <- lcpm.DE %>%
    filter(identifier == target) %>%
    mutate(group = fct_relevel(group, "Control", "I1", "I2", "I5", "I15", "I29")) %>%
    ggplot(aes(x = group, y = lcpm)) +
    facet_grid(. ~ tmpID, scales = "free", space = "free") +
    ylab("Log2(Counts per million)") +
    xlab("Group") +
    geom_point(position = position_dodge(width=0.75), show.legend = FALSE) +
    geom_boxplot(alpha = 0.5) +
    geom_label_repel(aes(label = bird, fill = NULL), show.legend=FALSE) +
    theme_classic() +
    labs(title= paste0(target, " - ", annot.target[1,2])) +
    theme(legend.title = element_blank())
    
  print(plot)
}

for(target in sort(unique(lcpm.DE$identifier))) {
  aprioriPlotting(target)
}

Annotations for differentially expressed genes

ns denotes non-significant genes for each comparison and numerical values are the Log2(Fold change) difference
ensembl_transcript_id hgnc_symbol CtlvI1 CtlvI2 CtlvI5 CtlvI15 CtlvI29 I1vI2 I1vI5 I1vI15 I1vI29 I2vI5 I2vI15 I2vI29 I5vI15 I5vI29 I15vI29
ENSAPLT00020022320 EDN2 ns ns ns ns ns ns ns ns 1.65 ns ns ns ns ns ns
ENSAPLT00020026833 HSPA8 ns ns ns ns ns ns ns ns ns ns -3.5 ns ns ns ns
ENSAPLT00020026836 HSPA8 ns ns ns ns ns ns ns ns ns ns ns -1.58 ns ns ns
ENSAPLT00020010104 NA ns 4.13 ns ns ns 4.73 ns ns ns -4.54 ns ns ns ns ns
ENSAPLT00020025810 NFATC3 ns ns ns ns ns ns ns ns ns ns ns ns -4.86 ns ns
ENSAPLT00020008328 STAU1 ns ns ns ns ns ns ns ns ns ns ns ns 2.86 ns ns

Gene functions

bind_rows(tmp1a, tmp2a, tmp3a, tmp4a, tmp5a, tmp6a, tmp7a, tmp8a, tmp9a, tmp10a, tmp11a, tmp12a, tmp13a, tmp14a, tmp15a) %>% 
  select(-comp, -adj.P.Val, -logFC) %>% 
  filter(ensembl_transcript_id %in% dt.tib$gene) %>% 
  distinct(.) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
ensembl_transcript_id transcript_biotype hgnc_symbol description
ENSAPLT00020022320 protein_coding EDN2 endothelin 2 [Source:NCBI gene;Acc:101795578]
ENSAPLT00020026833 protein_coding HSPA8 heat shock protein family A (Hsp70) member 8 [Source:NCBI gene;Acc:101801738]
ENSAPLT00020026836 protein_coding HSPA8 heat shock protein family A (Hsp70) member 8 [Source:NCBI gene;Acc:101801738]
ENSAPLT00020010104 protein_coding NA pantetheinase [Source:NCBI gene;Acc:101798228]
ENSAPLT00020025810 protein_coding NFATC3 nuclear factor of activated T cells 3 [Source:HGNC Symbol;Acc:HGNC:7777]
ENSAPLT00020008328 protein_coding STAU1 staufen double-stranded RNA binding protein 1 [Source:NCBI gene;Acc:101803828]

Pathway analysis

de.entrez <- annot %>% filter(ensembl_transcript_id %in% dt.tib$gene) %>% na.omit(entrezgene_id)
entrez.universe <- annot %>% na.omit(entrezgene_id)

K <- kegga(de.entrez$entrezgene_id, universe = entrez.universe$entrezgene_id, species.KEGG = "apla") %>% 
  as_tibble(rownames = NA) %>% 
  rownames_to_column("PathwayID") %>% 
  filter(P.DE < 0.05) %>% 
  arrange(P.DE)

keggList <- getGeneKEGGLinks(species.KEGG = "apla") %>% 
  as_tibble() %>% 
  rename("entrezgene_id" = GeneID)

keggGenes = data.frame()
for (path in nrow(K)){
    procPath <- filter(keggList, entrezgene_id %in% de.entrez$entrezgene_id)
    df <- data.frame(procPath)
    keggGenes <- rbind(keggGenes, df)
}

inner_join(keggGenes, K) %>%
  as_tibble() %>% 
  mutate(entrezgene_id = as.numeric(entrezgene_id)) %>% 
  left_join(annot) %>% 
  select(Pathway, PathwayID, N, DE, P.DE, ensembl_transcript_id, hgnc_symbol, entrezgene_id) %>% 
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Pathway PathwayID N DE P.DE ensembl_transcript_id hgnc_symbol entrezgene_id
Spliceosome path:apla03040 32 1 0.0272017 ENSAPLT00020026833 HSPA8 101801738
Spliceosome path:apla03040 32 1 0.0272017 ENSAPLT00020026836 HSPA8 101801738
Protein processing in endoplasmic reticulum path:apla04141 35 1 0.0297262 ENSAPLT00020026833 HSPA8 101801738
Protein processing in endoplasmic reticulum path:apla04141 35 1 0.0297262 ENSAPLT00020026836 HSPA8 101801738
Vascular smooth muscle contraction path:apla04270 30 1 0.0255162 ENSAPLT00020022320 EDN2 101795578