This is an analysis of differential expression at the gene level between treatment groups for control and infected mallard bursa samples.

## Load packages and data
library(limma)
library(edgeR)
library(Glimma)
library(gplots)
library(ggrepel)
library(RColorBrewer)
library(gridExtra)
library(kableExtra)
library(tidyverse)

## Load data
cnt <- read.table("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/all.genes.results.csv", header = TRUE) %>% as_tibble(rownames = NA)
covars <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/MALL70_SSgroup_RAW_11.5.20.csv", delim = ",")
annot <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/annotationGenes103120.txt", delim = "\t")

Prep data for analysis

newNames <- names(cnt) %>%
  as_tibble() %>%
  separate(value, into = c("pt1", "pt2", "pt3")) %>%
  select(pt2, pt3) %>%
  mutate(pt2 = as.numeric(pt2),
         pt2 = formatC(pt2, width = 2, format = "d", flag = "0")) %>%
  mutate(newName = paste0("bursa_", pt2))

colnames(cnt) <- newNames$newName

cnt <- cnt %>% 
  select(order(colnames(cnt)))

covars <- covars %>%
  mutate(bird = as.numeric(bird)) %>%
  mutate(bird = formatC(bird, width = 2, format = "d", flag = "0")) %>%
  arrange(bird) %>%
  mutate(bird = as.factor(bird)) %>%
  mutate(group = str_remove(group, "-"))

## Remove birds with issues
cnt <- cnt %>% select(-bursa_01, -bursa_72)

covars <- covars %>% filter(bird != "01", bird != "72")

Set up model effects

bird <- as.factor(covars$bird)
sex <- as.factor(covars$sex)
age <- as.numeric(covars$age)
group <- recode(covars$group, C1 = "Ctl", C29 = "Ctl") %>% 
  as.factor()
pool <- as.factor(covars$Bursa_pool)

Prep data for analysis

Here we standardize expression levels across samples of varying depth by using the counts per million (CPM) and trimmed mean of M-values (TMM) methods. We also remove any genes that are expressed at few than 0.5 CPM in at least 25% of individuals.

#Convert to DGEList object
dge <- DGEList(counts=cnt)
dge$genes <- annot[match(rownames(dge$counts), annot$ensembl_gene_id),]

#CPM and log-CPM
cpm <- cpm(dge)
lcpm <- cpm(dge, log = TRUE)

#### Retain only genes expressed at >0.5 CPM in at least 25% of individuals
table(rowSums(dge$counts==0) == length(cnt[1,]))
## 
## FALSE  TRUE 
## 16979  1511
keep.exprs <- rowSums(cpm>0.5) >= length(cnt[1,])/4
sum(keep.exprs)
## [1] 13110
dge <- dge[keep.exprs,, keep.lib.sizes=FALSE]
table(rowSums(dge$counts==0) == length(cnt[1,]))
## 
## FALSE 
## 13110
dim(dge)
## [1] 13110    68
#TMM normalization
dge <- calcNormFactors(dge, method = "TMM")

Differential expression analysis

Establish design & contrast matrices

## Establish design matrix
design = model.matrix(~ 0 +
                        group +
                        age +
                        sex + 
                        pool)

colnames(design) <- gsub("group", "", colnames(design))

contr.matrix = makeContrasts(
  CtlvI1 = Ctl - I1,
  CtlvI2 = Ctl - I2,
  CtlvI5 = Ctl - I5,
  CtlvI15 = Ctl - I15,
  CtlvI29 = Ctl - I29,
  I1vI2 = I1 - I2,
  I1vI5 = I1 - I5,
  I1vI15 = I1 - I15,
  I1vI29 = I1 - I29,
  I2vI5 = I2 - I5,
  I2vI15 = I2 - I15,
  I2vI29 = I2 - I29,
  I5vI15 = I5 - I15,
  I5vI29 = I5 - I29,
  I15vI29 = I15 - I29,
  levels = colnames(design))

## Mean-variance trend and sample weight plots
v <- voomWithQualityWeights(dge, design, plot = TRUE)

## Fitting the model
vfit <- lmFit(v, design) 
vfit <- contrasts.fit(vfit, contrasts = contr.matrix)
tfit <- treat(vfit, lfc = 0.5)
plotSA(tfit)

Count DE genes

For a gene to be considered differentially expressed, we require a p-value of 0.1 with a false discovery rate correction and a log fold count difference of 1.

dt <- decideTests(tfit, p.value = 0.1, adjust.method = "fdr")
summary(dt) %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)
CtlvI1 CtlvI2 CtlvI5 CtlvI15 CtlvI29 I1vI2 I1vI5 I1vI15 I1vI29 I2vI5 I2vI15 I2vI29 I5vI15 I5vI29 I15vI29
Down 0 0 0 0 0 0 0 0 0 0 0 2 0 1 0
NotSig 13110 13110 13110 13110 13110 13110 13110 13110 13109 13110 13110 13108 13110 13109 13110
Up 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
dt.tib <- as_tibble(dt, rownames = NA) %>% 
  rownames_to_column("gene") %>% 
  mutate_at(vars(starts_with("C")), as.numeric) %>% 
  mutate_at(vars(starts_with("I")), as.numeric) %>% 
  filter_at(vars(CtlvI1:I15vI29), any_vars(. != 0))

Volcano plot

tmp1 <- topTreat(tfit, coef = 1, n = Inf)
results1 <- mutate(tmp1, sig=ifelse(tmp1$adj.P.Val<0.1, "Sig", "Not Sig"))
p1 <- ggplot(results1, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Control vs. Infected, Day 1") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp2 <- topTreat(tfit, coef = 2, n = Inf)
results2 <- mutate(tmp2, sig=ifelse(tmp2$adj.P.Val<0.1, "Sig", "Not Sig"))
p2 <- ggplot(results2, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Control vs. Infected, Day 2") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp3 <- topTreat(tfit, coef = 3, n = Inf)
results3 <- mutate(tmp3, sig=ifelse(tmp3$adj.P.Val<0.1, "Sig", "Not Sig"))
p3 <- ggplot(results3, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Control vs. Infected, Day 5") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp4 <- topTreat(tfit, coef = 4, n = Inf)
results4 <- mutate(tmp4, sig=ifelse(tmp4$adj.P.Val<0.1, "Sig", "Not Sig"))
p4 <- ggplot(results4, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Control vs. Infected, Day 15") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp5 <- topTreat(tfit, coef = 5, n = Inf)
results5 <- mutate(tmp5, sig=ifelse(tmp5$adj.P.Val<0.1, "Sig", "Not Sig"))
p5 <- ggplot(results5, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Control vs. Infected, Day 29") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp6 <- topTreat(tfit, coef = 6, n = Inf)
results6 <- mutate(tmp6, sig=ifelse(tmp6$adj.P.Val<0.1, "Sig", "Not Sig"))
p6 <- ggplot(results6, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 1 vs. Infected, Day 2") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")


tmp7 <- topTreat(tfit, coef = 7, n = Inf)
results7 <- mutate(tmp7, sig=ifelse(tmp7$adj.P.Val<0.1, "Sig", "Not Sig"))
p7 <- ggplot(results7, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 1 vs. Infected, Day 5") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp8 <- topTreat(tfit, coef = 8, n = Inf)
results8 <- mutate(tmp8, sig=ifelse(tmp8$adj.P.Val<0.1, "Sig", "Not Sig"))
p8 <- ggplot(results8, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 1 vs. Infected, Day 15") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp9 <- topTreat(tfit, coef = 9, n = Inf)
results9 <- mutate(tmp9, sig=ifelse(tmp9$adj.P.Val<0.1, "Sig", "Not Sig"))
p9 <- ggplot(results9, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 1 vs. Infected, Day 29") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp10 <- topTreat(tfit, coef = 10, n = Inf)
results10 <- mutate(tmp10, sig=ifelse(tmp10$adj.P.Val<0.1, "Sig", "Not Sig"))
p10 <- ggplot(results10, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 2 vs. Infected, Day 5") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp11 <- topTreat(tfit, coef = 11, n = Inf)
results11 <- mutate(tmp11, sig=ifelse(tmp11$adj.P.Val<0.1, "Sig", "Not Sig"))
p11 <- ggplot(results11, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 2 vs. Infected, Day 15") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp12 <- topTreat(tfit, coef = 12, n = Inf)
results12 <- mutate(tmp12, sig=ifelse(tmp12$adj.P.Val<0.1, "Sig", "Not Sig"))
p12 <- ggplot(results12, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 2 vs. Infected, Day 29") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp13 <- topTreat(tfit, coef = 13, n = Inf)
results13 <- mutate(tmp13, sig=ifelse(tmp13$adj.P.Val<0.1, "Sig", "Not Sig"))
p13 <- ggplot(results13, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 5 vs. Infected, Day 15") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp14 <- topTreat(tfit, coef = 14, n = Inf)
results14 <- mutate(tmp14, sig=ifelse(tmp14$adj.P.Val<0.1, "Sig", "Not Sig"))
p14 <- ggplot(results14, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 5 vs. Infected, Day 29") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp15 <- topTreat(tfit, coef = 15, n = Inf)
results15 <- mutate(tmp15, sig=ifelse(tmp15$adj.P.Val<0.1, "Sig", "Not Sig"))
p15 <- ggplot(results15, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 15 vs. Infected, Day 29") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")


grid.arrange(p1, p2, p3, p4, p5, p6, p7,
             p8, p9, p10, p11, p12, p13, p14, p15, nrow = 8)

Boxplots for differentially expressed genes

lcpm.DE <- lcpm %>%
  as_tibble(rownames = NA) %>% 
  rownames_to_column("identifier") %>% 
  filter(identifier %in% dt.tib$gene) %>%
  pivot_longer(cols = contains("_"),
               names_to = "sample",
               values_to = "lcpm") %>%
  separate(sample, into = c("tissue", "bird")) %>%
  mutate(bird = as.factor(bird)) %>%
  left_join(covars, by = "bird") %>%
  select(identifier, bird, lcpm, group) %>% 
  mutate(group = recode(group, C29 = "Control"),
         group = recode(group, C1 = "Control"),
         tmpID = ifelse(group == "Control", 'Control', 'Infected'))


aprioriPlotting <- function(target, ...) {
   annot.target <- annot %>%
    select(
      ensembl_gene_id,
      hgnc_symbol) %>%
    filter(ensembl_gene_id == target) %>% 
    mutate(hgnc_symbol = replace_na(hgnc_symbol, "."))


  plot <- lcpm.DE %>%
    filter(identifier == target) %>%
    mutate(group = fct_relevel(group, "Control", "I1", "I2", "I5", "I15", "I29")) %>%
    ggplot(aes(x = group, y = lcpm)) +
    facet_grid(. ~ tmpID, scales = "free", space = "free") +
    ylab("Log2(Counts per million)") +
    xlab("Group") +
    geom_point(position = position_dodge(width=0.75), show.legend = FALSE) +
    geom_boxplot(alpha = 0.5) +
    geom_label_repel(aes(label = bird, fill = NULL), show.legend=FALSE) +
    theme_classic() +
    labs(title= paste0(target, " - ", annot.target[1,2])) +
    theme(legend.title = element_blank())
    
  print(plot)
}

for(target in sort(unique(lcpm.DE$identifier))) {
  aprioriPlotting(target)
}

Annotations for differentially expressed genes

ns denotes non-significant genes for each comparison and numerical values are the Log2(Fold change) difference
ensembl_gene_id hgnc_symbol CtlvI1 CtlvI2 CtlvI5 CtlvI15 CtlvI29 I1vI2 I1vI5 I1vI15 I1vI29 I2vI5 I2vI15 I2vI29 I5vI15 I5vI29 I15vI29
ENSAPLG00020014545 EDN2 ns ns ns ns ns ns ns ns 1.61 ns ns ns ns ns ns
ENSAPLG00020015991 FOSB ns ns ns ns ns ns ns ns ns ns ns -3.78 ns -3.08 ns
ENSAPLG00020017110 HSPA8 ns ns ns ns ns ns ns ns ns ns ns -1.57 ns ns ns

Gene functions

bind_rows(tmp1a, tmp2a, tmp3a, tmp4a, tmp5a, tmp6a, tmp7a, tmp8a, tmp9a, tmp10a, tmp11a, tmp12a, tmp13a, tmp14a, tmp15a) %>% 
  select(-comp, -adj.P.Val, -logFC) %>% 
  filter(ensembl_gene_id %in% dt.tib$gene) %>% 
  distinct(.) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
ensembl_gene_id gene_biotype hgnc_symbol description
ENSAPLG00020014545 protein_coding EDN2 endothelin 2 [Source:NCBI gene;Acc:101795578]
ENSAPLG00020015991 protein_coding FOSB FosB proto-oncogene, AP-1 transcription factor subunit [Source:HGNC Symbol;Acc:HGNC:3797]
ENSAPLG00020017110 protein_coding HSPA8 heat shock protein family A (Hsp70) member 8 [Source:NCBI gene;Acc:101801738]

Pathway analysis

de.entrez <- annot %>% filter(ensembl_gene_id %in% dt.tib$gene) %>% na.omit(entrezgene_id)
entrez.universe <- annot %>% na.omit(entrezgene_id)

K <- kegga(de.entrez$entrezgene_id, universe = entrez.universe$entrezgene_id, species.KEGG = "apla") %>% 
  as_tibble(rownames = NA) %>% 
  rownames_to_column("PathwayID") %>% 
  filter(P.DE < 0.05) %>% 
  arrange(P.DE)

keggList <- getGeneKEGGLinks(species.KEGG = "apla") %>% 
  as_tibble() %>% 
  rename("entrezgene_id" = GeneID)

keggGenes = data.frame()
for (path in nrow(K)){
    procPath <- filter(keggList, entrezgene_id %in% de.entrez$entrezgene_id)
    df <- data.frame(procPath)
    keggGenes <- rbind(keggGenes, df)
}

inner_join(keggGenes, K) %>%
  as_tibble() %>% 
  mutate(entrezgene_id = as.numeric(entrezgene_id)) %>% 
  left_join(annot) %>% 
  select(Pathway, PathwayID, N, DE, P.DE, ensembl_gene_id, hgnc_symbol, entrezgene_id) %>% 
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Pathway PathwayID N DE P.DE ensembl_gene_id hgnc_symbol entrezgene_id
Spliceosome path:apla03040 32 1 0.0182151 ENSAPLG00020017110 HSPA8 101801738
MAPK signaling pathway path:apla04010 74 1 0.0418683 ENSAPLG00020017110 HSPA8 101801738
Neuroactive ligand-receptor interaction path:apla04080 88 1 0.0496886 ENSAPLG00020014545 EDN2 101795578
Protein processing in endoplasmic reticulum path:apla04141 35 1 0.0199142 ENSAPLG00020017110 HSPA8 101801738
Endocytosis path:apla04144 63 1 0.0357013 ENSAPLG00020017110 HSPA8 101801738
Vascular smooth muscle contraction path:apla04270 30 1 0.0170815 ENSAPLG00020014545 EDN2 101795578