This is an analysis of differential expression at the gene level between treatment groups for control and infected mallard ileum samples.

## Load packages and data
library(limma)
library(edgeR)
library(Glimma)
library(gplots)
library(ggrepel)
library(RColorBrewer)
library(gridExtra)
library(kableExtra)
library(tidyverse)

## Load data
cnt1 <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/ileumCounts.txt", delim = "\t") %>% distinct(gene, .keep_all = TRUE)
covars <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/MALL70_SSgroup_RAW_11.5.20.csv", delim = ",")
annot <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/annotationGenesIleum110120.txt", delim = "\t")

Prep data for analysis

cnt <- cnt1 %>%
  as_tibble() %>%
  select(-gene)

newNames.ileum <- names(cnt) %>%
  as_tibble() %>%
  mutate(value = gsub("Iluem_", "", .$value)) %>% 
  separate(value, into = c(NA, "pt1", "pt2")) %>%
  mutate(pt1 = na_if(pt1, "MALL"),
         pt2 = na_if(pt2, "ileum")) %>%
  mutate(bird = ifelse(is.na(pt1), pt2, pt1)) %>%
  mutate(bird = as.numeric(bird),
    bird = formatC(bird, width = 2, format = "d", flag = "0")) %>%
  mutate(newName = paste0("ileum_", bird))

colnames(cnt) <- newNames.ileum$newName

cnt <- cnt[,order(colnames(cnt))] %>%
  mutate(gene = cnt1$gene) %>%
  column_to_rownames("gene") %>%
  as_tibble(rownames = NA)

### Covars
covars <- covars %>%
  mutate(bird = as.numeric(bird)) %>%
  mutate(bird = formatC(bird, width = 2, format = "d", flag = "0")) %>%
  arrange(bird) %>%
  mutate(bird = as.factor(bird)) %>%
  mutate(group = str_remove(group, "-"))

covars.ileum <- covars %>%
  filter(bird %in% newNames.ileum$bird)

cnt <- cnt %>% select(-ileum_01, -ileum_72)

covars.ileum <- covars.ileum %>% filter(bird != "01",
                                        bird != "72")

Set up model effects

bird <- as.factor(covars.ileum$bird)
sex <- as.factor(covars.ileum$sex)
age <- as.numeric(covars.ileum$age)
group <- recode(covars.ileum$group, C1 = "Ctl", C29 = "Ctl") %>% 
  as.factor()
pool <- as.factor(covars.ileum$Ileum_pool)

Prep data for analysis

Here we standardize expression levels across samples of varying depth by using the counts per million (CPM) and trimmed mean of M-values (TMM) methods. We also remove any genes that are expressed at few than 0.5 CPM in at least 25% of individuals.

#Convert to DGEList object
dge <- DGEList(counts=cnt)
dge$genes <- annot[match(rownames(dge$counts), annot$ensembl_gene_id),]

#CPM and log-CPM
cpm <- cpm(dge)
lcpm <- cpm(dge, log = TRUE)

#### Retain only genes expressed at >0.5 CPM in at least 25% of individuals
table(rowSums(dge$counts==0) == length(cnt[1,]))
## 
## FALSE  TRUE 
## 17388   728
keep.exprs <- rowSums(cpm>0.5) >= length(cnt[1,])/4
sum(keep.exprs)
## [1] 14563
dge <- dge[keep.exprs,, keep.lib.sizes=FALSE]
table(rowSums(dge$counts==0) == length(cnt[1,]))
## 
## FALSE 
## 14563
dim(dge)
## [1] 14563    40
#TMM normalization
dge <- calcNormFactors(dge, method = "TMM")

Differential expression analysis

Establish design & contrast matrices

## Establish design matrix
design = model.matrix(~ 0 +
                        group +
                        age +
                        sex + 
                        pool)

colnames(design) <- gsub("group", "", colnames(design))

contr.matrix = makeContrasts(
  CtlvI1 = Ctl - I1,
  CtlvI2 = Ctl - I2,
  CtlvI5 = Ctl - I5,
  I1vI2 = I1 - I2,
  I1vI5 = I1 - I5,
  I2vI5 = I2 - I5,
  levels = colnames(design))

## Mean-variance trend and sample weight plots
v <- voomWithQualityWeights(dge, design, plot = TRUE)

## Fitting the model
vfit <- lmFit(v, design) 
vfit <- contrasts.fit(vfit, contrasts = contr.matrix)
tfit <- treat(vfit, lfc = 0.5)
plotSA(tfit)

Count DE genes

For a gene to be considered differentially expressed, we require a p-value of 0.1 with a false discovery rate correction and a log fold count difference of 1.

dt <- decideTests(tfit, p.value = 0.1, adjust.method = "fdr")
summary(dt) %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)
CtlvI1 CtlvI2 CtlvI5 I1vI2 I1vI5 I2vI5
Down 0 0 0 0 0 1
NotSig 14563 14563 14563 14563 14561 14559
Up 0 0 0 0 2 3
dt.tib <- as_tibble(dt, rownames = NA) %>% 
  rownames_to_column("gene") %>% 
  mutate_at(vars(starts_with("C")), as.numeric) %>% 
  mutate_at(vars(starts_with("I")), as.numeric) %>% 
  filter_at(vars(CtlvI1:I2vI5), any_vars(. != 0))

Volcano plot

tmp1 <- topTreat(tfit, coef = 1, n = Inf)
results1 <- mutate(tmp1, sig=ifelse(tmp1$adj.P.Val<0.1, "Sig", "Not Sig"))
p1 <- ggplot(results1, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Control vs. Infected, Day 1") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp2 <- topTreat(tfit, coef = 2, n = Inf)
results2 <- mutate(tmp2, sig=ifelse(tmp2$adj.P.Val<0.1, "Sig", "Not Sig"))
p2 <- ggplot(results2, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Control vs. Infected, Day 2") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp3 <- topTreat(tfit, coef = 3, n = Inf)
results3 <- mutate(tmp3, sig=ifelse(tmp3$adj.P.Val<0.1, "Sig", "Not Sig"))
p3 <- ggplot(results3, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Control vs. Infected, Day 5") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp4 <- topTreat(tfit, coef = 4, n = Inf)
results4 <- mutate(tmp4, sig=ifelse(tmp4$adj.P.Val<0.1, "Sig", "Not Sig"))
p4 <- ggplot(results4, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 1 vs. Infected, Day 2") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp5 <- topTreat(tfit, coef = 5, n = Inf)
results5 <- mutate(tmp5, sig=ifelse(tmp5$adj.P.Val<0.1, "Sig", "Not Sig"))
p5 <- ggplot(results5, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 1 vs. Infected, Day 5") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

tmp6 <- topTreat(tfit, coef = 6, n = Inf)
results6 <- mutate(tmp6, sig=ifelse(tmp6$adj.P.Val<0.1, "Sig", "Not Sig"))
p6 <- ggplot(results6, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Infected, Day 2 vs. Infected, Day 5") +
  ylab("-Log10(p-value)") +
  xlab("Log2(Fold change)") +
  theme_bw() +
  theme(legend.position = "none")

grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 3)

Heatmap

# Get the gene names for DE genes
deGeneCounts <- lcpm %>%
  as_tibble(rownames = NA) %>%
  rownames_to_column("gene") %>%
  filter(gene %in% dt.tib$gene) %>%
  column_to_rownames("gene")

group2 <- group %>% 
  as_tibble() %>% 
  mutate(group = sub("I2", "I02", value)) %>% 
  mutate(group = sub("I1", "I01", group)) %>% 
  mutate(group = sub("I5", "I05", group))

newNames1 <- colnames(deGeneCounts) %>%
  as_tibble() %>%
  separate(value, into = c(NA, "sample")) %>%
  mutate(group = group2$group) %>%
  unite("sample", 2:1, sep = "_")

colnames(deGeneCounts) <- newNames1$sample
deGeneCounts <- deGeneCounts %>% dplyr::select(order(colnames(.))) %>%
  as.matrix()

heatmap.annot <- annot %>%
  filter(ensembl_gene_id %in% dt.tib$gene) %>% 
  mutate(hgnc_symbol = replace_na(hgnc_symbol, ".")) %>% 
  unite(annot.hm, c(ensembl_gene_id, hgnc_symbol), sep = "-", remove = FALSE) %>% 
  select(annot.hm, ensembl_gene_id)

deGeneCounts <- deGeneCounts %>% 
  as_tibble(rownames = NA) %>% 
  mutate(gene = rownames(.)) %>% 
  rename(ensembl_gene_id = gene) %>% 
  left_join(heatmap.annot) %>% 
  column_to_rownames("annot.hm") %>%
  select(-ensembl_gene_id) %>% 
  as.matrix()

## Set up palette
mypalette <- brewer.pal(11,"RdYlBu")
morecols <- colorRampPalette(mypalette)

hc <- hclust(as.dist(1-cor(t(deGeneCounts))))

# Plot the heatmap
heatmap.2(deGeneCounts,
          Colv = FALSE,
          Rowv = as.dendrogram(hc),
          col = rev(morecols(50)),
          trace = "none",
          colsep = c(5, 17, 27),
          dendrogram = "row",
          density.info = "none",
          key = TRUE,
          margins = c(5, 22),
          scale ="row")

Boxplots for differentially expressed genes

lcpm.DE <- lcpm %>%
  as_tibble(rownames = NA) %>% 
  rownames_to_column("identifier") %>% 
  filter(identifier %in% dt.tib$gene) %>%
  pivot_longer(cols = contains("_"),
               names_to = "sample",
               values_to = "lcpm") %>%
  separate(sample, into = c("tissue", "bird")) %>%
  mutate(bird = as.factor(bird)) %>%
  left_join(covars, by = "bird") %>%
  select(identifier, bird, lcpm, group) %>% 
  mutate(group = recode(group, C29 = "Control"),
         group = recode(group, C1 = "Control"),
         tmpID = ifelse(group == "Control", 'Control', 'Infected'))


aprioriPlotting <- function(target, ...) {
   annot.target <- annot %>%
    select(
      ensembl_gene_id,
      hgnc_symbol) %>%
    filter(ensembl_gene_id == target) %>% 
    mutate(hgnc_symbol = replace_na(hgnc_symbol, "."))


  plot <- lcpm.DE %>%
    filter(identifier == target) %>%
    mutate(group = fct_relevel(group, "Control", "I1", "I2", "I5", "I15", "I29")) %>%
    ggplot(aes(x = group, y = lcpm)) +
    facet_grid(. ~ tmpID, scales = "free", space = "free") +
    ylab("Log2(Counts per million)") +
    xlab("Group") +
    geom_point(position = position_dodge(width=0.75), show.legend = FALSE) +
    geom_boxplot(alpha = 0.5) +
    geom_label_repel(aes(label = bird, fill = NULL), show.legend=FALSE) +
    theme_classic() +
    labs(title= paste0(target, " - ", annot.target[1,2])) +
    theme(legend.title = element_blank())
    
  print(plot)
}

for(target in sort(unique(lcpm.DE$identifier))) {
  aprioriPlotting(target)
}

Annotations for differentially expressed genes

ns denotes non-significant genes for each comparison and numerical values are the Log2(Fold change) difference
ensembl_gene_id hgnc_symbol CtlvI1 CtlvI2 CtlvI5 I1vI2 I1vI5 I2vI5
ENSAPLG00020005696 CA7 ns ns ns ns 3.49 ns
ENSAPLG00020005864 NA ns ns ns ns ns -3.25
ENSAPLG00020006372 CALB1 ns ns ns ns 6.91 4.85
ENSAPLG00020004306 FGG ns ns ns ns ns 3.28
ENSAPLG00020010772 SLC26A9 ns ns ns ns ns 4.54

Gene functions

bind_rows(tmp1a, tmp2a, tmp3a, tmp4a, tmp5a, tmp6a) %>% 
  select(-comp, -adj.P.Val, -logFC) %>% 
  filter(ensembl_gene_id %in% dt.tib$gene) %>% 
  distinct(.) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
ensembl_gene_id gene_biotype hgnc_symbol description
ENSAPLG00020005696 protein_coding CA7 carbonic anhydrase 7 [Source:NCBI gene;Acc:101798320]
ENSAPLG00020005864 protein_coding NA vasotocin-neurophysin VT-like [Source:NCBI gene;Acc:113843450]
ENSAPLG00020006372 protein_coding CALB1 calbindin 1 [Source:NCBI gene;Acc:101802006]
ENSAPLG00020004306 protein_coding FGG fibrinogen gamma chain [Source:NCBI gene;Acc:101793661]
ENSAPLG00020010772 protein_coding SLC26A9 solute carrier family 26 member 9 [Source:HGNC Symbol;Acc:HGNC:14469]

Pathway analysis

de.entrez <- annot %>% filter(ensembl_gene_id %in% dt.tib$gene) %>% na.omit(entrezgene_id)
entrez.universe <- annot %>% na.omit(entrezgene_id)

K <- kegga(de.entrez$entrezgene_id, universe = entrez.universe$entrezgene_id, species.KEGG = "apla") %>% 
  as_tibble(rownames = NA) %>% 
  rownames_to_column("PathwayID") %>% 
  filter(P.DE < 0.05) %>% 
  arrange(P.DE)

keggList <- getGeneKEGGLinks(species.KEGG = "apla") %>% 
  as_tibble() %>% 
  rename("entrezgene_id" = GeneID)

keggGenes = data.frame()
for (path in nrow(K)){
    procPath <- filter(keggList, entrezgene_id %in% de.entrez$entrezgene_id)
    df <- data.frame(procPath)
    keggGenes <- rbind(keggGenes, df)
}

inner_join(keggGenes, K) %>%
  as_tibble() %>% 
  mutate(entrezgene_id = as.numeric(entrezgene_id)) %>% 
  left_join(annot) %>% 
  select(Pathway, PathwayID, N, DE, P.DE, ensembl_gene_id, hgnc_symbol, entrezgene_id) %>% 
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Pathway PathwayID N DE P.DE ensembl_gene_id hgnc_symbol entrezgene_id
Nitrogen metabolism path:apla00910 3 1 0.0025722 ENSAPLG00020005696 CA7 101798320