This is an analysis of differential expression at the gene level between shedding rate groupings based on average shedding rates for infected mallard ileum samples.

## Load packages and data
library(limma)
library(edgeR)
library(Glimma)
library(ggrepel)
library(gplots)
library(RColorBrewer)
library(gridExtra)
library(kableExtra)
library(tidyverse)

## Load data
cnt1 <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/ileumCounts.txt", delim = "\t") %>% distinct(gene, .keep_all = TRUE)
covars <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/MALL70_SSgroup_RAW_11.5.20.csv", delim = ",")
annot <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/annotationGenesIleum110120.txt", delim = "\t")

Prep data for analysis

cnt <- cnt1 %>%
  as_tibble() %>%
  select(-gene)

newNames.ileum <- names(cnt) %>%
  as_tibble() %>%
  mutate(value = gsub("Iluem_", "", .$value)) %>% 
  separate(value, into = c(NA, "pt1", "pt2")) %>%
  mutate(pt1 = na_if(pt1, "MALL"),
         pt2 = na_if(pt2, "ileum")) %>%
  mutate(bird = ifelse(is.na(pt1), pt2, pt1)) %>%
  mutate(bird = as.numeric(bird),
    bird = formatC(bird, width = 2, format = "d", flag = "0")) %>%
  mutate(newName = paste0("ileum_", bird))

colnames(cnt) <- newNames.ileum$newName

cnt <- cnt[,order(colnames(cnt))] %>%
  mutate(gene = cnt1$gene) %>%
  column_to_rownames("gene") %>%
  as_tibble(rownames = NA)

### Covars
covars <- covars %>%
  mutate(bird = as.numeric(bird)) %>%
  mutate(bird = formatC(bird, width = 2, format = "d", flag = "0")) %>%
  arrange(bird) %>%
  mutate(bird = as.factor(bird)) %>%
  mutate(group = str_remove(group, "-"))

covars.ileum <- covars %>%
  filter(bird %in% newNames.ileum$bird)

cnt <- cnt %>% select(-ileum_01, -ileum_72)

covars.ileum <- covars.ileum %>% filter(bird != "01",
                                        bird != "72")

Prep data for analysis

Here we standardize expression levels across samples of varying depth by using the counts per million (CPM) and trimmed mean of M-values (TMM) methods. We also remove any genes that are expressed at few than 0.5 CPM in at least 25% of individuals.

#Convert to DGEList object
dge <- DGEList(counts=cnt)
dge$genes <- annot[match(rownames(dge$counts), annot$ensembl_gene_id),]

#CPM and log-CPM
cpm <- cpm(dge)
lcpm <- cpm(dge, log = TRUE)

#### Retain only genes expressed at >0.5 CPM in at least 25% of individuals
table(rowSums(dge$counts==0) == length(cnt[1,]))
## 
## FALSE  TRUE 
## 17388   728
keep.exprs <- rowSums(cpm>0.5) >= length(cnt[1,])/4
sum(keep.exprs)
## [1] 14563
dge <- dge[keep.exprs,, keep.lib.sizes=FALSE]
table(rowSums(dge$counts==0) == length(cnt[1,]))
## 
## FALSE 
## 14563
dim(dge)
## [1] 14563    40

Subset data

Following normalization, we subset the data based on infection groups.

#Assign groups in DGElist
group <- recode(covars.ileum$group, C1 = "Ctl", C29 = "Ctl") %>% 
  as.factor()
dge$samples$group <- group

### Divide out I1 and I3
dge.I5 <- dge[,grep('\\bI5\\b', dge$samples$group)]
dge.I5 <- calcNormFactors(dge.I5, method = "TMM")

covars.I5 <- covars.ileum %>% filter(group == "I5")

Differential expression analysis

## Establish design matrix: I5
design.I5 = model.matrix(~ 0 +
                        factor(covars.I5$SSgroup.virus.avg) +
                        covars.I5$age +
                        covars.I5$sex + 
                        covars.I5$Bursa_pool)

colnames(design.I5) <- c("HIGH", "LOW", "MODERATE",
                         "age", "sexM", "poolMSU2",
                         "poolMSU3", "poolMSU4", "poolMSU5")

contr.matrix.I5 = makeContrasts(
  LvM = LOW - MODERATE,
  MvH = MODERATE - HIGH,
  LvH = LOW - HIGH,
  levels = colnames(design.I5))

## Mean-variance trend plots
v.I5 <- voomWithQualityWeights(dge.I5, design.I5, plot = FALSE)

#Fitting models
vfit.I5 <- lmFit(v.I5, design.I5) 
vfit.I5 <- contrasts.fit(vfit.I5, contrasts = contr.matrix.I5)
tfit.I5 <- treat(vfit.I5, lfc = 0.5)

## Identify DE genes
dt.I5 <- decideTests(tfit.I5, p.value = 0.1, adjust.method = "fdr")
dt.tib.I5 <- as_tibble(dt.I5, rownames = NA) %>% 
  rownames_to_column("gene") %>% 
  mutate_at(vars(starts_with("L")), as.numeric) %>% 
  mutate_at(vars(starts_with("M")), as.numeric) %>% 
  rename(LvM.I5 = LvM,
         MvH.I5 = MvH,
         LvH.I5 = LvH)

dt.tib <- dt.tib.I5 %>% 
  mutate_at(vars(starts_with("L")), as.numeric) %>% 
  mutate_at(vars(starts_with("M")), as.numeric) %>% 
  filter_at(vars(LvM.I5:LvH.I5), any_vars(. != 0))

Numbers of DE genes

allResults <- as.data.frame(summary(dt.I5)) %>% 
  filter(Var1 != "NotSig")

kable(allResults) %>%
  kable_styling("striped", full_width = F)
Var1 Var2 Freq
Down LvM 0
Up LvM 0
Down MvH 2
Up MvH 28
Down LvH 0
Up LvH 20

Volcano plot

tmp1 <- topTreat(tfit.I5, coef = 1, n = Inf)
results1 <- mutate(tmp1, sig=ifelse(tmp1$adj.P.Val<0.1, "Sig", "Not Sig"))
p1 <- ggplot(results1, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Low vs. Moderate shedding") +
  ylab("-Log10(p-value)") +
  xlab("Log(Fold count)") +
  theme_bw() +
  theme(legend.position = "none")

tmp2 <- topTreat(tfit.I5, coef = 2, n = Inf)
results2 <- mutate(tmp2, sig=ifelse(tmp2$adj.P.Val<0.1, "Sig", "Not Sig"))
p2 <- ggplot(results2, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Moderate vs. High shedding") +
  ylab("-Log10(p-value)") +
  xlab("Log(Fold count)") +
  theme_bw() +
  theme(legend.position = "none")

tmp3 <- topTreat(tfit.I5, coef = 3, n = Inf)
results3 <- mutate(tmp3, sig=ifelse(tmp3$adj.P.Val<0.1, "Sig", "Not Sig"))
p3 <- ggplot(results3, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Low vs. High shedding") +
  ylab("-Log10(p-value)") +
  xlab("Log(Fold count)") +
  theme_bw() +
  theme(legend.position = "none")

grid.arrange(p1, p2, p3, nrow = 1)

Heatmap

# Get the gene names for DE genes
lcpm2 <- as_tibble(dge.I5$counts, rownames = NA) %>% 
  cpm(., log = TRUE)

newNames <- colnames(lcpm2) %>% 
  as_tibble() %>% 
  separate(value, into = c(NA, "bird")) %>% 
  bind_cols(covars.I5) %>% 
  unite("newNames", SSgroup.virus.avg, group, bird)

colOrder <- covars.I5 %>% 
  mutate(SSgroup.virus.avg = fct_relevel(SSgroup.virus.avg, "LOW", "MODERATE", "HIGH")) %>%
  arrange(SSgroup.virus.avg, group, bird) %>% 
  unite("newNames", SSgroup.virus.avg, group, bird)

colnames(lcpm2) <- newNames$newNames
lcpm2 <- lcpm2[, colOrder$newNames]

heatmap.annot <- annot %>%
  filter(ensembl_gene_id %in% dt.tib$gene) %>% 
  mutate(hgnc_symbol = replace_na(hgnc_symbol, ".")) %>% 
  unite(annot.hm, c(ensembl_gene_id, hgnc_symbol), sep = "-", remove = FALSE) %>% 
  select(annot.hm, ensembl_gene_id) %>% 
  distinct(annot.hm, .keep_all = TRUE)

deGeneCounts <- lcpm2 %>% 
  as_tibble() %>% 
  mutate(ensembl_gene_id = rownames(dge.I5$counts)) %>% 
  filter(ensembl_gene_id %in% dt.tib$gene) %>% 
  left_join(heatmap.annot) %>% 
  column_to_rownames("annot.hm") %>%
  select(-ensembl_gene_id) %>% 
  as.matrix()


## Set up palette
mypalette <- brewer.pal(11,"RdYlBu")
morecols <- colorRampPalette(mypalette)

hc <- hclust(as.dist(1-cor(t(deGeneCounts))))

# Plot the heatmap
heatmap.2(deGeneCounts,
          Colv = FALSE,
          Rowv = as.dendrogram(hc),
          col = rev(morecols(50)),
          trace = "none",
          colsep = c(4, 10),
          dendrogram = "row",
          density.info = "none",
          key = TRUE,
          #main = "Ileum - Gene level",
          margins = c(10, 15),
          scale ="row")

Boxplots for differentially expressed genes

lcpm.DE <- lcpm %>%
  as_tibble(rownames = NA) %>% 
  rownames_to_column("identifier") %>% 
  filter(identifier %in% dt.tib$gene) %>%
  pivot_longer(cols = contains("_"),
               names_to = "sample",
               values_to = "lcpm") %>%
  separate(sample, into = c("tissue", "bird")) %>%
  mutate(bird = as.factor(bird)) %>%
  left_join(covars, by = "bird") %>%
  filter(group == "C1" | group == "C14" | group == "I5") %>% 
  select(identifier, bird, lcpm, SSgroup.virus.avg, group) %>% 
  mutate(group = recode(group, C29 = "Control"),
         group = recode(group, C1 = "Control"),
         tmpID = ifelse(group == "Control", 'Control', 'Infected'))

aprioriPlotting <- function(target, ...) {
   annot.target <- annot %>%
    select(
      ensembl_gene_id,
      hgnc_symbol) %>%
    filter(ensembl_gene_id == target) %>% 
    mutate(hgnc_symbol = replace_na(hgnc_symbol, "."))

  plot <- lcpm.DE %>%
    filter(identifier == target) %>%
    mutate(SSgroup.virus.avg = fct_relevel(SSgroup.virus.avg, "LOW", "MODERATE", "HIGH")) %>%
    ggplot(aes(x = SSgroup.virus.avg, y = lcpm)) +
    facet_grid(. ~ tmpID, scales = "free", space = "free") +
    ylab("Log2(Counts per million)") +
    xlab("Shedding Group") +
    scale_fill_grey(start = 0.35, end = 1) + 
    geom_point(position = position_dodge(width=0.75), aes(group = group), show.legend = FALSE) +
    geom_boxplot(alpha = 0.5) +
    geom_label_repel(aes(label = bird, group = group, fill = NULL), position = position_dodge(width=0.75), show.legend=FALSE) +
    theme_classic() +
    labs(title= paste0(target, " - ", annot.target[1,2])) +
    theme(legend.title = element_blank())
    
  print(plot)
}

for(target in sort(unique(lcpm.DE$identifier))) {
  aprioriPlotting(target)
}

Annotations for differentially expressed genes

ns denotes non-significant genes for each comparison and numerical values are the log(fold count) difference
ensembl_gene_id hgnc_symbol LvM MvH LvH
ENSAPLG00020000564 MALRD1 ns 8.61 ns
ENSAPLG00020004469 NA ns -6.31 ns
ENSAPLG00020006175 NA ns 4.1 ns
ENSAPLG00020017074 NA ns -4.39 ns
ENSAPLG00020016143 PCK1 ns 7.87 ns
ENSAPLG00020006372 CALB1 ns 9.75 5.82
ENSAPLG00020005323 PLB1 ns 7.25 ns
ENSAPLG00020001436 CPO ns 7.34 ns
ENSAPLG00020002099 SUSD2 ns 7.97 ns
ENSAPLG00020005311 SLC3A1 ns 5.69 3.15
ENSAPLG00020017868 NA ns 8.92 5.48
ENSAPLG00020016544 ACE2 ns 7.11 4
ENSAPLG00020002285 SELENOP ns 6.1 ns
ENSAPLG00020012027 NA ns 6.25 ns
ENSAPLG00020001209 APOB ns 6.37 ns
ENSAPLG00020005067 NA ns 8.37 4.98
ENSAPLG00020002557 NA ns 12.35 8.7
ENSAPLG00020014491 NA ns 10.06 6.93
ENSAPLG00020014852 NA ns 4.17 ns
ENSAPLG00020005112 NA ns 10.15 6.77
ENSAPLG00020014497 NA ns 11.16 8.31
ENSAPLG00020010883 NA ns 12.68 8.24
ENSAPLG00020010852 NA ns 10.75 7.61
ENSAPLG00020015123 NA ns 11.76 8.76
ENSAPLG00020002126 NA ns 10.33 6.88
ENSAPLG00020010714 NA ns 11.02 7.95
ENSAPLG00020014552 NA ns 8.58 6.07
ENSAPLG00020011024 NA ns 10.61 7.64
ENSAPLG00020010687 NA ns 9.7 7.23
ENSAPLG00020014595 NA ns ns 6.46
ENSAPLG00020014586 NA ns 8.38 6.85
ENSAPLG00020017448 LGALS3 ns ns 2.81

Gene functions

bind_rows(tmp1a, tmp2a, tmp3a) %>% 
  select(-comp, -adj.P.Val, -logFC) %>% 
  filter(ensembl_gene_id %in% dt.tib$gene) %>% 
  distinct(.) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
ensembl_gene_id gene_biotype hgnc_symbol description
ENSAPLG00020000564 protein_coding MALRD1 MAM and LDL receptor class A domain containing 1 [Source:HGNC Symbol;Acc:HGNC:24331]
ENSAPLG00020004469 protein_coding NA histone H2B 1/2/3/4/6 [Source:NCBI gene;Acc:101796082]
ENSAPLG00020006175 protein_coding NA glycerol-3-phosphate dehydrogenase [NAD(+)], cytoplasmic [Source:NCBI gene;Acc:101798440]
ENSAPLG00020017074 protein_coding NA NA
ENSAPLG00020016143 protein_coding PCK1 phosphoenolpyruvate carboxykinase 1 [Source:NCBI gene;Acc:101793637]
ENSAPLG00020006372 protein_coding CALB1 calbindin 1 [Source:NCBI gene;Acc:101802006]
ENSAPLG00020005323 protein_coding PLB1 phospholipase B1 [Source:HGNC Symbol;Acc:HGNC:30041]
ENSAPLG00020001436 protein_coding CPO carboxypeptidase O [Source:HGNC Symbol;Acc:HGNC:21011]
ENSAPLG00020002099 protein_coding SUSD2 sushi domain containing 2 [Source:HGNC Symbol;Acc:HGNC:30667]
ENSAPLG00020005311 protein_coding SLC3A1 solute carrier family 3 member 1 [Source:HGNC Symbol;Acc:HGNC:11025]
ENSAPLG00020017868 protein_coding NA NA
ENSAPLG00020016544 protein_coding ACE2 angiotensin I converting enzyme 2 [Source:HGNC Symbol;Acc:HGNC:13557]
ENSAPLG00020002285 protein_coding SELENOP selenoprotein P [Source:HGNC Symbol;Acc:HGNC:10751]
ENSAPLG00020012027 protein_coding NA NA
ENSAPLG00020001209 protein_coding APOB apolipoprotein B [Source:HGNC Symbol;Acc:HGNC:603]
ENSAPLG00020005067 pseudogene NA NA
ENSAPLG00020002557 protein_coding NA chromodomain-helicase-DNA-binding protein 1 [Source:NCBI gene;Acc:101804420]
ENSAPLG00020014491 protein_coding NA NA
ENSAPLG00020014852 protein_coding NA NA
ENSAPLG00020005112 protein_coding NA NA
ENSAPLG00020014497 protein_coding NA ATP synthase subunit alpha, mitochondrial [Source:NCBI gene;Acc:101801944]
ENSAPLG00020010883 protein_coding NA NA
ENSAPLG00020010852 protein_coding NA guanine nucleotide-binding protein G(q) subunit alpha [Source:NCBI gene;Acc:101790479]
ENSAPLG00020015123 protein_coding NA ubiquilin-1-like [Source:NCBI gene;Acc:101793250]
ENSAPLG00020002126 protein_coding NA NA
ENSAPLG00020010714 lncRNA NA NA
ENSAPLG00020014552 protein_coding NA NA
ENSAPLG00020011024 protein_coding NA NA
ENSAPLG00020010687 protein_coding NA NA
ENSAPLG00020014595 protein_coding NA NA
ENSAPLG00020014586 protein_coding NA dnaJ homolog subfamily B member 5-like [Source:NCBI gene;Acc:101800405]
ENSAPLG00020017448 protein_coding LGALS3 galectin 3 [Source:HGNC Symbol;Acc:HGNC:6563]

Pathway analysis

de.entrez <- annot %>% filter(ensembl_gene_id %in% dt.tib$gene) %>% na.omit(entrezgene_id)
entrez.universe <- annot %>% na.omit(entrezgene_id)

K <- kegga(de.entrez$entrezgene_id, universe = entrez.universe$entrezgene_id, species.KEGG = "apla") %>% 
  as_tibble(rownames = NA) %>% 
  rownames_to_column("PathwayID") %>% 
  filter(P.DE < 0.05) %>% 
  arrange(P.DE)

keggList <- getGeneKEGGLinks(species.KEGG = "apla") %>% 
  as_tibble() %>% 
  rename("entrezgene_id" = GeneID)

keggGenes = data.frame()
for (path in nrow(K)){
    procPath <- filter(keggList, entrezgene_id %in% de.entrez$entrezgene_id)
    df <- data.frame(procPath)
    keggGenes <- rbind(keggGenes, df)
}

inner_join(keggGenes, K) %>%
  as_tibble() %>% 
  mutate(entrezgene_id = as.numeric(entrezgene_id)) %>% 
  left_join(annot) %>% 
  select(Pathway, PathwayID, N, DE, P.DE, ensembl_gene_id, hgnc_symbol, entrezgene_id) %>% 
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Pathway PathwayID N DE P.DE ensembl_gene_id hgnc_symbol entrezgene_id
Glycolysis / Gluconeogenesis path:apla00010 14 1 0.0079920 ENSAPLG00020016143 PCK1 101793637
Citrate cycle (TCA cycle) path:apla00020 9 1 0.0051414 ENSAPLG00020016143 PCK1 101793637
Pyruvate metabolism path:apla00620 8 1 0.0045708 ENSAPLG00020016143 PCK1 101793637
PPAR signaling pathway path:apla03320 15 1 0.0085616 ENSAPLG00020016143 PCK1 101793637
FoxO signaling pathway path:apla04068 31 1 0.0176534 ENSAPLG00020016143 PCK1 101793637
Insulin signaling pathway path:apla04910 43 1 0.0244448 ENSAPLG00020016143 PCK1 101793637
Adipocytokine signaling pathway path:apla04920 22 1 0.0125444 ENSAPLG00020016143 PCK1 101793637