This is an analysis of differential expression at the gene level between shedding rate groupings based on average shedding rates for infected mallard bursa samples.

## Load packages and data
library(limma)
library(edgeR)
library(Glimma)
library(ggrepel)
library(gplots)
library(RColorBrewer)
library(gridExtra)
library(kableExtra)
library(tidyverse)

## Load data
cnt <- read.table("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/all.genes.results.csv", header = TRUE) %>% as_tibble(rownames = NA)
covars <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/MALL70_SSgroup_RAW_11.5.20.csv", delim = ",")
annot <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/annotationGenes103120.txt", delim = "\t")

Prep data for analysis

newNames <- names(cnt) %>%
  as_tibble() %>%
  separate(value, into = c("pt1", "pt2", "pt3")) %>%
  select(pt2, pt3) %>%
  mutate(pt2 = as.numeric(pt2),
         pt2 = formatC(pt2, width = 2, format = "d", flag = "0")) %>%
  mutate(newName = paste0("bursa_", pt2))

colnames(cnt) <- newNames$newName

cnt <- cnt %>% 
  select(order(colnames(cnt)))

covars <- covars %>%
  mutate(bird = as.numeric(bird)) %>%
  mutate(bird = formatC(bird, width = 2, format = "d", flag = "0")) %>%
  arrange(bird) %>%
  mutate(bird = as.factor(bird)) %>%
  mutate(group = str_remove(group, "-"))

## Remove birds with issues
cnt <- cnt %>% select(-bursa_01, -bursa_72)

covars <- covars %>% filter(bird != "01", bird != "72")

Prep data for analysis

Here we standardize expression levels across samples of varying depth by using the counts per million (CPM) and trimmed mean of M-values (TMM) methods. We also remove any genes that are expressed at few than 0.5 CPM in at least 25% of individuals.

#Convert to DGEList object
dge <- DGEList(counts=cnt)
dge$genes <- annot[match(rownames(dge$counts), annot$ensembl_gene_id),]

#CPM and log-CPM
cpm <- cpm(dge)
lcpm <- cpm(dge, log = TRUE)

#### Retain only genes expressed at >0.5 CPM in at least 25% of individuals
table(rowSums(dge$counts==0) == length(cnt[1,]))
## 
## FALSE  TRUE 
## 16979  1511
keep.exprs <- rowSums(cpm>0.5) >= length(cnt[1,])/4
sum(keep.exprs)
## [1] 13110
dge <- dge[keep.exprs,, keep.lib.sizes=FALSE]
table(rowSums(dge$counts==0) == length(cnt[1,]))
## 
## FALSE 
## 13110
dim(dge)
## [1] 13110    68

Subset data

Following normalization, we subset the data based on infection group of interest.

#Assign groups in DGElist
group <- recode(covars$group, C1 = "Ctl", C29 = "Ctl") %>% 
  as.factor()
dge$samples$group <- group

### Divide out I1 and I3
dge.I5 <- dge[,grep('\\bI5\\b', dge$samples$group)]
dge.I5 <- calcNormFactors(dge.I5, method = "TMM")

covars.I5 <- covars %>% filter(group == "I5")

Differential expression analysis

## Establish design matrix: I5
design.I5 = model.matrix(~ 0 +
                        factor(covars.I5$SSgroup.virus.avg) +
                        covars.I5$age +
                        covars.I5$sex + 
                        covars.I5$Bursa_pool)

colnames(design.I5) <- c("HIGH", "LOW", "MODERATE",
                         "age", "sexM", "poolMSU2",
                         "poolMSU3", "poolMSU4", "poolMSU5")

contr.matrix.I5 = makeContrasts(
  LvM = LOW - MODERATE,
  MvH = MODERATE - HIGH,
  LvH = LOW - HIGH,
  levels = colnames(design.I5))

## Mean-variance trend plots
v.I5 <- voomWithQualityWeights(dge.I5, design.I5, plot = FALSE)

#Fitting models
vfit.I5 <- lmFit(v.I5, design.I5) 
vfit.I5 <- contrasts.fit(vfit.I5, contrasts = contr.matrix.I5)
tfit.I5 <- treat(vfit.I5, lfc = 0.5)

## Identify DE genes
dt.I5 <- decideTests(tfit.I5, p.value = 0.1, adjust.method = "fdr")
dt.tib.I5 <- as_tibble(dt.I5, rownames = NA) %>% 
  rownames_to_column("gene") %>% 
  mutate_at(vars(starts_with("L")), as.numeric) %>% 
  mutate_at(vars(starts_with("M")), as.numeric) %>% 
  rename(LvM.I5 = LvM,
         MvH.I5 = MvH,
         LvH.I5 = LvH)

dt.tib <- dt.tib.I5 %>% 
  mutate_at(vars(starts_with("L")), as.numeric) %>% 
  mutate_at(vars(starts_with("M")), as.numeric) %>% 
  filter_at(vars(LvM.I5:LvH.I5), any_vars(. != 0))

Numbers of DE genes

allResults <- as.data.frame(summary(dt.I5)) %>% 
  filter(Var1 != "NotSig")

kable(allResults) %>%
  kable_styling("striped", full_width = F)
Var1 Var2 Freq
Down LvM 0
Up LvM 0
Down MvH 0
Up MvH 0
Down LvH 0
Up LvH 1

Volcano plot

tmp1 <- topTreat(tfit.I5, coef = 1, n = Inf)
results1 <- mutate(tmp1, sig=ifelse(tmp1$adj.P.Val<0.1, "Sig", "Not Sig"))
p1 <- ggplot(results1, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Low vs. Moderate shedding") +
  ylab("-Log10(p-value)") +
  xlab("Log(Fold count)") +
  theme_bw() +
  theme(legend.position = "none")

tmp2 <- topTreat(tfit.I5, coef = 2, n = Inf)
results2 <- mutate(tmp2, sig=ifelse(tmp2$adj.P.Val<0.1, "Sig", "Not Sig"))
p2 <- ggplot(results2, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Moderate vs. High shedding") +
  ylab("-Log10(p-value)") +
  xlab("Log(Fold count)") +
  theme_bw() +
  theme(legend.position = "none")

tmp3 <- topTreat(tfit.I5, coef = 3, n = Inf)
results3 <- mutate(tmp3, sig=ifelse(tmp3$adj.P.Val<0.1, "Sig", "Not Sig"))
p3 <- ggplot(results3, aes(logFC, -log10(P.Value))) +
  geom_point(aes(col = sig)) +
  scale_color_manual(values=c("black", "red")) +
  ggtitle("Low vs. High shedding") +
  ylab("-Log10(p-value)") +
  xlab("Log(Fold count)") +
  theme_bw() +
  theme(legend.position = "none")

grid.arrange(p1, p2, p3, nrow = 1)

Boxplots for differentially expressed genes

lcpm.DE <- lcpm %>%
  as_tibble(rownames = NA) %>% 
  rownames_to_column("identifier") %>% 
  filter(identifier %in% dt.tib$gene) %>%
  pivot_longer(cols = contains("_"),
               names_to = "sample",
               values_to = "lcpm") %>%
  separate(sample, into = c("tissue", "bird")) %>%
  mutate(bird = as.factor(bird)) %>%
  left_join(covars, by = "bird") %>%
  filter(group == "C1" | group == "C14" | group == "I5") %>% 
  select(identifier, bird, lcpm, SSgroup.virus.avg, group) %>% 
  mutate(group = recode(group, C29 = "Control"),
         group = recode(group, C1 = "Control"),
         tmpID = ifelse(group == "Control", 'Control', 'Infected'))

aprioriPlotting <- function(target, ...) {
   annot.target <- annot %>%
    select(
      ensembl_gene_id,
      hgnc_symbol) %>%
    filter(ensembl_gene_id == target) %>% 
    mutate(hgnc_symbol = replace_na(hgnc_symbol, "."))

  plot <- lcpm.DE %>%
    filter(identifier == target) %>%
    mutate(SSgroup.virus.avg = fct_relevel(SSgroup.virus.avg, "LOW", "MODERATE", "HIGH")) %>%
    ggplot(aes(x = SSgroup.virus.avg, y = lcpm)) +
    facet_grid(. ~ tmpID, scales = "free", space = "free") +
    ylab("Log2(Counts per million)") +
    xlab("Shedding Group") +
    scale_fill_grey(start = 0.35, end = 1) + 
    geom_point(position = position_dodge(width=0.75), aes(group = group), show.legend = FALSE) +
    geom_boxplot(alpha = 0.5) +
    geom_label_repel(aes(label = bird, group = group, fill = NULL), position = position_dodge(width=0.75), show.legend=FALSE) +
    theme_classic() +
    labs(title= paste0(target, " - ", annot.target[1,2])) +
    theme(legend.title = element_blank())
    
  print(plot)
}

for(target in sort(unique(lcpm.DE$identifier))) {
  aprioriPlotting(target)
}

Annotations for differentially expressed genes

ns denotes non-significant genes for each comparison and numerical values are the log(fold count) difference
ensembl_gene_id gene_biotype hgnc_symbol LvM MvH LvH
ENSAPLG00020012660 protein_coding NA ns ns 6.61

Gene functions

bind_rows(tmp1a, tmp2a, tmp3a) %>% 
  select(-comp, -adj.P.Val, -logFC) %>% 
  filter(ensembl_gene_id %in% dt.tib$gene) %>% 
  distinct(.) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
ensembl_gene_id gene_biotype hgnc_symbol description
ENSAPLG00020012660 protein_coding NA cystatin-B [Source:NCBI gene;Acc:101794964]