This is an analysis of differential expression at the transcript level between shedding rate groupings based on average shedding rates for infected mallard bursa samples.
## Load packages and data
library(limma)
library(edgeR)
library(Glimma)
library(ggrepel)
library(RColorBrewer)
library(gridExtra)
library(kableExtra)
library(tidyverse)
## Load data
cnt <- read.table("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/all.isoforms.results.csv", header = TRUE) %>% as_tibble(rownames = NA)
covars <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/MALL70_SSgroup_RAW_11.5.20.csv", delim = ",")
annot <- read_delim("G:/Shared drives/RNA Seq Supershedder Project/MALL DE manuscript/mallardLPAIV/extData/annotationTrans103120.txt", delim = "\t")
newNames <- names(cnt) %>%
as_tibble() %>%
separate(value, into = c("pt1", "pt2", "pt3")) %>%
select(pt2, pt3) %>%
mutate(pt2 = as.numeric(pt2),
pt2 = formatC(pt2, width = 2, format = "d", flag = "0")) %>%
mutate(newName = paste0("bursa_", pt2))
colnames(cnt) <- newNames$newName
cnt <- cnt %>%
select(order(colnames(cnt)))
covars <- covars %>%
mutate(bird = as.numeric(bird)) %>%
mutate(bird = formatC(bird, width = 2, format = "d", flag = "0")) %>%
arrange(bird) %>%
mutate(bird = as.factor(bird)) %>%
mutate(group = str_remove(group, "-"))
## Remove birds with issues
cnt <- cnt %>% select(-bursa_01, -bursa_72)
covars <- covars %>% filter(bird != "01", bird != "72")
Here we standardize expression levels across samples of varying depth by using the counts per million (CPM) and trimmed mean of M-values (TMM) methods. We also remove any genes that are expressed at few than 0.5 CPM in at least 25% of individuals.
#Convert to DGEList object
dge <- DGEList(counts=cnt)
dge$genes <- annot[match(rownames(dge$counts), annot$ensembl_transcript_id),]
#CPM and log-CPM
cpm <- cpm(dge)
lcpm <- cpm(dge, log = TRUE)
#### Retain only genes expressed at >0.5 CPM in at least 25% of individuals
table(rowSums(dge$counts==0) == length(cnt[1,]))
##
## FALSE TRUE
## 26095 3225
keep.exprs <- rowSums(cpm>0.5) >= length(cnt[1,])/4
sum(keep.exprs)
## [1] 17851
dge <- dge[keep.exprs,, keep.lib.sizes=FALSE]
table(rowSums(dge$counts==0) == length(cnt[1,]))
##
## FALSE
## 17851
dim(dge)
## [1] 17851 68
Following normalization, we subset the data based on infection group of interest.
#Assign groups in DGElist
group <- recode(covars$group, C1 = "Ctl", C29 = "Ctl") %>%
as.factor()
dge$samples$group <- group
### Divide out I1 and I3
dge.I5 <- dge[,grep('\\bI5\\b', dge$samples$group)]
dge.I5 <- calcNormFactors(dge.I5, method = "TMM")
covars.I5 <- covars %>% filter(group == "I5")
## Establish design matrix: I5
design.I5 = model.matrix(~ 0 +
factor(covars.I5$SSgroup.virus.avg) +
covars.I5$age +
covars.I5$sex +
covars.I5$Bursa_pool)
colnames(design.I5) <- c("HIGH", "LOW", "MODERATE",
"age", "sexM", "poolMSU2",
"poolMSU3", "poolMSU4", "poolMSU5")
contr.matrix.I5 = makeContrasts(
LvM = LOW - MODERATE,
MvH = MODERATE - HIGH,
LvH = LOW - HIGH,
levels = colnames(design.I5))
## Mean-variance trend plots
v.I5 <- voomWithQualityWeights(dge.I5, design.I5, plot = FALSE)
#Fitting models
vfit.I5 <- lmFit(v.I5, design.I5)
vfit.I5 <- contrasts.fit(vfit.I5, contrasts = contr.matrix.I5)
tfit.I5 <- treat(vfit.I5, lfc = 0.5)
## Identify DE genes
dt.I5 <- decideTests(tfit.I5, p.value = 0.1, adjust.method = "fdr")
dt.tib.I5 <- as_tibble(dt.I5, rownames = NA) %>%
rownames_to_column("gene") %>%
mutate_at(vars(starts_with("L")), as.numeric) %>%
mutate_at(vars(starts_with("M")), as.numeric) %>%
rename(LvM.I5 = LvM,
MvH.I5 = MvH,
LvH.I5 = LvH)
dt.tib <- dt.tib.I5 %>%
mutate_at(vars(starts_with("L")), as.numeric) %>%
mutate_at(vars(starts_with("M")), as.numeric) %>%
filter_at(vars(LvM.I5:LvH.I5), any_vars(. != 0))
allResults <- as.data.frame(summary(dt.I5)) %>%
filter(Var1 != "NotSig")
kable(allResults) %>%
kable_styling("striped", full_width = F)
Var1 | Var2 | Freq |
---|---|---|
Down | LvM | 2 |
Up | LvM | 0 |
Down | MvH | 0 |
Up | MvH | 0 |
Down | LvH | 0 |
Up | LvH | 1 |
tmp1 <- topTreat(tfit.I5, coef = 1, n = Inf)
results1 <- mutate(tmp1, sig=ifelse(tmp1$adj.P.Val<0.1, "Sig", "Not Sig"))
p1 <- ggplot(results1, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Low vs. Moderate shedding") +
ylab("-Log10(p-value)") +
xlab("Log(Fold count)") +
theme_bw() +
theme(legend.position = "none")
tmp2 <- topTreat(tfit.I5, coef = 2, n = Inf)
results2 <- mutate(tmp2, sig=ifelse(tmp2$adj.P.Val<0.1, "Sig", "Not Sig"))
p2 <- ggplot(results2, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Moderate vs. High shedding") +
ylab("-Log10(p-value)") +
xlab("Log(Fold count)") +
theme_bw() +
theme(legend.position = "none")
tmp3 <- topTreat(tfit.I5, coef = 3, n = Inf)
results3 <- mutate(tmp3, sig=ifelse(tmp3$adj.P.Val<0.1, "Sig", "Not Sig"))
p3 <- ggplot(results3, aes(logFC, -log10(P.Value))) +
geom_point(aes(col = sig)) +
scale_color_manual(values=c("black", "red")) +
ggtitle("Low vs. High shedding") +
ylab("-Log10(p-value)") +
xlab("Log(Fold count)") +
theme_bw() +
theme(legend.position = "none")
grid.arrange(p1, p2, p3, nrow = 1)
lcpm.DE <- lcpm %>%
as_tibble(rownames = NA) %>%
rownames_to_column("identifier") %>%
filter(identifier %in% dt.tib$gene) %>%
pivot_longer(cols = contains("_"),
names_to = "sample",
values_to = "lcpm") %>%
separate(sample, into = c("tissue", "bird")) %>%
mutate(bird = as.factor(bird)) %>%
left_join(covars, by = "bird") %>%
filter(group == "C1" | group == "C14" | group == "I5") %>%
select(identifier, bird, lcpm, SSgroup.virus.avg, group) %>%
mutate(group = recode(group, C29 = "Control"),
group = recode(group, C1 = "Control"),
tmpID = ifelse(group == "Control", 'Control', 'Infected'))
aprioriPlotting <- function(target, ...) {
annot.target <- annot %>%
select(
ensembl_transcript_id,
hgnc_symbol) %>%
filter(ensembl_transcript_id == target) %>%
mutate(hgnc_symbol = replace_na(hgnc_symbol, "."))
plot <- lcpm.DE %>%
filter(identifier == target) %>%
mutate(SSgroup.virus.avg = fct_relevel(SSgroup.virus.avg, "LOW", "MODERATE", "HIGH")) %>%
ggplot(aes(x = SSgroup.virus.avg, y = lcpm)) +
facet_grid(. ~ tmpID, scales = "free", space = "free") +
ylab("Log2(Counts per million)") +
xlab("Shedding Group") +
scale_fill_grey(start = 0.35, end = 1) +
geom_point(position = position_dodge(width=0.75), aes(group = group), show.legend = FALSE) +
geom_boxplot(alpha = 0.5) +
geom_label_repel(aes(label = bird, group = group, fill = NULL), position = position_dodge(width=0.75), show.legend=FALSE) +
theme_classic() +
labs(title= paste0(target, " - ", annot.target[1,2])) +
theme(legend.title = element_blank())
print(plot)
}
for(target in sort(unique(lcpm.DE$identifier))) {
aprioriPlotting(target)
}
ns denotes non-significant genes for each comparison and numerical values are the log(fold count) difference
tmp1a <- tmp1 %>% select(ensembl_transcript_id, adj.P.Val, transcript_biotype, logFC, hgnc_symbol, description) %>% mutate(comp = "LvM")
tmp2a <- tmp2 %>% select(ensembl_transcript_id, adj.P.Val, transcript_biotype, logFC, hgnc_symbol, description) %>% mutate(comp = "MvH")
tmp3a <- tmp3 %>% select(ensembl_transcript_id, adj.P.Val, transcript_biotype, logFC, hgnc_symbol, description) %>% mutate(comp = "LvH")
bind_rows(tmp1a, tmp2a, tmp3a) %>%
select(-description) %>%
filter(ensembl_transcript_id %in% dt.tib$gene) %>%
mutate(logFoldCount = ifelse(adj.P.Val < 0.10, round(logFC, 2), "ns")) %>%
select(-adj.P.Val, -logFC, -transcript_biotype) %>%
pivot_wider(names_from = comp, values_from = logFoldCount) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
ensembl_transcript_id | hgnc_symbol | LvM | MvH | LvH |
---|---|---|---|---|
ENSAPLT00020020989 | MCM4 | -6.46 | ns | ns |
ENSAPLT00020004689 | SLMAP | -5.01 | ns | ns |
ENSAPLT00020019975 | ROS1 | ns | ns | 4.45 |
bind_rows(tmp1a, tmp2a, tmp3a) %>%
select(-comp, -adj.P.Val, -logFC) %>%
filter(ensembl_transcript_id %in% dt.tib$gene) %>%
distinct(.) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
ensembl_transcript_id | transcript_biotype | hgnc_symbol | description |
---|---|---|---|
ENSAPLT00020020989 | protein_coding | MCM4 | minichromosome maintenance complex component 4 [Source:HGNC Symbol;Acc:HGNC:6947] |
ENSAPLT00020004689 | protein_coding | SLMAP | sarcolemma associated protein [Source:HGNC Symbol;Acc:HGNC:16643] |
ENSAPLT00020019975 | protein_coding | ROS1 | ROS proto-oncogene 1, receptor tyrosine kinase [Source:HGNC Symbol;Acc:HGNC:10261] |
de.entrez <- annot %>% filter(ensembl_transcript_id %in% dt.tib$gene) %>% na.omit(entrezgene_id)
entrez.universe <- annot %>% na.omit(entrezgene_id)
K <- kegga(de.entrez$entrezgene_id, universe = entrez.universe$entrezgene_id, species.KEGG = "apla") %>%
as_tibble(rownames = NA) %>%
rownames_to_column("PathwayID") %>%
filter(P.DE < 0.05) %>%
arrange(P.DE)
keggList <- getGeneKEGGLinks(species.KEGG = "apla") %>%
as_tibble() %>%
rename("entrezgene_id" = GeneID)
keggGenes = data.frame()
for (path in nrow(K)){
procPath <- filter(keggList, entrezgene_id %in% de.entrez$entrezgene_id)
df <- data.frame(procPath)
keggGenes <- rbind(keggGenes, df)
}
inner_join(keggGenes, K) %>%
as_tibble() %>%
mutate(entrezgene_id = as.numeric(entrezgene_id)) %>%
left_join(annot) %>%
select(Pathway, PathwayID, N, DE, P.DE, ensembl_transcript_id, hgnc_symbol, entrezgene_id) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Pathway | PathwayID | N | DE | P.DE | ensembl_transcript_id | hgnc_symbol | entrezgene_id |
---|---|---|---|---|---|---|---|