### Number of divisions per colony
generations = 27.2
### max rate - max number of mutation per division
max_rate = 100
### genome size
genome_size = 4800000
### Passages per strain
passages.dt<-data.table(read_xlsx("passages.xlsx",range = "A1:B24",col_names = T))
setnames(passages.dt,c("isolate",'number of pasages'),c("isolate","rounds"))
MA2.SNV.dt<-data.table(read_xlsx("Table_mutational_spectra.xlsx",range = "B9:Z15",col_names = T))
MA2.SNV.dt
## Mutational Accumulation 5G1b 5G1a 6G1b 2B2a 2B2b 2B1 8B1 2B3 7B1 1G1 6G1a
## 1: A:T>G:C 58 26 NA 65 29 8 NA 1 1 NA NA
## 2: G:C>A:T 31 15 NA 100 30 8 NA 2 1 NA NA
## 3: A:T>T:A 14 7 NA 17 10 4 NA 0 1 NA NA
## 4: A:T>C:G 19 10 NA 21 11 9 NA 0 1 NA NA
## 5: G:C>T:A 17 14 NA 52 17 4 NA 0 0 NA NA
## 6: G:C>C:G 0 0 NA 2 0 1 NA 1 0 NA NA
## 2G2 3G1 1B1a 1B1b 3B1 4G1d 4G1c 4G1a 4G1b 1G2 3G3 3G2 2G1
## 1: 6 27 2 29 2 26 1 16 9 44 122 62 150
## 2: 16 49 6 109 4 16 7 7 8 34 32 32 82
## 3: 26 17 5 24 15 20 23 32 6 5 6 5 14
## 4: 16 22 6 25 10 10 10 19 8 9 20 6 26
## 5: 16 23 14 88 54 33 10 7 7 17 21 12 26
## 6: 0 1 1 1 1 0 0 1 0 0 3 0 3
setnames(MA2.SNV.dt, 'Mutational Accumulation',"SNV")
### create output form:
MA.rates.dt<-melt(MA2.SNV.dt,id.vars = 'SNV', variable.name = "isolate")[,.(SNP=sum(value)),by=isolate][passages.dt,on=c("isolate")]
## Warning in melt.data.table(MA2.SNV.dt, id.vars = "SNV", variable.name =
## "isolate"): 'measure.vars' [5G1b, 5G1a, 6G1b, 2B2a, ...] are not all of the
## same type. By order of hierarchy, the molten data value column will be of type
## 'double'. All measure variables not of type 'double' will be coerced too. Check
## DETAILS in ?melt.data.table for more on coercion.
MA.rates.dt[,GT := ifelse((isolate == "2B3" | isolate == "7B1"),yes ="WT",no="DnaQ")]
MA.rates.dt
## isolate SNP rounds GT
## 1: 1B1a 34 4 DnaQ
## 2: 1B1b 276 4 DnaQ
## 3: 1G2 109 4 DnaQ
## 4: 2B1 34 7 DnaQ
## 5: 2B2a 257 7 DnaQ
## 6: 2B2b 97 6 DnaQ
## 7: 2G1 301 6 DnaQ
## 8: 2G2 80 36 DnaQ
## 9: 3B1 86 4 DnaQ
## 10: 3G1 139 6 DnaQ
## 11: 3G2 117 4 DnaQ
## 12: 3G3 204 20 DnaQ
## 13: 4G1a 82 39 DnaQ
## 14: 4G1b 38 32 DnaQ
## 15: 4G1c 51 29 DnaQ
## 16: 4G1d 105 18 DnaQ
## 17: 5G1a 72 7 DnaQ
## 18: 5G1b 139 7 DnaQ
## 19: 7B1 4 113 WT
## 20: 2B3 4 113 WT
## 21: ATCC_51783 NA 113 DnaQ
## 22: ATCC_51784 NA 113 DnaQ
## 23: ATCC_51730 NA 113 DnaQ
## isolate SNP rounds GT
## some manual correction for ATCC isolates
MA.rates.dt[isolate == "ATCC_51783", `:=` (SNP = 2, GT = "ATCC")]
MA.rates.dt[isolate == "ATCC_51784", `:=` (SNP = 3, GT = "ATCC")]
MA.rates.dt[isolate == "ATCC_51730", `:=` (SNP = 4, GT = "ATCC")]
MA.rates.dt
## isolate SNP rounds GT
## 1: 1B1a 34 4 DnaQ
## 2: 1B1b 276 4 DnaQ
## 3: 1G2 109 4 DnaQ
## 4: 2B1 34 7 DnaQ
## 5: 2B2a 257 7 DnaQ
## 6: 2B2b 97 6 DnaQ
## 7: 2G1 301 6 DnaQ
## 8: 2G2 80 36 DnaQ
## 9: 3B1 86 4 DnaQ
## 10: 3G1 139 6 DnaQ
## 11: 3G2 117 4 DnaQ
## 12: 3G3 204 20 DnaQ
## 13: 4G1a 82 39 DnaQ
## 14: 4G1b 38 32 DnaQ
## 15: 4G1c 51 29 DnaQ
## 16: 4G1d 105 18 DnaQ
## 17: 5G1a 72 7 DnaQ
## 18: 5G1b 139 7 DnaQ
## 19: 7B1 4 113 WT
## 20: 2B3 4 113 WT
## 21: ATCC_51783 2 113 ATCC
## 22: ATCC_51784 3 113 ATCC
## 23: ATCC_51730 4 113 ATCC
## isolate SNP rounds GT
### stats
MA.rates.dt[, SNV_per_division := round(
binom.test(SNP,round(generations*rounds*max_rate))$estimate*max_rate,
digits = 4),
by=isolate]
MA.rates.dt[, ci :=
paste0(round(
binom.test(SNP,round(generations*rounds*max_rate))$conf.int*max_rate,
digits = 4),collapse = "-"),
by=isolate]
MA.rates.dt
## isolate SNP rounds GT SNV_per_division ci
## 1: 1B1a 34 4 DnaQ 0.3125 0.2165-0.4364
## 2: 1B1b 276 4 DnaQ 2.5368 2.2495-2.8498
## 3: 1G2 109 4 DnaQ 1.0018 0.8233-1.2073
## 4: 2B1 34 7 DnaQ 0.1786 0.1237-0.2494
## 5: 2B2a 257 7 DnaQ 1.3498 1.1907-1.524
## 6: 2B2b 97 6 DnaQ 0.5944 0.4822-0.7246
## 7: 2G1 301 6 DnaQ 1.8444 1.6435-2.0627
## 8: 2G2 80 36 DnaQ 0.0817 0.0648-0.1017
## 9: 3B1 86 4 DnaQ 0.7904 0.6327-0.9753
## 10: 3G1 139 6 DnaQ 0.8517 0.7165-1.0049
## 11: 3G2 117 4 DnaQ 1.0754 0.8901-1.2874
## 12: 3G3 204 20 DnaQ 0.3750 0.3254-0.43
## 13: 4G1a 82 39 DnaQ 0.0773 0.0615-0.0959
## 14: 4G1b 38 32 DnaQ 0.0437 0.0309-0.0599
## 15: 4G1c 51 29 DnaQ 0.0647 0.0481-0.085
## 16: 4G1d 105 18 DnaQ 0.2145 0.1754-0.2596
## 17: 5G1a 72 7 DnaQ 0.3782 0.296-0.476
## 18: 5G1b 139 7 DnaQ 0.7300 0.6141-0.8614
## 19: 7B1 4 113 WT 0.0013 4e-04-0.0033
## 20: 2B3 4 113 WT 0.0013 4e-04-0.0033
## 21: ATCC_51783 2 113 ATCC 0.0007 1e-04-0.0024
## 22: ATCC_51784 3 113 ATCC 0.0010 2e-04-0.0029
## 23: ATCC_51730 4 113 ATCC 0.0013 4e-04-0.0033
## isolate SNP rounds GT SNV_per_division ci
### analysis of trunks and splits
MM.SNV.dt<-data.table(read_xlsx("branch_and_truncs.xlsx",col_names = T))
MM.SNV.dt
## Isolates Unique SNVs Shared between them Shared with closest relative
## 1: 1B1a 142 842 -
## 2: 1B1b 132 842 -
## 3: 2B2a 126 86 145
## 4: 2B2b 117 86 145
## 5: 4G1a 19 220 -
## 6: 4G1b 317 220 -
## 7: 4G1c 18 220 -
## 8: 4G1d 10 220 -
## 9: 5G1a 58 21 113
## 10: 5G1b 90 21 113
## Trunc total
## 1: 842
## 2: 842
## 3: 231
## 4: 231
## 5: 220
## 6: 220
## 7: 220
## 8: 220
## 9: 134
## 10: 134
MA.rates.dt[MM.SNV.dt,.(isolate,SNP,'Unique SNVs'),on=c(isolate ="Isolates")]
## isolate SNP V3
## 1: 1B1a 34 Unique SNVs
## 2: 1B1b 276 Unique SNVs
## 3: 2B2a 257 Unique SNVs
## 4: 2B2b 97 Unique SNVs
## 5: 4G1a 82 Unique SNVs
## 6: 4G1b 38 Unique SNVs
## 7: 4G1c 51 Unique SNVs
## 8: 4G1d 105 Unique SNVs
## 9: 5G1a 72 Unique SNVs
## 10: 5G1b 139 Unique SNVs
MM.div.dt<-MA.rates.dt[MM.SNV.dt,on=c(isolate ="Isolates")][,.(isolate,SNP, rounds, SNV_per_division, div=`Unique SNVs`)]
MM.div.dt
## isolate SNP rounds SNV_per_division div
## 1: 1B1a 34 4 0.3125 142
## 2: 1B1b 276 4 2.5368 132
## 3: 2B2a 257 7 1.3498 126
## 4: 2B2b 97 6 0.5944 117
## 5: 4G1a 82 39 0.0773 19
## 6: 4G1b 38 32 0.0437 317
## 7: 4G1c 51 29 0.0647 18
## 8: 4G1d 105 18 0.2145 10
## 9: 5G1a 72 7 0.3782 58
## 10: 5G1b 139 7 0.7300 90
MM.div.dt[,divisions :=
round(
div/(binom.test(SNP,round(generations*rounds*max_rate))$estimate*max_rate),
digits = 1),
by=isolate]
MM.div.dt[,ci_divisions :=
paste0(
sort(
round(
div/(binom.test(SNP,round(generations*rounds*max_rate))$conf.int*max_rate),
digits = 1)
)
,collapse = "-"),
by=isolate]
MM.div.dt[, passages := round(divisions/generations,digits = 1) ]
MM.div.dt[,ci_passages :=
paste0(
sort(
round(
div/(binom.test(SNP,round(generations*rounds*max_rate))$conf.int*max_rate)/
generations,
digits = 1)
)
,collapse = "-"),
by=isolate]
MM.div.dt[,.(isolate,divergence=div, SNV_per_division,divisions, ci_divisions, passages, ci_passages)]
## isolate divergence SNV_per_division divisions ci_divisions passages
## 1: 1B1a 142 0.3125 454.4 325.4-655.9 16.7
## 2: 1B1b 132 2.5368 52.0 46.3-58.7 1.9
## 3: 2B2a 126 1.3498 93.3 82.7-105.8 3.4
## 4: 2B2b 117 0.5944 196.8 161.5-242.6 7.2
## 5: 4G1a 19 0.0773 245.8 198-309 9.0
## 6: 4G1b 317 0.0437 7261.0 5290.5-10259.9 266.9
## 7: 4G1c 18 0.0647 278.4 211.8-373.9 10.2
## 8: 4G1d 10 0.2145 46.6 38.5-57 1.7
## 9: 5G1a 58 0.3782 153.4 121.9-195.9 5.6
## 10: 5G1b 90 0.7300 123.3 104.5-146.6 4.5
## ci_passages
## 1: 12-24.1
## 2: 1.7-2.2
## 3: 3-3.9
## 4: 5.9-8.9
## 5: 7.3-11.4
## 6: 194.5-377.2
## 7: 7.8-13.7
## 8: 1.4-2.1
## 9: 4.5-7.2
## 10: 3.8-5.4
### and now some plots:
factor(MA.rates.dt$GT,levels = c("DnaQ","WT","ATCC"),ordered=TRUE)
## [1] DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ
## [16] DnaQ DnaQ DnaQ WT WT ATCC ATCC ATCC
## Levels: DnaQ < WT < ATCC
MA.rates.dt
## isolate SNP rounds GT SNV_per_division ci
## 1: 1B1a 34 4 DnaQ 0.3125 0.2165-0.4364
## 2: 1B1b 276 4 DnaQ 2.5368 2.2495-2.8498
## 3: 1G2 109 4 DnaQ 1.0018 0.8233-1.2073
## 4: 2B1 34 7 DnaQ 0.1786 0.1237-0.2494
## 5: 2B2a 257 7 DnaQ 1.3498 1.1907-1.524
## 6: 2B2b 97 6 DnaQ 0.5944 0.4822-0.7246
## 7: 2G1 301 6 DnaQ 1.8444 1.6435-2.0627
## 8: 2G2 80 36 DnaQ 0.0817 0.0648-0.1017
## 9: 3B1 86 4 DnaQ 0.7904 0.6327-0.9753
## 10: 3G1 139 6 DnaQ 0.8517 0.7165-1.0049
## 11: 3G2 117 4 DnaQ 1.0754 0.8901-1.2874
## 12: 3G3 204 20 DnaQ 0.3750 0.3254-0.43
## 13: 4G1a 82 39 DnaQ 0.0773 0.0615-0.0959
## 14: 4G1b 38 32 DnaQ 0.0437 0.0309-0.0599
## 15: 4G1c 51 29 DnaQ 0.0647 0.0481-0.085
## 16: 4G1d 105 18 DnaQ 0.2145 0.1754-0.2596
## 17: 5G1a 72 7 DnaQ 0.3782 0.296-0.476
## 18: 5G1b 139 7 DnaQ 0.7300 0.6141-0.8614
## 19: 7B1 4 113 WT 0.0013 4e-04-0.0033
## 20: 2B3 4 113 WT 0.0013 4e-04-0.0033
## 21: ATCC_51783 2 113 ATCC 0.0007 1e-04-0.0024
## 22: ATCC_51784 3 113 ATCC 0.0010 2e-04-0.0029
## 23: ATCC_51730 4 113 ATCC 0.0013 4e-04-0.0033
## isolate SNP rounds GT SNV_per_division ci
MA.rates.dt[isolate == "ATCC_51730", `:=` (SNP = 4, GT = "ATCC")]
MA.rates.dt<-cbind(MA.rates.dt,DnaQ=c("E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","WT","WT","WT","WT","WT"))
#Figure 3c:
###########
pdf("Figure_3c.pdf",width=4,height = 4,family = "Arial")
ggplot(MA.rates.dt,aes(y=SNV_per_division,x=DnaQ, fill=DnaQ)) +
geom_boxplot(width=.5,lwd=.5,outlier.shape = NA) +
#geom_dotplot(binaxis = 'y', stackdir = 'center', position = position_dodge(),dotsize =.2)+
scale_y_log10(limits=c(1E-4,10), expand = c(0, 0),
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x)))+
geom_jitter(width =.1)+
#scale_y_log10(limits=c(1E-4,10)) +
scale_fill_manual(values = c("coral3","slateblue")) +
theme_classic(base_size = 20)+
theme(axis.text.y =element_text(size=14))+
labs(y="SNV per division")+
annotation_logticks(sides = "l", outside = TRUE,long = unit(0,"mm"),short = unit(1,"mm"),
mid = unit(1.25,"mm"),)+
coord_cartesian(clip = "off")
dev.off()
## quartz_off_screen
## 2
ggplot(MA.rates.dt,aes(y=SNV_per_division,x=DnaQ, fill=DnaQ)) +
geom_boxplot(width=.5,lwd=.5,outlier.shape = NA) +
#geom_dotplot(binaxis = 'y', stackdir = 'center', position = position_dodge(),dotsize =.2)+
scale_y_log10(limits=c(1E-4,10), expand = c(0, 0),
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x)))+
geom_jitter(width =.1)+
#scale_y_log10(limits=c(1E-4,10)) +
scale_fill_manual(values = c("coral3","slateblue")) +
theme_classic(base_size = 20)+
theme(axis.text.y =element_text(size=14))+
labs(y="SNV per division")+
annotation_logticks(sides = "l", outside = TRUE,long = unit(0,"mm"),short = unit(1,"mm"),
mid = unit(1.25,"mm"),)+
coord_cartesian(clip = "off")
#Figure 3d:
###########
### data import from last dataset
PT.SNV.dt<-data.table(read_xlsx("Table_mutational_spectra.xlsx",range = "B2:Z8",col_names = T))
###############################################################
setnames(PT.SNV.dt, 'Patient Isolate SNVs',"SNV")
PT.div.dt<-melt(PT.SNV.dt,id.vars = 'SNV', variable.name = "isolate")[,.(SNP=sum(value)),by=isolate]
PT.div.dt[,DnaQ := ifelse((isolate == "2B3" | isolate == "7B1" | isolate == "1G1" | isolate == "8B1"),yes ="WT",no="E9G")]
pdf("Figure_3d.pdf",width=4,height = 4,family = "Arial")
ggplot(PT.div.dt,aes(y=SNP,x= DnaQ, fill=DnaQ)) +
geom_boxplot(width=.5,lwd=.5,outlier.shape = NA) +
geom_jitter(width =.1) +
# scale_y_log10(limits=c(1E-4,10)) +
scale_fill_manual(values = c("coral3","slateblue")) +
theme_classic(base_size = 20)+
theme(axis.text.y =element_text(size=14),axis.title.y=element_text(size=19))+
labs(y="Divergence from LCA (SNV)")
dev.off()
## quartz_off_screen
## 2
ggplot(PT.div.dt,aes(y=SNP,x= DnaQ, fill=DnaQ)) +
geom_boxplot(width=.5,lwd=.5,outlier.shape = NA) +
geom_jitter(width =.1) +
# scale_y_log10(limits=c(1E-4,10)) +
scale_fill_manual(values = c("coral3","slateblue")) +
theme_classic(base_size = 20)+
theme(axis.text.y =element_text(size=14),axis.title.y=element_text(size=19))+
labs(y="Divergence from LCA (SNV)")
#Figure S2b:
FigS2b.dt<-fread("Pairewise_diff.txt")
FigS2b.dt[V1 != V2][order(V3)]
## V1 V2 V3
## 1: 2B3 8B1 13
## 2: 4G1c 4G1d 28
## 3: 7B1 2B3 34
## 4: 2B3 1G1 39
## 5: 7B1 8B1 47
## ---
## 272: 4G1c 3B1 1713
## 273: 3G2 3B1 1724
## 274: 3B1 4G1b 1765
## 275: 1B1b 3B1 2169
## 276: 1B1a 3B1 2179
length(rownames(FigS2b.dt[V1 != V2]))
## [1] 276
pdf("Figure_S2a.pdf",width=2,height = 4,family = "Arial")
ggplot(FigS2b.dt[V1 != V2],aes(x=1,y=V3)) +
geom_violin(lwd=.1,fill="slateblue") + scale_y_log10(limits=c(10,3000)) +
geom_boxplot(width=.3,fill='navy',color='black',lwd=.5,
notch=TRUE,outlier.alpha=0)+
geom_dotplot(data = FigS2b.dt[V1 !=V2][V3<100],aes(x=1,y=V3),binaxis='y', stackdir='center', dotsize=0.2, width =1,stackratio = 1.5)+
theme_classic(base_size = 12) + theme(axis.ticks.x = element_blank(),axis.text.x = element_blank()) +
annotation_logticks(sides = "l") +
labs(y="SNV",x=NULL)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
dev.off()
## quartz_off_screen
## 2
ggplot(FigS2b.dt[V1 != V2],aes(x=1,y=V3)) +
geom_violin(lwd=.1,fill="slateblue") + scale_y_log10(limits=c(10,3000)) +
geom_boxplot(width=.3,fill='navy',color='black',lwd=.5,
notch=TRUE,outlier.alpha=0)+
geom_dotplot(data = FigS2b.dt[V1 !=V2][V3<100],aes(x=1,y=V3),binaxis='y', stackdir='center', dotsize=0.2, width =1,stackratio = 1.5)+
theme_classic(base_size = 12) + theme(axis.ticks.x = element_blank(),axis.text.x = element_blank()) +
annotation_logticks(sides = "l") +
labs(y="SNV",x=NULL)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
MA.spectra.dt<-melt(MA2.SNV.dt,id.vars = 'SNV', variable.name = "isolate")[
!(isolate %in% c("8B1","6G1a","6G1b","1G1"))]
MA.spectra.dt[, freq:=value/sum(value), by=isolate]
setkeyv(MA.spectra.dt,c("isolate","SNV"))
PT.spectra.dt<-melt(PT.SNV.dt,id.vars = 'SNV', variable.name = "isolate")
PT.spectra.dt[, freq:=value/sum(value), by=isolate]
setkeyv(PT.spectra.dt,c("isolate","SNV"))
#Figure S10:
############
PT.spectra.dt[MA.spectra.dt]
## SNV isolate value freq i.value i.freq
## 1: A:T>C:G 5G1b 35 0.18134715 19 0.136690647
## 2: A:T>G:C 5G1b 31 0.16062176 58 0.417266187
## 3: A:T>T:A 5G1b 31 0.16062176 14 0.100719424
## 4: G:C>A:T 5G1b 42 0.21761658 31 0.223021583
## 5: G:C>C:G 5G1b 4 0.02072539 0 0.000000000
## ---
## 116: A:T>G:C 2G1 30 0.16129032 150 0.498338870
## 117: A:T>T:A 2G1 23 0.12365591 14 0.046511628
## 118: G:C>A:T 2G1 36 0.19354839 82 0.272425249
## 119: G:C>C:G 2G1 5 0.02688172 3 0.009966777
## 120: G:C>T:A 2G1 51 0.27419355 26 0.086378738
PT.spectra.dt[MA.spectra.dt][,.(isolate, SNV, delta_freq = freq-i.freq)]
## isolate SNV delta_freq
## 1: 5G1b A:T>C:G 0.044656503
## 2: 5G1b A:T>G:C -0.256644425
## 3: 5G1b A:T>T:A 0.059902337
## 4: 5G1b G:C>A:T -0.005405002
## 5: 5G1b G:C>C:G 0.020725389
## ---
## 116: 2G1 A:T>G:C -0.337048548
## 117: 2G1 A:T>T:A 0.077144286
## 118: 2G1 G:C>A:T -0.078876862
## 119: 2G1 G:C>C:G 0.016914943
## 120: 2G1 G:C>T:A 0.187814811
pdf("Figure_S10.pdf",width=7,height = 5,family = "Arial")
ggplot(PT.spectra.dt[MA.spectra.dt][,.(isolate, SNV, delta_freq = freq-i.freq)], aes(x=SNV,y=delta_freq,fill=SNV)) +
geom_boxplot(outlier.shape = NA,varwidth = .4) + geom_jitter(width = .1) +
scale_fill_brewer(type = "qual",palette = "Dark2" ) +
theme_classic()
dev.off()
## quartz_off_screen
## 2
ggplot(PT.spectra.dt[MA.spectra.dt][,.(isolate, SNV, delta_freq = freq-i.freq)], aes(x=SNV,y=delta_freq,fill=SNV)) +
geom_boxplot(outlier.shape = NA,varwidth = .4) + geom_jitter(width = .1) +
scale_fill_brewer(type = "qual",palette = "Dark2" ) +
theme_classic()
#knitr::opts_chunk$set()
MA.spectra.dt
## SNV isolate value freq
## 1: A:T>C:G 5G1b 19 0.136690647
## 2: A:T>G:C 5G1b 58 0.417266187
## 3: A:T>T:A 5G1b 14 0.100719424
## 4: G:C>A:T 5G1b 31 0.223021583
## 5: G:C>C:G 5G1b 0 0.000000000
## ---
## 116: A:T>G:C 2G1 150 0.498338870
## 117: A:T>T:A 2G1 14 0.046511628
## 118: G:C>A:T 2G1 82 0.272425249
## 119: G:C>C:G 2G1 3 0.009966777
## 120: G:C>T:A 2G1 26 0.086378738
PT.spectra.dt
## SNV isolate value freq
## 1: A:T>C:G 5G1b 35 0.18134715
## 2: A:T>G:C 5G1b 31 0.16062176
## 3: A:T>T:A 5G1b 31 0.16062176
## 4: G:C>A:T 5G1b 42 0.21761658
## 5: G:C>C:G 5G1b 4 0.02072539
## ---
## 140: A:T>G:C 2G1 30 0.16129032
## 141: A:T>T:A 2G1 23 0.12365591
## 142: G:C>A:T 2G1 36 0.19354839
## 143: G:C>C:G 2G1 5 0.02688172
## 144: G:C>T:A 2G1 51 0.27419355
PT.spectra.dt[MA.spectra.dt]
## SNV isolate value freq i.value i.freq
## 1: A:T>C:G 5G1b 35 0.18134715 19 0.136690647
## 2: A:T>G:C 5G1b 31 0.16062176 58 0.417266187
## 3: A:T>T:A 5G1b 31 0.16062176 14 0.100719424
## 4: G:C>A:T 5G1b 42 0.21761658 31 0.223021583
## 5: G:C>C:G 5G1b 4 0.02072539 0 0.000000000
## ---
## 116: A:T>G:C 2G1 30 0.16129032 150 0.498338870
## 117: A:T>T:A 2G1 23 0.12365591 14 0.046511628
## 118: G:C>A:T 2G1 36 0.19354839 82 0.272425249
## 119: G:C>C:G 2G1 5 0.02688172 3 0.009966777
## 120: G:C>T:A 2G1 51 0.27419355 26 0.086378738
PT.spectra.dt[MA.spectra.dt][,.(isolate, SNV, delta_freq = freq-i.freq)]
## isolate SNV delta_freq
## 1: 5G1b A:T>C:G 0.044656503
## 2: 5G1b A:T>G:C -0.256644425
## 3: 5G1b A:T>T:A 0.059902337
## 4: 5G1b G:C>A:T -0.005405002
## 5: 5G1b G:C>C:G 0.020725389
## ---
## 116: 2G1 A:T>G:C -0.337048548
## 117: 2G1 A:T>T:A 0.077144286
## 118: 2G1 G:C>A:T -0.078876862
## 119: 2G1 G:C>C:G 0.016914943
## 120: 2G1 G:C>T:A 0.187814811
PT.spectra.dt[MA.spectra.dt][,.(isolate, SNV, delta_freq = freq-i.freq)][,.(difference = mean(delta_freq),logP = -log10(wilcox.test(delta_freq)$p.value)), by=SNV]
## SNV difference logP
## 1: A:T>C:G -0.005037704 0.05159177
## 2: A:T>G:C -0.099484202 1.37809475
## 3: A:T>T:A -0.001692194 0.32339325
## 4: G:C>A:T -0.015714808 0.17119647
## 5: G:C>C:G 0.008590900 1.23464401
## 6: G:C>T:A 0.113338008 3.49168321
wilcox.test(PT.spectra.dt[MA.spectra.dt]$freq-PT.spectra.dt[MA.spectra.dt]$i.freq)$p.value
## [1] 0.3219527
ggplot(PT.spectra.dt[MA.spectra.dt][,.(isolate, SNV, delta_freq = freq-i.freq)][,.(difference = mean(delta_freq),logP = -log10(wilcox.test(delta_freq)$p.value)), by=SNV],
aes(x=difference,y=logP,label=SNV,colour=SNV)) +
geom_point(size=3) +
geom_text_repel(colour="black",nudge_x=.053) +
scale_colour_brewer(type = "qual",palette = "Dark2", guide =F) +
scale_x_continuous(limits=c(-.15,.15)) +
labs(x="Mean PT-MA difference",y="-logP") +
theme_classic(base_size = 12)
PT.spectra.dt[MA.spectra.dt]
## SNV isolate value freq i.value i.freq
## 1: A:T>C:G 5G1b 35 0.18134715 19 0.136690647
## 2: A:T>G:C 5G1b 31 0.16062176 58 0.417266187
## 3: A:T>T:A 5G1b 31 0.16062176 14 0.100719424
## 4: G:C>A:T 5G1b 42 0.21761658 31 0.223021583
## 5: G:C>C:G 5G1b 4 0.02072539 0 0.000000000
## ---
## 116: A:T>G:C 2G1 30 0.16129032 150 0.498338870
## 117: A:T>T:A 2G1 23 0.12365591 14 0.046511628
## 118: G:C>A:T 2G1 36 0.19354839 82 0.272425249
## 119: G:C>C:G 2G1 5 0.02688172 3 0.009966777
## 120: G:C>T:A 2G1 51 0.27419355 26 0.086378738
PT.spectra.dt[PT.spectra.dt,allow.cartesian=T,on="SNV"][order(isolate,i.isolate)]
## SNV isolate value freq i.isolate i.value i.freq
## 1: A:T>C:G 5G1b 35 0.18134715 5G1b 35 0.18134715
## 2: A:T>G:C 5G1b 31 0.16062176 5G1b 31 0.16062176
## 3: A:T>T:A 5G1b 31 0.16062176 5G1b 31 0.16062176
## 4: G:C>A:T 5G1b 42 0.21761658 5G1b 42 0.21761658
## 5: G:C>C:G 5G1b 4 0.02072539 5G1b 4 0.02072539
## ---
## 3452: A:T>G:C 2G1 30 0.16129032 2G1 30 0.16129032
## 3453: A:T>T:A 2G1 23 0.12365591 2G1 23 0.12365591
## 3454: G:C>A:T 2G1 36 0.19354839 2G1 36 0.19354839
## 3455: G:C>C:G 2G1 5 0.02688172 2G1 5 0.02688172
## 3456: G:C>T:A 2G1 51 0.27419355 2G1 51 0.27419355
PT.spectra.pval.dt <-PT.spectra.dt[PT.spectra.dt,allow.cartesian=T,on="SNV"][,chisq.test(matrix(data = c(value,i.value),ncol = 2))$p.value,by=.(isolate,i.isolate)]
MA.spectra.pval.dt <-MA.spectra.dt[MA.spectra.dt,allow.cartesian=T,on="SNV"][,chisq.test(matrix(data = c(value,i.value),ncol = 2))$p.value,by=.(isolate,i.isolate)]
PTvsMA.spectra.pval.dt <-PT.spectra.dt[MA.spectra.dt,allow.cartesian=T,on="SNV"][,chisq.test(matrix(data = c(value,i.value),ncol = 2))$p.value,by=.(isolate,i.isolate)]
dcast(PT.spectra.pval.dt,isolate ~ i.isolate)
## Using 'V1' as value column. Use 'value.var' to override
## isolate 5G1b 5G1a 6G1b 2B2a 2B2b
## 1: 5G1b 1.000000e+00 9.349139e-01 9.688062e-01 1.524869e-02 1.987835e-02
## 2: 5G1a 9.349139e-01 1.000000e+00 5.975245e-01 1.653591e-03 2.576276e-03
## 3: 6G1b 9.688062e-01 5.975245e-01 1.000000e+00 5.143139e-02 5.599936e-02
## 4: 2B2a 1.524869e-02 1.653591e-03 5.143139e-02 1.000000e+00 8.260670e-01
## 5: 2B2b 1.987835e-02 2.576276e-03 5.599936e-02 8.260670e-01 1.000000e+00
## 6: 2B1 2.404062e-01 3.815275e-02 3.615634e-01 5.050247e-02 2.505466e-01
## 7: 8B1 3.143907e-03 3.214699e-03 7.639774e-04 2.338284e-03 5.154551e-04
## 8: 2B3 3.887974e-04 3.977783e-04 6.291931e-05 1.611130e-04 4.060590e-05
## 9: 7B1 1.926923e-05 2.408350e-05 1.786559e-06 3.390010e-06 8.511987e-07
## 10: 1G1 5.129370e-01 4.799516e-01 2.780501e-01 3.269489e-01 2.933917e-01
## 11: 6G1a 3.534372e-01 4.953403e-02 7.886153e-01 9.019921e-02 5.984663e-02
## 12: 2G2 1.393980e-01 6.366288e-02 1.490029e-01 1.316104e-07 2.594930e-07
## 13: 3G1 2.027142e-01 2.467556e-01 1.364252e-01 4.274990e-08 1.062865e-07
## 14: 1B1a 1.219244e-08 5.381779e-07 6.033561e-11 1.172237e-27 4.992083e-28
## 15: 1B1b 4.383228e-09 3.278957e-07 1.375868e-11 1.349314e-28 6.159104e-29
## 16: 3B1 4.174516e-20 1.278850e-15 1.494218e-24 5.894622e-54 1.648180e-51
## 17: 4G1d 1.551785e-02 1.346368e-03 1.087796e-01 6.655144e-01 3.078904e-01
## 18: 4G1c 1.282896e-02 1.243758e-03 9.172317e-02 5.086889e-01 2.371737e-01
## 19: 4G1a 1.833134e-02 1.774794e-03 1.290526e-01 3.613974e-01 2.013570e-01
## 20: 4G1b 5.258126e-05 1.772721e-06 9.897128e-04 8.043299e-02 1.239029e-02
## 21: 1G2 7.936768e-01 5.136746e-01 8.668955e-01 3.962396e-01 4.315178e-01
## 22: 3G3 3.773864e-01 1.036704e-01 8.008204e-01 3.762653e-03 2.041073e-03
## 23: 3G2 2.661662e-01 8.482486e-02 6.670256e-01 7.984447e-04 2.748562e-04
## 24: 2G1 8.309870e-01 6.790375e-01 5.954864e-01 3.381823e-04 2.322387e-04
## isolate 5G1b 5G1a 6G1b 2B2a 2B2b
## 2B1 8B1 2B3 7B1 1G1
## 1: 2.404062e-01 3.143907e-03 3.887974e-04 1.926923e-05 5.129370e-01
## 2: 3.815275e-02 3.214699e-03 3.977783e-04 2.408350e-05 4.799516e-01
## 3: 3.615634e-01 7.639774e-04 6.291931e-05 1.786559e-06 2.780501e-01
## 4: 5.050247e-02 2.338284e-03 1.611130e-04 3.390010e-06 3.269489e-01
## 5: 2.505466e-01 5.154551e-04 4.060590e-05 8.511987e-07 2.933917e-01
## 6: 1.000000e+00 8.485104e-04 1.265298e-04 5.248816e-06 3.381611e-01
## 7: 8.485104e-04 1.000000e+00 7.572787e-01 5.726745e-01 2.421688e-01
## 8: 1.265298e-04 7.572787e-01 NaN NaN 3.072073e-02
## 9: 5.248816e-06 5.726745e-01 NaN NaN 9.573314e-03
## 10: 3.381611e-01 2.421688e-01 3.072073e-02 9.573314e-03 1.000000e+00
## 11: 2.557498e-01 4.647781e-04 3.712193e-05 6.317315e-07 2.693280e-01
## 12: 5.759091e-03 2.929672e-06 1.288526e-07 7.060235e-10 1.910280e-02
## 13: 8.625983e-04 8.995986e-07 3.212174e-08 2.869536e-10 4.406905e-03
## 14: 1.327272e-16 4.902517e-06 2.406803e-07 3.818316e-11 5.080661e-02
## 15: 2.401037e-17 6.016314e-06 3.363978e-07 6.535096e-11 5.279367e-02
## 16: 2.068455e-31 1.357045e-14 3.707852e-16 2.091597e-22 9.222175e-05
## 17: 1.892381e-02 4.836448e-05 1.319652e-06 3.703457e-09 1.154916e-01
## 18: 1.093009e-02 2.508552e-05 6.630646e-07 1.543333e-09 1.166283e-01
## 19: 1.895904e-02 1.949674e-05 5.571333e-07 1.780492e-09 9.753207e-02
## 20: 1.204318e-04 3.039028e-05 9.850733e-07 1.764737e-09 1.356799e-01
## 21: 3.459355e-01 2.255912e-03 1.875796e-04 8.641687e-06 3.538293e-01
## 22: 5.823781e-02 7.574914e-06 1.360562e-07 5.116427e-10 1.398123e-02
## 23: 9.639617e-03 7.005500e-07 5.551896e-09 6.309959e-12 3.908427e-03
## 24: 1.627183e-02 3.145136e-03 4.910797e-04 2.300620e-05 6.626006e-01
## 2B1 8B1 2B3 7B1 1G1
## 6G1a 2G2 3G1 1B1a 1B1b
## 1: 3.534372e-01 1.393980e-01 2.027142e-01 1.219244e-08 4.383228e-09
## 2: 4.953403e-02 6.366288e-02 2.467556e-01 5.381779e-07 3.278957e-07
## 3: 7.886153e-01 1.490029e-01 1.364252e-01 6.033561e-11 1.375868e-11
## 4: 9.019921e-02 1.316104e-07 4.274990e-08 1.172237e-27 1.349314e-28
## 5: 5.984663e-02 2.594930e-07 1.062865e-07 4.992083e-28 6.159104e-29
## 6: 2.557498e-01 5.759091e-03 8.625983e-04 1.327272e-16 2.401037e-17
## 7: 4.647781e-04 2.929672e-06 8.995986e-07 4.902517e-06 6.016314e-06
## 8: 3.712193e-05 1.288526e-07 3.212174e-08 2.406803e-07 3.363978e-07
## 9: 6.317315e-07 7.060235e-10 2.869536e-10 3.818316e-11 6.535096e-11
## 10: 2.693280e-01 1.910280e-02 4.406905e-03 5.080661e-02 5.279367e-02
## 11: 1.000000e+00 1.593525e-02 4.652631e-03 5.895554e-17 6.669275e-18
## 12: 1.593525e-02 1.000000e+00 6.391007e-01 2.449398e-06 7.482776e-07
## 13: 4.652631e-03 6.391007e-01 1.000000e+00 1.707436e-07 7.497355e-08
## 14: 5.895554e-17 2.449398e-06 1.707436e-07 1.000000e+00 9.994785e-01
## 15: 6.669275e-18 7.482776e-07 7.497355e-08 9.994785e-01 1.000000e+00
## 16: 4.142097e-35 2.561961e-15 7.268410e-15 2.278003e-06 8.592549e-06
## 17: 3.803576e-01 4.242324e-07 2.057488e-07 3.484129e-32 2.994213e-33
## 18: 3.403489e-01 1.527924e-07 1.423124e-07 4.658362e-34 3.916566e-35
## 19: 4.788574e-01 1.090791e-06 8.796074e-07 1.732247e-31 1.384282e-32
## 20: 3.881222e-02 1.651289e-11 6.298062e-12 2.125414e-45 1.392457e-46
## 21: 3.949317e-01 7.380060e-03 9.358212e-03 2.734315e-12 7.237920e-13
## 22: 5.322772e-01 2.151378e-01 6.999627e-02 5.515349e-13 8.934373e-14
## 23: 3.117112e-01 1.416776e-01 6.390796e-02 2.976163e-14 4.759195e-15
## 24: 1.620287e-01 8.854705e-02 2.413217e-01 2.497864e-08 9.566429e-09
## 6G1a 2G2 3G1 1B1a 1B1b
## 3B1 4G1d 4G1c 4G1a 4G1b
## 1: 4.174516e-20 1.551785e-02 1.282896e-02 1.833134e-02 5.258126e-05
## 2: 1.278850e-15 1.346368e-03 1.243758e-03 1.774794e-03 1.772721e-06
## 3: 1.494218e-24 1.087796e-01 9.172317e-02 1.290526e-01 9.897128e-04
## 4: 5.894622e-54 6.655144e-01 5.086889e-01 3.613974e-01 8.043299e-02
## 5: 1.648180e-51 3.078904e-01 2.371737e-01 2.013570e-01 1.239029e-02
## 6: 2.068455e-31 1.892381e-02 1.093009e-02 1.895904e-02 1.204318e-04
## 7: 1.357045e-14 4.836448e-05 2.508552e-05 1.949674e-05 3.039028e-05
## 8: 3.707852e-16 1.319652e-06 6.630646e-07 5.571333e-07 9.850733e-07
## 9: 2.091597e-22 3.703457e-09 1.543333e-09 1.780492e-09 1.764737e-09
## 10: 9.222175e-05 1.154916e-01 1.166283e-01 9.753207e-02 1.356799e-01
## 11: 4.142097e-35 3.803576e-01 3.403489e-01 4.788574e-01 3.881222e-02
## 12: 2.561961e-15 4.242324e-07 1.527924e-07 1.090791e-06 1.651289e-11
## 13: 7.268410e-15 2.057488e-07 1.423124e-07 8.796074e-07 6.298062e-12
## 14: 2.278003e-06 3.484129e-32 4.658362e-34 1.732247e-31 2.125414e-45
## 15: 8.592549e-06 2.994213e-33 3.916566e-35 1.384282e-32 1.392457e-46
## 16: 1.000000e+00 5.463871e-60 3.197277e-62 1.105915e-57 2.323230e-78
## 17: 5.463871e-60 1.000000e+00 9.987691e-01 9.899509e-01 4.496617e-01
## 18: 3.197277e-62 9.987691e-01 1.000000e+00 9.991280e-01 5.170326e-01
## 19: 1.105915e-57 9.899509e-01 9.991280e-01 1.000000e+00 5.562112e-01
## 20: 2.323230e-78 4.496617e-01 5.170326e-01 5.562112e-01 1.000000e+00
## 21: 4.192463e-27 3.001195e-01 2.540580e-01 2.397194e-01 4.982540e-03
## 22: 9.678001e-30 1.924931e-02 1.019030e-02 2.126999e-02 2.562950e-05
## 23: 1.289819e-32 5.709164e-03 2.851758e-03 6.710227e-03 1.917749e-06
## 24: 4.781325e-19 7.081954e-04 7.494697e-04 1.548128e-03 2.987067e-06
## 3B1 4G1d 4G1c 4G1a 4G1b
## 1G2 3G3 3G2 2G1
## 1: 7.936768e-01 3.773864e-01 2.661662e-01 8.309870e-01
## 2: 5.136746e-01 1.036704e-01 8.482486e-02 6.790375e-01
## 3: 8.668955e-01 8.008204e-01 6.670256e-01 5.954864e-01
## 4: 3.962396e-01 3.762653e-03 7.984447e-04 3.381823e-04
## 5: 4.315178e-01 2.041073e-03 2.748562e-04 2.322387e-04
## 6: 3.459355e-01 5.823781e-02 9.639617e-03 1.627183e-02
## 7: 2.255912e-03 7.574914e-06 7.005500e-07 3.145136e-03
## 8: 1.875796e-04 1.360562e-07 5.551896e-09 4.910797e-04
## 9: 8.641687e-06 5.116427e-10 6.309959e-12 2.300620e-05
## 10: 3.538293e-01 1.398123e-02 3.908427e-03 6.626006e-01
## 11: 3.949317e-01 5.322772e-01 3.117112e-01 1.620287e-01
## 12: 7.380060e-03 2.151378e-01 1.416776e-01 8.854705e-02
## 13: 9.358212e-03 6.999627e-02 6.390796e-02 2.413217e-01
## 14: 2.734315e-12 5.515349e-13 2.976163e-14 2.497864e-08
## 15: 7.237920e-13 8.934373e-14 4.759195e-15 9.566429e-09
## 16: 4.192463e-27 9.678001e-30 1.289819e-32 4.781325e-19
## 17: 3.001195e-01 1.924931e-02 5.709164e-03 7.081954e-04
## 18: 2.540580e-01 1.019030e-02 2.851758e-03 7.494697e-04
## 19: 2.397194e-01 2.126999e-02 6.710227e-03 1.548128e-03
## 20: 4.982540e-03 2.562950e-05 1.917749e-06 2.987067e-06
## 21: 1.000000e+00 3.227614e-01 2.228573e-01 1.844926e-01
## 22: 3.227614e-01 1.000000e+00 9.974790e-01 9.239711e-02
## 23: 2.228573e-01 9.974790e-01 1.000000e+00 6.612473e-02
## 24: 1.844926e-01 9.239711e-02 6.612473e-02 1.000000e+00
## 1G2 3G3 3G2 2G1
dcast(MA.spectra.pval.dt,isolate ~ i.isolate)
## Using 'V1' as value column. Use 'value.var' to override
## isolate 5G1b 5G1a 2B2a 2B2b 2B1
## 1: 5G1b NaN NaN 2.329194e-04 NaN 9.682551e-02
## 2: 5G1a NaN NaN 5.315323e-02 NaN 2.808922e-01
## 3: 2B2a 2.329194e-04 5.315323e-02 1.000000e+00 4.348749e-01 9.191913e-03
## 4: 2B2b NaN NaN 4.348749e-01 NaN 1.542864e-01
## 5: 2B1 9.682551e-02 2.808922e-01 9.191913e-03 1.542864e-01 1.000000e+00
## 6: 2B3 4.245329e-07 7.123851e-04 6.158249e-04 7.842315e-05 2.812776e-01
## 7: 7B1 NaN NaN 5.082750e-01 NaN 9.563633e-01
## 8: 2G2 NaN NaN 8.354934e-11 NaN 2.571156e-02
## 9: 3G1 3.254822e-03 8.630508e-02 6.243772e-02 4.540034e-01 4.580641e-01
## 10: 1B1a 3.492163e-05 1.003886e-02 7.059109e-04 3.274400e-03 7.103096e-02
## 11: 1B1b 3.026118e-13 2.454219e-06 1.618892e-04 1.413948e-04 4.635797e-04
## 12: 3B1 1.298090e-17 2.582877e-10 1.006755e-17 1.566370e-12 2.061579e-07
## 13: 4G1d NaN NaN 2.049361e-05 NaN 1.533985e-02
## 14: 4G1c NaN NaN 2.381914e-14 NaN 1.303989e-03
## 15: 4G1a 2.261058e-07 5.949174e-05 4.959379e-16 1.007677e-06 5.327518e-02
## 16: 4G1b NaN NaN 2.934877e-02 NaN 8.322826e-01
## 17: 1G2 NaN NaN 9.655994e-02 NaN 1.055563e-02
## 18: 3G3 3.294290e-03 4.530645e-03 7.534511e-13 1.377871e-05 9.781867e-04
## 19: 3G2 NaN NaN 2.805301e-05 NaN 3.179531e-04
## 20: 2G1 4.052250e-02 1.223943e-02 6.274110e-08 3.434709e-03 2.805723e-03
## 2B3 7B1 2G2 3G1 1B1a 1B1b
## 1: 4.245329e-07 NaN NaN 3.254822e-03 3.492163e-05 3.026118e-13
## 2: 7.123851e-04 NaN NaN 8.630508e-02 1.003886e-02 2.454219e-06
## 3: 6.158249e-04 0.50827501 8.354934e-11 6.243772e-02 7.059109e-04 1.618892e-04
## 4: 7.842315e-05 NaN NaN 4.540034e-01 3.274400e-03 1.413948e-04
## 5: 2.812776e-01 0.95636326 2.571156e-02 4.580641e-01 7.103096e-02 4.635797e-04
## 6: NaN NaN 9.331034e-05 2.409050e-03 8.462851e-02 7.989034e-07
## 7: NaN NaN NaN 9.040959e-01 5.680103e-01 4.763228e-01
## 8: 9.331034e-05 NaN NaN 8.755415e-04 9.334026e-02 5.777656e-08
## 9: 2.409050e-03 0.90409587 8.755415e-04 1.000000e+00 1.146653e-02 1.788149e-03
## 10: 8.462851e-02 0.56801035 9.334026e-02 1.146653e-02 1.000000e+00 3.677758e-02
## 11: 7.989034e-07 0.47632285 5.777656e-08 1.788149e-03 3.677758e-02 1.000000e+00
## 12: 8.814641e-06 0.03500296 1.509321e-06 2.470711e-13 1.049110e-01 4.350354e-10
## 13: 7.565115e-06 NaN NaN 1.276270e-03 7.002015e-02 6.487760e-06
## 14: 1.724890e-04 NaN NaN 2.836844e-06 3.887768e-02 3.151812e-11
## 15: 2.455860e-03 0.88117721 1.955396e-02 1.307276e-06 2.160523e-04 1.213196e-16
## 16: 2.350238e-02 NaN NaN 6.628190e-01 1.428087e-01 6.010342e-03
## 17: 2.272302e-05 NaN NaN 3.960443e-03 3.205490e-05 3.557155e-09
## 18: 7.300444e-03 0.15305444 6.348760e-18 9.867396e-12 9.180923e-09 1.311615e-29
## 19: 8.081844e-06 NaN NaN 1.017013e-06 4.949748e-08 3.248997e-17
## 20: 1.581824e-03 0.36882357 3.685098e-18 7.140353e-08 1.276755e-09 1.112248e-24
## 3B1 4G1d 4G1c 4G1a 4G1b
## 1: 1.298090e-17 NaN NaN 2.261058e-07 NaN
## 2: 2.582877e-10 NaN NaN 5.949174e-05 NaN
## 3: 1.006755e-17 2.049361e-05 2.381914e-14 4.959379e-16 2.934877e-02
## 4: 1.566370e-12 NaN NaN 1.007677e-06 NaN
## 5: 2.061579e-07 1.533985e-02 1.303989e-03 5.327518e-02 8.322826e-01
## 6: 8.814641e-06 7.565115e-06 1.724890e-04 2.455860e-03 2.350238e-02
## 7: 3.500296e-02 NaN NaN 8.811772e-01 NaN
## 8: 1.509321e-06 NaN NaN 1.955396e-02 NaN
## 9: 2.470711e-13 1.276270e-03 2.836844e-06 1.307276e-06 6.628190e-01
## 10: 1.049110e-01 7.002015e-02 3.887768e-02 2.160523e-04 1.428087e-01
## 11: 4.350354e-10 6.487760e-06 3.151812e-11 1.213196e-16 6.010342e-03
## 12: 1.000000e+00 3.778790e-06 5.966340e-05 5.562571e-11 2.962644e-06
## 13: 3.778790e-06 NaN NaN 5.849945e-05 NaN
## 14: 5.966340e-05 NaN NaN 2.873488e-02 NaN
## 15: 5.562571e-11 5.849945e-05 2.873488e-02 1.000000e+00 6.019479e-02
## 16: 2.962644e-06 NaN NaN 6.019479e-02 NaN
## 17: 4.998926e-17 NaN NaN 6.107294e-11 NaN
## 18: 2.121175e-27 2.575983e-11 7.957685e-20 2.781901e-17 1.500693e-04
## 19: 5.035201e-22 NaN NaN 2.833432e-13 NaN
## 20: 1.421551e-33 1.689209e-12 4.448014e-21 1.156384e-19 6.858215e-04
## 1G2 3G3 3G2 2G1
## 1: NaN 3.294290e-03 NaN 4.052250e-02
## 2: NaN 4.530645e-03 NaN 1.223943e-02
## 3: 9.655994e-02 7.534511e-13 2.805301e-05 6.274110e-08
## 4: NaN 1.377871e-05 NaN 3.434709e-03
## 5: 1.055563e-02 9.781867e-04 3.179531e-04 2.805723e-03
## 6: 2.272302e-05 7.300444e-03 8.081844e-06 1.581824e-03
## 7: NaN 1.530544e-01 NaN 3.688236e-01
## 8: NaN 6.348760e-18 NaN 3.685098e-18
## 9: 3.960443e-03 9.867396e-12 1.017013e-06 7.140353e-08
## 10: 3.205490e-05 9.180923e-09 4.949748e-08 1.276755e-09
## 11: 3.557155e-09 1.311615e-29 3.248997e-17 1.112248e-24
## 12: 4.998926e-17 2.121175e-27 5.035201e-22 1.421551e-33
## 13: NaN 2.575983e-11 NaN 1.689209e-12
## 14: NaN 7.957685e-20 NaN 4.448014e-21
## 15: 6.107294e-11 2.781901e-17 2.833432e-13 1.156384e-19
## 16: NaN 1.500693e-04 NaN 6.858215e-04
## 17: NaN 4.240471e-03 NaN 2.397349e-01
## 18: 4.240471e-03 1.000000e+00 8.144742e-02 4.934277e-02
## 19: NaN 8.144742e-02 NaN 7.066273e-01
## 20: 2.397349e-01 4.934277e-02 7.066273e-01 1.000000e+00
dcast(PTvsMA.spectra.pval.dt,isolate ~ i.isolate)
## Using 'V1' as value column. Use 'value.var' to override
## isolate 5G1b 5G1a 2B2a 2B2b 2B1
## 1: 5G1b 3.185055e-06 1.412074e-02 1.121191e-06 8.358178e-03 4.293127e-01
## 2: 5G1a 2.544259e-07 1.625780e-03 1.605713e-07 1.570537e-03 3.466166e-01
## 3: 6G1b 4.731188e-05 7.013782e-02 1.428750e-05 4.037808e-02 4.066520e-01
## 4: 2B2a 3.527046e-04 1.062972e-01 1.435416e-02 4.156633e-01 1.006759e-01
## 5: 2B2b 5.231326e-04 9.795970e-02 2.730312e-04 2.499545e-01 1.262357e-01
## 6: 2B1 2.188225e-05 4.641625e-02 1.270167e-07 1.345403e-02 1.180695e-01
## 7: 8B1 1.262194e-07 1.997230e-04 5.646063e-05 2.676773e-04 3.404208e-03
## 8: 2B3 7.132611e-09 2.160979e-05 7.856221e-07 1.575802e-05 5.165790e-04
## 9: 7B1 1.880419e-10 1.652119e-06 3.347229e-09 1.196073e-06 7.276136e-05
## 10: 1G1 1.402112e-03 4.978863e-02 1.930753e-02 1.409918e-02 8.979154e-01
## 11: 6G1a 1.169320e-03 3.472597e-01 8.474982e-05 1.253657e-01 3.144066e-01
## 12: 2G2 3.402271e-10 8.327167e-04 9.064400e-13 2.909229e-05 4.179656e-02
## 13: 3G1 5.959752e-10 4.950401e-04 8.575240e-13 2.856050e-05 6.542536e-02
## 14: 1B1a 8.545720e-36 3.271680e-16 3.024220e-34 2.590310e-17 2.872800e-05
## 15: 1B1b 2.931325e-37 2.768413e-17 2.668991e-35 2.830783e-18 1.486416e-05
## 16: 3B1 2.635981e-59 6.083773e-31 8.927357e-65 1.283509e-34 1.185754e-10
## 17: 4G1d 5.561156e-03 4.377635e-01 1.110550e-02 7.046951e-01 1.757833e-01
## 18: 4G1c 1.256696e-02 5.331308e-01 7.774735e-03 7.535217e-01 2.398383e-01
## 19: 4G1a 2.311704e-02 6.413869e-01 4.184261e-03 7.136695e-01 2.583898e-01
## 20: 4G1b 7.279233e-02 7.200585e-01 9.027863e-03 7.595482e-01 8.317685e-02
## 21: 1G2 1.024362e-04 5.057477e-02 4.087977e-04 1.082752e-01 4.012659e-01
## 22: 3G3 2.303829e-06 4.527218e-02 1.999919e-06 2.512142e-02 1.045023e-01
## 23: 3G2 4.098572e-07 2.784436e-02 7.829325e-07 1.914551e-02 8.772617e-02
## 24: 2G1 1.204876e-06 1.075540e-02 8.001951e-08 2.034032e-03 5.150039e-01
## isolate 5G1b 5G1a 2B2a 2B2b 2B1
## 2B3 7B1 2G2 3G1 1B1a 1B1b
## 1: 3.328139e-02 0.8982690366 2.300141e-02 4.827954e-02 0.43835185 9.823354e-06
## 2: 2.151599e-02 0.8828767580 8.224819e-02 4.303813e-02 0.47338488 2.226052e-05
## 3: 8.369651e-03 0.9004838793 6.708663e-03 7.782511e-02 0.30917615 1.095444e-05
## 4: 6.222115e-02 0.8304645105 2.801958e-05 2.347390e-01 0.01784768 2.983368e-06
## 5: 3.781072e-02 0.8838661809 6.304373e-04 1.201427e-01 0.01110026 1.831070e-08
## 6: 3.023660e-02 0.8784430285 1.119181e-02 3.093482e-03 0.13053338 4.389006e-09
## 7: 4.595397e-01 0.1499801583 2.908298e-04 3.689740e-04 0.15700770 1.291944e-04
## 8: NaN 0.0045856031 7.631276e-05 1.870543e-05 0.05251756 1.062100e-06
## 9: NaN 0.0008913283 1.789677e-05 9.299380e-07 0.01838332 7.672293e-09
## 10: 6.999858e-01 0.9130698145 4.123940e-02 1.260305e-01 0.62027468 1.333423e-03
## 11: 1.248720e-02 0.8613940801 1.360635e-04 2.595034e-02 0.17469741 3.135915e-07
## 12: 1.451337e-05 0.8065776966 2.105768e-03 1.122437e-05 0.54056435 8.220078e-09
## 13: 3.583163e-07 0.8406228516 9.514424e-03 7.011489e-05 0.34919691 6.516494e-09
## 14: 1.946488e-04 0.3902193272 1.929646e-07 1.526917e-15 0.88256332 3.307811e-16
## 15: 2.023972e-04 0.3632511313 2.407593e-07 3.909811e-16 0.89408439 1.615780e-16
## 16: 1.132568e-10 0.1178207908 3.198295e-08 2.110445e-32 0.19306291 1.752397e-37
## 17: 3.263498e-03 0.8517820276 5.004186e-06 2.126198e-01 0.02035657 1.580858e-07
## 18: 2.816178e-03 0.8657749828 4.205551e-06 2.362728e-01 0.01516164 2.614123e-08
## 19: 1.721707e-03 0.8709408132 7.778508e-06 1.725212e-01 0.01788889 1.740949e-08
## 20: 9.490161e-03 0.7624457068 4.732093e-09 1.881255e-02 0.00358503 5.540085e-11
## 21: 2.037875e-02 0.9131844053 1.393694e-02 4.233760e-01 0.15102365 1.501987e-04
## 22: 1.268558e-05 0.8508335833 4.193515e-04 1.444594e-02 0.23657703 1.207090e-05
## 23: 7.031341e-07 0.8437367618 1.402457e-04 1.523783e-02 0.19365757 1.248193e-05
## 24: 5.824894e-02 0.8536120297 2.021537e-03 8.198261e-03 0.50010861 5.830760e-07
## 2B3 7B1 2G2 3G1 1B1a 1B1b
## 3B1 4G1d 4G1c 4G1a 4G1b
## 1: 2.700457e-08 5.567167e-02 1.711735e-04 2.720740e-05 7.418636e-01
## 2: 4.763595e-08 7.586571e-03 8.095476e-04 4.299898e-05 5.518726e-01
## 3: 7.890708e-09 1.570348e-01 3.527979e-05 8.855078e-06 8.421781e-01
## 4: 6.955438e-15 1.537433e-02 1.220711e-07 7.892125e-09 3.359317e-01
## 5: 2.309249e-15 2.636622e-02 8.868404e-06 2.079096e-06 4.414266e-01
## 6: 1.183058e-09 4.060696e-01 4.577685e-04 1.372159e-04 5.164940e-01
## 7: 1.051659e-05 1.584179e-04 1.028711e-04 2.973536e-08 3.999615e-03
## 8: 1.228531e-05 1.930001e-05 1.542802e-04 1.496365e-08 5.986744e-04
## 9: 1.921113e-06 2.326366e-06 5.376220e-05 1.717287e-09 1.070901e-04
## 10: 7.323495e-03 4.955758e-03 4.557018e-02 1.631485e-01 3.684512e-01
## 11: 6.941648e-10 2.498987e-01 4.954783e-07 2.131483e-07 7.510267e-01
## 12: 3.179834e-05 1.420964e-01 2.332575e-05 3.772046e-07 3.346734e-01
## 13: 1.581303e-06 2.522402e-02 6.972338e-05 1.839396e-06 4.554932e-01
## 14: 2.014782e-02 1.365603e-09 1.388655e-09 1.296519e-17 1.307099e-04
## 15: 2.428560e-02 2.592389e-10 2.256040e-09 7.687251e-18 6.136205e-05
## 16: 3.854299e-01 5.733489e-19 1.739940e-07 3.134026e-20 1.496520e-09
## 17: 2.986492e-15 2.886577e-02 6.831357e-09 8.364029e-10 5.673147e-01
## 18: 4.386724e-16 1.779161e-02 4.824700e-09 1.062857e-09 6.417208e-01
## 19: 3.175133e-15 3.121915e-02 1.156179e-08 4.117866e-09 6.980579e-01
## 20: 2.370948e-18 2.651610e-03 3.918440e-12 4.395419e-13 2.592651e-01
## 21: 2.425999e-10 5.287815e-02 9.112672e-05 1.762806e-05 7.777173e-01
## 22: 1.988935e-08 2.594306e-01 8.536372e-07 2.393693e-08 6.468409e-01
## 23: 7.124589e-09 1.328350e-01 1.264797e-07 1.888441e-09 6.329953e-01
## 24: 5.032927e-08 1.292797e-02 7.929629e-06 2.176646e-06 6.402667e-01
## 3B1 4G1d 4G1c 4G1a 4G1b
## 1G2 3G3 3G2 2G1
## 1: 6.022979e-07 1.597316e-17 2.353161e-12 2.642402e-17
## 2: 3.972108e-08 6.721966e-19 8.976560e-14 2.502188e-18
## 3: 1.063254e-05 5.146226e-16 9.341964e-11 1.239686e-15
## 4: 1.873697e-03 1.234684e-15 6.216024e-08 4.124491e-12
## 5: 5.244105e-04 1.362566e-15 3.213555e-08 7.457772e-13
## 6: 3.416779e-06 1.629218e-16 1.223661e-10 4.455397e-17
## 7: 1.051495e-05 2.145324e-10 6.819468e-08 1.761543e-10
## 8: 4.098330e-07 1.879515e-11 3.359428e-09 4.323408e-12
## 9: 1.011410e-08 1.835900e-14 4.909911e-11 2.080229e-15
## 10: 2.483215e-03 3.179414e-02 3.705167e-04 1.815656e-02
## 11: 3.398198e-04 5.446664e-13 2.013604e-08 5.883587e-13
## 12: 8.218510e-11 6.096974e-26 7.055411e-18 8.346034e-29
## 13: 5.576471e-11 1.581202e-25 4.571979e-18 9.330995e-28
## 14: 1.277760e-31 2.075305e-75 1.843625e-50 2.492973e-77
## 15: 4.828281e-33 5.001328e-77 3.008564e-52 9.395959e-79
## 16: 3.485755e-56 7.416757e-109 8.855616e-81 1.555432e-115
## 17: 8.997242e-03 3.501785e-14 4.613311e-07 3.234371e-11
## 18: 1.251964e-02 1.680551e-13 9.437576e-07 1.427406e-10
## 19: 1.377691e-02 2.561250e-12 2.061479e-06 6.023616e-10
## 20: 1.319406e-01 1.288165e-09 2.198072e-04 5.541483e-07
## 21: 4.807293e-05 1.907395e-15 8.505714e-10 1.905203e-13
## 22: 2.606714e-06 8.787301e-21 1.623145e-12 2.269247e-20
## 23: 8.729562e-07 1.167094e-23 8.715689e-14 8.780920e-23
## 24: 2.029376e-07 6.011658e-17 7.482682e-13 6.861934e-18
## 1G2 3G3 3G2 2G1
ggplot(PT.spectra.pval.dt,aes(x=isolate,y=i.isolate,fill=-log10(V1)))+ geom_tile() + scale_fill_viridis_c() + labs(title = "Similarity between mutation spectra, PT data",fill ="-log10(P)")
ggplot(MA.spectra.pval.dt,aes(x=isolate,y=i.isolate,fill=-log10(V1)))+ geom_tile() + scale_fill_viridis_c() + labs(title = "Similarity between mutation spectra, MA data",fill ="-log10(P)")
ggplot(PTvsMA.spectra.pval.dt,aes(x=isolate,y=i.isolate,fill=-log10(V1)))+ geom_tile() + scale_fill_viridis_c() + labs(x="PT spectra", y="MA spectra", title = "Similarity between mutation spectra, PT vs MA data",fill ="-log10(P)")
WT=c("2B3","7B1","8B1","1G1")
mutMY=c("1B1a","1B1b","3B1")
mutacc<-data.table(read_xlsx("Table_mutational_spectra.xlsx",range = "B9:Z15",col_names = T))
Patient_all<-data.table(read_xlsx("Table_mutational_spectra.xlsx",range = "B2:Z8",col_names = T))
#I am removing the values of 2B3 and 7B1 in mutacc as they are too few mutations
mutacc$"2B3"=c(NA,NA,NA,NA,NA,NA)
mutacc$"7B1"=c(NA,NA,NA,NA,NA,NA)
colnames(mutacc)[1]<-"SNV"
colnames(Patient_all)[1]<-"SNV"
mutacc[,exp:="MA"]
Patient_all[,exp:="PT"]
final_table<-rbindlist(
list(
melt(mutacc,id.vars = c("SNV","exp"),variable.name = "iso", value.name = "count"),
melt(Patient_all,id.vars = c("SNV","exp"),variable.name = "iso", value.name = "count"))
)
total=c()
for (i in 1:length(row.names(final_table))){
#print(i)
total=c(total,sum(final_table[(final_table$iso==final_table$iso[i])&(final_table$exp==final_table$exp[i]),]$count))
}
final_table$tot=total
#Creating the table with all the frequencies:
d<-as.data.frame(final_table[,.(AT_GC=sum(.SD[SNV %in% c("A:T>G:C"),count/tot]),
GC_AT=sum(.SD[SNV %in% c("G:C>A:T"),count/tot]),
AT_TA=sum(.SD[SNV %in% c("A:T>T:A"),count/tot]),
AT_CG=sum(.SD[SNV %in% c("A:T>C:G"),count/tot]),
GC_TA=sum(.SD[SNV %in% c("G:C>T:A"),count/tot]),
GC_CG=sum(.SD[SNV %in% c("G:C>C:G"),count/tot]),tot=sum(count))
, by=c("iso","exp")])
d$mut=ifelse(d$iso %in% WT,"WT",ifelse(d$iso %in% mutMY,"mutMY","dnaQ"))
#here I am removing the values of 2B3 and 7B1 isolates in mutacc for the purpose of the figure
#Preparing the different dataset for all the independant frequency comparisons.
AT_GC.P<-d[d$exp=="PT",][,"AT_GC"]
AT_GC.M<-d[d$exp=="MA",][,"AT_GC"]
GC_AT.P<-d[d$exp=="PT",][,"GC_AT"]
GC_AT.M<-d[d$exp=="MA",][,"GC_AT"]
AT_GC.P<-d[d$exp=="PT",][,"AT_GC"]
AT_GC.M<-d[d$exp=="MA",][,"AT_GC"]
AT_TA.P<-d[d$exp=="PT",][,"AT_TA"]
AT_TA.M<-d[d$exp=="MA",][,"AT_TA"]
AT_CG.P<-d[d$exp=="PT",][,"AT_CG"]
AT_CG.M<-d[d$exp=="MA",][,"AT_CG"]
GC_TA.P<-d[d$exp=="PT",][,"GC_TA"]
GC_TA.M<-d[d$exp=="MA",][,"GC_TA"]
Ti.P <-d[d$exp=="PT",][,"AT_GC"]+d[d$exp=="PT",][,"GC_AT"]
Ti.M <-d[d$exp=="MA",][,"AT_GC"]+d[d$exp=="MA",][,"GC_AT"]
Tv.P <-d[d$exp=="PT",][,"AT_TA"]+d[d$exp=="PT",][,"AT_CG"]+d[d$exp=="PT",][,"GC_TA"]+d[d$exp=="PT",][,"GC_CG"]
Tv.M <-d[d$exp=="MA",][,"AT_TA"]+d[d$exp=="MA",][,"AT_CG"]+d[d$exp=="MA",][,"GC_TA"]+d[d$exp=="MA",][,"GC_CG"]
GC.P <-d[d$exp=="PT",][,"GC_AT"]+d[d$exp=="PT",][,"GC_TA"]+d[d$exp=="PT",][,"GC_CG"]
GC.M <-d[d$exp=="MA",][,"GC_AT"]+d[d$exp=="MA",][,"GC_TA"]+d[d$exp=="MA",][,"GC_CG"]
###### Figure 5a: transition vs GC dotplot#####
###############################################
plot_seg=data.frame(AT_GC.P,AT_GC.M,GC_AT.P,GC_AT.M)
d_filt=d[d$iso!="1G1",]
pdf("Figure_5a.pdf",width=6,height = 6,family = "Arial")
ggplot(d_filt,aes(x=GC_AT+GC_TA+GC_CG, y=AT_GC+GC_AT))+
geom_encircle(expand=0.05,aes(group=mut),fill="grey93",colour="lightgrey")+
geom_point(aes(fill=exp,size=tot),pch=21,colour="black")+
geom_segment(data = plot_seg, aes(x=GC.P,xend=GC.M, y=Ti.P, yend=Ti.M),linetype = 3, colour="grey50", size=0.5)+
labs(x='G:C substitutions', y="Transitions",size="")+
scale_size(limits=c(15,1500), breaks = c(50,500,1500) ) +
scale_x_continuous(limits=c(0,1), breaks = seq(0,1,.2), labels = scales::percent_format()) +
scale_y_continuous(limits=c(0,1), breaks = seq(0,1,.2), labels = scales::percent_format()) +
theme_classic(base_size = 18) +
theme(panel.grid.minor = element_blank(),
legend.position = c(.95, .95),
legend.justification = c("right", "top"),
legend.box.just = "right",
legend.margin = margin(6, 6, 6, 6))+
scale_fill_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"="navyblue","PT"="red3"),guide=FALSE)
dev.off()
## quartz_off_screen
## 2
pdf("Figure_5a_&_5b.pdf",width=9,height = 6,family = "Arial")
Fig5a<-ggplot(d_filt,aes(x=GC_AT+GC_TA+GC_CG, y=AT_GC+GC_AT))+
geom_encircle(expand=0.05,aes(group=mut),fill="grey93",colour="lightgrey")+
geom_point(aes(fill=exp,size=tot),pch=21,colour="black")+
geom_segment(data = plot_seg, aes(x=GC.P,xend=GC.M, y=Ti.P, yend=Ti.M),linetype = 3, colour="grey50", size=0.5)+
labs(x='G:C substitutions (%)', y="Transitions (%)",size="",tag="a")+
scale_size(name="SNV",limits=c(15,1500), breaks = c(50,500,1500) ) +
scale_x_continuous(limits=c(0-0.05,1), breaks = seq(0,1,.2), expand = c(0, 0), labels =function(x)x*100 ) +
scale_y_continuous(limits=c(0-0.05,1), breaks = seq(0,1,.2), expand = c(0, 0), labels = function(x)x*100) +
theme_classic(base_size = 18) +
theme(panel.grid.minor = element_blank(),
legend.position = c(1, 1),
legend.justification = c("right", "top"),
legend.box.just = "right",
legend.margin = margin(6, 6, 6, 6),
legend.background = element_blank(),
plot.margin = unit(c(10,25,10,25), "pt"),
plot.tag=element_text(face="bold"))+
scale_fill_manual(name="",labels = c("MA","Patient") ,values = c("MA"="navyblue","PT"="red3"))+
guides(fill = guide_legend(override.aes = list(size=4)),size=guide_legend(override.aes = list(fill="grey"))) +
annotate(geom="text",label = "DnaQ WT", x =0.9, y = 0.25,size=4.5, colour = "black")+
annotate(geom="text",label = "DnaQ E9G", x =0.3, y = 0.9,size=4.5, colour = "black")+
annotate(geom="text",label = "BER + DnaQ E9G", x =0.45, y = 0.025,size=4.5, colour = "black")
Fig5b<-ggplot(PT.spectra.dt[MA.spectra.dt][,.(isolate, SNV, delta_freq = freq-i.freq)][,.(difference = mean(delta_freq),logP = -log10(wilcox.test(delta_freq)$p.value)), by=SNV],
aes(x=difference,y=logP,label=SNV,colour=SNV)) +
geom_point(size=4) +
#geom_text_repel(colour="black",segment.color = "black",cex=4.5,box.padding = unit(0.8, "lines"),text.padding=unit(0.5,"lines")) + #,nudge_x=.053
scale_colour_brewer(type = "qual",palette = "Dark2", guide =F) +
scale_x_continuous(limits=c(-.15,.15),breaks=c(-0.1,0,0.1) ,labels=c(-10,0,10),expand = c(0, 0)) +
scale_y_continuous(limits=c(0-(0.05*4),4),expand = c(0, 0))+
labs(x="PT-MA difference (%)",y="-logP",tag="b") +
annotate(geom="text",label = "A:T>C:G", x =0.07, y = 0.05,size=4.5, colour = "black")+
annotate(geom="text",label = "G:C>A:T", x =-0.085, y = 0.2,size=4.5, colour = "black")+
annotate(geom="text",label = "A:T>T:A", x =0, y = 0.5,size=4.5, colour = "black")+
annotate(geom="text",label = "G:C>C:G", x =0.01, y = 1.1,size=4.5, colour = "black")+
annotate(geom="text",label = "A:T>G:C", x =-0.085, y = 1.55,size=4.5, colour = "black")+
annotate(geom="text",label = "G:C>T:A", x =0.09, y = 3.3,size=4.5, colour = "black")+
theme_classic(base_size = 18)+
theme(plot.margin = unit(c(10,25,10,10), "pt"),
plot.tag=element_text(face="bold"))
hlay <- rbind(c(1,1,2),
c(1,1,2),
c(1,1,2),
c(1,1,2))
grid.arrange(Fig5a,Fig5b,layout_matrix=hlay)
dev.off()
## quartz_off_screen
## 2
grid.arrange(Fig5a,Fig5b,layout_matrix=hlay)
#####Figure S9: pairwise comparisons of mutation frequency for each substitution#####
#####################################################################################
pdf("Fig_S9.pdf",width=10,height = 10,family = "Arial")
#1 1-2 AT_GC GC_AT
plot12=data.frame(AT_GC.P,AT_GC.M,GC_AT.P,GC_AT.M)
p12<-ggplot(d,aes(x=AT_GC, y=GC_AT))+
geom_point(aes(shape=exp,colour=mut))+
geom_segment(data = plot12, aes(x=AT_GC.P,xend=AT_GC.M, y=GC_AT.P, yend=GC_AT.M),linetype = 3, colour="grey50", size=0.5)+
labs(x="",y="G:C>A:T")+
scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
theme_classic(base_size = 18) +
theme(panel.grid.minor = element_blank(),legend.position = "none") +
scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))
#2 1-3 AT_GC AT_TA
plot13=data.frame(AT_GC.P,AT_GC.M,AT_TA.P,AT_TA.M)
p13<-ggplot(d, aes(x=AT_GC, y=AT_TA)) +
geom_point(aes(shape=exp,colour=mut))+
geom_segment(data = plot13, aes(x=AT_GC.P,xend=AT_GC.M, y=AT_TA.P, yend=AT_TA.M),linetype = 3, colour="grey50", size=0.5)+
labs(x="",y="A:T>T:A")+ scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
theme_classic(base_size = 18) +
theme(panel.grid.minor = element_blank(),legend.position = "none") +
scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))
#3 1-4 AT_GC AT_CG
plot2=data.frame(AT_GC.P,AT_GC.M,AT_CG.P,AT_CG.M)
p14<-ggplot(d, aes(x=AT_GC, y=AT_CG)) +
geom_point(aes(shape=exp,colour=mut))+
geom_segment(data = plot2, aes(x=AT_GC.P,xend=AT_GC.M, y=AT_CG.P, yend=AT_CG.M),linetype = 3, colour="grey50", size=0.5)+
labs(x="",y="A:T>C:G")+ scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
theme_classic(base_size = 18) +
theme(panel.grid.minor = element_blank(),legend.position = "none") +
scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))
#4 1-5 T_GC GC_TA
plot2=data.frame(AT_GC.P,AT_GC.M,GC_TA.P,GC_TA.M)
p15<-ggplot(d, aes(x=AT_GC, y=GC_TA)) +
geom_point(aes(shape=exp,colour=mut))+
geom_segment(data = plot2, aes(x=AT_GC.P,xend=AT_GC.M, y=GC_TA.P, yend=GC_TA.M),linetype = 3, colour="grey50", size=0.5)+
labs(x="A:T>G:C",y="G:C>T:A")+ scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
theme_classic(base_size = 18) +
theme(panel.grid.minor = element_blank(),legend.position = "none") +
scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))
#5 2-3 GC_AT AT_TA
plot2=data.frame(GC_AT.P,GC_AT.M,AT_TA.P,AT_TA.M)
p23<-ggplot(d, aes(x=GC_AT, y=AT_TA)) +
geom_point(aes(shape=exp,colour=mut))+
geom_segment(data = plot2, aes(x=GC_AT.P,xend=GC_AT.M, y=AT_TA.P, yend=AT_TA.M),linetype = 3, colour="grey50", size=0.5)+
labs(x="",y="")+ scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
theme_classic(base_size = 18) +
theme(panel.grid.minor = element_blank(),legend.position = "none") +
scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))
#6 2-4 GC_AT AT_CG
plot2=data.frame(GC_AT.P,GC_AT.M,AT_CG.P,AT_CG.M)
p24<-ggplot(d, aes(x=GC_AT, y=AT_CG)) +
geom_point(aes(shape=exp,colour=mut))+
geom_segment(data = plot2, aes(x=GC_AT.P,xend=GC_AT.M, y=AT_CG.P, yend=AT_CG.M),linetype = 3, colour="grey50", size=0.5)+
labs(x="",y="")+ scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
theme_classic(base_size = 18) +
theme(panel.grid.minor = element_blank(),legend.position = "none") +
scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))
#7 2-5 GC_AT GC_TA
plot2=data.frame(GC_AT.P,GC_AT.M,GC_TA.P,GC_TA.M)
p25<-ggplot(d, aes(x=GC_AT, y=GC_TA)) +
geom_point(aes(shape=exp,colour=mut))+
geom_segment(data = plot2, aes(x=GC_AT.P,xend=GC_AT.M, y=GC_TA.P, yend=GC_TA.M),linetype = 3, colour="grey50", size=0.5)+
labs(x="G:C>A:T",y="")+ scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
theme_classic(base_size = 18) +
theme(panel.grid.minor = element_blank(),legend.position = "none") +
scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))
#8 3-4 AT_TA AT_CG
plot2=data.frame(AT_TA.P,AT_TA.M,AT_CG.P,AT_CG.M)
p34<-ggplot(d, aes(x=AT_TA, y=AT_CG)) +
geom_point(aes(shape=exp,colour=mut))+
geom_segment(data = plot2, aes(x=AT_TA.P,xend=AT_TA.M, y=AT_CG.P, yend=AT_CG.M),linetype = 3, colour="grey50", size=0.5)+
labs(x="",y="")+ scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
theme_classic(base_size = 18) +
theme(panel.grid.minor = element_blank(),legend.position = "none") +
scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))
#9 3-5 AT_TA GC_TA
plot2=data.frame(AT_TA.P,AT_TA.M,GC_TA.P,GC_TA.M)
p35<-ggplot(d, aes(x=AT_TA, y=GC_TA)) +
geom_point(aes(shape=exp,colour=mut))+
geom_segment(data = plot2, aes(x=AT_TA.P,xend=AT_TA.M, y=GC_TA.P, yend=GC_TA.M),linetype = 3, colour="grey50", size=0.5)+
labs(x="A:T>T:A",y="")+ scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
theme_classic(base_size = 18) +
theme(panel.grid.minor = element_blank(),legend.position = "none") +
scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))
#10 4-5 AT_CG GC_TA
plot2=data.frame(AT_CG.P,AT_CG.M,GC_TA.P,GC_TA.M)
p45<-ggplot(d, aes(x=AT_CG, y=GC_TA)) +
geom_point(aes(shape=exp,colour=mut))+
geom_segment(data = plot2, aes(x=AT_CG.P,xend=AT_CG.M, y=GC_TA.P, yend=GC_TA.M),linetype = 3, colour="grey50", size=0.5)+
labs(x="A:T>C:G",y="")+
scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
theme_classic(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))
legend <- cowplot::get_legend(p45)
p45<-p45+ theme(legend.position = "none")
hlay <- rbind(c(1,NA,NA,11),
c(2,5,NA,NA),
c(3,6,8,NA),
c(4,7,9,10))
grid.arrange(p12,p13,p14,p15, p23,p24,p25,p34,p35,p45,legend, layout_matrix=hlay)
dev.off()
## quartz_off_screen
## 2
grid.arrange(p12,p13,p14,p15, p23,p24,p25,p34,p35,p45,legend, layout_matrix=hlay)
####Figure 4: Heatmap
#####################
#without wild types
iso_order<-c('3B1','1B1a','1B1b','4G1b','4G1c','4G1d','4G1a','3G2','3G3','2G2','2B2a','2B2b','3G1','2B1','5G1b','5G1a','2G1','1G2','1G1','2B3','7B1','8B1',"6G1a","6G1b")
mutacc<-data.frame(read_xlsx("Table_mutational_spectra.xlsx",range = "B9:Z15",col_names = T))
Patient_all<-data.frame(read_xlsx("Table_mutational_spectra.xlsx",range = "B2:Z8",col_names = T))
#I am removing the values of 2B3 and 7B1 in mutacc as they are too few mutations
mutacc$"X2B3"<-c(NA,NA,NA,NA,NA,NA)
mutacc$"X7B1"<-c(NA,NA,NA,NA,NA,NA)
colnames(mutacc)[1]<-"SNV"
colnames(Patient_all)[1]<-"SNV"
row.names(mutacc)<-mutacc$SNV
mutacc<-mutacc[,2:length(colnames(mutacc))]
#removing the X in the col_names for mutacc
colnames(mutacc)<-str_remove(colnames(mutacc), "X")
row.names(Patient_all)<-Patient_all$SNV
Patient_all<-Patient_all[,2:length(colnames(Patient_all))]
#removing the X in the col_names for Patient_all
colnames(Patient_all)<-str_remove(colnames(Patient_all), "X")
Patient_WT=subset(Patient_all, select = c("8B1","2B3","1G1","7B1") )
Patient_all<-Patient_all[,iso_order]
for (i in 1:length(colnames(Patient_all))){
mutacc[,i]<-mutacc[,i]/sum(mutacc[,i])
Patient_all[,i]<-Patient_all[,i]/sum(Patient_all[,i])
}
for (i in 1:length(colnames(Patient_WT))){
Patient_WT[,i]<-Patient_WT[,i]/sum(Patient_WT[,i])
}
all_data=rbind(Patient_all,mutacc)
sort(colnames(Patient_all))
## [1] "1B1a" "1B1b" "1G1" "1G2" "2B1" "2B2a" "2B2b" "2B3" "2G1" "2G2"
## [11] "3B1" "3G1" "3G2" "3G3" "4G1a" "4G1b" "4G1c" "4G1d" "5G1a" "5G1b"
## [21] "6G1a" "6G1b" "7B1" "8B1"
sort(colnames(mutacc))
## [1] "1B1a" "1B1b" "1G1" "1G2" "2B1" "2B2a" "2B2b" "2B3" "2G1" "2G2"
## [11] "3B1" "3G1" "3G2" "3G3" "4G1a" "4G1b" "4G1c" "4G1d" "5G1a" "5G1b"
## [21] "6G1a" "6G1b" "7B1" "8B1"
iso<-colnames(Patient_all)
Mutations<-c("DnaQ E9G & MutY/MutM","DnaQ E9G & MutY/MutM","DnaQ E9G & MutY/MutM","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G",'WT','WT','WT','WT',"DnaQ E9G","DnaQ E9G")
annotation_c <- as.data.frame(Mutations)
rownames(annotation_c) <- colnames(Patient_all)
Source<-c("Patient","Patient","Patient","Patient","Patient","Patient","Mutation Accumulation","Mutation Accumulation","Mutation Accumulation","Mutation Accumulation","Mutation Accumulation","Mutation Accumulation")
annotation_r <- as.data.frame(Source)
rownames(annotation_r) <- row.names(all_data)
lab=c("A:T>G:C",
"G:C>A:T",
"A:T>T:A",
"A:T>C:G",
"G:C>T:A",
"G:C>C:G")
#specifying colors for row/col side
annotation_colors = list(
Mutations = c("DnaQ E9G"="#F0E442", "DnaQ E9G & MutY/MutM"="#E69F00"),
Source = c("Patient"="red2", "Mutation Accumulation"="steelblue2"))
annotation_colors = list(
Mutations = c("DnaQ E9G"="#F0E442", "DnaQ E9G & MutY/MutM"="#E69F00","WT"="lightyellow"),
Source = c("Patient"="red3", "Mutation Accumulation"="navyblue"))
colnames(Patient_WT)<-str_remove(colnames(Patient_WT), "X")
WT=c("2B3","7B1","8B1","1G1")
mutMY=c("1B1a","1B1b","3B1")
dnaQ=c("4G1b","4G1c","4G1d","4G1a","3G2","3G3","2G2","2B2a","2B2b","3G1","2B1","5G1b","5G1a","2G1","1G2")
row_lab=c("A:T>G:C","G:C>A:T","A:T>T:A","A:T>C:G","G:C>T:A","G:C>C:G","A:T>G:C","G:C>A:T","A:T>T:A","A:T>C:G","G:C>T:A","G:C>C:G")
anno_simple(1:10, height = unit(2, "cm"))
## An AnnotationFunction object
## function: anno_simple()
## position: column
## items: 10
## width: 1npc
## height: 2cm
## imported variable: pt_size, border, value, color_mapping, gp, pt_gp, pch
## subsetable variable: value
## this object is subsetable
pdf("Fig_4.pdf",height=4,width=10,family = "Arial")
H1<-Heatmap(as.matrix(
all_data[WT]),col=circlize::colorRamp2(seq(0, 0.7, length = 100), rev(YlGnBu(100))),na_col="white",cluster_rows = F,row_labels=row_lab,column_names_rot = 66,
row_split = factor(c(rep("Patient", 6),rep("Mutation Accumulation",6)),levels=c("Patient","Mutation Accumulation")),row_title = NULL,cluster_row_slices=FALSE,
show_heatmap_legend = FALSE,left_annotation = HeatmapAnnotation(which = "row",group = anno_block(gp = gpar(fill = c("white","white"),col=c(NA,NA))),width = unit(0.75, "cm")),
top_annotation=HeatmapAnnotation(which = "col",group = anno_block(gp = gpar(fill = c("grey85"))),height = unit(0.75, "cm")))
H2<-Heatmap(as.matrix(
all_data[dnaQ]),col=circlize::colorRamp2(seq(0, 0.7, length = 100), rev(YlGnBu(100))),na_col="white",cluster_rows = F,row_labels=row_lab,column_names_rot = 66,
row_split =factor( c(rep("Patient", 6),rep("Mutation Accumulation",6)),levels=c("Patient","Mutation Accumulation")),row_title = NULL,cluster_row_slices=FALSE,show_heatmap_legend = FALSE,
top_annotation=HeatmapAnnotation(which = "col",group = anno_block(gp = gpar(fill = c("grey85"))),height = unit(0.75, "cm")))
H3<-Heatmap(as.matrix(
all_data[mutMY]),col=circlize::colorRamp2(seq(0, 0.7, length = 100), rev(YlGnBu(100))),na_col="white",cluster_rows = F,row_labels=row_lab,column_names_rot = 66,
row_split = factor(c(rep("Patient", 6),rep("Mutation Accumulation",6)),levels=c("Patient","Mutation Accumulation")),row_title = NULL,cluster_row_slices=FALSE,
,heatmap_legend_param =list(col_fun = circlize::colorRamp2(seq(0, 0.65, length = 100), viridis(100)),legend_height = unit(6, "cm"), title = "", border = F, at = c(0,0.1,0.2,0.3,0.4,0.5,0.60,0.7)),
top_annotation=HeatmapAnnotation(which = "col",group = anno_block(gp = gpar(fill = c("grey85"))),height = unit(0.75, "cm")))
ht_list =H1+H2+H3
ht_list
dev.off()
## quartz_off_screen
## 2
ht_list
#figure S8
##########
### Now we wil calculate 95% CI interval for spectra driven by sampling
### For convenience, we'l create new data table
PT1.spectra.dt<-PT.spectra.dt[PT.spectra.dt[,.(tot=sum(value)),by=isolate],.(SNV,isolate,value,tot)][, `:=`(CI_L=binom.test(value,tot)$conf.int[1],CI_H=binom.test(value,tot)$conf.int[2],estimate=binom.test(value,tot)$estimate),by=c("isolate","SNV")][]
MA1.spectra.dt<-rbind(MA.spectra.dt[MA.spectra.dt[,.(tot=sum(value)),by=isolate]][value>0,.(SNV,isolate,value,tot)][, `:=`(CI_L=binom.test(value,tot)$conf.int[1],CI_H=binom.test(value,tot)$conf.int[2],estimate=binom.test(value,tot)$estimate),by=c("isolate","SNV")][],
MA.spectra.dt[MA.spectra.dt[,.(tot=sum(value)),by=isolate]][value==0,.(SNV,isolate,value,tot)][, `:=`(CI_L=0,CI_H=3/tot,estimate=0),by=c("isolate","SNV")][])
All.spectra.dt<-rbind(PT1.spectra.dt[,source:="PT"],MA1.spectra.dt[,source:="MA"])
SNV_order<-c("A:T>G:C","G:C>A:T","A:T>T:A","A:T>C:G","G:C>T:A","G:C>C:G")
All.spectra.dt$SNV<-factor(All.spectra.dt$SNV,levels= rev(SNV_order))
pdf("Fig_S8.pdf",width=12,family = "Arial")
ggplot(All.spectra.dt, aes(x = SNV, ,y=estimate, fill = source))+
geom_hline(yintercept=0.9,color="grey",linetype=3)+
geom_hline(yintercept=0.6,color="grey",linetype=3)+
geom_hline(yintercept=0.3,color="grey",linetype=3)+
geom_hline(yintercept=-0.9,color="grey",linetype=3)+
geom_hline(yintercept=-0.6,color="grey",linetype=3)+
geom_hline(yintercept=-0.3,color="grey",linetype=3)+
facet_wrap(~isolate,ncol = 6)+
geom_bar(stat='identity',aes(y =estimate),color="black",width = 0.5,position = position_dodge())+
geom_errorbar(aes(ymin=CI_L, ymax=CI_H), width=.2,position=position_dodge(.5)) +
coord_flip()+
ylab("Frequency")+
annotate("segment", x=-Inf, xend=Inf, y=-Inf, yend=-Inf,lwd=1 )+
annotate("segment", x=-Inf, xend=-Inf, y=-Inf, yend=Inf,lwd=1 )+
scale_fill_manual(name="Data source",labels = c("Mutation accumulation","Patient"),
values = c("MA"="steelblue","PT"="red3"))+
theme_classic(base_size = 15) +
theme(axis.line = element_blank(), axis.title.y =element_blank(),
axis.text.x = element_text(size=9),panel.grid.minor = element_blank(),
strip.background=element_rect( fill=NA, linetype="blank"),
strip.text = element_text(angle = 0, hjust = 0))+
scale_y_continuous(limits=c(-.05,.90), breaks = c(0,.30,.60,.90),
labels = c("0%","30%","60%","90%"))
## Warning: Removed 24 rows containing missing values (geom_hline).
## Warning: Removed 24 rows containing missing values (geom_hline).
## Warning: Removed 24 rows containing missing values (geom_hline).
dev.off()
## quartz_off_screen
## 2
ggplot(All.spectra.dt, aes(x = SNV, ,y=estimate, fill = source))+
geom_hline(yintercept=0.9,color="grey",linetype=3)+
geom_hline(yintercept=0.6,color="grey",linetype=3)+
geom_hline(yintercept=0.3,color="grey",linetype=3)+
geom_hline(yintercept=-0.9,color="grey",linetype=3)+
geom_hline(yintercept=-0.6,color="grey",linetype=3)+
geom_hline(yintercept=-0.3,color="grey",linetype=3)+
facet_wrap(~isolate,ncol = 6)+
geom_bar(stat='identity',aes(y =estimate),color="black",width = 0.5,position = position_dodge())+
geom_errorbar(aes(ymin=CI_L, ymax=CI_H), width=.2,position=position_dodge(.5)) +
coord_flip()+
ylab("Frequency")+
annotate("segment", x=-Inf, xend=Inf, y=-Inf, yend=-Inf,lwd=1 )+
annotate("segment", x=-Inf, xend=-Inf, y=-Inf, yend=Inf,lwd=1 )+
scale_fill_manual(name="Data source",labels = c("Mutation accumulation","Patient"),
values = c("MA"="steelblue","PT"="red3"))+
theme_classic(base_size = 15) +
theme(axis.line = element_blank(), axis.title.y =element_blank(),
axis.text.x = element_text(size=9),panel.grid.minor = element_blank(),
strip.background=element_rect( fill=NA, linetype="blank"),
strip.text = element_text(angle = 0, hjust = 0))+
scale_y_continuous(limits=c(-.05,.90), breaks = c(0,.30,.60,.90),
labels = c("0%","30%","60%","90%"))
## Warning: Removed 24 rows containing missing values (geom_hline).
## Warning: Removed 24 rows containing missing values (geom_hline).
## Warning: Removed 24 rows containing missing values (geom_hline).
#TopGO analysis
###############
geneID2GO_2 <- readMappings(file ="for_topGO_blast2go.tsv")
#Figure S11: Targeted genes
###########################
genesOfInterest <- c('A_02623',
'glnQ_3',
'A_00662',
'glnE',
'A_04152',
'envZ',
'yurK',
'trxB_1',
'auaH',
'dmlR_11',
'dapE_2',
'tpiA',
'A_02548',
'pheA',
'trmJ',
'fhuA_2',
'A_03862',
'mdeA',
'dadA1_2',
'A_03735',
'epsL',
'cdhR_4',
'A_02613',
'fecR_5',
'maeB_1',
'bvgS',
'A_02878',
'A_02494',
'A_02127',
'A_00709',
'A_01818',
'rhlE_1',
'gltJ',
'A_01884',
'lptD',
'rplS',
'glnQ_1',
'A_03071',
'arnT',
'A_02007',
'A_00272',
'A_04343',
'A_01460',
'lptG',
'crt_6',
'recG',
'sgaA',
'A_00847',
'quiA_1',
'tyrB',
'A_01797',
'ydcV_1',
'dxr',
'fadJ_2',
'A_02482',
'A_03346',
'A_03300',
'A_02367',
'lvr_2',
'A_03171',
'A_02540',
'fabG_3',
'A_02474',
'A_02287',
'A_02621',
'arsC_3',
'gcvA_6',
'cmpR_2',
'A_04570',
'A_02140',
'suhB_1',
'paaK_1',
'hscB',
'adiA',
'amaB',
'clpA',
'dmlR_5',
'dapA_3',
'gsiC_11',
'A_03192',
'dmlR_6',
'slyX',
'A_03882',
'fumC',
'trxB_2',
'recO',
'ttr',
'yhaJ_2',
'bdlA',
'mrgA',
'A_00638',
'A_00540',
'ycaD',
'A_03767',
'fecR_4',
'nagK_1',
'A_04054',
'A_01061',
'bfmBAB',
'oppD_5',
'ghrA_2',
'nudC_2',
'A_02270',
'A_04083',
'frc_2',
'tgnC',
'hmuU',
'A_01131',
'ves',
'dppD_2',
'A_04167',
'rng',
'A_02777',
'wbgU',
'ydjA',
'grxC',
'fbp',
'comM',
'A_00439',
'lpxK',
'yhdN',
'pgrR_6',
'A_00290',
'gltB',
'A_02764',
'kstR2_3',
'nagZ',
'A_00627',
'dppA_3',
'A_01006',
'queE',
'A_03947',
'A_03995',
'lcfB_2')
geneUniverse <- names(geneID2GO_2)
geneList <- factor(as.integer(geneUniverse %in% genesOfInterest))
names(geneList) <- geneUniverse
myGOdata_pseudo <- new("topGOdata", description="My project", ontology="BP", allGenes=geneList, annot = annFUN.gene2GO,
nodeSize = 5,gene2GO = geneID2GO_2)
##
## Building most specific GOs .....
## ( 666 GO terms found. )
##
## Build GO DAG topology ..........
## ( 1575 GO terms and 3369 relations. )
##
## Annotating nodes ...............
## ( 2759 genes annotated to the GO terms. )
#Making the 4 analysis
#first Fisher
resultFisher.classic_0.05 <- runTest(myGOdata_pseudo, algorithm = "classic", statistic = "fisher")
##
## -- Classic Algorithm --
##
## the algorithm is scoring 408 nontrivial nodes
## parameters:
## test statistic: fisher
resultFisher.classic_0.05
##
## Description: My project
## Ontology: BP
## 'classic' algorithm with the 'fisher' test
## 773 GO terms scored: 0 terms with p < 0.01
## Annotation data:
## Annotated genes: 2759
## Significant genes: 83
## Min. no. of genes annotated to a GO: 5
## Nontrivial nodes: 408
resultFisher.weight01_0.05 <- runTest(myGOdata_pseudo, algorithm = "weight01", statistic = "fisher")
##
## -- Weight01 Algorithm --
##
## the algorithm is scoring 408 nontrivial nodes
## parameters:
## test statistic: fisher
##
## Level 13: 1 nodes to be scored (0 eliminated genes)
##
## Level 12: 2 nodes to be scored (0 eliminated genes)
##
## Level 11: 7 nodes to be scored (10 eliminated genes)
##
## Level 10: 24 nodes to be scored (18 eliminated genes)
##
## Level 9: 41 nodes to be scored (436 eliminated genes)
##
## Level 8: 46 nodes to be scored (595 eliminated genes)
##
## Level 7: 50 nodes to be scored (856 eliminated genes)
##
## Level 6: 68 nodes to be scored (1247 eliminated genes)
##
## Level 5: 82 nodes to be scored (1530 eliminated genes)
##
## Level 4: 51 nodes to be scored (1788 eliminated genes)
##
## Level 3: 27 nodes to be scored (2245 eliminated genes)
##
## Level 2: 8 nodes to be scored (2310 eliminated genes)
##
## Level 1: 1 nodes to be scored (2730 eliminated genes)
resultFisher.weight01_0.05
##
## Description: My project
## Ontology: BP
## 'weight01' algorithm with the 'fisher' test
## 773 GO terms scored: 0 terms with p < 0.01
## Annotation data:
## Annotated genes: 2759
## Significant genes: 83
## Min. no. of genes annotated to a GO: 5
## Nontrivial nodes: 408
resultFisher.weight_0.05 <- runTest(myGOdata_pseudo, algorithm = "weight", statistic = "fisher")
##
## -- Weight Algorithm --
##
## The algorithm is scoring 408 nontrivial nodes
## parameters:
## test statistic: fisher : ratio
##
## Level 13: 1 nodes to be scored.
##
## Level 12: 2 nodes to be scored.
##
## Level 11: 7 nodes to be scored.
##
## Level 10: 24 nodes to be scored.
##
## Level 9: 41 nodes to be scored.
##
## Level 8: 46 nodes to be scored.
##
## Level 7: 50 nodes to be scored.
##
## Level 6: 68 nodes to be scored.
##
## Level 5: 82 nodes to be scored.
##
## Level 4: 51 nodes to be scored.
##
## Level 3: 27 nodes to be scored.
##
## Level 2: 8 nodes to be scored.
##
## Level 1: 1 nodes to be scored.
resultFisher.weight_0.05
##
## Description: My project
## Ontology: BP
## 'weight' algorithm with the 'fisher : ratio' test
## 773 GO terms scored: 0 terms with p < 0.01
## Annotation data:
## Annotated genes: 2759
## Significant genes: 83
## Min. no. of genes annotated to a GO: 5
## Nontrivial nodes: 408
resultFisher.elim_0.05 <- runTest(myGOdata_pseudo, algorithm = "elim", statistic = "fisher")
##
## -- Elim Algorithm --
##
## the algorithm is scoring 408 nontrivial nodes
## parameters:
## test statistic: fisher
## cutOff: 0.01
##
## Level 13: 1 nodes to be scored (0 eliminated genes)
##
## Level 12: 2 nodes to be scored (0 eliminated genes)
##
## Level 11: 7 nodes to be scored (0 eliminated genes)
##
## Level 10: 24 nodes to be scored (0 eliminated genes)
##
## Level 9: 41 nodes to be scored (0 eliminated genes)
##
## Level 8: 46 nodes to be scored (0 eliminated genes)
##
## Level 7: 50 nodes to be scored (0 eliminated genes)
##
## Level 6: 68 nodes to be scored (0 eliminated genes)
##
## Level 5: 82 nodes to be scored (0 eliminated genes)
##
## Level 4: 51 nodes to be scored (0 eliminated genes)
##
## Level 3: 27 nodes to be scored (0 eliminated genes)
##
## Level 2: 8 nodes to be scored (0 eliminated genes)
##
## Level 1: 1 nodes to be scored (0 eliminated genes)
resultFisher.elim_0.05
##
## Description: My project
## Ontology: BP
## 'elim' algorithm with the 'fisher : 0.01' test
## 773 GO terms scored: 0 terms with p < 0.01
## Annotation data:
## Annotated genes: 2759
## Significant genes: 83
## Min. no. of genes annotated to a GO: 5
## Nontrivial nodes: 408
#Creating a table output with the 100 best nodes
allRes_pseudo <- GenTable(myGOdata_pseudo,
classicFisher = resultFisher.classic_0.05,
elimFisher=resultFisher.elim_0.05,
weigthFisher=resultFisher.weight_0.05,
weigth01Fisher=resultFisher.weight01_0.05,
orderBy = "weigth01Fisher",
topNodes = 100)
goEnrichmentFisher.weight01_0.05 <- GenTable(myGOdata_pseudo, pValue=resultFisher.weight01_0.05, orderBy="pValue", topNodes=100)
goEnrichmentFisher.weight01_0.05 <- goEnrichmentFisher.weight01_0.05[goEnrichmentFisher.weight01_0.05$pValue<0.05,]
goEnrichmentFisher.weight01_0.05 <- goEnrichmentFisher.weight01_0.05[,c("GO.ID","Term","pValue","Annotated","Significant","Expected")]
goEnrichmentFisher.weight01_0.05$Term <- gsub(" [a-z]*\\.\\.\\.$", "", goEnrichmentFisher.weight01_0.05$Term)
goEnrichmentFisher.weight01_0.05$Term <- gsub("\\.\\.\\.$", "", goEnrichmentFisher.weight01_0.05$Term)
goEnrichmentFisher.weight01_0.05$Term <- paste(goEnrichmentFisher.weight01_0.05$GO.ID, goEnrichmentFisher.weight01_0.05$Term, sep=", ")
goEnrichmentFisher.weight01_0.05$Term <- factor(goEnrichmentFisher.weight01_0.05$Term, levels=rev(goEnrichmentFisher.weight01_0.05$Term))
goEnrichmentFisher.weight01_0.05$pValue <- as.numeric(goEnrichmentFisher.weight01_0.05$pValue)
goEnrichmentFisher.weight01_0.05$test="KS_elim_0.01"
p3<-ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
geom_col(aes(fill=pValue))+
scale_fill_gradient(low="blue", high="red",limits=c(0,0.05))+
xlab("Biological process") +
ggtitle("Fisher.weight01_0.05")+
coord_flip()
pdf("Fig_S11.pdf",width=9,family = "Arial")
ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
geom_col(aes(fill=pValue))+
scale_fill_gradient(low="blue3", high="red3",limits=c(0,0.05))+
xlab("Biological process") +
labs(y=expression("log"[2]("enrichment")))+
theme_classic(base_size = 15)+
coord_flip()
dev.off()
## quartz_off_screen
## 2
ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
geom_col(aes(fill=pValue))+
scale_fill_gradient(low="blue3", high="red3",limits=c(0,0.05))+
xlab("Biological process") +
labs(y=expression("log"[2]("enrichment")))+
theme_classic(base_size = 15)+
coord_flip()
#Fig S12: pseudogenized genes
#############################
genesOfInterest <- c('A_00158',
'A_00177',
'A_00227',
'A_00238',
'A_00328',
'A_00381',
'A_00481',
'A_00501',
'A_00559',
'A_00608',
'A_00627',
'A_00662',
'A_00678',
'A_00725',
'A_00728',
'A_00729',
'A_00816',
'A_00821',
'A_00895',
'A_01028',
'A_01045',
'A_01067',
'A_01122',
'A_01140',
'A_01169',
'A_01183',
'A_01370',
'A_01449',
'A_01456',
'A_01587',
'A_01623',
'A_01660',
'A_02007',
'A_02008',
'A_02046',
'A_02072',
'A_02079',
'A_02091',
'A_02105',
'A_02114',
'A_02117',
'A_02182',
'A_02209',
'A_02377',
'A_02434',
'A_02518',
'A_02623',
'A_02632',
'A_02635',
'A_02680',
'A_02712',
'A_02718',
'A_02825',
'A_02951',
'A_03043',
'A_03112',
'A_03192',
'A_03332',
'A_03363',
'A_03635',
'A_03650',
'A_03668',
'A_03669',
'A_03693',
'A_03720',
'A_03736',
'A_03766',
'A_03821',
'A_03847',
'A_03878',
'A_03921',
'A_03922',
'A_04091',
'A_04122',
'A_04167',
'A_04230',
'A_04234',
'A_04236',
'A_04246',
'A_04260',
'A_04291',
'A_04316',
'acoR_2',
'actP_1',
'actP_2',
'aidB',
'amnE',
'ampD',
'appA',
'apt',
'araC',
'artM_1',
'asnB_1',
'bauA_2',
'bcsA',
'bdlA',
'braC_10',
'braC_9',
'btuD_9',
'caiA_2',
'cbpA',
'ccoN1_2',
'cdhR_4',
'cheA',
'cheB_2',
'clsB_2',
'cmpR_6',
'cynR_8',
'dac',
'dadA1_1',
'dadA1_2',
'dapE_1',
'dapE_2',
'dctM_4',
'dmlR_1',
'dmsA',
'dnaE2',
'ectB',
'egtB',
'eno_1',
'ettA',
'fabG2',
'fadD_1',
'fhuA_5',
'flhA_2',
'fumB_1',
'fur_2',
'ggt',
'glnE',
'glnH_2',
'glnQ_3',
'glpE_1',
'gltB',
'gltC_7',
'gsiA_4',
'gsiB_2',
'gsiB_4',
'hisC2',
'hsrA_1',
'hsrA_2',
'htrE',
'hyuA',
'iaaA',
'intA_3',
'kdpB_2',
'kefC_2',
'kpsM_2',
'lcfB_2',
'leuD1_2',
'lon',
'lon2',
'lptB_6',
'ltaE',
'mexB',
'mfd',
'mhbM',
'mshA_4',
'mshD',
'murA_2',
'mutS2',
'nagK_1',
'nhaP2_2',
'ntaA_2',
'pbpC',
'pcaB',
'pgi_1',
'phrB',
'pitA',
'plcN_2',
'pncA',
'ppc',
'queC',
'rfnT_2',
'rhlE_2',
'rhtB_2',
'rnr',
'rplY',
'sasA_4',
'sasA_8',
'secD_2',
'selO',
'smf-1_1',
'surE',
'tag',
'tauD',
'tcyC_2',
'tdh_1',
'tdh_2',
'thadh',
'tolQ_4',
'trxB_1',
'ttr',
'wecH',
'ydcV_1',
'yhaJ_2',
'trkH',
'A_02961',
'steT',
'A_01369',
'adoK',
'rsmB_2')
geneUniverse <- names(geneID2GO_2)
geneList <- factor(as.integer(geneUniverse %in% genesOfInterest))
names(geneList) <- geneUniverse
myGOdata_pseudo <- new("topGOdata", description="My project", ontology="BP", allGenes=geneList, annot = annFUN.gene2GO,
nodeSize = 5,gene2GO = geneID2GO_2)
##
## Building most specific GOs .....
## ( 666 GO terms found. )
##
## Build GO DAG topology ..........
## ( 1575 GO terms and 3369 relations. )
##
## Annotating nodes ...............
## ( 2759 genes annotated to the GO terms. )
#Making the 4 analysis
#first Fisher
resultFisher.classic_0.05 <- runTest(myGOdata_pseudo, algorithm = "classic", statistic = "fisher")
##
## -- Classic Algorithm --
##
## the algorithm is scoring 433 nontrivial nodes
## parameters:
## test statistic: fisher
resultFisher.classic_0.05
##
## Description: My project
## Ontology: BP
## 'classic' algorithm with the 'fisher' test
## 773 GO terms scored: 2 terms with p < 0.01
## Annotation data:
## Annotated genes: 2759
## Significant genes: 125
## Min. no. of genes annotated to a GO: 5
## Nontrivial nodes: 433
resultFisher.weight01_0.05 <- runTest(myGOdata_pseudo, algorithm = "weight01", statistic = "fisher")
##
## -- Weight01 Algorithm --
##
## the algorithm is scoring 433 nontrivial nodes
## parameters:
## test statistic: fisher
##
## Level 13: 1 nodes to be scored (0 eliminated genes)
##
## Level 12: 5 nodes to be scored (0 eliminated genes)
##
## Level 11: 11 nodes to be scored (10 eliminated genes)
##
## Level 10: 32 nodes to be scored (48 eliminated genes)
##
## Level 9: 53 nodes to be scored (469 eliminated genes)
##
## Level 8: 56 nodes to be scored (679 eliminated genes)
##
## Level 7: 51 nodes to be scored (932 eliminated genes)
##
## Level 6: 62 nodes to be scored (1319 eliminated genes)
##
## Level 5: 83 nodes to be scored (1575 eliminated genes)
##
## Level 4: 44 nodes to be scored (1775 eliminated genes)
##
## Level 3: 24 nodes to be scored (2226 eliminated genes)
##
## Level 2: 10 nodes to be scored (2306 eliminated genes)
##
## Level 1: 1 nodes to be scored (2716 eliminated genes)
resultFisher.weight01_0.05
##
## Description: My project
## Ontology: BP
## 'weight01' algorithm with the 'fisher' test
## 773 GO terms scored: 1 terms with p < 0.01
## Annotation data:
## Annotated genes: 2759
## Significant genes: 125
## Min. no. of genes annotated to a GO: 5
## Nontrivial nodes: 433
resultFisher.weight_0.05 <- runTest(myGOdata_pseudo, algorithm = "weight", statistic = "fisher")
##
## -- Weight Algorithm --
##
## The algorithm is scoring 433 nontrivial nodes
## parameters:
## test statistic: fisher : ratio
##
## Level 13: 1 nodes to be scored.
##
## Level 12: 5 nodes to be scored.
##
## Level 11: 11 nodes to be scored.
##
## Level 10: 32 nodes to be scored.
##
## Level 9: 53 nodes to be scored.
##
## Level 8: 56 nodes to be scored.
##
## Level 7: 51 nodes to be scored.
##
## Level 6: 62 nodes to be scored.
##
## Level 5: 83 nodes to be scored.
##
## Level 4: 44 nodes to be scored.
##
## Level 3: 24 nodes to be scored.
##
## Level 2: 10 nodes to be scored.
##
## Level 1: 1 nodes to be scored.
resultFisher.weight_0.05
##
## Description: My project
## Ontology: BP
## 'weight' algorithm with the 'fisher : ratio' test
## 773 GO terms scored: 1 terms with p < 0.01
## Annotation data:
## Annotated genes: 2759
## Significant genes: 125
## Min. no. of genes annotated to a GO: 5
## Nontrivial nodes: 433
resultFisher.elim_0.05 <- runTest(myGOdata_pseudo, algorithm = "elim", statistic = "fisher")
##
## -- Elim Algorithm --
##
## the algorithm is scoring 433 nontrivial nodes
## parameters:
## test statistic: fisher
## cutOff: 0.01
##
## Level 13: 1 nodes to be scored (0 eliminated genes)
##
## Level 12: 5 nodes to be scored (0 eliminated genes)
##
## Level 11: 11 nodes to be scored (0 eliminated genes)
##
## Level 10: 32 nodes to be scored (0 eliminated genes)
##
## Level 9: 53 nodes to be scored (8 eliminated genes)
##
## Level 8: 56 nodes to be scored (8 eliminated genes)
##
## Level 7: 51 nodes to be scored (8 eliminated genes)
##
## Level 6: 62 nodes to be scored (8 eliminated genes)
##
## Level 5: 83 nodes to be scored (8 eliminated genes)
##
## Level 4: 44 nodes to be scored (8 eliminated genes)
##
## Level 3: 24 nodes to be scored (8 eliminated genes)
##
## Level 2: 10 nodes to be scored (8 eliminated genes)
##
## Level 1: 1 nodes to be scored (8 eliminated genes)
resultFisher.elim_0.05
##
## Description: My project
## Ontology: BP
## 'elim' algorithm with the 'fisher : 0.01' test
## 773 GO terms scored: 1 terms with p < 0.01
## Annotation data:
## Annotated genes: 2759
## Significant genes: 125
## Min. no. of genes annotated to a GO: 5
## Nontrivial nodes: 433
#Creating a table output with the 100 best nodes
allRes_pseudo <- GenTable(myGOdata_pseudo,
classicFisher = resultFisher.classic_0.05,
elimFisher=resultFisher.elim_0.05,
weigthFisher=resultFisher.weight_0.05,
weigth01Fisher=resultFisher.weight01_0.05,
orderBy = "weigth01Fisher",
topNodes = 100)
goEnrichmentFisher.weight01_0.05 <- GenTable(myGOdata_pseudo, pValue=resultFisher.weight01_0.05, orderBy="pValue", topNodes=100)
goEnrichmentFisher.weight01_0.05 <- goEnrichmentFisher.weight01_0.05[goEnrichmentFisher.weight01_0.05$pValue<0.05,]
goEnrichmentFisher.weight01_0.05 <- goEnrichmentFisher.weight01_0.05[,c("GO.ID","Term","pValue","Annotated","Significant","Expected")]
goEnrichmentFisher.weight01_0.05$Term <- gsub(" [a-z]*\\.\\.\\.$", "", goEnrichmentFisher.weight01_0.05$Term)
goEnrichmentFisher.weight01_0.05$Term <- gsub("\\.\\.\\.$", "", goEnrichmentFisher.weight01_0.05$Term)
goEnrichmentFisher.weight01_0.05$Term <- paste(goEnrichmentFisher.weight01_0.05$GO.ID, goEnrichmentFisher.weight01_0.05$Term, sep=", ")
goEnrichmentFisher.weight01_0.05$Term <- factor(goEnrichmentFisher.weight01_0.05$Term, levels=rev(goEnrichmentFisher.weight01_0.05$Term))
goEnrichmentFisher.weight01_0.05$pValue <- as.numeric(goEnrichmentFisher.weight01_0.05$pValue)
goEnrichmentFisher.weight01_0.05$test="KS_elim_0.01"
p3<-ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
geom_col(aes(fill=pValue))+
scale_fill_gradient(low="blue", high="red",limits=c(0,0.05))+
xlab("Biological process") +
ggtitle("Fisher.weight01_0.05")+
coord_flip()
pdf("Fig_S12.pdf",width=9,family = "Arial")
ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
geom_col(aes(fill=pValue))+
scale_fill_gradient(low="blue3", high="red3",limits=c(0,0.05))+
xlab("Biological process") +
labs(y=expression("log"[2]("enrichment")))+
theme_classic(base_size = 15)+
coord_flip()
dev.off()
## quartz_off_screen
## 2
ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
geom_col(aes(fill=pValue))+
scale_fill_gradient(low="blue3", high="red3",limits=c(0,0.05))+
xlab("Biological process") +
labs(y=expression("log"[2]("enrichment")))+
theme_classic(base_size = 15)+
coord_flip()
#Fig S7: 25 exo- genes
######################
genesOfInterest <- c('tdhA',
'nhaP2_1',
'cheD',
'A_00560',
'dnaQ',
'envZ',
'gstB_2',
'A_01313',
'sucA',
'fliF',
'ppsR',
'A_02012',
'A_02072',
'icd_1',
'yhhQ',
'gsiD_5',
'ygaZ',
'A_03098',
'A_03291',
'A_03455',
'kefC_3',
'A_03713',
'ptsJ',
'A_04082',
'lpxL_2')
geneUniverse <- names(geneID2GO_2)
geneList <- factor(as.integer(geneUniverse %in% genesOfInterest))
names(geneList) <- geneUniverse
myGOdata_pseudo <- new("topGOdata", description="My project", ontology="BP", allGenes=geneList, annot = annFUN.gene2GO,
nodeSize = 5,gene2GO = geneID2GO_2)
##
## Building most specific GOs .....
## ( 666 GO terms found. )
##
## Build GO DAG topology ..........
## ( 1575 GO terms and 3369 relations. )
##
## Annotating nodes ...............
## ( 2759 genes annotated to the GO terms. )
#Making the 4 analysis
#first Fisher
resultFisher.classic_0.05 <- runTest(myGOdata_pseudo, algorithm = "classic", statistic = "fisher")
##
## -- Classic Algorithm --
##
## the algorithm is scoring 137 nontrivial nodes
## parameters:
## test statistic: fisher
resultFisher.classic_0.05
##
## Description: My project
## Ontology: BP
## 'classic' algorithm with the 'fisher' test
## 773 GO terms scored: 4 terms with p < 0.01
## Annotation data:
## Annotated genes: 2759
## Significant genes: 16
## Min. no. of genes annotated to a GO: 5
## Nontrivial nodes: 137
resultFisher.weight01_0.05 <- runTest(myGOdata_pseudo, algorithm = "weight01", statistic = "fisher")
##
## -- Weight01 Algorithm --
##
## the algorithm is scoring 137 nontrivial nodes
## parameters:
## test statistic: fisher
##
## Level 11: 1 nodes to be scored (0 eliminated genes)
##
## Level 10: 4 nodes to be scored (0 eliminated genes)
##
## Level 9: 8 nodes to be scored (372 eliminated genes)
##
## Level 8: 11 nodes to be scored (413 eliminated genes)
##
## Level 7: 15 nodes to be scored (473 eliminated genes)
##
## Level 6: 19 nodes to be scored (556 eliminated genes)
##
## Level 5: 33 nodes to be scored (975 eliminated genes)
##
## Level 4: 23 nodes to be scored (1403 eliminated genes)
##
## Level 3: 15 nodes to be scored (2138 eliminated genes)
##
## Level 2: 7 nodes to be scored (2250 eliminated genes)
##
## Level 1: 1 nodes to be scored (2655 eliminated genes)
resultFisher.weight01_0.05
##
## Description: My project
## Ontology: BP
## 'weight01' algorithm with the 'fisher' test
## 773 GO terms scored: 1 terms with p < 0.01
## Annotation data:
## Annotated genes: 2759
## Significant genes: 16
## Min. no. of genes annotated to a GO: 5
## Nontrivial nodes: 137
resultFisher.weight_0.05 <- runTest(myGOdata_pseudo, algorithm = "weight", statistic = "fisher")
##
## -- Weight Algorithm --
##
## The algorithm is scoring 137 nontrivial nodes
## parameters:
## test statistic: fisher : ratio
##
## Level 11: 1 nodes to be scored.
##
## Level 10: 4 nodes to be scored.
##
## Level 9: 8 nodes to be scored.
##
## Level 8: 11 nodes to be scored.
##
## Level 7: 15 nodes to be scored.
##
## Level 6: 19 nodes to be scored.
##
## Level 5: 33 nodes to be scored.
##
## Level 4: 23 nodes to be scored.
##
## Level 3: 15 nodes to be scored.
##
## Level 2: 7 nodes to be scored.
##
## Level 1: 1 nodes to be scored.
resultFisher.weight_0.05
##
## Description: My project
## Ontology: BP
## 'weight' algorithm with the 'fisher : ratio' test
## 773 GO terms scored: 1 terms with p < 0.01
## Annotation data:
## Annotated genes: 2759
## Significant genes: 16
## Min. no. of genes annotated to a GO: 5
## Nontrivial nodes: 137
resultFisher.elim_0.05 <- runTest(myGOdata_pseudo, algorithm = "elim", statistic = "fisher")
##
## -- Elim Algorithm --
##
## the algorithm is scoring 137 nontrivial nodes
## parameters:
## test statistic: fisher
## cutOff: 0.01
##
## Level 11: 1 nodes to be scored (0 eliminated genes)
##
## Level 10: 4 nodes to be scored (0 eliminated genes)
##
## Level 9: 8 nodes to be scored (0 eliminated genes)
##
## Level 8: 11 nodes to be scored (17 eliminated genes)
##
## Level 7: 15 nodes to be scored (17 eliminated genes)
##
## Level 6: 19 nodes to be scored (17 eliminated genes)
##
## Level 5: 33 nodes to be scored (17 eliminated genes)
##
## Level 4: 23 nodes to be scored (17 eliminated genes)
##
## Level 3: 15 nodes to be scored (17 eliminated genes)
##
## Level 2: 7 nodes to be scored (17 eliminated genes)
##
## Level 1: 1 nodes to be scored (17 eliminated genes)
resultFisher.elim_0.05
##
## Description: My project
## Ontology: BP
## 'elim' algorithm with the 'fisher : 0.01' test
## 773 GO terms scored: 1 terms with p < 0.01
## Annotation data:
## Annotated genes: 2759
## Significant genes: 16
## Min. no. of genes annotated to a GO: 5
## Nontrivial nodes: 137
#Creating a table output with the 100 best nodes
allRes_pseudo <- GenTable(myGOdata_pseudo,
classicFisher = resultFisher.classic_0.05,
elimFisher=resultFisher.elim_0.05,
weigthFisher=resultFisher.weight_0.05,
weigth01Fisher=resultFisher.weight01_0.05,
orderBy = "weigth01Fisher",
topNodes = 100)
goEnrichmentFisher.weight01_0.05 <- GenTable(myGOdata_pseudo, pValue=resultFisher.weight01_0.05, orderBy="pValue", topNodes=100)
goEnrichmentFisher.weight01_0.05 <- goEnrichmentFisher.weight01_0.05[goEnrichmentFisher.weight01_0.05$pValue<0.05,]
goEnrichmentFisher.weight01_0.05 <- goEnrichmentFisher.weight01_0.05[,c("GO.ID","Term","pValue","Annotated","Significant","Expected")]
goEnrichmentFisher.weight01_0.05$Term <- gsub(" [a-z]*\\.\\.\\.$", "", goEnrichmentFisher.weight01_0.05$Term)
goEnrichmentFisher.weight01_0.05$Term <- gsub("\\.\\.\\.$", "", goEnrichmentFisher.weight01_0.05$Term)
goEnrichmentFisher.weight01_0.05$Term <- paste(goEnrichmentFisher.weight01_0.05$GO.ID, goEnrichmentFisher.weight01_0.05$Term, sep=", ")
goEnrichmentFisher.weight01_0.05$Term <- factor(goEnrichmentFisher.weight01_0.05$Term, levels=rev(goEnrichmentFisher.weight01_0.05$Term))
goEnrichmentFisher.weight01_0.05$pValue <- as.numeric(goEnrichmentFisher.weight01_0.05$pValue)
goEnrichmentFisher.weight01_0.05$test="KS_elim_0.01"
p3<-ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
geom_col(aes(fill=pValue))+
scale_fill_gradient(low="blue", high="red",limits=c(0,0.05))+
xlab("Biological process") +
ggtitle("Fisher.weight01_0.05")+
coord_flip()
pdf("Fig_S7.pdf",width=9,family = "Arial")
ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
geom_col(aes(fill=pValue))+
scale_fill_gradient(low="blue3", high="red3",limits=c(0,0.05))+
xlab("Biological process") +
labs(y=expression("log"[2]("enrichment")))+
theme_classic(base_size = 15)+
coord_flip()
dev.off()
## quartz_off_screen
## 2
ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
geom_col(aes(fill=pValue))+
scale_fill_gradient(low="blue3", high="red3",limits=c(0,0.05))+
xlab("Biological process") +
labs(y=expression("log"[2]("enrichment")))+
theme_classic(base_size = 15)+
coord_flip()
tree.WT<-root(read.tree("RAxML_bipartitions.WT"),"ancestor_A1")
tree.dnaQ<-root(read.tree("RAxML_bipartitions.dnaQ"),"ancestor_A2")
tree.bound<-bind.tree(tree.WT, tree.dnaQ, 4)
tree.bound.clean<-drop.tip(tree.bound, "ancestor_A2")
WT=c("2B3","1G1","8B1","7B1")
WT_edge=which.edge(tree.bound.clean, WT)
sep=max(WT_edge)+1
edge_color=rep("black",length(tree.bound.clean$edge))
edge_color[WT_edge]="red"
edge_type=rep(1,length(tree.bound.clean$edge))
edge_type[sep]=2
i=5
tree.bound.clean.modif<-tree.bound.clean
tree.bound.clean.modif$edge.length[sep]<-7.5e-05
tree.bound.clean.modif$edge.length[WT_edge]<-5*tree.bound.clean.modif$edge.length[WT_edge]
postscript("Fig_2.pdf")
plot(tree.bound.clean.modif,type="u",edge.color=edge_color, edge.lty=edge_type,show.tip.labe=T,edge.width=3,rotate.tree=250)
add.scale.bar(0,4e-05,lwd=3,col=F)
add.scale.bar(0,2e-05,lcol="red",lwd=3,col=F)
dev.off()
## quartz_off_screen
## 2
plot(tree.bound.clean.modif,type="u",edge.color=edge_color, edge.lty=edge_type,show.tip.labe=T,edge.width=3,rotate.tree=250)
add.scale.bar(0,4e-05,lwd=3,col=F)
add.scale.bar(0,2e-05,lcol="red",lwd=3,col=F)
#Figure S2a
###########
tree.allBH<-root(read.tree("RAxML_bipartitions.tree_allBH"),"Bhinzii_14-3425")
pdf("Fig_S2a.pdf",family = "Arial")
plot(tree.allBH)
dev.off()
## quartz_off_screen
## 2
plot(tree.allBH)
#figure S1
##########
#Getting the data of the different growth runs
GC.BH1<-data.frame(read_xlsx("growth_curve_all.xlsx",range = "AB1:AW50",sheet="20191220_48w",col_names = T))
GC.BH2<-data.frame(read_xlsx("growth_curve_all.xlsx",range = "AB1:AW50",sheet="20200306_48w",col_names = T))
GC.BH3<-data.frame(read_xlsx("growth_curve_all.xlsx",range = "AB1:AV50",sheet="20210209_48w",col_names = T))
GC.BH4<-data.frame(read_xlsx("growth_curve_all.xlsx",range = "AB1:AU50",sheet="20210217_48w",col_names = T))
GC.BH5<-data.frame(read_xlsx("growth_curve_all.xlsx",range = "AB1:AQ50",sheet="20210225_48w",col_names = T))
GC.BH6<-data.frame(read_xlsx("growth_curve_all.xlsx",range = "AB1:AU50",sheet="20210305_48w",col_names = T))
GC.BH7<-data.frame(read_xlsx("growth_curve_all.xlsx",range = "W1:AM50",sheet="20210315_48w",col_names = T))
## create a data.frame for ggplot2
p2=as.data.frame(matrix(nrow=48*22, ncol=4))
colnames(p2)=c("Time", "Isolate", "run1", "run2")
p2[,1]=rep(0:47, n=22)
I22=c("1B1a", "1B1b", "1G1", "1G2", "2B1", "2B2a", "2B2b", "2B3", "2G1", "2G2", "3B1", "3G1", "3G2", "3G3", "4G1a", "4G1b", "4G1c", "4G1d", "5G1a", "5G1b", "7B1", "8B1")
p2[,2]=rep(I22, each=48)
null_2_NA<- function(x){
if (is.null(x)){
return(NA)
}
else{
return(x)
}
}
#getting all the data of the runs, and using NA when the run doesn't contain one particular isolate
all_run1=c()
all_run2=c()
all_run3=c()
all_run4=c()
all_run5=c()
all_run6=c()
all_run7=c()
for (i in 1:length(I22)){
#print("ok")
#print (I22[i])
#GC.BH1[1,I22[i]]
#print(GC.BH1$I22[i])
for (j in 1:48){
#print(i)
#print(j)
#print(GC.BH1[j,i])
all_run1=c(all_run1,null_2_NA(GC.BH1[j,paste("X",I22[i],sep="")]))
all_run2=c(all_run2,null_2_NA(GC.BH2[j,paste("X",I22[i],sep="")]))
all_run3=c(all_run3,null_2_NA(GC.BH3[j,paste("X",I22[i],sep="")]))
all_run4=c(all_run4,null_2_NA(GC.BH4[j,paste("X",I22[i],sep="")]))
all_run5=c(all_run5,null_2_NA(GC.BH5[j,paste("X",I22[i],sep="")]))
all_run6=c(all_run6,null_2_NA(GC.BH6[j,paste("X",I22[i],sep="")]))
all_run7=c(all_run7,null_2_NA(GC.BH7[j,paste("X",I22[i],sep="")]))
}
}
p2[,3]=all_run1
p2[,4]=all_run2
p2[,5]=all_run3
p2[,6]=all_run4
p2[,7]=all_run5
p2[,8]=all_run6
p2[,9]=all_run7
length(all_run1)
## [1] 1056
length(all_run3)
## [1] 1056
run_mean=apply(p2[,3:9],1,function(x) mean(x,na.rm=T))
run_sd=apply(p2[,3:9],1,function(x) sd(x,na.rm=T))
p2[,10]=run_mean
p2[,11]=run_sd
# isolation date
date=rep(c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 7, 8), each=48)
p2[,12]=date
colnames(p2)=c("Time", "Isolate", "run1", "run2", "run3", "run4", "run5", "run6", "run7", "mean","sd","date")
scaleFUN<- function(x) sprintf("%.2f", x)
pdf("Fig_S1.pdf",width=10, height=10)
ggplot(data=p2, aes(x=Time, y=mean, group=Isolate, color=Isolate)) +
geom_line(linetype="solid") + xlab("Time (h)") +
geom_vline(xintercept=0,color="grey",linetype=3)+
geom_vline(xintercept=10,color="grey",linetype=3)+
geom_vline(xintercept=20,color="grey",linetype=3)+
geom_vline(xintercept=30,color="grey",linetype=3)+
geom_vline(xintercept=40,color="grey",linetype=3)+
geom_vline(xintercept=50,color="grey",linetype=3)+
geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd), width=0.5, color="black",size=0.3,alpha=0.5) +
geom_point(aes(fill=Isolate),pch=21,color="black",size=1,lwd=0.2)+
ylab("OD600") +
scale_y_continuous(labels=scaleFUN) +
theme_classic(base_size = 12)+
facet_grid(rows=vars(date))+
theme(panel.border = element_rect(colour = "black", fill=NA, size=1))
## Warning: Duplicated aesthetics after name standardisation: size
dev.off()
## quartz_off_screen
## 2
ggplot(data=p2, aes(x=Time, y=mean, group=Isolate, color=Isolate)) +
geom_line(linetype="solid") + xlab("Time (h)") +
geom_vline(xintercept=0,color="grey",linetype=3)+
geom_vline(xintercept=10,color="grey",linetype=3)+
geom_vline(xintercept=20,color="grey",linetype=3)+
geom_vline(xintercept=30,color="grey",linetype=3)+
geom_vline(xintercept=40,color="grey",linetype=3)+
geom_vline(xintercept=50,color="grey",linetype=3)+
geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd), width=0.5, color="black",size=0.3,alpha=0.5) +
geom_point(aes(fill=Isolate),pch=21,color="black",size=1,lwd=0.2)+
ylab("OD600") +
scale_y_continuous(labels=scaleFUN) +
theme_classic(base_size = 12)+
facet_grid(rows=vars(date))+
theme(panel.border = element_rect(colour = "black", fill=NA, size=1))
## Warning: Duplicated aesthetics after name standardisation: size
### PT SNV data
PT.SNV.dt<-data.table(read_xlsx("Patient_mutations.xlsx"))
setnames(PT.SNV.dt, 'In isolates',"isolates")
PT.SNV.dt
## Position Reference allele Variant allele Efffect
## 1: 113 A G INTERGENIC
## 2: 897 G A NON_SYNONYMOUS_CODING
## 3: 1527 G T NON_SYNONYMOUS_CODING
## 4: 2780 T G INTERGENIC
## 5: 3592 A G NON_SYNONYMOUS_CODING
## ---
## 6455: 3728264 A T NON_SYNONYMOUS_CODING
## 6456: 3897012 G A NON_SYNONYMOUS_CODING
## 6457: 4268100 A T NON_SYNONYMOUS_CODING
## 6458: 4293566 T C NON_SYNONYMOUS_CODING
## 6459: 4676024 T A NON_SYNONYMOUS_CODING
## Amino acid change Gene length Gene name
## 1: - - -
## 2: A233T 483 dnaA
## 3: D443Y 483 dnaA
## 4: - - -
## 5: N249S 817 gyrB
## ---
## 6455: V44E 398 kefC_3
## 6456: A18V 305 A_03713
## 6457: W100R 430 ptsJ
## 6458: Y30C 323 A_04082
## 6459: L74Q 297 lpxL_2
## Product
## 1: intergenic region
## 2: Chromosomal replication initiator protein DnaA
## 3: Chromosomal replication initiator protein DnaA
## 4: intergenic region
## 5: DNA gyrase subunit B
## ---
## 6455: Glutathione-regulated potassium-efflux system protein KefC
## 6456: hypothetical protein
## 6457: Vitamin B6 salvage pathway transcriptional repressor PtsJ
## 6458: hypothetical protein
## 6459: Lipid A biosynthesis lauroyltransferase
## isolates
## 1: 3G2
## 2: 2B2a,2B2b
## 3: 7B1
## 4: 6G1b
## 5: 4G1a,4G1c,4G1d
## ---
## 6455: 1B1a,1B1b,1G2,2B1,2B2a,2B2b,2G1,2G2,3B1,3G1,3G2,3G3,4G1a,4G1b,4G1c,4G1d,5G1a,5G1b,6G1a,6G1b
## 6456: 1B1a,1B1b,1G2,2B1,2B2a,2B2b,2G1,2G2,3B1,3G1,3G2,3G3,4G1a,4G1b,4G1c,4G1d,5G1a,5G1b,6G1a,6G1b
## 6457: 1B1a,1B1b,1G2,2B1,2B2a,2B2b,2G1,2G2,3B1,3G1,3G2,3G3,4G1a,4G1b,4G1c,4G1d,5G1a,5G1b,6G1a,6G1b
## 6458: 1B1a,1B1b,1G2,2B1,2B2a,2B2b,2G1,2G2,3B1,3G1,3G2,3G3,4G1a,4G1b,4G1c,4G1d,5G1a,5G1b,6G1a,6G1b
## 6459: 1B1a,1B1b,1G2,2B1,2B2a,2B2b,2G1,2G2,3B1,3G1,3G2,3G3,4G1a,4G1b,4G1c,4G1d,5G1a,5G1b,6G1a,6G1b
######################
### isolate annotation
iso.dt<-fread("Isolates_divergence.v1.csv")
######################
### ordered list of isolates: WT->E9G, WT elements 1-4
iso_lst<-c("1G1", "2B3", "7B1", "8B1", "1B1a", "1B1b", "1G2", "2B1", "2B2a", "2B2b", "2G1", "2G2", "3B1", "3G1", "3G2", "3G3", "4G1a", "4G1b", "4G1c", "4G1d", "5G1a", "5G1b","6G1a","6G1b")
### ordered dataframe of isolates with GT: WT->E9G, WT elements 1-4
iso.gt.dt<-data.table(cbind(
isolate=c("1G1", "2B3", "7B1", "8B1", "1B1a", "1B1b", "1G2", "2B1", "2B2a", "2B2b", "2G1", "2G2", "3B1", "3G1", "3G2", "3G3", "4G1a", "4G1b", "4G1c", "4G1d", "5G1a", "5G1b","6G1a","6G1b"),
GT=c("WT", "WT", "WT", "WT", "dnaQ E9G + mutM/mutY", "dnaQ E9G + mutM/mutY", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G + mutM/mutY", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G"))
)
### Import aggregated dN/dS data
dNdS.dt<-data.table(read_xlsx(path = "mutation_counts_all_isolates_and_sites.xlsx", range = cell_cols("A:O")))
setnames(dNdS.dt, c('NS sites','S sites'), c("NS_sites","S_sites"))
WT.SNV.dt<-rbind(PT.SNV.dt[isolates %like% "1G1",.(Position,isolate="1G1")],
PT.SNV.dt[isolates %like% "7B1",.(Position,isolate="7B1")],
PT.SNV.dt[isolates %like% "8B1",.(Position,isolate="8B1")],
PT.SNV.dt[isolates %like% "2B3",.(Position,isolate="2B3")])
dcast(WT.SNV.dt[WT.SNV.dt,on=c("Position"="Position"),allow.cartesian=TRUE][,.N,by=.(isolate,i.isolate)],isolate ~i.isolate)
## Using 'N' as value column. Use 'value.var' to override
## isolate 1G1 2B3 7B1 8B1
## 1: 1G1 17 NA NA 1
## 2: 2B3 NA 37 23 34
## 3: 7B1 NA 23 47 20
## 4: 8B1 1 34 20 48
ggplot(WT.SNV.dt[WT.SNV.dt,on=c("Position"="Position"),allow.cartesian=TRUE][,.N,by=.(isolate,i.isolate)], aes(x=isolate,y=i.isolate,fill=N,label=N))+ geom_tile() +geom_text() +scale_fill_distiller(palette ="Blues", direction = 1) +
labs(title = "Shared SNVs in wild-type DnaQ isolates",x="isolate",y="",fill="SNVs") + theme_classic(base_size = 14)
iso_list<-iso.gt.dt[,isolate]
cc.dt<-PT.SNV.dt[isolates %like% iso_list[1], .(SNV=paste(Position,`Reference allele`,`Variant allele`, sep="_"),isolate=iso_list[1])]
for(a in rev(iso_list[-1])) { print(a); cc.dt<-rbind(cc.dt,PT.SNV.dt[isolates %like% a, .(SNV=paste(Position,`Reference allele`,`Variant allele`, sep="_"), isolate=a)])}
## [1] "6G1b"
## [1] "6G1a"
## [1] "5G1b"
## [1] "5G1a"
## [1] "4G1d"
## [1] "4G1c"
## [1] "4G1b"
## [1] "4G1a"
## [1] "3G3"
## [1] "3G2"
## [1] "3G1"
## [1] "3B1"
## [1] "2G2"
## [1] "2G1"
## [1] "2B2b"
## [1] "2B2a"
## [1] "2B1"
## [1] "1G2"
## [1] "1B1b"
## [1] "1B1a"
## [1] "8B1"
## [1] "7B1"
## [1] "2B3"
PT.SNV.long.dt<-cc.dt
rm(cc.dt)
### all-vs-all shared SNV plot
ggplot(PT.SNV.long.dt[PT.SNV.long.dt,on=c("SNV"="SNV"),allow.cartesian=TRUE][,.N,by=.(isolate,i.isolate)], aes(
x=factor(isolate, levels=iso.gt.dt[,isolate]),
y=factor(i.isolate, levels=iso.gt.dt[,isolate]),
fill=N,label=N)) + geom_tile() + geom_text(size=2.75) +
scale_fill_distiller(palette ="Blues", direction = 1) +
labs(title = "Shared SNVs in PT isolates",x="isolate",y="",fill="SNVs") +
theme_classic(base_size = 10)
PT.SNV.long.dt[,.N,by=isolate]
## isolate N
## 1: 1G1 17
## 2: 6G1b 297
## 3: 6G1a 338
## 4: 5G1b 276
## 5: 5G1a 243
## 6: 4G1d 534
## 7: 4G1c 542
## 8: 4G1b 610
## 9: 4G1a 485
## 10: 3G3 431
## 11: 3G2 533
## 12: 3G1 371
## 13: 3B1 1272
## 14: 2G2 428
## 15: 2G1 255
## 16: 2B2b 400
## 17: 2B2a 405
## 18: 2B1 310
## 19: 1G2 267
## 20: 1B1b 1067
## 21: 1B1a 1080
## 22: 8B1 48
## 23: 7B1 47
## 24: 2B3 37
## isolate N
ggplot(PT.SNV.long.dt[,.N, by=SNV][order(N)], aes(x=N)) + geom_histogram(binwidth = 1) + scale_y_sqrt() + theme_classic(base_size = 14) + labs(x="Number of isolates",title = "SNV frequency acorss PT isolates")+ theme_classic(base_size = 14)
### smaller subset of heatmap for WT isolates
ggplot(PT.SNV.long.dt[PT.SNV.long.dt,on=c("SNV"="SNV"),allow.cartesian=TRUE][,.N,by=.(isolate,i.isolate)][isolate %in% iso.gt.dt[GT == "WT",isolate]][i.isolate %in% iso.gt.dt[GT == "WT",isolate]], aes(
x=factor(isolate, levels=iso.gt.dt[,isolate]),
y=factor(i.isolate, levels=iso.gt.dt[,isolate]),
fill=N,label=N)) + geom_tile() +geom_text() +
scale_fill_distiller(palette ="Blues", direction = 1) +
labs(title = "Shared SNVs in wild-type DnaQ isolates",x="isolate",y="",fill="SNVs") + theme_classic(base_size = 14)
dcast(PT.SNV.long.dt[PT.SNV.long.dt,on=c("SNV"="SNV"),allow.cartesian=TRUE][,.N,by=.(isolate,i.isolate)][isolate %in% iso.gt.dt[GT == "WT",isolate]][i.isolate %in% iso.gt.dt[GT == "WT",isolate]],isolate~i.isolate)
## Using 'N' as value column. Use 'value.var' to override
## isolate 1G1 2B3 7B1 8B1
## 1: 1G1 17 NA NA 1
## 2: 2B3 NA 37 23 34
## 3: 7B1 NA 23 47 20
## 4: 8B1 1 34 20 48
#Figure S5:
###########
### divergence from SNV table:
div.snv.dt<-PT.SNV.long.dt[PT.SNV.long.dt,on=c("SNV"="SNV"),allow.cartesian=TRUE][isolate==i.isolate,.(SNV=.N),by=isolate]
### and now full annotated table
div.date.dt<-div.snv.dt[iso.dt,on=.("isolate"=`Isolate`)][iso.gt.dt,.(isolate,SNV, GT, date=`Days since first sample`),on=.(isolate=isolate)]
pdf("Fig_S5.pdf")
ggplot(div.date.dt,aes(x=date,y=SNV,colour=GT)) + geom_point(size=4) + scale_y_continuous(limits=c(0,1500)) + theme_classic() + scale_x_continuous(limits=c(0,1500)) + labs(x="Collection date (days)")+ scale_color_brewer(type="qual",palette = "Dark2")+ theme_classic(base_size = 14)
dev.off()
## quartz_off_screen
## 2
ggplot(div.date.dt,aes(x=date,y=SNV,colour=GT)) + geom_point(size=4) + scale_y_continuous(limits=c(0,1500)) + theme_classic() + scale_x_continuous(limits=c(0,1500)) + labs(x="Collection date (days)")+ scale_color_brewer(type="qual",palette = "Dark2")+ theme_classic(base_size = 14)
#Figure S6
##########
p1 <- ggplot(div.date.dt[GT=="WT"],aes(x=date,y=SNV,colour=GT)) + geom_point(size=4) + scale_y_continuous(limits=c(0,100)) + theme_classic() + scale_x_continuous(limits=c(0,1500)) + labs(x="Collection date (days)")+ scale_color_viridis_d()+ theme_classic(base_size = 14)
lm_eqn = function(m) {
l <- list(a = format(unname(coef(m)[1]), digits = 2),
b = format(abs(unname(coef(m)[2])), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3));
if (coef(m)[2] >= 0) {
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
} else {
eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
}
as.character(as.expression(eq));
}
pdf("Fig_S6.pdf")
p1 + stat_smooth(formula = y~x,method = "lm") + geom_text(colour="black",aes(x = 500, y = 80, label = lm_eqn(lm(y ~ x,div.date.dt[GT=="WT",.(x=date,y=SNV)]))), parse = TRUE)
dev.off()
## quartz_off_screen
## 2
p1 + stat_smooth(formula = y~x,method = "lm") + geom_text(colour="black",aes(x = 500, y = 80, label = lm_eqn(lm(y ~ x,div.date.dt[GT=="WT",.(x=date,y=SNV)]))), parse = TRUE)
Now some dN/dS analysis and exploration
###calculate genome-wide dS rates for subsequent calcualtions
GW_PT_dS<-dNdS.dt[,sum(PT_S)/sum(S_sites)]
GW_MA_dS<-dNdS.dt[,sum(MA_S)/sum(S_sites)]
### P-value calculations using binomial test. P value is calculated for dN/dS ratio difference from 1
dNdS.dt[, P.PT := round(
binom.test(x=c(PT_NS,PT_S),p=NS_sites/length,alternative = "g")$p.value,
digits = 3),
by=gene]
dNdS.dt[, P.MA := round(
binom.test(x=c(MA_NS,MA_S),p=NS_sites/length,alternative = "g")$p.value,
digits = 3),
by=gene]
### adjusted dN/dS ratios using genome-wide estimates for dS rate:
dNdS.dt[,dNdS_PT_adj := round(PT_NS/NS_sites/GW_PT_dS,digits = 2)]
dNdS.dt[,dNdS_MA_adj := round(MA_NS/NS_sites/GW_MA_dS,digits = 2)]
### 95% CI estimates for adjusted dN/dS ratios. Observed NS count was adjusted according to binomial CI; P-value is one-sided
dNdS.dt[, CI.PT := paste0(round(
binom.test(x=c(PT_NS,PT_S), p=NS_sites/length, alternative = "g")$conf.int*PT_NS/NS_sites/GW_PT_dS,
digits = 3),collapse = "-"),
by=gene]
dNdS.dt[, CI.MA := paste0(round(
binom.test(x=c(MA_NS,MA_S), p=NS_sites/length, alternative = "g")$conf.int*MA_NS/NS_sites/GW_MA_dS,
digits = 3),collapse = "-"),
by=gene]
fwrite(x=dNdS.dt,file = "dNdS.export.tsv", sep = "\t", row.names = F, col.names = T)
### some P-values
### aggregate N/S data over subsets - GW or 24 genes, PT and MA
GW_PT_S<-dNdS.dt[,sum(PT_S)]
GW_MA_S<-dNdS.dt[,sum(MA_S)]
GW_PT_NS<-dNdS.dt[,sum(PT_NS)]
GW_MA_NS<-dNdS.dt[,sum(MA_NS)]
E9G_PT_S<-dNdS.dt[`24 shared genes` >0,sum(PT_S)]
E9G_MA_S<-dNdS.dt[`24 shared genes` >0,sum(MA_S)]
E9G_PT_NS<-dNdS.dt[`24 shared genes` >0,sum(PT_NS)]
E9G_MA_NS<-dNdS.dt[`24 shared genes` >0,sum(MA_NS)]
### test for GW PT vs MA N/S data
fisher.test(
as.table(rbind(c(GW_PT_S,GW_MA_S),
c(GW_PT_NS,GW_MA_NS)
)))
##
## Fisher's Exact Test for Count Data
##
## data: as.table(rbind(c(GW_PT_S, GW_MA_S), c(GW_PT_NS, GW_MA_NS)))
## p-value = 0.04267
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.003486 1.267496
## sample estimates:
## odds ratio
## 1.127349
### test for E9G founder vs GW in PT data
fisher.test(
as.table(rbind(c(GW_PT_S,E9G_PT_S),
c(GW_PT_NS,E9G_PT_NS)
)))
##
## Fisher's Exact Test for Count Data
##
## data: as.table(rbind(c(GW_PT_S, E9G_PT_S), c(GW_PT_NS, E9G_PT_NS)))
## p-value = 0.01613
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.119825 4.569592
## sample estimates:
## odds ratio
## 2.158966
### test for E9G founder vs GW in MA data
fisher.test(
as.table(rbind(c(GW_MA_S,E9G_PT_S),
c(GW_MA_NS,E9G_PT_NS)
)))
##
## Fisher's Exact Test for Count Data
##
## data: as.table(rbind(c(GW_MA_S, E9G_PT_S), c(GW_MA_NS, E9G_PT_NS)))
## p-value = 0.05156
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9866348 4.0742356
## sample estimates:
## odds ratio
## 1.914822
### and a little data exploration
dNdS.dt
## 24 shared genes gene length NS_sites S_sites PT_NS PT_S PT_dN PT_dS
## 1: 0 ycaC_2 678 513.33 161.67 0 1 0.0000 0.0062
## 2: 0 ycaC_1 633 489.00 141.00 1 0 0.0020 0.0000
## 3: 0 proP_1 1527 1153.33 370.67 0 3 0.0000 0.0081
## 4: 0 proP_2 1314 982.33 328.67 1 0 0.0010 0.0000
## 5: 0 surA_1 1542 1168.33 370.67 2 1 0.0017 0.0027
## ---
## 4598: 0 mcl2 864 634.67 226.33 1 1 0.0016 0.0044
## 4599: 0 dsbG 774 584.67 186.33 1 0 0.0017 0.0000
## 4600: 0 A_03632 189 138.00 48.00 0 0 0.0000 0.0000
## 4601: 0 lysN_2 1200 900.00 297.00 0 1 0.0000 0.0034
## 4602: 1 A_03291 2868 2186.33 678.67 4 0 0.0018 0.0000
## MA_NS MA_S MA_dN MA_dS PT_dNdS MA_dNdS P.PT P.MA dNdS_PT_adj
## 1: 0 0 0.00000 0 0 NA 1.000 1.000 0.00
## 2: 0 0 0.00000 0 NA NA 0.773 1.000 1.36
## 3: 0 0 0.00000 0 0 NA 1.000 1.000 0.00
## 4: 1 0 0.00100 0 NA NA 0.748 0.748 0.68
## 5: 0 0 0.00000 0 0.63 NA 0.852 1.000 1.14
## ---
## 4598: 0 0 0.00000 0 0.36 NA 0.930 1.000 1.05
## 4599: 0 0 0.00000 0 NA NA 0.755 1.000 1.14
## 4600: 0 0 0.00000 0 NA NA 1.000 1.000 0.00
## 4601: 0 0 0.00000 0 0 NA 1.000 1.000 0.00
## 4602: 2 0 0.00091 0 NA NA 0.338 0.581 1.22
## dNdS_MA_adj CI.PT CI.MA
## 1: 0.00 0-0 0-0
## 2: 0.00 0.068-1.359 0-0
## 3: 0.00 0-0 0-0
## 4: 2.16 0.034-0.677 0.108-2.162
## 5: 0.00 0.154-1.138 0-0
## ---
## 4598: 0.00 0.027-1.047 0-0
## 4599: 0.00 0.057-1.137 0-0
## 4600: 0.00 0-0 0-0
## 4601: 0.00 0-0 0-0
## 4602: 1.94 0.575-1.216 0.434-1.943
ggplot(dNdS.dt,aes(x=dNdS_MA_adj))+ geom_histogram() + scale_y_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 6 rows containing missing values (geom_bar).
ggplot(dNdS.dt,aes(x=dNdS_PT_adj))+ geom_histogram() + scale_y_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 5 rows containing missing values (geom_bar).
ggplot(dNdS.dt,aes(x=P.PT))+ geom_histogram() + scale_y_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(dNdS.dt,aes(x=P.MA))+ geom_histogram() + scale_y_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 9 rows containing missing values (geom_bar).
ggplot(melt.data.table(dNdS.dt,id.vars = "gene",measure.vars = c("P.PT","P.MA"), value.name = "P.value", variable.name = "dataset" ),
aes(x=P.value,colour=dataset))+ geom_density(size=1.5) + scale_y_sqrt()+ scale_color_brewer(type = "qual",palette = 1) + theme_classic(base_size = 14)
ggplot(melt.data.table(dNdS.dt,id.vars = "gene",measure.vars = c("dNdS_PT_adj","dNdS_MA_adj"), value.name = "dNdS", variable.name = "dataset" ),
aes(x=dNdS,colour=dataset))+ geom_density(size=1.5) + scale_y_sqrt()+ scale_color_brewer(type = "qual",palette = 5) + theme_classic(base_size = 14)
#Figure S4
##########
#Making the phylogeny
BH.tree<-read.tree("RAxML_bipartitions.tree_without_ancestor")
BH.bootstrap<-read.tree("RAxML_bootstrap.tree_without_ancestor")
BH.tree$node.label
## [1] "" "" "68" "100" "100" "100" "97" "100" "100" "100" "100" "31"
## [13] "35" "100" "96" "100" "100" "100" "100" "100" "100" "100" "99" "100"
BH.tree$tip.label<-gsub("from_2B3_", "", BH.tree$tip.label)
plot(BH.tree,use.edge.length=FALSE,cex=0.75)
nodelabels(BH.tree$node.label, adj = -0.2, frame = "n", cex = 0.45)
nodelabels(pch=21,node=27,bg="red")
pdf("Fig_S4.pdf")
plot(BH.tree,use.edge.length=FALSE,cex=0.75)
nodelabels(BH.tree$node.label, adj = -0.2, frame = "n", cex = 0.45)
nodelabels(pch=21,node=27,bg="red")
dev.off()
## quartz_off_screen
## 2