Data import and variable setup

### Number of divisions per colony
generations = 27.2
### max rate - max number of mutation per division
max_rate = 100 
### genome size
genome_size = 4800000
### Passages per strain

passages.dt<-data.table(read_xlsx("passages.xlsx",range = "A1:B24",col_names = T))

setnames(passages.dt,c("isolate",'number of pasages'),c("isolate","rounds"))

MA2.SNV.dt<-data.table(read_xlsx("Table_mutational_spectra.xlsx",range = "B9:Z15",col_names = T))

MA2.SNV.dt
##    Mutational Accumulation 5G1b 5G1a 6G1b 2B2a 2B2b 2B1 8B1 2B3 7B1 1G1 6G1a
## 1:                 A:T>G:C   58   26   NA   65   29   8  NA   1   1  NA   NA
## 2:                 G:C>A:T   31   15   NA  100   30   8  NA   2   1  NA   NA
## 3:                 A:T>T:A   14    7   NA   17   10   4  NA   0   1  NA   NA
## 4:                 A:T>C:G   19   10   NA   21   11   9  NA   0   1  NA   NA
## 5:                 G:C>T:A   17   14   NA   52   17   4  NA   0   0  NA   NA
## 6:                 G:C>C:G    0    0   NA    2    0   1  NA   1   0  NA   NA
##    2G2 3G1 1B1a 1B1b 3B1 4G1d 4G1c 4G1a 4G1b 1G2 3G3 3G2 2G1
## 1:   6  27    2   29   2   26    1   16    9  44 122  62 150
## 2:  16  49    6  109   4   16    7    7    8  34  32  32  82
## 3:  26  17    5   24  15   20   23   32    6   5   6   5  14
## 4:  16  22    6   25  10   10   10   19    8   9  20   6  26
## 5:  16  23   14   88  54   33   10    7    7  17  21  12  26
## 6:   0   1    1    1   1    0    0    1    0   0   3   0   3
setnames(MA2.SNV.dt, 'Mutational Accumulation',"SNV")

### create output form:
MA.rates.dt<-melt(MA2.SNV.dt,id.vars = 'SNV', variable.name = "isolate")[,.(SNP=sum(value)),by=isolate][passages.dt,on=c("isolate")]
## Warning in melt.data.table(MA2.SNV.dt, id.vars = "SNV", variable.name =
## "isolate"): 'measure.vars' [5G1b, 5G1a, 6G1b, 2B2a, ...] are not all of the
## same type. By order of hierarchy, the molten data value column will be of type
## 'double'. All measure variables not of type 'double' will be coerced too. Check
## DETAILS in ?melt.data.table for more on coercion.
MA.rates.dt[,GT := ifelse((isolate == "2B3" | isolate == "7B1"),yes ="WT",no="DnaQ")]

MA.rates.dt
##        isolate SNP rounds   GT
##  1:       1B1a  34      4 DnaQ
##  2:       1B1b 276      4 DnaQ
##  3:        1G2 109      4 DnaQ
##  4:        2B1  34      7 DnaQ
##  5:       2B2a 257      7 DnaQ
##  6:       2B2b  97      6 DnaQ
##  7:        2G1 301      6 DnaQ
##  8:        2G2  80     36 DnaQ
##  9:        3B1  86      4 DnaQ
## 10:        3G1 139      6 DnaQ
## 11:        3G2 117      4 DnaQ
## 12:        3G3 204     20 DnaQ
## 13:       4G1a  82     39 DnaQ
## 14:       4G1b  38     32 DnaQ
## 15:       4G1c  51     29 DnaQ
## 16:       4G1d 105     18 DnaQ
## 17:       5G1a  72      7 DnaQ
## 18:       5G1b 139      7 DnaQ
## 19:        7B1   4    113   WT
## 20:        2B3   4    113   WT
## 21: ATCC_51783  NA    113 DnaQ
## 22: ATCC_51784  NA    113 DnaQ
## 23: ATCC_51730  NA    113 DnaQ
##        isolate SNP rounds   GT
## some manual correction for ATCC isolates
MA.rates.dt[isolate == "ATCC_51783", `:=` (SNP = 2, GT = "ATCC")]
MA.rates.dt[isolate == "ATCC_51784", `:=` (SNP = 3, GT = "ATCC")]
MA.rates.dt[isolate == "ATCC_51730", `:=` (SNP = 4, GT = "ATCC")] 
MA.rates.dt
##        isolate SNP rounds   GT
##  1:       1B1a  34      4 DnaQ
##  2:       1B1b 276      4 DnaQ
##  3:        1G2 109      4 DnaQ
##  4:        2B1  34      7 DnaQ
##  5:       2B2a 257      7 DnaQ
##  6:       2B2b  97      6 DnaQ
##  7:        2G1 301      6 DnaQ
##  8:        2G2  80     36 DnaQ
##  9:        3B1  86      4 DnaQ
## 10:        3G1 139      6 DnaQ
## 11:        3G2 117      4 DnaQ
## 12:        3G3 204     20 DnaQ
## 13:       4G1a  82     39 DnaQ
## 14:       4G1b  38     32 DnaQ
## 15:       4G1c  51     29 DnaQ
## 16:       4G1d 105     18 DnaQ
## 17:       5G1a  72      7 DnaQ
## 18:       5G1b 139      7 DnaQ
## 19:        7B1   4    113   WT
## 20:        2B3   4    113   WT
## 21: ATCC_51783   2    113 ATCC
## 22: ATCC_51784   3    113 ATCC
## 23: ATCC_51730   4    113 ATCC
##        isolate SNP rounds   GT
### stats
MA.rates.dt[, SNV_per_division := round(
  binom.test(SNP,round(generations*rounds*max_rate))$estimate*max_rate,
                                        digits = 4),
            by=isolate]

MA.rates.dt[, ci := 
              paste0(round(
                binom.test(SNP,round(generations*rounds*max_rate))$conf.int*max_rate,
                digits = 4),collapse = "-"),
            by=isolate]

MA.rates.dt
##        isolate SNP rounds   GT SNV_per_division            ci
##  1:       1B1a  34      4 DnaQ           0.3125 0.2165-0.4364
##  2:       1B1b 276      4 DnaQ           2.5368 2.2495-2.8498
##  3:        1G2 109      4 DnaQ           1.0018 0.8233-1.2073
##  4:        2B1  34      7 DnaQ           0.1786 0.1237-0.2494
##  5:       2B2a 257      7 DnaQ           1.3498  1.1907-1.524
##  6:       2B2b  97      6 DnaQ           0.5944 0.4822-0.7246
##  7:        2G1 301      6 DnaQ           1.8444 1.6435-2.0627
##  8:        2G2  80     36 DnaQ           0.0817 0.0648-0.1017
##  9:        3B1  86      4 DnaQ           0.7904 0.6327-0.9753
## 10:        3G1 139      6 DnaQ           0.8517 0.7165-1.0049
## 11:        3G2 117      4 DnaQ           1.0754 0.8901-1.2874
## 12:        3G3 204     20 DnaQ           0.3750   0.3254-0.43
## 13:       4G1a  82     39 DnaQ           0.0773 0.0615-0.0959
## 14:       4G1b  38     32 DnaQ           0.0437 0.0309-0.0599
## 15:       4G1c  51     29 DnaQ           0.0647  0.0481-0.085
## 16:       4G1d 105     18 DnaQ           0.2145 0.1754-0.2596
## 17:       5G1a  72      7 DnaQ           0.3782   0.296-0.476
## 18:       5G1b 139      7 DnaQ           0.7300 0.6141-0.8614
## 19:        7B1   4    113   WT           0.0013  4e-04-0.0033
## 20:        2B3   4    113   WT           0.0013  4e-04-0.0033
## 21: ATCC_51783   2    113 ATCC           0.0007  1e-04-0.0024
## 22: ATCC_51784   3    113 ATCC           0.0010  2e-04-0.0029
## 23: ATCC_51730   4    113 ATCC           0.0013  4e-04-0.0033
##        isolate SNP rounds   GT SNV_per_division            ci
### analysis of trunks and splits
MM.SNV.dt<-data.table(read_xlsx("branch_and_truncs.xlsx",col_names = T))
MM.SNV.dt
##     Isolates Unique SNVs Shared between them Shared with closest relative
##  1:     1B1a         142                 842                            -
##  2:     1B1b         132                 842                            -
##  3:     2B2a         126                  86                          145
##  4:     2B2b         117                  86                          145
##  5:     4G1a          19                 220                            -
##  6:     4G1b         317                 220                            -
##  7:     4G1c          18                 220                            -
##  8:     4G1d          10                 220                            -
##  9:     5G1a          58                  21                          113
## 10:     5G1b          90                  21                          113
##     Trunc total
##  1:         842
##  2:         842
##  3:         231
##  4:         231
##  5:         220
##  6:         220
##  7:         220
##  8:         220
##  9:         134
## 10:         134
MA.rates.dt[MM.SNV.dt,.(isolate,SNP,'Unique SNVs'),on=c(isolate ="Isolates")]
##     isolate SNP          V3
##  1:    1B1a  34 Unique SNVs
##  2:    1B1b 276 Unique SNVs
##  3:    2B2a 257 Unique SNVs
##  4:    2B2b  97 Unique SNVs
##  5:    4G1a  82 Unique SNVs
##  6:    4G1b  38 Unique SNVs
##  7:    4G1c  51 Unique SNVs
##  8:    4G1d 105 Unique SNVs
##  9:    5G1a  72 Unique SNVs
## 10:    5G1b 139 Unique SNVs
MM.div.dt<-MA.rates.dt[MM.SNV.dt,on=c(isolate ="Isolates")][,.(isolate,SNP, rounds, SNV_per_division, div=`Unique SNVs`)]
MM.div.dt
##     isolate SNP rounds SNV_per_division div
##  1:    1B1a  34      4           0.3125 142
##  2:    1B1b 276      4           2.5368 132
##  3:    2B2a 257      7           1.3498 126
##  4:    2B2b  97      6           0.5944 117
##  5:    4G1a  82     39           0.0773  19
##  6:    4G1b  38     32           0.0437 317
##  7:    4G1c  51     29           0.0647  18
##  8:    4G1d 105     18           0.2145  10
##  9:    5G1a  72      7           0.3782  58
## 10:    5G1b 139      7           0.7300  90
MM.div.dt[,divisions := 
                round(
                div/(binom.test(SNP,round(generations*rounds*max_rate))$estimate*max_rate),
                digits = 1),
            by=isolate]

MM.div.dt[,ci_divisions := 
              paste0(
                sort(
                round(
                div/(binom.test(SNP,round(generations*rounds*max_rate))$conf.int*max_rate),
                digits = 1)
                )
                ,collapse = "-"),
            by=isolate]

MM.div.dt[, passages := round(divisions/generations,digits = 1)          ]

MM.div.dt[,ci_passages := 
                paste0(
                sort(
                round(
                div/(binom.test(SNP,round(generations*rounds*max_rate))$conf.int*max_rate)/
                  generations,
                digits = 1)
                )
                ,collapse = "-"),
            by=isolate]

MM.div.dt[,.(isolate,divergence=div, SNV_per_division,divisions, ci_divisions, passages, ci_passages)]
##     isolate divergence SNV_per_division divisions   ci_divisions passages
##  1:    1B1a        142           0.3125     454.4    325.4-655.9     16.7
##  2:    1B1b        132           2.5368      52.0      46.3-58.7      1.9
##  3:    2B2a        126           1.3498      93.3     82.7-105.8      3.4
##  4:    2B2b        117           0.5944     196.8    161.5-242.6      7.2
##  5:    4G1a         19           0.0773     245.8        198-309      9.0
##  6:    4G1b        317           0.0437    7261.0 5290.5-10259.9    266.9
##  7:    4G1c         18           0.0647     278.4    211.8-373.9     10.2
##  8:    4G1d         10           0.2145      46.6        38.5-57      1.7
##  9:    5G1a         58           0.3782     153.4    121.9-195.9      5.6
## 10:    5G1b         90           0.7300     123.3    104.5-146.6      4.5
##     ci_passages
##  1:     12-24.1
##  2:     1.7-2.2
##  3:       3-3.9
##  4:     5.9-8.9
##  5:    7.3-11.4
##  6: 194.5-377.2
##  7:    7.8-13.7
##  8:     1.4-2.1
##  9:     4.5-7.2
## 10:     3.8-5.4
### and now some plots:
factor(MA.rates.dt$GT,levels = c("DnaQ","WT","ATCC"),ordered=TRUE)
##  [1] DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ DnaQ
## [16] DnaQ DnaQ DnaQ WT   WT   ATCC ATCC ATCC
## Levels: DnaQ < WT < ATCC
MA.rates.dt
##        isolate SNP rounds   GT SNV_per_division            ci
##  1:       1B1a  34      4 DnaQ           0.3125 0.2165-0.4364
##  2:       1B1b 276      4 DnaQ           2.5368 2.2495-2.8498
##  3:        1G2 109      4 DnaQ           1.0018 0.8233-1.2073
##  4:        2B1  34      7 DnaQ           0.1786 0.1237-0.2494
##  5:       2B2a 257      7 DnaQ           1.3498  1.1907-1.524
##  6:       2B2b  97      6 DnaQ           0.5944 0.4822-0.7246
##  7:        2G1 301      6 DnaQ           1.8444 1.6435-2.0627
##  8:        2G2  80     36 DnaQ           0.0817 0.0648-0.1017
##  9:        3B1  86      4 DnaQ           0.7904 0.6327-0.9753
## 10:        3G1 139      6 DnaQ           0.8517 0.7165-1.0049
## 11:        3G2 117      4 DnaQ           1.0754 0.8901-1.2874
## 12:        3G3 204     20 DnaQ           0.3750   0.3254-0.43
## 13:       4G1a  82     39 DnaQ           0.0773 0.0615-0.0959
## 14:       4G1b  38     32 DnaQ           0.0437 0.0309-0.0599
## 15:       4G1c  51     29 DnaQ           0.0647  0.0481-0.085
## 16:       4G1d 105     18 DnaQ           0.2145 0.1754-0.2596
## 17:       5G1a  72      7 DnaQ           0.3782   0.296-0.476
## 18:       5G1b 139      7 DnaQ           0.7300 0.6141-0.8614
## 19:        7B1   4    113   WT           0.0013  4e-04-0.0033
## 20:        2B3   4    113   WT           0.0013  4e-04-0.0033
## 21: ATCC_51783   2    113 ATCC           0.0007  1e-04-0.0024
## 22: ATCC_51784   3    113 ATCC           0.0010  2e-04-0.0029
## 23: ATCC_51730   4    113 ATCC           0.0013  4e-04-0.0033
##        isolate SNP rounds   GT SNV_per_division            ci
MA.rates.dt[isolate == "ATCC_51730", `:=` (SNP = 4, GT = "ATCC")] 

MA.rates.dt<-cbind(MA.rates.dt,DnaQ=c("E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","E9G","WT","WT","WT","WT","WT"))

#Figure 3c:
###########  
  
pdf("Figure_3c.pdf",width=4,height = 4,family = "Arial")
ggplot(MA.rates.dt,aes(y=SNV_per_division,x=DnaQ, fill=DnaQ)) +
  geom_boxplot(width=.5,lwd=.5,outlier.shape = NA) + 
  #geom_dotplot(binaxis = 'y', stackdir = 'center', position = position_dodge(),dotsize =.2)+
  scale_y_log10(limits=c(1E-4,10), expand = c(0, 0),
                breaks = scales::trans_breaks("log10", function(x) 10^x),
                labels = scales::trans_format("log10", scales::math_format(10^.x)))+
  geom_jitter(width =.1)+
  #scale_y_log10(limits=c(1E-4,10)) + 
  scale_fill_manual(values = c("coral3","slateblue")) +
  theme_classic(base_size = 20)+
  theme(axis.text.y =element_text(size=14))+
  labs(y="SNV per division")+
  annotation_logticks(sides = "l", outside = TRUE,long = unit(0,"mm"),short = unit(1,"mm"),
  mid = unit(1.25,"mm"),)+
  coord_cartesian(clip = "off")
dev.off()
## quartz_off_screen 
##                 2
ggplot(MA.rates.dt,aes(y=SNV_per_division,x=DnaQ, fill=DnaQ)) +
  geom_boxplot(width=.5,lwd=.5,outlier.shape = NA) + 
  #geom_dotplot(binaxis = 'y', stackdir = 'center', position = position_dodge(),dotsize =.2)+
  scale_y_log10(limits=c(1E-4,10), expand = c(0, 0),
                breaks = scales::trans_breaks("log10", function(x) 10^x),
                labels = scales::trans_format("log10", scales::math_format(10^.x)))+
  geom_jitter(width =.1)+
  #scale_y_log10(limits=c(1E-4,10)) + 
  scale_fill_manual(values = c("coral3","slateblue")) +
  theme_classic(base_size = 20)+
  theme(axis.text.y =element_text(size=14))+
  labs(y="SNV per division")+
  annotation_logticks(sides = "l", outside = TRUE,long = unit(0,"mm"),short = unit(1,"mm"),
  mid = unit(1.25,"mm"),)+
  coord_cartesian(clip = "off")

#Figure 3d:
###########  

### data import from last dataset
PT.SNV.dt<-data.table(read_xlsx("Table_mutational_spectra.xlsx",range = "B2:Z8",col_names = T))


###############################################################
setnames(PT.SNV.dt, 'Patient Isolate SNVs',"SNV")
PT.div.dt<-melt(PT.SNV.dt,id.vars = 'SNV', variable.name = "isolate")[,.(SNP=sum(value)),by=isolate]


PT.div.dt[,DnaQ := ifelse((isolate == "2B3" | isolate == "7B1" | isolate == "1G1" | isolate == "8B1"),yes ="WT",no="E9G")]

pdf("Figure_3d.pdf",width=4,height = 4,family = "Arial")
ggplot(PT.div.dt,aes(y=SNP,x= DnaQ, fill=DnaQ)) +
  geom_boxplot(width=.5,lwd=.5,outlier.shape = NA) + 
  geom_jitter(width =.1) +
#  scale_y_log10(limits=c(1E-4,10)) + 
  scale_fill_manual(values = c("coral3","slateblue")) +
  theme_classic(base_size = 20)+
  theme(axis.text.y =element_text(size=14),axis.title.y=element_text(size=19))+
  labs(y="Divergence from LCA (SNV)")
dev.off()
## quartz_off_screen 
##                 2
ggplot(PT.div.dt,aes(y=SNP,x= DnaQ, fill=DnaQ)) +
  geom_boxplot(width=.5,lwd=.5,outlier.shape = NA) + 
  geom_jitter(width =.1) +
#  scale_y_log10(limits=c(1E-4,10)) + 
  scale_fill_manual(values = c("coral3","slateblue")) +
  theme_classic(base_size = 20)+
  theme(axis.text.y =element_text(size=14),axis.title.y=element_text(size=19))+
  labs(y="Divergence from LCA (SNV)")

#Figure S2b:

FigS2b.dt<-fread("Pairewise_diff.txt")

FigS2b.dt[V1 != V2][order(V3)]
##        V1   V2   V3
##   1:  2B3  8B1   13
##   2: 4G1c 4G1d   28
##   3:  7B1  2B3   34
##   4:  2B3  1G1   39
##   5:  7B1  8B1   47
##  ---               
## 272: 4G1c  3B1 1713
## 273:  3G2  3B1 1724
## 274:  3B1 4G1b 1765
## 275: 1B1b  3B1 2169
## 276: 1B1a  3B1 2179
length(rownames(FigS2b.dt[V1 != V2]))
## [1] 276
pdf("Figure_S2a.pdf",width=2,height = 4,family = "Arial")

ggplot(FigS2b.dt[V1 != V2],aes(x=1,y=V3)) + 
  geom_violin(lwd=.1,fill="slateblue") + scale_y_log10(limits=c(10,3000)) + 
  geom_boxplot(width=.3,fill='navy',color='black',lwd=.5,
                 notch=TRUE,outlier.alpha=0)+
  geom_dotplot(data = FigS2b.dt[V1 !=V2][V3<100],aes(x=1,y=V3),binaxis='y', stackdir='center', dotsize=0.2, width =1,stackratio = 1.5)+
  theme_classic(base_size = 12) + theme(axis.ticks.x = element_blank(),axis.text.x = element_blank()) +
  annotation_logticks(sides = "l") + 
  labs(y="SNV",x=NULL) 
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
dev.off()
## quartz_off_screen 
##                 2
ggplot(FigS2b.dt[V1 != V2],aes(x=1,y=V3)) + 
  geom_violin(lwd=.1,fill="slateblue") + scale_y_log10(limits=c(10,3000)) + 
  geom_boxplot(width=.3,fill='navy',color='black',lwd=.5,
                 notch=TRUE,outlier.alpha=0)+
  geom_dotplot(data = FigS2b.dt[V1 !=V2][V3<100],aes(x=1,y=V3),binaxis='y', stackdir='center', dotsize=0.2, width =1,stackratio = 1.5)+
  theme_classic(base_size = 12) + theme(axis.ticks.x = element_blank(),axis.text.x = element_blank()) +
  annotation_logticks(sides = "l") + 
  labs(y="SNV",x=NULL) 
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

MA.spectra.dt<-melt(MA2.SNV.dt,id.vars = 'SNV', variable.name = "isolate")[
  !(isolate %in% c("8B1","6G1a","6G1b","1G1"))]


MA.spectra.dt[, freq:=value/sum(value), by=isolate]
setkeyv(MA.spectra.dt,c("isolate","SNV"))

PT.spectra.dt<-melt(PT.SNV.dt,id.vars = 'SNV', variable.name = "isolate")
PT.spectra.dt[, freq:=value/sum(value), by=isolate]
setkeyv(PT.spectra.dt,c("isolate","SNV"))
#Figure S10:
############


PT.spectra.dt[MA.spectra.dt]
##          SNV isolate value       freq i.value      i.freq
##   1: A:T>C:G    5G1b    35 0.18134715      19 0.136690647
##   2: A:T>G:C    5G1b    31 0.16062176      58 0.417266187
##   3: A:T>T:A    5G1b    31 0.16062176      14 0.100719424
##   4: G:C>A:T    5G1b    42 0.21761658      31 0.223021583
##   5: G:C>C:G    5G1b     4 0.02072539       0 0.000000000
##  ---                                                     
## 116: A:T>G:C     2G1    30 0.16129032     150 0.498338870
## 117: A:T>T:A     2G1    23 0.12365591      14 0.046511628
## 118: G:C>A:T     2G1    36 0.19354839      82 0.272425249
## 119: G:C>C:G     2G1     5 0.02688172       3 0.009966777
## 120: G:C>T:A     2G1    51 0.27419355      26 0.086378738
PT.spectra.dt[MA.spectra.dt][,.(isolate, SNV, delta_freq = freq-i.freq)]
##      isolate     SNV   delta_freq
##   1:    5G1b A:T>C:G  0.044656503
##   2:    5G1b A:T>G:C -0.256644425
##   3:    5G1b A:T>T:A  0.059902337
##   4:    5G1b G:C>A:T -0.005405002
##   5:    5G1b G:C>C:G  0.020725389
##  ---                             
## 116:     2G1 A:T>G:C -0.337048548
## 117:     2G1 A:T>T:A  0.077144286
## 118:     2G1 G:C>A:T -0.078876862
## 119:     2G1 G:C>C:G  0.016914943
## 120:     2G1 G:C>T:A  0.187814811
pdf("Figure_S10.pdf",width=7,height = 5,family = "Arial")
ggplot(PT.spectra.dt[MA.spectra.dt][,.(isolate, SNV, delta_freq = freq-i.freq)], aes(x=SNV,y=delta_freq,fill=SNV)) +
  geom_boxplot(outlier.shape = NA,varwidth = .4) + geom_jitter(width = .1) +
  scale_fill_brewer(type = "qual",palette = "Dark2" ) +
  theme_classic()
dev.off()
## quartz_off_screen 
##                 2
ggplot(PT.spectra.dt[MA.spectra.dt][,.(isolate, SNV, delta_freq = freq-i.freq)], aes(x=SNV,y=delta_freq,fill=SNV)) +
  geom_boxplot(outlier.shape = NA,varwidth = .4) + geom_jitter(width = .1) +
  scale_fill_brewer(type = "qual",palette = "Dark2" ) +
  theme_classic()

#knitr::opts_chunk$set() 

MA.spectra.dt
##          SNV isolate value        freq
##   1: A:T>C:G    5G1b    19 0.136690647
##   2: A:T>G:C    5G1b    58 0.417266187
##   3: A:T>T:A    5G1b    14 0.100719424
##   4: G:C>A:T    5G1b    31 0.223021583
##   5: G:C>C:G    5G1b     0 0.000000000
##  ---                                  
## 116: A:T>G:C     2G1   150 0.498338870
## 117: A:T>T:A     2G1    14 0.046511628
## 118: G:C>A:T     2G1    82 0.272425249
## 119: G:C>C:G     2G1     3 0.009966777
## 120: G:C>T:A     2G1    26 0.086378738
PT.spectra.dt
##          SNV isolate value       freq
##   1: A:T>C:G    5G1b    35 0.18134715
##   2: A:T>G:C    5G1b    31 0.16062176
##   3: A:T>T:A    5G1b    31 0.16062176
##   4: G:C>A:T    5G1b    42 0.21761658
##   5: G:C>C:G    5G1b     4 0.02072539
##  ---                                 
## 140: A:T>G:C     2G1    30 0.16129032
## 141: A:T>T:A     2G1    23 0.12365591
## 142: G:C>A:T     2G1    36 0.19354839
## 143: G:C>C:G     2G1     5 0.02688172
## 144: G:C>T:A     2G1    51 0.27419355
PT.spectra.dt[MA.spectra.dt]
##          SNV isolate value       freq i.value      i.freq
##   1: A:T>C:G    5G1b    35 0.18134715      19 0.136690647
##   2: A:T>G:C    5G1b    31 0.16062176      58 0.417266187
##   3: A:T>T:A    5G1b    31 0.16062176      14 0.100719424
##   4: G:C>A:T    5G1b    42 0.21761658      31 0.223021583
##   5: G:C>C:G    5G1b     4 0.02072539       0 0.000000000
##  ---                                                     
## 116: A:T>G:C     2G1    30 0.16129032     150 0.498338870
## 117: A:T>T:A     2G1    23 0.12365591      14 0.046511628
## 118: G:C>A:T     2G1    36 0.19354839      82 0.272425249
## 119: G:C>C:G     2G1     5 0.02688172       3 0.009966777
## 120: G:C>T:A     2G1    51 0.27419355      26 0.086378738
PT.spectra.dt[MA.spectra.dt][,.(isolate, SNV, delta_freq = freq-i.freq)]
##      isolate     SNV   delta_freq
##   1:    5G1b A:T>C:G  0.044656503
##   2:    5G1b A:T>G:C -0.256644425
##   3:    5G1b A:T>T:A  0.059902337
##   4:    5G1b G:C>A:T -0.005405002
##   5:    5G1b G:C>C:G  0.020725389
##  ---                             
## 116:     2G1 A:T>G:C -0.337048548
## 117:     2G1 A:T>T:A  0.077144286
## 118:     2G1 G:C>A:T -0.078876862
## 119:     2G1 G:C>C:G  0.016914943
## 120:     2G1 G:C>T:A  0.187814811
PT.spectra.dt[MA.spectra.dt][,.(isolate, SNV, delta_freq = freq-i.freq)][,.(difference = mean(delta_freq),logP = -log10(wilcox.test(delta_freq)$p.value)), by=SNV]
##        SNV   difference       logP
## 1: A:T>C:G -0.005037704 0.05159177
## 2: A:T>G:C -0.099484202 1.37809475
## 3: A:T>T:A -0.001692194 0.32339325
## 4: G:C>A:T -0.015714808 0.17119647
## 5: G:C>C:G  0.008590900 1.23464401
## 6: G:C>T:A  0.113338008 3.49168321
wilcox.test(PT.spectra.dt[MA.spectra.dt]$freq-PT.spectra.dt[MA.spectra.dt]$i.freq)$p.value
## [1] 0.3219527
ggplot(PT.spectra.dt[MA.spectra.dt][,.(isolate, SNV, delta_freq = freq-i.freq)][,.(difference = mean(delta_freq),logP = -log10(wilcox.test(delta_freq)$p.value)), by=SNV],
       aes(x=difference,y=logP,label=SNV,colour=SNV)) +
  geom_point(size=3) + 
  geom_text_repel(colour="black",nudge_x=.053) +
  scale_colour_brewer(type = "qual",palette = "Dark2", guide =F) +
  scale_x_continuous(limits=c(-.15,.15)) +
  labs(x="Mean PT-MA difference",y="-logP") +
  theme_classic(base_size = 12)

PT.spectra.dt[MA.spectra.dt]
##          SNV isolate value       freq i.value      i.freq
##   1: A:T>C:G    5G1b    35 0.18134715      19 0.136690647
##   2: A:T>G:C    5G1b    31 0.16062176      58 0.417266187
##   3: A:T>T:A    5G1b    31 0.16062176      14 0.100719424
##   4: G:C>A:T    5G1b    42 0.21761658      31 0.223021583
##   5: G:C>C:G    5G1b     4 0.02072539       0 0.000000000
##  ---                                                     
## 116: A:T>G:C     2G1    30 0.16129032     150 0.498338870
## 117: A:T>T:A     2G1    23 0.12365591      14 0.046511628
## 118: G:C>A:T     2G1    36 0.19354839      82 0.272425249
## 119: G:C>C:G     2G1     5 0.02688172       3 0.009966777
## 120: G:C>T:A     2G1    51 0.27419355      26 0.086378738
PT.spectra.dt[PT.spectra.dt,allow.cartesian=T,on="SNV"][order(isolate,i.isolate)]
##           SNV isolate value       freq i.isolate i.value     i.freq
##    1: A:T>C:G    5G1b    35 0.18134715      5G1b      35 0.18134715
##    2: A:T>G:C    5G1b    31 0.16062176      5G1b      31 0.16062176
##    3: A:T>T:A    5G1b    31 0.16062176      5G1b      31 0.16062176
##    4: G:C>A:T    5G1b    42 0.21761658      5G1b      42 0.21761658
##    5: G:C>C:G    5G1b     4 0.02072539      5G1b       4 0.02072539
##   ---                                                              
## 3452: A:T>G:C     2G1    30 0.16129032       2G1      30 0.16129032
## 3453: A:T>T:A     2G1    23 0.12365591       2G1      23 0.12365591
## 3454: G:C>A:T     2G1    36 0.19354839       2G1      36 0.19354839
## 3455: G:C>C:G     2G1     5 0.02688172       2G1       5 0.02688172
## 3456: G:C>T:A     2G1    51 0.27419355       2G1      51 0.27419355
PT.spectra.pval.dt <-PT.spectra.dt[PT.spectra.dt,allow.cartesian=T,on="SNV"][,chisq.test(matrix(data = c(value,i.value),ncol = 2))$p.value,by=.(isolate,i.isolate)]
MA.spectra.pval.dt <-MA.spectra.dt[MA.spectra.dt,allow.cartesian=T,on="SNV"][,chisq.test(matrix(data = c(value,i.value),ncol = 2))$p.value,by=.(isolate,i.isolate)]
PTvsMA.spectra.pval.dt <-PT.spectra.dt[MA.spectra.dt,allow.cartesian=T,on="SNV"][,chisq.test(matrix(data = c(value,i.value),ncol = 2))$p.value,by=.(isolate,i.isolate)]

dcast(PT.spectra.pval.dt,isolate ~ i.isolate)
## Using 'V1' as value column. Use 'value.var' to override
##     isolate         5G1b         5G1a         6G1b         2B2a         2B2b
##  1:    5G1b 1.000000e+00 9.349139e-01 9.688062e-01 1.524869e-02 1.987835e-02
##  2:    5G1a 9.349139e-01 1.000000e+00 5.975245e-01 1.653591e-03 2.576276e-03
##  3:    6G1b 9.688062e-01 5.975245e-01 1.000000e+00 5.143139e-02 5.599936e-02
##  4:    2B2a 1.524869e-02 1.653591e-03 5.143139e-02 1.000000e+00 8.260670e-01
##  5:    2B2b 1.987835e-02 2.576276e-03 5.599936e-02 8.260670e-01 1.000000e+00
##  6:     2B1 2.404062e-01 3.815275e-02 3.615634e-01 5.050247e-02 2.505466e-01
##  7:     8B1 3.143907e-03 3.214699e-03 7.639774e-04 2.338284e-03 5.154551e-04
##  8:     2B3 3.887974e-04 3.977783e-04 6.291931e-05 1.611130e-04 4.060590e-05
##  9:     7B1 1.926923e-05 2.408350e-05 1.786559e-06 3.390010e-06 8.511987e-07
## 10:     1G1 5.129370e-01 4.799516e-01 2.780501e-01 3.269489e-01 2.933917e-01
## 11:    6G1a 3.534372e-01 4.953403e-02 7.886153e-01 9.019921e-02 5.984663e-02
## 12:     2G2 1.393980e-01 6.366288e-02 1.490029e-01 1.316104e-07 2.594930e-07
## 13:     3G1 2.027142e-01 2.467556e-01 1.364252e-01 4.274990e-08 1.062865e-07
## 14:    1B1a 1.219244e-08 5.381779e-07 6.033561e-11 1.172237e-27 4.992083e-28
## 15:    1B1b 4.383228e-09 3.278957e-07 1.375868e-11 1.349314e-28 6.159104e-29
## 16:     3B1 4.174516e-20 1.278850e-15 1.494218e-24 5.894622e-54 1.648180e-51
## 17:    4G1d 1.551785e-02 1.346368e-03 1.087796e-01 6.655144e-01 3.078904e-01
## 18:    4G1c 1.282896e-02 1.243758e-03 9.172317e-02 5.086889e-01 2.371737e-01
## 19:    4G1a 1.833134e-02 1.774794e-03 1.290526e-01 3.613974e-01 2.013570e-01
## 20:    4G1b 5.258126e-05 1.772721e-06 9.897128e-04 8.043299e-02 1.239029e-02
## 21:     1G2 7.936768e-01 5.136746e-01 8.668955e-01 3.962396e-01 4.315178e-01
## 22:     3G3 3.773864e-01 1.036704e-01 8.008204e-01 3.762653e-03 2.041073e-03
## 23:     3G2 2.661662e-01 8.482486e-02 6.670256e-01 7.984447e-04 2.748562e-04
## 24:     2G1 8.309870e-01 6.790375e-01 5.954864e-01 3.381823e-04 2.322387e-04
##     isolate         5G1b         5G1a         6G1b         2B2a         2B2b
##              2B1          8B1          2B3          7B1          1G1
##  1: 2.404062e-01 3.143907e-03 3.887974e-04 1.926923e-05 5.129370e-01
##  2: 3.815275e-02 3.214699e-03 3.977783e-04 2.408350e-05 4.799516e-01
##  3: 3.615634e-01 7.639774e-04 6.291931e-05 1.786559e-06 2.780501e-01
##  4: 5.050247e-02 2.338284e-03 1.611130e-04 3.390010e-06 3.269489e-01
##  5: 2.505466e-01 5.154551e-04 4.060590e-05 8.511987e-07 2.933917e-01
##  6: 1.000000e+00 8.485104e-04 1.265298e-04 5.248816e-06 3.381611e-01
##  7: 8.485104e-04 1.000000e+00 7.572787e-01 5.726745e-01 2.421688e-01
##  8: 1.265298e-04 7.572787e-01          NaN          NaN 3.072073e-02
##  9: 5.248816e-06 5.726745e-01          NaN          NaN 9.573314e-03
## 10: 3.381611e-01 2.421688e-01 3.072073e-02 9.573314e-03 1.000000e+00
## 11: 2.557498e-01 4.647781e-04 3.712193e-05 6.317315e-07 2.693280e-01
## 12: 5.759091e-03 2.929672e-06 1.288526e-07 7.060235e-10 1.910280e-02
## 13: 8.625983e-04 8.995986e-07 3.212174e-08 2.869536e-10 4.406905e-03
## 14: 1.327272e-16 4.902517e-06 2.406803e-07 3.818316e-11 5.080661e-02
## 15: 2.401037e-17 6.016314e-06 3.363978e-07 6.535096e-11 5.279367e-02
## 16: 2.068455e-31 1.357045e-14 3.707852e-16 2.091597e-22 9.222175e-05
## 17: 1.892381e-02 4.836448e-05 1.319652e-06 3.703457e-09 1.154916e-01
## 18: 1.093009e-02 2.508552e-05 6.630646e-07 1.543333e-09 1.166283e-01
## 19: 1.895904e-02 1.949674e-05 5.571333e-07 1.780492e-09 9.753207e-02
## 20: 1.204318e-04 3.039028e-05 9.850733e-07 1.764737e-09 1.356799e-01
## 21: 3.459355e-01 2.255912e-03 1.875796e-04 8.641687e-06 3.538293e-01
## 22: 5.823781e-02 7.574914e-06 1.360562e-07 5.116427e-10 1.398123e-02
## 23: 9.639617e-03 7.005500e-07 5.551896e-09 6.309959e-12 3.908427e-03
## 24: 1.627183e-02 3.145136e-03 4.910797e-04 2.300620e-05 6.626006e-01
##              2B1          8B1          2B3          7B1          1G1
##             6G1a          2G2          3G1         1B1a         1B1b
##  1: 3.534372e-01 1.393980e-01 2.027142e-01 1.219244e-08 4.383228e-09
##  2: 4.953403e-02 6.366288e-02 2.467556e-01 5.381779e-07 3.278957e-07
##  3: 7.886153e-01 1.490029e-01 1.364252e-01 6.033561e-11 1.375868e-11
##  4: 9.019921e-02 1.316104e-07 4.274990e-08 1.172237e-27 1.349314e-28
##  5: 5.984663e-02 2.594930e-07 1.062865e-07 4.992083e-28 6.159104e-29
##  6: 2.557498e-01 5.759091e-03 8.625983e-04 1.327272e-16 2.401037e-17
##  7: 4.647781e-04 2.929672e-06 8.995986e-07 4.902517e-06 6.016314e-06
##  8: 3.712193e-05 1.288526e-07 3.212174e-08 2.406803e-07 3.363978e-07
##  9: 6.317315e-07 7.060235e-10 2.869536e-10 3.818316e-11 6.535096e-11
## 10: 2.693280e-01 1.910280e-02 4.406905e-03 5.080661e-02 5.279367e-02
## 11: 1.000000e+00 1.593525e-02 4.652631e-03 5.895554e-17 6.669275e-18
## 12: 1.593525e-02 1.000000e+00 6.391007e-01 2.449398e-06 7.482776e-07
## 13: 4.652631e-03 6.391007e-01 1.000000e+00 1.707436e-07 7.497355e-08
## 14: 5.895554e-17 2.449398e-06 1.707436e-07 1.000000e+00 9.994785e-01
## 15: 6.669275e-18 7.482776e-07 7.497355e-08 9.994785e-01 1.000000e+00
## 16: 4.142097e-35 2.561961e-15 7.268410e-15 2.278003e-06 8.592549e-06
## 17: 3.803576e-01 4.242324e-07 2.057488e-07 3.484129e-32 2.994213e-33
## 18: 3.403489e-01 1.527924e-07 1.423124e-07 4.658362e-34 3.916566e-35
## 19: 4.788574e-01 1.090791e-06 8.796074e-07 1.732247e-31 1.384282e-32
## 20: 3.881222e-02 1.651289e-11 6.298062e-12 2.125414e-45 1.392457e-46
## 21: 3.949317e-01 7.380060e-03 9.358212e-03 2.734315e-12 7.237920e-13
## 22: 5.322772e-01 2.151378e-01 6.999627e-02 5.515349e-13 8.934373e-14
## 23: 3.117112e-01 1.416776e-01 6.390796e-02 2.976163e-14 4.759195e-15
## 24: 1.620287e-01 8.854705e-02 2.413217e-01 2.497864e-08 9.566429e-09
##             6G1a          2G2          3G1         1B1a         1B1b
##              3B1         4G1d         4G1c         4G1a         4G1b
##  1: 4.174516e-20 1.551785e-02 1.282896e-02 1.833134e-02 5.258126e-05
##  2: 1.278850e-15 1.346368e-03 1.243758e-03 1.774794e-03 1.772721e-06
##  3: 1.494218e-24 1.087796e-01 9.172317e-02 1.290526e-01 9.897128e-04
##  4: 5.894622e-54 6.655144e-01 5.086889e-01 3.613974e-01 8.043299e-02
##  5: 1.648180e-51 3.078904e-01 2.371737e-01 2.013570e-01 1.239029e-02
##  6: 2.068455e-31 1.892381e-02 1.093009e-02 1.895904e-02 1.204318e-04
##  7: 1.357045e-14 4.836448e-05 2.508552e-05 1.949674e-05 3.039028e-05
##  8: 3.707852e-16 1.319652e-06 6.630646e-07 5.571333e-07 9.850733e-07
##  9: 2.091597e-22 3.703457e-09 1.543333e-09 1.780492e-09 1.764737e-09
## 10: 9.222175e-05 1.154916e-01 1.166283e-01 9.753207e-02 1.356799e-01
## 11: 4.142097e-35 3.803576e-01 3.403489e-01 4.788574e-01 3.881222e-02
## 12: 2.561961e-15 4.242324e-07 1.527924e-07 1.090791e-06 1.651289e-11
## 13: 7.268410e-15 2.057488e-07 1.423124e-07 8.796074e-07 6.298062e-12
## 14: 2.278003e-06 3.484129e-32 4.658362e-34 1.732247e-31 2.125414e-45
## 15: 8.592549e-06 2.994213e-33 3.916566e-35 1.384282e-32 1.392457e-46
## 16: 1.000000e+00 5.463871e-60 3.197277e-62 1.105915e-57 2.323230e-78
## 17: 5.463871e-60 1.000000e+00 9.987691e-01 9.899509e-01 4.496617e-01
## 18: 3.197277e-62 9.987691e-01 1.000000e+00 9.991280e-01 5.170326e-01
## 19: 1.105915e-57 9.899509e-01 9.991280e-01 1.000000e+00 5.562112e-01
## 20: 2.323230e-78 4.496617e-01 5.170326e-01 5.562112e-01 1.000000e+00
## 21: 4.192463e-27 3.001195e-01 2.540580e-01 2.397194e-01 4.982540e-03
## 22: 9.678001e-30 1.924931e-02 1.019030e-02 2.126999e-02 2.562950e-05
## 23: 1.289819e-32 5.709164e-03 2.851758e-03 6.710227e-03 1.917749e-06
## 24: 4.781325e-19 7.081954e-04 7.494697e-04 1.548128e-03 2.987067e-06
##              3B1         4G1d         4G1c         4G1a         4G1b
##              1G2          3G3          3G2          2G1
##  1: 7.936768e-01 3.773864e-01 2.661662e-01 8.309870e-01
##  2: 5.136746e-01 1.036704e-01 8.482486e-02 6.790375e-01
##  3: 8.668955e-01 8.008204e-01 6.670256e-01 5.954864e-01
##  4: 3.962396e-01 3.762653e-03 7.984447e-04 3.381823e-04
##  5: 4.315178e-01 2.041073e-03 2.748562e-04 2.322387e-04
##  6: 3.459355e-01 5.823781e-02 9.639617e-03 1.627183e-02
##  7: 2.255912e-03 7.574914e-06 7.005500e-07 3.145136e-03
##  8: 1.875796e-04 1.360562e-07 5.551896e-09 4.910797e-04
##  9: 8.641687e-06 5.116427e-10 6.309959e-12 2.300620e-05
## 10: 3.538293e-01 1.398123e-02 3.908427e-03 6.626006e-01
## 11: 3.949317e-01 5.322772e-01 3.117112e-01 1.620287e-01
## 12: 7.380060e-03 2.151378e-01 1.416776e-01 8.854705e-02
## 13: 9.358212e-03 6.999627e-02 6.390796e-02 2.413217e-01
## 14: 2.734315e-12 5.515349e-13 2.976163e-14 2.497864e-08
## 15: 7.237920e-13 8.934373e-14 4.759195e-15 9.566429e-09
## 16: 4.192463e-27 9.678001e-30 1.289819e-32 4.781325e-19
## 17: 3.001195e-01 1.924931e-02 5.709164e-03 7.081954e-04
## 18: 2.540580e-01 1.019030e-02 2.851758e-03 7.494697e-04
## 19: 2.397194e-01 2.126999e-02 6.710227e-03 1.548128e-03
## 20: 4.982540e-03 2.562950e-05 1.917749e-06 2.987067e-06
## 21: 1.000000e+00 3.227614e-01 2.228573e-01 1.844926e-01
## 22: 3.227614e-01 1.000000e+00 9.974790e-01 9.239711e-02
## 23: 2.228573e-01 9.974790e-01 1.000000e+00 6.612473e-02
## 24: 1.844926e-01 9.239711e-02 6.612473e-02 1.000000e+00
##              1G2          3G3          3G2          2G1
dcast(MA.spectra.pval.dt,isolate ~ i.isolate)
## Using 'V1' as value column. Use 'value.var' to override
##     isolate         5G1b         5G1a         2B2a         2B2b          2B1
##  1:    5G1b          NaN          NaN 2.329194e-04          NaN 9.682551e-02
##  2:    5G1a          NaN          NaN 5.315323e-02          NaN 2.808922e-01
##  3:    2B2a 2.329194e-04 5.315323e-02 1.000000e+00 4.348749e-01 9.191913e-03
##  4:    2B2b          NaN          NaN 4.348749e-01          NaN 1.542864e-01
##  5:     2B1 9.682551e-02 2.808922e-01 9.191913e-03 1.542864e-01 1.000000e+00
##  6:     2B3 4.245329e-07 7.123851e-04 6.158249e-04 7.842315e-05 2.812776e-01
##  7:     7B1          NaN          NaN 5.082750e-01          NaN 9.563633e-01
##  8:     2G2          NaN          NaN 8.354934e-11          NaN 2.571156e-02
##  9:     3G1 3.254822e-03 8.630508e-02 6.243772e-02 4.540034e-01 4.580641e-01
## 10:    1B1a 3.492163e-05 1.003886e-02 7.059109e-04 3.274400e-03 7.103096e-02
## 11:    1B1b 3.026118e-13 2.454219e-06 1.618892e-04 1.413948e-04 4.635797e-04
## 12:     3B1 1.298090e-17 2.582877e-10 1.006755e-17 1.566370e-12 2.061579e-07
## 13:    4G1d          NaN          NaN 2.049361e-05          NaN 1.533985e-02
## 14:    4G1c          NaN          NaN 2.381914e-14          NaN 1.303989e-03
## 15:    4G1a 2.261058e-07 5.949174e-05 4.959379e-16 1.007677e-06 5.327518e-02
## 16:    4G1b          NaN          NaN 2.934877e-02          NaN 8.322826e-01
## 17:     1G2          NaN          NaN 9.655994e-02          NaN 1.055563e-02
## 18:     3G3 3.294290e-03 4.530645e-03 7.534511e-13 1.377871e-05 9.781867e-04
## 19:     3G2          NaN          NaN 2.805301e-05          NaN 3.179531e-04
## 20:     2G1 4.052250e-02 1.223943e-02 6.274110e-08 3.434709e-03 2.805723e-03
##              2B3        7B1          2G2          3G1         1B1a         1B1b
##  1: 4.245329e-07        NaN          NaN 3.254822e-03 3.492163e-05 3.026118e-13
##  2: 7.123851e-04        NaN          NaN 8.630508e-02 1.003886e-02 2.454219e-06
##  3: 6.158249e-04 0.50827501 8.354934e-11 6.243772e-02 7.059109e-04 1.618892e-04
##  4: 7.842315e-05        NaN          NaN 4.540034e-01 3.274400e-03 1.413948e-04
##  5: 2.812776e-01 0.95636326 2.571156e-02 4.580641e-01 7.103096e-02 4.635797e-04
##  6:          NaN        NaN 9.331034e-05 2.409050e-03 8.462851e-02 7.989034e-07
##  7:          NaN        NaN          NaN 9.040959e-01 5.680103e-01 4.763228e-01
##  8: 9.331034e-05        NaN          NaN 8.755415e-04 9.334026e-02 5.777656e-08
##  9: 2.409050e-03 0.90409587 8.755415e-04 1.000000e+00 1.146653e-02 1.788149e-03
## 10: 8.462851e-02 0.56801035 9.334026e-02 1.146653e-02 1.000000e+00 3.677758e-02
## 11: 7.989034e-07 0.47632285 5.777656e-08 1.788149e-03 3.677758e-02 1.000000e+00
## 12: 8.814641e-06 0.03500296 1.509321e-06 2.470711e-13 1.049110e-01 4.350354e-10
## 13: 7.565115e-06        NaN          NaN 1.276270e-03 7.002015e-02 6.487760e-06
## 14: 1.724890e-04        NaN          NaN 2.836844e-06 3.887768e-02 3.151812e-11
## 15: 2.455860e-03 0.88117721 1.955396e-02 1.307276e-06 2.160523e-04 1.213196e-16
## 16: 2.350238e-02        NaN          NaN 6.628190e-01 1.428087e-01 6.010342e-03
## 17: 2.272302e-05        NaN          NaN 3.960443e-03 3.205490e-05 3.557155e-09
## 18: 7.300444e-03 0.15305444 6.348760e-18 9.867396e-12 9.180923e-09 1.311615e-29
## 19: 8.081844e-06        NaN          NaN 1.017013e-06 4.949748e-08 3.248997e-17
## 20: 1.581824e-03 0.36882357 3.685098e-18 7.140353e-08 1.276755e-09 1.112248e-24
##              3B1         4G1d         4G1c         4G1a         4G1b
##  1: 1.298090e-17          NaN          NaN 2.261058e-07          NaN
##  2: 2.582877e-10          NaN          NaN 5.949174e-05          NaN
##  3: 1.006755e-17 2.049361e-05 2.381914e-14 4.959379e-16 2.934877e-02
##  4: 1.566370e-12          NaN          NaN 1.007677e-06          NaN
##  5: 2.061579e-07 1.533985e-02 1.303989e-03 5.327518e-02 8.322826e-01
##  6: 8.814641e-06 7.565115e-06 1.724890e-04 2.455860e-03 2.350238e-02
##  7: 3.500296e-02          NaN          NaN 8.811772e-01          NaN
##  8: 1.509321e-06          NaN          NaN 1.955396e-02          NaN
##  9: 2.470711e-13 1.276270e-03 2.836844e-06 1.307276e-06 6.628190e-01
## 10: 1.049110e-01 7.002015e-02 3.887768e-02 2.160523e-04 1.428087e-01
## 11: 4.350354e-10 6.487760e-06 3.151812e-11 1.213196e-16 6.010342e-03
## 12: 1.000000e+00 3.778790e-06 5.966340e-05 5.562571e-11 2.962644e-06
## 13: 3.778790e-06          NaN          NaN 5.849945e-05          NaN
## 14: 5.966340e-05          NaN          NaN 2.873488e-02          NaN
## 15: 5.562571e-11 5.849945e-05 2.873488e-02 1.000000e+00 6.019479e-02
## 16: 2.962644e-06          NaN          NaN 6.019479e-02          NaN
## 17: 4.998926e-17          NaN          NaN 6.107294e-11          NaN
## 18: 2.121175e-27 2.575983e-11 7.957685e-20 2.781901e-17 1.500693e-04
## 19: 5.035201e-22          NaN          NaN 2.833432e-13          NaN
## 20: 1.421551e-33 1.689209e-12 4.448014e-21 1.156384e-19 6.858215e-04
##              1G2          3G3          3G2          2G1
##  1:          NaN 3.294290e-03          NaN 4.052250e-02
##  2:          NaN 4.530645e-03          NaN 1.223943e-02
##  3: 9.655994e-02 7.534511e-13 2.805301e-05 6.274110e-08
##  4:          NaN 1.377871e-05          NaN 3.434709e-03
##  5: 1.055563e-02 9.781867e-04 3.179531e-04 2.805723e-03
##  6: 2.272302e-05 7.300444e-03 8.081844e-06 1.581824e-03
##  7:          NaN 1.530544e-01          NaN 3.688236e-01
##  8:          NaN 6.348760e-18          NaN 3.685098e-18
##  9: 3.960443e-03 9.867396e-12 1.017013e-06 7.140353e-08
## 10: 3.205490e-05 9.180923e-09 4.949748e-08 1.276755e-09
## 11: 3.557155e-09 1.311615e-29 3.248997e-17 1.112248e-24
## 12: 4.998926e-17 2.121175e-27 5.035201e-22 1.421551e-33
## 13:          NaN 2.575983e-11          NaN 1.689209e-12
## 14:          NaN 7.957685e-20          NaN 4.448014e-21
## 15: 6.107294e-11 2.781901e-17 2.833432e-13 1.156384e-19
## 16:          NaN 1.500693e-04          NaN 6.858215e-04
## 17:          NaN 4.240471e-03          NaN 2.397349e-01
## 18: 4.240471e-03 1.000000e+00 8.144742e-02 4.934277e-02
## 19:          NaN 8.144742e-02          NaN 7.066273e-01
## 20: 2.397349e-01 4.934277e-02 7.066273e-01 1.000000e+00
dcast(PTvsMA.spectra.pval.dt,isolate ~ i.isolate)
## Using 'V1' as value column. Use 'value.var' to override
##     isolate         5G1b         5G1a         2B2a         2B2b          2B1
##  1:    5G1b 3.185055e-06 1.412074e-02 1.121191e-06 8.358178e-03 4.293127e-01
##  2:    5G1a 2.544259e-07 1.625780e-03 1.605713e-07 1.570537e-03 3.466166e-01
##  3:    6G1b 4.731188e-05 7.013782e-02 1.428750e-05 4.037808e-02 4.066520e-01
##  4:    2B2a 3.527046e-04 1.062972e-01 1.435416e-02 4.156633e-01 1.006759e-01
##  5:    2B2b 5.231326e-04 9.795970e-02 2.730312e-04 2.499545e-01 1.262357e-01
##  6:     2B1 2.188225e-05 4.641625e-02 1.270167e-07 1.345403e-02 1.180695e-01
##  7:     8B1 1.262194e-07 1.997230e-04 5.646063e-05 2.676773e-04 3.404208e-03
##  8:     2B3 7.132611e-09 2.160979e-05 7.856221e-07 1.575802e-05 5.165790e-04
##  9:     7B1 1.880419e-10 1.652119e-06 3.347229e-09 1.196073e-06 7.276136e-05
## 10:     1G1 1.402112e-03 4.978863e-02 1.930753e-02 1.409918e-02 8.979154e-01
## 11:    6G1a 1.169320e-03 3.472597e-01 8.474982e-05 1.253657e-01 3.144066e-01
## 12:     2G2 3.402271e-10 8.327167e-04 9.064400e-13 2.909229e-05 4.179656e-02
## 13:     3G1 5.959752e-10 4.950401e-04 8.575240e-13 2.856050e-05 6.542536e-02
## 14:    1B1a 8.545720e-36 3.271680e-16 3.024220e-34 2.590310e-17 2.872800e-05
## 15:    1B1b 2.931325e-37 2.768413e-17 2.668991e-35 2.830783e-18 1.486416e-05
## 16:     3B1 2.635981e-59 6.083773e-31 8.927357e-65 1.283509e-34 1.185754e-10
## 17:    4G1d 5.561156e-03 4.377635e-01 1.110550e-02 7.046951e-01 1.757833e-01
## 18:    4G1c 1.256696e-02 5.331308e-01 7.774735e-03 7.535217e-01 2.398383e-01
## 19:    4G1a 2.311704e-02 6.413869e-01 4.184261e-03 7.136695e-01 2.583898e-01
## 20:    4G1b 7.279233e-02 7.200585e-01 9.027863e-03 7.595482e-01 8.317685e-02
## 21:     1G2 1.024362e-04 5.057477e-02 4.087977e-04 1.082752e-01 4.012659e-01
## 22:     3G3 2.303829e-06 4.527218e-02 1.999919e-06 2.512142e-02 1.045023e-01
## 23:     3G2 4.098572e-07 2.784436e-02 7.829325e-07 1.914551e-02 8.772617e-02
## 24:     2G1 1.204876e-06 1.075540e-02 8.001951e-08 2.034032e-03 5.150039e-01
##     isolate         5G1b         5G1a         2B2a         2B2b          2B1
##              2B3          7B1          2G2          3G1       1B1a         1B1b
##  1: 3.328139e-02 0.8982690366 2.300141e-02 4.827954e-02 0.43835185 9.823354e-06
##  2: 2.151599e-02 0.8828767580 8.224819e-02 4.303813e-02 0.47338488 2.226052e-05
##  3: 8.369651e-03 0.9004838793 6.708663e-03 7.782511e-02 0.30917615 1.095444e-05
##  4: 6.222115e-02 0.8304645105 2.801958e-05 2.347390e-01 0.01784768 2.983368e-06
##  5: 3.781072e-02 0.8838661809 6.304373e-04 1.201427e-01 0.01110026 1.831070e-08
##  6: 3.023660e-02 0.8784430285 1.119181e-02 3.093482e-03 0.13053338 4.389006e-09
##  7: 4.595397e-01 0.1499801583 2.908298e-04 3.689740e-04 0.15700770 1.291944e-04
##  8:          NaN 0.0045856031 7.631276e-05 1.870543e-05 0.05251756 1.062100e-06
##  9:          NaN 0.0008913283 1.789677e-05 9.299380e-07 0.01838332 7.672293e-09
## 10: 6.999858e-01 0.9130698145 4.123940e-02 1.260305e-01 0.62027468 1.333423e-03
## 11: 1.248720e-02 0.8613940801 1.360635e-04 2.595034e-02 0.17469741 3.135915e-07
## 12: 1.451337e-05 0.8065776966 2.105768e-03 1.122437e-05 0.54056435 8.220078e-09
## 13: 3.583163e-07 0.8406228516 9.514424e-03 7.011489e-05 0.34919691 6.516494e-09
## 14: 1.946488e-04 0.3902193272 1.929646e-07 1.526917e-15 0.88256332 3.307811e-16
## 15: 2.023972e-04 0.3632511313 2.407593e-07 3.909811e-16 0.89408439 1.615780e-16
## 16: 1.132568e-10 0.1178207908 3.198295e-08 2.110445e-32 0.19306291 1.752397e-37
## 17: 3.263498e-03 0.8517820276 5.004186e-06 2.126198e-01 0.02035657 1.580858e-07
## 18: 2.816178e-03 0.8657749828 4.205551e-06 2.362728e-01 0.01516164 2.614123e-08
## 19: 1.721707e-03 0.8709408132 7.778508e-06 1.725212e-01 0.01788889 1.740949e-08
## 20: 9.490161e-03 0.7624457068 4.732093e-09 1.881255e-02 0.00358503 5.540085e-11
## 21: 2.037875e-02 0.9131844053 1.393694e-02 4.233760e-01 0.15102365 1.501987e-04
## 22: 1.268558e-05 0.8508335833 4.193515e-04 1.444594e-02 0.23657703 1.207090e-05
## 23: 7.031341e-07 0.8437367618 1.402457e-04 1.523783e-02 0.19365757 1.248193e-05
## 24: 5.824894e-02 0.8536120297 2.021537e-03 8.198261e-03 0.50010861 5.830760e-07
##              2B3          7B1          2G2          3G1       1B1a         1B1b
##              3B1         4G1d         4G1c         4G1a         4G1b
##  1: 2.700457e-08 5.567167e-02 1.711735e-04 2.720740e-05 7.418636e-01
##  2: 4.763595e-08 7.586571e-03 8.095476e-04 4.299898e-05 5.518726e-01
##  3: 7.890708e-09 1.570348e-01 3.527979e-05 8.855078e-06 8.421781e-01
##  4: 6.955438e-15 1.537433e-02 1.220711e-07 7.892125e-09 3.359317e-01
##  5: 2.309249e-15 2.636622e-02 8.868404e-06 2.079096e-06 4.414266e-01
##  6: 1.183058e-09 4.060696e-01 4.577685e-04 1.372159e-04 5.164940e-01
##  7: 1.051659e-05 1.584179e-04 1.028711e-04 2.973536e-08 3.999615e-03
##  8: 1.228531e-05 1.930001e-05 1.542802e-04 1.496365e-08 5.986744e-04
##  9: 1.921113e-06 2.326366e-06 5.376220e-05 1.717287e-09 1.070901e-04
## 10: 7.323495e-03 4.955758e-03 4.557018e-02 1.631485e-01 3.684512e-01
## 11: 6.941648e-10 2.498987e-01 4.954783e-07 2.131483e-07 7.510267e-01
## 12: 3.179834e-05 1.420964e-01 2.332575e-05 3.772046e-07 3.346734e-01
## 13: 1.581303e-06 2.522402e-02 6.972338e-05 1.839396e-06 4.554932e-01
## 14: 2.014782e-02 1.365603e-09 1.388655e-09 1.296519e-17 1.307099e-04
## 15: 2.428560e-02 2.592389e-10 2.256040e-09 7.687251e-18 6.136205e-05
## 16: 3.854299e-01 5.733489e-19 1.739940e-07 3.134026e-20 1.496520e-09
## 17: 2.986492e-15 2.886577e-02 6.831357e-09 8.364029e-10 5.673147e-01
## 18: 4.386724e-16 1.779161e-02 4.824700e-09 1.062857e-09 6.417208e-01
## 19: 3.175133e-15 3.121915e-02 1.156179e-08 4.117866e-09 6.980579e-01
## 20: 2.370948e-18 2.651610e-03 3.918440e-12 4.395419e-13 2.592651e-01
## 21: 2.425999e-10 5.287815e-02 9.112672e-05 1.762806e-05 7.777173e-01
## 22: 1.988935e-08 2.594306e-01 8.536372e-07 2.393693e-08 6.468409e-01
## 23: 7.124589e-09 1.328350e-01 1.264797e-07 1.888441e-09 6.329953e-01
## 24: 5.032927e-08 1.292797e-02 7.929629e-06 2.176646e-06 6.402667e-01
##              3B1         4G1d         4G1c         4G1a         4G1b
##              1G2           3G3          3G2           2G1
##  1: 6.022979e-07  1.597316e-17 2.353161e-12  2.642402e-17
##  2: 3.972108e-08  6.721966e-19 8.976560e-14  2.502188e-18
##  3: 1.063254e-05  5.146226e-16 9.341964e-11  1.239686e-15
##  4: 1.873697e-03  1.234684e-15 6.216024e-08  4.124491e-12
##  5: 5.244105e-04  1.362566e-15 3.213555e-08  7.457772e-13
##  6: 3.416779e-06  1.629218e-16 1.223661e-10  4.455397e-17
##  7: 1.051495e-05  2.145324e-10 6.819468e-08  1.761543e-10
##  8: 4.098330e-07  1.879515e-11 3.359428e-09  4.323408e-12
##  9: 1.011410e-08  1.835900e-14 4.909911e-11  2.080229e-15
## 10: 2.483215e-03  3.179414e-02 3.705167e-04  1.815656e-02
## 11: 3.398198e-04  5.446664e-13 2.013604e-08  5.883587e-13
## 12: 8.218510e-11  6.096974e-26 7.055411e-18  8.346034e-29
## 13: 5.576471e-11  1.581202e-25 4.571979e-18  9.330995e-28
## 14: 1.277760e-31  2.075305e-75 1.843625e-50  2.492973e-77
## 15: 4.828281e-33  5.001328e-77 3.008564e-52  9.395959e-79
## 16: 3.485755e-56 7.416757e-109 8.855616e-81 1.555432e-115
## 17: 8.997242e-03  3.501785e-14 4.613311e-07  3.234371e-11
## 18: 1.251964e-02  1.680551e-13 9.437576e-07  1.427406e-10
## 19: 1.377691e-02  2.561250e-12 2.061479e-06  6.023616e-10
## 20: 1.319406e-01  1.288165e-09 2.198072e-04  5.541483e-07
## 21: 4.807293e-05  1.907395e-15 8.505714e-10  1.905203e-13
## 22: 2.606714e-06  8.787301e-21 1.623145e-12  2.269247e-20
## 23: 8.729562e-07  1.167094e-23 8.715689e-14  8.780920e-23
## 24: 2.029376e-07  6.011658e-17 7.482682e-13  6.861934e-18
##              1G2           3G3          3G2           2G1
ggplot(PT.spectra.pval.dt,aes(x=isolate,y=i.isolate,fill=-log10(V1)))+ geom_tile() + scale_fill_viridis_c() + labs(title = "Similarity between mutation spectra, PT data",fill ="-log10(P)")

ggplot(MA.spectra.pval.dt,aes(x=isolate,y=i.isolate,fill=-log10(V1)))+ geom_tile() + scale_fill_viridis_c() + labs(title = "Similarity between mutation spectra, MA data",fill ="-log10(P)")

ggplot(PTvsMA.spectra.pval.dt,aes(x=isolate,y=i.isolate,fill=-log10(V1)))+ geom_tile() + scale_fill_viridis_c() + labs(x="PT spectra", y="MA spectra", title = "Similarity between mutation spectra, PT vs MA data",fill ="-log10(P)")

WT=c("2B3","7B1","8B1","1G1")
mutMY=c("1B1a","1B1b","3B1")
mutacc<-data.table(read_xlsx("Table_mutational_spectra.xlsx",range = "B9:Z15",col_names = T))
Patient_all<-data.table(read_xlsx("Table_mutational_spectra.xlsx",range = "B2:Z8",col_names = T))

#I am removing the values of 2B3 and 7B1 in mutacc as they are too few mutations
mutacc$"2B3"=c(NA,NA,NA,NA,NA,NA)
mutacc$"7B1"=c(NA,NA,NA,NA,NA,NA)

colnames(mutacc)[1]<-"SNV"
colnames(Patient_all)[1]<-"SNV"

mutacc[,exp:="MA"]
Patient_all[,exp:="PT"]

final_table<-rbindlist(
  list(
    melt(mutacc,id.vars = c("SNV","exp"),variable.name = "iso", value.name = "count"),
    melt(Patient_all,id.vars = c("SNV","exp"),variable.name = "iso", value.name = "count"))
)




total=c()
for (i in 1:length(row.names(final_table))){
  #print(i)
  total=c(total,sum(final_table[(final_table$iso==final_table$iso[i])&(final_table$exp==final_table$exp[i]),]$count))
}

final_table$tot=total


#Creating the table with all the frequencies:
d<-as.data.frame(final_table[,.(AT_GC=sum(.SD[SNV %in% c("A:T>G:C"),count/tot]), 
               GC_AT=sum(.SD[SNV %in% c("G:C>A:T"),count/tot]), 
               AT_TA=sum(.SD[SNV %in% c("A:T>T:A"),count/tot]), 
               AT_CG=sum(.SD[SNV %in% c("A:T>C:G"),count/tot]), 
               GC_TA=sum(.SD[SNV %in% c("G:C>T:A"),count/tot]), 
               GC_CG=sum(.SD[SNV %in% c("G:C>C:G"),count/tot]),tot=sum(count))
            , by=c("iso","exp")])
d$mut=ifelse(d$iso %in% WT,"WT",ifelse(d$iso %in% mutMY,"mutMY","dnaQ"))
#here I am removing the values of 2B3 and 7B1 isolates in mutacc for the purpose of the figure


#Preparing the different dataset for all the independant frequency comparisons.
AT_GC.P<-d[d$exp=="PT",][,"AT_GC"]
AT_GC.M<-d[d$exp=="MA",][,"AT_GC"]

GC_AT.P<-d[d$exp=="PT",][,"GC_AT"]
GC_AT.M<-d[d$exp=="MA",][,"GC_AT"]

AT_GC.P<-d[d$exp=="PT",][,"AT_GC"]
AT_GC.M<-d[d$exp=="MA",][,"AT_GC"]

AT_TA.P<-d[d$exp=="PT",][,"AT_TA"]
AT_TA.M<-d[d$exp=="MA",][,"AT_TA"]

AT_CG.P<-d[d$exp=="PT",][,"AT_CG"]
AT_CG.M<-d[d$exp=="MA",][,"AT_CG"]

GC_TA.P<-d[d$exp=="PT",][,"GC_TA"]
GC_TA.M<-d[d$exp=="MA",][,"GC_TA"]

Ti.P <-d[d$exp=="PT",][,"AT_GC"]+d[d$exp=="PT",][,"GC_AT"]
Ti.M <-d[d$exp=="MA",][,"AT_GC"]+d[d$exp=="MA",][,"GC_AT"]

Tv.P <-d[d$exp=="PT",][,"AT_TA"]+d[d$exp=="PT",][,"AT_CG"]+d[d$exp=="PT",][,"GC_TA"]+d[d$exp=="PT",][,"GC_CG"]
Tv.M <-d[d$exp=="MA",][,"AT_TA"]+d[d$exp=="MA",][,"AT_CG"]+d[d$exp=="MA",][,"GC_TA"]+d[d$exp=="MA",][,"GC_CG"]


GC.P <-d[d$exp=="PT",][,"GC_AT"]+d[d$exp=="PT",][,"GC_TA"]+d[d$exp=="PT",][,"GC_CG"]
GC.M <-d[d$exp=="MA",][,"GC_AT"]+d[d$exp=="MA",][,"GC_TA"]+d[d$exp=="MA",][,"GC_CG"]




###### Figure 5a: transition vs GC dotplot#####
###############################################

plot_seg=data.frame(AT_GC.P,AT_GC.M,GC_AT.P,GC_AT.M)
d_filt=d[d$iso!="1G1",]

pdf("Figure_5a.pdf",width=6,height = 6,family = "Arial")

ggplot(d_filt,aes(x=GC_AT+GC_TA+GC_CG, y=AT_GC+GC_AT))+
  geom_encircle(expand=0.05,aes(group=mut),fill="grey93",colour="lightgrey")+
  geom_point(aes(fill=exp,size=tot),pch=21,colour="black")+
  geom_segment(data = plot_seg, aes(x=GC.P,xend=GC.M, y=Ti.P, yend=Ti.M),linetype = 3, colour="grey50", size=0.5)+
  labs(x='G:C substitutions', y="Transitions",size="")+
  scale_size(limits=c(15,1500), breaks = c(50,500,1500) ) +
  scale_x_continuous(limits=c(0,1), breaks = seq(0,1,.2), labels = scales::percent_format()) +
  scale_y_continuous(limits=c(0,1), breaks = seq(0,1,.2), labels = scales::percent_format()) +
  theme_classic(base_size = 18) +
  theme(panel.grid.minor = element_blank(),
        legend.position = c(.95, .95),
        legend.justification = c("right", "top"),
        legend.box.just = "right",
        legend.margin = margin(6, 6, 6, 6))+
  scale_fill_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"="navyblue","PT"="red3"),guide=FALSE)



dev.off()
## quartz_off_screen 
##                 2
pdf("Figure_5a_&_5b.pdf",width=9,height = 6,family = "Arial")


Fig5a<-ggplot(d_filt,aes(x=GC_AT+GC_TA+GC_CG, y=AT_GC+GC_AT))+
  geom_encircle(expand=0.05,aes(group=mut),fill="grey93",colour="lightgrey")+
  geom_point(aes(fill=exp,size=tot),pch=21,colour="black")+
  geom_segment(data = plot_seg, aes(x=GC.P,xend=GC.M, y=Ti.P, yend=Ti.M),linetype = 3, colour="grey50", size=0.5)+
  labs(x='G:C substitutions (%)', y="Transitions (%)",size="",tag="a")+
  scale_size(name="SNV",limits=c(15,1500), breaks = c(50,500,1500) ) +
  scale_x_continuous(limits=c(0-0.05,1), breaks = seq(0,1,.2), expand = c(0, 0), labels =function(x)x*100 ) +
  scale_y_continuous(limits=c(0-0.05,1), breaks = seq(0,1,.2), expand = c(0, 0), labels = function(x)x*100) +
  theme_classic(base_size = 18) +
  theme(panel.grid.minor = element_blank(),
        legend.position = c(1, 1),
        legend.justification = c("right", "top"),
        legend.box.just = "right",
        legend.margin = margin(6, 6, 6, 6),
        legend.background = element_blank(),
        plot.margin = unit(c(10,25,10,25), "pt"),
        plot.tag=element_text(face="bold"))+
  scale_fill_manual(name="",labels = c("MA","Patient") ,values = c("MA"="navyblue","PT"="red3"))+
  guides(fill = guide_legend(override.aes = list(size=4)),size=guide_legend(override.aes = list(fill="grey"))) +
  annotate(geom="text",label = "DnaQ WT", x =0.9, y = 0.25,size=4.5, colour = "black")+
  annotate(geom="text",label = "DnaQ E9G", x =0.3, y = 0.9,size=4.5, colour = "black")+
  annotate(geom="text",label = "BER + DnaQ E9G", x =0.45, y = 0.025,size=4.5, colour = "black")



Fig5b<-ggplot(PT.spectra.dt[MA.spectra.dt][,.(isolate, SNV, delta_freq = freq-i.freq)][,.(difference = mean(delta_freq),logP = -log10(wilcox.test(delta_freq)$p.value)), by=SNV],
       aes(x=difference,y=logP,label=SNV,colour=SNV)) +
  geom_point(size=4) + 
  #geom_text_repel(colour="black",segment.color = "black",cex=4.5,box.padding = unit(0.8, "lines"),text.padding=unit(0.5,"lines")) + #,nudge_x=.053
  scale_colour_brewer(type = "qual",palette = "Dark2", guide =F) +
  scale_x_continuous(limits=c(-.15,.15),breaks=c(-0.1,0,0.1) ,labels=c(-10,0,10),expand = c(0, 0)) +
  scale_y_continuous(limits=c(0-(0.05*4),4),expand = c(0, 0))+
  labs(x="PT-MA difference (%)",y="-logP",tag="b") +
  annotate(geom="text",label = "A:T>C:G", x =0.07, y = 0.05,size=4.5, colour = "black")+
  annotate(geom="text",label = "G:C>A:T", x =-0.085, y = 0.2,size=4.5, colour = "black")+
  annotate(geom="text",label = "A:T>T:A", x =0, y = 0.5,size=4.5, colour = "black")+
  annotate(geom="text",label = "G:C>C:G", x =0.01, y = 1.1,size=4.5, colour = "black")+
  annotate(geom="text",label = "A:T>G:C", x =-0.085, y = 1.55,size=4.5, colour = "black")+
  annotate(geom="text",label = "G:C>T:A", x =0.09, y = 3.3,size=4.5, colour = "black")+
  theme_classic(base_size = 18)+
  theme(plot.margin = unit(c(10,25,10,10), "pt"),
        plot.tag=element_text(face="bold"))

hlay <- rbind(c(1,1,2),
              c(1,1,2),
              c(1,1,2),
              c(1,1,2))

grid.arrange(Fig5a,Fig5b,layout_matrix=hlay)
dev.off()
## quartz_off_screen 
##                 2
grid.arrange(Fig5a,Fig5b,layout_matrix=hlay)

#####Figure S9: pairwise comparisons of mutation frequency for each substitution#####
#####################################################################################

pdf("Fig_S9.pdf",width=10,height = 10,family = "Arial")

#1 1-2 AT_GC GC_AT
plot12=data.frame(AT_GC.P,AT_GC.M,GC_AT.P,GC_AT.M)
p12<-ggplot(d,aes(x=AT_GC, y=GC_AT))+ 
  geom_point(aes(shape=exp,colour=mut))+
  geom_segment(data = plot12, aes(x=AT_GC.P,xend=AT_GC.M, y=GC_AT.P, yend=GC_AT.M),linetype = 3, colour="grey50", size=0.5)+
  labs(x="",y="G:C>A:T")+
  scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) + 
  scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
  theme_classic(base_size = 18) + 
  theme(panel.grid.minor = element_blank(),legend.position = "none") +
  scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
  scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))


#2 1-3 AT_GC AT_TA
plot13=data.frame(AT_GC.P,AT_GC.M,AT_TA.P,AT_TA.M)
p13<-ggplot(d, aes(x=AT_GC, y=AT_TA)) + 
  geom_point(aes(shape=exp,colour=mut))+
  geom_segment(data = plot13, aes(x=AT_GC.P,xend=AT_GC.M, y=AT_TA.P, yend=AT_TA.M),linetype = 3, colour="grey50", size=0.5)+
  labs(x="",y="A:T>T:A")+         scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) + 
  scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
  theme_classic(base_size = 18) + 
  theme(panel.grid.minor = element_blank(),legend.position = "none") +
  scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
  scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))

#3 1-4 AT_GC AT_CG
plot2=data.frame(AT_GC.P,AT_GC.M,AT_CG.P,AT_CG.M)
p14<-ggplot(d, aes(x=AT_GC, y=AT_CG)) + 
  geom_point(aes(shape=exp,colour=mut))+
  geom_segment(data = plot2, aes(x=AT_GC.P,xend=AT_GC.M, y=AT_CG.P, yend=AT_CG.M),linetype = 3, colour="grey50", size=0.5)+
  labs(x="",y="A:T>C:G")+         scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) + 
  scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
  theme_classic(base_size = 18) + 
  theme(panel.grid.minor = element_blank(),legend.position = "none") +
  scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
  scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))

#4 1-5 T_GC GC_TA
plot2=data.frame(AT_GC.P,AT_GC.M,GC_TA.P,GC_TA.M)
p15<-ggplot(d, aes(x=AT_GC, y=GC_TA)) + 
  geom_point(aes(shape=exp,colour=mut))+
  geom_segment(data = plot2, aes(x=AT_GC.P,xend=AT_GC.M, y=GC_TA.P, yend=GC_TA.M),linetype = 3, colour="grey50", size=0.5)+
  labs(x="A:T>G:C",y="G:C>T:A")+         scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) + 
  scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
  theme_classic(base_size = 18) + 
  theme(panel.grid.minor = element_blank(),legend.position = "none") +
  scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
  scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))

#5 2-3 GC_AT AT_TA
plot2=data.frame(GC_AT.P,GC_AT.M,AT_TA.P,AT_TA.M)
p23<-ggplot(d, aes(x=GC_AT, y=AT_TA)) + 
  geom_point(aes(shape=exp,colour=mut))+
  geom_segment(data = plot2, aes(x=GC_AT.P,xend=GC_AT.M, y=AT_TA.P, yend=AT_TA.M),linetype = 3, colour="grey50", size=0.5)+
  labs(x="",y="")+         scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) + 
  scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
  theme_classic(base_size = 18) + 
  theme(panel.grid.minor = element_blank(),legend.position = "none") +
  scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
  scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))

#6 2-4 GC_AT AT_CG
plot2=data.frame(GC_AT.P,GC_AT.M,AT_CG.P,AT_CG.M)
p24<-ggplot(d, aes(x=GC_AT, y=AT_CG)) + 
  geom_point(aes(shape=exp,colour=mut))+
  geom_segment(data = plot2, aes(x=GC_AT.P,xend=GC_AT.M, y=AT_CG.P, yend=AT_CG.M),linetype = 3, colour="grey50", size=0.5)+
  labs(x="",y="")+         scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) + 
  scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
  theme_classic(base_size = 18) + 
  theme(panel.grid.minor = element_blank(),legend.position = "none") +
  scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
  scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))

#7 2-5 GC_AT GC_TA
plot2=data.frame(GC_AT.P,GC_AT.M,GC_TA.P,GC_TA.M)
p25<-ggplot(d, aes(x=GC_AT, y=GC_TA)) + 
  geom_point(aes(shape=exp,colour=mut))+
  geom_segment(data = plot2, aes(x=GC_AT.P,xend=GC_AT.M, y=GC_TA.P, yend=GC_TA.M),linetype = 3, colour="grey50", size=0.5)+
  labs(x="G:C>A:T",y="")+         scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) + 
  scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
  theme_classic(base_size = 18) + 
  theme(panel.grid.minor = element_blank(),legend.position = "none") +
  scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
  scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))

#8 3-4 AT_TA AT_CG
plot2=data.frame(AT_TA.P,AT_TA.M,AT_CG.P,AT_CG.M)
p34<-ggplot(d, aes(x=AT_TA, y=AT_CG)) + 
  geom_point(aes(shape=exp,colour=mut))+
  geom_segment(data = plot2, aes(x=AT_TA.P,xend=AT_TA.M, y=AT_CG.P, yend=AT_CG.M),linetype = 3, colour="grey50", size=0.5)+
  labs(x="",y="")+         scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) + 
  scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
  theme_classic(base_size = 18) + 
  theme(panel.grid.minor = element_blank(),legend.position = "none") +
  scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
  scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))

#9 3-5 AT_TA GC_TA
plot2=data.frame(AT_TA.P,AT_TA.M,GC_TA.P,GC_TA.M)
p35<-ggplot(d, aes(x=AT_TA, y=GC_TA)) + 
  geom_point(aes(shape=exp,colour=mut))+
  geom_segment(data = plot2, aes(x=AT_TA.P,xend=AT_TA.M, y=GC_TA.P, yend=GC_TA.M),linetype = 3, colour="grey50", size=0.5)+
  labs(x="A:T>T:A",y="")+         scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) + 
  scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
  theme_classic(base_size = 18) + 
  theme(panel.grid.minor = element_blank(),legend.position = "none") +
  scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
  scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))

#10 4-5 AT_CG GC_TA
plot2=data.frame(AT_CG.P,AT_CG.M,GC_TA.P,GC_TA.M)
p45<-ggplot(d, aes(x=AT_CG, y=GC_TA)) + 
  geom_point(aes(shape=exp,colour=mut))+
  geom_segment(data = plot2, aes(x=AT_CG.P,xend=AT_CG.M, y=GC_TA.P, yend=GC_TA.M),linetype = 3, colour="grey50", size=0.5)+
  labs(x="A:T>C:G",y="")+         
  scale_x_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) + 
  scale_y_continuous(limits=c(0,.65), breaks = seq(0,1,.2), labels = scales::percent_format()) +
  theme_classic(base_size = 18) + 
  theme(panel.grid.minor = element_blank()) +
  scale_color_manual(name="Mutations",labels = c("DnaQ E9G","DnaQ E9G & MutY/MutM","Wild Type"),values = c("dnaQ"="#F8766D","mutMY"="#00BA38","WT"="#619CFF"))+
  scale_shape_manual(name="Data source",labels = c("Mutation accumulation","Patient"),values = c("MA"=16,"PT"=17))

legend <- cowplot::get_legend(p45)
p45<-p45+ theme(legend.position = "none")


hlay <- rbind(c(1,NA,NA,11),
              c(2,5,NA,NA),
              c(3,6,8,NA),
              c(4,7,9,10))

grid.arrange(p12,p13,p14,p15, p23,p24,p25,p34,p35,p45,legend, layout_matrix=hlay)

dev.off()     
## quartz_off_screen 
##                 2
grid.arrange(p12,p13,p14,p15, p23,p24,p25,p34,p35,p45,legend, layout_matrix=hlay)

####Figure 4: Heatmap
#####################

#without wild types
iso_order<-c('3B1','1B1a','1B1b','4G1b','4G1c','4G1d','4G1a','3G2','3G3','2G2','2B2a','2B2b','3G1','2B1','5G1b','5G1a','2G1','1G2','1G1','2B3','7B1','8B1',"6G1a","6G1b")
mutacc<-data.frame(read_xlsx("Table_mutational_spectra.xlsx",range = "B9:Z15",col_names = T))
Patient_all<-data.frame(read_xlsx("Table_mutational_spectra.xlsx",range = "B2:Z8",col_names = T))

#I am removing the values of 2B3 and 7B1 in mutacc as they are too few mutations
mutacc$"X2B3"<-c(NA,NA,NA,NA,NA,NA)
mutacc$"X7B1"<-c(NA,NA,NA,NA,NA,NA)

colnames(mutacc)[1]<-"SNV"
colnames(Patient_all)[1]<-"SNV"






row.names(mutacc)<-mutacc$SNV
mutacc<-mutacc[,2:length(colnames(mutacc))]

#removing the X in the col_names for mutacc
colnames(mutacc)<-str_remove(colnames(mutacc), "X")
row.names(Patient_all)<-Patient_all$SNV
Patient_all<-Patient_all[,2:length(colnames(Patient_all))]



#removing the X in the col_names for Patient_all
colnames(Patient_all)<-str_remove(colnames(Patient_all), "X")

Patient_WT=subset(Patient_all, select = c("8B1","2B3","1G1","7B1") )

Patient_all<-Patient_all[,iso_order]


for (i in 1:length(colnames(Patient_all))){
  mutacc[,i]<-mutacc[,i]/sum(mutacc[,i])
  Patient_all[,i]<-Patient_all[,i]/sum(Patient_all[,i])
}

for (i in 1:length(colnames(Patient_WT))){
  Patient_WT[,i]<-Patient_WT[,i]/sum(Patient_WT[,i])
}

all_data=rbind(Patient_all,mutacc)
sort(colnames(Patient_all))
##  [1] "1B1a" "1B1b" "1G1"  "1G2"  "2B1"  "2B2a" "2B2b" "2B3"  "2G1"  "2G2" 
## [11] "3B1"  "3G1"  "3G2"  "3G3"  "4G1a" "4G1b" "4G1c" "4G1d" "5G1a" "5G1b"
## [21] "6G1a" "6G1b" "7B1"  "8B1"
sort(colnames(mutacc))
##  [1] "1B1a" "1B1b" "1G1"  "1G2"  "2B1"  "2B2a" "2B2b" "2B3"  "2G1"  "2G2" 
## [11] "3B1"  "3G1"  "3G2"  "3G3"  "4G1a" "4G1b" "4G1c" "4G1d" "5G1a" "5G1b"
## [21] "6G1a" "6G1b" "7B1"  "8B1"
iso<-colnames(Patient_all)
Mutations<-c("DnaQ E9G & MutY/MutM","DnaQ E9G & MutY/MutM","DnaQ E9G & MutY/MutM","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G","DnaQ E9G",'WT','WT','WT','WT',"DnaQ E9G","DnaQ E9G")
annotation_c <- as.data.frame(Mutations)
rownames(annotation_c) <- colnames(Patient_all)

Source<-c("Patient","Patient","Patient","Patient","Patient","Patient","Mutation Accumulation","Mutation Accumulation","Mutation Accumulation","Mutation Accumulation","Mutation Accumulation","Mutation Accumulation")



annotation_r <- as.data.frame(Source)
rownames(annotation_r) <- row.names(all_data)



lab=c("A:T>G:C",
      "G:C>A:T",
      "A:T>T:A",
      "A:T>C:G",
      "G:C>T:A",
      "G:C>C:G")

#specifying colors for row/col side
annotation_colors = list(
  Mutations = c("DnaQ E9G"="#F0E442", "DnaQ E9G & MutY/MutM"="#E69F00"),
  Source = c("Patient"="red2", "Mutation Accumulation"="steelblue2"))

annotation_colors = list(
  Mutations = c("DnaQ E9G"="#F0E442", "DnaQ E9G & MutY/MutM"="#E69F00","WT"="lightyellow"),
  Source = c("Patient"="red3", "Mutation Accumulation"="navyblue"))



colnames(Patient_WT)<-str_remove(colnames(Patient_WT), "X")

WT=c("2B3","7B1","8B1","1G1")
mutMY=c("1B1a","1B1b","3B1")
dnaQ=c("4G1b","4G1c","4G1d","4G1a","3G2","3G3","2G2","2B2a","2B2b","3G1","2B1","5G1b","5G1a","2G1","1G2")
row_lab=c("A:T>G:C","G:C>A:T","A:T>T:A","A:T>C:G","G:C>T:A","G:C>C:G","A:T>G:C","G:C>A:T","A:T>T:A","A:T>C:G","G:C>T:A","G:C>C:G")

anno_simple(1:10, height = unit(2, "cm"))
## An AnnotationFunction object
##   function: anno_simple()
##   position: column 
##   items: 10 
##   width: 1npc 
##   height: 2cm 
##   imported variable: pt_size, border, value, color_mapping, gp, pt_gp, pch 
##   subsetable variable: value 
##   this object is subsetable
pdf("Fig_4.pdf",height=4,width=10,family = "Arial")
H1<-Heatmap(as.matrix(
  all_data[WT]),col=circlize::colorRamp2(seq(0, 0.7, length = 100), rev(YlGnBu(100))),na_col="white",cluster_rows = F,row_labels=row_lab,column_names_rot = 66,
  row_split = factor(c(rep("Patient", 6),rep("Mutation Accumulation",6)),levels=c("Patient","Mutation Accumulation")),row_title = NULL,cluster_row_slices=FALSE,
  show_heatmap_legend = FALSE,left_annotation = HeatmapAnnotation(which = "row",group = anno_block(gp = gpar(fill = c("white","white"),col=c(NA,NA))),width = unit(0.75, "cm")),
  top_annotation=HeatmapAnnotation(which = "col",group = anno_block(gp = gpar(fill = c("grey85"))),height = unit(0.75, "cm")))

H2<-Heatmap(as.matrix(
  all_data[dnaQ]),col=circlize::colorRamp2(seq(0, 0.7, length = 100), rev(YlGnBu(100))),na_col="white",cluster_rows = F,row_labels=row_lab,column_names_rot = 66,
  row_split =factor( c(rep("Patient", 6),rep("Mutation Accumulation",6)),levels=c("Patient","Mutation Accumulation")),row_title = NULL,cluster_row_slices=FALSE,show_heatmap_legend = FALSE,
  top_annotation=HeatmapAnnotation(which = "col",group = anno_block(gp = gpar(fill = c("grey85"))),height = unit(0.75, "cm")))

H3<-Heatmap(as.matrix(
  all_data[mutMY]),col=circlize::colorRamp2(seq(0, 0.7, length = 100), rev(YlGnBu(100))),na_col="white",cluster_rows = F,row_labels=row_lab,column_names_rot = 66,
  row_split = factor(c(rep("Patient", 6),rep("Mutation Accumulation",6)),levels=c("Patient","Mutation Accumulation")),row_title = NULL,cluster_row_slices=FALSE, 
  ,heatmap_legend_param =list(col_fun = circlize::colorRamp2(seq(0, 0.65, length = 100), viridis(100)),legend_height = unit(6, "cm"), title = "", border = F, at = c(0,0.1,0.2,0.3,0.4,0.5,0.60,0.7)),
  top_annotation=HeatmapAnnotation(which = "col",group = anno_block(gp = gpar(fill = c("grey85"))),height = unit(0.75, "cm")))
ht_list =H1+H2+H3
ht_list

dev.off()
## quartz_off_screen 
##                 2
ht_list

#figure S8
##########

### Now we wil calculate 95% CI interval for spectra driven by sampling 
### For convenience, we'l create new data table

PT1.spectra.dt<-PT.spectra.dt[PT.spectra.dt[,.(tot=sum(value)),by=isolate],.(SNV,isolate,value,tot)][, `:=`(CI_L=binom.test(value,tot)$conf.int[1],CI_H=binom.test(value,tot)$conf.int[2],estimate=binom.test(value,tot)$estimate),by=c("isolate","SNV")][]
 
MA1.spectra.dt<-rbind(MA.spectra.dt[MA.spectra.dt[,.(tot=sum(value)),by=isolate]][value>0,.(SNV,isolate,value,tot)][, `:=`(CI_L=binom.test(value,tot)$conf.int[1],CI_H=binom.test(value,tot)$conf.int[2],estimate=binom.test(value,tot)$estimate),by=c("isolate","SNV")][],
MA.spectra.dt[MA.spectra.dt[,.(tot=sum(value)),by=isolate]][value==0,.(SNV,isolate,value,tot)][, `:=`(CI_L=0,CI_H=3/tot,estimate=0),by=c("isolate","SNV")][])
 
All.spectra.dt<-rbind(PT1.spectra.dt[,source:="PT"],MA1.spectra.dt[,source:="MA"])

SNV_order<-c("A:T>G:C","G:C>A:T","A:T>T:A","A:T>C:G","G:C>T:A","G:C>C:G")
All.spectra.dt$SNV<-factor(All.spectra.dt$SNV,levels= rev(SNV_order))

pdf("Fig_S8.pdf",width=12,family = "Arial")
ggplot(All.spectra.dt, aes(x = SNV, ,y=estimate, fill = source))+
  geom_hline(yintercept=0.9,color="grey",linetype=3)+
  geom_hline(yintercept=0.6,color="grey",linetype=3)+
  geom_hline(yintercept=0.3,color="grey",linetype=3)+
  geom_hline(yintercept=-0.9,color="grey",linetype=3)+
  geom_hline(yintercept=-0.6,color="grey",linetype=3)+
  geom_hline(yintercept=-0.3,color="grey",linetype=3)+
  facet_wrap(~isolate,ncol = 6)+
  geom_bar(stat='identity',aes(y =estimate),color="black",width = 0.5,position = position_dodge())+
  geom_errorbar(aes(ymin=CI_L, ymax=CI_H), width=.2,position=position_dodge(.5)) +
  coord_flip()+
  ylab("Frequency")+
  annotate("segment", x=-Inf, xend=Inf, y=-Inf, yend=-Inf,lwd=1 )+
  annotate("segment", x=-Inf, xend=-Inf, y=-Inf, yend=Inf,lwd=1 )+
  scale_fill_manual(name="Data source",labels = c("Mutation accumulation","Patient"),
                    values = c("MA"="steelblue","PT"="red3"))+
  theme_classic(base_size = 15) + 
  theme(axis.line = element_blank(), axis.title.y =element_blank(),
        axis.text.x = element_text(size=9),panel.grid.minor = element_blank(),
        strip.background=element_rect( fill=NA, linetype="blank"),
        strip.text = element_text(angle = 0, hjust = 0))+
  scale_y_continuous(limits=c(-.05,.90), breaks = c(0,.30,.60,.90), 
                     labels = c("0%","30%","60%","90%"))
## Warning: Removed 24 rows containing missing values (geom_hline).

## Warning: Removed 24 rows containing missing values (geom_hline).

## Warning: Removed 24 rows containing missing values (geom_hline).
dev.off()
## quartz_off_screen 
##                 2
ggplot(All.spectra.dt, aes(x = SNV, ,y=estimate, fill = source))+
  geom_hline(yintercept=0.9,color="grey",linetype=3)+
  geom_hline(yintercept=0.6,color="grey",linetype=3)+
  geom_hline(yintercept=0.3,color="grey",linetype=3)+
  geom_hline(yintercept=-0.9,color="grey",linetype=3)+
  geom_hline(yintercept=-0.6,color="grey",linetype=3)+
  geom_hline(yintercept=-0.3,color="grey",linetype=3)+
  facet_wrap(~isolate,ncol = 6)+
  geom_bar(stat='identity',aes(y =estimate),color="black",width = 0.5,position = position_dodge())+
  geom_errorbar(aes(ymin=CI_L, ymax=CI_H), width=.2,position=position_dodge(.5)) +
  coord_flip()+
  ylab("Frequency")+
  annotate("segment", x=-Inf, xend=Inf, y=-Inf, yend=-Inf,lwd=1 )+
  annotate("segment", x=-Inf, xend=-Inf, y=-Inf, yend=Inf,lwd=1 )+
  scale_fill_manual(name="Data source",labels = c("Mutation accumulation","Patient"),
                    values = c("MA"="steelblue","PT"="red3"))+
  theme_classic(base_size = 15) + 
  theme(axis.line = element_blank(), axis.title.y =element_blank(),
        axis.text.x = element_text(size=9),panel.grid.minor = element_blank(),
        strip.background=element_rect( fill=NA, linetype="blank"),
        strip.text = element_text(angle = 0, hjust = 0))+
  scale_y_continuous(limits=c(-.05,.90), breaks = c(0,.30,.60,.90), 
                     labels = c("0%","30%","60%","90%"))
## Warning: Removed 24 rows containing missing values (geom_hline).

## Warning: Removed 24 rows containing missing values (geom_hline).

## Warning: Removed 24 rows containing missing values (geom_hline).

#TopGO analysis
###############

geneID2GO_2 <- readMappings(file ="for_topGO_blast2go.tsv")

#Figure S11: Targeted genes
###########################

genesOfInterest <- c('A_02623',
                     'glnQ_3',
                     'A_00662',
                     'glnE',
                     'A_04152',
                     'envZ',
                     'yurK',
                     'trxB_1',
                     'auaH',
                     'dmlR_11',
                     'dapE_2',
                     'tpiA',
                     'A_02548',
                     'pheA',
                     'trmJ',
                     'fhuA_2',
                     'A_03862',
                     'mdeA',
                     'dadA1_2',
                     'A_03735',
                     'epsL',
                     'cdhR_4',
                     'A_02613',
                     'fecR_5',
                     'maeB_1',
                     'bvgS',
                     'A_02878',
                     'A_02494',
                     'A_02127',
                     'A_00709',
                     'A_01818',
                     'rhlE_1',
                     'gltJ',
                     'A_01884',
                     'lptD',
                     'rplS',
                     'glnQ_1',
                     'A_03071',
                     'arnT',
                     'A_02007',
                     'A_00272',
                     'A_04343',
                     'A_01460',
                     'lptG',
                     'crt_6',
                     'recG',
                     'sgaA',
                     'A_00847',
                     'quiA_1',
                     'tyrB',
                     'A_01797',
                     'ydcV_1',
                     'dxr',
                     'fadJ_2',
                     'A_02482',
                     'A_03346',
                     'A_03300',
                     'A_02367',
                     'lvr_2',
                     'A_03171',
                     'A_02540',
                     'fabG_3',
                     'A_02474',
                     'A_02287',
                     'A_02621',
                     'arsC_3',
                     'gcvA_6',
                     'cmpR_2',
                     'A_04570',
                     'A_02140',
                     'suhB_1',
                     'paaK_1',
                     'hscB',
                     'adiA',
                     'amaB',
                     'clpA',
                     'dmlR_5',
                     'dapA_3',
                     'gsiC_11',
                     'A_03192',
                     'dmlR_6',
                     'slyX',
                     'A_03882',
                     'fumC',
                     'trxB_2',
                     'recO',
                     'ttr',
                     'yhaJ_2',
                     'bdlA',
                     'mrgA',
                     'A_00638',
                     'A_00540',
                     'ycaD',
                     'A_03767',
                     'fecR_4',
                     'nagK_1',
                     'A_04054',
                     'A_01061',
                     'bfmBAB',
                     'oppD_5',
                     'ghrA_2',
                     'nudC_2',
                     'A_02270',
                     'A_04083',
                     'frc_2',
                     'tgnC',
                     'hmuU',
                     'A_01131',
                     'ves',
                     'dppD_2',
                     'A_04167',
                     'rng',
                     'A_02777',
                     'wbgU',
                     'ydjA',
                     'grxC',
                     'fbp',
                     'comM',
                     'A_00439',
                     'lpxK',
                     'yhdN',
                     'pgrR_6',
                     'A_00290',
                     'gltB',
                     'A_02764',
                     'kstR2_3',
                     'nagZ',
                     'A_00627',
                     'dppA_3',
                     'A_01006',
                     'queE',
                     'A_03947',
                     'A_03995',
                     'lcfB_2')




geneUniverse <- names(geneID2GO_2) 


geneList <- factor(as.integer(geneUniverse %in% genesOfInterest))
names(geneList) <- geneUniverse
myGOdata_pseudo <- new("topGOdata", description="My project", ontology="BP", allGenes=geneList,  annot = annFUN.gene2GO, 
                       nodeSize = 5,gene2GO = geneID2GO_2)
## 
## Building most specific GOs .....
##  ( 666 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 1575 GO terms and 3369 relations. )
## 
## Annotating nodes ...............
##  ( 2759 genes annotated to the GO terms. )
#Making the 4 analysis

#first Fisher
resultFisher.classic_0.05 <- runTest(myGOdata_pseudo, algorithm = "classic", statistic = "fisher")
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 408 nontrivial nodes
##       parameters: 
##           test statistic: fisher
resultFisher.classic_0.05
## 
## Description: My project 
## Ontology: BP 
## 'classic' algorithm with the 'fisher' test
## 773 GO terms scored: 0 terms with p < 0.01
## Annotation data:
##     Annotated genes: 2759 
##     Significant genes: 83 
##     Min. no. of genes annotated to a GO: 5 
##     Nontrivial nodes: 408
resultFisher.weight01_0.05 <- runTest(myGOdata_pseudo, algorithm = "weight01", statistic = "fisher")
## 
##           -- Weight01 Algorithm -- 
## 
##       the algorithm is scoring 408 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## 
##   Level 13:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 12:  2 nodes to be scored    (0 eliminated genes)
## 
##   Level 11:  7 nodes to be scored    (10 eliminated genes)
## 
##   Level 10:  24 nodes to be scored   (18 eliminated genes)
## 
##   Level 9:   41 nodes to be scored   (436 eliminated genes)
## 
##   Level 8:   46 nodes to be scored   (595 eliminated genes)
## 
##   Level 7:   50 nodes to be scored   (856 eliminated genes)
## 
##   Level 6:   68 nodes to be scored   (1247 eliminated genes)
## 
##   Level 5:   82 nodes to be scored   (1530 eliminated genes)
## 
##   Level 4:   51 nodes to be scored   (1788 eliminated genes)
## 
##   Level 3:   27 nodes to be scored   (2245 eliminated genes)
## 
##   Level 2:   8 nodes to be scored    (2310 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (2730 eliminated genes)
resultFisher.weight01_0.05
## 
## Description: My project 
## Ontology: BP 
## 'weight01' algorithm with the 'fisher' test
## 773 GO terms scored: 0 terms with p < 0.01
## Annotation data:
##     Annotated genes: 2759 
##     Significant genes: 83 
##     Min. no. of genes annotated to a GO: 5 
##     Nontrivial nodes: 408
resultFisher.weight_0.05 <- runTest(myGOdata_pseudo, algorithm = "weight", statistic = "fisher")
## 
##           -- Weight Algorithm -- 
## 
##       The algorithm is scoring 408 nontrivial nodes
##       parameters: 
##           test statistic: fisher : ratio
## 
##   Level 13:  1 nodes to be scored.
## 
##   Level 12:  2 nodes to be scored.
## 
##   Level 11:  7 nodes to be scored.
## 
##   Level 10:  24 nodes to be scored.
## 
##   Level 9:   41 nodes to be scored.
## 
##   Level 8:   46 nodes to be scored.
## 
##   Level 7:   50 nodes to be scored.
## 
##   Level 6:   68 nodes to be scored.
## 
##   Level 5:   82 nodes to be scored.
## 
##   Level 4:   51 nodes to be scored.
## 
##   Level 3:   27 nodes to be scored.
## 
##   Level 2:   8 nodes to be scored.
## 
##   Level 1:   1 nodes to be scored.
resultFisher.weight_0.05
## 
## Description: My project 
## Ontology: BP 
## 'weight' algorithm with the 'fisher : ratio' test
## 773 GO terms scored: 0 terms with p < 0.01
## Annotation data:
##     Annotated genes: 2759 
##     Significant genes: 83 
##     Min. no. of genes annotated to a GO: 5 
##     Nontrivial nodes: 408
resultFisher.elim_0.05 <- runTest(myGOdata_pseudo, algorithm = "elim", statistic = "fisher")
## 
##           -- Elim Algorithm -- 
## 
##       the algorithm is scoring 408 nontrivial nodes
##       parameters: 
##           test statistic: fisher
##           cutOff: 0.01
## 
##   Level 13:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 12:  2 nodes to be scored    (0 eliminated genes)
## 
##   Level 11:  7 nodes to be scored    (0 eliminated genes)
## 
##   Level 10:  24 nodes to be scored   (0 eliminated genes)
## 
##   Level 9:   41 nodes to be scored   (0 eliminated genes)
## 
##   Level 8:   46 nodes to be scored   (0 eliminated genes)
## 
##   Level 7:   50 nodes to be scored   (0 eliminated genes)
## 
##   Level 6:   68 nodes to be scored   (0 eliminated genes)
## 
##   Level 5:   82 nodes to be scored   (0 eliminated genes)
## 
##   Level 4:   51 nodes to be scored   (0 eliminated genes)
## 
##   Level 3:   27 nodes to be scored   (0 eliminated genes)
## 
##   Level 2:   8 nodes to be scored    (0 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (0 eliminated genes)
resultFisher.elim_0.05
## 
## Description: My project 
## Ontology: BP 
## 'elim' algorithm with the 'fisher : 0.01' test
## 773 GO terms scored: 0 terms with p < 0.01
## Annotation data:
##     Annotated genes: 2759 
##     Significant genes: 83 
##     Min. no. of genes annotated to a GO: 5 
##     Nontrivial nodes: 408
#Creating a table output with the 100 best nodes
allRes_pseudo <- GenTable(myGOdata_pseudo, 
                          classicFisher = resultFisher.classic_0.05,
                          elimFisher=resultFisher.elim_0.05,
                          weigthFisher=resultFisher.weight_0.05,
                          weigth01Fisher=resultFisher.weight01_0.05,
                          orderBy = "weigth01Fisher", 
                          topNodes = 100)

goEnrichmentFisher.weight01_0.05 <- GenTable(myGOdata_pseudo, pValue=resultFisher.weight01_0.05, orderBy="pValue", topNodes=100)
goEnrichmentFisher.weight01_0.05 <- goEnrichmentFisher.weight01_0.05[goEnrichmentFisher.weight01_0.05$pValue<0.05,]
goEnrichmentFisher.weight01_0.05 <- goEnrichmentFisher.weight01_0.05[,c("GO.ID","Term","pValue","Annotated","Significant","Expected")]
goEnrichmentFisher.weight01_0.05$Term <- gsub(" [a-z]*\\.\\.\\.$", "", goEnrichmentFisher.weight01_0.05$Term)
goEnrichmentFisher.weight01_0.05$Term <- gsub("\\.\\.\\.$", "", goEnrichmentFisher.weight01_0.05$Term)
goEnrichmentFisher.weight01_0.05$Term <- paste(goEnrichmentFisher.weight01_0.05$GO.ID, goEnrichmentFisher.weight01_0.05$Term, sep=", ")
goEnrichmentFisher.weight01_0.05$Term <- factor(goEnrichmentFisher.weight01_0.05$Term, levels=rev(goEnrichmentFisher.weight01_0.05$Term))
goEnrichmentFisher.weight01_0.05$pValue <- as.numeric(goEnrichmentFisher.weight01_0.05$pValue)
goEnrichmentFisher.weight01_0.05$test="KS_elim_0.01"

p3<-ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
  geom_col(aes(fill=pValue))+
  scale_fill_gradient(low="blue", high="red",limits=c(0,0.05))+
  xlab("Biological process") +
  ggtitle("Fisher.weight01_0.05")+
  coord_flip()



pdf("Fig_S11.pdf",width=9,family = "Arial")
ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
  geom_col(aes(fill=pValue))+
  scale_fill_gradient(low="blue3", high="red3",limits=c(0,0.05))+
  xlab("Biological process") +
  labs(y=expression("log"[2]("enrichment")))+
  theme_classic(base_size = 15)+
  coord_flip()
dev.off()
## quartz_off_screen 
##                 2
ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
  geom_col(aes(fill=pValue))+
  scale_fill_gradient(low="blue3", high="red3",limits=c(0,0.05))+
  xlab("Biological process") +
  labs(y=expression("log"[2]("enrichment")))+
  theme_classic(base_size = 15)+
  coord_flip()

#Fig S12: pseudogenized genes
#############################

genesOfInterest <- c('A_00158',
                     'A_00177',
                     'A_00227',
                     'A_00238',
                     'A_00328',
                     'A_00381',
                     'A_00481',
                     'A_00501',
                     'A_00559',
                     'A_00608',
                     'A_00627',
                     'A_00662',
                     'A_00678',
                     'A_00725',
                     'A_00728',
                     'A_00729',
                     'A_00816',
                     'A_00821',
                     'A_00895',
                     'A_01028',
                     'A_01045',
                     'A_01067',
                     'A_01122',
                     'A_01140',
                     'A_01169',
                     'A_01183',
                     'A_01370',
                     'A_01449',
                     'A_01456',
                     'A_01587',
                     'A_01623',
                     'A_01660',
                     'A_02007',
                     'A_02008',
                     'A_02046',
                     'A_02072',
                     'A_02079',
                     'A_02091',
                     'A_02105',
                     'A_02114',
                     'A_02117',
                     'A_02182',
                     'A_02209',
                     'A_02377',
                     'A_02434',
                     'A_02518',
                     'A_02623',
                     'A_02632',
                     'A_02635',
                     'A_02680',
                     'A_02712',
                     'A_02718',
                     'A_02825',
                     'A_02951',
                     'A_03043',
                     'A_03112',
                     'A_03192',
                     'A_03332',
                     'A_03363',
                     'A_03635',
                     'A_03650',
                     'A_03668',
                     'A_03669',
                     'A_03693',
                     'A_03720',
                     'A_03736',
                     'A_03766',
                     'A_03821',
                     'A_03847',
                     'A_03878',
                     'A_03921',
                     'A_03922',
                     'A_04091',
                     'A_04122',
                     'A_04167',
                     'A_04230',
                     'A_04234',
                     'A_04236',
                     'A_04246',
                     'A_04260',
                     'A_04291',
                     'A_04316',
                     'acoR_2',
                     'actP_1',
                     'actP_2',
                     'aidB',
                     'amnE',
                     'ampD',
                     'appA',
                     'apt',
                     'araC',
                     'artM_1',
                     'asnB_1',
                     'bauA_2',
                     'bcsA',
                     'bdlA',
                     'braC_10',
                     'braC_9',
                     'btuD_9',
                     'caiA_2',
                     'cbpA',
                     'ccoN1_2',
                     'cdhR_4',
                     'cheA',
                     'cheB_2',
                     'clsB_2',
                     'cmpR_6',
                     'cynR_8',
                     'dac',
                     'dadA1_1',
                     'dadA1_2',
                     'dapE_1',
                     'dapE_2',
                     'dctM_4',
                     'dmlR_1',
                     'dmsA',
                     'dnaE2',
                     'ectB',
                     'egtB',
                     'eno_1',
                     'ettA',
                     'fabG2',
                     'fadD_1',
                     'fhuA_5',
                     'flhA_2',
                     'fumB_1',
                     'fur_2',
                     'ggt',
                     'glnE',
                     'glnH_2',
                     'glnQ_3',
                     'glpE_1',
                     'gltB',
                     'gltC_7',
                     'gsiA_4',
                     'gsiB_2',
                     'gsiB_4',
                     'hisC2',
                     'hsrA_1',
                     'hsrA_2',
                     'htrE',
                     'hyuA',
                     'iaaA',
                     'intA_3',
                     'kdpB_2',
                     'kefC_2',
                     'kpsM_2',
                     'lcfB_2',
                     'leuD1_2',
                     'lon',
                     'lon2',
                     'lptB_6',
                     'ltaE',
                     'mexB',
                     'mfd',
                     'mhbM',
                     'mshA_4',
                     'mshD',
                     'murA_2',
                     'mutS2',
                     'nagK_1',
                     'nhaP2_2',
                     'ntaA_2',
                     'pbpC',
                     'pcaB',
                     'pgi_1',
                     'phrB',
                     'pitA',
                     'plcN_2',
                     'pncA',
                     'ppc',
                     'queC',
                     'rfnT_2',
                     'rhlE_2',
                     'rhtB_2',
                     'rnr',
                     'rplY',
                     'sasA_4',
                     'sasA_8',
                     'secD_2',
                     'selO',
                     'smf-1_1',
                     'surE',
                     'tag',
                     'tauD',
                     'tcyC_2',
                     'tdh_1',
                     'tdh_2',
                     'thadh',
                     'tolQ_4',
                     'trxB_1',
                     'ttr',
                     'wecH',
                     'ydcV_1',
                     'yhaJ_2',
                     'trkH',
                     'A_02961',
                     'steT',
                     'A_01369',
                     'adoK',
                     'rsmB_2')



geneUniverse <- names(geneID2GO_2) 


geneList <- factor(as.integer(geneUniverse %in% genesOfInterest))
names(geneList) <- geneUniverse
myGOdata_pseudo <- new("topGOdata", description="My project", ontology="BP", allGenes=geneList,  annot = annFUN.gene2GO, 
                       nodeSize = 5,gene2GO = geneID2GO_2)
## 
## Building most specific GOs .....
##  ( 666 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 1575 GO terms and 3369 relations. )
## 
## Annotating nodes ...............
##  ( 2759 genes annotated to the GO terms. )
#Making the 4 analysis

#first Fisher
resultFisher.classic_0.05 <- runTest(myGOdata_pseudo, algorithm = "classic", statistic = "fisher")
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 433 nontrivial nodes
##       parameters: 
##           test statistic: fisher
resultFisher.classic_0.05
## 
## Description: My project 
## Ontology: BP 
## 'classic' algorithm with the 'fisher' test
## 773 GO terms scored: 2 terms with p < 0.01
## Annotation data:
##     Annotated genes: 2759 
##     Significant genes: 125 
##     Min. no. of genes annotated to a GO: 5 
##     Nontrivial nodes: 433
resultFisher.weight01_0.05 <- runTest(myGOdata_pseudo, algorithm = "weight01", statistic = "fisher")
## 
##           -- Weight01 Algorithm -- 
## 
##       the algorithm is scoring 433 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## 
##   Level 13:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 12:  5 nodes to be scored    (0 eliminated genes)
## 
##   Level 11:  11 nodes to be scored   (10 eliminated genes)
## 
##   Level 10:  32 nodes to be scored   (48 eliminated genes)
## 
##   Level 9:   53 nodes to be scored   (469 eliminated genes)
## 
##   Level 8:   56 nodes to be scored   (679 eliminated genes)
## 
##   Level 7:   51 nodes to be scored   (932 eliminated genes)
## 
##   Level 6:   62 nodes to be scored   (1319 eliminated genes)
## 
##   Level 5:   83 nodes to be scored   (1575 eliminated genes)
## 
##   Level 4:   44 nodes to be scored   (1775 eliminated genes)
## 
##   Level 3:   24 nodes to be scored   (2226 eliminated genes)
## 
##   Level 2:   10 nodes to be scored   (2306 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (2716 eliminated genes)
resultFisher.weight01_0.05
## 
## Description: My project 
## Ontology: BP 
## 'weight01' algorithm with the 'fisher' test
## 773 GO terms scored: 1 terms with p < 0.01
## Annotation data:
##     Annotated genes: 2759 
##     Significant genes: 125 
##     Min. no. of genes annotated to a GO: 5 
##     Nontrivial nodes: 433
resultFisher.weight_0.05 <- runTest(myGOdata_pseudo, algorithm = "weight", statistic = "fisher")
## 
##           -- Weight Algorithm -- 
## 
##       The algorithm is scoring 433 nontrivial nodes
##       parameters: 
##           test statistic: fisher : ratio
## 
##   Level 13:  1 nodes to be scored.
## 
##   Level 12:  5 nodes to be scored.
## 
##   Level 11:  11 nodes to be scored.
## 
##   Level 10:  32 nodes to be scored.
## 
##   Level 9:   53 nodes to be scored.
## 
##   Level 8:   56 nodes to be scored.
## 
##   Level 7:   51 nodes to be scored.
## 
##   Level 6:   62 nodes to be scored.
## 
##   Level 5:   83 nodes to be scored.
## 
##   Level 4:   44 nodes to be scored.
## 
##   Level 3:   24 nodes to be scored.
## 
##   Level 2:   10 nodes to be scored.
## 
##   Level 1:   1 nodes to be scored.
resultFisher.weight_0.05
## 
## Description: My project 
## Ontology: BP 
## 'weight' algorithm with the 'fisher : ratio' test
## 773 GO terms scored: 1 terms with p < 0.01
## Annotation data:
##     Annotated genes: 2759 
##     Significant genes: 125 
##     Min. no. of genes annotated to a GO: 5 
##     Nontrivial nodes: 433
resultFisher.elim_0.05 <- runTest(myGOdata_pseudo, algorithm = "elim", statistic = "fisher")
## 
##           -- Elim Algorithm -- 
## 
##       the algorithm is scoring 433 nontrivial nodes
##       parameters: 
##           test statistic: fisher
##           cutOff: 0.01
## 
##   Level 13:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 12:  5 nodes to be scored    (0 eliminated genes)
## 
##   Level 11:  11 nodes to be scored   (0 eliminated genes)
## 
##   Level 10:  32 nodes to be scored   (0 eliminated genes)
## 
##   Level 9:   53 nodes to be scored   (8 eliminated genes)
## 
##   Level 8:   56 nodes to be scored   (8 eliminated genes)
## 
##   Level 7:   51 nodes to be scored   (8 eliminated genes)
## 
##   Level 6:   62 nodes to be scored   (8 eliminated genes)
## 
##   Level 5:   83 nodes to be scored   (8 eliminated genes)
## 
##   Level 4:   44 nodes to be scored   (8 eliminated genes)
## 
##   Level 3:   24 nodes to be scored   (8 eliminated genes)
## 
##   Level 2:   10 nodes to be scored   (8 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (8 eliminated genes)
resultFisher.elim_0.05
## 
## Description: My project 
## Ontology: BP 
## 'elim' algorithm with the 'fisher : 0.01' test
## 773 GO terms scored: 1 terms with p < 0.01
## Annotation data:
##     Annotated genes: 2759 
##     Significant genes: 125 
##     Min. no. of genes annotated to a GO: 5 
##     Nontrivial nodes: 433
#Creating a table output with the 100 best nodes
allRes_pseudo <- GenTable(myGOdata_pseudo, 
                          classicFisher = resultFisher.classic_0.05,
                          elimFisher=resultFisher.elim_0.05,
                          weigthFisher=resultFisher.weight_0.05,
                          weigth01Fisher=resultFisher.weight01_0.05,
                          orderBy = "weigth01Fisher", 
                          topNodes = 100)

goEnrichmentFisher.weight01_0.05 <- GenTable(myGOdata_pseudo, pValue=resultFisher.weight01_0.05, orderBy="pValue", topNodes=100)
goEnrichmentFisher.weight01_0.05 <- goEnrichmentFisher.weight01_0.05[goEnrichmentFisher.weight01_0.05$pValue<0.05,]
goEnrichmentFisher.weight01_0.05 <- goEnrichmentFisher.weight01_0.05[,c("GO.ID","Term","pValue","Annotated","Significant","Expected")]
goEnrichmentFisher.weight01_0.05$Term <- gsub(" [a-z]*\\.\\.\\.$", "", goEnrichmentFisher.weight01_0.05$Term)
goEnrichmentFisher.weight01_0.05$Term <- gsub("\\.\\.\\.$", "", goEnrichmentFisher.weight01_0.05$Term)
goEnrichmentFisher.weight01_0.05$Term <- paste(goEnrichmentFisher.weight01_0.05$GO.ID, goEnrichmentFisher.weight01_0.05$Term, sep=", ")
goEnrichmentFisher.weight01_0.05$Term <- factor(goEnrichmentFisher.weight01_0.05$Term, levels=rev(goEnrichmentFisher.weight01_0.05$Term))
goEnrichmentFisher.weight01_0.05$pValue <- as.numeric(goEnrichmentFisher.weight01_0.05$pValue)
goEnrichmentFisher.weight01_0.05$test="KS_elim_0.01"

p3<-ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
  geom_col(aes(fill=pValue))+
  scale_fill_gradient(low="blue", high="red",limits=c(0,0.05))+
  xlab("Biological process") +
  ggtitle("Fisher.weight01_0.05")+
  coord_flip()



pdf("Fig_S12.pdf",width=9,family = "Arial")
ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
  geom_col(aes(fill=pValue))+
  scale_fill_gradient(low="blue3", high="red3",limits=c(0,0.05))+
  xlab("Biological process") +
  labs(y=expression("log"[2]("enrichment")))+
  theme_classic(base_size = 15)+
  coord_flip()
dev.off()
## quartz_off_screen 
##                 2
ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
  geom_col(aes(fill=pValue))+
  scale_fill_gradient(low="blue3", high="red3",limits=c(0,0.05))+
  xlab("Biological process") +
  labs(y=expression("log"[2]("enrichment")))+
  theme_classic(base_size = 15)+
  coord_flip()

#Fig S7: 25 exo- genes
######################

genesOfInterest <- c('tdhA',
'nhaP2_1',
'cheD',
'A_00560',
'dnaQ',
'envZ',
'gstB_2',
'A_01313',
'sucA',
'fliF',
'ppsR',
'A_02012',
'A_02072',
'icd_1',
'yhhQ',
'gsiD_5',
'ygaZ',
'A_03098',
'A_03291',
'A_03455',
'kefC_3',
'A_03713',
'ptsJ',
'A_04082',
'lpxL_2')



geneUniverse <- names(geneID2GO_2) 


geneList <- factor(as.integer(geneUniverse %in% genesOfInterest))
names(geneList) <- geneUniverse
myGOdata_pseudo <- new("topGOdata", description="My project", ontology="BP", allGenes=geneList,  annot = annFUN.gene2GO, 
                       nodeSize = 5,gene2GO = geneID2GO_2)
## 
## Building most specific GOs .....
##  ( 666 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 1575 GO terms and 3369 relations. )
## 
## Annotating nodes ...............
##  ( 2759 genes annotated to the GO terms. )
#Making the 4 analysis

#first Fisher
resultFisher.classic_0.05 <- runTest(myGOdata_pseudo, algorithm = "classic", statistic = "fisher")
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 137 nontrivial nodes
##       parameters: 
##           test statistic: fisher
resultFisher.classic_0.05
## 
## Description: My project 
## Ontology: BP 
## 'classic' algorithm with the 'fisher' test
## 773 GO terms scored: 4 terms with p < 0.01
## Annotation data:
##     Annotated genes: 2759 
##     Significant genes: 16 
##     Min. no. of genes annotated to a GO: 5 
##     Nontrivial nodes: 137
resultFisher.weight01_0.05 <- runTest(myGOdata_pseudo, algorithm = "weight01", statistic = "fisher")
## 
##           -- Weight01 Algorithm -- 
## 
##       the algorithm is scoring 137 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## 
##   Level 11:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 10:  4 nodes to be scored    (0 eliminated genes)
## 
##   Level 9:   8 nodes to be scored    (372 eliminated genes)
## 
##   Level 8:   11 nodes to be scored   (413 eliminated genes)
## 
##   Level 7:   15 nodes to be scored   (473 eliminated genes)
## 
##   Level 6:   19 nodes to be scored   (556 eliminated genes)
## 
##   Level 5:   33 nodes to be scored   (975 eliminated genes)
## 
##   Level 4:   23 nodes to be scored   (1403 eliminated genes)
## 
##   Level 3:   15 nodes to be scored   (2138 eliminated genes)
## 
##   Level 2:   7 nodes to be scored    (2250 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (2655 eliminated genes)
resultFisher.weight01_0.05
## 
## Description: My project 
## Ontology: BP 
## 'weight01' algorithm with the 'fisher' test
## 773 GO terms scored: 1 terms with p < 0.01
## Annotation data:
##     Annotated genes: 2759 
##     Significant genes: 16 
##     Min. no. of genes annotated to a GO: 5 
##     Nontrivial nodes: 137
resultFisher.weight_0.05 <- runTest(myGOdata_pseudo, algorithm = "weight", statistic = "fisher")
## 
##           -- Weight Algorithm -- 
## 
##       The algorithm is scoring 137 nontrivial nodes
##       parameters: 
##           test statistic: fisher : ratio
## 
##   Level 11:  1 nodes to be scored.
## 
##   Level 10:  4 nodes to be scored.
## 
##   Level 9:   8 nodes to be scored.
## 
##   Level 8:   11 nodes to be scored.
## 
##   Level 7:   15 nodes to be scored.
## 
##   Level 6:   19 nodes to be scored.
## 
##   Level 5:   33 nodes to be scored.
## 
##   Level 4:   23 nodes to be scored.
## 
##   Level 3:   15 nodes to be scored.
## 
##   Level 2:   7 nodes to be scored.
## 
##   Level 1:   1 nodes to be scored.
resultFisher.weight_0.05
## 
## Description: My project 
## Ontology: BP 
## 'weight' algorithm with the 'fisher : ratio' test
## 773 GO terms scored: 1 terms with p < 0.01
## Annotation data:
##     Annotated genes: 2759 
##     Significant genes: 16 
##     Min. no. of genes annotated to a GO: 5 
##     Nontrivial nodes: 137
resultFisher.elim_0.05 <- runTest(myGOdata_pseudo, algorithm = "elim", statistic = "fisher")
## 
##           -- Elim Algorithm -- 
## 
##       the algorithm is scoring 137 nontrivial nodes
##       parameters: 
##           test statistic: fisher
##           cutOff: 0.01
## 
##   Level 11:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 10:  4 nodes to be scored    (0 eliminated genes)
## 
##   Level 9:   8 nodes to be scored    (0 eliminated genes)
## 
##   Level 8:   11 nodes to be scored   (17 eliminated genes)
## 
##   Level 7:   15 nodes to be scored   (17 eliminated genes)
## 
##   Level 6:   19 nodes to be scored   (17 eliminated genes)
## 
##   Level 5:   33 nodes to be scored   (17 eliminated genes)
## 
##   Level 4:   23 nodes to be scored   (17 eliminated genes)
## 
##   Level 3:   15 nodes to be scored   (17 eliminated genes)
## 
##   Level 2:   7 nodes to be scored    (17 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (17 eliminated genes)
resultFisher.elim_0.05
## 
## Description: My project 
## Ontology: BP 
## 'elim' algorithm with the 'fisher : 0.01' test
## 773 GO terms scored: 1 terms with p < 0.01
## Annotation data:
##     Annotated genes: 2759 
##     Significant genes: 16 
##     Min. no. of genes annotated to a GO: 5 
##     Nontrivial nodes: 137
#Creating a table output with the 100 best nodes
allRes_pseudo <- GenTable(myGOdata_pseudo, 
                          classicFisher = resultFisher.classic_0.05,
                          elimFisher=resultFisher.elim_0.05,
                          weigthFisher=resultFisher.weight_0.05,
                          weigth01Fisher=resultFisher.weight01_0.05,
                          orderBy = "weigth01Fisher", 
                          topNodes = 100)

goEnrichmentFisher.weight01_0.05 <- GenTable(myGOdata_pseudo, pValue=resultFisher.weight01_0.05, orderBy="pValue", topNodes=100)
goEnrichmentFisher.weight01_0.05 <- goEnrichmentFisher.weight01_0.05[goEnrichmentFisher.weight01_0.05$pValue<0.05,]
goEnrichmentFisher.weight01_0.05 <- goEnrichmentFisher.weight01_0.05[,c("GO.ID","Term","pValue","Annotated","Significant","Expected")]
goEnrichmentFisher.weight01_0.05$Term <- gsub(" [a-z]*\\.\\.\\.$", "", goEnrichmentFisher.weight01_0.05$Term)
goEnrichmentFisher.weight01_0.05$Term <- gsub("\\.\\.\\.$", "", goEnrichmentFisher.weight01_0.05$Term)
goEnrichmentFisher.weight01_0.05$Term <- paste(goEnrichmentFisher.weight01_0.05$GO.ID, goEnrichmentFisher.weight01_0.05$Term, sep=", ")
goEnrichmentFisher.weight01_0.05$Term <- factor(goEnrichmentFisher.weight01_0.05$Term, levels=rev(goEnrichmentFisher.weight01_0.05$Term))
goEnrichmentFisher.weight01_0.05$pValue <- as.numeric(goEnrichmentFisher.weight01_0.05$pValue)
goEnrichmentFisher.weight01_0.05$test="KS_elim_0.01"

p3<-ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
  geom_col(aes(fill=pValue))+
  scale_fill_gradient(low="blue", high="red",limits=c(0,0.05))+
  xlab("Biological process") +
  ggtitle("Fisher.weight01_0.05")+
  coord_flip()


pdf("Fig_S7.pdf",width=9,family = "Arial")
ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
  geom_col(aes(fill=pValue))+
  scale_fill_gradient(low="blue3", high="red3",limits=c(0,0.05))+
  xlab("Biological process") +
  labs(y=expression("log"[2]("enrichment")))+
  theme_classic(base_size = 15)+
  coord_flip()
dev.off()
## quartz_off_screen 
##                 2
ggplot(goEnrichmentFisher.weight01_0.05, aes(x=Term, y=log2(Significant/Expected))) +
  geom_col(aes(fill=pValue))+
  scale_fill_gradient(low="blue3", high="red3",limits=c(0,0.05))+
  xlab("Biological process") +
  labs(y=expression("log"[2]("enrichment")))+
  theme_classic(base_size = 15)+
  coord_flip()

tree.WT<-root(read.tree("RAxML_bipartitions.WT"),"ancestor_A1")
tree.dnaQ<-root(read.tree("RAxML_bipartitions.dnaQ"),"ancestor_A2")

tree.bound<-bind.tree(tree.WT, tree.dnaQ, 4)
tree.bound.clean<-drop.tip(tree.bound, "ancestor_A2")
WT=c("2B3","1G1","8B1","7B1")
WT_edge=which.edge(tree.bound.clean, WT)
sep=max(WT_edge)+1

edge_color=rep("black",length(tree.bound.clean$edge))
edge_color[WT_edge]="red"

edge_type=rep(1,length(tree.bound.clean$edge))
edge_type[sep]=2
i=5
tree.bound.clean.modif<-tree.bound.clean
tree.bound.clean.modif$edge.length[sep]<-7.5e-05
tree.bound.clean.modif$edge.length[WT_edge]<-5*tree.bound.clean.modif$edge.length[WT_edge]


postscript("Fig_2.pdf")
plot(tree.bound.clean.modif,type="u",edge.color=edge_color, edge.lty=edge_type,show.tip.labe=T,edge.width=3,rotate.tree=250)

add.scale.bar(0,4e-05,lwd=3,col=F)
add.scale.bar(0,2e-05,lcol="red",lwd=3,col=F)

dev.off()
## quartz_off_screen 
##                 2
plot(tree.bound.clean.modif,type="u",edge.color=edge_color, edge.lty=edge_type,show.tip.labe=T,edge.width=3,rotate.tree=250)

add.scale.bar(0,4e-05,lwd=3,col=F)
add.scale.bar(0,2e-05,lcol="red",lwd=3,col=F)

#Figure S2a
###########

tree.allBH<-root(read.tree("RAxML_bipartitions.tree_allBH"),"Bhinzii_14-3425")
pdf("Fig_S2a.pdf",family = "Arial")
plot(tree.allBH)
dev.off()
## quartz_off_screen 
##                 2
plot(tree.allBH)

#figure S1
##########

#Getting the data of the different growth runs

GC.BH1<-data.frame(read_xlsx("growth_curve_all.xlsx",range = "AB1:AW50",sheet="20191220_48w",col_names = T))

GC.BH2<-data.frame(read_xlsx("growth_curve_all.xlsx",range = "AB1:AW50",sheet="20200306_48w",col_names = T))

GC.BH3<-data.frame(read_xlsx("growth_curve_all.xlsx",range = "AB1:AV50",sheet="20210209_48w",col_names = T))

GC.BH4<-data.frame(read_xlsx("growth_curve_all.xlsx",range = "AB1:AU50",sheet="20210217_48w",col_names = T))

GC.BH5<-data.frame(read_xlsx("growth_curve_all.xlsx",range = "AB1:AQ50",sheet="20210225_48w",col_names = T))

GC.BH6<-data.frame(read_xlsx("growth_curve_all.xlsx",range = "AB1:AU50",sheet="20210305_48w",col_names = T))

GC.BH7<-data.frame(read_xlsx("growth_curve_all.xlsx",range = "W1:AM50",sheet="20210315_48w",col_names = T))


## create a data.frame for ggplot2
p2=as.data.frame(matrix(nrow=48*22, ncol=4))
colnames(p2)=c("Time", "Isolate", "run1", "run2")
p2[,1]=rep(0:47, n=22)
I22=c("1B1a", "1B1b", "1G1", "1G2", "2B1", "2B2a", "2B2b", "2B3", "2G1", "2G2", "3B1", "3G1", "3G2", "3G3", "4G1a", "4G1b", "4G1c", "4G1d", "5G1a", "5G1b", "7B1", "8B1")
p2[,2]=rep(I22, each=48)


null_2_NA<- function(x){
  if (is.null(x)){
    return(NA)
  }
  else{
    return(x)
  }
}

#getting all the data of the runs, and using NA when the run doesn't contain one particular isolate

all_run1=c()
all_run2=c()
all_run3=c()
all_run4=c()
all_run5=c()
all_run6=c()
all_run7=c()
for (i in 1:length(I22)){
  #print("ok")
  #print (I22[i])
  #GC.BH1[1,I22[i]]
  #print(GC.BH1$I22[i])
  for (j in 1:48){
    #print(i)
    #print(j)
    #print(GC.BH1[j,i])
    all_run1=c(all_run1,null_2_NA(GC.BH1[j,paste("X",I22[i],sep="")]))
    all_run2=c(all_run2,null_2_NA(GC.BH2[j,paste("X",I22[i],sep="")]))
    all_run3=c(all_run3,null_2_NA(GC.BH3[j,paste("X",I22[i],sep="")]))
    all_run4=c(all_run4,null_2_NA(GC.BH4[j,paste("X",I22[i],sep="")]))
    all_run5=c(all_run5,null_2_NA(GC.BH5[j,paste("X",I22[i],sep="")]))
    all_run6=c(all_run6,null_2_NA(GC.BH6[j,paste("X",I22[i],sep="")]))
    all_run7=c(all_run7,null_2_NA(GC.BH7[j,paste("X",I22[i],sep="")]))
    
  }
}


p2[,3]=all_run1
p2[,4]=all_run2
p2[,5]=all_run3
p2[,6]=all_run4
p2[,7]=all_run5
p2[,8]=all_run6
p2[,9]=all_run7

length(all_run1)
## [1] 1056
length(all_run3)
## [1] 1056
run_mean=apply(p2[,3:9],1,function(x) mean(x,na.rm=T))
run_sd=apply(p2[,3:9],1,function(x) sd(x,na.rm=T))

p2[,10]=run_mean
p2[,11]=run_sd
  

# isolation date
 
date=rep(c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 7, 8), each=48)
p2[,12]=date
colnames(p2)=c("Time", "Isolate", "run1", "run2", "run3", "run4", "run5", "run6", "run7", "mean","sd","date")

scaleFUN<- function(x) sprintf("%.2f", x)
pdf("Fig_S1.pdf",width=10, height=10)

ggplot(data=p2, aes(x=Time, y=mean, group=Isolate, color=Isolate)) +
  geom_line(linetype="solid") + xlab("Time (h)") + 
  geom_vline(xintercept=0,color="grey",linetype=3)+
  geom_vline(xintercept=10,color="grey",linetype=3)+
  geom_vline(xintercept=20,color="grey",linetype=3)+
  geom_vline(xintercept=30,color="grey",linetype=3)+
  geom_vline(xintercept=40,color="grey",linetype=3)+
  geom_vline(xintercept=50,color="grey",linetype=3)+
  geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd), width=0.5, color="black",size=0.3,alpha=0.5) +
  geom_point(aes(fill=Isolate),pch=21,color="black",size=1,lwd=0.2)+
  ylab("OD600") + 
  scale_y_continuous(labels=scaleFUN) + 
  theme_classic(base_size = 12)+
  facet_grid(rows=vars(date))+
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1))
## Warning: Duplicated aesthetics after name standardisation: size
dev.off()
## quartz_off_screen 
##                 2
ggplot(data=p2, aes(x=Time, y=mean, group=Isolate, color=Isolate)) +
  geom_line(linetype="solid") + xlab("Time (h)") + 
  geom_vline(xintercept=0,color="grey",linetype=3)+
  geom_vline(xintercept=10,color="grey",linetype=3)+
  geom_vline(xintercept=20,color="grey",linetype=3)+
  geom_vline(xintercept=30,color="grey",linetype=3)+
  geom_vline(xintercept=40,color="grey",linetype=3)+
  geom_vline(xintercept=50,color="grey",linetype=3)+
  geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd), width=0.5, color="black",size=0.3,alpha=0.5) +
  geom_point(aes(fill=Isolate),pch=21,color="black",size=1,lwd=0.2)+
  ylab("OD600") + 
  scale_y_continuous(labels=scaleFUN) + 
  theme_classic(base_size = 12)+
  facet_grid(rows=vars(date))+
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1))
## Warning: Duplicated aesthetics after name standardisation: size

### PT SNV data
PT.SNV.dt<-data.table(read_xlsx("Patient_mutations.xlsx"))
setnames(PT.SNV.dt, 'In isolates',"isolates")

PT.SNV.dt
##       Position Reference allele Variant allele               Efffect
##    1:      113                A              G            INTERGENIC
##    2:      897                G              A NON_SYNONYMOUS_CODING
##    3:     1527                G              T NON_SYNONYMOUS_CODING
##    4:     2780                T              G            INTERGENIC
##    5:     3592                A              G NON_SYNONYMOUS_CODING
##   ---                                                               
## 6455:  3728264                A              T NON_SYNONYMOUS_CODING
## 6456:  3897012                G              A NON_SYNONYMOUS_CODING
## 6457:  4268100                A              T NON_SYNONYMOUS_CODING
## 6458:  4293566                T              C NON_SYNONYMOUS_CODING
## 6459:  4676024                T              A NON_SYNONYMOUS_CODING
##       Amino acid change Gene length Gene name
##    1:                 -           -         -
##    2:             A233T         483      dnaA
##    3:             D443Y         483      dnaA
##    4:                 -           -         -
##    5:             N249S         817      gyrB
##   ---                                        
## 6455:              V44E         398    kefC_3
## 6456:              A18V         305   A_03713
## 6457:             W100R         430      ptsJ
## 6458:              Y30C         323   A_04082
## 6459:              L74Q         297    lpxL_2
##                                                          Product
##    1:                                          intergenic region
##    2:             Chromosomal replication initiator protein DnaA
##    3:             Chromosomal replication initiator protein DnaA
##    4:                                          intergenic region
##    5:                                       DNA gyrase subunit B
##   ---                                                           
## 6455: Glutathione-regulated potassium-efflux system protein KefC
## 6456:                                       hypothetical protein
## 6457:  Vitamin B6 salvage pathway transcriptional repressor PtsJ
## 6458:                                       hypothetical protein
## 6459:                    Lipid A biosynthesis lauroyltransferase
##                                                                                          isolates
##    1:                                                                                         3G2
##    2:                                                                                   2B2a,2B2b
##    3:                                                                                         7B1
##    4:                                                                                        6G1b
##    5:                                                                              4G1a,4G1c,4G1d
##   ---                                                                                            
## 6455: 1B1a,1B1b,1G2,2B1,2B2a,2B2b,2G1,2G2,3B1,3G1,3G2,3G3,4G1a,4G1b,4G1c,4G1d,5G1a,5G1b,6G1a,6G1b
## 6456: 1B1a,1B1b,1G2,2B1,2B2a,2B2b,2G1,2G2,3B1,3G1,3G2,3G3,4G1a,4G1b,4G1c,4G1d,5G1a,5G1b,6G1a,6G1b
## 6457: 1B1a,1B1b,1G2,2B1,2B2a,2B2b,2G1,2G2,3B1,3G1,3G2,3G3,4G1a,4G1b,4G1c,4G1d,5G1a,5G1b,6G1a,6G1b
## 6458: 1B1a,1B1b,1G2,2B1,2B2a,2B2b,2G1,2G2,3B1,3G1,3G2,3G3,4G1a,4G1b,4G1c,4G1d,5G1a,5G1b,6G1a,6G1b
## 6459: 1B1a,1B1b,1G2,2B1,2B2a,2B2b,2G1,2G2,3B1,3G1,3G2,3G3,4G1a,4G1b,4G1c,4G1d,5G1a,5G1b,6G1a,6G1b
######################
### isolate annotation
iso.dt<-fread("Isolates_divergence.v1.csv")
######################

### ordered list of isolates: WT->E9G, WT elements 1-4
iso_lst<-c("1G1", "2B3", "7B1", "8B1", "1B1a", "1B1b", "1G2", "2B1", "2B2a", "2B2b", "2G1", "2G2", "3B1", "3G1", "3G2", "3G3", "4G1a", "4G1b", "4G1c", "4G1d", "5G1a", "5G1b","6G1a","6G1b")

### ordered dataframe of isolates with GT: WT->E9G, WT elements 1-4
iso.gt.dt<-data.table(cbind(
  isolate=c("1G1", "2B3", "7B1", "8B1", "1B1a", "1B1b", "1G2", "2B1", "2B2a", "2B2b", "2G1", "2G2", "3B1", "3G1", "3G2", "3G3", "4G1a", "4G1b", "4G1c", "4G1d", "5G1a", "5G1b","6G1a","6G1b"),
  GT=c("WT", "WT", "WT", "WT", "dnaQ E9G + mutM/mutY", "dnaQ E9G + mutM/mutY", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G + mutM/mutY", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G", "dnaQ E9G"))
)

### Import aggregated dN/dS data
dNdS.dt<-data.table(read_xlsx(path = "mutation_counts_all_isolates_and_sites.xlsx", range = cell_cols("A:O")))
setnames(dNdS.dt, c('NS sites','S sites'), c("NS_sites","S_sites"))
WT.SNV.dt<-rbind(PT.SNV.dt[isolates %like% "1G1",.(Position,isolate="1G1")],
PT.SNV.dt[isolates %like% "7B1",.(Position,isolate="7B1")],
PT.SNV.dt[isolates %like% "8B1",.(Position,isolate="8B1")],
PT.SNV.dt[isolates %like% "2B3",.(Position,isolate="2B3")])

dcast(WT.SNV.dt[WT.SNV.dt,on=c("Position"="Position"),allow.cartesian=TRUE][,.N,by=.(isolate,i.isolate)],isolate ~i.isolate)
## Using 'N' as value column. Use 'value.var' to override
##    isolate 1G1 2B3 7B1 8B1
## 1:     1G1  17  NA  NA   1
## 2:     2B3  NA  37  23  34
## 3:     7B1  NA  23  47  20
## 4:     8B1   1  34  20  48
ggplot(WT.SNV.dt[WT.SNV.dt,on=c("Position"="Position"),allow.cartesian=TRUE][,.N,by=.(isolate,i.isolate)], aes(x=isolate,y=i.isolate,fill=N,label=N))+ geom_tile() +geom_text() +scale_fill_distiller(palette ="Blues", direction = 1) +
labs(title = "Shared SNVs in wild-type DnaQ isolates",x="isolate",y="",fill="SNVs")  + theme_classic(base_size = 14)

iso_list<-iso.gt.dt[,isolate]
cc.dt<-PT.SNV.dt[isolates %like% iso_list[1], .(SNV=paste(Position,`Reference allele`,`Variant allele`, sep="_"),isolate=iso_list[1])]

for(a in rev(iso_list[-1])) { print(a); cc.dt<-rbind(cc.dt,PT.SNV.dt[isolates %like% a, .(SNV=paste(Position,`Reference allele`,`Variant allele`, sep="_"), isolate=a)])}
## [1] "6G1b"
## [1] "6G1a"
## [1] "5G1b"
## [1] "5G1a"
## [1] "4G1d"
## [1] "4G1c"
## [1] "4G1b"
## [1] "4G1a"
## [1] "3G3"
## [1] "3G2"
## [1] "3G1"
## [1] "3B1"
## [1] "2G2"
## [1] "2G1"
## [1] "2B2b"
## [1] "2B2a"
## [1] "2B1"
## [1] "1G2"
## [1] "1B1b"
## [1] "1B1a"
## [1] "8B1"
## [1] "7B1"
## [1] "2B3"
PT.SNV.long.dt<-cc.dt
rm(cc.dt)
### all-vs-all shared SNV plot

ggplot(PT.SNV.long.dt[PT.SNV.long.dt,on=c("SNV"="SNV"),allow.cartesian=TRUE][,.N,by=.(isolate,i.isolate)], aes(
x=factor(isolate,   levels=iso.gt.dt[,isolate]), 
y=factor(i.isolate, levels=iso.gt.dt[,isolate]),
fill=N,label=N)) + geom_tile() + geom_text(size=2.75) +
scale_fill_distiller(palette ="Blues", direction = 1) +
labs(title = "Shared SNVs in PT isolates",x="isolate",y="",fill="SNVs") + 
  theme_classic(base_size = 10)

PT.SNV.long.dt[,.N,by=isolate]
##     isolate    N
##  1:     1G1   17
##  2:    6G1b  297
##  3:    6G1a  338
##  4:    5G1b  276
##  5:    5G1a  243
##  6:    4G1d  534
##  7:    4G1c  542
##  8:    4G1b  610
##  9:    4G1a  485
## 10:     3G3  431
## 11:     3G2  533
## 12:     3G1  371
## 13:     3B1 1272
## 14:     2G2  428
## 15:     2G1  255
## 16:    2B2b  400
## 17:    2B2a  405
## 18:     2B1  310
## 19:     1G2  267
## 20:    1B1b 1067
## 21:    1B1a 1080
## 22:     8B1   48
## 23:     7B1   47
## 24:     2B3   37
##     isolate    N
ggplot(PT.SNV.long.dt[,.N, by=SNV][order(N)], aes(x=N)) + geom_histogram(binwidth = 1) + scale_y_sqrt() + theme_classic(base_size = 14) + labs(x="Number of isolates",title = "SNV frequency acorss PT isolates")+ theme_classic(base_size = 14)

### smaller subset of heatmap for WT isolates

ggplot(PT.SNV.long.dt[PT.SNV.long.dt,on=c("SNV"="SNV"),allow.cartesian=TRUE][,.N,by=.(isolate,i.isolate)][isolate %in% iso.gt.dt[GT == "WT",isolate]][i.isolate %in% iso.gt.dt[GT == "WT",isolate]], aes(
x=factor(isolate,   levels=iso.gt.dt[,isolate]), 
y=factor(i.isolate, levels=iso.gt.dt[,isolate]),
fill=N,label=N)) + geom_tile() +geom_text() +
scale_fill_distiller(palette ="Blues", direction = 1) +
labs(title = "Shared SNVs in wild-type DnaQ isolates",x="isolate",y="",fill="SNVs") + theme_classic(base_size = 14) 

dcast(PT.SNV.long.dt[PT.SNV.long.dt,on=c("SNV"="SNV"),allow.cartesian=TRUE][,.N,by=.(isolate,i.isolate)][isolate %in% iso.gt.dt[GT == "WT",isolate]][i.isolate %in% iso.gt.dt[GT == "WT",isolate]],isolate~i.isolate)
## Using 'N' as value column. Use 'value.var' to override
##    isolate 1G1 2B3 7B1 8B1
## 1:     1G1  17  NA  NA   1
## 2:     2B3  NA  37  23  34
## 3:     7B1  NA  23  47  20
## 4:     8B1   1  34  20  48
#Figure S5:
###########

### divergence from SNV table:

div.snv.dt<-PT.SNV.long.dt[PT.SNV.long.dt,on=c("SNV"="SNV"),allow.cartesian=TRUE][isolate==i.isolate,.(SNV=.N),by=isolate]

### and now full annotated table
div.date.dt<-div.snv.dt[iso.dt,on=.("isolate"=`Isolate`)][iso.gt.dt,.(isolate,SNV, GT, date=`Days since first sample`),on=.(isolate=isolate)]

pdf("Fig_S5.pdf")
ggplot(div.date.dt,aes(x=date,y=SNV,colour=GT)) + geom_point(size=4) + scale_y_continuous(limits=c(0,1500)) + theme_classic() + scale_x_continuous(limits=c(0,1500)) + labs(x="Collection date (days)")+ scale_color_brewer(type="qual",palette = "Dark2")+ theme_classic(base_size = 14)
dev.off()
## quartz_off_screen 
##                 2
ggplot(div.date.dt,aes(x=date,y=SNV,colour=GT)) + geom_point(size=4) + scale_y_continuous(limits=c(0,1500)) + theme_classic() + scale_x_continuous(limits=c(0,1500)) + labs(x="Collection date (days)")+ scale_color_brewer(type="qual",palette = "Dark2")+ theme_classic(base_size = 14)

#Figure S6
##########

p1 <- ggplot(div.date.dt[GT=="WT"],aes(x=date,y=SNV,colour=GT)) + geom_point(size=4) + scale_y_continuous(limits=c(0,100)) + theme_classic() + scale_x_continuous(limits=c(0,1500)) + labs(x="Collection date (days)")+ scale_color_viridis_d()+ theme_classic(base_size = 14)

lm_eqn = function(m) {

  l <- list(a = format(unname(coef(m)[1]), digits = 2),
      b = format(abs(unname(coef(m)[2])), digits = 2),
      r2 = format(summary(m)$r.squared, digits = 3));

  if (coef(m)[2] >= 0)  {
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
  } else {
    eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
  }

  as.character(as.expression(eq));

}

pdf("Fig_S6.pdf")
p1 + stat_smooth(formula = y~x,method = "lm") + geom_text(colour="black",aes(x = 500, y = 80, label = lm_eqn(lm(y ~ x,div.date.dt[GT=="WT",.(x=date,y=SNV)]))), parse = TRUE)
dev.off()
## quartz_off_screen 
##                 2
p1 + stat_smooth(formula = y~x,method = "lm") + geom_text(colour="black",aes(x = 500, y = 80, label = lm_eqn(lm(y ~ x,div.date.dt[GT=="WT",.(x=date,y=SNV)]))), parse = TRUE)

Now some dN/dS analysis and exploration

###calculate genome-wide dS rates for subsequent calcualtions
GW_PT_dS<-dNdS.dt[,sum(PT_S)/sum(S_sites)]
GW_MA_dS<-dNdS.dt[,sum(MA_S)/sum(S_sites)]

### P-value calculations using binomial test. P value is calculated for dN/dS ratio difference from 1

dNdS.dt[, P.PT := round(
binom.test(x=c(PT_NS,PT_S),p=NS_sites/length,alternative = "g")$p.value,
digits = 3),
by=gene]

dNdS.dt[, P.MA := round(
binom.test(x=c(MA_NS,MA_S),p=NS_sites/length,alternative = "g")$p.value,
digits = 3),
by=gene]

### adjusted dN/dS ratios using genome-wide estimates for dS rate:

dNdS.dt[,dNdS_PT_adj := round(PT_NS/NS_sites/GW_PT_dS,digits = 2)]
dNdS.dt[,dNdS_MA_adj := round(MA_NS/NS_sites/GW_MA_dS,digits = 2)]
### 95% CI estimates for adjusted dN/dS ratios. Observed NS count was adjusted according to binomial CI; P-value is one-sided

dNdS.dt[, CI.PT := paste0(round(
binom.test(x=c(PT_NS,PT_S), p=NS_sites/length, alternative = "g")$conf.int*PT_NS/NS_sites/GW_PT_dS,
digits = 3),collapse = "-"),
by=gene]

dNdS.dt[, CI.MA := paste0(round(
binom.test(x=c(MA_NS,MA_S), p=NS_sites/length, alternative = "g")$conf.int*MA_NS/NS_sites/GW_MA_dS,
digits = 3),collapse = "-"),
by=gene]
fwrite(x=dNdS.dt,file = "dNdS.export.tsv", sep = "\t", row.names = F, col.names = T)
### some P-values
### aggregate N/S data over subsets - GW or 24 genes, PT and MA
GW_PT_S<-dNdS.dt[,sum(PT_S)]
GW_MA_S<-dNdS.dt[,sum(MA_S)]
GW_PT_NS<-dNdS.dt[,sum(PT_NS)]
GW_MA_NS<-dNdS.dt[,sum(MA_NS)]

E9G_PT_S<-dNdS.dt[`24 shared genes` >0,sum(PT_S)]
E9G_MA_S<-dNdS.dt[`24 shared genes` >0,sum(MA_S)]
E9G_PT_NS<-dNdS.dt[`24 shared genes` >0,sum(PT_NS)]
E9G_MA_NS<-dNdS.dt[`24 shared genes` >0,sum(MA_NS)]

### test for GW PT vs MA N/S data
fisher.test(
  as.table(rbind(c(GW_PT_S,GW_MA_S),
                 c(GW_PT_NS,GW_MA_NS)
                 )))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  as.table(rbind(c(GW_PT_S, GW_MA_S), c(GW_PT_NS, GW_MA_NS)))
## p-value = 0.04267
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.003486 1.267496
## sample estimates:
## odds ratio 
##   1.127349
### test for E9G founder vs GW in PT data 
fisher.test(
  as.table(rbind(c(GW_PT_S,E9G_PT_S),
                 c(GW_PT_NS,E9G_PT_NS)
                 )))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  as.table(rbind(c(GW_PT_S, E9G_PT_S), c(GW_PT_NS, E9G_PT_NS)))
## p-value = 0.01613
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.119825 4.569592
## sample estimates:
## odds ratio 
##   2.158966
### test for E9G founder vs GW in MA data 
fisher.test(
  as.table(rbind(c(GW_MA_S,E9G_PT_S),
                 c(GW_MA_NS,E9G_PT_NS)
                 )))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  as.table(rbind(c(GW_MA_S, E9G_PT_S), c(GW_MA_NS, E9G_PT_NS)))
## p-value = 0.05156
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9866348 4.0742356
## sample estimates:
## odds ratio 
##   1.914822
### and a little data exploration
dNdS.dt
##       24 shared genes    gene length NS_sites S_sites PT_NS PT_S  PT_dN  PT_dS
##    1:               0  ycaC_2    678   513.33  161.67     0    1 0.0000 0.0062
##    2:               0  ycaC_1    633   489.00  141.00     1    0 0.0020 0.0000
##    3:               0  proP_1   1527  1153.33  370.67     0    3 0.0000 0.0081
##    4:               0  proP_2   1314   982.33  328.67     1    0 0.0010 0.0000
##    5:               0  surA_1   1542  1168.33  370.67     2    1 0.0017 0.0027
##   ---                                                                         
## 4598:               0    mcl2    864   634.67  226.33     1    1 0.0016 0.0044
## 4599:               0    dsbG    774   584.67  186.33     1    0 0.0017 0.0000
## 4600:               0 A_03632    189   138.00   48.00     0    0 0.0000 0.0000
## 4601:               0  lysN_2   1200   900.00  297.00     0    1 0.0000 0.0034
## 4602:               1 A_03291   2868  2186.33  678.67     4    0 0.0018 0.0000
##       MA_NS MA_S   MA_dN MA_dS PT_dNdS MA_dNdS  P.PT  P.MA dNdS_PT_adj
##    1:     0    0 0.00000     0       0      NA 1.000 1.000        0.00
##    2:     0    0 0.00000     0      NA      NA 0.773 1.000        1.36
##    3:     0    0 0.00000     0       0      NA 1.000 1.000        0.00
##    4:     1    0 0.00100     0      NA      NA 0.748 0.748        0.68
##    5:     0    0 0.00000     0    0.63      NA 0.852 1.000        1.14
##   ---                                                                 
## 4598:     0    0 0.00000     0    0.36      NA 0.930 1.000        1.05
## 4599:     0    0 0.00000     0      NA      NA 0.755 1.000        1.14
## 4600:     0    0 0.00000     0      NA      NA 1.000 1.000        0.00
## 4601:     0    0 0.00000     0       0      NA 1.000 1.000        0.00
## 4602:     2    0 0.00091     0      NA      NA 0.338 0.581        1.22
##       dNdS_MA_adj       CI.PT       CI.MA
##    1:        0.00         0-0         0-0
##    2:        0.00 0.068-1.359         0-0
##    3:        0.00         0-0         0-0
##    4:        2.16 0.034-0.677 0.108-2.162
##    5:        0.00 0.154-1.138         0-0
##   ---                                    
## 4598:        0.00 0.027-1.047         0-0
## 4599:        0.00 0.057-1.137         0-0
## 4600:        0.00         0-0         0-0
## 4601:        0.00         0-0         0-0
## 4602:        1.94 0.575-1.216 0.434-1.943
ggplot(dNdS.dt,aes(x=dNdS_MA_adj))+ geom_histogram() + scale_y_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 6 rows containing missing values (geom_bar).

ggplot(dNdS.dt,aes(x=dNdS_PT_adj))+ geom_histogram() + scale_y_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 5 rows containing missing values (geom_bar).

ggplot(dNdS.dt,aes(x=P.PT))+ geom_histogram() + scale_y_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dNdS.dt,aes(x=P.MA))+ geom_histogram() + scale_y_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 9 rows containing missing values (geom_bar).

ggplot(melt.data.table(dNdS.dt,id.vars = "gene",measure.vars = c("P.PT","P.MA"), value.name = "P.value", variable.name = "dataset" ),
       aes(x=P.value,colour=dataset))+ geom_density(size=1.5) + scale_y_sqrt()+ scale_color_brewer(type = "qual",palette = 1) + theme_classic(base_size = 14)

ggplot(melt.data.table(dNdS.dt,id.vars = "gene",measure.vars = c("dNdS_PT_adj","dNdS_MA_adj"), value.name = "dNdS", variable.name = "dataset" ),
       aes(x=dNdS,colour=dataset))+ geom_density(size=1.5) + scale_y_sqrt()+ scale_color_brewer(type = "qual",palette = 5) + theme_classic(base_size = 14)

#Figure S4
##########

#Making the phylogeny
BH.tree<-read.tree("RAxML_bipartitions.tree_without_ancestor")
BH.bootstrap<-read.tree("RAxML_bootstrap.tree_without_ancestor")
BH.tree$node.label
##  [1] ""    ""    "68"  "100" "100" "100" "97"  "100" "100" "100" "100" "31" 
## [13] "35"  "100" "96"  "100" "100" "100" "100" "100" "100" "100" "99"  "100"
BH.tree$tip.label<-gsub("from_2B3_", "", BH.tree$tip.label)
plot(BH.tree,use.edge.length=FALSE,cex=0.75)
nodelabels(BH.tree$node.label, adj = -0.2, frame = "n", cex = 0.45)
nodelabels(pch=21,node=27,bg="red")

pdf("Fig_S4.pdf")
plot(BH.tree,use.edge.length=FALSE,cex=0.75)
nodelabels(BH.tree$node.label, adj = -0.2, frame = "n", cex = 0.45)
nodelabels(pch=21,node=27,bg="red")
dev.off()
## quartz_off_screen 
##                 2