Sanger sequence analysis - Lauren - August 2020

Julius updated the aligned .fasta file for the 110 Sanger sequences on July 26.

This file parses and interprets the aligned .fasta file using the package “biostrings.”

Setup

Clear workspace

rm(list=ls())

Install packages

# only run the installer the first time
# if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")

# BiocManager::install("Biostrings")

library(Biostrings)
library(dplyr)

Read in data

gaps<-readDNAStringSet('~/Documents/Duke/TMOLab/CFR - drought and herbivory/Previous data/Julius - Allelic variation/Most recent/110RP_updated_7.26.2020.fasta')

g<-data.frame(gaps)
colnames(g)[1]<-"seq"

Analysis of aligned read data

Structural variant 1: 8bp deletion

Identify presence/absence and location of deletion:

g$del<-rep(NA, nrow(g))
g$del.loc<-rep(NA, nrow(g))

g.str<-strsplit(as.character(g$seq), split="")

i<-1
j<-1

for(i in 1:nrow(g)){
  tmp<-g.str[[i]]
  tmp.out<-rep(NA, length(tmp))
  for(j in 1:length(tmp)){
    if(tmp[j]=="-"){
      tmp.out[j]<-j}
    else{
      tmp.out[j]<-NA
    }
  }
  if(sum(!is.na(tmp.out), na.rm=T)>0){
    g$del[i]<-1
    g$del.loc[i]<-min(tmp.out, na.rm=T)
  }
  else{
    g$del[i]<-0
    g$del.loc[i]<-NA
  }
}

Show output:

g[1:5, 2:3]
##                  del del.loc
## RP004.WES.BC.nd    0      NA
## RP014.NOR.BC.nd    0      NA
## RP017.WES.BC.13    0      NA
## RP018.NOR.BC.nd    0      NA
## RP021.WES.BC.13    0      NA
table(g$del, g$del.loc)
##    
##     371
##   0   0
##   1  20

Analysis of non-aligned data

Here, we remove the gaps (“-”) from the .fasta file to allow the 8bp deletion, when present, to (appropriately) disrupt the reading frame of the BCMA3 gene.

This assumes no alternative splicing or alternative reading frames for BCMA3.

Remove gaps:

g$seq.nogaps<-rep(NA, nrow(g))

# sanity check: gsub
x<-"ABC--D-EF---G"
y<-gsub("-", "", x)
y
## [1] "ABCDEFG"
# apply to dataframe
g$seq.nogaps<-gsub("-", "", g$seq)

Show output:

g[110, c(1,4)]
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     seq
## RP608.COL.MET.3 ATGATGATGATGAGCCTTACCACATCATTACCATACCCTTTTCAAATCCTACTAGTCTTTATCGTCTCCATGGCATCAATCTCTATACTAGGTCGAATACTCTCAAGGCCCACCAAAACCAAAGACAGATCTCGCCATCTTCCTCCTAGCCCACCAGAATGGCCCATCCTCGGCAATTTACCCGAATTATTCATGACTCGTCCTAGGTCCAAATATTTCGACCTTGCCATGAAAGAGCTAAAAACGGATATCGCATGTTTCAACTTCGCCGGAGTCGGCGCCATCATCATAAATTCCGTCGAAATTGCTAGAGAAGCGTTTCGAGAACGAGACGCTGATTTGGCAGACCGGCCTCATCTTTTCATCATGA--------GGAGACAATTACAAGTCAATGTTAATCTCACCGTACGGTGAACAATTCATGAAGATGAAAAGATTGATCACAACGGAAATTATGTCCGTTAAAACATTGAACATGTTGGCAGCTGCAAGAAGCATCGAAGCGGATAATCTCATTGCTTACGTTCACTCGATGTATCAACGGTCCGAGACGGTGGACGTTAGAGAACTCTCGAGGGTTTATGGGTACGCAGTGACCATGAGAATGTTGTTTGGAAGGAGACATGTCACGAAAGAAAACGTTTTTTCGGATGAAGGGAGACTAGGTAAAGCGGAAAAACATCATCTTGAGGCTATTTTCAACACTCTAAACTGTTTGCCGAGTTTTAGTCCCGCGGATTACGTGGAACGATGGCTTAAAGGTTGGAATATCGATGGTCAAGAGGAGAGGGTGACAGTGAATTGTAATATTGTTCGTAGTTACAACAATCCCATAATCGACGAGAGGGTCGAGTTATGGAGAGAAAAAGGTGGTAAGGCTGTTGTTGAAGATTGGCTGGATACGTTCATTACGCTAAAAGATCAAAACGGAAAGTACGTGGTCACGCCAGACGAAATAAAAGCTCAATGCGTTGAATTCTGTATAGCAGCGTTCGATAATCCGGCAAATAACATGGAGTGGACACTTGCGGAAATGTTAAAGAACCCGGAGATTCTCAAAAAAGCTCTGAAAGAATTAGACGAAGTAGTTGGAAAAGACAGGCTTGTGCAAGAATCAGACATACCAAATCTAAACTACTTAAAAGCTTGTTGCAGAGAAACATTCAGGATTCACCCAAGCACTCACTATGTCCCTACTCATGTTGCCCGTCAAGATACCACCCTCGGAGGTTATTTCATTCCCGAAGGTAGCCACATTCATGTATGCCGCCGTGGACTAGGCCAGAACCCTAAAATATGGAAAGATCCATTGGTATACAAACCGGAGCGTCACCTCCAAGGAGACGGAATCACACAAGAGGTTACTCTGGTGGAAACAGAGATGCGTTTTGTCTCGTTTGGCACGGGTCGACGTGGCTGCATCGGTGTTAAAGTCGGGACGATCATGATGGTTATGATGTTGGCTAGGTTTATTCAAGGGTTTAACTGGAAACTTCATCAAGATTTTGGACCGTTAAGCCTCGAGGAAGATGATGCATCATTGCTTATGGCTAAACCTCTCCTCTTGTCCGTTGAGCCACGCTTGGCACCAAACCTTTATAAAAAATTTCGTCCTTAA
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      seq.nogaps
## RP608.COL.MET.3 ATGATGATGATGAGCCTTACCACATCATTACCATACCCTTTTCAAATCCTACTAGTCTTTATCGTCTCCATGGCATCAATCTCTATACTAGGTCGAATACTCTCAAGGCCCACCAAAACCAAAGACAGATCTCGCCATCTTCCTCCTAGCCCACCAGAATGGCCCATCCTCGGCAATTTACCCGAATTATTCATGACTCGTCCTAGGTCCAAATATTTCGACCTTGCCATGAAAGAGCTAAAAACGGATATCGCATGTTTCAACTTCGCCGGAGTCGGCGCCATCATCATAAATTCCGTCGAAATTGCTAGAGAAGCGTTTCGAGAACGAGACGCTGATTTGGCAGACCGGCCTCATCTTTTCATCATGAGGAGACAATTACAAGTCAATGTTAATCTCACCGTACGGTGAACAATTCATGAAGATGAAAAGATTGATCACAACGGAAATTATGTCCGTTAAAACATTGAACATGTTGGCAGCTGCAAGAAGCATCGAAGCGGATAATCTCATTGCTTACGTTCACTCGATGTATCAACGGTCCGAGACGGTGGACGTTAGAGAACTCTCGAGGGTTTATGGGTACGCAGTGACCATGAGAATGTTGTTTGGAAGGAGACATGTCACGAAAGAAAACGTTTTTTCGGATGAAGGGAGACTAGGTAAAGCGGAAAAACATCATCTTGAGGCTATTTTCAACACTCTAAACTGTTTGCCGAGTTTTAGTCCCGCGGATTACGTGGAACGATGGCTTAAAGGTTGGAATATCGATGGTCAAGAGGAGAGGGTGACAGTGAATTGTAATATTGTTCGTAGTTACAACAATCCCATAATCGACGAGAGGGTCGAGTTATGGAGAGAAAAAGGTGGTAAGGCTGTTGTTGAAGATTGGCTGGATACGTTCATTACGCTAAAAGATCAAAACGGAAAGTACGTGGTCACGCCAGACGAAATAAAAGCTCAATGCGTTGAATTCTGTATAGCAGCGTTCGATAATCCGGCAAATAACATGGAGTGGACACTTGCGGAAATGTTAAAGAACCCGGAGATTCTCAAAAAAGCTCTGAAAGAATTAGACGAAGTAGTTGGAAAAGACAGGCTTGTGCAAGAATCAGACATACCAAATCTAAACTACTTAAAAGCTTGTTGCAGAGAAACATTCAGGATTCACCCAAGCACTCACTATGTCCCTACTCATGTTGCCCGTCAAGATACCACCCTCGGAGGTTATTTCATTCCCGAAGGTAGCCACATTCATGTATGCCGCCGTGGACTAGGCCAGAACCCTAAAATATGGAAAGATCCATTGGTATACAAACCGGAGCGTCACCTCCAAGGAGACGGAATCACACAAGAGGTTACTCTGGTGGAAACAGAGATGCGTTTTGTCTCGTTTGGCACGGGTCGACGTGGCTGCATCGGTGTTAAAGTCGGGACGATCATGATGGTTATGATGTTGGCTAGGTTTATTCAAGGGTTTAACTGGAAACTTCATCAAGATTTTGGACCGTTAAGCCTCGAGGAAGATGATGCATCATTGCTTATGGCTAAACCTCTCCTCTTGTCCGTTGAGCCACGCTTGGCACCAAACCTTTATAAAAAATTTCGTCCTTAA

Translate gapless sequence to amino acids

Translate gapless sequence using Biostrings:

g$seq.aa<-rep(NA, nrow(g))
  
i<-1
for(i in 1:nrow(g)){
  tmp<-as.character(translate(DNAString(g$seq.nogaps[i])))
  g$seq.aa[i]<-tmp
}

Show output:

g$seq.aa[1]
## [1] "MMMMSLTTSLPYPFQILLVFIVSMASISILGRILSRPTKTKDRSRHLPPSPPEWPILGNLPELFMTRPRSKYFDLAMKELKTDIACFNFAGVRAIIINSVEIAREAFRERDADLADRPHLFIMETIGDNYKSMLISPYGEQFMKMKRVITTEIMSVKTLNMLAAARSIEADNLIAYVHSMYQRSETVDVRELSRVYGYAVTMRMLFGRRHVTKENVFSDEGRLGKAEKHHLEAIFNTLNCLPSFSPADYVERWLKGWNIDGQEERVTMNCNIVRSYNNPIIDERVELWREKGGKAVVEDWLDTFITLKDQNGKYVVTPDEIKAQCVEFCIAAFDNPANNMEWTLAEMLKNPEILKKALKELDEVVGKDRLVQESDIPNLNYLKACCRETFRIHPSTHYVPTHVARQDTTLGGYFIPEGSHIHVCRRGLGQNPKIWKDPLVYKPERHLQGDGITQEVTLVETEMRFVSFGTGRRGCIGVKVGTIMMVMMLARFIQGFNWKLHQDFGPLSLEEDDASLLMAKPLLLSVEPRLAPNLYKKFRP*"

Structural variant 2: premature stop codon vs. full-length exons

Identify the presence and position of the first "*" in the translated sequence.

g$pre<-rep(NA, nrow(g))
g$stop.loc<-rep(NA, nrow(g))

a.str<-strsplit(as.character(g$seq.aa), split="")

i<-1
j<-1

for(i in 1:nrow(g)){
  tmp<-a.str[[i]]
  tmp.out<-rep(NA, length(tmp))
  for(j in 1:length(tmp)){
    if(tmp[j]=="*"){
      tmp.out[j]<-j}
    else{
      tmp.out[j]<-NA
    }
  }
  if(sum(!is.na(tmp.out), na.rm=T)>1){
    g$pre[i]<-1
    g$stop.loc[i]<-min(tmp.out, na.rm=T)
  }
  else{
    g$pre[i]<-0
    g$stop.loc[i]<-min(tmp.out, na.rm=T)
  }
}

Show output:

g[1:5, 6:7]
##                  pre stop.loc
## RP004.WES.BC.nd    0      541
## RP014.NOR.BC.nd    0      541
## RP017.WES.BC.13    0      541
## RP018.NOR.BC.nd    0      541
## RP021.WES.BC.13    0      541

Examine variation in stop codon position:

table(g$stop.loc)
## 
## 137 287 313 541 
##  20   2   1  87

Critical amino acid residues

Before parsing AAs at the positions of interest from Prasad et al. (148 and 268), we must replace all codons following the initial stop codon with *. Otherwise, codons which occur downstream of the first stop codon will be “expressed”, which should not be the case.

g$seq.aa.mod<-rep(NA, nrow(g))

aa.tmp<-strsplit(as.character(g$seq.aa), split="")

a<-1
s<-1
for(s in 1:length(aa.tmp)){
  for(a in 2:length(aa.tmp[[s]])){
    if(aa.tmp[[s]][a]=="*" & aa.tmp[[s]][a-1]!="*"){
      aa.tmp[[s]][c((a+1):length(aa.tmp[[s]]))]<-"*"
    }
  }
  a<-2
}

for(i in 1:nrow(g)){
  g$seq.aa.mod[i]<-gsub(", ", "", toString(aa.tmp[[i]]))
}

Show output:

g$seq.aa[110]
## [1] "MMMMSLTTSLPYPFQILLVFIVSMASISILGRILSRPTKTKDRSRHLPPSPPEWPILGNLPELFMTRPRSKYFDLAMKELKTDIACFNFAGVGAIIINSVEIAREAFRERDADLADRPHLFIMRRQLQVNVNLTVR*TIHEDEKIDHNGNYVR*NIEHVGSCKKHRSG*SHCLRSLDVSTVRDGGR*RTLEGLWVRSDHENVVWKETCHERKRFFG*RETR*SGKTSS*GYFQHSKLFAEF*SRGLRGTMA*RLEYRWSRGEGDSEL*YCS*LQQSHNRREGRVMERKRW*GCC*RLAGYVHYAKRSKRKVRGHARRNKSSMR*ILYSSVR*SGK*HGVDTCGNVKEPGDSQKSSERIRRSSWKRQACARIRHTKSKLLKSLLQRNIQDSPKHSLCPYSCCPSRYHPRRLFHSRR*PHSCMPPWTRPEP*NMERSIGIQTGASPPRRRNHTRGYSGGNRDAFCLVWHGSTWLHRC*SRDDHDGYDVG*VYSRV*LETSSRFWTVKPRGR*CIIAYG*TSPLVR*ATLGTKPL*KISSL"
g$seq.aa.mod[110]
## [1] "MMMMSLTTSLPYPFQILLVFIVSMASISILGRILSRPTKTKDRSRHLPPSPPEWPILGNLPELFMTRPRSKYFDLAMKELKTDIACFNFAGVGAIIINSVEIAREAFRERDADLADRPHLFIMRRQLQVNVNLTVR******************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************"

Now we can parse the modified AA sequence and focus on the positions of interest.

aa.tmp<-strsplit(as.character(g$seq.aa.mod), split="")

# in this version of the data, the actual positions match Prasad et al., so there is no need to alter the position read in from the data file
seq.148<-unlist(lapply(aa.tmp, `[`, 148)) 
seq.268<-unlist(lapply(aa.tmp, `[`, 268))

# append back to datafile
g$AA148<-seq.148
g$AA268<-seq.268

# concatenate AA pheno
g$aa.concat<-paste(g$AA148, g$AA268, sep="")
unique(g$aa.concat)
## [1] "VM" "VV" "LV" "**"
# examine contingency table
table(g$AA148, g$AA268)
##    
##      *  M  V
##   * 20  0  0
##   L  0  0 21
##   V  0 18 51

Save modified file with all Sanger data

This file includes the various sequence versions as well as the “downstream” derived variables of interest, for use in our models regarding the factors underlying variation in BC-ratio.

g$rowname<-rownames(g)
g.tmp<-strsplit(as.character(g$rowname), split="[.]")

g$ID<-rep(NA, nrow(g))

i<-1
for(i in 1:nrow(g)){
  g$ID[i]<-g.tmp[[i]][1]
}
g.final<-dplyr::select(g, rowname, ID, del, del.loc, pre, stop.loc, AA148, AA268, aa.concat, seq, seq.nogaps, seq.aa, seq.aa.mod)

write.csv(g.final, "~/Documents/Duke/TMOLab/CFR - drought and herbivory/Previous data/Julius - Allelic variation/Most recent/LNC_redux/BCMA3SangerData_110RP_LNC_20200820.csv", row.names=F)