#### General parameters for the analysis ####

reloadImage <- FALSE

#### Sequence collection ####

## Supported: 
collections <- c(
  "around-CoV-2", 
  "selected",
  "around-CoV-2-plus-GISAID", 
  "selected-plus-GISAID"
  )



# collection <- "around-CoV-2" # 14 genomes
# collection <- "all" # 16 genomes
# collection <- "selected" # ~60 genomes
# collection <- "around-CoV-2-plus-GISAID" # 16 genomes
# collection <- "selected-plus-GISAID" # ~60 genomes
collection <- "all-plus-GISAID" # 16 genomes

## Note about GIDAID sequences.
## 
## A few genomes were not available in NCBI Genbank at the time of 
## this analysis, and had to be downloaded from GISAID. These sequences
##  can however not be redistributed, they should thus be downloaded 
##  manually to reproduce the full trees. 


## Exclude incomplete genomes (i.e. those containing a lof of Ns) to avoid biases in the distance computations
excludeIncomplete <- TRUE



#### Define directories and files ####
dir <- list(main = '..')
dir$R <- file.path(dir$main, "scripts/R")

#### Create output directory for sequences ####
dir$seqdata <- file.path(dir$main, "data")
dir.create(dir$seqdata, showWarnings = FALSE, recursive = TRUE)

## Instantiate a list for output files
outfiles <- vector()

## Memory image
dir$images <- file.path(dir$main, "memory_images")
dir.create(dir$images, recursive = TRUE, showWarnings = FALSE)
outfiles["Memory image"] <- file.path(
  dir$images, 
  paste0(collection, "_one-to-n_matches.Rdata"))

## Input files
infiles <- list()

## Output tables
# di$output <- file.path(dir.main, "")
# dir$tables <- 

## Load custom functions
source(file.path(dir$R, "align_n_to_one.R"))
source(file.path(dir$R, "plot_pip_profiles.R"))

## A unequivocal pattern to identify the reference genome in the sequence names of the input file
refPattern <- "HuCoV2_WH01_2019"

## Exclude some genomes with a lot of Ns, because they bias the PIP profiles and alignments and trees
excludePatterns <- c("PnMP789", "PnGu-P2S_2019")

#### Features of interest in the reference genome ####

features <- list()


#### Specific features  ####

## Regions around the insertions
features[['Ins1-pm120']] <- c(start = 21647, end = 21907)
features[['Ins2-pm120']] <- c(start = 21899, end = 22156)
features[['Ins3-pm120']] <- c(start = 21899, end = 23152)
features[['Ins4-pm120']] <- c(start = 23483, end = 23734)
features[['Ins4-m240']] <- c(start = 23363, end = 23614)



## S1: pre-cleavage part of the spike protein 
features[["S1"]] <- c(start = 21599, end =  23617)
## S2: post-cleavage part of the spike protein 
features[["S2"]] <- c(start = 23618, end = 25381)
## Receptor Binding Domain (RBD)
features[["RBD"]] <- c(start = 22517, end = 23185)
## Potential Pangolin origin after Xiao (https://doi.org/10.1101/2020.02.17.951335)
features[['Recomb-Xiao']] <- c(start = 22871, end = 23092)
## Recombinant region 1 seen on the PIP profiles
features[['Recomb-reg-1']] <- c(start = 21500, end = 22800)
## Recombinant region 2 seen on the PIP profiles
features[['Recomb-reg-2']] <- c(start = 22800, end = 24000)
## Recombinant region 3 seen on the PIP profiles
features[['Recomb-reg-3']] <- c(start = 27800, end = 28350)

## Annotated coding sequences
features[['CDS-S']] <- c(start = 21563, end = 25384) ## Spike gene
features[['CDS-ORF3a']] <- c(start = 25393, end = 26220)
features[['CDS-E']] <- c(start = 26245, end = 26472)
features[['CDS-M']] <- c(start = 26523, end = 27191)
features[['CDS-ORF6']] <- c(start = 27202, end = 27387)
features[['CDS-ORF7a']] <- c(start = 27394, end = 27759)
features[['CDS-ORF8']] <- c(start = 27894, end = 28259)
features[['CDS-N']] <- c(start = 28274, end = 29533)
features[['CDS-ORF10']] <- c(start = 29558, end = 29674)
features[['CDS-ORF1ab']] <- c(start = 266, end = 21555)

## All the sequences after the bif ORF coding for the polyproteinn 1ab
features[['After-ORF1ab']] <- c(start = 21556, end = 29899)



## Report the parameters
message("\tReference genomes: ", refPattern)
## Genome dir and files
if (length(grep(pattern = "GISAID", x = collection)) > 0) {
  useGISAID <- TRUE
  dir$genomes <- file.path(dir$main, "data", "GISAID_genomes")
  # collections <-  paste0(collections, "-plus-GISAID")
  # collection <-  paste0(collection, "-plus-GISAID")
} else {
  dir$genomes <- file.path(dir$main, "data", "genomes")
}


## Define the input genome
infiles$genomes <- file.path(
  dir$genomes, 
  paste0("genomes_", collection, ".fasta"))

## Genome sequences
if (!file.exists(infiles$genomes)) {
  stop("Genome sequence file is missing", "\n", infiles$genomes)
}

#### Load genome sequences ####
genomes <- readDNAStringSet(filepath = infiles$genome, format = "fasta")

## Shorten sequence names by suppressing the fasta comment (after the space)
names(genomes) <- sub(pattern = " .*", replacement = "", x = names(genomes), perl = TRUE)

## Exclude genomes
if (excludeIncomplete) {
  excludePattern = paste0("(", paste(collapse = ")|(", excludePatterns), ")")
  excludedGenomeNames <- grep(pattern = excludePattern, x = names(genomes), 
                              value = TRUE, invert = FALSE)
  filteredGenomeIndices <- grep(pattern = excludePattern, x = names(genomes), 
                                value = FALSE, invert = TRUE)
  message("\tExcluded ", length(excludedGenomeNames)," genomes: ", paste(collapse = ", ", excludedGenomeNames)) 
  message("\tRemaining genomes: ", length(filteredGenomeIndices))
  genomes <- genomes[filteredGenomeIndices]
# names(genomes)
}

## Report the number of genoomes
genomeNames <- names(genomes)
nbGenomes <- length(genomeNames)
message("\tLoaded ", nbGenomes, " genomes from file ", infiles$genomes)
# View(genomes)


#### Define reference and query genomes ####
refGenomeName <- grep(pattern = refPattern, x = names(genomes), 
                      ignore.case = TRUE, value = TRUE)
if (is.null(refGenomeName)) {
  stop("Could not identify reference genome with pattern ", refPattern)
}
message("\tReference genome name: ", refGenomeName)

## Compute some statistics about genome sizes
genomeStat <- data.frame(
  n = 1:length(genomes),
  row.names = names(genomes),
  status = rep("Query", length.out = length(genomeNames))
)
genomeStat[,"status"] <- as.vector(genomeStat[,"status"])
genomeStat[refGenomeName,"status"] <- "Reference"
g <- 1
for (g in genomeNames) {
  genomeStat[g, "length"] <- length(genomes[[g]])
}

Genome statistics

Reference and query genomes
n status length species color
BtBM48-31 1 Query 29276 Bat #888888
BtBtKY72 2 Query 29274 Bat #888888
BtCoV_273-2005 3 Query 29704 Bat #888888
BtCp-Yun_2011 4 Query 29452 Bat #888888
BtGX2013 5 Query 29161 Bat #888888
BtHKU3-12 6 Query 29704 Bat #888888
BtHKU3-6 7 Query 29704 Bat #888888
BtHKU3-8 8 Query 29681 Bat #888888
BtHKU5 9 Query 30482 Bat #888888
BtHKU9-1 10 Query 29114 Bat #888888
BtJL2012 11 Query 29037 Bat #888888
BtJTMC15 12 Query 28761 Bat #888888
BtLongquan-140 13 Query 29676 Bat #888888
BtLYRa11 14 Query 29805 Bat #888888
BtRaTG13_2013_Yunnan 15 Query 29855 Bat #FF6600
Btrec-SARSg_2008 16 Query 29750 Bat #888888
BtRf-HeB-2013 17 Query 29443 Bat #888888
BtRf1-2004 18 Query 29709 Bat #888888
BtRm1/2004 19 Query 29749 Bat #888888
BtRp-Shaanxi2011 20 Query 29484 Bat #888888
BtRp3-2004 21 Query 29736 Bat #888888
BtRs_672-2006 22 Query 29059 Bat #888888
BtRs-HuB-2013 23 Query 29658 Bat #888888
BtRs3367 24 Query 29792 Bat #888888
BtRs4081 25 Query 29741 Bat #888888
BtRs4084 26 Query 29770 Bat #888888
BtRs4231 27 Query 29782 Bat #888888
BtRs4237 28 Query 29741 Bat #888888
BtRs4247 29 Query 29743 Bat #888888
BtRs4874 30 Query 30311 Bat #888888
BtRs7327 31 Query 30307 Bat #888888
BtRsSHC014 32 Query 29787 Bat #888888
BtSC2018 33 Query 29648 Bat #888888
BtWIV16_2013 34 Query 30290 Bat #888888
BtYN2013 35 Query 29142 Bat #888888
BtYN2018B 36 Query 30256 Bat #888888
BtYN2018C 37 Query 29689 Bat #888888
BtYNLF_31C 38 Query 29723 Bat #888888
BtZC45 39 Query 29802 Bat black
BtZXC21 40 Query 29732 Bat black
CmMERS 41 Query 29851 Camel #BB8800
Cv007-2004 42 Query 29540 Civet #00BBFF
Hu229E 43 Query 27317 Human #880000
HuCoV2_WH01_2019 44 Reference 29899 Human red
HuCoV2_Whu1_2019 45 Query 29903 Human #880000
HuCoV2_WIV04°2019 46 Query 29891 Human #880000
HuHK04-02 47 Query 30722 Human #880000
HuHKU1_N23 48 Query 29926 Human #880000
HuMERS_172-06_2015 49 Query 30068 Human #880000
HuNL63 50 Query 27553 Human #880000
HuOC43 51 Query 30741 Human #880000
HuSARS-CoV_Tor2-FP1-10895 52 Query 29646 Human #880000
HuSARS-Frankfurt-1_2003 53 Query 29727 Human #0044BB
HuTGEV 54 Query 28586 Human #880000
HuTW11-SARS 55 Query 29727 Human #880000
PiPRCV 56 Query 27765 Pig #FFBBBB
PiSADS 57 Query 27163 Pig #FFBBBB
PnGX-P1E_2017 58 Query 29801 Pangolin #448800
PnGX-P2V_2018 59 Query 29795 Pangolin #448800
PnGX-P4L 60 Query 29805 Pangolin #448800
PnGX-P5E 61 Query 29802 Pangolin #448800
PnGX-P5L 62 Query 29806 Pangolin #448800
BtYu-RmYN02_2019 63 Query 29671 Bat #FFBB22
PnGu1_2019 64 Query 29825 Pangolin #00BB00

The collection all-plus-GISAID contains 64 virus genome sequences.

One-to-N alignemnts of selected features

We perform a global pairwise alignment (Needle-Waterman algorithm) between each feature of the reference (HuCoV2_WH01_2019) and each one of the query genomes.

#### One-to-N alignmemnt of user-speficied genomic features ####
# featureName <- "CDS-S" # for the test
featureName <- "S1" # for the test
# featureName <- "Recomb-reg-3" # for the test
# featureName <- "RBD" # for the test
# for (collection in (collections)) {
if (reloadImage) {
  load(outfiles["Memory image"])
  reloadImage <- TRUE

} else {
  allFeatureAlignments <- list()
}
for (featureName in names(features)) {
  featureLimits <- features[[featureName]]
  featureStart <- featureLimits[["start"]]
  featureEnd <- featureLimits[["end"]]
  featureLength <- 
    featureEnd - featureStart  + 1
  
  message("Searching matches for feature ", featureName, 
          " (", featureLimits[1], "-", featureLimits[2], ")", 
          " in collection ", collection)
  
  cat(sep = "", "\n### ",  featureName, 
      " (", featureLimits[1], "-", featureLimits[2], 
      "; ", featureLength,"bp)", "\n")

  #### N-to-1 alignments of spike-coding sequences ####
  featurePrefix <- paste0(
    featureName,
    "_", collection)
  
  dir[[featureName]] <- file.path(dir$seqdata, featureName)
  dir.create(dir[[featureName]], showWarnings = FALSE, recursive = TRUE)
  outfiles[featureName]  <- file.path(
    dir[[featureName]], paste0(featurePrefix, ".fasta"))
  message("\tOutput directory: ", dir[[featureName]] )
  
  ## Get sequence of the feature from the reference genome
  refSeq <- subseq(genomes[refGenomeName], 
                   start = featureLimits[1], 
                   end = featureLimits[2])
  
  ## Match the reference feature with all the genomes
  if (reloadImage) {
    featureAlignmentsNto1 <- allFeatureAlignments[[featureName]]
  } else {  
    featureAlignmentsNto1 <- alignNtoOne(
      refSequence = refSeq, 
      querySequences = genomes, 
      type = "global-local",
      outfile = outfiles[featureName])
    
    allFeatureAlignments[[featureName]] <- featureAlignmentsNto1
  }
  
  genomeOrder <- order(featureAlignmentsNto1$stat$score, decreasing = TRUE)
  
  if (featureLength > 20000) {
    windowSize <- 800 
  } else if (featureLength > 5000) {
    windowSize <- 500 
  } else if (featureLength > 3000) {
    windowSize <- 200 
  } else {
    windowSize <- max(100, 10 * round(featureLength / 100))
  }
  
  ## PIP profile of one-to-N alignment
  plotPIPprofiles(
    alignments = featureAlignmentsNto1$alignments[genomeOrder],
    reversePlot = TRUE,
    windowSize = windowSize, 
    main = paste0(featureName, " (", featureStart, ":", featureEnd ,", ", featureLength, " bp)"), 
    #    colors = NULL,
    colors = genomeColors,
    legendMargin = 0.25, 
    legendCorner = "topright", lwd = 1,
    legendCex = 0.5, ylim = c(30, 100))
  
  
  kable(featureAlignmentsNto1$stats[genomeOrder,], 
        caption = paste0(
          "One-to-N alignment of feature ", 
          featureName))
  
}

Ins1-pm120 (21647-21907; 261bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

Ins2-pm120 (21899-22156; 258bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

Ins3-pm120 (21899-23152; 1254bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

Ins4-pm120 (23483-23734; 252bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

Ins4-m240 (23363-23614; 252bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

S1 (21599-23617; 2019bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

S2 (23618-25381; 1764bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

RBD (22517-23185; 669bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

Recomb-Xiao (22871-23092; 222bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

Recomb-reg-1 (21500-22800; 1301bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

Recomb-reg-2 (22800-24000; 1201bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

Recomb-reg-3 (27800-28350; 551bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

CDS-S (21563-25384; 3822bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

CDS-ORF3a (25393-26220; 828bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

CDS-E (26245-26472; 228bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

CDS-M (26523-27191; 669bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

CDS-ORF6 (27202-27387; 186bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

CDS-ORF7a (27394-27759; 366bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

CDS-ORF8 (27894-28259; 366bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

CDS-N (28274-29533; 1260bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

CDS-ORF10 (29558-29674; 117bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

CDS-ORF1ab (266-21555; 21290bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

After-ORF1ab (21556-29899; 8344bp)

Feature-specific Percent Identical Positions (PIP) profiles.

Feature-specific Percent Identical Positions (PIP) profiles.

Output files

Directories
Dir
main ..
R ../scripts/R
seqdata ../data
images ../memory_images
genomes ../data/GISAID_genomes
Ins1.pm120 ../data/Ins1-pm120
Ins2.pm120 ../data/Ins2-pm120
Ins3.pm120 ../data/Ins3-pm120
Ins4.pm120 ../data/Ins4-pm120
Ins4.m240 ../data/Ins4-m240
S1 ../data/S1
S2 ../data/S2
RBD ../data/RBD
Recomb.Xiao ../data/Recomb-Xiao
Recomb.reg.1 ../data/Recomb-reg-1
Recomb.reg.2 ../data/Recomb-reg-2
Recomb.reg.3 ../data/Recomb-reg-3
CDS.S ../data/CDS-S
CDS.ORF3a ../data/CDS-ORF3a
CDS.E ../data/CDS-E
CDS.M ../data/CDS-M
CDS.ORF6 ../data/CDS-ORF6
CDS.ORF7a ../data/CDS-ORF7a
CDS.ORF8 ../data/CDS-ORF8
CDS.N ../data/CDS-N
CDS.ORF10 ../data/CDS-ORF10
CDS.ORF1ab ../data/CDS-ORF1ab
After.ORF1ab ../data/After-ORF1ab
Output files
dir file
../memory_images all-plus-GISAID_one-to-n_matches.Rdata
../data/Ins1-pm120 Ins1-pm120_all-plus-GISAID.fasta
../data/Ins2-pm120 Ins2-pm120_all-plus-GISAID.fasta
../data/Ins3-pm120 Ins3-pm120_all-plus-GISAID.fasta
../data/Ins4-pm120 Ins4-pm120_all-plus-GISAID.fasta
../data/Ins4-m240 Ins4-m240_all-plus-GISAID.fasta
../data/S1 S1_all-plus-GISAID.fasta
../data/S2 S2_all-plus-GISAID.fasta
../data/RBD RBD_all-plus-GISAID.fasta
../data/Recomb-Xiao Recomb-Xiao_all-plus-GISAID.fasta
../data/Recomb-reg-1 Recomb-reg-1_all-plus-GISAID.fasta
../data/Recomb-reg-2 Recomb-reg-2_all-plus-GISAID.fasta
../data/Recomb-reg-3 Recomb-reg-3_all-plus-GISAID.fasta
../data/CDS-S CDS-S_all-plus-GISAID.fasta
../data/CDS-ORF3a CDS-ORF3a_all-plus-GISAID.fasta
../data/CDS-E CDS-E_all-plus-GISAID.fasta
../data/CDS-M CDS-M_all-plus-GISAID.fasta
../data/CDS-ORF6 CDS-ORF6_all-plus-GISAID.fasta
../data/CDS-ORF7a CDS-ORF7a_all-plus-GISAID.fasta
../data/CDS-ORF8 CDS-ORF8_all-plus-GISAID.fasta
../data/CDS-N CDS-N_all-plus-GISAID.fasta
../data/CDS-ORF10 CDS-ORF10_all-plus-GISAID.fasta
../data/CDS-ORF1ab CDS-ORF1ab_all-plus-GISAID.fasta
../data/After-ORF1ab After-ORF1ab_all-plus-GISAID.fasta

Memory image

We store the result ina memory image, in oder to be able reloading it to plot PIP profiles with different parameters.

Session info

R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Mojave 10.14.6

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] Biostrings_2.52.0   XVector_0.24.0      IRanges_2.18.3      S4Vectors_0.22.1    BiocGenerics_0.30.0 knitr_1.28         

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.4          digest_0.6.25       magrittr_1.5        evaluate_0.14       highr_0.8           zlibbioc_1.30.0     rlang_0.4.5         stringi_1.4.6       rmarkdown_2.1       tools_3.6.1         stringr_1.4.0       xfun_0.12           yaml_2.2.1          compiler_3.6.1     
[15] BiocManager_1.30.10 htmltools_0.4.0