# Example for the identification of phage p22 in S.Tm LT2

This R script shows an analysis example of using outwards-oriented paired-end reads (OPRs), identified using the OPR finder function in the mVIRs package, to identify p22 in S.Tm LT2. In addition to the OPR finder output, the script uses a fragment (insert) coverage file of single-copy marker genes to compute OPR counts per cell. Methodological details are described in in Zünd et al., 2020.

Table of Contents

  1. Setting up your environment
  2. Processing the input data
  3. OPR analysis for full length of the host genome
  4. OPR analysis for a sub-section of the host genome
  5. Extract OPR counts around the attL and attR site of p22

1. Setting up your environment

The following packages are required for the analysis:

The script below will install these for you

list.of.packages <- c("tidyverse", "vegan", "igraph")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
lapply(list.of.packages, require, character.only = TRUE)

Create and set working directory

dir.create("~/example/", showWarnings = FALSE)
setwd("~/example/")

Copy the following files, which were included in OPR_analysis_example.zip, into the newly created working directory:

2. Processing the input data

Load marker gene fragment coverage data

The fragment coverage file contains (i) the median fragment (insert) coverage of single copy marker genes and (ii) the length of the genome of S.Tm LT2.

MG_coverage <- read.table("MG_coverage.txt",header=TRUE) 

Load FeatureCount data and extract median gene length

The FeatureCount data are used to extract median gene length, which is used to define the distance between breakpoints of genomic coordinates.

Control_1_fcnt <- read.table("Control_1.fcnt", header = TRUE)  #load FeatureCount file
bp <- c(seq(0,MG_coverage[1,2],by=(median(Control_1_fcnt$Length)-1)),MG_coverage[1,2]) #define breakpoint of bins with the median gene length as distance between two breakpoints

3. OPR analysis for full length of the host genome

Functions for plotting OPRs

## Function for strain without plasmid(s)
# x = input file

load_isolate_sample <- function(x){ 
  rr <- read.table(x, sep="\t")
  colnames(rr) <- c("Readname","Strain","Insert_size","Orientation_R1","Orientation_R2","BWA_Score","PosR1","PosR2","R1_ALNLENGTH","R2_ALNLENGTH", "INSERT_ORIENTATION")#rename the columns
  cc <- rr%>%filter(INSERT_ORIENTATION=="OPR")#filter for paired-end read with OPR characteristics.
  cc
}

## Function for community sample or strain with plasmid(s)
# x = input file
# strain = strain within a community or the strain name carrying a plasmid. Not needed in the culture samples 

load_community_samples <- function(x,strain){ 
  rr <- read.table(x, sep="\t")
  colnames(rr) <- c("Readname","Strain","Insert_size","Orientation_R1", "Orientation_R2","BWA_Score","PosR1","PosR2","R1_ALNLENGTH","R2_ALNLENGTH", "INSERT_ORIENTATION")#rename the columns
  cc <- rr%>%filter(INSERT_ORIENTATION=="OPR")#filter for paired-end read with OPR characteristics.
  cc <- cc%>%filter(Strain==strain)#filter for reads from specific strain.
  cc
}

## Function for OPR data plotting 
# input_file = OPR data frame  
# MG = Frag_cov data frame  
# seqStart = smallest break point 
# seqEnd = biggest break kpoint
# i = sample name as string. The sample names must match the row names of the Frag_cov data frame



OPR_graph <- function(input_file,MG,i, seqStart,seqEnd){
  input_file$PosR1 <- input_file$PosR1+1 #change that the sequence start at position 1 and not at 0
  input_file$PosR2 <- input_file$PosR2+1 #change that the sequence start at position 1 and not at 0
  input_file=input_file[input_file[,"Insert_size"]< MG[1,2]-4500,] #filter out OPR pairs with an insert size of roughly the bacterial genome
  input_file=input_file[input_file[,"Insert_size"]>1000,] # keep only OPR pairs with an insert size that is lager than 1000bp.   
  input_file$P1 <- cut(input_file$PosR1, breaks = bp,dig.lab = 7) # use the defined break point to group the read 1 into the binds
  P1= as.data.frame(do.call(rbind,lapply(strsplit(sub("\\]","",sub("\\(","",input_file$P1)),","),as.numeric)))# extract start and end of bins from read 1
  input_file$P2 <- cut(input_file$PosR2, breaks = bp,dig.lab = 7) # use the defined break point to group the read 2 into the binds
  P2= as.data.frame(do.call(rbind,lapply(strsplit(sub("\\]","",sub("\\(","",input_file$P2)),","),as.numeric)))# extract start and end of bin from read 2
  Bins<- as.data.frame(cbind(V1=((P1$V2+P1$V1+1)/2),V2=((P2$V2+P2$V1+1)/2))) #calculate the mid-position of each bin and combine coordinates of bins of read 1 and read 2 into a data frame
  Length <- unique(as.data.frame(cbind(V1=c(((P1$V2+P1$V1+1)/2),((P2$V2+P2$V1+1)/2)),L1=c((P1$V2-P1$V1+1),(P2$V2-P2$V1+1)))))# calculate length of each bin
  Bin_graph <- graph_from_data_frame(Bins, directed = FALSE,vertices = NULL) #build a undirected graph using the mid-bin position of Read 1 and Read 2
  edge_bins <- as.data.frame(get.edgelist(Bin_graph),stringsAsFactors = FALSE) #extract edglist of graph
  edge_bins$V1 <- as.numeric(edge_bins$V1) #convert egde into number
  edge_bins$V2 <- as.numeric(edge_bins$V2) #convert egde into number
  edge_bins <- edge_bins%>%filter(V1>seqStart, V2<seqEnd ,V2>seqStart,V1<seqEnd) #filter out edges that are smaller than smallest break point and bigger than biggest break point
  edge_conv <- cbind(edge_bins$V2, edge_bins$V1)#make a data frame where bin 2 is in the first columen
  all_edge <- rbind(edge_bins, edge_conv)# combine the two data frame to have all edges in the first and second column
  edge_count <- all_edge%>%group_by(V1)%>% #count how many times you have each edge in column V1. since we added all the edge of V2 to V1 we only have to count it in one of the columns
    summarize(Total_count=sum(count=n()))
  edge_count <- left_join(edge_count, Length, by="V1") # merge bins length and edge_count table
  edge_count$Total_count<- ((edge_count$Total_count/edge_count$L1)/MG[i,1])# normalize the edge counts by the median coverage of the 10 single-copy marker gene evaluated from the DESeq2 featureCount and by the bin length
  colnames(edge_count) <- c("V1",paste(substr(i,1,7),"Count",sep ="_"),"gene_length")#rename column of edge_counts table
  return(edge_count)
}

Load OPR files

### Use appropriate function:
# “load_isolate_sample”, if the data are from a cultured strain without plasmids or 
# “load_community_samples”, if the data are from a community sample or a strain with plasmid(s).
# HERE: “load_community_samples(x=input file, strain=strain name)” because *S*.Tm LT2 carries plasmids

C_1<- load_community_samples("Control_1.opr",strain="SalmonellaLT2")
C_2<- load_community_samples("Control_2.opr",strain="SalmonellaLT2")
C_3<- load_community_samples("Control_3.opr",strain="SalmonellaLT2")

I_1<- load_community_samples("Induced_1.opr",strain="SalmonellaLT2")
I_2<- load_community_samples("Induced_2.opr",strain="SalmonellaLT2")
I_3<- load_community_samples("Induced_3.opr",strain="SalmonellaLT2")

Build OPR graph for the full length of the host genome

# If interested in all OPRs at any genome position, the seqStart should be set to 1 and the seqEnd should represent the genome length. Here, the genome length can be extracted from “MG_coverage.txt” file.

Con_1 <- OPR_graph(C_1,MG=MG_coverage,i="Control_1",seqStart = 1, seqEnd = MG_coverage[1,2])
Con_2 <- OPR_graph(C_2,MG=MG_coverage,i="Control_2",seqStart = 1, seqEnd = MG_coverage[1,2])
Con_3 <- OPR_graph(C_3,MG=MG_coverage,i="Control_3",seqStart = 1, seqEnd = MG_coverage[1,2])

Ind_1 <- OPR_graph(I_1,MG=MG_coverage,i="Induced_1",seqStart = 1, seqEnd = MG_coverage[1,2])
Ind_2 <- OPR_graph(I_2,MG=MG_coverage,i="Induced_2",seqStart = 1, seqEnd = MG_coverage[1,2])
Ind_3 <- OPR_graph(I_3,MG=MG_coverage,i="Induced_3",seqStart = 1, seqEnd = MG_coverage[1,2])

# Calculate average OPR count within each bin  
# Merge all control samples and calculate the average OPR count per bin
Control <- bind_rows(Con_1,Con_2,Con_3)%>% #combine all the control samples into one data frame
  group_by(V1)%>% #group all bins with the same position
  summarize(All_count_C=sum(Control_Count)/3) #calculate the average read count per bin

# Merge all induced samples and calculate the average OPR count per bin
Induced <- bind_rows(Ind_1,Ind_2, Ind_3)%>% #combine all the induced samples into one data frame
  group_by(V1)%>% #group all bins with the same position
  summarize(All_count_I=sum(Induced_Count)/3) #calculate the average read count per bins

# Plot average ORP count per genome position
plot_data <- full_join(Control,Induced, by="V1")
plot_data[is.na(plot_data)] <- 0 #replace NA by 0

ggplot(data=plot_data)+
  geom_col(aes(x=(as.numeric(V1)/1000000), y=All_count_I,fill="#fb8072"),width=0.004)+
  geom_col(aes(x=(as.numeric(V1)/1000000), y=All_count_C,fill="#988699"),width=0.004)+
  theme_light()+
  scale_fill_manual(values=c( "#988699","#fb8072"))+
  theme(legend.position  ="none",
        plot.title = element_text(hjust = 0.5, size = 16))+
  ylab("OPRs (per cell)")+
  xlab("genome position (Mb)")+
  theme(axis.text.y = element_text(size=16),
        axis.title.x = element_text(size=20),
        axis.text.x = element_text(size=16),
        axis.title.y = element_text(size=20),
        axis.ticks.x=element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "none")

4. OPR analysis for a sub-section of the host genome

# Plot only OPRs for which both reads are located within a sub-section of the genome  
# The example of the prophage p22 is shown
# The host genome region of interest is defined by seqStart and seqEnd

Con_p1 <- OPR_graph(C_1,MG=MG_coverage,i="Control_1",seqStart=1210000,seqEnd=1270000)
Con_p2 <- OPR_graph(C_2,MG=MG_coverage,i="Control_2",seqStart=1210000,seqEnd=1270000)
Con_p3 <- OPR_graph(C_3,MG=MG_coverage,i="Control_3",seqStart=1210000,seqEnd=1270000)

Ind_p1 <- OPR_graph(I_1,MG=MG_coverage,i="Induced_1",seqStart=1210000,seqEnd=1270000)
Ind_p2 <- OPR_graph(I_2,MG=MG_coverage,i="Induced_2",seqStart=1210000,seqEnd=1270000)
Ind_p3 <- OPR_graph(I_3,MG=MG_coverage,i="Induced_3",seqStart=1210000,seqEnd=1270000)

# Merge all control samples and calculate the average OPR count per bin
Control_p <- bind_rows(Con_p1,Con_p2,Con_p3)%>% #combine all the control samples into one data frame
  group_by(V1)%>% #group all bins with the same position
  summarize(All_count_C=sum(Control_Count)/3) #calculate the average read count per bin

# Merge all induced samples and calculate the average OPR count per bin
Induced_p <- bind_rows(Ind_p1,Ind_p2, Ind_p3)%>% #combine all the induced samples into one data frame
  group_by(V1)%>% #group all bins with the same position. 
  summarize(All_count_I=sum(Induced_Count)/3) #calculate the average read count per bin

Build OPR graph for subsection of the host genome

plot_data_p <- full_join(Control_p,Induced_p, by="V1")
plot_data_p[is.na(plot_data_p)] <- 0 #replace NA by 0

ggplot(data=plot_data_p)+
  geom_col(aes(x=(as.numeric(V1)/1000000), y=All_count_I,fill="#fb8072"),width=0.0008)+
  geom_col(aes(x=(as.numeric(V1)/1000000), y=All_count_C,fill="#988699"),width=0.0008)+
  theme_light()+
  scale_fill_manual(values=c( "#988699","#fb8072"))+
  theme(legend.position  ="none",
        plot.title = element_text(hjust = 0.5, size = 16))+
  ylab("OPRs (per cell)")+
  xlab("genome position (Mb)")+
  theme(axis.text.y = element_text(size=16),
        axis.title.x = element_text(size=20),
        axis.text.x = element_text(size=16),
        axis.title.y = element_text(size=20),
        axis.ticks.x=element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "none")

5. Extract OPR counts around the attL and attR site of p22

plot_data_p$V1 <- as.numeric(plot_data_p$V1)

LT2_P22_attL <- plot_data_p%>%filter(V1>(1213987-1500),V1<(1213987+1500))#Middle gene position if first gene with increased deltaPtoH change of phage region 1 +- 3 insert sizes
LT2_P22_attR <- plot_data_p%>%filter(V1>(1255756-1500),V1<(1255756+1500))#Middle gene position if first gene with increased deltaPtoH change of phage region 1 +- 3 Insert sizes

round(sum(LT2_P22_attL$All_count_C),2)
## [1] 0.04
round(sum(LT2_P22_attR$All_count_C),2)
## [1] 0.04
round(sum(LT2_P22_attL$All_count_I),2)
## [1] 5.6
round(sum(LT2_P22_attR$All_count_I),2)
## [1] 5.6