This R script shows an analysis example of using outwards-oriented paired-end reads (OPRs), identified using the OPR finder function in the mVIRs package, to identify p22 in S.Tm LT2. In addition to the OPR finder output, the script uses a fragment (insert) coverage file of single-copy marker genes to compute OPR counts per cell. Methodological details are described in in Zünd et al., 2020.
The following packages are required for the analysis:
The script below will install these for you
list.of.packages <- c("tidyverse", "vegan", "igraph")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
lapply(list.of.packages, require, character.only = TRUE)
Create and set working directory
dir.create("~/example/", showWarnings = FALSE)
setwd("~/example/")
Copy the following files, which were included in OPR_analysis_example.zip, into the newly created working directory:
Load marker gene fragment coverage data
The fragment coverage file contains (i) the median fragment (insert) coverage of single copy marker genes and (ii) the length of the genome of S.Tm LT2.
MG_coverage <- read.table("MG_coverage.txt",header=TRUE)
Load FeatureCount data and extract median gene length
The FeatureCount data are used to extract median gene length, which is used to define the distance between breakpoints of genomic coordinates.
Control_1_fcnt <- read.table("Control_1.fcnt", header = TRUE) #load FeatureCount file
bp <- c(seq(0,MG_coverage[1,2],by=(median(Control_1_fcnt$Length)-1)),MG_coverage[1,2]) #define breakpoint of bins with the median gene length as distance between two breakpoints
## Function for strain without plasmid(s)
# x = input file
load_isolate_sample <- function(x){
rr <- read.table(x, sep="\t")
colnames(rr) <- c("Readname","Strain","Insert_size","Orientation_R1","Orientation_R2","BWA_Score","PosR1","PosR2","R1_ALNLENGTH","R2_ALNLENGTH", "INSERT_ORIENTATION")#rename the columns
cc <- rr%>%filter(INSERT_ORIENTATION=="OPR")#filter for paired-end read with OPR characteristics.
cc
}
## Function for community sample or strain with plasmid(s)
# x = input file
# strain = strain within a community or the strain name carrying a plasmid. Not needed in the culture samples
load_community_samples <- function(x,strain){
rr <- read.table(x, sep="\t")
colnames(rr) <- c("Readname","Strain","Insert_size","Orientation_R1", "Orientation_R2","BWA_Score","PosR1","PosR2","R1_ALNLENGTH","R2_ALNLENGTH", "INSERT_ORIENTATION")#rename the columns
cc <- rr%>%filter(INSERT_ORIENTATION=="OPR")#filter for paired-end read with OPR characteristics.
cc <- cc%>%filter(Strain==strain)#filter for reads from specific strain.
cc
}
## Function for OPR data plotting
# input_file = OPR data frame
# MG = Frag_cov data frame
# seqStart = smallest break point
# seqEnd = biggest break kpoint
# i = sample name as string. The sample names must match the row names of the Frag_cov data frame
OPR_graph <- function(input_file,MG,i, seqStart,seqEnd){
input_file$PosR1 <- input_file$PosR1+1 #change that the sequence start at position 1 and not at 0
input_file$PosR2 <- input_file$PosR2+1 #change that the sequence start at position 1 and not at 0
input_file=input_file[input_file[,"Insert_size"]< MG[1,2]-4500,] #filter out OPR pairs with an insert size of roughly the bacterial genome
input_file=input_file[input_file[,"Insert_size"]>1000,] # keep only OPR pairs with an insert size that is lager than 1000bp.
input_file$P1 <- cut(input_file$PosR1, breaks = bp,dig.lab = 7) # use the defined break point to group the read 1 into the binds
P1= as.data.frame(do.call(rbind,lapply(strsplit(sub("\\]","",sub("\\(","",input_file$P1)),","),as.numeric)))# extract start and end of bins from read 1
input_file$P2 <- cut(input_file$PosR2, breaks = bp,dig.lab = 7) # use the defined break point to group the read 2 into the binds
P2= as.data.frame(do.call(rbind,lapply(strsplit(sub("\\]","",sub("\\(","",input_file$P2)),","),as.numeric)))# extract start and end of bin from read 2
Bins<- as.data.frame(cbind(V1=((P1$V2+P1$V1+1)/2),V2=((P2$V2+P2$V1+1)/2))) #calculate the mid-position of each bin and combine coordinates of bins of read 1 and read 2 into a data frame
Length <- unique(as.data.frame(cbind(V1=c(((P1$V2+P1$V1+1)/2),((P2$V2+P2$V1+1)/2)),L1=c((P1$V2-P1$V1+1),(P2$V2-P2$V1+1)))))# calculate length of each bin
Bin_graph <- graph_from_data_frame(Bins, directed = FALSE,vertices = NULL) #build a undirected graph using the mid-bin position of Read 1 and Read 2
edge_bins <- as.data.frame(get.edgelist(Bin_graph),stringsAsFactors = FALSE) #extract edglist of graph
edge_bins$V1 <- as.numeric(edge_bins$V1) #convert egde into number
edge_bins$V2 <- as.numeric(edge_bins$V2) #convert egde into number
edge_bins <- edge_bins%>%filter(V1>seqStart, V2<seqEnd ,V2>seqStart,V1<seqEnd) #filter out edges that are smaller than smallest break point and bigger than biggest break point
edge_conv <- cbind(edge_bins$V2, edge_bins$V1)#make a data frame where bin 2 is in the first columen
all_edge <- rbind(edge_bins, edge_conv)# combine the two data frame to have all edges in the first and second column
edge_count <- all_edge%>%group_by(V1)%>% #count how many times you have each edge in column V1. since we added all the edge of V2 to V1 we only have to count it in one of the columns
summarize(Total_count=sum(count=n()))
edge_count <- left_join(edge_count, Length, by="V1") # merge bins length and edge_count table
edge_count$Total_count<- ((edge_count$Total_count/edge_count$L1)/MG[i,1])# normalize the edge counts by the median coverage of the 10 single-copy marker gene evaluated from the DESeq2 featureCount and by the bin length
colnames(edge_count) <- c("V1",paste(substr(i,1,7),"Count",sep ="_"),"gene_length")#rename column of edge_counts table
return(edge_count)
}
### Use appropriate function:
# “load_isolate_sample”, if the data are from a cultured strain without plasmids or
# “load_community_samples”, if the data are from a community sample or a strain with plasmid(s).
# HERE: “load_community_samples(x=input file, strain=strain name)” because *S*.Tm LT2 carries plasmids
C_1<- load_community_samples("Control_1.opr",strain="SalmonellaLT2")
C_2<- load_community_samples("Control_2.opr",strain="SalmonellaLT2")
C_3<- load_community_samples("Control_3.opr",strain="SalmonellaLT2")
I_1<- load_community_samples("Induced_1.opr",strain="SalmonellaLT2")
I_2<- load_community_samples("Induced_2.opr",strain="SalmonellaLT2")
I_3<- load_community_samples("Induced_3.opr",strain="SalmonellaLT2")
# If interested in all OPRs at any genome position, the seqStart should be set to 1 and the seqEnd should represent the genome length. Here, the genome length can be extracted from “MG_coverage.txt” file.
Con_1 <- OPR_graph(C_1,MG=MG_coverage,i="Control_1",seqStart = 1, seqEnd = MG_coverage[1,2])
Con_2 <- OPR_graph(C_2,MG=MG_coverage,i="Control_2",seqStart = 1, seqEnd = MG_coverage[1,2])
Con_3 <- OPR_graph(C_3,MG=MG_coverage,i="Control_3",seqStart = 1, seqEnd = MG_coverage[1,2])
Ind_1 <- OPR_graph(I_1,MG=MG_coverage,i="Induced_1",seqStart = 1, seqEnd = MG_coverage[1,2])
Ind_2 <- OPR_graph(I_2,MG=MG_coverage,i="Induced_2",seqStart = 1, seqEnd = MG_coverage[1,2])
Ind_3 <- OPR_graph(I_3,MG=MG_coverage,i="Induced_3",seqStart = 1, seqEnd = MG_coverage[1,2])
# Calculate average OPR count within each bin
# Merge all control samples and calculate the average OPR count per bin
Control <- bind_rows(Con_1,Con_2,Con_3)%>% #combine all the control samples into one data frame
group_by(V1)%>% #group all bins with the same position
summarize(All_count_C=sum(Control_Count)/3) #calculate the average read count per bin
# Merge all induced samples and calculate the average OPR count per bin
Induced <- bind_rows(Ind_1,Ind_2, Ind_3)%>% #combine all the induced samples into one data frame
group_by(V1)%>% #group all bins with the same position
summarize(All_count_I=sum(Induced_Count)/3) #calculate the average read count per bins
# Plot average ORP count per genome position
plot_data <- full_join(Control,Induced, by="V1")
plot_data[is.na(plot_data)] <- 0 #replace NA by 0
ggplot(data=plot_data)+
geom_col(aes(x=(as.numeric(V1)/1000000), y=All_count_I,fill="#fb8072"),width=0.004)+
geom_col(aes(x=(as.numeric(V1)/1000000), y=All_count_C,fill="#988699"),width=0.004)+
theme_light()+
scale_fill_manual(values=c( "#988699","#fb8072"))+
theme(legend.position ="none",
plot.title = element_text(hjust = 0.5, size = 16))+
ylab("OPRs (per cell)")+
xlab("genome position (Mb)")+
theme(axis.text.y = element_text(size=16),
axis.title.x = element_text(size=20),
axis.text.x = element_text(size=16),
axis.title.y = element_text(size=20),
axis.ticks.x=element_blank(),
panel.grid.minor = element_blank(),
legend.position = "none")
# Plot only OPRs for which both reads are located within a sub-section of the genome
# The example of the prophage p22 is shown
# The host genome region of interest is defined by seqStart and seqEnd
Con_p1 <- OPR_graph(C_1,MG=MG_coverage,i="Control_1",seqStart=1210000,seqEnd=1270000)
Con_p2 <- OPR_graph(C_2,MG=MG_coverage,i="Control_2",seqStart=1210000,seqEnd=1270000)
Con_p3 <- OPR_graph(C_3,MG=MG_coverage,i="Control_3",seqStart=1210000,seqEnd=1270000)
Ind_p1 <- OPR_graph(I_1,MG=MG_coverage,i="Induced_1",seqStart=1210000,seqEnd=1270000)
Ind_p2 <- OPR_graph(I_2,MG=MG_coverage,i="Induced_2",seqStart=1210000,seqEnd=1270000)
Ind_p3 <- OPR_graph(I_3,MG=MG_coverage,i="Induced_3",seqStart=1210000,seqEnd=1270000)
# Merge all control samples and calculate the average OPR count per bin
Control_p <- bind_rows(Con_p1,Con_p2,Con_p3)%>% #combine all the control samples into one data frame
group_by(V1)%>% #group all bins with the same position
summarize(All_count_C=sum(Control_Count)/3) #calculate the average read count per bin
# Merge all induced samples and calculate the average OPR count per bin
Induced_p <- bind_rows(Ind_p1,Ind_p2, Ind_p3)%>% #combine all the induced samples into one data frame
group_by(V1)%>% #group all bins with the same position.
summarize(All_count_I=sum(Induced_Count)/3) #calculate the average read count per bin
plot_data_p <- full_join(Control_p,Induced_p, by="V1")
plot_data_p[is.na(plot_data_p)] <- 0 #replace NA by 0
ggplot(data=plot_data_p)+
geom_col(aes(x=(as.numeric(V1)/1000000), y=All_count_I,fill="#fb8072"),width=0.0008)+
geom_col(aes(x=(as.numeric(V1)/1000000), y=All_count_C,fill="#988699"),width=0.0008)+
theme_light()+
scale_fill_manual(values=c( "#988699","#fb8072"))+
theme(legend.position ="none",
plot.title = element_text(hjust = 0.5, size = 16))+
ylab("OPRs (per cell)")+
xlab("genome position (Mb)")+
theme(axis.text.y = element_text(size=16),
axis.title.x = element_text(size=20),
axis.text.x = element_text(size=16),
axis.title.y = element_text(size=20),
axis.ticks.x=element_blank(),
panel.grid.minor = element_blank(),
legend.position = "none")
plot_data_p$V1 <- as.numeric(plot_data_p$V1)
LT2_P22_attL <- plot_data_p%>%filter(V1>(1213987-1500),V1<(1213987+1500))#Middle gene position if first gene with increased deltaPtoH change of phage region 1 +- 3 insert sizes
LT2_P22_attR <- plot_data_p%>%filter(V1>(1255756-1500),V1<(1255756+1500))#Middle gene position if first gene with increased deltaPtoH change of phage region 1 +- 3 Insert sizes
round(sum(LT2_P22_attL$All_count_C),2)
## [1] 0.04
round(sum(LT2_P22_attR$All_count_C),2)
## [1] 0.04
round(sum(LT2_P22_attL$All_count_I),2)
## [1] 5.6
round(sum(LT2_P22_attR$All_count_I),2)
## [1] 5.6