Bonds

Code
library(tidyverse)
library(moments)
library(Hmisc)
library(spatstat)

reference_dir = "../reference_set/"
prosco_dir = "../prosco/"
bonds <- list.files(reference_dir,"bond*")

bond_dat <- list()
colnames <- c("type","pdb_id","unk1","chain","resname1","resnum1","atomname1","unk2","resname2","resnum2","atom2","unk3","val","target","classification","conformer")

i <- 1
for(file in bonds){
  filepath <- paste(reference_dir,file,sep='')
  filename_prefix <- strsplit(file,'[.]')[[1]]
  prosco_filepath <- paste(prosco_dir,"_prosco.csv",sep=filename_prefix)
  bond_dat[[i]] <- list()
  bond_dat[[i]]$filename <- file
  bond_dat[[i]]$dat <- read_csv(filepath,col_names=colnames,show_col_types=FALSE)
  bond_dat[[i]]$prosco <- read_csv(prosco_filepath,show_col_types=FALSE)
  i = i+1
}

for(i in 1:length(bond_dat)){
  codes = list()
  for(id in bond_dat[[i]]$dat$pdb_id){
    codes[id] = sum(bond_dat[[i]]$dat$pdb_id == id)
  }
  bond_dat[[i]]$dat$num_bonds = unlist(codes[bond_dat[[i]]$dat$pdb_id])
  bond_dat[[i]]$dat$weight = 1/bond_dat[[i]]$dat$num_bonds
}

for(i in 1:length(bond_dat)){
  bond_dat[[i]]$hist <- bond_dat[[i]]$dat %>% ggplot(aes(x=val)) + geom_histogram() + ggtitle(bond_dat[[i]]$filename)
  #bond_dat[[i]]$ecdf <- ecdf(bond_dat[[i]]$dat$val)
  
  bond_dat[[i]]$mean = mean(bond_dat[[i]]$dat$val)
  bond_dat[[i]]$median = median(bond_dat[[i]]$dat$val)
  bond_dat[[i]]$sd = sd(bond_dat[[i]]$dat$val)
  
  bond_dat[[i]]$iqr = IQR(bond_dat[[i]]$dat$val)
  bond_dat[[i]]$mad = mad(bond_dat[[i]]$dat$val)
  
  bond_dat[[i]]$wt_mean = wtd.mean(bond_dat[[i]]$dat$val,bond_dat[[i]]$dat$weight)
  bond_dat[[i]]$wt_sd = sqrt(wtd.var(bond_dat[[i]]$dat$val,bond_dat[[i]]$dat$weight))

  bond_dat[[i]]$skewness = skewness(bond_dat[[i]]$dat$val)
  bond_dat[[i]]$kurtosis = kurtosis(bond_dat[[i]]$dat$val)
  
  preferred <- bond_dat[[i]]$prosco$bcol=="green"
  allowed <- bond_dat[[i]]$prosco$bcol=="yellow"
  ofconcern_or_borderline <- (bond_dat[[i]]$prosco$bcol=="red" | bond_dat[[i]]$prosco$bcol=="orange")
  med = median(bond_dat[[i]]$dat$val)
  outliers_to = bond_dat[[i]]$prosco$to[ofconcern_or_borderline]
  outliers_from = bond_dat[[i]]$prosco$from[ofconcern_or_borderline]
  bond_dat[[i]]$threshold = list(
    mean = mean(bond_dat[[i]]$dat$val),
    median = med,
    min = min(bond_dat[[i]]$prosco$from),
    max = max(bond_dat[[i]]$prosco$to),
    preferred_lower = min(bond_dat[[i]]$prosco$from[preferred]),
    preferred_upper = max(bond_dat[[i]]$prosco$to[preferred]),
    allowed_lower = min(bond_dat[[i]]$prosco$from[allowed]),
    allowed_upper = max(bond_dat[[i]]$prosco$to[allowed]),
    ofconcern_lower = max(outliers_to[outliers_to < med]),
    ofconcern_upper = min(outliers_to[outliers_to > med])
  )
}

idr_to_sd = 1/(2*qnorm(.9))
sigma_levels = c(3,4,5,6)

for(i in 1:length(bond_dat)){
  w_quantiles <- weighted.quantile(bond_dat[[i]]$dat$val, bond_dat[[i]]$dat$weight, probs=c(0.1,0.5,0.9))
  
  idx_lower = bond_dat[[i]]$dat$val <= w_quantiles[2]
  idx_upper = bond_dat[[i]]$dat$val >= w_quantiles[2]
  
  bond_dat[[i]]$idr <- list()
  bond_dat[[i]]$idr$w_quantiles <- w_quantiles
  bond_dat[[i]]$idr$w_median <- w_quantiles[2]
  bond_dat[[i]]$idr$w_idr <- w_quantiles[3] - w_quantiles[1]
  bond_dat[[i]]$idr$w_idr_lower <- 2.0*(w_quantiles[2] - w_quantiles[1])
  bond_dat[[i]]$idr$w_idr_upper <- 2.0*(w_quantiles[3] - w_quantiles[2])
  
  bond_dat[[i]]$idr$w_z_sym_lower <- bond_dat[[i]]$idr$w_median - sigma_levels*idr_to_sd*bond_dat[[i]]$idr$w_idr
  bond_dat[[i]]$idr$w_z_sym_upper <- bond_dat[[i]]$idr$w_median + sigma_levels*idr_to_sd*bond_dat[[i]]$idr$w_idr
  bond_dat[[i]]$idr$w_z_asym_lower <- bond_dat[[i]]$idr$w_median - sigma_levels*idr_to_sd*bond_dat[[i]]$idr$w_idr_lower
  bond_dat[[i]]$idr$w_z_asym_upper <- bond_dat[[i]]$idr$w_median + sigma_levels*idr_to_sd*bond_dat[[i]]$idr$w_idr_upper
  
  outliers = c()
  for(j in 1:length(sigma_levels)){
    outliers = c(outliers, 100*(sum(bond_dat[[i]]$dat$val < bond_dat[[i]]$idr$w_z_asym_lower[j]) + sum(bond_dat[[i]]$dat$val > bond_dat[[i]]$idr$w_z_asym_upper[j]))/(2*length(bond_dat[[i]]$dat$val)) ) 
  }
  bond_dat[[i]]$idr$outliers <- outliers
}

bond_summary = tibble(
  file = unlist(lapply(bond_dat, function(x){x$filename})),

  min = unlist(lapply(bond_dat, function(x){x$threshold$min})),
  max = unlist(lapply(bond_dat, function(x){x$threshold$max})),
  preferred_lower = unlist(lapply(bond_dat, function(x){x$threshold$preferred_lower})),
  preferred_upper = unlist(lapply(bond_dat, function(x){x$threshold$preferred_upper})),
  allowed_lower = unlist(lapply(bond_dat, function(x){x$threshold$allowed_lower})),
  allowed_upper = unlist(lapply(bond_dat, function(x){x$threshold$allowed_upper})),
  ofconcern_lower = unlist(lapply(bond_dat, function(x){x$threshold$ofconcern_lower})),
  ofconcern_upper = unlist(lapply(bond_dat, function(x){x$threshold$ofconcern_upper})),
  
  w_median = unlist(lapply(bond_dat, function(x){x$idr$w_median})),
  w_idr = unlist(lapply(bond_dat, function(x){x$idr$w_idr})),
  w_idr_lower = unlist(lapply(bond_dat, function(x){x$idr$w_idr_lower})),
  w_idr_upper = unlist(lapply(bond_dat, function(x){x$idr$w_idr_upper})),
  
  skewness = unlist(lapply(bond_dat, function(x){x$skewness})),
  kurtosis = unlist(lapply(bond_dat, function(x){x$kurtosis})),
  sarle = ((skewness*skewness)+1)/kurtosis
) %>% 
  mutate(sigma_lower = w_idr_upper*idr_to_sd,
         sigma_upper = w_idr_upper*idr_to_sd) %>% 
  mutate(prosco_lower = (ofconcern_lower - w_median)/sigma_lower,
         prosco_upper = (ofconcern_upper - w_median)/sigma_upper) 

Summary of different Z thresholds over all bond types

Code
is_outlier <- function(x,type) {
  # This controls which extreme values (bond types) are labelled.
  return( (type == '6-sigma' & x > 0.51) | (type == '5-sigma' & x > 0.9) | (type == '4-sigma' & x > 1.5)  | (type == '3-sigma' & x > 2.7) )
}

bond_summary %>% 
  add_column(sigma3 = unlist(lapply(bond_dat, function(x){x$idr$outliers[1]}))) %>%
  add_column(sigma4 = unlist(lapply(bond_dat, function(x){x$idr$outliers[2]}))) %>%
  add_column(sigma5 = unlist(lapply(bond_dat, function(x){x$idr$outliers[3]}))) %>%
  add_column(sigma6 = unlist(lapply(bond_dat, function(x){x$idr$outliers[4]}))) %>%
  pivot_longer(cols=c('sigma3', 'sigma4','sigma5','sigma6'),
                    names_to='Threshold',
                    values_to='outliers') %>%
  mutate(Threshold = recode(Threshold,
                       'sigma3' = '3-sigma',
                       'sigma4' = '4-sigma',
                       'sigma5' = '5-sigma',
                       'sigma6' = '6-sigma')) %>%
  mutate(is_outlier = ifelse(is_outlier(outliers,Threshold), file, as.numeric(NA)) ) %>%
  ggplot(aes(x=Threshold,y=outliers,fill=Threshold)) + geom_boxplot(show.legend=FALSE) + 
  xlab("Z' Threshold") +
  ylab("Percentage of observations outside the Z' threshold") +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  geom_text(aes(label=is_outlier),na.rm=TRUE,nudge_y=0.11) +
  labs(title="Percentages of observations more extreme than different Z' thresholds")

Histograms for each bond type, with Z thresholds

Code
for(i in 1:length(bond_dat)){
  print(paste(i,bond_dat[[i]]$filename))
  print(bond_dat[[i]]$hist +
          geom_vline(xintercept=bond_dat[[i]]$idr$w_z_asym_lower[1],lwd=2, color="blue") +
          geom_vline(xintercept=bond_dat[[i]]$idr$w_z_asym_upper[1],lwd=2,color="blue") +
          geom_vline(xintercept=bond_dat[[i]]$idr$w_z_asym_lower[2],lwd=2, color="purple") +
          geom_vline(xintercept=bond_dat[[i]]$idr$w_z_asym_upper[2],lwd=2,color="purple") +
          geom_vline(xintercept=bond_dat[[i]]$idr$w_z_asym_lower[3],lwd=2, color="maroon") +
          geom_vline(xintercept=bond_dat[[i]]$idr$w_z_asym_upper[3],lwd=2,color="maroon") +
          geom_vline(xintercept=bond_dat[[i]]$idr$w_z_asym_lower[4],lwd=2, color="red") +
          geom_vline(xintercept=bond_dat[[i]]$idr$w_z_asym_upper[4],lwd=2,color="red") +
          
          geom_vline(xintercept=bond_dat[[i]]$idr$w_quantiles[1],lwd=1,lty="dashed",color="yellow") +
          geom_vline(xintercept=bond_dat[[i]]$idr$w_quantiles[2],lwd=1,color="yellow") +
          geom_vline(xintercept=bond_dat[[i]]$idr$w_quantiles[3],lwd=1,lty="dashed",color="yellow") +
          xlab("Value") +
          ylab("Count") +
          labs(subtitle=paste(paste("Number of observations:",length(bond_dat[[i]]$dat$val)),
                              #paste("Skewness:",round(bond_dat[[i]]$skewness,3)),
                              #paste("Kurtosis:",round(bond_dat[[i]]$kurtosis,3)),
                              sep="\n"),
               caption=paste("Solid yellow line: Weighted Median",
                             "Dashed yellow lines: 10th and 90th percentiles",
                             "Blue lines: 3-sigma Z' thresholds",
                             "Purple lines: 4-sigma Z' thresholds",
                             "Maroon lines: 5-sigma Z' thresholds",
                             "Red lines: 6-sigma Z' thresholds",
                             sep="\n"))
        )
}
[1] "1 bond_A_C1p_N9.csv"

[1] "2 bond_A_C2_N3.csv"

[1] "3 bond_A_C4_C5.csv"

[1] "4 bond_A_C5_C6.csv"

[1] "5 bond_A_C5_N7.csv"

[1] "6 bond_A_C6_N1.csv"

[1] "7 bond_A_C6_N6.csv"

[1] "8 bond_A_C8_N9.csv"

[1] "9 bond_A_N1_C2.csv"

[1] "10 bond_A_N3_C4.csv"

[1] "11 bond_A_N7_C8.csv"

[1] "12 bond_A_N9_C4.csv"

[1] "13 bond_C_C1p_N1.csv"

[1] "14 bond_C_C2_N3.csv"

[1] "15 bond_C_C2_O2.csv"

[1] "16 bond_C_C4_C5.csv"

[1] "17 bond_C_C4_N4.csv"

[1] "18 bond_C_C5_C6.csv"

[1] "19 bond_C_C6_N1.csv"

[1] "20 bond_C_N1_C2.csv"

[1] "21 bond_C_N3_C4.csv"

[1] "22 bond_DA_C1p_N9.csv"

[1] "23 bond_DA_C2_N3.csv"

[1] "24 bond_DA_C4_C5.csv"

[1] "25 bond_DA_C5_C6.csv"

[1] "26 bond_DA_C5_N7.csv"

[1] "27 bond_DA_C6_N1.csv"

[1] "28 bond_DA_C6_N6.csv"

[1] "29 bond_DA_C8_N9.csv"

[1] "30 bond_DA_N1_C2.csv"

[1] "31 bond_DA_N3_C4.csv"

[1] "32 bond_DA_N7_C8.csv"

[1] "33 bond_DA_N9_C4.csv"

[1] "34 bond_DC_C1p_N1.csv"

[1] "35 bond_DC_C2_N3.csv"

[1] "36 bond_DC_C2_O2.csv"

[1] "37 bond_DC_C4_C5.csv"

[1] "38 bond_DC_C4_N4.csv"

[1] "39 bond_DC_C5_C6.csv"

[1] "40 bond_DC_C6_N1.csv"

[1] "41 bond_DC_N1_C2.csv"

[1] "42 bond_DC_N3_C4.csv"

[1] "43 bond_DG_C1p_N9.csv"

[1] "44 bond_DG_C2_N2.csv"

[1] "45 bond_DG_C2_N3.csv"

[1] "46 bond_DG_C4_C5.csv"

[1] "47 bond_DG_C5_C6.csv"

[1] "48 bond_DG_C5_N7.csv"

[1] "49 bond_DG_C6_N1.csv"

[1] "50 bond_DG_C6_O6.csv"

[1] "51 bond_DG_C8_N9.csv"

[1] "52 bond_DG_N1_C2.csv"

[1] "53 bond_DG_N3_C4.csv"

[1] "54 bond_DG_N7_C8.csv"

[1] "55 bond_DG_N9_C4.csv"

[1] "56 bond_DNA_C1p_C2p.csv"

[1] "57 bond_DNA_C1p_O4p.csv"

[1] "58 bond_DNA_C2p_C3p.csv"

[1] "59 bond_DNA_C3p_C4p.csv"

[1] "60 bond_DNA_C4p_C5p.csv"

[1] "61 bond_DNA_C4p_O4p.csv"

[1] "62 bond_DNA_O3p_C3p.csv"

[1] "63 bond_DNA_O3pr2_P.csv"

[1] "64 bond_DNA_O5p_C5p.csv"

[1] "65 bond_DNA_O5p_P.csv"

[1] "66 bond_DNA_OP1_P.csv"

[1] "67 bond_DNA_OP2_P.csv"

[1] "68 bond_DT_C1p_N1.csv"

[1] "69 bond_DT_C2_N3.csv"

[1] "70 bond_DT_C2_O2.csv"

[1] "71 bond_DT_C4_C5.csv"

[1] "72 bond_DT_C4_O4.csv"

[1] "73 bond_DT_C5_C6.csv"

[1] "74 bond_DT_C6_N1.csv"

[1] "75 bond_DT_C7_C5.csv"

[1] "76 bond_DT_N1_C2.csv"

[1] "77 bond_DT_N3_C4.csv"

[1] "78 bond_G_C1p_N9.csv"

[1] "79 bond_G_C2_N2.csv"

[1] "80 bond_G_C2_N3.csv"

[1] "81 bond_G_C4_C5.csv"

[1] "82 bond_G_C5_C6.csv"

[1] "83 bond_G_C5_N7.csv"

[1] "84 bond_G_C6_N1.csv"

[1] "85 bond_G_C6_O6.csv"

[1] "86 bond_G_C8_N9.csv"

[1] "87 bond_G_N1_C2.csv"

[1] "88 bond_G_N3_C4.csv"

[1] "89 bond_G_N7_C8.csv"

[1] "90 bond_G_N9_C4.csv"

[1] "91 bond_RNA_C1p_C2p.csv"

[1] "92 bond_RNA_C1p_O4p.csv"

[1] "93 bond_RNA_C2p_C3p.csv"

[1] "94 bond_RNA_C2p_O2p.csv"

[1] "95 bond_RNA_C3p_C4p.csv"

[1] "96 bond_RNA_C4p_C5p.csv"

[1] "97 bond_RNA_C4p_O4p.csv"

[1] "98 bond_RNA_O3p_C3p.csv"

[1] "99 bond_RNA_O3pr2_P.csv"

[1] "100 bond_RNA_O5p_C5p.csv"

[1] "101 bond_RNA_O5p_P.csv"

[1] "102 bond_RNA_OP1_P.csv"

[1] "103 bond_RNA_OP2_P.csv"

[1] "104 bond_U_C1p_N1.csv"

[1] "105 bond_U_C2_N3.csv"

[1] "106 bond_U_C2_O2.csv"

[1] "107 bond_U_C4_C5.csv"

[1] "108 bond_U_C4_O4.csv"

[1] "109 bond_U_C5_C6.csv"

[1] "110 bond_U_C6_N1.csv"

[1] "111 bond_U_N1_C2.csv"

[1] "112 bond_U_N3_C4.csv"