Angles

Code
library(tidyverse)
library(moments)
library(Hmisc)
library(spatstat)

reference_dir = "../reference_set/"
prosco_dir = "../prosco/"
angles <- list.files(reference_dir,"angle*")

angle_dat <- list()
colnames <- c("type","pdb_id","unk1","chain","resname1","resnum1","atomname1","unk2","resname2","resnum2","atom2","unk3","resname3","resnum3","atom3","unk4","val","target","classification","conformer")

i <- 1
for(file in angles){
  filepath <- paste(reference_dir,file,sep='')
  filename_prefix <- strsplit(file,'[.]')[[1]]
  prosco_filepath <- paste(prosco_dir,"_prosco.csv",sep=filename_prefix)
  angle_dat[[i]] <- list()
  angle_dat[[i]]$filename <- file
  angle_dat[[i]]$dat <- read_csv(filepath,col_names=colnames,show_col_types=FALSE)
  angle_dat[[i]]$prosco <- read_csv(prosco_filepath,show_col_types=FALSE)
  i = i+1
}

for(i in 1:length(angle_dat)){
  codes = list()
  for(id in angle_dat[[i]]$dat$pdb_id){
    codes[id] = sum(angle_dat[[i]]$dat$pdb_id == id)
  }
  angle_dat[[i]]$dat$num_angles = unlist(codes[angle_dat[[i]]$dat$pdb_id])
  angle_dat[[i]]$dat$weight = 1/angle_dat[[i]]$dat$num_angles
}

for(i in 1:length(angle_dat)){
  angle_dat[[i]]$hist <- angle_dat[[i]]$dat %>% ggplot(aes(x=val)) + geom_histogram() + ggtitle(angle_dat[[i]]$filename)
  #angle_dat[[i]]$ecdf <- ecdf(angle_dat[[i]]$dat$val)
  
  angle_dat[[i]]$mean = mean(angle_dat[[i]]$dat$val)
  angle_dat[[i]]$median = median(angle_dat[[i]]$dat$val)
  angle_dat[[i]]$sd = sd(angle_dat[[i]]$dat$val)
  
  angle_dat[[i]]$iqr = IQR(angle_dat[[i]]$dat$val)
  angle_dat[[i]]$mad = mad(angle_dat[[i]]$dat$val)
  
  angle_dat[[i]]$wt_mean = wtd.mean(angle_dat[[i]]$dat$val,angle_dat[[i]]$dat$weight)
  angle_dat[[i]]$wt_sd = sqrt(wtd.var(angle_dat[[i]]$dat$val,angle_dat[[i]]$dat$weight))

  angle_dat[[i]]$skewness = skewness(angle_dat[[i]]$dat$val)
  angle_dat[[i]]$kurtosis = kurtosis(angle_dat[[i]]$dat$val)
  
  preferred <- angle_dat[[i]]$prosco$bcol=="green"
  allowed <- angle_dat[[i]]$prosco$bcol=="yellow"
  ofconcern_or_borderline <- (angle_dat[[i]]$prosco$bcol=="red" | angle_dat[[i]]$prosco$bcol=="orange")
  med = median(angle_dat[[i]]$dat$val)
  outliers_to = angle_dat[[i]]$prosco$to[ofconcern_or_borderline]
  outliers_from = angle_dat[[i]]$prosco$from[ofconcern_or_borderline]
  angle_dat[[i]]$threshold = list(
    mean = mean(angle_dat[[i]]$dat$val),
    median = med,
    min = min(angle_dat[[i]]$prosco$from),
    max = max(angle_dat[[i]]$prosco$to),
    preferred_lower = min(angle_dat[[i]]$prosco$from[preferred]),
    preferred_upper = max(angle_dat[[i]]$prosco$to[preferred]),
    allowed_lower = min(angle_dat[[i]]$prosco$from[allowed]),
    allowed_upper = max(angle_dat[[i]]$prosco$to[allowed]),
    ofconcern_lower = max(outliers_to[outliers_to < med]),
    ofconcern_upper = min(outliers_to[outliers_to > med])
  )
}

idr_to_sd = 1/(2*qnorm(.9))
sigma_levels = c(3,4,5,6)

for(i in 1:length(angle_dat)){
  w_quantiles <- weighted.quantile(angle_dat[[i]]$dat$val, angle_dat[[i]]$dat$weight, probs=c(0.1,0.5,0.9))
  
  idx_lower = angle_dat[[i]]$dat$val <= w_quantiles[2]
  idx_upper = angle_dat[[i]]$dat$val >= w_quantiles[2]
  
  angle_dat[[i]]$idr <- list()
  angle_dat[[i]]$idr$w_quantiles <- w_quantiles
  angle_dat[[i]]$idr$w_median <- w_quantiles[2]
  angle_dat[[i]]$idr$w_idr <- w_quantiles[3] - w_quantiles[1]
  angle_dat[[i]]$idr$w_idr_lower <- 2.0*(w_quantiles[2] - w_quantiles[1])
  angle_dat[[i]]$idr$w_idr_upper <- 2.0*(w_quantiles[3] - w_quantiles[2])
  
  angle_dat[[i]]$idr$w_z_sym_lower <- angle_dat[[i]]$idr$w_median - sigma_levels*idr_to_sd*angle_dat[[i]]$idr$w_idr
  angle_dat[[i]]$idr$w_z_sym_upper <- angle_dat[[i]]$idr$w_median + sigma_levels*idr_to_sd*angle_dat[[i]]$idr$w_idr
  angle_dat[[i]]$idr$w_z_asym_lower <- angle_dat[[i]]$idr$w_median - sigma_levels*idr_to_sd*angle_dat[[i]]$idr$w_idr_lower
  angle_dat[[i]]$idr$w_z_asym_upper <- angle_dat[[i]]$idr$w_median + sigma_levels*idr_to_sd*angle_dat[[i]]$idr$w_idr_upper
  
  outliers = c()
  for(j in 1:length(sigma_levels)){
    outliers = c(outliers, 100*(sum(angle_dat[[i]]$dat$val < angle_dat[[i]]$idr$w_z_asym_lower[j]) + sum(angle_dat[[i]]$dat$val > angle_dat[[i]]$idr$w_z_asym_upper[j]))/(2*length(angle_dat[[i]]$dat$val)) ) 
  }
  angle_dat[[i]]$idr$outliers <- outliers
}

angle_summary = tibble(
  file = unlist(lapply(angle_dat, function(x){x$filename})),

  min = unlist(lapply(angle_dat, function(x){x$threshold$min})),
  max = unlist(lapply(angle_dat, function(x){x$threshold$max})),
  preferred_lower = unlist(lapply(angle_dat, function(x){x$threshold$preferred_lower})),
  preferred_upper = unlist(lapply(angle_dat, function(x){x$threshold$preferred_upper})),
  allowed_lower = unlist(lapply(angle_dat, function(x){x$threshold$allowed_lower})),
  allowed_upper = unlist(lapply(angle_dat, function(x){x$threshold$allowed_upper})),
  ofconcern_lower = unlist(lapply(angle_dat, function(x){x$threshold$ofconcern_lower})),
  ofconcern_upper = unlist(lapply(angle_dat, function(x){x$threshold$ofconcern_upper})),
  
  w_median = unlist(lapply(angle_dat, function(x){x$idr$w_median})),
  w_idr = unlist(lapply(angle_dat, function(x){x$idr$w_idr})),
  w_idr_lower = unlist(lapply(angle_dat, function(x){x$idr$w_idr_lower})),
  w_idr_upper = unlist(lapply(angle_dat, function(x){x$idr$w_idr_upper})),
  
  skewness = unlist(lapply(angle_dat, function(x){x$skewness})),
  kurtosis = unlist(lapply(angle_dat, function(x){x$kurtosis})),
  sarle = ((skewness*skewness)+1)/kurtosis
) %>% 
  mutate(sigma_lower = w_idr_upper*idr_to_sd,
         sigma_upper = w_idr_upper*idr_to_sd) %>% 
  mutate(prosco_lower = (ofconcern_lower - w_median)/sigma_lower,
         prosco_upper = (ofconcern_upper - w_median)/sigma_upper) 

Summary of different Z thresholds over all angle types

Code
is_outlier <- function(x,type) {
  # This controls which extreme values (angle types) are labelled.
  # Note that in the case of overlaps, only one of the overlapping labels will be shown.
  return( (type == '6-sigma' & x > 1.2) | (type == '5-sigma' & x > 1.5) | (type == '4-sigma' & x > 2.5)  | (type == '3-sigma' & x > 4.0) )
}

angle_summary %>% 
  add_column(sigma3 = unlist(lapply(angle_dat, function(x){x$idr$outliers[1]}))) %>%
  add_column(sigma4 = unlist(lapply(angle_dat, function(x){x$idr$outliers[2]}))) %>%
  add_column(sigma5 = unlist(lapply(angle_dat, function(x){x$idr$outliers[3]}))) %>%
  add_column(sigma6 = unlist(lapply(angle_dat, function(x){x$idr$outliers[4]}))) %>%
  pivot_longer(cols=c('sigma3', 'sigma4','sigma5','sigma6'),
                    names_to='Threshold',
                    values_to='outliers') %>%
  mutate(Threshold = recode(Threshold,
                       'sigma3' = '3-sigma',
                       'sigma4' = '4-sigma',
                       'sigma5' = '5-sigma',
                       'sigma6' = '6-sigma')) %>%
  mutate(is_outlier = ifelse(is_outlier(outliers,Threshold), file, as.numeric(NA)) ) %>%
  ggplot(aes(x=Threshold,y=outliers,fill=Threshold)) + geom_boxplot(show.legend=FALSE) + 
  xlab("Z' Threshold") +
  ylab("Percentage of observations outside the Z' threshold") +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  geom_text(aes(label=is_outlier),na.rm=TRUE,nudge_y=0.2,check_overlap=TRUE) +
  labs(title="Percentages of observations more extreme than different Z' thresholds")

Histograms for each angle type, with Z thresholds

Code
for(i in 1:length(angle_dat)){
  print(paste(i,angle_dat[[i]]$filename))
  print(angle_dat[[i]]$hist +
          geom_vline(xintercept=angle_dat[[i]]$idr$w_z_asym_lower[1],lwd=2, color="blue") +
          geom_vline(xintercept=angle_dat[[i]]$idr$w_z_asym_upper[1],lwd=2,color="blue") +
          geom_vline(xintercept=angle_dat[[i]]$idr$w_z_asym_lower[2],lwd=2, color="purple") +
          geom_vline(xintercept=angle_dat[[i]]$idr$w_z_asym_upper[2],lwd=2,color="purple") +
          geom_vline(xintercept=angle_dat[[i]]$idr$w_z_asym_lower[3],lwd=2, color="maroon") +
          geom_vline(xintercept=angle_dat[[i]]$idr$w_z_asym_upper[3],lwd=2,color="maroon") +
          geom_vline(xintercept=angle_dat[[i]]$idr$w_z_asym_lower[4],lwd=2, color="red") +
          geom_vline(xintercept=angle_dat[[i]]$idr$w_z_asym_upper[4],lwd=2,color="red") +
          
          geom_vline(xintercept=angle_dat[[i]]$idr$w_quantiles[1],lwd=1,lty="dashed",color="yellow") +
          geom_vline(xintercept=angle_dat[[i]]$idr$w_quantiles[2],lwd=1,color="yellow") +
          geom_vline(xintercept=angle_dat[[i]]$idr$w_quantiles[3],lwd=1,lty="dashed",color="yellow") +
          xlab("Value") +
          ylab("Count") +
          labs(subtitle=paste(paste("Number of observations:",length(angle_dat[[i]]$dat$val)),
                              #paste("Skewness:",round(angle_dat[[i]]$skewness,3)),
                              #paste("Kurtosis:",round(angle_dat[[i]]$kurtosis,3)),
                              sep="\n"),
               caption=paste("Solid yellow line: Weighted Median",
                             "Dashed yellow lines: 10th and 90th percentiles",
                             "Blue lines: 3-sigma Z' thresholds",
                             "Purple lines: 4-sigma Z' thresholds",
                             "Maroon lines: 5-sigma Z' thresholds",
                             "Red lines: 6-sigma Z' thresholds",
                             sep="\n"))
        )
}
[1] "1 angle_A_C1p_N9_C4.csv"

[1] "2 angle_A_C1p_N9_C8.csv"

[1] "3 angle_A_C2_N3_C4.csv"

[1] "4 angle_A_C4_C5_C6.csv"

[1] "5 angle_A_C4_N9_C8.csv"

[1] "6 angle_A_C5_C4_N9.csv"

[1] "7 angle_A_C5_C6_N1.csv"

[1] "8 angle_A_C6_C5_N7.csv"

[1] "9 angle_A_C6_N1_C2.csv"

[1] "10 angle_A_C8_N7_C5.csv"

[1] "11 angle_A_N1_C2_N3.csv"

[1] "12 angle_A_N3_C4_C5.csv"

[1] "13 angle_A_N3_C4_N9.csv"

[1] "14 angle_A_N6_C6_C5.csv"

[1] "15 angle_A_N6_C6_N1.csv"

[1] "16 angle_A_N7_C5_C4.csv"

[1] "17 angle_A_N9_C1p_C2p.csv"

[1] "18 angle_A_N9_C1p_O4p.csv"

[1] "19 angle_A_N9_C8_N7.csv"

[1] "20 angle_C_C1p_N1_C2.csv"

[1] "21 angle_C_C1p_N1_C6.csv"

[1] "22 angle_C_C2_N3_C4.csv"

[1] "23 angle_C_C4_C5_C6.csv"

[1] "24 angle_C_C5_C6_N1.csv"

[1] "25 angle_C_C6_N1_C2.csv"

[1] "26 angle_C_N1_C1p_C2p.csv"

[1] "27 angle_C_N1_C1p_O4p.csv"

[1] "28 angle_C_N1_C2_N3.csv"

[1] "29 angle_C_N3_C4_C5.csv"

[1] "30 angle_C_N4_C4_C5.csv"

[1] "31 angle_C_N4_C4_N3.csv"

[1] "32 angle_C_O2_C2_N1.csv"

[1] "33 angle_C_O2_C2_N3.csv"

[1] "34 angle_DA_C1p_N9_C4.csv"

[1] "35 angle_DA_C1p_N9_C8.csv"

[1] "36 angle_DA_C2_N3_C4.csv"

[1] "37 angle_DA_C4_C5_C6.csv"

[1] "38 angle_DA_C4_N9_C8.csv"

[1] "39 angle_DA_C5_C4_N9.csv"

[1] "40 angle_DA_C5_C6_N1.csv"

[1] "41 angle_DA_C6_C5_N7.csv"

[1] "42 angle_DA_C6_N1_C2.csv"

[1] "43 angle_DA_C8_N7_C5.csv"

[1] "44 angle_DA_N1_C2_N3.csv"

[1] "45 angle_DA_N3_C4_C5.csv"

[1] "46 angle_DA_N3_C4_N9.csv"

[1] "47 angle_DA_N6_C6_C5.csv"

[1] "48 angle_DA_N6_C6_N1.csv"

[1] "49 angle_DA_N7_C5_C4.csv"

[1] "50 angle_DA_N9_C1p_C2p.csv"

[1] "51 angle_DA_N9_C1p_O4p.csv"

[1] "52 angle_DA_N9_C8_N7.csv"

[1] "53 angle_DC_C1p_N1_C2.csv"

[1] "54 angle_DC_C1p_N1_C6.csv"

[1] "55 angle_DC_C2_N3_C4.csv"

[1] "56 angle_DC_C4_C5_C6.csv"

[1] "57 angle_DC_C5_C6_N1.csv"

[1] "58 angle_DC_C6_N1_C2.csv"

[1] "59 angle_DC_N1_C1p_C2p.csv"

[1] "60 angle_DC_N1_C1p_O4p.csv"

[1] "61 angle_DC_N1_C2_N3.csv"

[1] "62 angle_DC_N3_C4_C5.csv"

[1] "63 angle_DC_N4_C4_C5.csv"

[1] "64 angle_DC_N4_C4_N3.csv"

[1] "65 angle_DC_O2_C2_N1.csv"

[1] "66 angle_DC_O2_C2_N3.csv"

[1] "67 angle_DG_C1p_N9_C4.csv"

[1] "68 angle_DG_C1p_N9_C8.csv"

[1] "69 angle_DG_C2_N3_C4.csv"

[1] "70 angle_DG_C4_C5_C6.csv"

[1] "71 angle_DG_C4_N9_C8.csv"

[1] "72 angle_DG_C5_C4_N9.csv"

[1] "73 angle_DG_C5_C6_N1.csv"

[1] "74 angle_DG_C6_C5_N7.csv"

[1] "75 angle_DG_C6_N1_C2.csv"

[1] "76 angle_DG_C8_N7_C5.csv"

[1] "77 angle_DG_N1_C2_N3.csv"

[1] "78 angle_DG_N2_C2_N1.csv"

[1] "79 angle_DG_N2_C2_N3.csv"

[1] "80 angle_DG_N3_C4_C5.csv"

[1] "81 angle_DG_N3_C4_N9.csv"

[1] "82 angle_DG_N7_C5_C4.csv"

[1] "83 angle_DG_N9_C1p_C2p.csv"

[1] "84 angle_DG_N9_C1p_O4p.csv"

[1] "85 angle_DG_N9_C8_N7.csv"

[1] "86 angle_DG_O6_C6_C5.csv"

[1] "87 angle_DG_O6_C6_N1.csv"

[1] "88 angle_DNA_C1p_C2p_C3p.csv"

[1] "89 angle_DNA_C1p_O4p_C4p.csv"

[1] "90 angle_DNA_C2p_C1p_O4p.csv"

[1] "91 angle_DNA_C2p_C3p_C4p.csv"

[1] "92 angle_DNA_C2p_C3p_O3p.csv"

[1] "93 angle_DNA_C3p_C4p_C5p.csv"

[1] "94 angle_DNA_C3p_C4p_O4p.csv"

[1] "95 angle_DNA_C4p_C3p_O3p.csv"

[1] "96 angle_DNA_C4p_C5p_O5p.csv"

[1] "97 angle_DNA_C5p_C4p_O4p.csv"

[1] "98 angle_DNA_O3pr2_P_O5p.csv"

[1] "99 angle_DNA_OP1_P_O3pr2.csv"

[1] "100 angle_DNA_OP1_P_O5p.csv"

[1] "101 angle_DNA_OP1_P_OP2.csv"

[1] "102 angle_DNA_OP2_P_O3pr2.csv"

[1] "103 angle_DNA_OP2_P_O5p.csv"

[1] "104 angle_DNA_P_O3pr2_C3pr2.csv"

[1] "105 angle_DNA_P_O5p_C5p.csv"

[1] "106 angle_DT_C1p_N1_C2.csv"

[1] "107 angle_DT_C1p_N1_C6.csv"

[1] "108 angle_DT_C2_N3_C4.csv"

[1] "109 angle_DT_C4_C5_C6.csv"

[1] "110 angle_DT_C5_C6_N1.csv"

[1] "111 angle_DT_C6_N1_C2.csv"

[1] "112 angle_DT_C7_C5_C4.csv"

[1] "113 angle_DT_C7_C5_C6.csv"

[1] "114 angle_DT_N1_C1p_C2p.csv"

[1] "115 angle_DT_N1_C1p_O4p.csv"

[1] "116 angle_DT_N1_C2_N3.csv"

[1] "117 angle_DT_N3_C4_C5.csv"

[1] "118 angle_DT_O2_C2_N1.csv"

[1] "119 angle_DT_O2_C2_N3.csv"

[1] "120 angle_DT_O4_C4_C5.csv"

[1] "121 angle_DT_O4_C4_N3.csv"

[1] "122 angle_G_C1p_N9_C4.csv"

[1] "123 angle_G_C1p_N9_C8.csv"

[1] "124 angle_G_C2_N3_C4.csv"

[1] "125 angle_G_C4_C5_C6.csv"

[1] "126 angle_G_C4_N9_C8.csv"

[1] "127 angle_G_C5_C4_N9.csv"

[1] "128 angle_G_C5_C6_N1.csv"

[1] "129 angle_G_C6_C5_N7.csv"

[1] "130 angle_G_C6_N1_C2.csv"

[1] "131 angle_G_C8_N7_C5.csv"

[1] "132 angle_G_N1_C2_N3.csv"

[1] "133 angle_G_N2_C2_N1.csv"

[1] "134 angle_G_N2_C2_N3.csv"

[1] "135 angle_G_N3_C4_C5.csv"

[1] "136 angle_G_N3_C4_N9.csv"

[1] "137 angle_G_N7_C5_C4.csv"

[1] "138 angle_G_N9_C1p_C2p.csv"

[1] "139 angle_G_N9_C1p_O4p.csv"

[1] "140 angle_G_N9_C8_N7.csv"

[1] "141 angle_G_O6_C6_C5.csv"

[1] "142 angle_G_O6_C6_N1.csv"

[1] "143 angle_RNA_C1p_C2p_C3p.csv"

[1] "144 angle_RNA_C1p_C2p_O2p.csv"

[1] "145 angle_RNA_C1p_O4p_C4p.csv"

[1] "146 angle_RNA_C2p_C1p_O4p.csv"

[1] "147 angle_RNA_C2p_C3p_C4p.csv"

[1] "148 angle_RNA_C2p_C3p_O3p.csv"

[1] "149 angle_RNA_C3p_C2p_O2p.csv"

[1] "150 angle_RNA_C3p_C4p_C5p.csv"

[1] "151 angle_RNA_C3p_C4p_O4p.csv"

[1] "152 angle_RNA_C4p_C3p_O3p.csv"

[1] "153 angle_RNA_C4p_C5p_O5p.csv"

[1] "154 angle_RNA_C5p_C4p_O4p.csv"

[1] "155 angle_RNA_O3pr2_P_O5p.csv"

[1] "156 angle_RNA_OP1_P_O3pr2.csv"

[1] "157 angle_RNA_OP1_P_O5p.csv"

[1] "158 angle_RNA_OP1_P_OP2.csv"

[1] "159 angle_RNA_OP2_P_O3pr2.csv"

[1] "160 angle_RNA_OP2_P_O5p.csv"

[1] "161 angle_RNA_P_O3pr2_C3pr2.csv"

[1] "162 angle_RNA_P_O5p_C5p.csv"

[1] "163 angle_U_C1p_N1_C2.csv"

[1] "164 angle_U_C1p_N1_C6.csv"

[1] "165 angle_U_C2_N3_C4.csv"

[1] "166 angle_U_C4_C5_C6.csv"

[1] "167 angle_U_C5_C6_N1.csv"

[1] "168 angle_U_C6_N1_C2.csv"

[1] "169 angle_U_N1_C1p_C2p.csv"

[1] "170 angle_U_N1_C1p_O4p.csv"

[1] "171 angle_U_N1_C2_N3.csv"

[1] "172 angle_U_N3_C4_C5.csv"

[1] "173 angle_U_O2_C2_N1.csv"

[1] "174 angle_U_O2_C2_N3.csv"

[1] "175 angle_U_O4_C4_C5.csv"

[1] "176 angle_U_O4_C4_N3.csv"