Compilation date: 18.11.2020
here_r = function (...) here::here("Statistics", "R", ...)
here_lab_data = function (...) here::here("Lab_paper2_data", ...)
here_out = function (...) here::here(
"Lab_paper2_output", "lab_figures_yannik_DZ", ...)
dir.create(here_out(), showWarnings = FALSE)
library(magrittr)
library(ggplot2)
library(cowplot)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load settings
source(here_r("setup.R"))
source(here_r("functions.R"))
# Load half violin package
source(here_r("RainCloudPlots", "R_rainclouds.R"))
# Load data
# Single data contains only one blood sample per participant
single_data = read.csv(here_lab_data(
"Global_Data", "Final_Lab_Single_20200904.csv"), stringsAsFactors = F)
###############################################################################
# Clean data
data = single_data
# Unique labels
data[data=="Positiv" | data=="pos." | data=="reactive" |
data=="positive"] = "Positive"
data[data=="Negativ" | data=="Indeterminate" | data=="neg." | data=="ind." |
data=="nonreactive" | data=="negative"] = "Negative"
# Remove Roche 0 values
val_cols = grep("roche_COI", colnames(data), value=T)
res_cols = grep("roche_Interpretation", colnames(data), value=T)
cat("# Roche == 0 before removal:", sum(data[,val_cols]==0, na.rm=T), "\n")
## # Roche == 0 before removal: 5
for (j in 1:length(val_cols)) {
# Conveniently, the order is the same
val_col = val_cols[[j]]
res_col = res_cols[[j]]
data[data[,val_col]==0 & !is.na(data[,val_col]), res_col] = NA
data[data[,val_col]==0 & !is.na(data[,val_col]), val_col] = NA
}
# Add binary ground truth column
data['Ground truth'] = ifelse(
is.na(data$model_outcome), "Unknown",
ifelse(data$model_outcome=="true_positive", "True-positive",
"True-negative"))
data$`Ground truth` = factor(
as.character(data$`Ground truth`),
levels = c("Unknown", "True-negative", "True-positive"))
# NT to categories with correct ordering
data$NT = factor(as.character(data$NT),
levels = c("<10", "10", "20", "40", ">80"))
###############################################################################
# Check data consistency
# Assertions
if (length(unique(single_data$blut_ID)) != nrow(single_data)) {
stop("Blood id not unique")
}
if (length(unique(single_data$tln_ID)) != nrow(single_data)) {
stop("Participant id not unique")
}
#' Check consistency of reported values according to assumed thresholds
check_consistency = function(val_cols, res_cols, threshold) {
for (j in 1:length(val_cols)) {
# Assuming that value and result columns have the same order
val_col = val_cols[[j]]
res_col = res_cols[[j]]
by_value = sum(data[,val_col]>=threshold, na.rm=T)
by_cat = sum(data[,res_col] == "Positive", na.rm=T)
if (by_value != by_cat) {
stop(paste("Mismatch positives (exp./act.)", by_value, by_cat))
}
by_value = sum(data[,val_col]<threshold, na.rm=T)
by_cat = sum(data[,res_col] == "Negative", na.rm=T)
if (by_value != by_cat) {
stop(paste("Mismatch negatives (exp./act.)", by_value, by_cat))
}
}
}
# Roche
val_cols = grep("roche_COI", colnames(data), value=T)
res_cols = grep("roche_Interpretation", colnames(data), value=T)
check_consistency(val_cols, res_cols, 1.0)
if (any(data[,val_cols]==0, na.rm=T)) {
stop("There are Roche zero values.")
}
# Euroimmun
val_cols = grep("eur_quotient", colnames(data), value=T)
res_cols = grep("eur_der_test_result", colnames(data), value=T)
check_consistency(val_cols, res_cols, 1.1)
if (any(data[,val_cols]==0, na.rm=T)) {
stop("There are Euroimmun zero values.")
}
# Use corrected data frame
single_data = data
###############################################################################
# Some statistics
cat("# rows:", nrow(single_data), "\n")
## # rows: 6665
cat("# columns:", ncol(single_data), "\n")
## # columns: 91
cat("# Ground truths:", sum(!is.na(single_data$model_outcome)),
sum(single_data$`Ground truth`!="Unknown"), "\n")
## # Ground truths: 1284 1284
cat("# True-positive:", sum(single_data$`Ground truth`=="True-positive"), "\n")
## # True-positive: 193
cat("# True-negative:", sum(single_data$`Ground truth`=="True-negative"), "\n")
## # True-negative: 1091
# Tidy up
rm(data, val_cols, res_cols, val_col, res_col, j)
data = extract_last_values(single_data)
data_long = data %>% tidyr::gather(Array, Concentration, columns$main)
data_long$Concentration = as.numeric(data_long$Concentration)
# For plotting, we need the array IDs as numbers
for (i in 1:length(columns$main)) {
data_long[data_long$Array==columns$main[i] &
!is.na(data_long$Array),"Array_i"] = as.character(i)
}
# Cutoffs
for (col in columns$main) {
data_long[data_long$Array==col,"Old_cutoff"] = cutoff$old[col]
data_long[data_long$Array==col,"New_cutoff"] = cutoff$new[col]
}
# German
data_long = data_long %>% dplyr::rename(
"Korrekte Klassifikation"="Ground truth")
data_long$`Korrekte Klassifikation` = as.character(data_long$`Korrekte Klassifikation`)
data_long[data_long=="Unknown"] = "Unbekannt"
data_long[data_long=="True-negative"] = "Negativ"
data_long[data_long=="True-positive"] = "Positiv"
data_long$`Korrekte Klassifikation` = factor(
data_long$`Korrekte Klassifikation`, levels=c("Unbekannt", "Negativ", "Positiv"))
plt_spaghetti_main = ggplot(data=data_long,
aes(y=Concentration, x=Array_i, group=tln_ID,
color=`Korrekte Klassifikation`)) +
geom_line(alpha = 0.07) + geom_point(alpha = 0.5) +
# Horizontal lines for decision thresholds
geom_segment(data=data_long[!duplicated(data_long["Array_i"]),],
color=style$thr_lc_old,
aes(x=as.numeric(Array_i)-0.5, y=as.numeric(Old_cutoff),
xend=as.numeric(Array_i)+0.5, yend=as.numeric(Old_cutoff),
linetype="Hersteller")) +
geom_segment(data=data_long[!duplicated(data_long["Array_i"]),],
color=style$thr_lc_new,
aes(x=as.numeric(Array_i)-0.5, y=as.numeric(New_cutoff),
xend=as.numeric(Array_i)+0.5, yend=as.numeric(New_cutoff),
linetype="Optimiert")) +
scale_linetype_manual(
name="Schwellenwert", values=c(style$thr_lt_old,style$thr_lt_new)) +
# Decoration
scale_x_discrete(labels=columns_pretty$main) + scale_y_log10() +
scale_fill_manual(values=style$pal3) +
scale_color_manual(values=style$pal3) +
labs(x="", y="Messwert")
plt_spaghetti_main
## Warning: Removed 44 row(s) containing missing values (geom_path).
## Warning: Removed 44 rows containing missing values (geom_point).
load(here_r("Abbildung-5.RData"))
plot_grid(
roche.igg, get_legend(plt_spaghetti_main),
nrow=1, rel_widths = c(3, 1)
)
## Warning: Removed 44 row(s) containing missing values (geom_path).
## Warning: Removed 44 rows containing missing values (geom_point).
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 29 rows containing non-finite values (stat_cor).
## Warning: Removed 11 rows containing missing values (geom_point).
## Warning: Removed 18 rows containing missing values (geom_point).
ggsave(here_out("scatter_roche_igg.png"), width=7, height=4, dpi=720)
ggsave(here_out("scatter_roche_igg.pdf"), width=7, height=4, dpi=720)
data = single_data
# Get recovery rates
data_cts = to_numeric(data)
cols = c("cPass", "NT")
rec_old = get_recovery_rates(
data_cts, cols, cutoffs=cutoff$old, cutoff_label="Old")
rec_new = get_recovery_rates(
data_cts, cols, cutoffs=cutoff$new, cutoff_label="New")
rownames(rec_old) = rec_old$Method
rownames(rec_new) = rec_new$Method
# cPass
# German
data = data %>% dplyr::rename(
"Korrekte Klassifikation"="Ground truth")
data$`Korrekte Klassifikation` = as.character(data$`Korrekte Klassifikation`)
data[data=="Unknown"] = "Unbekannt"
data[data=="True-negative"] = "Negativ"
data[data=="True-positive"] = "Positiv"
data$`Korrekte Klassifikation` = factor(
data$`Korrekte Klassifikation`, levels=c("Unbekannt", "Negativ", "Positiv"))
plt_cpass = ggplot(data,
aes(x=cPass, fill=`Korrekte Klassifikation`, color=`Korrekte Klassifikation`)) +
geom_histogram(alpha=0.5, position="stack") +
# Thresholds
geom_vline(aes(xintercept=cutoff$old$cPass, linetype="Hersteller"),
color=style$thr_lc_old) +
geom_vline(aes(xintercept=cutoff$new$cPass, linetype="Optimiert"),
color=style$thr_lc_new) +
# Use linetype aesthetic to create a separate legend
scale_linetype_manual(
name="Schwellenwert", values=c(style$thr_lt_old,style$thr_lt_new)) +
annotate("text", size=3, color=style$col_truepos, hjust="center",
x=cutoff$old$cPass + 10, y=35,
label=sprintf("%.0f%%", rec_new["cPass", "frac_pos_rec"] * 100)) +
annotate("text", size=3, color=style$col_trueneg, hjust="center",
x=cutoff$old$cPass - 30, y=35,
label=sprintf("%.0f%%", rec_new["cPass", "frac_neg_rec"] * 100)) +
annotate("text", size=3, color=style$col_truepos, hjust="center",
x=cutoff$old$cPass + 10, y=30,
label=sprintf("(%.0f%%)", rec_old["cPass", "frac_pos_rec"] * 100)) +
annotate("text", size=3, color=style$col_trueneg, hjust="center",
x=cutoff$old$cPass - 30, y=30,
label=sprintf("(%.0f%%)", rec_old["cPass", "frac_neg_rec"] * 100)) +
labs(x=paste0(
columns_pretty$cpass,
" (n=", sum(!is.na(data$cPass)), ")"), y="Anzahl") +
scale_fill_manual(values=style$pal3) + scale_color_manual(values=style$pal3)
plt_cpass
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6310 rows containing non-finite values (stat_bin).
rm(data, data_cts, rec_old, rec_new, cols)
CAPTION: Distribution of measurement values for NT and cPass. The dashed lines indicate the respective manufacturer test decision thresholds. Coloring as in Figure TODO.
data = to_numeric(single_data)
cols = c("NT", "cPass")
rbind(get_recovery_rates(data, cols, cutoffs=cutoff$old, cutoff_label="Old"),
get_recovery_rates(data, cols, cutoffs=cutoff$new, cutoff_label="New"))
## Method Cutoff p n truepos trueneg p_truepos n_truepos p_trueneg n_trueneg
## 1 NT Old 0 0 107 106 0 0 0 0
## 2 cPass Old 104 110 108 106 104 4 0 106
## 3 NT New 79 134 107 106 79 28 0 106
## 4 cPass New 104 110 108 106 104 4 0 106
## frac_pos_rec frac_neg_rec
## 1 0.0000000 0
## 2 0.9629630 1
## 3 0.7383178 1
## 4 0.9629630 1
rm(data, cols)
jitter_violin_plot_de = function(data_long, cols_pretty,
ylimits, ybreaks, ylabels,
ymeas, thr_shift=0.1, violin_width=0.7,
show_new_line=T, show_new_percent=T,
show_old_line=T, show_old_percent=T,
unknown_text_offset=2.5) {
# Size of in-figure texts
text_size=3
# Base plot
plt = ggplot(data_long, aes(x=Array_i, y=Concentration)) +
# Jittered points to the left
# We plot these first as violins are not plotted with <=2 points,
# messing up color assignment
geom_point(aes(x=as.numeric(Array_i)-0.22, y=Concentration,
col=`Korrekte Klassifikation`, fill=`Korrekte Klassifikation`), size=1,
position=position_jitter(width=.15), alpha=0.5, show.legend=F) +
# 3 Violins to the right for the ground truth values, and unknowns
geom_flat_violin(aes(fill=`Korrekte Klassifikation`, col=`Korrekte Klassifikation`),
position = position_nudge(x = .0, y = 0),
alpha=0.2, trim=F, scale="area", width=violin_width) +
# Labels, scales, colors
scale_y_log10(limits=ylimits, breaks=ybreaks, labels=ylabels) +
scale_x_discrete(labels=paste(cols_pretty)) +
scale_fill_manual(values=style$pal3) + scale_color_manual(values=style$pal3) +
# Add measurement count
geom_text(data=data_long[!duplicated(data_long["Array_i"]),], size=3,
aes(x=Array_i, y=ymeas, label=Count)) +
labs(x="", y="Messwert")
linetype_manuals = c()
# New annotations
if (show_new_line) {
# Horizontal line
plt = plt +
geom_segment(data=data_long[!duplicated(data_long["Array_i"]),],
color=style$thr_lc_new,
aes(x=as.numeric(Array_i)-0.5, y=as.numeric(New_cutoff),
xend=as.numeric(Array_i)+0.5,
yend=as.numeric(New_cutoff), linetype="Optimiert"))
linetype_manuals = c(linetype_manuals, style$thr_lt_new)
}
if (show_new_percent) {
# Percentages
plt = plt +
geom_text(
data=data_long[!duplicated(data_long["Array_i"]),],
size=text_size, color=style$col_grey, hjust="right",
aes(x=as.numeric(Array_i) + 0.5,
y=as.numeric(New_cutoff)*10**(thr_shift*unknown_text_offset),
label=`Fraction unknown positive new`)) +
geom_text(
data=data_long[!duplicated(data_long["Array_i"]),],
size=text_size, color=style$col_truepos, hjust="right",
aes(x=as.numeric(Array_i) + 0.5,
y=as.numeric(New_cutoff)*10**thr_shift,
label=`Fraction true positive new`)) +
geom_text(
data=data_long[!duplicated(data_long["Array_i"]),],
size=text_size, color=style$col_trueneg, hjust="right",
aes(x=as.numeric(Array_i) + 0.5,
y=as.numeric(New_cutoff)*10**-thr_shift,
label=`Fraction true negative new`))
}
# Old annotations
if (show_old_line) {
# Horizontal line
plt = plt +
geom_segment(data=data_long[!duplicated(data_long["Array_i"]),],
color=style$thr_lc_old,
aes(x=as.numeric(Array_i)-0.5, y=as.numeric(Old_cutoff),
xend=as.numeric(Array_i)+0.5,
yend=as.numeric(Old_cutoff), linetype="Hersteller"))
linetype_manuals = c(style$thr_lt_old, linetype_manuals)
}
if (show_old_percent) {
# Percentages
plt = plt +
geom_text(
data=data_long[!duplicated(data_long["Array_i"]),],
size=text_size, color=style$col_grey, hjust="right",
aes(x=as.numeric(Array_i) + 0.5,
y=as.numeric(Old_cutoff)*10**(thr_shift*unknown_text_offset),
label=`Fraction unknown positive old`)) +
geom_text(
data=data_long[!duplicated(data_long["Array_i"]),],
size=text_size, color=style$col_truepos, hjust="right",
aes(x=as.numeric(Array_i) + 0.5,
y=as.numeric(Old_cutoff)*10**thr_shift,
label=`Fraction true positive old`)) +
geom_text(
data=data_long[!duplicated(data_long["Array_i"]),],
size=text_size, color=style$col_trueneg, hjust="right",
aes(x=as.numeric(Array_i) + 0.5,
y=as.numeric(Old_cutoff)*10**-thr_shift,
label=`Fraction true negative old`))
}
# Linetype annotation
plt = plt + scale_linetype_manual(name="Schwellenwert", values=linetype_manuals)
plt
}
cols = columns$lineblot
cols_pretty = columns_pretty$lineblot
data = data_long_for_viola(
single_data, cols, cols_pretty, "not_reactive", cutoff)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cols)` instead of `cols` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
data_long = data[[1]]
data_long_inv = data[[2]]
# German
data_long = data_long %>% dplyr::rename(
"Korrekte Klassifikation"="Ground truth")
data_long$`Korrekte Klassifikation` = as.character(data_long$`Korrekte Klassifikation`)
data_long[data_long=="Unknown"] = "Unbekannt"
data_long[data_long=="True-negative"] = "Negativ"
data_long[data_long=="True-positive"] = "Positiv"
data_long$`Korrekte Klassifikation` = factor(
data_long$`Korrekte Klassifikation`, levels=c("Unbekannt", "Negativ", "Positiv"))
# Violin plots and jitter for the continuous part
plt_up = jitter_violin_plot_de(
data_long, cols_pretty=cols_pretty,
ylimits=c(0.3,30), ybreaks=c(0.4,1,3,10),
ylabels=c("<1","1","3","10"),
ymeas=30, thr_shift=0.07, violin_width=0.7,
show_old_percent = F)
plt_up
# Histogram for the discrete part
plt_down = discrete_bar_plot(data_long_inv, ylimits=c(0,170))
plt_down
# Combine plots
plt_lineblot_distr = plt_up + annotation_custom(
ggplotGrob(plt_down + theme(legend.position = "none")),
ymin=-0.65, ymax=-0.15, xmin=0.3, xmax=3.7)
plt_lineblot_distr
# Save plot
for (fmt in style$output_formats) {
#ggsave(here_out(paste0("LineBlot_Distribution.", fmt)),
# device=fmt, dpi=style$dpi, width=5, height=5, units="in")
}
rm(data, data_long, data_long_inv, cols, cols_pretty, plt_up, plt_down, fmt)
plot_grid(
title_plot("A", plt_lineblot_distr + theme(legend.position = "none")),
title_plot("B", plt_cpass + theme(legend.position = "none")),
cowplot::get_legend(plt_cpass),
nrow=1, rel_widths = c(3,3,1.5)
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6310 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6310 rows containing non-finite values (stat_bin).
ggsave(here_out("mg_cpass.png"), width=8, height=3, dpi=720)
ggsave(here_out("mg_cpass.pdf"), width=8, height=3, dpi=720)