Compilation date: 30.10.2020
here_r = function (...) here::here("Statistics", "R", ...)
here_perm = function (...) here::here("PermutationStudies", ...)
here_data = function (...) here_perm("data", ...)
here_out = function (...) here_perm("out", ...)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load half violin package
source(here_r("R_rainclouds.R"))
# Basic setup
source(here_r("setup.R"))
# Here we use a black and white theme
#ggplot2::theme_set(ggplot2::theme_bw())
cutoff = params$cutoff # cutnew
data_key = params$data_key # 'R'
n_perm = params$n_perm # 10000
thresholds = c(0.05, 0.1, 0.2, 0.5, 1, 2, 4)
# read in sampled variances
vars = list()
vars["Households"] = read.csv(
here_data(paste0("variances_hh_", cutoff, '_', data_key, "_", n_perm,
".csv")),
header=F, sep=',')
vars["Buildings"] = read.csv(
here_data(paste0("variances_bd_", cutoff, '_', data_key, "_", n_perm,
".csv")),
header=F, sep=',')
for (threshold in thresholds) {
vars[paste0(round(threshold*1000), "m clusters")] =
read.csv(here_data(paste0(
"variances_lc_", cutoff, '_', threshold, "_", data_key, "_", n_perm,
".csv")),
header=F, sep=',')
}
# read in true variances
real_var = list()
real_var["Households"] = read.csv(
here_data(paste0("real_variance_hh_", cutoff, '_', data_key, "_", n_perm,
".csv")),
header=F, sep=',')
real_var["Buildings"] = read.csv(
here_data(paste0("real_variance_bd_", cutoff, '_', data_key, "_", n_perm,
".csv")),
header=F, sep=',')
for (threshold in thresholds) {
real_var[paste0(round(threshold*1000), "m clusters")] =
read.csv(here_data(paste0(
"real_variance_lc_", cutoff, '_', threshold, "_", data_key, "_", n_perm,
".csv")),
header=F, sep=',')
}
# merge into one data frame
data = data.frame(Sample=double(), Id=double(), Real_value=double())
for (id in names(vars)) {
samples = data.frame(Sample=vars[[id]], Id=id, Real_value=real_var[[id]])
if (nrow(data)==0) {
data = samples
} else {
data = rbind(data, samples)
}
}
setups = c(
"Households", "Buildings", "50m clusters", "100m clusters", "200m clusters",
"500m clusters", "1000m clusters", "2000m clusters", "4000m clusters")
# For correct order
data$Id = factor(data$Id, levels=setups)
# For some plots we need the index
for (i in 1:length(setups)) {
data[data$Id==setups[i],"Id_i"] = i
}
plt_distr = ggplot(data=data, aes(x=Sample)) + facet_wrap(~Id, scales="free") +
geom_histogram(aes(y=..density..), bins=100, alpha=0.5) + geom_density() +
geom_vline(data=data[!duplicated(data$Id),], aes(xintercept = Real_value)) +
scale_x_continuous(guide=guide_axis(check.overlap = T)) +
scale_y_continuous(guide=guide_axis(check.overlap = T)) +
labs(x="Mean within-cluster variance", y="Count")
plt_distr
yq = function(xs, level) {
lb = (1 - level) / 2
ub = 1 - lb
r <- quantile(xs, probs=c(lb, lb, 0.5, ub, ub))
names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
r
}
plt_box = ggplot(data=data, aes(x=Id, y=Sample)) +
stat_summary(
fun.data=pryr::partial(yq, level=0.99), geom="boxplot", width=0.3) +
stat_summary(
fun.data=pryr::partial(yq, level=0.95), geom="boxplot", width=0.5) +
stat_summary(
fun.data=pryr::partial(yq, level=0.5), geom="boxplot") +
geom_point(aes(x=Id, y=Real_value), size=1) +
labs(x="Cluster variable", y="Mean within-cluster variance") +
coord_flip()
## Registered S3 method overwritten by 'pryr':
## method from
## print.bytes Rcpp
plt_box
# log10 scale
plt_box_log10 = ggplot(data=data, aes(x=Id, y=Sample)) + scale_y_log10() +
stat_summary(
fun.data=pryr::partial(yq, level=0.99), geom="boxplot", width=0.3) +
stat_summary(
fun.data=pryr::partial(yq, level=0.95), geom="boxplot", width=0.5) +
stat_summary(fun.data=pryr::partial(yq, level=0.5), geom="boxplot") +
geom_point(aes(x=Id, y=Real_value), size=1) +
labs(x="Cluster variable", y="Mean within-cluster variance") +
coord_flip()
plt_box_log10
cowplot::plot_grid(plt_distr, plt_box, rel_widths = c(1, 1))
for (device in c("pdf", "png")) {
ggsave(here_out(
paste0("HistogramAndBoxplot-", params$cutoff, "-", params$data_key, "-",
params$n_perm, ".", device)),
dpi=720, device=device, width=10, height=4)
}
for (col in setups) {
data[data$Id==col,"p"] = sprintf(
"p=%.3f", sum(data$Sample<=data$Real_value &
data$Id==col, na.rm=T) / sum(data$Id==col))
# represent small values as "< ..."
data[data$Id==col & data$p=="p=0.000","p"] = "p<0.001"
}
ggplot(data, aes(x=Id, y=Sample)) +
geom_point(aes(x=as.numeric(Id_i)-0.22), size=0.01,
position=position_jitter(width=.2), alpha=0.1) +
geom_flat_violin(position=position_nudge(x=.02,y=0),
fill=style$col_grey,
alpha=0.5, trim=F, scale="width", width=0.85) +
geom_segment(data=data[!duplicated(data$Id),],
aes(x=as.numeric(Id_i)-0.5, y=Real_value,
xend=as.numeric(Id_i)+0.5,
yend=as.numeric(Real_value)),
color=style$pal4[2]) +
geom_text(data=data[!duplicated(data$Id),],
aes(x=as.numeric(Id_i), label=p), y=max(data$Sample)*1.1, size=3.5) +
scale_y_continuous(limits=c(0, max(data$Sample)*1.15)) +
scale_x_discrete(labels=setups) +
labs(x="Cluster variable", y="Mean within-cluster variance")
for (device in c("pdf", "png")) {
ggsave(here_out(
paste0("JitterAndViolin-", params$cutoff, "-", params$data_key, "-",
params$n_perm, ".", device)),
dpi=720, device=device, width=10, height=4)
}
cat("Id\t", "n_leq", "n", "p", "0.95 confidence interval", "\n", sep="\t")
## Id n_leq n p 0.95 confidence interval
for (id in unique(data$Id)) {
data_for_id = data[data$Id==id,]
n_leq = sum(data_for_id$Sample<=data_for_id$Real_value)
total = nrow(data_for_id)
p = n_leq / total
conf = prop.test(n_leq, total, conf.level=0.95)
cat(id, n_leq, total, p, conf$conf.int, "\n", sep="\t")
}
## Households 3084 10000 0.3084 0.2993737 0.3175743
## Buildings 860 10000 0.086 0.08061416 0.09170668
## 50m clusters 1106 10000 0.1106 0.104553 0.1169485
## 100m clusters 3356 10000 0.3356 0.3263601 0.3449668
## 200m clusters 672 10000 0.0672 0.06240891 0.07232686
## 500m clusters 7577 10000 0.7577 0.7491536 0.7660474
## 1000m clusters 1610 10000 0.161 0.1538778 0.1683843
## 2000m clusters 8418 10000 0.8418 0.8344656 0.8488701
## 4000m clusters 6577 10000 0.6577 0.6482911 0.6669871