Compilation date: 30.10.2020

Prepare data

here_r = function (...) here::here("Statistics", "R", ...)
here_perm = function (...) here::here("PermutationStudies", ...)
here_data = function (...) here_perm("data", ...)
here_out = function (...) here_perm("out", ...)

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load half violin package
source(here_r("R_rainclouds.R"))

# Basic setup
source(here_r("setup.R"))
# Here we use a black and white theme
#ggplot2::theme_set(ggplot2::theme_bw())
cutoff = params$cutoff  # cutnew
data_key = params$data_key  # 'R'
n_perm = params$n_perm  # 10000
thresholds = c(0.05, 0.1, 0.2, 0.5, 1, 2, 4)

# read in sampled variances
vars = list()
vars["Households"] = read.csv(
  here_data(paste0("variances_hh_", cutoff, '_', data_key, "_", n_perm,
                   ".csv")),
  header=F, sep=',')
vars["Buildings"] = read.csv(
  here_data(paste0("variances_bd_", cutoff, '_', data_key, "_", n_perm,
                   ".csv")),
  header=F, sep=',')
for (threshold in thresholds) {
  vars[paste0(round(threshold*1000), "m clusters")] = 
    read.csv(here_data(paste0(
      "variances_lc_", cutoff, '_', threshold, "_", data_key, "_", n_perm,
      ".csv")),
      header=F, sep=',')
}

# read in true variances
real_var = list()
real_var["Households"] = read.csv(
  here_data(paste0("real_variance_hh_", cutoff, '_', data_key, "_", n_perm,
                   ".csv")),
  header=F, sep=',')
real_var["Buildings"] = read.csv(
  here_data(paste0("real_variance_bd_", cutoff, '_', data_key, "_", n_perm,
                   ".csv")),
  header=F, sep=',')
for (threshold in thresholds) {
  real_var[paste0(round(threshold*1000), "m clusters")] = 
    read.csv(here_data(paste0(
      "real_variance_lc_", cutoff, '_', threshold, "_", data_key, "_", n_perm,
      ".csv")),
      header=F, sep=',')
}

# merge into one data frame
data = data.frame(Sample=double(), Id=double(), Real_value=double())
for (id  in names(vars)) {
  samples = data.frame(Sample=vars[[id]], Id=id, Real_value=real_var[[id]])
  if (nrow(data)==0) {
    data = samples
  } else {
    data = rbind(data, samples)
  }
}

setups = c(
  "Households", "Buildings", "50m clusters", "100m clusters", "200m clusters",
  "500m clusters", "1000m clusters", "2000m clusters", "4000m clusters")

# For correct order
data$Id = factor(data$Id, levels=setups)
# For some plots we need the index
for (i in 1:length(setups)) {
  data[data$Id==setups[i],"Id_i"] = i
}

Distributions

plt_distr = ggplot(data=data, aes(x=Sample)) + facet_wrap(~Id, scales="free") + 
  geom_histogram(aes(y=..density..), bins=100, alpha=0.5) + geom_density() +
  geom_vline(data=data[!duplicated(data$Id),], aes(xintercept = Real_value)) +
  scale_x_continuous(guide=guide_axis(check.overlap = T)) +
  scale_y_continuous(guide=guide_axis(check.overlap = T)) +
  labs(x="Mean within-cluster variance", y="Count")
plt_distr

Boxplots

yq = function(xs, level) {
  lb = (1 - level) / 2
  ub = 1 - lb
  r <- quantile(xs, probs=c(lb, lb, 0.5, ub, ub))
  names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
  r
}

plt_box = ggplot(data=data, aes(x=Id, y=Sample)) + 
  stat_summary(
    fun.data=pryr::partial(yq, level=0.99), geom="boxplot", width=0.3) +
  stat_summary(
    fun.data=pryr::partial(yq, level=0.95), geom="boxplot", width=0.5) + 
  stat_summary(
    fun.data=pryr::partial(yq, level=0.5), geom="boxplot") +
  geom_point(aes(x=Id, y=Real_value), size=1) +
  labs(x="Cluster variable", y="Mean within-cluster variance") +
  coord_flip()
## Registered S3 method overwritten by 'pryr':
##   method      from
##   print.bytes Rcpp
plt_box

# log10 scale
plt_box_log10 = ggplot(data=data, aes(x=Id, y=Sample)) + scale_y_log10() +
  stat_summary(
    fun.data=pryr::partial(yq, level=0.99), geom="boxplot", width=0.3) +
  stat_summary(
    fun.data=pryr::partial(yq, level=0.95), geom="boxplot", width=0.5) + 
  
  stat_summary(fun.data=pryr::partial(yq, level=0.5), geom="boxplot") +
  geom_point(aes(x=Id, y=Real_value), size=1) +
  labs(x="Cluster variable", y="Mean within-cluster variance") +
  coord_flip()
plt_box_log10

Combined

cowplot::plot_grid(plt_distr, plt_box, rel_widths = c(1, 1))

for (device in c("pdf", "png")) {
  ggsave(here_out(
    paste0("HistogramAndBoxplot-", params$cutoff, "-", params$data_key, "-",
           params$n_perm, ".", device)),
    dpi=720, device=device, width=10, height=4)
}

Jitter violins

for (col in setups) {
  data[data$Id==col,"p"] = sprintf(
    "p=%.3f", sum(data$Sample<=data$Real_value &
                  data$Id==col, na.rm=T) / sum(data$Id==col))
  # represent small values as "< ..."
  data[data$Id==col & data$p=="p=0.000","p"] = "p<0.001"
}

ggplot(data, aes(x=Id, y=Sample)) +
  geom_point(aes(x=as.numeric(Id_i)-0.22), size=0.01,
             position=position_jitter(width=.2), alpha=0.1) +
  geom_flat_violin(position=position_nudge(x=.02,y=0),
                   fill=style$col_grey,
                   alpha=0.5, trim=F, scale="width", width=0.85) +
  geom_segment(data=data[!duplicated(data$Id),],
               aes(x=as.numeric(Id_i)-0.5, y=Real_value,
                   xend=as.numeric(Id_i)+0.5,
                   yend=as.numeric(Real_value)),
               color=style$pal4[2]) +
  geom_text(data=data[!duplicated(data$Id),],
            aes(x=as.numeric(Id_i), label=p), y=max(data$Sample)*1.1, size=3.5) +
  scale_y_continuous(limits=c(0, max(data$Sample)*1.15)) +
  scale_x_discrete(labels=setups) +
  labs(x="Cluster variable", y="Mean within-cluster variance")

for (device in c("pdf", "png")) {
  ggsave(here_out(
    paste0("JitterAndViolin-", params$cutoff, "-", params$data_key, "-",
           params$n_perm, ".", device)),
         dpi=720, device=device, width=10, height=4)
}

Percentiles

cat("Id\t", "n_leq", "n", "p", "0.95 confidence interval", "\n", sep="\t")
## Id       n_leq   n   p   0.95 confidence interval    
for (id in unique(data$Id)) {
  data_for_id = data[data$Id==id,]
  n_leq =  sum(data_for_id$Sample<=data_for_id$Real_value)
  total = nrow(data_for_id)
  p = n_leq / total
  conf = prop.test(n_leq, total, conf.level=0.95)
  cat(id, n_leq, total, p, conf$conf.int, "\n", sep="\t")
}
## Households   19  10000   0.0019  0.001177765 0.003026639 
## Buildings    1952    10000   0.1952  0.1875  0.2031356   
## 50m clusters 1058    10000   0.1058  0.09987342  0.1120318   
## 100m clusters    3379    10000   0.3379  0.3286437   0.3472815   
## 200m clusters    188 10000   0.0188  0.01627038  0.0217061   
## 500m clusters    1640    10000   0.164   0.1568229   0.1714369   
## 1000m clusters   1291    10000   0.1291  0.1226213   0.1358657   
## 2000m clusters   7487    10000   0.7487  0.7400535   0.7571544   
## 4000m clusters   8810    10000   0.881   0.874456    0.8872491