Compilation date: 18.11.2020

Some very basic population visualizations.

library(ggplot2)

here_r = function (...) here::here("Statistics", "R", ...)
here_koco = function (...) here::here("KoCo19_Datasets", ...)
here_out = function (...) here::here(
  "Epi_out", "epi_figures_yannik_basics", ...)

dir.create(here_out(), showWarnings = FALSE)

# Setup
source(here_r("setup.R"))

###############################################################################
# Prepare date

# Pre-defined population data set
data_koco = read.csv(here_koco(
  "Analysis Data Sets", "Koco_baseline.csv"), stringsAsFactors = F)
if (any(data_koco$IgA_quant == 0 | data_koco$IgG_quant == 0 |
        data_koco$R_quant == 0, na.rm=T)) {
  stop("Zeros should be NAs.")
}

# Location data
data_building = read.csv(
  here_koco("Geocodes", "KoCo19_Haushalte4Modeler_20200909.csv"))
# Add associations of households to addresses
for (address_id in unique(data_building$address_id)) {
  data_building[data_building$address_id==address_id,"HouseholdsPerAddress"] =
    sum(data_building$address_id==address_id, na.rm=T)
}

Study population

Inhabitants per household

data = data_koco[!duplicated(data_koco$hh_id),]
unique(data$HouseholdSize)
##  [1]  4  1  3  2  6  8  5  7  9 NA
plt_inh_per_hh = ggplot(data, aes(x=HouseholdSize)) +
  geom_bar() + scale_x_continuous(breaks=1:9) +
  labs(title="Inhabitants per household",
       y="Number of households", x="Household size")
plt_inh_per_hh
## Warning: Removed 1 rows containing non-finite values (stat_count).

Households per building

data = data_building[!duplicated(data_building$address_id),]

plt_hh_per_bdg = ggplot(data, aes(x=HouseholdsPerAddress)) +
  geom_bar() + scale_x_continuous(breaks=1:9) +
  labs(title="Households per building",
       y="Number of buildings", x="Number of households in same building")
plt_hh_per_bdg

Inhabitants per building

data1 = data_koco[c("ind_id", "hh_id", "HouseholdSize")]
data2 = data_building[c("hht_ID", "address_id")]
colnames(data2)[colnames(data2)=="hht_ID"] = "hh_id"
data = merge(data1, data2, by = "hh_id")

for (address_id in unique(data$address_id)) {
  data[data$address_id==address_id,"InhabitantsPerAddress"] =
    sum(data$address_id==address_id, na.rm=T)
}

data = data[!duplicated(data$hh_id),]

unique(data$InhabitantsPerAddress)
##  [1]  3  6 10  7  2  8  5  4  1 11 14 13  9 15 12
plt_inh_per_bdg = ggplot(data, aes(x=InhabitantsPerAddress)) +
  geom_bar() + scale_x_continuous(breaks=1:15) +
  labs(title="Inhabitants per building", y="Number of buildings",
       x="Number of inhabitants in same building")
plt_inh_per_bdg

ggpubr::ggarrange(plt_inh_per_hh, plt_hh_per_bdg, plt_inh_per_bdg, ncol = 3)
## Warning: Removed 1 rows containing non-finite values (stat_count).

ggsave(here_out("inh_hh_bdg.png"), width=9, height=3, dpi=720)
ggsave(here_out("inh_hh_bdg.pdf"), width=9, height=3, device=cairo_pdf)

Positive tests

data = data_koco

# Positivity criterion
data$Positive = data$R_quant >= cutoff$new$Roche

# Take only members where household size was reported
sum(is.na(data$HouseholdSize))
## [1] 8
data = data[!is.na(data$HouseholdSize),]

# Assign positivity fractions for each household size
for (hhsize in unique(data$HouseholdSize)) {
  d = data[data$HouseholdSize == hhsize,]
  # Fraction 1 is positives by all available test results
  data[data$HouseholdSize == hhsize,
       "Tested household members"] =
    sum(d$Positive) / sum(!is.na(d$Positive))
  # Fraction 2 is positives by extrapolated household size
  data[data$HouseholdSize == hhsize,
       "Household members"] =
    sum(d$Positive) / (hhsize * length(unique(d$hh_id)))
  # Number of participants for that household size
  data[data$HouseholdSize == hhsize, "n"] = sum(!is.na(d$Positive))
}

data = data[!duplicated(data$HouseholdSize),]
data = data[,c(
  "HouseholdSize",
  "Tested household members",
  "Household members",
  "n")]

data
##      HouseholdSize Tested household members Household members    n
## 1                4               0.01718582       0.009975062  931
## 3                1               0.02040816       0.020408163  784
## 7                3               0.01373896       0.009742519 1019
## 12               2               0.01561022       0.014090521 2114
## 226              6               0.05970149       0.028985507   67
## 330              8               0.00000000       0.000000000    4
## 342              5               0.02513966       0.014173228  358
## 387              7               0.03703704       0.023809524   27
## 4109             9               0.00000000       0.000000000    1
# To long over fractions
data_long = tidyr::gather(
  data,
  `Relative to total # of`, Value,
  `Tested household members`:`Household members`)

ggplot(data_long,
       aes(x=factor(HouseholdSize), y=Value, fill=`Relative to total # of`)) +
  geom_bar(stat="identity", position="dodge") +
  geom_text(
    data=data_long[data_long$`Relative to total # of`==
                  "Tested household members",],
    aes(label=n, y=-0.003)) +
  scale_y_continuous(labels=scales::percent) +
  labs(x="Household size",
       y="Fraction of individuals\nwith positive test result")

ggsave(here_out("PositiveFractions.png"), width=8, height=4, dpi=720)
ggsave(here_out("PositiveFractions.pdf"), width=8, height=4, device=cairo_pdf)
# Grouping household sizes differently

data = data_koco
data$Positive = data$R_quant >= cutoff$new$Roche

data = data[!is.na(data$HouseholdSize),]

data[data$HouseholdSize==1, "HhsizeFactor"] = "1"
data[data$HouseholdSize==2, "HhsizeFactor"] = "2"
data[data$HouseholdSize %in% c(3,4), "HhsizeFactor"] = "3-4"

data = data[!is.na(data$HhsizeFactor),]

for (hhsize in unique(data$HhsizeFactor)) {
  d = data[data$HhsizeFactor == hhsize,]
  data[data$HhsizeFactor == hhsize, "PosRelHhsize"] = sum(d$Positive) / sum(!is.na(d$Positive))
  data[data$HhsizeFactor == hhsize, "n"] = sum(!is.na(d$Positive))
}

data = data[!duplicated(data$HhsizeFactor),]

ggplot(data, aes(x=factor(HhsizeFactor), y=PosRelHhsize)) +
  geom_bar(stat="identity") + geom_text(aes(label=n, y=PosRelHhsize+0.001)) +
  labs(x="Household size", y="Fraction of positive tests")

# Number of positives in household sizes

data = data_koco
data$Positive = data$R_quant >= cutoff$new$Roche

# Roche values available for all?
cat("Roche not available: ", any(is.na(data$R_quant)), "\n")
## Roche not available:  FALSE
cat("Number of households:", length(unique(data$hh_id)), "\n")
## Number of households: 2994
for (hh_id in unique(data$hh_id)) {
  data[data$hh_id == hh_id, "PosInHh"] = sum(data$hh_id == hh_id & data$Positive)
}

data_uq_hh = data[!duplicated(data$hh_id),]
table(data_uq_hh$HouseholdSize, data_uq_hh$PosInHh)
##    
##        0    1    2
##   1  768   16    0
##   2 1141   27    3
##   3  467   10    2
##   4  388   10    3
##   5  120    5    2
##   6   20    2    1
##   7    5    1    0
##   8    1    0    0
##   9    1    0    0
sum(data$Positive)
## [1] 93