Compilation date: 18.11.2020
Some very basic population visualizations.
library(ggplot2)
here_r = function (...) here::here("Statistics", "R", ...)
here_koco = function (...) here::here("KoCo19_Datasets", ...)
here_out = function (...) here::here(
"Epi_out", "epi_figures_yannik_basics", ...)
dir.create(here_out(), showWarnings = FALSE)
# Setup
source(here_r("setup.R"))
###############################################################################
# Prepare date
# Pre-defined population data set
data_koco = read.csv(here_koco(
"Analysis Data Sets", "Koco_baseline.csv"), stringsAsFactors = F)
if (any(data_koco$IgA_quant == 0 | data_koco$IgG_quant == 0 |
data_koco$R_quant == 0, na.rm=T)) {
stop("Zeros should be NAs.")
}
# Location data
data_building = read.csv(
here_koco("Geocodes", "KoCo19_Haushalte4Modeler_20200909.csv"))
# Add associations of households to addresses
for (address_id in unique(data_building$address_id)) {
data_building[data_building$address_id==address_id,"HouseholdsPerAddress"] =
sum(data_building$address_id==address_id, na.rm=T)
}
data = data_koco[!duplicated(data_koco$hh_id),]
unique(data$HouseholdSize)
## [1] 4 1 3 2 6 8 5 7 9 NA
plt_inh_per_hh = ggplot(data, aes(x=HouseholdSize)) +
geom_bar() + scale_x_continuous(breaks=1:9) +
labs(title="Inhabitants per household",
y="Number of households", x="Household size")
plt_inh_per_hh
## Warning: Removed 1 rows containing non-finite values (stat_count).
data = data_building[!duplicated(data_building$address_id),]
plt_hh_per_bdg = ggplot(data, aes(x=HouseholdsPerAddress)) +
geom_bar() + scale_x_continuous(breaks=1:9) +
labs(title="Households per building",
y="Number of buildings", x="Number of households in same building")
plt_hh_per_bdg
data1 = data_koco[c("ind_id", "hh_id", "HouseholdSize")]
data2 = data_building[c("hht_ID", "address_id")]
colnames(data2)[colnames(data2)=="hht_ID"] = "hh_id"
data = merge(data1, data2, by = "hh_id")
for (address_id in unique(data$address_id)) {
data[data$address_id==address_id,"InhabitantsPerAddress"] =
sum(data$address_id==address_id, na.rm=T)
}
data = data[!duplicated(data$hh_id),]
unique(data$InhabitantsPerAddress)
## [1] 3 6 10 7 2 8 5 4 1 11 14 13 9 15 12
plt_inh_per_bdg = ggplot(data, aes(x=InhabitantsPerAddress)) +
geom_bar() + scale_x_continuous(breaks=1:15) +
labs(title="Inhabitants per building", y="Number of buildings",
x="Number of inhabitants in same building")
plt_inh_per_bdg
ggpubr::ggarrange(plt_inh_per_hh, plt_hh_per_bdg, plt_inh_per_bdg, ncol = 3)
## Warning: Removed 1 rows containing non-finite values (stat_count).
ggsave(here_out("inh_hh_bdg.png"), width=9, height=3, dpi=720)
ggsave(here_out("inh_hh_bdg.pdf"), width=9, height=3, device=cairo_pdf)
data = data_koco
# Positivity criterion
data$Positive = data$R_quant >= cutoff$new$Roche
# Take only members where household size was reported
sum(is.na(data$HouseholdSize))
## [1] 8
data = data[!is.na(data$HouseholdSize),]
# Assign positivity fractions for each household size
for (hhsize in unique(data$HouseholdSize)) {
d = data[data$HouseholdSize == hhsize,]
# Fraction 1 is positives by all available test results
data[data$HouseholdSize == hhsize,
"Tested household members"] =
sum(d$Positive) / sum(!is.na(d$Positive))
# Fraction 2 is positives by extrapolated household size
data[data$HouseholdSize == hhsize,
"Household members"] =
sum(d$Positive) / (hhsize * length(unique(d$hh_id)))
# Number of participants for that household size
data[data$HouseholdSize == hhsize, "n"] = sum(!is.na(d$Positive))
}
data = data[!duplicated(data$HouseholdSize),]
data = data[,c(
"HouseholdSize",
"Tested household members",
"Household members",
"n")]
data
## HouseholdSize Tested household members Household members n
## 1 4 0.01718582 0.009975062 931
## 3 1 0.02040816 0.020408163 784
## 7 3 0.01373896 0.009742519 1019
## 12 2 0.01561022 0.014090521 2114
## 226 6 0.05970149 0.028985507 67
## 330 8 0.00000000 0.000000000 4
## 342 5 0.02513966 0.014173228 358
## 387 7 0.03703704 0.023809524 27
## 4109 9 0.00000000 0.000000000 1
# To long over fractions
data_long = tidyr::gather(
data,
`Relative to total # of`, Value,
`Tested household members`:`Household members`)
ggplot(data_long,
aes(x=factor(HouseholdSize), y=Value, fill=`Relative to total # of`)) +
geom_bar(stat="identity", position="dodge") +
geom_text(
data=data_long[data_long$`Relative to total # of`==
"Tested household members",],
aes(label=n, y=-0.003)) +
scale_y_continuous(labels=scales::percent) +
labs(x="Household size",
y="Fraction of individuals\nwith positive test result")
ggsave(here_out("PositiveFractions.png"), width=8, height=4, dpi=720)
ggsave(here_out("PositiveFractions.pdf"), width=8, height=4, device=cairo_pdf)
# Grouping household sizes differently
data = data_koco
data$Positive = data$R_quant >= cutoff$new$Roche
data = data[!is.na(data$HouseholdSize),]
data[data$HouseholdSize==1, "HhsizeFactor"] = "1"
data[data$HouseholdSize==2, "HhsizeFactor"] = "2"
data[data$HouseholdSize %in% c(3,4), "HhsizeFactor"] = "3-4"
data = data[!is.na(data$HhsizeFactor),]
for (hhsize in unique(data$HhsizeFactor)) {
d = data[data$HhsizeFactor == hhsize,]
data[data$HhsizeFactor == hhsize, "PosRelHhsize"] = sum(d$Positive) / sum(!is.na(d$Positive))
data[data$HhsizeFactor == hhsize, "n"] = sum(!is.na(d$Positive))
}
data = data[!duplicated(data$HhsizeFactor),]
ggplot(data, aes(x=factor(HhsizeFactor), y=PosRelHhsize)) +
geom_bar(stat="identity") + geom_text(aes(label=n, y=PosRelHhsize+0.001)) +
labs(x="Household size", y="Fraction of positive tests")
# Number of positives in household sizes
data = data_koco
data$Positive = data$R_quant >= cutoff$new$Roche
# Roche values available for all?
cat("Roche not available: ", any(is.na(data$R_quant)), "\n")
## Roche not available: FALSE
cat("Number of households:", length(unique(data$hh_id)), "\n")
## Number of households: 2994
for (hh_id in unique(data$hh_id)) {
data[data$hh_id == hh_id, "PosInHh"] = sum(data$hh_id == hh_id & data$Positive)
}
data_uq_hh = data[!duplicated(data$hh_id),]
table(data_uq_hh$HouseholdSize, data_uq_hh$PosInHh)
##
## 0 1 2
## 1 768 16 0
## 2 1141 27 3
## 3 467 10 2
## 4 388 10 3
## 5 120 5 2
## 6 20 2 1
## 7 5 1 0
## 8 1 0 0
## 9 1 0 0
sum(data$Positive)
## [1] 93