library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(data.table)
## data.table 1.14.2 using 1 threads (see ?getDTthreads). Latest news: r-datatable.com
## **********
## This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.
## This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.
## **********
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(ukbtools)
library(irr)
## Loading required package: lpSolve
library(skimr)
library(summarytools)
##
## Attaching package: 'summarytools'
##
## The following object is masked from 'package:tibble':
##
## view
library(cowplot)
dir.create(path = here::here("out"), showWarnings = F)
datapath <- "/Volumes/EHR Group/GPRD_GOLD/Ali/2022_biobank/"
ukb <- read_rds(paste0(datapath, "ukb669156.rds"))
my_ukb_key <- ukb_df_field("ukb669156", path = datapath)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
twoXtwo <- function(df, exp, out){
df1 <- df %>%
ungroup() %>%
dplyr::select(exp = {{ exp }}, out = {{ out }})
tab <- table(df1$exp, df1$out, useNA = "always")
tab_p <- prop.table(tab,1)
tibble(
exposure = exp,
val = rownames(tab),
No = tab[, 1],
No_pc = tab_p[, 1] * 100,
Yes = tab[, 2],
Yes_pc = tab_p[, 2] * 100,
Miss = tab[, 3]
)
}
my_ukb_key[grepl(pattern = "centre",x = my_ukb_key$col.name), ] %>% print(n= Inf)
## # A tibble: 12 × 5
## field.showcase field.html field.tab col.type col.name
## <chr> <chr> <chr> <chr> <chr>
## 1 53 53-0.0 f.53.0.0 Date date_of_attending_assessment_centre_f53_0_0
## 2 53 53-1.0 f.53.1.0 Date date_of_attending_assessment_centre_f53_1_0
## 3 53 53-2.0 f.53.2.0 Date date_of_attending_assessment_centre_f53_2_0
## 4 53 53-3.0 f.53.3.0 Date date_of_attending_assessment_centre_f53_3_0
## 5 55 55-0.0 f.55.0.0 Categorical (single) month_of_attending_assessment_centre_f55_0_0
## 6 55 55-1.0 f.55.1.0 Categorical (single) month_of_attending_assessment_centre_f55_1_0
## 7 55 55-2.0 f.55.2.0 Categorical (single) month_of_attending_assessment_centre_f55_2_0
## 8 55 55-3.0 f.55.3.0 Categorical (single) month_of_attending_assessment_centre_f55_3_0
## 9 21003 21003-0.0 f.21003.0.0 Integer age_when_attended_assessment_centre_f21003_0_0
## 10 21003 21003-1.0 f.21003.1.0 Integer age_when_attended_assessment_centre_f21003_1_0
## 11 21003 21003-2.0 f.21003.2.0 Integer age_when_attended_assessment_centre_f21003_2_0
## 12 21003 21003-3.0 f.21003.3.0 Integer age_when_attended_assessment_centre_f21003_3_0
# basic info about participants -------------------------------------------
ukb_descriptive <- ukb %>%
select(f.eid,
age_at_recruit = f.21022.0.0,
age_at_assess = f.21003.0.0,
sex = f.31.0.0,
deprivation = f.189.0.0,
study_entry = f.53.0.0,
ethnicity = f.21000.0.0,
hh_inc = f.738.0.0,
emplyment = f.6142.0.0,
qualifications = f.6138.0.0
) %>%
mutate(age_cat = cut_number(age_at_assess, n = 6))
ukb_n <- dim(ukb_descriptive)[1]
ukb_n_age <- ukb_descriptive %>%
group_by(age_cat) %>%
summarise(n = n())
ukb_n_sex <- ukb_descriptive %>%
group_by(sex) %>%
summarise(n = n())
dfSummary(ukb_descriptive)
## Data Frame Summary
## ukb_descriptive
## Dimensions: 502409 x 11
## Duplicates: 0
##
## ----------------------------------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- ------------------- ------------------------------- ------------------------ --------------------- ---------- ---------
## 1 f.eid Mean (sd) : 3512069 (1450338) 502409 distinct values : : : : : : : : : : 502409 0
## [integer] min < med < max: : : : : : : : : : : (100.0%) (0.0%)
## 1000018 < 3512066 < 6024123 : : : : : : : : : :
## IQR (CV) : 2512043 (0.4) : : : : : : : : : :
## : : : : : : : : : :
##
## 2 age_at_recruit Mean (sd) : 56.5 (8.1) 37 distinct values : 502406 3
## [integer] min < med < max: . : . . (100.0%) (0.0%)
## 37 < 58 < 73 . : : : : : :
## IQR (CV) : 13 (0.1) : : : : : : : :
## . : : : : : : : :
##
## 3 age_at_assess Mean (sd) : 56.5 (8.1) 37 distinct values : 502409 0
## [integer] min < med < max: . : . . (100.0%) (0.0%)
## 37 < 58 < 73 . : : : : : :
## IQR (CV) : 13 (0.1) : : : : : : : :
## . : : : : : : : :
##
## 4 sex 1. Female 273325 (54.4%) IIIIIIIIII 502409 0
## [ordered, factor] 2. Male 229084 (45.6%) IIIIIIIII (100.0%) (0.0%)
##
## 5 deprivation Mean (sd) : -1.3 (3.1) 57716 distinct values : 501783 626
## [numeric] min < med < max: : : (99.9%) (0.1%)
## -6.3 < -2.1 < 11 : : .
## IQR (CV) : 4.2 (-2.4) : : : : .
## : : : : : : : .
##
## 6 study_entry min : 2006-03-13 1068 distinct values : : . . 502409 0
## [Date] med : 2009-01-26 : : : : . (100.0%) (0.0%)
## max : 2010-10-01 : : : : : .
## range : 4y 6m 18d : : : : : : :
## . : : : : : : :
##
## 7 ethnicity 1. Prefer not to answer 1661 ( 0.3%) 501509 900
## [ordered, factor] 2. Do not know 217 ( 0.0%) (99.8%) (0.2%)
## 3. White 569 ( 0.1%)
## 4. Mixed 49 ( 0.0%)
## 5. Asian or Asian British 43 ( 0.0%)
## 6. Black or Black British 27 ( 0.0%)
## 7. Chinese 1573 ( 0.3%)
## 8. Other ethnic group 4557 ( 0.9%)
## 9. British 442514 (88.2%) IIIIIIIIIIIIIIIII
## 10. Irish 13201 ( 2.6%)
## [ 12 others ] 37098 ( 7.4%) I
##
## 8 hh_inc 1. Prefer not to answer 49830 (10.0%) II 496395 6014
## [ordered, factor] 2. Do not know 21300 ( 4.3%) (98.8%) (1.2%)
## 3. Less than 18,000 97180 (19.6%) III
## 4. 18,000 to 30,999 108157 (21.8%) IIII
## 5. 31,000 to 51,999 110751 (22.3%) IIII
## 6. 52,000 to 100,000 86250 (17.4%) III
## 7. Greater than 100,000 22927 ( 4.6%)
##
## 9 emplyment 1. None of the above 2800 ( 0.6%) 501537 872
## [ordered, factor] 2. Prefer not to answer 2079 ( 0.4%) (99.8%) (0.2%)
## 3. In paid employment or sel 287080 (57.2%) IIIIIIIIIII
## 4. Retired 166968 (33.3%) IIIIII
## 5. Looking after home and/or 13855 ( 2.8%)
## 6. Unable to work because of 16823 ( 3.4%)
## 7. Unemployed 8263 ( 1.6%)
## 8. Doing unpaid or voluntary 2326 ( 0.5%)
## 9. Full or part-time student 1343 ( 0.3%)
##
## 10 qualifications 1. None of the above 85259 (17.1%) III 497767 4642
## [ordered, factor] 2. Prefer not to answer 5490 ( 1.1%) (99.1%) (0.9%)
## 3. College or University deg 161127 (32.4%) IIIIII
## 4. A levels/AS levels or equ 55308 (11.1%) II
## 5. O levels/GCSEs or equival 105175 (21.1%) IIII
## 6. CSEs or equivalent 26885 ( 5.4%) I
## 7. NVQ or HND or HNC or equi 32724 ( 6.6%) I
## 8. Other professional qualif 25799 ( 5.2%) I
##
## 11 age_cat 1. [37,47] 90200 (18.0%) III 502409 0
## [factor] 2. (47,53] 87868 (17.5%) III (100.0%) (0.0%)
## 3. (53,58] 86917 (17.3%) III
## 4. (58,61] 70417 (14.0%) II
## 5. (61,65] 93139 (18.5%) III
## 6. (65,73] 73868 (14.7%) II
## ----------------------------------------------------------------------------------------------------------------------------
theme_ukb <- function(){
theme_ali() +
theme(legend.position = "none",
axis.title = element_text(face = "bold"),
panel.grid = element_blank())
}
cols <- c("Female" = "grey35", "Male" = "hotpink")
fills <- c("Femaled" = "grey35", "Male" = "hotpink")
p_sex <- ggplot(ukb_descriptive, aes(y = sex, fill = sex)) +
geom_bar(aes(x = (..count..)*100/sum(..count..)), position = "dodge", na.rm = TRUE, width = 0.7, alpha = 1) +
labs(y = "Sex", x = "%") +
scale_fill_manual(values = c("grey35", "hotpink"), na.value = "grey65") +
scale_x_continuous(limits = c(0,100)) +
theme_ukb()
p_age <- ggplot(ukb_descriptive, aes(x = age_at_assess, fill = sex, col = sex)) +
geom_density(alpha = 0.25, na.rm = TRUE) +
scale_colour_manual(values = cols,
aesthetics = c("color", "fill")) +
labs(x = "Age at assessment", y = "Density") +
theme_ukb()
p_entry <- ggplot(ukb_descriptive, aes(x = study_entry, fill = sex, col = sex)) +
geom_density(alpha = 0.25, na.rm = TRUE) +
scale_colour_manual(values = cols,
aesthetics = c("color", "fill")) +
labs(x = "Study entry", y = "Density") +
theme_ukb()
p_dep <- ggplot(ukb_descriptive, aes(x = deprivation, fill = sex, col = sex)) +
geom_density(alpha = 0.25, na.rm = TRUE) +
scale_colour_manual(values = cols,
aesthetics = c("color", "fill")) +
labs(x = "Townsend deprivation score", y = "Density") +
theme_ukb()
p_qual <- ggplot(ukb_descriptive, aes(y = qualifications, fill = sex)) +
geom_bar(aes(x = (..count..)/1000), position = "dodge", na.rm = TRUE, width = 0.7) +
scale_fill_manual(values = c("grey35", "hotpink")) +
labs(y = "Qualifications", x = "Count (thousands)") +
theme_ukb()
p_income <- ggplot(ukb_descriptive, aes(y = hh_inc, fill = sex)) +
geom_bar(aes(x = (..count..)*100/sum(..count..)), position = "dodge", na.rm = TRUE, width = 0.7) +
scale_fill_manual(values = c("grey35", "hotpink")) +
labs(y = "Household Income", x = "%") +
theme_ukb()
p_ethnicity <- ggplot(ukb_descriptive, aes(y = ethnicity, fill = sex)) +
geom_bar(aes(x = (..count..)*100/sum(..count..)), position = "dodge", na.rm = TRUE, width = 0.7) +
scale_fill_manual(values = c("grey35", "hotpink")) +
labs(y = "Ethnic Background", x = "%") +
theme_ukb()
p_employ <- ggplot(ukb_descriptive, aes(y = emplyment, fill = sex)) +
geom_bar(aes(x = (..count..)*100/sum(..count..)), position = "dodge", na.rm = TRUE, width = 0.7) +
scale_fill_manual(values = c("grey35", "hotpink")) +
labs(y = "Employment", x = "%") +
theme_ukb()
pdf(here::here("out/baseline_cohort_characteristics.pdf"), width = 12, height = 8)
plot_grid(p_sex, p_ethnicity, p_age, p_employ, p_dep, p_income, p_entry, p_qual, ncol = 2)
dev.off()
## quartz_off_screen
## 2
# diagnoses ---------------------------------------------------------------
ukbdiagnoses_baseline <- ukb %>%
select(f.eid, contains("20002.0")) %>%
left_join(ukb_descriptive, by = "f.eid")
# make long
ukbdiagnoses_baseline_long <- ukbdiagnoses_baseline %>%
pivot_longer(cols = contains("20002.0"), values_to = "illness_code",
names_to = c("instance", "array"),
names_pattern = "f.20002.(.).(.*)")
# skin disease at baseline ------------------------------------------------
ecz_pso <- ukbdiagnoses_baseline_long %>%
mutate(eczema = as.numeric(illness_code == 1452),
psoriasis = as.numeric(illness_code == 1453))
ecz_pso_tbl <- ecz_pso %>%
summarise(ecz = sum(eczema, na.rm = T),
pso = sum(psoriasis, na.rm = T))
ecz_pso_tbl
## # A tibble: 1 × 2
## ecz pso
## <dbl> <dbl>
## 1 12563 5575
(ecz_pso_tbl/ukb_n)*100
## ecz pso
## 1 2.500552 1.109654
ecz_pso_tbl_age <- ecz_pso %>%
group_by(age_cat) %>%
summarise(ecz = sum(eczema, na.rm = T),
pso = sum(psoriasis, na.rm = T))
ecz_pso_tbl_age
## # A tibble: 6 × 3
## age_cat ecz pso
## <fct> <dbl> <dbl>
## 1 [37,47] 2911 1025
## 2 (47,53] 2333 932
## 3 (53,58] 2233 1008
## 4 (58,61] 1556 809
## 5 (61,65] 2010 1007
## 6 (65,73] 1520 794
ecz_pso_tbl_age %>%
ungroup() %>%
left_join(ukb_n_age, by = "age_cat") %>%
mutate_at(c("ecz","pso"), ~(./n)*100)
## # A tibble: 6 × 4
## age_cat ecz pso n
## <fct> <dbl> <dbl> <int>
## 1 [37,47] 3.23 1.14 90200
## 2 (47,53] 2.66 1.06 87868
## 3 (53,58] 2.57 1.16 86917
## 4 (58,61] 2.21 1.15 70417
## 5 (61,65] 2.16 1.08 93139
## 6 (65,73] 2.06 1.07 73868
# mental illnesses at baseline ---------------------------------------------
mental_ill <- ecz_pso %>%
mutate(anxiety = as.numeric(illness_code == 1287),
depression = as.numeric(illness_code == 1286),
smi = as.numeric(illness_code %in% c(1289,1291,1290))
)
mental_ill_tbl <- mental_ill %>%
summarise_at(c("anxiety", "depression", "smi"), ~sum(., na.rm = T))
mental_ill_tbl
## # A tibble: 1 × 3
## anxiety depression smi
## <dbl> <dbl> <dbl>
## 1 6725 28244 2254
(mental_ill_tbl/ukb_n)*100
## anxiety depression smi
## 1 1.338551 5.621715 0.4486385
## collapse data
dt_mentalill <- setDT(mental_ill)
for (i in c("eczema", "psoriasis", "anxiety", "depression", "smi")) {
dt_mentalill[is.na(get(i)), (i):=0]
}
dt_mentalill_1row <- dt_mentalill[, lapply(.SD, max), .SDcols = c("eczema", "psoriasis", "anxiety", "depression", "smi"), by = f.eid]
dt_mentalill_1row_demo <- dt_mentalill[, .SD[1], by = f.eid]
dt_mentalill_1row_demo <- dt_mentalill_1row_demo[, -c("eczema", "psoriasis", "anxiety", "depression", "smi", "instance", "array", "illness_code")]
dt_mentalill_1row_combined <- dt_mentalill_1row_demo[dt_mentalill_1row, on = .(f.eid)]
df_1row <- tibble(dt_mentalill_1row_combined) %>%
ungroup()
twoXtwo(df= df_1row, exp = "eczema", out = "anxiety")
## # A tibble: 3 × 7
## exposure val No No_pc Yes Yes_pc Miss
## <chr> <chr> <int> <dbl> <int> <dbl> <int>
## 1 eczema 0 483423 98.7 6441 1.31 0
## 2 eczema 1 12263 97.8 282 2.25 0
## 3 eczema <NA> 0 NaN 0 NaN 0
twoXtwo(df= df_1row, exp = "eczema", out = "depression")
## # A tibble: 3 × 7
## exposure val No No_pc Yes Yes_pc Miss
## <chr> <chr> <int> <dbl> <int> <dbl> <int>
## 1 eczema 0 462751 94.5 27113 5.53 0
## 2 eczema 1 11459 91.3 1086 8.66 0
## 3 eczema <NA> 0 NaN 0 NaN 0
twoXtwo(df= df_1row, exp = "eczema", out = "smi")
## # A tibble: 3 × 7
## exposure val No No_pc Yes Yes_pc Miss
## <chr> <chr> <int> <dbl> <int> <dbl> <int>
## 1 eczema 0 487741 99.6 2123 0.433 0
## 2 eczema 1 12482 99.5 63 0.502 0
## 3 eczema <NA> 0 NaN 0 NaN 0
twoXtwo(df= df_1row, exp = "psoriasis", out = "anxiety")
## # A tibble: 3 × 7
## exposure val No No_pc Yes Yes_pc Miss
## <chr> <chr> <int> <dbl> <int> <dbl> <int>
## 1 psoriasis 0 490221 98.7 6614 1.33 0
## 2 psoriasis 1 5465 98.0 109 1.96 0
## 3 psoriasis <NA> 0 NaN 0 NaN 0
twoXtwo(df= df_1row, exp = "psoriasis", out = "depression")
## # A tibble: 3 × 7
## exposure val No No_pc Yes Yes_pc Miss
## <chr> <chr> <int> <dbl> <int> <dbl> <int>
## 1 psoriasis 0 469061 94.4 27774 5.59 0
## 2 psoriasis 1 5149 92.4 425 7.62 0
## 3 psoriasis <NA> 0 NaN 0 NaN 0
twoXtwo(df= df_1row, exp = "psoriasis", out = "smi")
## # A tibble: 3 × 7
## exposure val No No_pc Yes Yes_pc Miss
## <chr> <chr> <int> <dbl> <int> <dbl> <int>
## 1 psoriasis 0 494691 99.6 2144 0.432 0
## 2 psoriasis 1 5532 99.2 42 0.753 0
## 3 psoriasis <NA> 0 NaN 0 NaN 0
# eczema/psoriasis in linked records --------------------------------------
# 41270 Diagnoses - ICD10
# 41280 Date of first in-patient diagnosis - ICD10
ukb_hes <- ukb %>%
select(f.eid, starts_with("f.41270.0"), starts_with("f.41280.0")) %>%
left_join(ukb_descriptive, by = "f.eid")
# make long
ukb_hes_long <- ukb_hes %>%
pivot_longer(cols = c(contains("f.41270.0"), starts_with("f.41270.0"), starts_with("f.41280.0")),
names_to = c(".value", "array"),
names_pattern = "f.412(.*)0.0.(.*)") %>%
rename("icd10" = `7`, "diag_date" = `8`)
ukb_hes_long$icd10[ukb_hes_long$diag_date > ukb_hes_long$study_entry] <- NA ## replace any diagnoses AFTER recruitment with NA
anxiety_codelist <- read.delim(here::here("codelist/eczema_psych_anxiety_HES.txt"), sep = ",")
depression_codelist <- read.delim(here::here("codelist/eczema_psych_depression_HES.txt"), sep = ",")
eczema_codelist <- read.delim(here::here("codelist/eczema_psych_eczemaDx_HES.txt"), sep = ",")
eczema_codelist$ICD10codealternative <- str_remove_all(eczema_codelist$ICD10code, "\\.")
psoriasis_codelist <- read.delim(here::here("codelist/psoriasis_icd.txt"), sep = ",")
psoriasis_codelist$ICD10codealternative <- str_to_upper(str_remove_all(psoriasis_codelist$icd, "\\."))
smi_codelist <- read.delim(here::here("codelist/ICD10codes-SMI.txt"), sep = ",")
names(smi_codelist)[2] <- "ICD10codealternative"
hes_mental <- ukb_hes_long %>%
mutate(anxiety = as.numeric(icd10 %in% anxiety_codelist$ICD10codealternative),
depression = as.numeric(icd10 %in% depression_codelist$ICD10codealternative),
smi = as.numeric(icd10 %in% smi_codelist$ICD10codealternative),
eczema = as.numeric(icd10 %in% eczema_codelist$ICD10codealternative),
psoriasis = as.numeric(icd10 %in% psoriasis_codelist$ICD10codealternative)
)
hes_mental_tbl <- hes_mental %>%
summarise(anxiety = sum(anxiety, na.rm = T),
depression = sum(depression, na.rm = T),
smi = sum(smi, na.rm = T),
psoriasis = sum(psoriasis, na.rm = T),
eczema = sum(eczema, na.rm = T))
hes_mental_tbl
## # A tibble: 1 × 5
## anxiety depression smi psoriasis eczema
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2700 7103 2090 1362 718
(hes_mental_tbl/ukb_n)*100
## anxiety depression smi psoriasis eczema
## 1 0.5374108 1.413788 0.4159957 0.2710939 0.1429115
## 2x2 using HES pre-baseline only
dt_hes_mental <- setDT(hes_mental)
for (i in c("eczema", "psoriasis", "anxiety", "depression", "smi")) {
dt_hes_mental[is.na(get(i)), (i):=0]
}
dt_hes_mental_1row <- dt_hes_mental[, lapply(.SD, max), .SDcols = c("eczema", "psoriasis", "anxiety", "depression", "smi"), by = f.eid]
dt_hes_mental_1row_demo <- setDT(ukb_descriptive)
dt_hes_mental_1row_combined <- dt_hes_mental_1row_demo[dt_hes_mental_1row, on = .(f.eid)]
df_hes_1row <- tibble(dt_hes_mental_1row_combined) %>%
ungroup()
twoXtwo(df= df_hes_1row, exp = "eczema", out = "anxiety")
## # A tibble: 3 × 7
## exposure val No No_pc Yes Yes_pc Miss
## <chr> <chr> <int> <dbl> <int> <dbl> <int>
## 1 eczema 0 499237 99.5 2467 0.492 0
## 2 eczema 1 692 98.2 13 1.84 0
## 3 eczema <NA> 0 NaN 0 NaN 0
twoXtwo(df= df_hes_1row, exp = "eczema", out = "depression")
## # A tibble: 3 × 7
## exposure val No No_pc Yes Yes_pc Miss
## <chr> <chr> <int> <dbl> <int> <dbl> <int>
## 1 eczema 0 496132 98.9 5572 1.11 0
## 2 eczema 1 661 93.8 44 6.24 0
## 3 eczema <NA> 0 NaN 0 NaN 0
twoXtwo(df= df_hes_1row, exp = "eczema", out = "smi")
## # A tibble: 3 × 7
## exposure val No No_pc Yes Yes_pc Miss
## <chr> <chr> <int> <dbl> <int> <dbl> <int>
## 1 eczema 0 500342 99.7 1362 0.271 0
## 2 eczema 1 700 99.3 5 0.709 0
## 3 eczema <NA> 0 NaN 0 NaN 0
twoXtwo(df= df_hes_1row, exp = "psoriasis", out = "anxiety")
## # A tibble: 3 × 7
## exposure val No No_pc Yes Yes_pc Miss
## <chr> <chr> <int> <dbl> <int> <dbl> <int>
## 1 psoriasis 0 498738 99.5 2458 0.490 0
## 2 psoriasis 1 1191 98.2 22 1.81 0
## 3 psoriasis <NA> 0 NaN 0 NaN 0
twoXtwo(df= df_hes_1row, exp = "psoriasis", out = "depression")
## # A tibble: 3 × 7
## exposure val No No_pc Yes Yes_pc Miss
## <chr> <chr> <int> <dbl> <int> <dbl> <int>
## 1 psoriasis 0 495635 98.9 5561 1.11 0
## 2 psoriasis 1 1158 95.5 55 4.53 0
## 3 psoriasis <NA> 0 NaN 0 NaN 0
twoXtwo(df= df_hes_1row, exp = "psoriasis", out = "smi")
## # A tibble: 3 × 7
## exposure val No No_pc Yes Yes_pc Miss
## <chr> <chr> <int> <dbl> <int> <dbl> <int>
## 1 psoriasis 0 499838 99.7 1358 0.271 0
## 2 psoriasis 1 1204 99.3 9 0.742 0
## 3 psoriasis <NA> 0 NaN 0 NaN 0
## discordance - who has a mental illness code in HES but _not_ in assessment
diagnoses_list <- c("eczema", "psoriasis", "anxiety", "depression", "smi")
df_1row_combined <- df_1row %>%
rename_at(all_of(diagnoses_list), ~paste0(., "_assess")) %>%
left_join(df_hes_1row, by = c("f.eid", "age_at_recruit", "age_at_assess", "sex", "deprivation", "age_cat")) %>%
rename_at(all_of(diagnoses_list), ~paste0(., "_hes"))
print("Kappa for assessment|hes")
## [1] "Kappa for assessment|hes"
kappa_df <- NULL
for(X in diagnoses_list){
cont_table <- df_1row_combined %>%
select(contains(X)) %>%
group_by_all() %>%
tally() %>%
ungroup() %>%
spread(key = 2, value = n) %>%
select(2:3) %>%
as.matrix()
a <- cont_table[1,1]
b <- cont_table[1,2]
c <- cont_table[2,1]
d <- cont_table[2,2]
n <- a+b+c+d
p0 <- (a+d)/n
pY <- ((a+b)/n)*((a+c)/n)
pN <- ((c+d)/n)*((b+d)/n)
pE <- pY + pN
kappa <- ((p0-pE)/(1-pE))
print(c(X, signif(kappa,3)))
print.table(cont_table)
results <- data.frame(variable = X, kappa = kappa)
kappa_df <- kappa_df %>%
bind_rows(results)
}
## 0
## "eczema" "0.0288"
## 0 1
## [1,] 489367 497
## [2,] 12337 208
## 0
## "psoriasis" "0.168"
## 0 1
## [1,] 496204 631
## [2,] 4992 582
## 0
## "anxiety" "0.0549"
## 0 1
## [1,] 493490 2196
## [2,] 6439 284
## 0
## "depression" "0.137"
## 0 1
## [1,] 471182 3028
## [2,] 25611 2588
## 0
## "smi" "0.449"
## 0 1
## [1,] 499657 566
## [2,] 1385 801