library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(data.table)
## data.table 1.14.2 using 1 threads (see ?getDTthreads).  Latest news: r-datatable.com
## **********
## This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.
## This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.
## **********
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(ukbtools)
library(irr)
## Loading required package: lpSolve
library(skimr)
library(summarytools)
## 
## Attaching package: 'summarytools'
## 
## The following object is masked from 'package:tibble':
## 
##     view
library(cowplot)

dir.create(path = here::here("out"), showWarnings = F)
datapath <- "/Volumes/EHR Group/GPRD_GOLD/Ali/2022_biobank/"

ukb <- read_rds(paste0(datapath, "ukb669156.rds"))
my_ukb_key <- ukb_df_field("ukb669156", path = datapath)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
twoXtwo <- function(df, exp, out){
  df1 <- df %>% 
    ungroup() %>% 
    dplyr::select(exp = {{ exp }}, out = {{ out }})
  tab <- table(df1$exp, df1$out, useNA = "always")
  tab_p <- prop.table(tab,1)
  tibble(
    exposure = exp,
    val = rownames(tab),
    No = tab[, 1],
    No_pc = tab_p[, 1] * 100,
    Yes = tab[, 2],
    Yes_pc = tab_p[, 2] * 100,
    Miss = tab[, 3]
  )
}
my_ukb_key[grepl(pattern = "centre",x = my_ukb_key$col.name), ] %>% print(n= Inf)
## # A tibble: 12 × 5
##    field.showcase field.html field.tab   col.type             col.name                                      
##    <chr>          <chr>      <chr>       <chr>                <chr>                                         
##  1 53             53-0.0     f.53.0.0    Date                 date_of_attending_assessment_centre_f53_0_0   
##  2 53             53-1.0     f.53.1.0    Date                 date_of_attending_assessment_centre_f53_1_0   
##  3 53             53-2.0     f.53.2.0    Date                 date_of_attending_assessment_centre_f53_2_0   
##  4 53             53-3.0     f.53.3.0    Date                 date_of_attending_assessment_centre_f53_3_0   
##  5 55             55-0.0     f.55.0.0    Categorical (single) month_of_attending_assessment_centre_f55_0_0  
##  6 55             55-1.0     f.55.1.0    Categorical (single) month_of_attending_assessment_centre_f55_1_0  
##  7 55             55-2.0     f.55.2.0    Categorical (single) month_of_attending_assessment_centre_f55_2_0  
##  8 55             55-3.0     f.55.3.0    Categorical (single) month_of_attending_assessment_centre_f55_3_0  
##  9 21003          21003-0.0  f.21003.0.0 Integer              age_when_attended_assessment_centre_f21003_0_0
## 10 21003          21003-1.0  f.21003.1.0 Integer              age_when_attended_assessment_centre_f21003_1_0
## 11 21003          21003-2.0  f.21003.2.0 Integer              age_when_attended_assessment_centre_f21003_2_0
## 12 21003          21003-3.0  f.21003.3.0 Integer              age_when_attended_assessment_centre_f21003_3_0
# basic info about participants -------------------------------------------
ukb_descriptive <- ukb %>% 
  select(f.eid, 
         age_at_recruit = f.21022.0.0, 
         age_at_assess = f.21003.0.0,
         sex = f.31.0.0,
         deprivation = f.189.0.0,
         study_entry = f.53.0.0,
         ethnicity = f.21000.0.0,
         hh_inc = f.738.0.0,
         emplyment = f.6142.0.0,
         qualifications = f.6138.0.0
         ) %>% 
  mutate(age_cat = cut_number(age_at_assess, n = 6))

ukb_n <- dim(ukb_descriptive)[1]
ukb_n_age <- ukb_descriptive %>% 
  group_by(age_cat) %>% 
  summarise(n = n())
ukb_n_sex <- ukb_descriptive %>% 
  group_by(sex) %>% 
  summarise(n = n())

dfSummary(ukb_descriptive)
## Data Frame Summary  
## ukb_descriptive  
## Dimensions: 502409 x 11  
## Duplicates: 0  
## 
## ----------------------------------------------------------------------------------------------------------------------------
## No   Variable            Stats / Values                  Freqs (% of Valid)       Graph                 Valid      Missing  
## ---- ------------------- ------------------------------- ------------------------ --------------------- ---------- ---------
## 1    f.eid               Mean (sd) : 3512069 (1450338)   502409 distinct values   : : : : : : : : : :   502409     0        
##      [integer]           min < med < max:                                         : : : : : : : : : :   (100.0%)   (0.0%)   
##                          1000018 < 3512066 < 6024123                              : : : : : : : : : :                       
##                          IQR (CV) : 2512043 (0.4)                                 : : : : : : : : : :                       
##                                                                                   : : : : : : : : : :                       
## 
## 2    age_at_recruit      Mean (sd) : 56.5 (8.1)          37 distinct values                   :         502406     3        
##      [integer]           min < med < max:                                                 .   : . .     (100.0%)   (0.0%)   
##                          37 < 58 < 73                                               .   : : : : : :                         
##                          IQR (CV) : 13 (0.1)                                        : : : : : : : :                         
##                                                                                   . : : : : : : : :                         
## 
## 3    age_at_assess       Mean (sd) : 56.5 (8.1)          37 distinct values                   :         502409     0        
##      [integer]           min < med < max:                                                 .   : . .     (100.0%)   (0.0%)   
##                          37 < 58 < 73                                               .   : : : : : :                         
##                          IQR (CV) : 13 (0.1)                                        : : : : : : : :                         
##                                                                                   . : : : : : : : :                         
## 
## 4    sex                 1. Female                       273325 (54.4%)           IIIIIIIIII            502409     0        
##      [ordered, factor]   2. Male                         229084 (45.6%)           IIIIIIIII             (100.0%)   (0.0%)   
## 
## 5    deprivation         Mean (sd) : -1.3 (3.1)          57716 distinct values      :                   501783     626      
##      [numeric]           min < med < max:                                           : :                 (99.9%)    (0.1%)   
##                          -6.3 < -2.1 < 11                                           : : .                                   
##                          IQR (CV) : 4.2 (-2.4)                                    : : : : .                                 
##                                                                                   : : : : : : : .                           
## 
## 6    study_entry         min : 2006-03-13                1068 distinct values             : : . .       502409     0        
##      [Date]              med : 2009-01-26                                                 : : : : .     (100.0%)   (0.0%)   
##                          max : 2010-10-01                                                 : : : : : .                       
##                          range : 4y 6m 18d                                              : : : : : : :                       
##                                                                                       . : : : : : : :                       
## 
## 7    ethnicity           1. Prefer not to answer           1661 ( 0.3%)                                 501509     900      
##      [ordered, factor]   2. Do not know                     217 ( 0.0%)                                 (99.8%)    (0.2%)   
##                          3. White                           569 ( 0.1%)                                                     
##                          4. Mixed                            49 ( 0.0%)                                                     
##                          5. Asian or Asian British           43 ( 0.0%)                                                     
##                          6. Black or Black British           27 ( 0.0%)                                                     
##                          7. Chinese                        1573 ( 0.3%)                                                     
##                          8. Other ethnic group             4557 ( 0.9%)                                                     
##                          9. British                      442514 (88.2%)           IIIIIIIIIIIIIIIII                         
##                          10. Irish                        13201 ( 2.6%)                                                     
##                          [ 12 others ]                    37098 ( 7.4%)           I                                         
## 
## 8    hh_inc              1. Prefer not to answer          49830 (10.0%)           II                    496395     6014     
##      [ordered, factor]   2. Do not know                   21300 ( 4.3%)                                 (98.8%)    (1.2%)   
##                          3. Less than 18,000              97180 (19.6%)           III                                       
##                          4. 18,000 to 30,999             108157 (21.8%)           IIII                                      
##                          5. 31,000 to 51,999             110751 (22.3%)           IIII                                      
##                          6. 52,000 to 100,000             86250 (17.4%)           III                                       
##                          7. Greater than 100,000          22927 ( 4.6%)                                                     
## 
## 9    emplyment           1. None of the above              2800 ( 0.6%)                                 501537     872      
##      [ordered, factor]   2. Prefer not to answer           2079 ( 0.4%)                                 (99.8%)    (0.2%)   
##                          3. In paid employment or sel    287080 (57.2%)           IIIIIIIIIII                               
##                          4. Retired                      166968 (33.3%)           IIIIII                                    
##                          5. Looking after home and/or     13855 ( 2.8%)                                                     
##                          6. Unable to work because of     16823 ( 3.4%)                                                     
##                          7. Unemployed                     8263 ( 1.6%)                                                     
##                          8. Doing unpaid or voluntary      2326 ( 0.5%)                                                     
##                          9. Full or part-time student      1343 ( 0.3%)                                                     
## 
## 10   qualifications      1. None of the above             85259 (17.1%)           III                   497767     4642     
##      [ordered, factor]   2. Prefer not to answer           5490 ( 1.1%)                                 (99.1%)    (0.9%)   
##                          3. College or University deg    161127 (32.4%)           IIIIII                                    
##                          4. A levels/AS levels or equ     55308 (11.1%)           II                                        
##                          5. O levels/GCSEs or equival    105175 (21.1%)           IIII                                      
##                          6. CSEs or equivalent            26885 ( 5.4%)           I                                         
##                          7. NVQ or HND or HNC or equi     32724 ( 6.6%)           I                                         
##                          8. Other professional qualif     25799 ( 5.2%)           I                                         
## 
## 11   age_cat             1. [37,47]                      90200 (18.0%)            III                   502409     0        
##      [factor]            2. (47,53]                      87868 (17.5%)            III                   (100.0%)   (0.0%)   
##                          3. (53,58]                      86917 (17.3%)            III                                       
##                          4. (58,61]                      70417 (14.0%)            II                                        
##                          5. (61,65]                      93139 (18.5%)            III                                       
##                          6. (65,73]                      73868 (14.7%)            II                                        
## ----------------------------------------------------------------------------------------------------------------------------
theme_ukb <- function(){
  theme_ali() +
    theme(legend.position = "none", 
          axis.title = element_text(face = "bold"), 
          panel.grid = element_blank())
}
cols <- c("Female" = "grey35", "Male" = "hotpink")
fills <- c("Femaled" = "grey35", "Male" = "hotpink")

p_sex <- ggplot(ukb_descriptive, aes(y = sex, fill = sex)) + 
  geom_bar(aes(x = (..count..)*100/sum(..count..)), position = "dodge", na.rm = TRUE, width = 0.7, alpha = 1) + 
  labs(y = "Sex", x = "%") +
  scale_fill_manual(values = c("grey35", "hotpink"), na.value = "grey65") + 
  scale_x_continuous(limits = c(0,100)) +
  theme_ukb()

p_age <- ggplot(ukb_descriptive, aes(x = age_at_assess, fill = sex, col = sex)) +
  geom_density(alpha = 0.25, na.rm = TRUE) +
  scale_colour_manual(values = cols,
                      aesthetics = c("color", "fill")) + 
  labs(x = "Age at assessment", y = "Density") +
  theme_ukb()

p_entry <- ggplot(ukb_descriptive, aes(x = study_entry, fill = sex, col = sex)) +
  geom_density(alpha = 0.25, na.rm = TRUE) +
  scale_colour_manual(values = cols,
                      aesthetics = c("color", "fill")) + 
  labs(x = "Study entry", y = "Density") +
  theme_ukb()

p_dep <- ggplot(ukb_descriptive, aes(x = deprivation, fill = sex, col = sex)) +
  geom_density(alpha = 0.25, na.rm = TRUE) +
  scale_colour_manual(values = cols,
                      aesthetics = c("color", "fill")) + 
  labs(x = "Townsend deprivation score", y = "Density") +
  theme_ukb()

p_qual <- ggplot(ukb_descriptive, aes(y = qualifications, fill = sex)) +
  geom_bar(aes(x = (..count..)/1000), position = "dodge", na.rm = TRUE, width = 0.7) + 
  scale_fill_manual(values = c("grey35", "hotpink")) + 
  labs(y = "Qualifications", x = "Count (thousands)") +
  theme_ukb()

p_income <- ggplot(ukb_descriptive, aes(y = hh_inc, fill = sex)) +
  geom_bar(aes(x = (..count..)*100/sum(..count..)), position = "dodge", na.rm = TRUE, width = 0.7) + 
  scale_fill_manual(values = c("grey35", "hotpink")) + 
  labs(y = "Household Income", x = "%") +
  theme_ukb()

p_ethnicity <- ggplot(ukb_descriptive, aes(y = ethnicity, fill = sex)) + 
  geom_bar(aes(x = (..count..)*100/sum(..count..)), position = "dodge", na.rm = TRUE, width = 0.7) + 
  scale_fill_manual(values = c("grey35", "hotpink")) + 
  labs(y = "Ethnic Background", x = "%") +
  theme_ukb()

p_employ <- ggplot(ukb_descriptive, aes(y = emplyment, fill = sex)) + 
  geom_bar(aes(x = (..count..)*100/sum(..count..)), position = "dodge", na.rm = TRUE, width = 0.7) + 
  scale_fill_manual(values = c("grey35", "hotpink")) + 
  labs(y = "Employment", x = "%") +
  theme_ukb()

pdf(here::here("out/baseline_cohort_characteristics.pdf"), width = 12, height = 8)
  plot_grid(p_sex, p_ethnicity, p_age, p_employ, p_dep, p_income, p_entry, p_qual, ncol = 2)
dev.off()
## quartz_off_screen 
##                 2
# diagnoses ---------------------------------------------------------------
ukbdiagnoses_baseline <- ukb %>% 
  select(f.eid, contains("20002.0")) %>% 
  left_join(ukb_descriptive, by = "f.eid")

# make long 
ukbdiagnoses_baseline_long <- ukbdiagnoses_baseline %>% 
  pivot_longer(cols = contains("20002.0"), values_to = "illness_code", 
               names_to = c("instance", "array"),
               names_pattern = "f.20002.(.).(.*)")

# skin disease at baseline ------------------------------------------------
ecz_pso <- ukbdiagnoses_baseline_long %>% 
  mutate(eczema = as.numeric(illness_code == 1452),
         psoriasis = as.numeric(illness_code == 1453)) 

ecz_pso_tbl <- ecz_pso %>% 
  summarise(ecz = sum(eczema, na.rm = T), 
            pso = sum(psoriasis, na.rm = T))
ecz_pso_tbl
## # A tibble: 1 × 2
##     ecz   pso
##   <dbl> <dbl>
## 1 12563  5575
(ecz_pso_tbl/ukb_n)*100
##        ecz      pso
## 1 2.500552 1.109654
ecz_pso_tbl_age <- ecz_pso %>% 
  group_by(age_cat) %>% 
  summarise(ecz = sum(eczema, na.rm = T), 
            pso = sum(psoriasis, na.rm = T))
ecz_pso_tbl_age
## # A tibble: 6 × 3
##   age_cat   ecz   pso
##   <fct>   <dbl> <dbl>
## 1 [37,47]  2911  1025
## 2 (47,53]  2333   932
## 3 (53,58]  2233  1008
## 4 (58,61]  1556   809
## 5 (61,65]  2010  1007
## 6 (65,73]  1520   794
ecz_pso_tbl_age %>% 
  ungroup() %>% 
  left_join(ukb_n_age, by = "age_cat") %>% 
  mutate_at(c("ecz","pso"), ~(./n)*100)
## # A tibble: 6 × 4
##   age_cat   ecz   pso     n
##   <fct>   <dbl> <dbl> <int>
## 1 [37,47]  3.23  1.14 90200
## 2 (47,53]  2.66  1.06 87868
## 3 (53,58]  2.57  1.16 86917
## 4 (58,61]  2.21  1.15 70417
## 5 (61,65]  2.16  1.08 93139
## 6 (65,73]  2.06  1.07 73868
# mental illnesses at baseline ---------------------------------------------
mental_ill <- ecz_pso %>% 
  mutate(anxiety = as.numeric(illness_code == 1287),
         depression = as.numeric(illness_code == 1286),
         smi = as.numeric(illness_code %in% c(1289,1291,1290))
         ) 
mental_ill_tbl <- mental_ill %>% 
  summarise_at(c("anxiety", "depression", "smi"), ~sum(., na.rm = T))
mental_ill_tbl
## # A tibble: 1 × 3
##   anxiety depression   smi
##     <dbl>      <dbl> <dbl>
## 1    6725      28244  2254
(mental_ill_tbl/ukb_n)*100
##    anxiety depression       smi
## 1 1.338551   5.621715 0.4486385
## collapse data
dt_mentalill <- setDT(mental_ill)
for (i in c("eczema", "psoriasis", "anxiety", "depression", "smi")) {
  dt_mentalill[is.na(get(i)), (i):=0]
}

dt_mentalill_1row <- dt_mentalill[, lapply(.SD, max), .SDcols = c("eczema", "psoriasis", "anxiety", "depression", "smi"), by = f.eid]
dt_mentalill_1row_demo <- dt_mentalill[, .SD[1], by = f.eid]
dt_mentalill_1row_demo <- dt_mentalill_1row_demo[, -c("eczema", "psoriasis", "anxiety", "depression", "smi", "instance", "array", "illness_code")]
dt_mentalill_1row_combined <- dt_mentalill_1row_demo[dt_mentalill_1row, on = .(f.eid)]

df_1row <- tibble(dt_mentalill_1row_combined) %>% 
  ungroup()

twoXtwo(df= df_1row, exp = "eczema", out = "anxiety")
## # A tibble: 3 × 7
##   exposure val       No No_pc   Yes Yes_pc  Miss
##   <chr>    <chr>  <int> <dbl> <int>  <dbl> <int>
## 1 eczema   0     483423  98.7  6441   1.31     0
## 2 eczema   1      12263  97.8   282   2.25     0
## 3 eczema   <NA>       0 NaN       0 NaN        0
twoXtwo(df= df_1row, exp = "eczema", out = "depression")
## # A tibble: 3 × 7
##   exposure val       No No_pc   Yes Yes_pc  Miss
##   <chr>    <chr>  <int> <dbl> <int>  <dbl> <int>
## 1 eczema   0     462751  94.5 27113   5.53     0
## 2 eczema   1      11459  91.3  1086   8.66     0
## 3 eczema   <NA>       0 NaN       0 NaN        0
twoXtwo(df= df_1row, exp = "eczema", out = "smi")
## # A tibble: 3 × 7
##   exposure val       No No_pc   Yes  Yes_pc  Miss
##   <chr>    <chr>  <int> <dbl> <int>   <dbl> <int>
## 1 eczema   0     487741  99.6  2123   0.433     0
## 2 eczema   1      12482  99.5    63   0.502     0
## 3 eczema   <NA>       0 NaN       0 NaN         0
twoXtwo(df= df_1row, exp = "psoriasis", out = "anxiety")
## # A tibble: 3 × 7
##   exposure  val       No No_pc   Yes Yes_pc  Miss
##   <chr>     <chr>  <int> <dbl> <int>  <dbl> <int>
## 1 psoriasis 0     490221  98.7  6614   1.33     0
## 2 psoriasis 1       5465  98.0   109   1.96     0
## 3 psoriasis <NA>       0 NaN       0 NaN        0
twoXtwo(df= df_1row, exp = "psoriasis", out = "depression")
## # A tibble: 3 × 7
##   exposure  val       No No_pc   Yes Yes_pc  Miss
##   <chr>     <chr>  <int> <dbl> <int>  <dbl> <int>
## 1 psoriasis 0     469061  94.4 27774   5.59     0
## 2 psoriasis 1       5149  92.4   425   7.62     0
## 3 psoriasis <NA>       0 NaN       0 NaN        0
twoXtwo(df= df_1row, exp = "psoriasis", out = "smi")
## # A tibble: 3 × 7
##   exposure  val       No No_pc   Yes  Yes_pc  Miss
##   <chr>     <chr>  <int> <dbl> <int>   <dbl> <int>
## 1 psoriasis 0     494691  99.6  2144   0.432     0
## 2 psoriasis 1       5532  99.2    42   0.753     0
## 3 psoriasis <NA>       0 NaN       0 NaN         0
# eczema/psoriasis in linked records --------------------------------------
# 41270 Diagnoses - ICD10
# 41280 Date of first in-patient diagnosis - ICD10

ukb_hes <- ukb %>% 
  select(f.eid, starts_with("f.41270.0"), starts_with("f.41280.0")) %>% 
  left_join(ukb_descriptive, by = "f.eid")

# make long 
ukb_hes_long <- ukb_hes %>% 
  pivot_longer(cols = c(contains("f.41270.0"), starts_with("f.41270.0"), starts_with("f.41280.0")), 
               names_to = c(".value", "array"),
               names_pattern = "f.412(.*)0.0.(.*)") %>% 
  rename("icd10" = `7`, "diag_date" = `8`)

ukb_hes_long$icd10[ukb_hes_long$diag_date > ukb_hes_long$study_entry] <- NA ## replace any diagnoses AFTER recruitment with NA

anxiety_codelist <- read.delim(here::here("codelist/eczema_psych_anxiety_HES.txt"), sep = ",")
depression_codelist <- read.delim(here::here("codelist/eczema_psych_depression_HES.txt"), sep = ",")
eczema_codelist <- read.delim(here::here("codelist/eczema_psych_eczemaDx_HES.txt"), sep = ",")
  eczema_codelist$ICD10codealternative <- str_remove_all(eczema_codelist$ICD10code, "\\.")
psoriasis_codelist <- read.delim(here::here("codelist/psoriasis_icd.txt"), sep = ",")
  psoriasis_codelist$ICD10codealternative <- str_to_upper(str_remove_all(psoriasis_codelist$icd, "\\."))
smi_codelist <- read.delim(here::here("codelist/ICD10codes-SMI.txt"), sep = ",")
  names(smi_codelist)[2] <- "ICD10codealternative"

hes_mental <- ukb_hes_long %>% 
  mutate(anxiety = as.numeric(icd10 %in% anxiety_codelist$ICD10codealternative),
         depression = as.numeric(icd10 %in% depression_codelist$ICD10codealternative),
         smi = as.numeric(icd10 %in% smi_codelist$ICD10codealternative),
         eczema = as.numeric(icd10 %in% eczema_codelist$ICD10codealternative),
         psoriasis = as.numeric(icd10 %in% psoriasis_codelist$ICD10codealternative)
         )

hes_mental_tbl <- hes_mental %>% 
  summarise(anxiety = sum(anxiety, na.rm = T), 
            depression = sum(depression, na.rm = T),
            smi = sum(smi, na.rm = T),
            psoriasis = sum(psoriasis, na.rm = T),
            eczema = sum(eczema, na.rm = T))
hes_mental_tbl
## # A tibble: 1 × 5
##   anxiety depression   smi psoriasis eczema
##     <dbl>      <dbl> <dbl>     <dbl>  <dbl>
## 1    2700       7103  2090      1362    718
(hes_mental_tbl/ukb_n)*100
##     anxiety depression       smi psoriasis    eczema
## 1 0.5374108   1.413788 0.4159957 0.2710939 0.1429115
## 2x2 using HES pre-baseline only
dt_hes_mental <- setDT(hes_mental)
for (i in c("eczema", "psoriasis", "anxiety", "depression", "smi")) {
  dt_hes_mental[is.na(get(i)), (i):=0]
}

dt_hes_mental_1row <- dt_hes_mental[, lapply(.SD, max), .SDcols = c("eczema", "psoriasis", "anxiety", "depression", "smi"), by = f.eid]
dt_hes_mental_1row_demo <- setDT(ukb_descriptive)
dt_hes_mental_1row_combined <- dt_hes_mental_1row_demo[dt_hes_mental_1row, on = .(f.eid)]

df_hes_1row <- tibble(dt_hes_mental_1row_combined) %>% 
  ungroup()

twoXtwo(df= df_hes_1row, exp = "eczema", out = "anxiety")
## # A tibble: 3 × 7
##   exposure val       No No_pc   Yes  Yes_pc  Miss
##   <chr>    <chr>  <int> <dbl> <int>   <dbl> <int>
## 1 eczema   0     499237  99.5  2467   0.492     0
## 2 eczema   1        692  98.2    13   1.84      0
## 3 eczema   <NA>       0 NaN       0 NaN         0
twoXtwo(df= df_hes_1row, exp = "eczema", out = "depression")
## # A tibble: 3 × 7
##   exposure val       No No_pc   Yes Yes_pc  Miss
##   <chr>    <chr>  <int> <dbl> <int>  <dbl> <int>
## 1 eczema   0     496132  98.9  5572   1.11     0
## 2 eczema   1        661  93.8    44   6.24     0
## 3 eczema   <NA>       0 NaN       0 NaN        0
twoXtwo(df= df_hes_1row, exp = "eczema", out = "smi")
## # A tibble: 3 × 7
##   exposure val       No No_pc   Yes  Yes_pc  Miss
##   <chr>    <chr>  <int> <dbl> <int>   <dbl> <int>
## 1 eczema   0     500342  99.7  1362   0.271     0
## 2 eczema   1        700  99.3     5   0.709     0
## 3 eczema   <NA>       0 NaN       0 NaN         0
twoXtwo(df= df_hes_1row, exp = "psoriasis", out = "anxiety")
## # A tibble: 3 × 7
##   exposure  val       No No_pc   Yes  Yes_pc  Miss
##   <chr>     <chr>  <int> <dbl> <int>   <dbl> <int>
## 1 psoriasis 0     498738  99.5  2458   0.490     0
## 2 psoriasis 1       1191  98.2    22   1.81      0
## 3 psoriasis <NA>       0 NaN       0 NaN         0
twoXtwo(df= df_hes_1row, exp = "psoriasis", out = "depression")
## # A tibble: 3 × 7
##   exposure  val       No No_pc   Yes Yes_pc  Miss
##   <chr>     <chr>  <int> <dbl> <int>  <dbl> <int>
## 1 psoriasis 0     495635  98.9  5561   1.11     0
## 2 psoriasis 1       1158  95.5    55   4.53     0
## 3 psoriasis <NA>       0 NaN       0 NaN        0
twoXtwo(df= df_hes_1row, exp = "psoriasis", out = "smi")
## # A tibble: 3 × 7
##   exposure  val       No No_pc   Yes  Yes_pc  Miss
##   <chr>     <chr>  <int> <dbl> <int>   <dbl> <int>
## 1 psoriasis 0     499838  99.7  1358   0.271     0
## 2 psoriasis 1       1204  99.3     9   0.742     0
## 3 psoriasis <NA>       0 NaN       0 NaN         0
## discordance - who has a mental illness code in HES but _not_ in assessment
diagnoses_list <- c("eczema", "psoriasis", "anxiety", "depression", "smi")
df_1row_combined <- df_1row %>% 
  rename_at(all_of(diagnoses_list), ~paste0(., "_assess")) %>% 
  left_join(df_hes_1row, by = c("f.eid", "age_at_recruit", "age_at_assess", "sex", "deprivation", "age_cat")) %>% 
  rename_at(all_of(diagnoses_list), ~paste0(., "_hes")) 

print("Kappa for assessment|hes")
## [1] "Kappa for assessment|hes"
kappa_df <- NULL
for(X in diagnoses_list){
  cont_table <- df_1row_combined %>% 
    select(contains(X)) %>% 
    group_by_all() %>% 
    tally() %>% 
    ungroup() %>% 
    spread(key = 2, value = n) %>% 
    select(2:3) %>% 
    as.matrix()
  a <- cont_table[1,1]
  b <- cont_table[1,2]
  c <- cont_table[2,1]
  d <- cont_table[2,2]
  n <- a+b+c+d
  p0 <- (a+d)/n
  pY <- ((a+b)/n)*((a+c)/n)
  pN <- ((c+d)/n)*((b+d)/n)
  pE <- pY + pN
  kappa <- ((p0-pE)/(1-pE))
  print(c(X, signif(kappa,3)))
  print.table(cont_table)
  results <- data.frame(variable = X, kappa = kappa)
  kappa_df <- kappa_df %>% 
    bind_rows(results)
}
##                 0 
## "eczema" "0.0288" 
##           0      1
## [1,] 489367    497
## [2,]  12337    208
##                       0 
## "psoriasis"     "0.168" 
##           0      1
## [1,] 496204    631
## [2,]   4992    582
##                   0 
## "anxiety"  "0.0549" 
##           0      1
## [1,] 493490   2196
## [2,]   6439    284
##                         0 
## "depression"      "0.137" 
##           0      1
## [1,] 471182   3028
## [2,]  25611   2588
##               0 
##   "smi" "0.449" 
##           0      1
## [1,] 499657    566
## [2,]   1385    801