rm(list = ls())
library(dplyr)
library(pander)
library(ggplot2)
panderOptions('table.split.table', Inf)
source('../code/rlib_doc.R')

1 About

We analyze the quantitative traits analyzed by Martin et al.

s6 = read.csv('../external_data/martin_et_al_2019ng_table_s6.csv')
s6 %>% pander
Trait Ntotal..BBJ. NGWAS..BBJ…UKBB. Ntarget..BBJ…UKBB. UKBB.code X..BBJ.clumps X..UKBB.clumps
Basophil 87665 82665 5000 30160 8939 8690
BMI 155426 150426 5000 21001 19114 21339
DBP 137991 132991 5000 4079 9865 14213
Eosinophil 88675 83675 5000 30150 9266 13061
Hb 144653 139653 5000 30020 10483 16184
Height 156569 151569 5000 50 37216 31854
Ht 144947 139947 5000 30030 10554 15408
Lymphocyte 91157 86157 5000 30120 9400 13648
MCH 121249 116249 5000 30050 12598 15222
MCHC 128232 123232 5000 30060 10272 10074
MCV 122912 117912 5000 30040 13169 16354
Monocyte 90593 85593 5000 30130 10886 13452
Neutrophil 79287 74287 5000 30140 9150 12211
Platelet 140610 135610 5000 30080 14843 19259
RBC 145426 140426 5000 30010 12467 18069
SBP 137981 132981 5000 4080 11231 14562
WBC 146158 141158 5000 30000 12664 17581

I queried these traits by UKB code where I queried all instances and arrays see details here. So, I need to aggregate the phenotypes (from multiple instances and arrays in some way).

2 Load data

Here I load the data passing population QC see details here.

dat = readRDS('../output/query_first_attempt_with_population_qc.rds')
dat = dat$dat_pass_pop_qc %>% select(-bmi, -height)  # remove the two test traits which are duplicated from the Martin et al phenotypes 

3 Clean up steps

First of all, let’s see how many instances and how many arraies each trait has.

trait_meta = list()
cols_relavant = colnames(dat)
cols_relavant = cols_relavant[!is.na(stringr::str_match(cols_relavant, '_x_'))]
parsed_col = as.data.frame(do.call(rbind, strsplit(cols_relavant, '_x_')))
colnames(parsed_col) = c('trait', 'instance', 'array')
parsed_col = parsed_col %>% mutate(instance = stringr::str_remove(instance, 'instance_'), array = stringr::str_remove(array, 'array_'))
parsed_col = parsed_col %>% filter(trait != 'ethnicity')  # remove ethnicity since this part is taken care of in population_assignment.Rmd
parsed_col %>% group_by(trait) %>% summarize(ninstance = length(unique(instance))) %>% pander(caption = 'number of instances')
number of instances
trait ninstance
basophil 3
bmi 3
dbp 3
eosinophil 3
hb 3
height 3
ht 3
lymphocyte 3
mch 3
mchc 3
mcv 3
monocyte 3
neutrophil 3
platelet 3
rbc 3
sbp 3
wbc 3
trait_instance_with_multi_array = parsed_col %>% group_by(trait, instance) %>% summarize(number_of_array = length(unique(array))) %>% ungroup() %>% filter(number_of_array > 1) 
trait_instance_with_multi_array %>% pander(caption = 'trait/instance pair with more than one array')
trait/instance pair with more than one array
trait instance number_of_array
dbp 0 2
dbp 1 2
dbp 2 2
sbp 0 2
sbp 1 2
sbp 2 2

OK, let’s first see how the two array varies.

subsample = subsample_for_vis(group = dat$meaning, nmax = 1500)
for(i in 1 : nrow(trait_instance_with_multi_array)) {
  this = trait_instance_with_multi_array[i, ]
  prefix = paste0(this$trait, '_x_', 'instance_', this$instance, '_x_', 'array_')
  cols = paste0(prefix, 0 : (this$number_of_array - 1))
  sub = dat[subsample, c('meaning', cols)] 
  sub = sub[rowSums(!is.na(sub)) > 0, ]
  print(myggpairs(sub %>% select(-meaning), sub$meaning) + geom_abline(intercept = 0, slope = 1))
}
## Warning: Removed 343 rows containing missing values (geom_point).

## Warning: Removed 5792 rows containing missing values (geom_point).

## Warning: Removed 5690 rows containing missing values (geom_point).

## Warning: Removed 343 rows containing missing values (geom_point).

## Warning: Removed 5792 rows containing missing values (geom_point).

## Warning: Removed 5690 rows containing missing values (geom_point).

It seems that taking the values from multiple arraies are quite consistent so that we aggregate array by taking the average.

dat_old = dat
for(i in 1 : nrow(trait_instance_with_multi_array)) {
  this = trait_instance_with_multi_array[i, ]
  prefix = paste0(this$trait, '_x_', 'instance_', this$instance, '_x_', 'array_')
  cols = paste0(prefix, 0 : (this$number_of_array - 1))
  newcol = data.frame(x = aggregate_instances(dat[, cols], aggregate_instances))
  colnames(newcol) = paste0(prefix, 'agg')
  dat = cbind(dat, newcol)
  dat[, cols] = NULL
}

Then, we proceed to aggregate across multiple instances. And here we use the first non-NA value as the value of the phenotype for that individual.

for(i in unique(parsed_col$trait)) {
  this = parsed_col %>% filter(trait == i)
  suffix = '_x_array_0'
  if(i %in% trait_instance_with_multi_array$trait) {
    suffix = '_x_array_agg'
  }
  prefix = paste0(i, '_x_', 'instance_')
  cols = paste0(prefix, unique(this$instance), suffix)
  newcol = data.frame(x = aggregate_instances(dat[, cols], first_non_na))
  colnames(newcol) = i
  dat = cbind(dat, newcol)
  dat[, cols] = NULL
}
martin_traits = as.character(unique(parsed_col$trait))
summary(dat[, martin_traits])
##      height           dbp              sbp             bmi       
##  Min.   : 75.0   Min.   : 32.00   Min.   : 62.0   Min.   :12.12  
##  1st Qu.:162.0   1st Qu.: 75.00   1st Qu.:126.0   1st Qu.:24.14  
##  Median :168.0   Median : 82.00   Median :139.0   Median :26.73  
##  Mean   :168.7   Mean   : 82.29   Mean   :140.1   Mean   :27.41  
##  3rd Qu.:175.0   3rd Qu.: 89.00   3rd Qu.:152.0   3rd Qu.:29.86  
##  Max.   :209.0   Max.   :148.00   Max.   :264.0   Max.   :74.68  
##  NA's   :992     NA's   :21354    NA's   :21357   NA's   :1388   
##       wbc               rbc              hb              ht       
##  Min.   :  0.000   Min.   :0.006   Min.   : 0.12   Min.   : 0.05  
##  1st Qu.:  5.640   1st Qu.:4.238   1st Qu.:13.37   1st Qu.:38.74  
##  Median :  6.650   Median :4.504   Median :14.19   Median :41.10  
##  Mean   :  6.883   Mean   :4.516   Mean   :14.20   Mean   :41.13  
##  3rd Qu.:  7.840   3rd Qu.:4.789   3rd Qu.:15.06   3rd Qu.:43.52  
##  Max.   :189.500   Max.   :7.911   Max.   :22.27   Max.   :72.48  
##  NA's   :11031     NA's   :11027   NA's   :11028   NA's   :11027  
##       mcv              mch             mchc          platelet     
##  Min.   : 53.17   Min.   : 0.00   Min.   :16.10   Min.   :   0.3  
##  1st Qu.: 88.66   1st Qu.:30.53   1st Qu.:33.90   1st Qu.: 213.3  
##  Median : 91.30   Median :31.52   Median :34.49   Median : 248.0  
##  Mean   : 91.23   Mean   :31.51   Mean   :34.53   Mean   : 253.1  
##  3rd Qu.: 93.92   3rd Qu.:32.52   3rd Qu.:35.10   3rd Qu.: 287.0  
##  Max.   :143.00   Max.   :95.67   Max.   :97.30   Max.   :1821.0  
##  NA's   :11028    NA's   :11031   NA's   :11033   NA's   :11031   
##    lymphocyte         monocyte        neutrophil       eosinophil   
##  Min.   :  0.000   Min.   : 0.000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.:  1.500   1st Qu.: 0.370   1st Qu.: 3.290   1st Qu.:0.100  
##  Median :  1.870   Median : 0.450   Median : 4.030   Median :0.140  
##  Mean   :  1.955   Mean   : 0.477   Mean   : 4.236   Mean   :0.174  
##  3rd Qu.:  2.280   3rd Qu.: 0.570   3rd Qu.: 4.980   3rd Qu.:0.210  
##  Max.   :177.000   Max.   :34.260   Max.   :52.020   Max.   :9.600  
##  NA's   :11704     NA's   :11704    NA's   :11704    NA's   :11704  
##     basophil    
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.020  
##  Mean   :0.034  
##  3rd Qu.:0.040  
##  Max.   :2.600  
##  NA's   :11704

To make the analysis easier to work with, let’s try to limit to individual with all phenotypes being non-missing.

all_none_missing = rowSums(is.na(dat[, martin_traits])) == 0
dat %>% filter(all_none_missing) %>% group_by(meaning) %>% summarize(nindiv = n()) %>% pander(caption = 'Number of individuals after limiting to ones with all phenotypes non-missing')
Number of individuals after limiting to ones with all phenotypes non-missing
meaning nindiv
African 2835
British 356476
Chinese 1326
Indian 4789
dat_cleaned = dat %>% filter(all_none_missing) %>% select(-ethnicity_x_instance_0_x_array_0, -ethnicity_x_instance_1_x_array_0, -ethnicity_x_instance_2_x_array_0)

4 Take a quick look at phenotypes

The breif summary of the cleaned-up data is as follow and we save it as CSV file before proceeding.

dat_cleaned %>% summary
##       eid          age_recruitment      sex              pc1        
##  Min.   :1000020   Min.   :39.00   Min.   :0.0000   Min.   :-19.27  
##  1st Qu.:2256438   1st Qu.:51.00   1st Qu.:0.0000   1st Qu.:-13.37  
##  Median :3512290   Median :58.00   Median :0.0000   Median :-12.27  
##  Mean   :3512858   Mean   :56.82   Mean   :0.4649   Mean   : -7.26  
##  3rd Qu.:4768155   3rd Qu.:63.00   3rd Qu.:1.0000   3rd Qu.:-11.08  
##  Max.   :6026402   Max.   :73.00   Max.   :1.0000   Max.   :418.38  
##                                                                     
##       pc2                pc3                 pc4          
##  Min.   :-282.317   Min.   :-143.2580   Min.   :-70.6994  
##  1st Qu.:   2.656   1st Qu.:  -2.6521   1st Qu.: -0.8197  
##  Median :   3.741   Median :  -1.5475   Median :  1.0735  
##  Mean   :   1.702   Mean   :  -0.9098   Mean   :  1.0362  
##  3rd Qu.:   4.788   3rd Qu.:  -0.4021   3rd Qu.:  3.1504  
##  Max.   :  86.112   Max.   :  98.8226   Max.   : 37.1031  
##                                                           
##       pc5                pc6                pc7          
##  Min.   :-17.7238   Min.   :-20.8499   Min.   :-21.1385  
##  1st Qu.: -5.6678   1st Qu.: -1.5241   1st Qu.: -1.0023  
##  Median : -2.4843   Median : -0.3892   Median :  0.2852  
##  Mean   : -0.9078   Mean   : -0.3847   Mean   :  0.2188  
##  3rd Qu.:  2.3834   3rd Qu.:  0.7492   3rd Qu.:  1.5514  
##  Max.   : 34.5769   Max.   : 12.6545   Max.   : 55.2278  
##                                                          
##       pc8                pc9                 pc10          ethnicity_agg 
##  Min.   :-20.0411   Min.   :-33.79720   Min.   :-54.1515   Min.   :   5  
##  1st Qu.: -1.8267   1st Qu.: -1.62195   1st Qu.: -1.4029   1st Qu.:1001  
##  Median : -0.4840   Median :  0.64032   Median :  0.1147   Median :1001  
##  Mean   : -0.4346   Mean   : -0.08772   Mean   :  0.2047   Mean   :1047  
##  3rd Qu.:  0.8573   3rd Qu.:  2.63460   3rd Qu.:  1.7167   3rd Qu.:1001  
##  Max.   : 19.1440   Max.   : 12.61250   Max.   : 35.9669   Max.   :4002  
##                                                                          
##                        meaning           height           dbp        
##  British                   :356476   Min.   :121.0   Min.   : 35.00  
##  Indian                    :  4789   1st Qu.:162.0   1st Qu.: 75.00  
##  African                   :  2835   Median :168.0   Median : 82.00  
##  Chinese                   :  1326   Mean   :168.7   Mean   : 82.28  
##  Any other Asian background:     0   3rd Qu.:175.5   3rd Qu.: 89.00  
##  Any other Black background:     0   Max.   :209.0   Max.   :148.00  
##  (Other)                   :     0                                   
##       sbp             bmi             wbc               rbc       
##  Min.   : 62.0   Min.   :12.12   Min.   :  0.000   Min.   :0.006  
##  1st Qu.:126.0   1st Qu.:24.13   1st Qu.:  5.640   1st Qu.:4.240  
##  Median :139.0   Median :26.71   Median :  6.650   Median :4.507  
##  Mean   :140.1   Mean   :27.38   Mean   :  6.882   Mean   :4.518  
##  3rd Qu.:152.0   3rd Qu.:29.83   3rd Qu.:  7.840   3rd Qu.:4.790  
##  Max.   :264.0   Max.   :74.68   Max.   :181.400   Max.   :7.911  
##                                                                   
##        hb              ht             mcv              mch       
##  Min.   : 0.12   Min.   : 0.05   Min.   : 53.17   Min.   : 0.00  
##  1st Qu.:13.37   1st Qu.:38.76   1st Qu.: 88.65   1st Qu.:30.52  
##  Median :14.19   Median :41.10   Median : 91.30   Median :31.50  
##  Mean   :14.20   Mean   :41.15   Mean   : 91.22   Mean   :31.49  
##  3rd Qu.:15.05   3rd Qu.:43.53   3rd Qu.: 93.90   3rd Qu.:32.50  
##  Max.   :22.27   Max.   :72.48   Max.   :143.00   Max.   :95.67  
##                                                                  
##       mchc          platelet        lymphocyte         monocyte      
##  Min.   :16.10   Min.   :   0.3   Min.   :  0.000   Min.   : 0.0000  
##  1st Qu.:33.90   1st Qu.: 213.0   1st Qu.:  1.500   1st Qu.: 0.3700  
##  Median :34.47   Median : 247.6   Median :  1.870   Median : 0.4500  
##  Mean   :34.52   Mean   : 252.6   Mean   :  1.955   Mean   : 0.4773  
##  3rd Qu.:35.10   3rd Qu.: 286.6   3rd Qu.:  2.280   3rd Qu.: 0.5700  
##  Max.   :94.78   Max.   :1583.0   Max.   :147.500   Max.   :34.2600  
##                                                                      
##    neutrophil       eosinophil        basophil      
##  Min.   : 0.000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.: 3.290   1st Qu.:0.1000   1st Qu.:0.00000  
##  Median : 4.030   Median :0.1400   Median :0.02000  
##  Mean   : 4.236   Mean   :0.1744   Mean   :0.03425  
##  3rd Qu.: 4.970   3rd Qu.:0.2100   3rd Qu.:0.04000  
##  Max.   :52.020   Max.   :9.6000   Max.   :2.60000  
## 
write.csv(dat_cleaned, '../output/query_first_attempt_with_qc.csv', quote = F, row.names = F)

Here, for categorical variable, we show the prevalence.

# cat_var = c('sex', 'meaning')
prev = dat_cleaned[, c('sex', 'meaning')] %>% group_by(meaning) %>% summarize(fraction_of_male = mean(sex))
prev %>% ggplot() + geom_bar(aes(x = meaning, y = fraction_of_male, fill = meaning), stat = 'identity') + ggtitle('Fraction of male')

Here, for each of the quantitative variable, we plot the distribution stratified by population.

quant_var = colnames(dat_cleaned %>% select(-eid, -sex, -ethnicity_agg, -meaning))
subsample = subsample_for_vis(dat_cleaned$meaning, nmax = 5000)
for(i in quant_var) {
  sub = dat_cleaned %>% filter(subsample) 
  sub = sub[, c('meaning', i)]
  colnames(sub)[2] = 'x'
  range = quantile(sub$x, probs = c(0.01, 0.99))
  p = sub %>% ggplot() + geom_density(aes(x = x, fill = meaning), alpha = .5) + coord_cartesian(xlim = range) + ggtitle(i)
  print(p)
}