rm(list = ls())
library(dplyr)
library(pander)
library(ggplot2)
panderOptions('table.split.table', Inf)
source('../code/rlib_doc.R')
We analyze the quantitative traits analyzed by Martin et al.
s6 = read.csv('../external_data/martin_et_al_2019ng_table_s6.csv')
s6 %>% pander
Trait | Ntotal..BBJ. | NGWAS..BBJ…UKBB. | Ntarget..BBJ…UKBB. | UKBB.code | X..BBJ.clumps | X..UKBB.clumps |
---|---|---|---|---|---|---|
Basophil | 87665 | 82665 | 5000 | 30160 | 8939 | 8690 |
BMI | 155426 | 150426 | 5000 | 21001 | 19114 | 21339 |
DBP | 137991 | 132991 | 5000 | 4079 | 9865 | 14213 |
Eosinophil | 88675 | 83675 | 5000 | 30150 | 9266 | 13061 |
Hb | 144653 | 139653 | 5000 | 30020 | 10483 | 16184 |
Height | 156569 | 151569 | 5000 | 50 | 37216 | 31854 |
Ht | 144947 | 139947 | 5000 | 30030 | 10554 | 15408 |
Lymphocyte | 91157 | 86157 | 5000 | 30120 | 9400 | 13648 |
MCH | 121249 | 116249 | 5000 | 30050 | 12598 | 15222 |
MCHC | 128232 | 123232 | 5000 | 30060 | 10272 | 10074 |
MCV | 122912 | 117912 | 5000 | 30040 | 13169 | 16354 |
Monocyte | 90593 | 85593 | 5000 | 30130 | 10886 | 13452 |
Neutrophil | 79287 | 74287 | 5000 | 30140 | 9150 | 12211 |
Platelet | 140610 | 135610 | 5000 | 30080 | 14843 | 19259 |
RBC | 145426 | 140426 | 5000 | 30010 | 12467 | 18069 |
SBP | 137981 | 132981 | 5000 | 4080 | 11231 | 14562 |
WBC | 146158 | 141158 | 5000 | 30000 | 12664 | 17581 |
I queried these traits by UKB code where I queried all instances and arrays see details here. So, I need to aggregate the phenotypes (from multiple instances and arrays in some way).
Here I load the data passing population QC see details here.
dat = readRDS('../output/query_first_attempt_with_population_qc.rds')
dat = dat$dat_pass_pop_qc %>% select(-bmi, -height) # remove the two test traits which are duplicated from the Martin et al phenotypes
First of all, let’s see how many instances and how many arraies each trait has.
trait_meta = list()
cols_relavant = colnames(dat)
cols_relavant = cols_relavant[!is.na(stringr::str_match(cols_relavant, '_x_'))]
parsed_col = as.data.frame(do.call(rbind, strsplit(cols_relavant, '_x_')))
colnames(parsed_col) = c('trait', 'instance', 'array')
parsed_col = parsed_col %>% mutate(instance = stringr::str_remove(instance, 'instance_'), array = stringr::str_remove(array, 'array_'))
parsed_col = parsed_col %>% filter(trait != 'ethnicity') # remove ethnicity since this part is taken care of in population_assignment.Rmd
parsed_col %>% group_by(trait) %>% summarize(ninstance = length(unique(instance))) %>% pander(caption = 'number of instances')
trait | ninstance |
---|---|
basophil | 3 |
bmi | 3 |
dbp | 3 |
eosinophil | 3 |
hb | 3 |
height | 3 |
ht | 3 |
lymphocyte | 3 |
mch | 3 |
mchc | 3 |
mcv | 3 |
monocyte | 3 |
neutrophil | 3 |
platelet | 3 |
rbc | 3 |
sbp | 3 |
wbc | 3 |
trait_instance_with_multi_array = parsed_col %>% group_by(trait, instance) %>% summarize(number_of_array = length(unique(array))) %>% ungroup() %>% filter(number_of_array > 1)
trait_instance_with_multi_array %>% pander(caption = 'trait/instance pair with more than one array')
trait | instance | number_of_array |
---|---|---|
dbp | 0 | 2 |
dbp | 1 | 2 |
dbp | 2 | 2 |
sbp | 0 | 2 |
sbp | 1 | 2 |
sbp | 2 | 2 |
OK, let’s first see how the two array varies.
subsample = subsample_for_vis(group = dat$meaning, nmax = 1500)
for(i in 1 : nrow(trait_instance_with_multi_array)) {
this = trait_instance_with_multi_array[i, ]
prefix = paste0(this$trait, '_x_', 'instance_', this$instance, '_x_', 'array_')
cols = paste0(prefix, 0 : (this$number_of_array - 1))
sub = dat[subsample, c('meaning', cols)]
sub = sub[rowSums(!is.na(sub)) > 0, ]
print(myggpairs(sub %>% select(-meaning), sub$meaning) + geom_abline(intercept = 0, slope = 1))
}
## Warning: Removed 343 rows containing missing values (geom_point).
## Warning: Removed 5792 rows containing missing values (geom_point).
## Warning: Removed 5690 rows containing missing values (geom_point).
## Warning: Removed 343 rows containing missing values (geom_point).
## Warning: Removed 5792 rows containing missing values (geom_point).
## Warning: Removed 5690 rows containing missing values (geom_point).
It seems that taking the values from multiple arraies are quite consistent so that we aggregate array by taking the average.
dat_old = dat
for(i in 1 : nrow(trait_instance_with_multi_array)) {
this = trait_instance_with_multi_array[i, ]
prefix = paste0(this$trait, '_x_', 'instance_', this$instance, '_x_', 'array_')
cols = paste0(prefix, 0 : (this$number_of_array - 1))
newcol = data.frame(x = aggregate_instances(dat[, cols], aggregate_instances))
colnames(newcol) = paste0(prefix, 'agg')
dat = cbind(dat, newcol)
dat[, cols] = NULL
}
Then, we proceed to aggregate across multiple instances. And here we use the first non-NA value as the value of the phenotype for that individual.
for(i in unique(parsed_col$trait)) {
this = parsed_col %>% filter(trait == i)
suffix = '_x_array_0'
if(i %in% trait_instance_with_multi_array$trait) {
suffix = '_x_array_agg'
}
prefix = paste0(i, '_x_', 'instance_')
cols = paste0(prefix, unique(this$instance), suffix)
newcol = data.frame(x = aggregate_instances(dat[, cols], first_non_na))
colnames(newcol) = i
dat = cbind(dat, newcol)
dat[, cols] = NULL
}
martin_traits = as.character(unique(parsed_col$trait))
summary(dat[, martin_traits])
## height dbp sbp bmi
## Min. : 75.0 Min. : 32.00 Min. : 62.0 Min. :12.12
## 1st Qu.:162.0 1st Qu.: 75.00 1st Qu.:126.0 1st Qu.:24.14
## Median :168.0 Median : 82.00 Median :139.0 Median :26.73
## Mean :168.7 Mean : 82.29 Mean :140.1 Mean :27.41
## 3rd Qu.:175.0 3rd Qu.: 89.00 3rd Qu.:152.0 3rd Qu.:29.86
## Max. :209.0 Max. :148.00 Max. :264.0 Max. :74.68
## NA's :992 NA's :21354 NA's :21357 NA's :1388
## wbc rbc hb ht
## Min. : 0.000 Min. :0.006 Min. : 0.12 Min. : 0.05
## 1st Qu.: 5.640 1st Qu.:4.238 1st Qu.:13.37 1st Qu.:38.74
## Median : 6.650 Median :4.504 Median :14.19 Median :41.10
## Mean : 6.883 Mean :4.516 Mean :14.20 Mean :41.13
## 3rd Qu.: 7.840 3rd Qu.:4.789 3rd Qu.:15.06 3rd Qu.:43.52
## Max. :189.500 Max. :7.911 Max. :22.27 Max. :72.48
## NA's :11031 NA's :11027 NA's :11028 NA's :11027
## mcv mch mchc platelet
## Min. : 53.17 Min. : 0.00 Min. :16.10 Min. : 0.3
## 1st Qu.: 88.66 1st Qu.:30.53 1st Qu.:33.90 1st Qu.: 213.3
## Median : 91.30 Median :31.52 Median :34.49 Median : 248.0
## Mean : 91.23 Mean :31.51 Mean :34.53 Mean : 253.1
## 3rd Qu.: 93.92 3rd Qu.:32.52 3rd Qu.:35.10 3rd Qu.: 287.0
## Max. :143.00 Max. :95.67 Max. :97.30 Max. :1821.0
## NA's :11028 NA's :11031 NA's :11033 NA's :11031
## lymphocyte monocyte neutrophil eosinophil
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. :0.000
## 1st Qu.: 1.500 1st Qu.: 0.370 1st Qu.: 3.290 1st Qu.:0.100
## Median : 1.870 Median : 0.450 Median : 4.030 Median :0.140
## Mean : 1.955 Mean : 0.477 Mean : 4.236 Mean :0.174
## 3rd Qu.: 2.280 3rd Qu.: 0.570 3rd Qu.: 4.980 3rd Qu.:0.210
## Max. :177.000 Max. :34.260 Max. :52.020 Max. :9.600
## NA's :11704 NA's :11704 NA's :11704 NA's :11704
## basophil
## Min. :0.000
## 1st Qu.:0.000
## Median :0.020
## Mean :0.034
## 3rd Qu.:0.040
## Max. :2.600
## NA's :11704
To make the analysis easier to work with, let’s try to limit to individual with all phenotypes being non-missing.
all_none_missing = rowSums(is.na(dat[, martin_traits])) == 0
dat %>% filter(all_none_missing) %>% group_by(meaning) %>% summarize(nindiv = n()) %>% pander(caption = 'Number of individuals after limiting to ones with all phenotypes non-missing')
meaning | nindiv |
---|---|
African | 2835 |
British | 356476 |
Chinese | 1326 |
Indian | 4789 |
dat_cleaned = dat %>% filter(all_none_missing) %>% select(-ethnicity_x_instance_0_x_array_0, -ethnicity_x_instance_1_x_array_0, -ethnicity_x_instance_2_x_array_0)
The breif summary of the cleaned-up data is as follow and we save it as CSV file before proceeding.
dat_cleaned %>% summary
## eid age_recruitment sex pc1
## Min. :1000020 Min. :39.00 Min. :0.0000 Min. :-19.27
## 1st Qu.:2256438 1st Qu.:51.00 1st Qu.:0.0000 1st Qu.:-13.37
## Median :3512290 Median :58.00 Median :0.0000 Median :-12.27
## Mean :3512858 Mean :56.82 Mean :0.4649 Mean : -7.26
## 3rd Qu.:4768155 3rd Qu.:63.00 3rd Qu.:1.0000 3rd Qu.:-11.08
## Max. :6026402 Max. :73.00 Max. :1.0000 Max. :418.38
##
## pc2 pc3 pc4
## Min. :-282.317 Min. :-143.2580 Min. :-70.6994
## 1st Qu.: 2.656 1st Qu.: -2.6521 1st Qu.: -0.8197
## Median : 3.741 Median : -1.5475 Median : 1.0735
## Mean : 1.702 Mean : -0.9098 Mean : 1.0362
## 3rd Qu.: 4.788 3rd Qu.: -0.4021 3rd Qu.: 3.1504
## Max. : 86.112 Max. : 98.8226 Max. : 37.1031
##
## pc5 pc6 pc7
## Min. :-17.7238 Min. :-20.8499 Min. :-21.1385
## 1st Qu.: -5.6678 1st Qu.: -1.5241 1st Qu.: -1.0023
## Median : -2.4843 Median : -0.3892 Median : 0.2852
## Mean : -0.9078 Mean : -0.3847 Mean : 0.2188
## 3rd Qu.: 2.3834 3rd Qu.: 0.7492 3rd Qu.: 1.5514
## Max. : 34.5769 Max. : 12.6545 Max. : 55.2278
##
## pc8 pc9 pc10 ethnicity_agg
## Min. :-20.0411 Min. :-33.79720 Min. :-54.1515 Min. : 5
## 1st Qu.: -1.8267 1st Qu.: -1.62195 1st Qu.: -1.4029 1st Qu.:1001
## Median : -0.4840 Median : 0.64032 Median : 0.1147 Median :1001
## Mean : -0.4346 Mean : -0.08772 Mean : 0.2047 Mean :1047
## 3rd Qu.: 0.8573 3rd Qu.: 2.63460 3rd Qu.: 1.7167 3rd Qu.:1001
## Max. : 19.1440 Max. : 12.61250 Max. : 35.9669 Max. :4002
##
## meaning height dbp
## British :356476 Min. :121.0 Min. : 35.00
## Indian : 4789 1st Qu.:162.0 1st Qu.: 75.00
## African : 2835 Median :168.0 Median : 82.00
## Chinese : 1326 Mean :168.7 Mean : 82.28
## Any other Asian background: 0 3rd Qu.:175.5 3rd Qu.: 89.00
## Any other Black background: 0 Max. :209.0 Max. :148.00
## (Other) : 0
## sbp bmi wbc rbc
## Min. : 62.0 Min. :12.12 Min. : 0.000 Min. :0.006
## 1st Qu.:126.0 1st Qu.:24.13 1st Qu.: 5.640 1st Qu.:4.240
## Median :139.0 Median :26.71 Median : 6.650 Median :4.507
## Mean :140.1 Mean :27.38 Mean : 6.882 Mean :4.518
## 3rd Qu.:152.0 3rd Qu.:29.83 3rd Qu.: 7.840 3rd Qu.:4.790
## Max. :264.0 Max. :74.68 Max. :181.400 Max. :7.911
##
## hb ht mcv mch
## Min. : 0.12 Min. : 0.05 Min. : 53.17 Min. : 0.00
## 1st Qu.:13.37 1st Qu.:38.76 1st Qu.: 88.65 1st Qu.:30.52
## Median :14.19 Median :41.10 Median : 91.30 Median :31.50
## Mean :14.20 Mean :41.15 Mean : 91.22 Mean :31.49
## 3rd Qu.:15.05 3rd Qu.:43.53 3rd Qu.: 93.90 3rd Qu.:32.50
## Max. :22.27 Max. :72.48 Max. :143.00 Max. :95.67
##
## mchc platelet lymphocyte monocyte
## Min. :16.10 Min. : 0.3 Min. : 0.000 Min. : 0.0000
## 1st Qu.:33.90 1st Qu.: 213.0 1st Qu.: 1.500 1st Qu.: 0.3700
## Median :34.47 Median : 247.6 Median : 1.870 Median : 0.4500
## Mean :34.52 Mean : 252.6 Mean : 1.955 Mean : 0.4773
## 3rd Qu.:35.10 3rd Qu.: 286.6 3rd Qu.: 2.280 3rd Qu.: 0.5700
## Max. :94.78 Max. :1583.0 Max. :147.500 Max. :34.2600
##
## neutrophil eosinophil basophil
## Min. : 0.000 Min. :0.0000 Min. :0.00000
## 1st Qu.: 3.290 1st Qu.:0.1000 1st Qu.:0.00000
## Median : 4.030 Median :0.1400 Median :0.02000
## Mean : 4.236 Mean :0.1744 Mean :0.03425
## 3rd Qu.: 4.970 3rd Qu.:0.2100 3rd Qu.:0.04000
## Max. :52.020 Max. :9.6000 Max. :2.60000
##
write.csv(dat_cleaned, '../output/query_first_attempt_with_qc.csv', quote = F, row.names = F)
Here, for categorical variable, we show the prevalence.
# cat_var = c('sex', 'meaning')
prev = dat_cleaned[, c('sex', 'meaning')] %>% group_by(meaning) %>% summarize(fraction_of_male = mean(sex))
prev %>% ggplot() + geom_bar(aes(x = meaning, y = fraction_of_male, fill = meaning), stat = 'identity') + ggtitle('Fraction of male')
Here, for each of the quantitative variable, we plot the distribution stratified by population.
quant_var = colnames(dat_cleaned %>% select(-eid, -sex, -ethnicity_agg, -meaning))
subsample = subsample_for_vis(dat_cleaned$meaning, nmax = 5000)
for(i in quant_var) {
sub = dat_cleaned %>% filter(subsample)
sub = sub[, c('meaning', i)]
colnames(sub)[2] = 'x'
range = quantile(sub$x, probs = c(0.01, 0.99))
p = sub %>% ggplot() + geom_density(aes(x = x, fill = meaning), alpha = .5) + coord_cartesian(xlim = range) + ggtitle(i)
print(p)
}