rm(list = ls())
library(dplyr)
library(pander)
library(data.table)
options(datatable.fread.datatable = F)
source('../code/rlib_doc.R')
bold <- function(x) {paste('{\\textbf{',x,'}}', sep ='')}
o = get_meta_for_supp()
df_color_category = o$df_color_category
# color_mixer = o$color_mixer

my_rename_pop = function(pop) {
  res = rep('EUR', length(pop))
  res[pop == 'Indian'] = 'S.ASN'
  res[pop == 'Chinese'] = 'E.ASN'
  res[pop == 'African'] = 'AFR'
  res[pop == 'British \n (validation)'] = 'EUR ref.'
  res[pop == 'British \n (test)'] = 'EUR test'
  factor(res, levels = c('EUR ref.', 'EUR test', 'EUR', 'S.ASN', 'E.ASN', 'AFR'))
}

1 S Table: phenotype information

Phenotype information.

pheno_tab = read.csv('../external_data/martin_et_al_2019ng_table_s6.csv')
query_ = yaml::read_yaml('../output/query_phenotypes.yaml')$data
query = list()
for(i in names(query_)) {
  ii = strsplit(i, '_x_')[[1]][1]
  if(ii %in% names(query)) {
    next
  } else {
    query[[ii]] = data.frame(id = stringr::str_remove(strsplit(query_[[i]], '_')[[1]][1], 'c'), phenotype_tag = ii)
  }
}
query = do.call(rbind, query)
rownames(query) = NULL
dict = read.csv("~/Downloads/Data_Dictionary_Showcase.csv", stringsAsFactors = F)
dict$FieldID = as.character(dict$FieldID)
info = left_join(query, dict %>% select(Field, Instances, Array, FieldID), by = c('id' = 'FieldID'))
## Warning: Column `id`/`FieldID` joining factor and character vector, coercing into character vector
info = left_join(info %>% mutate(trait = tolower(phenotype_tag)), df_color_category, by = 'trait')
## Warning: Column `trait` joining character vector and factor, coercing into character vector
info = info %>% filter(phenotype_tag %in% pheno_tab$Trait)
info = info %>% select(Field, id, phenotype_tag, group) 
colnames(info) = c('UKB Field Description', 'UKB Field ID', 'Tag', 'Phenotype Category')

### save as tex table
print(
  xtable::xtable(info, display=c("s", "s", "s", "s", "s"), caption = '\\textbf{Meta information of the phenotypes retrieved from UK Biobank which were used in the analysis}. The \`\`Tag\'\' column shows the short name of the phenotyes used in this paper. And phenotypes are assigned into five categories which are shown in \`\`Phenotype Category\'\' column', label = 'tab:trait_table', digits = 5), 
  math.style.exponents = TRUE, 
  include.rownames = FALSE, 
  file ='../analysis_output/phenotype_info.tex', size="\\scriptsize", 
  sanitize.colnames.function=bold,
  booktabs = TRUE
)

# update the copy in paper-ptrs repo
cmd = paste('cp', '../analysis_output/phenotype_info.tex', '../../paper-ptrs/tables/phenotype_info.tex')
system(cmd)

2 S Table: number of individuals by ancestry/population

pheno_table = fread('../output/query_phenotypes_cleaned_up.csv', sep = ',')
dd = pheno_table %>% group_by(meaning) %>% summarize(n = n()) %>% ungroup() %>% mutate(meaning = my_rename_pop(meaning))
colnames(dd) = c('Ancestry', 'Number of individuals')

### save as tex table
print(
  xtable::xtable(dd, display=c("s", "s", "g"), caption = '\\textbf{Number of individuals included in the analysis stratified by ancestry}.', label = 'tab:indiv_table', digits = 6), 
  math.style.exponents = TRUE, 
  include.rownames = FALSE, 
  file ='../analysis_output/population_info.tex', size="\\scriptsize", 
  sanitize.colnames.function=bold,
  booktabs = TRUE
)

# update the copy in paper-ptrs repo
cmd = paste('cp', '../analysis_output/population_info.tex', '../../paper-ptrs/tables/population_info.tex')
system(cmd)

3 S Table: prediction models

Information of prediction models.

# model method 
# data source
# population 
# tissue
# number of genes
# sample size
# model abbreviation

### GTEx V8
tissues = read.table('~/Desktop/tmp/tissue_list.txt', header = F)$V1
gtex_sheet = read.csv('~/Documents/repo/bitbucket/rotation-at-imlab/data/gtex_sample_counts_by_tissue.csv')
pops = c('British-test-1')
df = list()
for(t in tissues) {
  for(p in pops) {
    t = stringr::str_remove(t, 'ctimp_')
    filename = paste0('/Users/yanyul/Desktop/tmp/gcta_regu/reml_from_hail_martin_et_al_traits_x_ctimp_', t, '_x_', p, '.tsv')
    df[[length(df) + 1]] = read.table(filename, header = T, sep = '\t') %>% mutate(population = p, tissue = t)
  }
}
df = do.call(rbind, df)
df = df %>% select(num_predictors, tissue) %>% filter(!duplicated(tissue)) # %>% pander
df = left_join(df, gtex_sheet %>% select(tissue, v8_eur), by = 'tissue')
## Warning: Column `tissue` joining character vector and factor, coercing into character vector
df_gtex = data.frame(method = 'CTIMP', data_source = 'GTEx V8', population = 'European', tissue = df$tissue, number_of_genes = df$num_predictors, sample_size = df$v8_eur)

### MESA 
f2 = 'African American (AFA, n = 233), Hispanic (HIS, n = 352), and European (CAU, n = 578)'
f = do.call(rbind, sapply(strsplit(f2, '),')[[1]], function(x) {
  x = strsplit(x, ' \\(')[[1]]
  meta = strsplit(x[2], ', ')[[1]]
  abbr = meta[1]
  sample_size = stringr::str_remove(meta[2], 'n = ')
  sample_size = as.numeric(stringr::str_remove(sample_size, '\\)'))
  data.frame(population = stringr::str_remove(x[1], 'and '), abbreviation = abbr, sample_size = sample_size)
}, simplify = F))
rownames(f) = NULL
f = f %>% mutate(source = 'MESA', tissue = 'Monocyte')
f = rbind(f, data.frame(population = paste(f$population[1], f$population[2]), abbreviation = 'AFHI', sample_size = f$sample_size[1] + f$sample_size[2], source = f$source[1], tissue = f$tissue[1]))
# f = rbind(f, data.frame(population = 'European', abbreviation = 'gtex_v8', sample_size = 573, source = 'GTEx_v8', tissue = 'Whole_Blood'))
f %>% pander('Sample size of gene expression prediction models in the analysis')
Sample size of gene expression prediction models in the analysis
population abbreviation sample_size source tissue
African American AFA 233 MESA Monocyte
Hispanic HIS 352 MESA Monocyte
European CAU 578 MESA Monocyte
African American Hispanic AFHI 585 MESA Monocyte
pops = c('British-test-1')
models = c('CAU', 'AFHI')
tmp_list = list()
for(p in pops) {
  for(m in models) {
    filename = paste0('~/Desktop/tmp/gcta_regu/reml_from_hail_martin_et_al_traits_x_', m, '_x_', p, '.tsv')
    tmp_list[[length(tmp_list) + 1]] = read.table(filename, header  = T, sep = '\t') %>% mutate(population = p, model = m)
  }
}
mesa = do.call(rbind, tmp_list)
mesa = mesa %>% select(num_predictors, model) %>% filter(!duplicated(model)) # %>% pander
mesa = left_join(mesa, data.frame(model = c('EUR', 'AFHI'), pop = c('European', 'African American or Hispanic')), by = 'model')
## Warning: Column `model` joining character vector and factor, coercing into character vector
mesa = left_join(mesa, f %>% select(abbreviation, sample_size), by = c('model' = 'abbreviation'))
## Warning: Column `model`/`abbreviation` joining character vector and factor, coercing into character vector
df_mesa = data.frame(method = 'Elastic Net', data_source = 'MESA', population = mesa$pop, tissue = 'Monocyte', number_of_genes = mesa$num_predictors, sample_size = mesa$sample_size)

### the combined table
dd = rbind(df_gtex, df_mesa)
dd$tag = c(rep('', nrow(dd) - 3), c('GTEx EUR', 'MESA EUR', 'MESA AFHI'))
colnames(dd) = c('Method', 'Data source', 'Population', 'Tissue', 'Number of genes', 'Sample size', 'Tag')

### save as tex table
to_highlight = sort(nrow(dd) - 1:3)
print(
  xtable::xtable(dd, display=c("s", "s", "s","s","s", "g", "g", "s"), caption = '\\textbf{Meta information of the prediction models used in the analysis}. The highlighted prediction models were used to build PTRS. The \`\`Tag\'\' column shows the short name of the models used in this paper.', label = 'tab:prediction_model', digits = 5), 
  math.style.exponents = TRUE, 
  include.rownames = FALSE, 
  file ='../analysis_output/prediction_models.tex', size="\\tiny", 
  sanitize.colnames.function=bold,
  booktabs = TRUE,
  add.to.row=list(
        pos=as.list(to_highlight),
        command=rep("\\rowcolor{yellow}",
                    length(seq(from=1,to=length(to_highlight),by=1))))
)

# update the copy in paper-ptrs repo
cmd = paste('cp', '../analysis_output/prediction_models.tex', '../../paper-ptrs/tables/prediction_models.tex')
system(cmd)

4 S Table: chip h2, PVE

PRS related table.

h2 = readRDS('../analysis_output/hsq_neale_lab.rds')
h2$h2_observed[is.na(h2$h2_observed)] = 0
df_h2 = h2 %>% select(trait, h2_observed, h2_observed_se) %>% rename(chip_h2 = h2_observed, chip_h2_se = h2_observed_se)

# h2 table
write.csv(df_h2, '../analysis_output/h2_table.csv', quote = F, row.names = F)


df_prs = read.table('../../ptrs-analysis/data/partial_r2-prs.subset1_British.tsv', header = T, sep = '\t', stringsAsFactors = F)
df_prs$sample[df_prs$sample == 'British-test-1'] = 'British_test'
df_prs$sample[df_prs$sample == 'British-validation-1'] = 'British_validation'
df_prs = df_prs %>% rename(prs_cutoff = ptrs_cutoff)
df_prs = df_prs %>% arrange(desc(-prs_cutoff))

best_prs = df_prs %>% group_by(trait, sample) %>% summarize(r2_max = max(partial_r2, na.rm = T)) %>% ungroup() %>% mutate(population = my_rename_pop(update_popname(sample)))
df_best_prs = best_prs %>% select(trait, population, r2_max) %>% rename(partial_R2 = r2_max) 
df_best_prs = df_best_prs[ order(df_best_prs$population, df_best_prs$trait), ]

# prs table
write.csv(df_best_prs, '../analysis_output/prs_table.csv', quote = F, row.names = F)

PTRS related table.

pve = readRDS('../analysis_output/regulability_ctimp.rds')
pve$population[pve$population == 'British-test-1'] = 'British_test'
pve$population[pve$population == 'British-validation-1'] = 'British_validation'
pve$h_sq[is.na(pve$h_sq)] = 0

pve2 = readRDS('../analysis_output/regulability_mesa_common_genes.rds')
pve2$population[pve2$population == 'British-test-1'] = 'British_test'
pve2$population[pve2$population == 'British-validation-1'] = 'British_validation'
pve2$h_sq[is.na(pve2$h_sq)] = 0

pops = c('African', 'British-test-1', 'Chinese', 'Indian')
pve3 = list()
for(pop in pops) {
  tmp = read.table(paste0('../../ptrs-analysis/data/reml_from_hail-multi-tissue-tissue_svd_x_martin_et_al_traits_x_ctimp_x_', pop, '.tsv'), header = T, stringsAsFactors = F, sep = '\t')
  pve3[[length(pve3) + 1]] = tmp %>% mutate(population = pop)
}
pve3 = do.call(rbind, pve3)
pve3[is.na(pve3)] = 0
pve3$population[pve3$population == 'British-test-1'] = 'British_test'


pve2$train_population = rep('EUR', nrow(pve2))
pve2$train_population[pve2$model == 'AFHI'] = 'AFHI'
pve2$training_data = 'MESA'
pve2$tissue = 'Monocyte'

df_pve = rbind(
  pve %>% select(trait, num_predictors, h_sq, h_sq_se, population) %>% mutate(train_population = 'EUR', training_data = 'GTEx', tissue = 'Whole Blood'),
  pve2 %>% select(trait, num_predictors, h_sq, h_sq_se, population, train_population, training_data, tissue),
  pve3 %>% select(trait, num_predictors, h_sq, h_sq_se, population) %>% mutate(train_population = 'EUR', training_data = 'GTEx', tissue = '10 Tissues')
)

df_pve$population = my_rename_pop(df_pve$population)
df_pve = df_pve %>% rename(num_genes = num_predictors) %>% rename(pve = h_sq, pve_se = h_sq_se)

# pve table
write.csv(df_pve, '../analysis_output/pve_table.csv', quote = F, row.names = F)


df_en = read.csv('../../ptrs-analysis/data/elastic_net_ptrs_gtex_british.performance.csv', stringsAsFactors = F)
df_en$sample[ df_en$sample == 'British_valid' ] = 'British_validation'
df_en = df_en %>% filter(!is.na(partial_r2)) %>% group_by(trait, alpha, sample) %>% mutate(order = 1 : n()) %>% ungroup() %>% filter(order <= 11) %>% select(-order)
best_en = df_en %>% select(sample, trait, partial_r2) %>%
  group_by(trait, sample) %>% summarize(r2_max = max(partial_r2, na.rm = T)) %>% ungroup() %>% mutate(population = my_rename_pop(update_popname(sample)))
df_best_ptrs = best_en %>% select(trait, population, r2_max) %>% rename(partial_R2 = r2_max) %>% mutate(train_population = 'EUR', training_data = 'GTEx', tissue = 'Whole Blood')


df_en2 = read.csv('../../ptrs-analysis/data/elastic_net_ptrs_mesa_british.performance.csv', stringsAsFactors = F)
df_en2$sample[ df_en2$sample == 'British_valid' ] = 'British_validation'
df_en2 = df_en2 %>% filter(!is.na(partial_r2)) %>% group_by(trait, alpha, sample, pred_expr_source) %>% mutate(order = 1 : n()) %>% ungroup() %>% filter(order <= 11) %>% select(-order)
best_en2 = df_en2 %>% select(sample, trait, partial_r2, pred_expr_source) %>%
  group_by(trait, sample, pred_expr_source) %>% summarize(r2_max = max(partial_r2, na.rm = T)) %>% ungroup() %>% mutate(population = my_rename_pop(update_popname(sample)))
df_cau = df_en2 %>% filter(pred_expr_source == 'train')
df_afhi = df_en2 %>% filter(pred_expr_source == 'against')

best_cau = df_cau %>% group_by(trait, sample) %>% summarize(r2_max = max(partial_r2, na.rm = T)) %>% ungroup() %>% mutate(population = my_rename_pop(update_popname(sample)))
best_afhi = df_afhi %>% group_by(trait, sample) %>% summarize(r2_max = max(partial_r2, na.rm = T)) %>% ungroup() %>% mutate(population = my_rename_pop(update_popname(sample)))

df_best_ptrs2 = rbind(
  best_cau %>% select(trait, population, r2_max) %>% rename(partial_R2 = r2_max) %>% mutate(train_population = 'EUR', training_data = 'MESA', tissue = 'Monocyte'),
  best_afhi %>% select(trait, population, r2_max) %>% rename(partial_R2 = r2_max) %>% mutate(train_population = 'AFHI', training_data = 'MESA', tissue = 'Monocyte')
)

df_best_ptrs = rbind(df_best_ptrs, df_best_ptrs2) 
df_best_ptrs = df_best_ptrs[ order(df_best_ptrs$training_data, df_best_ptrs$train_population, df_best_ptrs$population, df_best_ptrs$trait), ]

# ptrs table
write.csv(df_best_ptrs, '../analysis_output/ptrs_table.csv', quote = F, row.names = F)