rm(list = ls())
library(data.table)
options(datatable.fread.datatable = F)
options(stringsAsFactors = F)
library(SilverStandardPerformance)
library(dplyr)
library(ggplot2)
library(patchwork)
theme_set(theme_bw(base_size=15))
# row-wise Pearson correlation: returns the correlation matrix between the rows of x
fast_corr = function(x) {
x = as.matrix(x)
mean_ = apply(x, 1, mean)
sd_ = apply(x, 1, mysd)
x_norm = (x - mean_) / sd_
cor_ = x_norm %*% t(x_norm) / ncol(x_norm)
cor_
}
# rank-based inverse normal transform (applied gene by gene)
inverse_normalize = function(x, offset = 1) {
g = rank(x, ties.method = 'average') / (length(x) + offset)
o = qnorm(g)
return(o)
}
# population (divide-by-n) standard deviation, used by fast_corr
mysd = function(x) {
sqrt(mean((x - mean(x)) ^ 2))
}
# map population labels to colors via the pop -> color lookup in color_panel
annot_color = function(pop, color_panel) {
e = data.frame(pop = pop)
f = left_join(e, color_panel, by = 'pop')
f$color
}
# heatmap of a correlation matrix; optionally add population color strips along the axes
myimage = function(cor_mat, indiv_color_panel = NULL) {
# call reshape2::melt explicitly: passing a matrix to the melt generic re-exported by data.table is deprecated
cor_mat_melt = reshape2::melt(cor_mat)
p = cor_mat_melt %>% ggplot() +
geom_raster(aes(x = Var1, y = Var2, fill = value)) + scale_fill_gradient2(low = 'blue', mid = 'white', high = 'red')
p = p + theme(
plot.title = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.line = element_blank()
)
if(!is.null(indiv_color_panel)) {
# colored strips just outside the heatmap axes, indicating each individual's population
p = p +
geom_point(aes(x = Var1, y = -2, color = Var1), show.legend = F, shape = 15, size = 1) +
geom_point(aes(x = -1, y = Var2, color = Var2), show.legend = F, shape = 15, size = 1) +
scale_color_manual(values = indiv_color_panel)
}
p
}
# randomly split indiv_list into a test set of size test_size and a reference set (the rest)
get_partition = function(indiv_list, test_size) {
test_idx = sample(1 : length(indiv_list), size = test_size, replace = FALSE)
test_ind = (1 : length(indiv_list)) %in% test_idx
list(reference = indiv_list[!test_ind], test = indiv_list[test_ind])
}
# Frobenius norm (elementwise 2-norm) of the difference between two matrices
mat_dist = function(m1, m2) {
sqrt(sum((m1 - m2) ^ 2))
}
# correlation between the upper-triangular entries of two matrices
mat_cor = function(m1, m2) {
v1 = m1[upper.tri(m1)]
v2 = m2[upper.tri(m2)]
cor(v1, v2)
}
candidate_colors = c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
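As a quick sanity check on the helpers (a minimal sketch with simulated data, not part of the original analysis; x_check is just a made-up matrix), fast_corr should reproduce base R's cor() applied to the transposed matrix, since the population-SD factors cancel in the correlation.
# sanity check on simulated data: row-wise fast_corr should match cor() on the transpose
set.seed(0)
x_check = matrix(rnorm(200), nrow = 10)
stopifnot(isTRUE(all.equal(fast_corr(x_check), cor(t(x_check)), check.attributes = FALSE)))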
The data come from the Box link. Here I use RPKM, which is normalized by library size and by gene length (the gene-length normalization does not matter since we inverse-normalize gene by gene). First, filter out individuals with analysis_freeze = FALSE (and with an empty race field). Second, filter out individuals with X3p_bias missing (I am not sure what that field means). Third, limit to PBMC samples.
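A small illustration of the parenthetical claim above (a sketch on simulated values, not MESA data; rpkm_demo is a made-up vector): inverse_normalize only uses ranks, so any positive per-gene rescaling, such as dividing by gene length, leaves the result unchanged.
# the rank-based transform is invariant to positive rescaling (e.g. gene-length normalization)
set.seed(1)
rpkm_demo = rexp(25)
stopifnot(isTRUE(all.equal(inverse_normalize(rpkm_demo), inverse_normalize(rpkm_demo * 10))))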
data("gene_annotation_gencode_v26_hg38")
head(gene_annotation_gencode_v26_hg38$gene_annotation)
## gene_id gene_name gene_type chromosome start end strand
## 1 ENSG00000279061 RP3-332O11.2 TEC chr1 172752586 172752924 +
## 2 ENSG00000279667 RP11-656D10.7 TEC chr1 40473055 40474059 +
## 3 ENSG00000279430 RP11-190A12.9 TEC chr1 159910094 159910554 -
## 4 ENSG00000279839 RP11-193J6.1 TEC chr1 3205988 3208664 +
## 5 ENSG00000279838 RP4-635A23.6 TEC chr1 185292384 185294372 -
## 6 ENSG00000279214 RP5-1024N4.5 TEC chr1 48262230 48263179 -
indiv_meta = read.csv('~/Desktop/tmp/TOPMed_MESA/TOPMed_MESA_RNAseq_sample_attributes_freeze.csv')
indiv_meta = indiv_meta %>% filter(analysis_freeze, race != '') %>% filter(!is.na(X3p_bias)) %>% filter(sample_type == 'PBMC')
indiv_meta = indiv_meta[order(indiv_meta$race), ]
df = fread('zcat < ~/Desktop/tmp/TOPMed_MESA/TOPMed_MESA_RNAseq_Pilot_RNASeQCv2.0.0.gene_rpkm.gct.gz', header = T)
df_indiv = colnames(df)[c(-1, -2)]
df_cleaned = df[, c(T, T, df_indiv %in% indiv_meta$tor_id)]
rownames(df_cleaned) = df$Name
After discussing with Haky, I need to check whether any of the remaining samples come from the same individual (some individuals may have two samples, measured at different time points). To do so, I load the wide MESA meta table, where each row is one individual.
mesa_wide = read.csv('~/Desktop/tmp/TOPMed_MESA/MESA_TOPMed_WideID_20190517.csv')
# the first two rows hold an alternative set of column labels (tags); keep row 2 as the tag for each column, then drop both rows
meta_columns = data.frame(name = as.character(colnames(mesa_wide)), tag = as.character(mesa_wide[2, ]))
mesa_wide = mesa_wide[c(-1, -2), ]
# get column index for TOR ID
tor_id_cols_idx = which(!is.na(stringr::str_match(meta_columns$tag, 'tor_id')))
# and get column index for TOR specimen
tor_specimen_cols = stringr::str_replace(meta_columns$tag[tor_id_cols_idx], 'id', 'specimen')
tor_specimen_cols_idx = sapply(tor_specimen_cols, function(x) {
which(meta_columns$tag == x)
})
mesa_tor_unfold = list()
for(i in 1 : nrow(mesa_wide)) {
indiv_ = mesa_wide[i, ]
for(k in 1 : length(tor_id_cols_idx)) {
tor_id = as.character(indiv_[tor_id_cols_idx[k]])
tor_specimen = as.character(indiv_[tor_specimen_cols_idx[k]])
if(tor_id == '') {
next
} else {
mesa_tor_unfold[[length(mesa_tor_unfold) + 1]] = data.frame(share_id = mesa_wide$SHARe.Participant.ID[i], tor_id = tor_id, tor_specimen = tor_specimen)
}
}
}
mesa_tor_unfold = do.call(rbind, mesa_tor_unfold)
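# quick check (a sketch, not in the original analysis): the left_join below implicitly
# assumes tor_id is a unique key in mesa_tor_unfold; a duplicated tor_id would duplicate
# rows of indiv_meta after the join
stopifnot(!any(duplicated(mesa_tor_unfold$tor_id)))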
# check if there are any duplicated individuals
tmp_ = indiv_meta %>% left_join(mesa_tor_unfold, by = 'tor_id')
tmp_ %>% group_by(share_id) %>% summarize(nsample = n()) %>% arrange(desc(nsample))
## # A tibble: 606 x 2
## share_id nsample
## <chr> <int>
## 1 <NA> 27
## 2 10020 2
## 3 10052 2
## 4 10055 2
## 5 10131 2
## 6 10139 2
## 7 10167 2
## 8 10195 2
## 9 10226 2
## 10 10248 2
## # … with 596 more rows
# suppose we limit to exam 1
indiv_meta_exam_1 = indiv_meta %>% filter(exam == '1')
indiv_meta_exam_1 %>% group_by(race) %>% summarize(n())
## # A tibble: 4 x 2
## race `n()`
## <chr> <int>
## 1 Asian 40
## 2 Black 124
## 3 Hispanic 117
## 4 White 180
# check again for duplicated individuals
tmp_ = indiv_meta_exam_1 %>% left_join(mesa_tor_unfold, by = 'tor_id')
tmp_ %>% group_by(share_id) %>% summarize(nsample = n()) %>% arrange(desc(nsample))
## # A tibble: 450 x 2
## share_id nsample
## <chr> <int>
## 1 <NA> 12
## 2 10020 1
## 3 10028 1
## 4 10043 1
## 5 10052 1
## 6 10055 1
## 7 10061 1
## 8 10125 1
## 9 10131 1
## 10 10139 1
## # … with 440 more rows
# we further remove individuals without share ID
indiv_meta_exam_1_w_shareid = tmp_ %>% filter(!is.na(share_id)) # %>% group_by(race) %>% summarize(n())
# ok, filter the original transcriptome matrix
indiv_meta = indiv_meta_exam_1_w_shareid
df_indiv = colnames(df)[c(-1, -2)]
df_cleaned = df[, c(T, T, df_indiv %in% indiv_meta$tor_id)]
rownames(df_cleaned) = df$Name
Limit to protein-coding and/or lincRNA genes. Limit to genes with non-zero observations in all individuals. Limit to the top 1,000 most highly expressed genes, where expression is measured as the median RPKM across all individuals. Then, for each population, do inverse normalization gene by gene to obtain the expression matrix.
df_protein_or_linc = df_cleaned %>% filter(Description %in% gene_annotation_gencode_v26_hg38$gene_annotation$gene_name[gene_annotation_gencode_v26_hg38$gene_annotation$gene_type %in% c('protein_coding', 'lincRNA')])
n_zeros = apply(df_protein_or_linc[, c(-1, -2)], 1, function(x) {
sum(x == 0)
})
rownames(df_protein_or_linc) = df_protein_or_linc$Name
df_protein_or_linc_non_zero = df_protein_or_linc[n_zeros == 0, ]
expr_level = apply(df_protein_or_linc_non_zero[, c(-1, -2)], 1, function(x) {
median(x)
})
top_1000_expr_level_ind = rank(-expr_level, ties.method = 'first') <= 1000 # ties.method = 'first' keeps exactly 1000 genes even if median RPKM values tie
df_protein_or_linc_non_zero = df_protein_or_linc_non_zero[top_1000_expr_level_ind, ]
tmp = df_protein_or_linc_non_zero[, 3 : ncol(df_protein_or_linc_non_zero)]
tmp = tmp[, match(indiv_meta$tor_id, colnames(tmp))]
df_protein_or_linc_non_zero = cbind(df_protein_or_linc_non_zero[, 1:2], tmp)
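# consistency check (a sketch, not in the original pipeline): after the match() above,
# the expression columns should line up one-to-one with indiv_meta$tor_id, which the
# per-population normalization and the individual color annotation below rely on
stopifnot(all(colnames(df_protein_or_linc_non_zero)[-(1:2)] == indiv_meta$tor_id))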
# some meta operations
indiv_meta %>% group_by(race) %>% summarize(n = n())
## # A tibble: 4 x 2
## race n
## <chr> <int>
## 1 Asian 40
## 2 Black 121
## 3 Hispanic 108
## 4 White 180
color_panel = data.frame(pop = unique(indiv_meta$race), color = candidate_colors[1 : length(unique(indiv_meta$race))])
indiv_color_panel = indiv_meta %>% left_join(color_panel, by = c('race' = 'pop')) %>% pull(color)
names(indiv_color_panel) = indiv_meta$tor_id
Perform inverse normalization, both across all individuals and within each population.
# among all
normalized_mat = t(apply(df_protein_or_linc_non_zero[, c(-1, -2)], 1, inverse_normalize))
# for each population
mat = df_protein_or_linc_non_zero[, c(-1, -2)]
normalized_mat_by_pop = matrix(NA, ncol = ncol(normalized_mat), nrow = nrow(normalized_mat))
start = 1
starts = c()
names_ = c()
for(rr in unique(indiv_meta$race)) {
starts = c(starts, start)
message(rr, ' start = ', start)
indiv_here = indiv_meta$tor_id[indiv_meta$race == rr]
mat_sub = mat[, colnames(mat) %in% indiv_here]
normalized_mat_by_pop[, start : (length(indiv_here) + start - 1)] = t(apply(mat_sub, 1, inverse_normalize))
start = length(indiv_here) + start
names_ = c(names_, colnames(mat_sub))
}
## Asian start = 1
## Black start = 41
## Hispanic start = 162
## White start = 270
colnames(normalized_mat_by_pop) = names_
rownames(normalized_mat_by_pop) = rownames(normalized_mat)
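# small check (a sketch): the per-population matrix should cover exactly the same
# individuals as the overall matrix before we compare the two correlation structures
stopifnot(setequal(colnames(normalized_mat_by_pop), indiv_meta$tor_id))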
cor_indiv = fast_corr(t(as.matrix(normalized_mat)))
cor_indiv_by_pop = fast_corr(t(as.matrix(normalized_mat_by_pop)))
Visualization.
# first 90 individuals (all 40 Asian samples plus the first 50 Black samples)
myimage(cor_indiv[1:90, 1:90], indiv_color_panel)
myimage(cor_indiv_by_pop[1:90, 1:90], indiv_color_panel)
# take the first 40 individuals from each population
indexes = c()
for(i in starts) {
indexes = c(indexes, i : (i + 40 - 1))
}
myimage(cor_indiv[indexes, indexes], indiv_color_panel)
myimage(cor_indiv_by_pop[indexes, indexes], indiv_color_panel)
As a test case, I calculate the gene correlation matrix within Black (using expression that was inverse-normalized within each population). Here we are interested in measuring how similar Black's gene correlation matrix is to White's. To account for sampling variability, we partition the White samples into two parts: 1) a test part, equal in size to the Black sample; and 2) a reference part, the rest (the larger part). For the first partition, we calculate the distance between the Black matrix and the reference-part matrix; this is the observed statistic. For the other partitions, we calculate the distance between the test-part matrix and the reference-part matrix to form a null distribution. Here the distance, either the elementwise 2-norm (Frobenius norm) of the matrix difference or the correlation of the upper-triangular entries, is the test statistic.
Pack it up as a quick-and-dirty function.
test_gene_cor_between_pop = function(test_pop, ref_pop, expr_mat) {
set.seed(1)
## Note: an earlier version randomly subsampled 500 genes here to speed up code testing;
## that subsetting is no longer applied.
## Rename the arguments to the variable names used before this was packed into a function.
normalized_mat_by_pop_debug = expr_mat
test_indiv_here = test_pop
ref_indiv_here = ref_pop
num_partitions = 100
# test_pop = 'Asian'
# test_indiv_here = indiv_meta$tor_id[indiv_meta$race == test_pop]
expr_test = normalized_mat_by_pop_debug[, colnames(normalized_mat_by_pop_debug) %in% test_indiv_here]
mat_test = fast_corr(expr_test)
# ref_pop = 'White'
# ref_indiv_here = indiv_meta$tor_id[indiv_meta$race == ref_pop]
partition_here = get_partition(ref_indiv_here, test_size = length(test_indiv_here))
expr_ref = normalized_mat_by_pop_debug[, colnames(normalized_mat_by_pop_debug) %in% partition_here$reference]
mat_ref = fast_corr(expr_ref)
test_u = mat_dist(mat_test, mat_ref)
test_u2 = mat_cor(mat_test, mat_ref)
test_null = c()
test_null2 = c()
for(i in 1 : num_partitions) {
partition_here = get_partition(ref_indiv_here, test_size = length(test_indiv_here))
expr_ref = normalized_mat_by_pop_debug[, colnames(normalized_mat_by_pop_debug) %in% partition_here$reference]
mat_ref = fast_corr(expr_ref)
expr_tref = normalized_mat_by_pop_debug[, colnames(normalized_mat_by_pop_debug) %in% partition_here$test]
mat_tref = fast_corr(expr_tref)
test_null = c(test_null, mat_dist(mat_tref, mat_ref))
test_null2 = c(test_null2, mat_cor(mat_tref, mat_ref))
}
#
# myimage(mat_test[1:100, 1:100])
# myimage(mat_ref[1:100, 1:100])
p1 = ggplot(data.frame(mat_dist = test_null)) + geom_histogram(aes(x = mat_dist)) + geom_vline(xintercept = test_u) + ggtitle('matrix distance (Frobenius norm)')
p2 = ggplot(data.frame(mat_cor = test_null2)) + geom_histogram(aes(x = mat_cor)) + geom_vline(xintercept = test_u2) + ggtitle('matrix upper triangular correlation')
return(list(p1, p2))
}
test_pop = 'Black'
test_indiv_here = indiv_meta$tor_id[indiv_meta$race == test_pop]
ref_pop = 'White'
ref_indiv_here = indiv_meta$tor_id[indiv_meta$race == ref_pop]
po = test_gene_cor_between_pop(
test_pop = test_indiv_here,
ref_pop = ref_indiv_here,
expr_mat = normalized_mat_by_pop
)
po[[1]] / po[[2]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
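The histogram shows where the observed Black-vs-reference statistic (vertical line) falls relative to the null splits of White. If a single number is preferred, a small helper like the one below (not part of the original analysis; empirical_pval is a name I made up) turns the observed value and the null draws into an empirical p-value; the commented usage pulls them out of the returned ggplot objects the same way the split-comparison code further below does.
# hedged sketch: empirical p-value from the permutation null, with add-one smoothing
# so the estimate is never exactly zero
empirical_pval = function(obs, null, larger_is_more_extreme = TRUE) {
if(larger_is_more_extreme) {
(1 + sum(null >= obs)) / (1 + length(null))
} else {
(1 + sum(null <= obs)) / (1 + length(null))
}
}
# e.g. for the Black vs White run above:
# distance statistic (larger = more different)
# empirical_pval(obs = po[[1]]$layers[[2]]$data$xintercept[1], null = po[[1]]$data$mat_dist)
# correlation statistic (smaller = more different)
# empirical_pval(obs = po[[2]]$layers[[2]]$data$xintercept[1], null = po[[2]]$data$mat_cor, larger_is_more_extreme = FALSE)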
test_pop = 'Hispanic'
test_indiv_here = indiv_meta$tor_id[indiv_meta$race == test_pop]
ref_pop = 'White'
ref_indiv_here = indiv_meta$tor_id[indiv_meta$race == ref_pop]
po = test_gene_cor_between_pop(
test_pop = test_indiv_here,
ref_pop = ref_indiv_here,
expr_mat = normalized_mat_by_pop
)
po[[1]] / po[[2]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
test_pop = 'Asian'
test_indiv_here = indiv_meta$tor_id[indiv_meta$race == test_pop]
ref_pop = 'White'
ref_indiv_here = indiv_meta$tor_id[indiv_meta$race == ref_pop]
po = test_gene_cor_between_pop(
test_pop = test_indiv_here,
ref_pop = ref_indiv_here,
expr_mat = normalized_mat_by_pop
)
po[[1]] / po[[2]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
set.seed(1)
downsample_size = 40
test_pop = 'Black'
test_indiv_here = indiv_meta$tor_id[indiv_meta$race == test_pop]
test_indiv_here = test_indiv_here[sample(length(test_indiv_here), size = downsample_size)]
ref_pop = 'White'
ref_indiv_here = indiv_meta$tor_id[indiv_meta$race == ref_pop]
po = test_gene_cor_between_pop(
test_pop = test_indiv_here,
ref_pop = ref_indiv_here,
expr_mat = normalized_mat_by_pop
)
po[[1]] / po[[2]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
set.seed(1)
downsample_size = 40
test_pop = 'Hispanic'
test_indiv_here = indiv_meta$tor_id[indiv_meta$race == test_pop]
test_indiv_here = test_indiv_here[sample(length(test_indiv_here), size = downsample_size)]
ref_pop = 'White'
ref_indiv_here = indiv_meta$tor_id[indiv_meta$race == ref_pop]
po = test_gene_cor_between_pop(
test_pop = test_indiv_here,
ref_pop = ref_indiv_here,
expr_mat = normalized_mat_by_pop
)
po[[1]] / po[[2]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Here we try different splits of White, using a random subsample of White as the test population against the full White set as the reference, and see how the matrix similarity measurements change with the split size.
ref_pop = 'White'
ref_indiv_here = indiv_meta$tor_id[indiv_meta$race == ref_pop]
proportion = c(0.5, 1 : 5) / 10
plist = list()
for(i in proportion) {
downsample_size = floor(length(ref_indiv_here) * i)
test_indiv_here = ref_indiv_here[sample(length(ref_indiv_here), size = downsample_size)]
message('proportion = ', i, ', downsample = ', downsample_size)
plist[[length(plist) + 1]] = test_gene_cor_between_pop(
test_pop = test_indiv_here,
ref_pop = ref_indiv_here,
expr_mat = normalized_mat_by_pop
)
}
## proportion = 0.05, downsample = 9
## proportion = 0.1, downsample = 18
## proportion = 0.2, downsample = 36
## proportion = 0.3, downsample = 54
## proportion = 0.4, downsample = 72
## proportion = 0.5, downsample = 90
mat_dist_collector = list()
mat_dist_obs = list()
mat_cor_collector = list()
mat_cor_obs = list()
split_order = c()
for(i in 1 : length(plist)) {
downsample_size = floor(length(ref_indiv_here) * proportion[i])
split_str = paste0('split: ', downsample_size, '/', length(ref_indiv_here) - downsample_size)
split_order = c(split_order, split_str)
# pull the null draws and the observed statistic back out of the stored ggplot objects (layer 2 is the geom_vline)
mat_dist_collector[[length(mat_dist_collector) + 1]] = data.frame(dist = plist[[i]][[1]]$data$mat_dist, proportion = proportion[i], split_str = split_str)
mat_dist_obs[[length(mat_dist_obs) + 1]] = data.frame(obs = plist[[i]][[1]]$layers[[2]]$data$xintercept[1], proportion = proportion[i], split_str = split_str)
mat_cor_collector[[length(mat_cor_collector) + 1]] = data.frame(cor = plist[[i]][[2]]$data$mat_cor, proportion = proportion[i], split_str = split_str)
mat_cor_obs[[length(mat_cor_obs) + 1]] = data.frame(obs = plist[[i]][[2]]$layers[[2]]$data$xintercept[1], proportion = proportion[i], split_str = split_str)
}
mat_dist_collector = do.call(rbind, mat_dist_collector)
mat_dist_collector$split_str = factor(mat_dist_collector$split_str, levels = split_order)
mat_dist_obs = do.call(rbind, mat_dist_obs)
mat_cor_collector = do.call(rbind, mat_cor_collector)
mat_cor_collector$split_str = factor(mat_cor_collector$split_str, levels = split_order)
mat_cor_obs = do.call(rbind, mat_cor_obs)
mat_dist_collector %>% ggplot() + geom_histogram(aes(x = dist)) + facet_wrap(~split_str, ncol = 1) + ggtitle('Matrix distance (Frobenius norm)') # + geom_vline(data = mat_dist_obs, aes(xintercept = obs))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
mat_cor_collector %>% ggplot() + geom_histogram(aes(x = cor)) + facet_wrap(~split_str, ncol = 1) + ggtitle('Matrix correlation') # + geom_vline(data = mat_cor_obs, aes(xintercept = obs))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.