The following Python script downloads the BioKG knowledge graph from the Open Graph Benchmark (OGB).
import numpy as np
import copy
import json
from ogb.linkproppred import LinkPropPredDataset
= LinkPropPredDataset(name = "ogbl-biokg")
dataset
= dataset.get_edge_split()
split_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
train_edge, valid_edge, test_edge = dataset[0] # graph: library-agnostic graph object graph
The edge index in the graph
object is converted into
JSON format to be read into R.
print(graph.keys())
= graph["edge_index_dict"].copy() edge_index
To make the conversion, the dictionary keys must be renamed (i.e., cannot be tuples).
= list(edge_index.keys())
old_keys for old_name in old_keys:
= "--".join(old_name)
new_name = edge_index[old_name]
edge_index[new_name] del edge_index[old_name]
A special numpy encoder is defined, borrowed from this StackOverflow post.
class NumpyEncoder(json.JSONEncoder):
""" Special json encoder for numpy types """
def default(self, obj):
if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
np.int16, np.int32, np.int64, np.uint8,
np.uint16, np.uint32, np.uint64)):return int(obj)
elif isinstance(obj, (np.float_, np.float16, np.float32,
np.float64)):return float(obj)
elif isinstance(obj, (np.ndarray,)):
return obj.tolist()
return json.JSONEncoder.default(self, obj)
Finally, we convert the edge index to JSON and write to a file.
= json.dumps(edge_index, cls = NumpyEncoder)
edge_index_json
with open('inst//extdata//edge_index.json', 'a') as f:
+ '\n') f.write(edge_index_json
Read the JSON dataset saved previously.
# load libraries
library(data.table)
library(purrr)
library(magrittr)
library(rjson)
# load metapaths library
library(metapaths)
# read data
biokg = fromJSON(file = "inst/extdata/edge_index.json")
Read mappings downloaded from OGB.
mapping_dir = "inst/extdata/biokg_mappings"
read_mapping = function(mapping_path) {
mapping = fread(file.path(mapping_dir, mapping_path))
mapping[, Type := strsplit(mapping_path, "_")[[1]][1]]
return(mapping)
}
mappings = list.files(mapping_dir) %>%
.[grepl("entidx2name", .)] %>%
map_dfr(read_mapping)
mappings = mappings %>% copy() %>%
setnames(c("ent idx", "ent name"), c("Index", "Name")) %>%
setcolorder("Type") %>%
.[, Label := paste(Type, Index, sep = "_")]
Create a function to convert the edge list to a
data.table
. Note that the node IDs are specific to each
type, so we must add a type-specific prefix.
convert_biokg = function(sub_kg, sub_label) {
# split label
split_label = strsplit(sub_label, "--")[[1]]
origin_label = paste(split_label[1], sub_kg[[1]], sep = "_")
destination_label = paste(split_label[3], sub_kg[[2]], sep = "_")
# create data table
kg_dt = data.table(
Origin = origin_label,
Destination = destination_label,
OriginType = split_label[1],
DestinationType = split_label[3],
EdgeType = split_label[2])
}
Now, map the conversion function over the biokg
list.
biokg_edge_list = imap_dfr(biokg, convert_biokg)
biokg_node_list = get_node_list(biokg_edge_list)
head(biokg_edge_list)
Check that the counts of each node type conform with
graph["num_nodes_dict"]
from the Python script.
disease | drug | function | protein | sideeffect |
---|---|---|---|---|
10687 | 10533 | 45085 | 17499 | 9969 |
biokg_node_list[, .N, by = NodeType]
Add node names from mappings
table.
# add node names
biokg_node_list = merge(biokg_node_list, mappings[, .(Label, Name)], by.x = "Node", by.y = "Label", sort = F) %>%
setnames("Name", "NodeName")
Sample the knowledge graph to generate a small, connected test set.
# load igraph library
library(igraph)
# generate graph
biokg_graph = igraph::graph.data.frame(biokg_edge_list, vertices = biokg_node_list, directed = T)
# sample random nodes
set.seed(42)
seed_nodes = sample(biokg_node_list$Node, 5000)
# generate induced subgraph
biokg_sample = igraph::induced.subgraph(graph = biokg_graph, vids = seed_nodes)
# get largest connected component
comp = igraph::components(biokg_sample)
comp_idx = which.max(comp$csize)
lcc = comp$membership %>%
{names(.)[. == comp_idx]}
# generate connected subgraph
biokg_sub = igraph::induced.subgraph(graph = biokg_graph, vids = lcc)
# generate subsamples node and edge lists
sub_edge_list = biokg_edge_list[Origin %in% lcc & Destination %in% lcc]
sub_node_list = biokg_node_list[Node %in% lcc]
# convenience function
lookup = function(node) {
# return(sub_node_list[Node == node, NodeName])
return(biokg_node_list[Node == node, NodeName])
}
Save node list and edge list to file.
Check possible edge types.
Find hub genes.
# calculate degree
sub_degrees = degree(biokg_sub)
# find candidate nodes
candidate_nodes = sub_degrees %>%
.[which(. > 20 & . < 30)] %>% names() %>% .[grepl("protein", .)] %>%
{ sub_node_list[Node %in% ., NodeName] }
cat(paste("HGNC:", candidate_nodes, sep = ""), sep = "\n")
Test similarity between an Alzheimer’s disease (AD)-related drug (donepezil) and an AD-related pathway (“regulation of amyloid fibril formation”).
sim_test = get_similarity(
"drug_762", "function_42893",
mp = c("drug", "disease", "protein", "function"),
node_list = biokg_node_list,
edge_list = biokg_edge_list)
sim_res = sim_test$OriginPaths %>%
.[.[["function"]] == "function_36342", ]
Visualize identified metapaths in the KG.
sim_nodes = unique(unlist(sim_res))
# generate connected subgraph
sim_graph = igraph::induced.subgraph(graph = biokg_graph, vids = sim_nodes)
# plot subgraph
plot(sim_graph, vertex.size = 15, edge.arrow.size = 0.25)
As a negative control, randomly sample 100 pathways (i.e., nodes of
type "function"
likely not AD-related) and compute the
similarity.
set.seed(42)
# sample 100 pathways
random_neg_pathways = biokg_node_list[NodeType == "function", ] %>%
.[sample(1:nrow(.), 100), ]
neg_tests = list()
neg_sims = data.table()
# for each pathway, compute similarity
for (i in 1:100) {
message("Pathway #", i, ": ", random_neg_pathways[i, NodeName])
sim_neg_test = get_similarity(
"drug_762", random_neg_pathways[i, Node], # "function_29825",
mp = c("drug", "disease", "protein", "function"),
metric = c("pc", "npc", "dwpc"),
node_list = biokg_node_list,
edge_list = biokg_edge_list,
verbose = F)
neg_tests = append(neg_tests, sim_neg_test)
neg_sims = rbind(neg_sims, sim_neg_test$Similarity[, Pathway := random_neg_pathways[i, NodeName]])
}
# calculate statistics (mean, median, SD, etc.)
neg_sims[, sum(Similarity == 0), by = "Metric"]
neg_sims[, mean(Similarity), by = "Metric"]
neg_sims[, median(Similarity), by = "Metric"]
neg_sims[, sd(Similarity), by = "Metric"]
# saveRDS(neg_tests, "<insert file path>.RDS")
# fwrite(neg_sims, "<insert file path>.csv")
Again, test set-to-set similarity by computing similarity scores between three AD-related drugs (i.e., donepezil, memantine, and galantamine) and a set of six AD-related pathways.
# set of three AD drugs
alzheimer_drugs = c("drug_762", "drug_1075", "drug_895")
# set of six AD-related pathways
alzheimer_pathways = c("function_36342", "function_26258", "function_21333", "function_42901", "function_17601", "function_36167")
# compare sets
alzheimer_test = compare_sets(
alzheimer_drugs,
alzheimer_pathways,
mp = c("drug", "disease", "protein", "function"),
method = c("minimum", "maximum"),
node_list = biokg_node_list,
edge_list = biokg_edge_list)
As a negative control, test set-to-set similarity by computing similarity scores between the same three AD-related drugs and a set of six randomly subset pathways.
set.seed(1234) # selecting different seed
# randomly subset pathways
random_pathways = biokg_node_list[NodeType == "function", ] %>%
.[sample(1:nrow(.), 6), ]
# compare sets
random_test = compare_sets(
alzheimer_drugs,
random_pathways$Node,
mp = c("drug", "disease", "protein", "function"),
method = c("minimum", "maximum"),
node_list = biokg_node_list,
edge_list = biokg_edge_list)