Download BioKG

The following Python script downloads the BioKG knowledge graph from the Open Graph Benchmark (OGB).

import numpy as np
import copy
import json
from ogb.linkproppred import LinkPropPredDataset

dataset = LinkPropPredDataset(name = "ogbl-biokg")

split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
graph = dataset[0] # graph: library-agnostic graph object

The edge index in the graph object is converted into JSON format to be read into R.

print(graph.keys())
edge_index = graph["edge_index_dict"].copy()

To make the conversion, the dictionary keys must be renamed (i.e., cannot be tuples).

old_keys = list(edge_index.keys())
for old_name in old_keys:
    new_name = "--".join(old_name)
    edge_index[new_name] = edge_index[old_name]
    del edge_index[old_name]

A special numpy encoder is defined, borrowed from this StackOverflow post.

class NumpyEncoder(json.JSONEncoder):
    """ Special json encoder for numpy types """
    def default(self, obj):
        if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
                            np.int16, np.int32, np.int64, np.uint8,
                            np.uint16, np.uint32, np.uint64)):
            return int(obj)
        elif isinstance(obj, (np.float_, np.float16, np.float32,
                              np.float64)):
            return float(obj)
        elif isinstance(obj, (np.ndarray,)):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

Finally, we convert the edge index to JSON and write to a file.

edge_index_json = json.dumps(edge_index, cls = NumpyEncoder)

with open('inst//extdata//edge_index.json', 'a') as f:
    f.write(edge_index_json + '\n')

Parse KG in R

Read the JSON dataset saved previously.

# load libraries
library(data.table)
library(purrr)
library(magrittr)
library(rjson)

# load metapaths library
library(metapaths)

# read data
biokg = fromJSON(file = "inst/extdata/edge_index.json")

Read mappings downloaded from OGB.

mapping_dir = "inst/extdata/biokg_mappings"

read_mapping = function(mapping_path) {
  mapping = fread(file.path(mapping_dir, mapping_path))
  mapping[, Type := strsplit(mapping_path, "_")[[1]][1]]
  return(mapping)
}

mappings = list.files(mapping_dir) %>%
  .[grepl("entidx2name", .)] %>%
  map_dfr(read_mapping)

mappings = mappings %>% copy() %>%
  setnames(c("ent idx", "ent name"), c("Index", "Name")) %>%
  setcolorder("Type") %>%
  .[, Label := paste(Type, Index, sep = "_")]

Create a function to convert the edge list to a data.table. Note that the node IDs are specific to each type, so we must add a type-specific prefix.

convert_biokg = function(sub_kg, sub_label) {
  
  # split label
  split_label = strsplit(sub_label, "--")[[1]]
  origin_label = paste(split_label[1], sub_kg[[1]], sep = "_")
  destination_label = paste(split_label[3], sub_kg[[2]], sep = "_")
  
  # create data table
  kg_dt = data.table(
    Origin = origin_label,
    Destination = destination_label,
    OriginType = split_label[1],
    DestinationType = split_label[3],
    EdgeType = split_label[2])
  
}

Now, map the conversion function over the biokg list.

biokg_edge_list = imap_dfr(biokg, convert_biokg)
biokg_node_list = get_node_list(biokg_edge_list)
head(biokg_edge_list)

Check that the counts of each node type conform with graph["num_nodes_dict"] from the Python script.

disease drug function protein sideeffect
10687 10533 45085 17499 9969
biokg_node_list[, .N, by = NodeType]

Add node names from mappings table.

# add node names
biokg_node_list = merge(biokg_node_list, mappings[, .(Label, Name)], by.x = "Node", by.y = "Label", sort = F) %>%
  setnames("Name", "NodeName")

Sample the knowledge graph to generate a small, connected test set.

# load igraph library
library(igraph)

# generate graph
biokg_graph = igraph::graph.data.frame(biokg_edge_list, vertices = biokg_node_list, directed = T)

# sample random nodes
set.seed(42)
seed_nodes = sample(biokg_node_list$Node, 5000)

# generate induced subgraph
biokg_sample = igraph::induced.subgraph(graph = biokg_graph, vids = seed_nodes)

# get largest connected component
comp = igraph::components(biokg_sample) 
comp_idx = which.max(comp$csize)
lcc = comp$membership %>%
  {names(.)[. == comp_idx]}

# generate connected subgraph
biokg_sub = igraph::induced.subgraph(graph = biokg_graph, vids = lcc)

# generate subsamples node and edge lists
sub_edge_list = biokg_edge_list[Origin %in% lcc & Destination %in% lcc]
sub_node_list = biokg_node_list[Node %in% lcc]

# convenience function
lookup = function(node) {
  # return(sub_node_list[Node == node, NodeName])
  return(biokg_node_list[Node == node, NodeName])
}

Save node list and edge list to file.

save(sub_edge_list, file = "data/sub_edge_list.rda")
save(sub_node_list, file = "data/sub_node_list.rda")

Test Node Similarity

Check possible edge types.

message("- ", paste(unique(sub_node_list[, NodeType]), collapse = "\n- "))

Find hub genes.

# calculate degree
sub_degrees = degree(biokg_sub)

# find candidate nodes
candidate_nodes = sub_degrees %>%
  .[which(. > 20 & . < 30)] %>% names() %>% .[grepl("protein", .)] %>%
  { sub_node_list[Node %in% ., NodeName] }

cat(paste("HGNC:", candidate_nodes, sep = ""), sep = "\n")

Test similarity between an Alzheimer’s disease (AD)-related drug (donepezil) and an AD-related pathway (“regulation of amyloid fibril formation”).

sim_test = get_similarity(
  "drug_762", "function_42893",
  mp = c("drug", "disease", "protein", "function"),
  node_list = biokg_node_list,
  edge_list = biokg_edge_list)

sim_res = sim_test$OriginPaths  %>%
  .[.[["function"]] == "function_36342", ]

Visualize identified metapaths in the KG.

sim_nodes = unique(unlist(sim_res))

# generate connected subgraph
sim_graph = igraph::induced.subgraph(graph = biokg_graph, vids = sim_nodes)

# plot subgraph
plot(sim_graph, vertex.size = 15, edge.arrow.size = 0.25)

As a negative control, randomly sample 100 pathways (i.e., nodes of type "function" likely not AD-related) and compute the similarity.

set.seed(42)

# sample 100 pathways
random_neg_pathways = biokg_node_list[NodeType == "function", ] %>%
    .[sample(1:nrow(.), 100), ] 
neg_tests = list()
neg_sims = data.table()

# for each pathway, compute similarity
for (i in 1:100) {
  
  message("Pathway #", i, ": ", random_neg_pathways[i, NodeName])
  
  sim_neg_test = get_similarity(
  "drug_762", random_neg_pathways[i, Node], # "function_29825",
  mp = c("drug", "disease", "protein", "function"),
  metric = c("pc", "npc", "dwpc"),
  node_list = biokg_node_list,
  edge_list = biokg_edge_list,
  verbose = F)
  
  neg_tests = append(neg_tests, sim_neg_test)
  neg_sims = rbind(neg_sims, sim_neg_test$Similarity[, Pathway := random_neg_pathways[i, NodeName]])
  
}

# calculate statistics (mean, median, SD, etc.)
neg_sims[, sum(Similarity == 0), by = "Metric"]
neg_sims[, mean(Similarity), by = "Metric"]
neg_sims[, median(Similarity), by = "Metric"]
neg_sims[, sd(Similarity), by = "Metric"]

# saveRDS(neg_tests, "<insert file path>.RDS")
# fwrite(neg_sims, "<insert file path>.csv")

Test Set Similarity

Again, test set-to-set similarity by computing similarity scores between three AD-related drugs (i.e., donepezil, memantine, and galantamine) and a set of six AD-related pathways.

# set of three AD drugs
alzheimer_drugs = c("drug_762", "drug_1075", "drug_895")

# set of six AD-related pathways
alzheimer_pathways = c("function_36342", "function_26258", "function_21333", "function_42901", "function_17601", "function_36167")

# compare sets
alzheimer_test = compare_sets(
  alzheimer_drugs,
  alzheimer_pathways,
  mp = c("drug", "disease", "protein", "function"),
  method = c("minimum", "maximum"),
  node_list = biokg_node_list,
  edge_list = biokg_edge_list)

As a negative control, test set-to-set similarity by computing similarity scores between the same three AD-related drugs and a set of six randomly subset pathways.

set.seed(1234) # selecting different seed

# randomly subset pathways
random_pathways = biokg_node_list[NodeType == "function", ] %>%
  .[sample(1:nrow(.), 6), ]

# compare sets
random_test = compare_sets(
  alzheimer_drugs,
  random_pathways$Node,
  mp = c("drug", "disease", "protein", "function"),
  method = c("minimum", "maximum"),
  node_list = biokg_node_list,
  edge_list = biokg_edge_list)