Published December 3, 2021 | Version 21Q4
Dataset Open

HMDB Clean

Description

This is a cleaned and condensed version of the HMDB XML metabolites file from https://hmdb.ca/downloads, current version 5.0. The original zipped file (~1GB) extracts to 6.49 GB xml file. The full is then cleaned and then only small sub-files are exported. This is too big to extract from the zipped file on the server, and gets clipped (to about 4GB). As a result, the XML to DF function fails because of missing data.

This repo containing 4 files is an alternative pointer for only the data needed. Code to process it from the hmdb_full file is pasted in the notes below.

These 4 files are used on www.datadrivenhypothesis.org

Notes

#read current release information to set parameters for download source(here::here("code", "current_release.R")) #LOAD BIG data temp <- tempfile() download.file(metabolites_url, temp) hmdb_full <- xmlToDataFrame(unzip(temp)) %>% #extracting XML took a long time for a 4GB XML file! janitor::clean_names() unlink(temp) unlink("hmdb_metabolites.xml") #hmdb_full <- xmlToDataFrame(here::here("tmp", "hmdb_metabolites.xml")) %>% janitor::clean_names() #for getting metabolite classes class_finder <- function(string) { if(stringr::str_detect(string, "class\\b")) { class_compounds <- function(string) { tmp <- purrr::map(string, ~ unlist(str_split(., pattern = "\\."))) %>% purrr::map(., ~keep(.x, ~ str_detect(.x, "class"))) %>% flatten_chr(.) class <- stringr::str_split(tmp, "class of organic compounds known as ") return(stringr::str_to_sentence(class[[1]][[2]]))} tryCatch(class_compounds(string), error = function(x){"Organic compound"}) } else if(stringr::str_detect(string, "classified\\b")) { classified_compounds <- function(string) { tmp <- purrr::map(string, ~ unlist(str_split(., pattern = "\\."))) %>% purrr::map(., ~keep(.x, ~ str_detect(.x, "classified"))) %>% flatten_chr(.) class <- stringr::str_split(tmp, "classified as a member of the ") return(stringr::str_to_sentence(class[[1]][[2]]))} tryCatch(classified_compounds(string), error = function(x){"Organic compound"}) } else if(stringr::str_detect(string, "steroid")) { return("Steroid hormone") } else if(stringr::str_detect(string, "nucleoside")) { return("Nucleoside") } else if(stringr::str_detect(string, "acid")) { return("Organic acid") } else { return("Organic compound") } } #purrr::map(hmdb_class$description, ~ class_finder(.)) hmdb_class <- hmdb_full %>% #slice(1:10) %>% #for testing dplyr::select(name, description) %>% dplyr::mutate(class = purrr::map(description, ~ class_finder(.))) %>% dplyr::select(-description) hmdb_names <- hmdb_full %>% #slice(1:10) %>% #for testing dplyr::select(name, synonyms, cid = pubchem_compound_id) %>% dplyr::left_join(hmdb_class, by = "name") hmdb_meta <- hmdb_full %>% dplyr::select(name, synonyms, cid = pubchem_compound_id, accession, description, chemical_formula, average_molecular_weight, wikipedia_id) %>% dplyr::left_join(hmdb_class, by = "name") temp <- tempfile() download.file(metabolite_proteins_url, temp) hmdb_proteins_raw <- XML::xmlToDataFrame(unzip(temp)) %>% janitor::clean_names() unlink(temp) unlink("hmdb_proteins.xml") censor_proteins <- c("") censor_metabolites <- c("","Water", "Hydrogen Ion") hmdb_proteins_long <- hmdb_proteins_raw %>% #dplyr::slice(1:20) %>% #for testing dplyr::mutate(metabolite_associations = stringr::str_replace_all(metabolite_associations, "HMDB", "\\.HMDB")) %>% tidyr::separate_rows(metabolite_associations, sep = "\\.") %>% dplyr::filter(metabolite_associations != "") %>% tidyr::separate(metabolite_associations, into = c("metabolite_accession", "metabolite_name"), sep = 11) %>% dplyr::select(gene_name, metabolite_name, gene_accession = accession, metabolite_accession) %>% dplyr::filter(!gene_name %in% censor_proteins, !metabolite_name %in% censor_metabolites, stringr::str_detect(metabolite_accession, "HMDB")) # tmp <- "Water" # tmp <- "Glucose-6-Phosphate" # tmp <- "3-Carbamoyl-2-phenylpropionaldehyde" # tmp <- "-1(11),7,9-trien-11-ol" # metabolite_string <- "R-95913" #collapse metabolite name if it's long collapse_metabolites <- function(metabolite_string) { # #skip if missing # if(stringr::str_length(metabolite_string) == 0 | is.na(metabolite_string)) { # return(metabolite_string) # } #skip if too short if(stringr::str_length(metabolite_string) < 7) { return(metabolite_string) } #get first word new_metabolite_string <- stringr::str_extract(metabolite_string, "[[:alpha:]]\\w+") #alpha omits numbers and punct, w gets word character, + gets one or more #if new_string is NA b/c it has a bizarre name (like a drug name), code breaks; so return original string if(is.na(new_metabolite_string)) { return(metabolite_string) } #add plus to indicate that it got collapsed if(metabolite_string != new_metabolite_string) { new_metabolite_string <- glue::glue("{new_metabolite_string} +")} return(new_metabolite_string) } # collapse_metabolites(tmp) # simplify_metabolites <- function(df) { # # if (nrow(df) < 20) { # # return(df) # # } # new_df <- # df %>% # mutate(metabolite_name_simple = map_chr(.x = df[[2]], .f = collapse_metabolites)) # return(new_df) # } hmdb_proteins_full <- hmdb_proteins_long %>% dplyr::group_by(fav_gene = gene_name) %>% tidyr::nest() %>% dplyr::filter(!fav_gene %in% censor_proteins) %>% dplyr::mutate(original_num_rows = map_int(data, nrow)) hmdb_proteins <- hmdb_proteins_long %>% dplyr::mutate(metabolite_name_simple = map_chr(metabolite_name, collapse_metabolites)) %>% dplyr::distinct(gene_name, metabolite_name_simple, .keep_all = TRUE) %>% dplyr::group_by(fav_gene = gene_name) %>% tidyr::nest() %>% dplyr::filter(!fav_gene %in% censor_proteins) %>% dplyr::mutate(num_rows = map_int(data, nrow)) %>% dplyr::left_join(hmdb_proteins_full, by = "fav_gene", suffix = c("_collapsed", "_original")) %>% dplyr::arrange(desc(num_rows)) #colnames(hmdb_proteins) are fav_gene, data_collapsed, num_rows, data_original, original_num_rows hmdb_metabolites <- hmdb_proteins_long %>% #dplyr::mutate(metabolite_name = stringr::str_to_lower(metabolite_name)) %>% dplyr::group_by(fav_metabolite = metabolite_name) %>% tidyr::nest() %>% dplyr::filter(!fav_metabolite %in% censor_metabolites) %>% dplyr::mutate(num_rows = map_int(data, nrow)) %>% dplyr::arrange(desc(num_rows)) #save files #saveRDS(hmdb_full, file = here::here("data", paste0(release, "_hmdb_full.Rds"))) saveRDS(hmdb_names, file = here::here("data", paste0(release, "_hmdb_names.Rds"))) saveRDS(hmdb_meta, file = here::here("data", paste0(release, "_hmdb_meta.Rds"))) saveRDS(hmdb_proteins, file = here::here("data", paste0(release, "_hmdb_proteins.Rds"))) saveRDS(hmdb_metabolites, file = here::here("data", paste0(release, "_hmdb_metabolites.Rds")))

Files

Files (35.9 MB)

Name Size Download all
md5:0fa3c4f9ae6389e4712932f260b53318
15.0 MB Download
md5:71f927874f203f539c9d8a98de9466b0
2.1 MB Download
md5:62502e08196e160355a36de476a10493
6.4 MB Download
md5:970ab6aa24db28c6e644473ddcd6e089
12.4 MB Download