{
  "access": {
    "embargo": {
      "active": false,
      "reason": null
    },
    "files": "public",
    "record": "public",
    "status": "open"
  },
  "created": "2021-12-04T17:41:56.852363+00:00",
  "custom_fields": {},
  "deletion_status": {
    "is_deleted": false,
    "status": "P"
  },
  "files": {
    "count": 4,
    "enabled": true,
    "entries": {
      "hmdb_meta.Rds": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:0fa3c4f9ae6389e4712932f260b53318",
        "ext": "rds",
        "id": "9b3efee6-028b-41f8-86c0-857589e29890",
        "key": "hmdb_meta.Rds",
        "links": {
          "content": "https://zenodo.org/api/records/5758926/files/hmdb_meta.Rds/content",
          "self": "https://zenodo.org/api/records/5758926/files/hmdb_meta.Rds"
        },
        "metadata": null,
        "mimetype": "application/octet-stream",
        "size": 15018581,
        "storage_class": "L"
      },
      "hmdb_metabolites.Rds": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:71f927874f203f539c9d8a98de9466b0",
        "ext": "rds",
        "id": "5178f1cd-a5ec-4ccc-8577-ca09ba909199",
        "key": "hmdb_metabolites.Rds",
        "links": {
          "content": "https://zenodo.org/api/records/5758926/files/hmdb_metabolites.Rds/content",
          "self": "https://zenodo.org/api/records/5758926/files/hmdb_metabolites.Rds"
        },
        "metadata": null,
        "mimetype": "application/octet-stream",
        "size": 2067123,
        "storage_class": "L"
      },
      "hmdb_names.Rds": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:62502e08196e160355a36de476a10493",
        "ext": "rds",
        "id": "2cad5824-b3e7-44e1-af2e-e5a69c5a28eb",
        "key": "hmdb_names.Rds",
        "links": {
          "content": "https://zenodo.org/api/records/5758926/files/hmdb_names.Rds/content",
          "self": "https://zenodo.org/api/records/5758926/files/hmdb_names.Rds"
        },
        "metadata": null,
        "mimetype": "application/octet-stream",
        "size": 6409585,
        "storage_class": "L"
      },
      "hmdb_proteins.Rds": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:970ab6aa24db28c6e644473ddcd6e089",
        "ext": "rds",
        "id": "aa74d853-047a-4438-b8b7-c8af4c32634a",
        "key": "hmdb_proteins.Rds",
        "links": {
          "content": "https://zenodo.org/api/records/5758926/files/hmdb_proteins.Rds/content",
          "self": "https://zenodo.org/api/records/5758926/files/hmdb_proteins.Rds"
        },
        "metadata": null,
        "mimetype": "application/octet-stream",
        "size": 12425546,
        "storage_class": "L"
      }
    },
    "order": [],
    "total_bytes": 35920835
  },
  "id": "5758926",
  "is_draft": false,
  "is_published": true,
  "links": {
    "access": "https://zenodo.org/api/records/5758926/access",
    "access_grants": "https://zenodo.org/api/records/5758926/access/grants",
    "access_links": "https://zenodo.org/api/records/5758926/access/links",
    "access_request": "https://zenodo.org/api/records/5758926/access/request",
    "access_users": "https://zenodo.org/api/records/5758926/access/users",
    "archive": "https://zenodo.org/api/records/5758926/files-archive",
    "archive_media": "https://zenodo.org/api/records/5758926/media-files-archive",
    "communities": "https://zenodo.org/api/records/5758926/communities",
    "communities-suggestions": "https://zenodo.org/api/records/5758926/communities-suggestions",
    "doi": "https://doi.org/10.5281/zenodo.5758926",
    "draft": "https://zenodo.org/api/records/5758926/draft",
    "file_modification": "https://zenodo.org/api/records/5758926/file-modification",
    "files": "https://zenodo.org/api/records/5758926/files",
    "latest": "https://zenodo.org/api/records/5758926/versions/latest",
    "latest_html": "https://zenodo.org/records/5758926/latest",
    "media_files": "https://zenodo.org/api/records/5758926/media-files",
    "parent": "https://zenodo.org/api/records/5758925",
    "parent_doi": "https://doi.org/10.5281/zenodo.5758925",
    "parent_doi_html": "https://zenodo.org/doi/10.5281/zenodo.5758925",
    "parent_html": "https://zenodo.org/records/5758925",
    "preview_html": "https://zenodo.org/records/5758926?preview=1",
    "request_deletion": "https://zenodo.org/api/records/5758926/request-deletion",
    "requests": "https://zenodo.org/api/records/5758926/requests",
    "reserve_doi": "https://zenodo.org/api/records/5758926/draft/pids/doi",
    "self": "https://zenodo.org/api/records/5758926",
    "self_doi": "https://doi.org/10.5281/zenodo.5758926",
    "self_doi_html": "https://zenodo.org/doi/10.5281/zenodo.5758926",
    "self_html": "https://zenodo.org/records/5758926",
    "self_iiif_manifest": "https://zenodo.org/api/iiif/record:5758926/manifest",
    "self_iiif_sequence": "https://zenodo.org/api/iiif/record:5758926/sequence/default",
    "versions": "https://zenodo.org/api/records/5758926/versions"
  },
  "media_files": {
    "count": 0,
    "enabled": false,
    "entries": {},
    "order": [],
    "total_bytes": 0
  },
  "metadata": {
    "additional_descriptions": [
      {
        "description": "#read current release information to set parameters for download\nsource(here::here(\"code\", \"current_release.R\"))\n\n#LOAD BIG data \ntemp &lt;- tempfile()\ndownload.file(metabolites_url, temp)\nhmdb_full &lt;- xmlToDataFrame(unzip(temp)) %&gt;% #extracting XML took a long time for a 4GB XML file!\n  janitor::clean_names()\nunlink(temp)\nunlink(\"hmdb_metabolites.xml\")\n\n#hmdb_full &lt;- xmlToDataFrame(here::here(\"tmp\", \"hmdb_metabolites.xml\")) %&gt;% janitor::clean_names()\n\n#for getting metabolite classes\nclass_finder &lt;- function(string) {\n  if(stringr::str_detect(string, \"class\\\\b\")) {\n    class_compounds &lt;- function(string) {\n      tmp &lt;- purrr::map(string, ~ unlist(str_split(., pattern = \"\\\\.\"))) %&gt;% \n        purrr::map(., ~keep(.x, ~ str_detect(.x, \"class\"))) %&gt;% flatten_chr(.)\n      class &lt;- stringr::str_split(tmp, \"class of organic compounds known as \")\n      return(stringr::str_to_sentence(class[[1]][[2]]))}\n    tryCatch(class_compounds(string), \n             error = function(x){\"Organic compound\"})\n  } else if(stringr::str_detect(string, \"classified\\\\b\")) {\n    classified_compounds &lt;- function(string) {\n      tmp &lt;- purrr::map(string, ~ unlist(str_split(., pattern = \"\\\\.\"))) %&gt;% \n        purrr::map(., ~keep(.x, ~ str_detect(.x, \"classified\"))) %&gt;% flatten_chr(.)\n      class &lt;- stringr::str_split(tmp, \"classified as a member of the \")\n      return(stringr::str_to_sentence(class[[1]][[2]]))}\n    tryCatch(classified_compounds(string), \n             error = function(x){\"Organic compound\"})\n  } else if(stringr::str_detect(string, \"steroid\")) {\n    return(\"Steroid hormone\")\n  } else if(stringr::str_detect(string, \"nucleoside\")) {\n    return(\"Nucleoside\")\n  } else if(stringr::str_detect(string, \"acid\")) {\n    return(\"Organic acid\")\n  } else {\n    return(\"Organic compound\")\n  }\n}\n\n#purrr::map(hmdb_class$description, ~ 
class_finder(.))\n\nhmdb_class &lt;- \n  hmdb_full %&gt;% \n  #slice(1:10) %&gt;% #for testing\n  dplyr::select(name, description) %&gt;% \n  dplyr::mutate(class = purrr::map(description, ~ class_finder(.))) %&gt;% \n  dplyr::select(-description)\n\nhmdb_names &lt;-\n  hmdb_full %&gt;%\n  #slice(1:10) %&gt;% #for testing\n  dplyr::select(name, synonyms, cid = pubchem_compound_id) %&gt;% \n  dplyr::left_join(hmdb_class, by = \"name\")\n\nhmdb_meta &lt;-\n  hmdb_full %&gt;%\n  dplyr::select(name, synonyms, cid = pubchem_compound_id, accession, description, chemical_formula, average_molecular_weight, wikipedia_id) %&gt;% \n  dplyr::left_join(hmdb_class, by = \"name\")\n\n\n\n\n\n\n\ntemp &lt;- tempfile()\ndownload.file(metabolite_proteins_url, temp)\nhmdb_proteins_raw &lt;- \n  XML::xmlToDataFrame(unzip(temp)) %&gt;% \n  janitor::clean_names()\nunlink(temp)\nunlink(\"hmdb_proteins.xml\")\n\ncensor_proteins &lt;- c(\"\")\ncensor_metabolites &lt;- c(\"\",\"Water\", \"Hydrogen Ion\")\n\nhmdb_proteins_long &lt;- \n  hmdb_proteins_raw %&gt;%\n  #dplyr::slice(1:20) %&gt;% #for testing\n  dplyr::mutate(metabolite_associations = stringr::str_replace_all(metabolite_associations, \"HMDB\", \"\\\\.HMDB\")) %&gt;% \n  tidyr::separate_rows(metabolite_associations, sep = \"\\\\.\") %&gt;% \n  dplyr::filter(metabolite_associations != \"\") %&gt;% \n  tidyr::separate(metabolite_associations, \n                  into = c(\"metabolite_accession\", \"metabolite_name\"), \n                  sep = 11) %&gt;% \n  dplyr::select(gene_name, metabolite_name, gene_accession = accession, metabolite_accession) %&gt;% \n  dplyr::filter(!gene_name %in% censor_proteins, \n                !metabolite_name %in% censor_metabolites, \n                stringr::str_detect(metabolite_accession, \"HMDB\")) \n\n# tmp &lt;- \"Water\"\n# tmp &lt;- \"Glucose-6-Phosphate\"\n# tmp &lt;- \"3-Carbamoyl-2-phenylpropionaldehyde\"\n# tmp &lt;- \"-1(11),7,9-trien-11-ol\"\n# metabolite_string &lt;- 
\"R-95913\"\n\n\n#collapse metabolite name if it's long\ncollapse_metabolites &lt;- function(metabolite_string) {\n  # #skip if missing\n  # if(stringr::str_length(metabolite_string) == 0 | is.na(metabolite_string)) {\n  #   return(metabolite_string)\n  # }\n  #skip if too short\n  if(stringr::str_length(metabolite_string) &lt; 7) {\n    return(metabolite_string)\n  }\n  #get first word\n  new_metabolite_string &lt;- \n    stringr::str_extract(metabolite_string, \"[[:alpha:]]\\\\w+\") #alpha omits numbers and punct, w gets word character, + gets one or more\n  #if new_string is NA b/c it has a bizarre name (like a drug name), code breaks; so return original string\n  if(is.na(new_metabolite_string)) {\n    return(metabolite_string)\n  }\n  #add plus to indicate that it got collapsed\n  if(metabolite_string != new_metabolite_string) {\n    new_metabolite_string &lt;- glue::glue(\"{new_metabolite_string} +\")}\n  return(new_metabolite_string)\n}\n# collapse_metabolites(tmp)\n\n# simplify_metabolites &lt;- function(df) {\n#   # if (nrow(df) &lt; 20) {\n#   #   return(df)\n#   # }\n#   new_df &lt;- \n#     df %&gt;% \n#     mutate(metabolite_name_simple = map_chr(.x = df[[2]], .f = collapse_metabolites))\n#   return(new_df)\n# }\n\nhmdb_proteins_full &lt;-\n  hmdb_proteins_long %&gt;% \n  dplyr::group_by(fav_gene = gene_name) %&gt;% \n  tidyr::nest() %&gt;% \n  dplyr::filter(!fav_gene %in% censor_proteins) %&gt;%\n  dplyr::mutate(original_num_rows = map_int(data, nrow)) \n\nhmdb_proteins &lt;-\n  hmdb_proteins_long %&gt;% \n  dplyr::mutate(metabolite_name_simple = map_chr(metabolite_name, collapse_metabolites)) %&gt;% \n  dplyr::distinct(gene_name, metabolite_name_simple, .keep_all = TRUE) %&gt;% \n  dplyr::group_by(fav_gene = gene_name) %&gt;% \n  tidyr::nest() %&gt;% \n  dplyr::filter(!fav_gene %in% censor_proteins) %&gt;% \n  dplyr::mutate(num_rows = map_int(data, nrow)) %&gt;% \n  dplyr::left_join(hmdb_proteins_full, by = \"fav_gene\", suffix = c(\"_collapsed\", 
\"_original\")) %&gt;% \n  dplyr::arrange(desc(num_rows))\n\n#colnames(hmdb_proteins) are fav_gene, data_collapsed, num_rows, data_original, original_num_rows\n\nhmdb_metabolites &lt;-\n  hmdb_proteins_long %&gt;% \n  #dplyr::mutate(metabolite_name = stringr::str_to_lower(metabolite_name)) %&gt;% \n  dplyr::group_by(fav_metabolite = metabolite_name) %&gt;% \n  tidyr::nest() %&gt;% \n  dplyr::filter(!fav_metabolite %in% censor_metabolites) %&gt;% \n  dplyr::mutate(num_rows = map_int(data, nrow)) %&gt;% \n  dplyr::arrange(desc(num_rows))\n\n#save files\n#saveRDS(hmdb_full, file = here::here(\"data\", paste0(release, \"_hmdb_full.Rds\")))\nsaveRDS(hmdb_names, file = here::here(\"data\", paste0(release, \"_hmdb_names.Rds\")))\nsaveRDS(hmdb_meta, file = here::here(\"data\", paste0(release, \"_hmdb_meta.Rds\")))\nsaveRDS(hmdb_proteins, file = here::here(\"data\", paste0(release, \"_hmdb_proteins.Rds\")))\nsaveRDS(hmdb_metabolites, file = here::here(\"data\", paste0(release, \"_hmdb_metabolites.Rds\")))",
        "type": {
          "id": "notes",
          "title": {
            "de": "Anmerkungen",
            "en": "Notes"
          }
        }
      }
    ],
    "creators": [
      {
        "person_or_org": {
          "family_name": "Hirschey",
          "given_name": "Matthew",
          "name": "Hirschey, Matthew",
          "type": "personal"
        }
      }
    ],
    "description": "<p>This is a cleaned and condensed version of the HMDB XML metabolites file from&nbsp;https://hmdb.ca/downloads, current version 5.0. The original zipped file (~1GB) extracts to a 6.49 GB XML file. The full file is then cleaned and then&nbsp;only small sub-files are exported.&nbsp;This&nbsp;is too big to&nbsp;extract from the zipped file&nbsp;on the server, and gets clipped (to about 4GB). As a result,&nbsp;the XML to DF function&nbsp;fails because of missing data.</p>\n\n<p>This repo, containing 4 files,&nbsp;is an alternative pointer for only the data needed.&nbsp;Code to process it from the hmdb_full file is pasted in the notes below.</p>\n\n<p>These 4 files are used on www.datadrivenhypothesis.org</p>",
    "publication_date": "2021-12-03",
    "publisher": "Zenodo",
    "resource_type": {
      "id": "dataset",
      "title": {
        "de": "Datensatz",
        "en": "Dataset"
      }
    },
    "rights": [
      {
        "description": {
          "en": "The Creative Commons Attribution license allows re-distribution and re-use of a licensed work on the condition that the creator is appropriately credited."
        },
        "icon": "cc-by-icon",
        "id": "cc-by-4.0",
        "props": {
          "scheme": "spdx",
          "url": "https://creativecommons.org/licenses/by/4.0/legalcode"
        },
        "title": {
          "en": "Creative Commons Attribution 4.0 International"
        }
      }
    ],
    "title": "HMDB Clean",
    "version": "21Q4"
  },
  "parent": {
    "access": {
      "owned_by": {
        "user": "119920"
      }
    },
    "communities": {},
    "id": "5758925",
    "pids": {
      "doi": {
        "client": "datacite",
        "identifier": "10.5281/zenodo.5758925",
        "provider": "datacite"
      }
    }
  },
  "pids": {
    "doi": {
      "client": "datacite",
      "identifier": "10.5281/zenodo.5758926",
      "provider": "datacite"
    },
    "oai": {
      "identifier": "oai:zenodo.org:5758926",
      "provider": "oai"
    }
  },
  "revision_id": 2,
  "stats": {
    "all_versions": {
      "data_volume": 4123766029.0,
      "downloads": 462,
      "unique_downloads": 418,
      "unique_views": 631,
      "views": 653
    },
    "this_version": {
      "data_volume": 4123766029.0,
      "downloads": 462,
      "unique_downloads": 418,
      "unique_views": 630,
      "views": 652
    }
  },
  "status": "published",
  "swh": {},
  "updated": "2021-12-05T13:48:44.757425+00:00",
  "versions": {
    "index": 1,
    "is_latest": true
  }
}