# re3data extract


MIT License

Copyright (c) 2022 re3data

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Code was partially adapted from https://github.com/re3data/using_the_re3data_API (credit: Dorothea Strecker and Yi Wang).





In [1]:
# load packages
library(purrr)
library(httr)
library(xml2)
library(tidyr)

In [2]:
# GET request for the list of all repositories
re3data_request <- GET("https://www.re3data.org/api/beta/repositories")
# Fail here, with the HTTP status in the message, rather than later when an
# error body is handed to the XML parser.
stop_for_status(re3data_request)

In [3]:
# retrieve URLs for individual repositories
# (every `href` attribute found in the repository list response)
URLs <- local({
  repository_index <- read_xml(re3data_request)
  href_nodes <- xml_find_all(repository_index, xpath = "//@href")
  xml_text(href_nodes)
})

In [4]:
#' Extract the desired attributes of one repository record
#'
#' @param url Either the parsed XML document of a single repository record
#'   (as returned by `read_xml()`), or a single character string holding the
#'   URL of that record, in which case the record is downloaded and parsed
#'   here. The parameter name is kept so existing positional calls still work.
#' @return A named list with 26 elements. Every element is a length-one
#'   character string: all distinct matches of the field's XPath joined with
#'   "|", or "" when the XPath matches nothing.
extract_repository_info <- function(url) {
  # Use the argument that was passed in instead of relying on a variable that
  # happens to exist in the calling environment.
  if (is.character(url) && length(url) == 1L) {
    repository_xml <- read_xml(GET(url))
  } else {
    repository_xml <- url
  }

  # Always yield exactly one string per field so that each record becomes
  # exactly one row, even when an element is absent or repeated.
  collapse_values <- function(xpath) {
    matches <- xml_text(xml_find_all(repository_xml, xpath))
    paste(unique(matches), collapse = "|")
  }

  list(
    re3data.orgIdentifier = collapse_values("//r3d:re3data.orgIdentifier"),
    repositoryName = collapse_values("//r3d:repositoryName"),
    repositoryURL = collapse_values("//r3d:repositoryURL"),
    repositoryIdentifier = collapse_values("//r3d:repositoryIdentifier"),
    description = collapse_values("//r3d:description"),
    type = collapse_values("//r3d:type"),
    size = collapse_values("//r3d:size"),
    # attribute lookup: every `updated` attribute anywhere in the record
    updated = collapse_values("//@updated"),
    startDate = collapse_values("//r3d:startDate"),
    endDate = collapse_values("//r3d:endDate"),
    subject = collapse_values("//r3d:subject"),
    contentType = collapse_values("//r3d:contentType"),
    providerType = collapse_values("//r3d:providerType"),
    keyword = collapse_values("//r3d:keyword"),
    databaseAccessType = collapse_values("//r3d:databaseAccessType"),
    dataUploadType = collapse_values("//r3d:dataUploadType"),
    softwareName = collapse_values("//r3d:softwareName"),
    api = collapse_values("//r3d:api"),
    # attribute lookup: every `apiType` attribute anywhere in the record
    apiType = collapse_values("//@apiType"),
    pidSystem = collapse_values("//r3d:pidSystem"),
    enhancedPublication = collapse_values("//r3d:enhancedPublication"),
    certificate = collapse_values("//r3d:certificate"),
    metadataStandardName = collapse_values("//r3d:metadataStandardName"),
    remarks = collapse_values("//r3d:remarks"),
    entryDate = collapse_values("//r3d:entryDate"),
    lastUpdate = collapse_values("//r3d:lastUpdate")
  )
}

In [5]:
# create empty container: a zero-row data frame with one column per
# extracted attribute (26 in total)
sample_data1 <- setNames(
  data.frame(matrix(ncol = 26, nrow = 0)),
  c(
    "re3data.orgIdentifier", "repositoryName", "repositoryURL",
    "repositoryIdentifier", "description", "type", "size", "updated",
    "startDate", "endDate", "subject", "contentType", "providerType",
    "keyword", "databaseAccessType", "dataUploadType", "softwareName",
    "api", "apiType", "pidSystem", "enhancedPublication", "certificate",
    "metadataStandardName", "remarks", "entryDate", "lastUpdate"
  )
)

In [6]:
# execute attribute extraction and save in container
# Rows are collected in a preallocated list and bound to the container once
# at the end; calling rbind() on the growing data frame in every iteration
# copies all previous rows each time.
repository_rows <- vector("list", length(URLs))
for (i in seq_along(URLs)) {
  url <- URLs[[i]]
  repository_metadata_request <- GET(url)
  # Skip records whose request failed instead of parsing an error body as
  # repository metadata; the warning names the URL so it can be retried.
  if (http_error(repository_metadata_request)) {
    warning(
      "Skipping ", url, ": HTTP status ",
      status_code(repository_metadata_request),
      call. = FALSE
    )
    next
  }
  # kept under its original top-level name
  repository_metadata_XML <- read_xml(repository_metadata_request)
  repository_rows[[i]] <- extract_repository_info(repository_metadata_XML)
}
# Entries for skipped URLs are still NULL; drop them before binding.
repository_rows <- Filter(Negate(is.null), repository_rows)
sample_data1 <- do.call(rbind, c(list(sample_data1), repository_rows))

In [7]:
# preview the first six harvested records
head(sample_data1, n = 6L)

Unnamed: 0_level_0,re3data.orgIdentifier,repositoryName,repositoryURL,repositoryIdentifier,description,type,size,updated,startDate,endDate,⋯,softwareName,api,apiType,pidSystem,enhancedPublication,certificate,metadataStandardName,remarks,entryDate,lastUpdate
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,r3d100010141,GAWSIS,https://gawsis.meteoswiss.ch/GAWSIS//index.html#/,,"GAWSIS is being developed and maintained by the Federal Office of Meteorology and Climatology MeteoSwiss in collaboration with the WMO GAW Secretariat, the GAW World Data Centres and other GAW representatives to improve the management of information about the GAW network of ground-based stations. The application is presently hosted by the Swiss Laboratories for Materials Testing and Research Empa. GAWSIS provides the GAW community and other interested people with an up-to-date, searchable data base of site descriptions, measurements programs and data available, contact people, bibliographic references. Linked data collections are hosted at the World Data Centers of the WMO Global Atmosphere Watch.",disciplinary|institutional,,,2000,,⋯,unknown,https://github.com/wmo-im/docs/blob/master/XML%20station%20representation%20in%20OSCAR.ipynb,REST,other,unknown,,ISO 19115,"Currently GAW coordinates activities and data from 29 Global stations, more than 400 Regional stations, and around 100 Contributing stations operated by Contributing networks. More than 80 countries actively host GAW stations",2013-08-29,2021-04-28
2,r3d100010153,VegBank,http://vegbank.org/vegbank/index.jsp,,"VegBank is the vegetation plot database of the Ecological Society of America's Panel on Vegetation Classification. VegBank consists of three linked databases that contain the actual plot records, vegetation types recognized in the U.S. National Vegetation Classification and other vegetation types submitted by users, and all plant taxa recognized by ITIS/USDA as well as all other plant taxa recorded in plot records. Vegetation records, community types and plant taxa may be submitted to VegBank and may be subsequently searched, viewed, annotated, revised, interpreted, downloaded, and cited. VegBank receives its data from the VegBank community of users.",disciplinary,108.062 plots; 293.165 plant concepts; 26.433 community concepts,2017-03-13,2003,,⋯,unknown,http://vegbank.org/vegdocs/vegbranch/vegbranch.html,other,none,unknown,,FGDC/CSDGM - Federal Geographic Data Committee Content Standard for Digital Geospatial Metadata,,2012-10-25,2021-07-30
3,r3d100010201,LMU-ifo Economics & Business Data Center,http://www.cesifo-group.de/ifoHome/facts/EBDC.html,,"The Economics & Business Data Center (EBDC) is a combined platform for empirical research in business administration and economics of the Ludwig–Maximilian University of Munich (LMU) and the Ifo Institute and aims at opening new fields for empirical research in business administration and economics. In this regard, the EBDC provides innovative datasets of German companies, containing both survey data of the Ifo Institute as well as external balance sheet data. Therefore, the tasks of the EBDC also include the procurement and administration of data sources for research and teaching, the central provision, updating and documentation of external databases, as well as the acquisition of corresponding support tools. Beyond that, the EBDC serves as a contact and central coordinator on licensing economic firm-level datasets for LMU’s Munich School of Management and LMU’s Department of Economics and supports researchers and guests of the LMU and the Ifo Institute on site. In the future, it will also conduct academic conferences on research with company data.",disciplinary|institutional,,,2008,,⋯,unknown,,,DOI,unknown,RatSWD,,"LMU-ifo Economics & Business Data Center (EBDC) is covered by Thomson Reuters Data Citation Index. The Economics and Business Data Center is supported by the Exzellenzinitiative of the Federal Ministry of Education and Research (within the “LMUExcellent” program) and was founded in 2008. In spring 2011 it received accreditation as a research data centre of the Rat für Sozial- und Wirtschaftsdaten. // The CESifo Group, consisting of the Center for Economic Studies (CES), the Ifo Institute and the CESifo GmbH (Munich Society for the Promotion of Economic Research) is a research group unique in Europe in the area of economic research. 
It combines the theoretically oriented economic research of the university with the empirical work of a leading Economic research institute and places this combination in an international environment.",2013-02-11,2021-06-29
4,r3d100010209,CLARIN-ERIC,https://www.clarin.eu/,,"CLARIN is a European Research Infrastructure for the Humanities and Social Sciences, focusing on language resources (data and tools). It is being implemented and constantly improved at leading institutions in a large and growing number of European countries, aiming at improving Europe's multi-linguality competence. CLARIN provides several services, such as access to language data and tools to analyze data, and offers to deposit research data, as well as direct access to knowledge about relevant topics in relation to (research on and with) language resources. The main tool is the 'Virtual Language Observatory' providing metadata and access to the different national CLARIN centers and their data.",disciplinary,944.380 records,2016-01-29,2012,,⋯,unknown,https://www.clarin.eu/faq-page/275#t275n2858|https://www.clarin.eu/content/clarin-software-github,OAI-PMH|REST,hdl,unknown,,,"CLARIN-ERIC is a network memeber of ICSU World Data System. Letter of Agreement is pending (08.01.2019). In 2012 CLARIN gets ERIC status.CLARIN, the pan-European Common Language Resources and Technology Infrastructure, is the second European research infrastructure to be granted with ERIC status. Eight countries are committed to the setting up of CLARIN-ERIC: Austria, Bulgaria, the Czech Republic, Denmark, Estonia, Germany, Poland, and the Netherlands. The Dutch Language Union completes the list of founding members. CLARIN-ERIC will be hosted in Utrecht, the Netherlands. The “European Strategy Forum for Research Infrastructures” (ESFRI) has emphasised the importance of CLARIN by including the project in its Roadmap 2006. This also comprises four other important initiatives in the arts and humanities and social studies, including a similar proposal for a data infrastructure for the social sciences (CESSDA ERIC Major Upgrade) and for the arts and humanities (DARIAH). 
In the preparatory phase (2008-2010) CLARIN is funded by the EU through the 7th Framework ESFRI programme. One of the objectives of the preparatory phase is to come with cost estimations for the construction and exploitation phase. The main funders will then be the national governments, with a possible minor contribution from the EU for some generic costs of the infrastructure. Clarin members: https://www.clarin.eu/content/about-eric. // CLARIN offers two central resource discovery tools, (1) the metadata-based Virtual Language Observatory which lists hundreds of thousands of individual resources, not only at CLARIN centres, and (2) the Federated Content Search, which allows searching WITHIN resources at CLARIN centres.",2013-02-12,2019-01-08
5,r3d100010221,EMBL-EBI,https://www.ebi.ac.uk/,RRID:SCR_004727|RRID:nlx_72386,"The European Bioinformatics Institute (EBI) has a long-standing mission to collect, organise and make available databases for biomolecular science. It makes available a collection of databases along with tools to search, download and analyse their content. These databases include DNA and protein sequences and structures, genome annotation, gene expression information, molecular interactions and pathways. Connected to these are linking and descriptive data resources such as protein motifs, ontologies and many others. In many of these efforts, the EBI is a European node in global data-sharing agreements involving, for example, the USA and Japan.",disciplinary|institutional,,,1994,,⋯,unknown,ftp://ftp.ebi.ac.uk/|https://www.ebi.ac.uk/ebisearch/overview.ebi/about|https://www.ebi.ac.uk/seqdb/confluence/display/JDSAT/EMBL-EBI+Web+Services+APIs+-+Data+Retrieval,FTP|REST|SOAP,none,unknown,,,"As part of the European Molecular Biology Laboratory EMBL, the largest part of our funding comes from the governments of EMBL's 20 member states. Other major funders include the European Commission, Wellcome Trust, US National Institutes of Health, UK Research Councils, our industry partners. In addition, the Wellcome Trust generously provides the facilities for the EMBL-EBI on its Genome Campus at Hinxton, and the UK Research Councils have also provided funds for our facilities in Hinxton.",2013-03-01,2019-02-01
6,r3d100010226,Alternative Fuels Data Center,https://afdc.energy.gov/,,"The Alternative Fuels Data Center (AFDC) is a comprehensive clearinghouse of information about advanced transportation technologies. The AFDC offers transportation decision makers unbiased information, data, and tools related to the deployment of alternative fuels and advanced vehicles. The AFDC launched in 1991 in response to the Alternative Motor Fuels Act of 1988 and the Clean Air Act Amendments of 1990. It originally served as a repository for alternative fuel performance data. The AFDC has since evolved to offer a broad array of information resources that support efforts to reduce petroleum use in transportation. The AFDC serves Clean Cities stakeholders, fleets regulated by the Energy Policy Act, businesses, policymakers, government agencies, and the general public.",disciplinary,,,1991-01-01,,⋯,unknown,https://developer.nrel.gov/docs/transportation/transportation-incentives-laws-v1/,other,none,unknown,,,The AFDC is a resource of the U.S. Department of Energy's Clean Cities program.,2013-05-06,2021-04-30


In [8]:
# export the harvested repository table as a CSV file in the working directory
write.csv(
  x = sample_data1,
  file = "repository_info_v1.csv",
  fileEncoding = "UTF8"
)