# re3data normalize


MIT License

Copyright (c) 2022 Rouven Schabinger

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


In [1]:
# load packages
library(purrr)
library(httr)
library(xml2)
library(tidyr)
library(stringr)


In [2]:
# Load the repository export. Text columns become factors (categorical
# variables) and the file is decoded using the "UTF8" encoding name.
sample_data2 <- read.csv(
  file = "repository_info_v1.csv",
  fileEncoding = "UTF8",
  stringsAsFactors = TRUE
)

In [3]:
# Inspect the raw free-text size column before parsing it
sample_data2[["size"]]

## Split size into numbers and units

In [4]:
# Parse the free-text `size` column into a two-column character matrix `m`:
#   m[, 1]  the numbers found in each entry, thousands dots removed
#   m[, 2]  the unit text that follows each number
# Entries holding several "<number> <unit>" pairs are joined with "|".
# Entries without any match yield empty strings.

# `size` is a factor (the CSV is read with stringsAsFactors = TRUE), so turn
# it into a plain character vector before matching.
unlisted <- as.character(sample_data2$size)

# One result matrix per entry, one row per match. Columns of each matrix:
#   1 = full match, 2 = number, 3 = last thousands group, 4 = unit.
# The unit class is [a-zA-Z ]; the former [a-zA-z ] also matched the
# punctuation characters that sit between "Z" and "a" ([ \ ] ^ _ `).
# NOTE(review): the "." in ".?" is unescaped, so any single character is
# accepted as a thousands separator, not only a dot. Left as-is because
# existing results depend on it -- confirm whether "\\.?" was intended.
lst <- str_match_all(unlisted, "(\\d{1,3}(.?\\d{3})*)\\s+([a-zA-Z ]+)*")

# Character matrix from the start, so no numeric placeholder can leak into
# the result.
m <- matrix("", nrow = length(lst), ncol = 2)

# Visit every entry, including the last one. The previous
# `while (i < length(lst))` loop stopped one element early and left the final
# row at its initial value. seq_along() is also safe when `lst` is empty.
for (i in seq_along(lst)) {
  x <- lst[[i]][, 2] # numbers of entry i
  y <- lst[[i]][, 4] # units of entry i

  m[i, 1] <- paste(str_replace_all(x, "\\.", ""), collapse = "|")
  m[i, 2] <- paste(y, collapse = "|")
}
m
m[, 1] # all the numbers
m[, 2] # all the units





0,1
,
2115339113,fiiles in total
108062|293165|26433,plots|plant concepts|community concepts
,
944380,records
,
,
77459|29417|1128339,publications for |chemical associations|post translational modifications
140000,images
281783,strains


In [5]:
# Attach the parsed results to the data frame:
# first matrix column -> numbers, second matrix column -> units.
sample_data2[["size_number"]] <- m[, 1]
sample_data2[["size_unit"]] <- m[, 2]

In [6]:
# Display the data frame to check the newly added size_number / size_unit columns
sample_data2

X,re3data.orgIdentifier,repositoryName,repositoryURL,repositoryIdentifier,description,type,size,updated,startDate,⋯,apiType,pidSystem,enhancedPublication,certificate,metadataStandardName,remarks,entryDate,lastUpdate,size_number,size_unit
<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,⋯,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<chr>,<chr>
1,r3d100010141,GAWSIS,https://gawsis.meteoswiss.ch/GAWSIS//index.html#/,,"GAWSIS is being developed and maintained by the Federal Office of Meteorology and Climatology MeteoSwiss in collaboration with the WMO GAW Secretariat, the GAW World Data Centres and other GAW representatives to improve the management of information about the GAW network of ground-based stations. The application is presently hosted by the Swiss Laboratories for Materials Testing and Research Empa. GAWSIS provides the GAW community and other interested people with an up-to-date, searchable data base of site descriptions, measurements programs and data available, contact people, bibliographic references. Linked data collections are hosted at the World Data Centers of the WMO Global Atmosphere Watch.",disciplinary|institutional,,,2000,⋯,REST,other,unknown,,ISO 19115,"Currently GAW coordinates activities and data from 29 Global stations, more than 400 Regional stations, and around 100 Contributing stations operated by Contributing networks. More than 80 countries actively host GAW stations",2013-08-29,2021-04-28,,
2,r3d100010148,BABS,https://www.babs-muenchen.de/,,"BABS include digital reproductions from the digitization of the Munich Digitisation CenterMunich Digitization Center/Digital Library of the Bavarian State Library including digital reproductions from copyright-free works from the BSB collections created by cooperation partners or service providers, such as digital copies from the The google-ProjectGoogle project; official publications of authorities, departments and agencies of the State of Bavaria according to the ""Bavarian State Promulgation 2 December 2008 (Az.: B II 2-480-30)"" on the delivery of official publications to libraries, the Promulgation Platform Bavaria (Verkündungsplattform), as well as voluntary deliveries of electronic publications of different (mainly Bavarian scientific) publishing houses and other publishers; scientifically relevant literature (open access publications and websites) of national and international origin in the Areas of Collection Emphasis of the BSB (history including classical studies, Eastern Europe, history of France and Italy, music, library science, book studies and information science) as well as Bavarica; electronic publications produced by the BSB specialist departments, especially those of the Center for Electronic Publishing (ZEP); local/regional/national licensed or purchased electronic publications",disciplinary|institutional,2.115.339.113 fiiles in total,2021-01-01,2007,⋯,,DOI|URN,unknown,,,Public-Private-Partnership of Bavarian State Library and Google,2013-09-05,2021-04-28,2115339113,fiiles in total
3,r3d100010153,VegBank,http://vegbank.org/vegbank/index.jsp,,"VegBank is the vegetation plot database of the Ecological Society of America's Panel on Vegetation Classification. VegBank consists of three linked databases that contain the actual plot records, vegetation types recognized in the U.S. National Vegetation Classification and other vegetation types submitted by users, and all plant taxa recognized by ITIS/USDA as well as all other plant taxa recorded in plot records. Vegetation records, community types and plant taxa may be submitted to VegBank and may be subsequently searched, viewed, annotated, revised, interpreted, downloaded, and cited. VegBank receives its data from the VegBank community of users.",disciplinary,108.062 plots; 293.165 plant concepts; 26.433 community concepts,2017-03-13,2003,⋯,other,none,unknown,,FGDC/CSDGM - Federal Geographic Data Committee Content Standard for Digital Geospatial Metadata,,2012-10-25,2021-07-30,108062|293165|26433,plots|plant concepts|community concepts
4,r3d100010201,LMU-ifo Economics & Business Data Center,http://www.cesifo-group.de/ifoHome/facts/EBDC.html,,"The Economics & Business Data Center (EBDC) is a combined platform for empirical research in business administration and economics of the Ludwig–Maximilian University of Munich (LMU) and the Ifo Institute and aims at opening new fields for empirical research in business administration and economics. In this regard, the EBDC provides innovative datasets of German companies, containing both survey data of the Ifo Institute as well as external balance sheet data. Therefore, the tasks of the EBDC also include the procurement and administration of data sources for research and teaching, the central provision, updating and documentation of external databases, as well as the acquisition of corresponding support tools. Beyond that, the EBDC serves as a contact and central coordinator on licensing economic firm-level datasets for LMU’s Munich School of Management and LMU’s Department of Economics and supports researchers and guests of the LMU and the Ifo Institute on site. In the future, it will also conduct academic conferences on research with company data.",disciplinary|institutional,,,2008,⋯,,DOI,unknown,RatSWD,,"LMU-ifo Economics & Business Data Center (EBDC) is covered by Thomson Reuters Data Citation Index. The Economics and Business Data Center is supported by the Exzellenzinitiative of the Federal Ministry of Education and Research (within the “LMUExcellent” program) and was founded in 2008. In spring 2011 it received accreditation as a research data centre of the Rat für Sozial- und Wirtschaftsdaten. // The CESifo Group, consisting of the Center for Economic Studies (CES), the Ifo Institute and the CESifo GmbH (Munich Society for the Promotion of Economic Research) is a research group unique in Europe in the area of economic research. 
It combines the theoretically oriented economic research of the university with the empirical work of a leading Economic research institute and places this combination in an international environment.",2013-02-11,2021-06-29,,
5,r3d100010209,CLARIN-ERIC,https://www.clarin.eu/,,"CLARIN is a European Research Infrastructure for the Humanities and Social Sciences, focusing on language resources (data and tools). It is being implemented and constantly improved at leading institutions in a large and growing number of European countries, aiming at improving Europe's multi-linguality competence. CLARIN provides several services, such as access to language data and tools to analyze data, and offers to deposit research data, as well as direct access to knowledge about relevant topics in relation to (research on and with) language resources. The main tool is the 'Virtual Language Observatory' providing metadata and access to the different national CLARIN centers and their data.",disciplinary,944.380 records,2016-01-29,2012,⋯,OAI-PMH|REST,hdl,unknown,,,"CLARIN-ERIC is a network memeber of ICSU World Data System. Letter of Agreement is pending (08.01.2019). In 2012 CLARIN gets ERIC status.CLARIN, the pan-European Common Language Resources and Technology Infrastructure, is the second European research infrastructure to be granted with ERIC status. Eight countries are committed to the setting up of CLARIN-ERIC: Austria, Bulgaria, the Czech Republic, Denmark, Estonia, Germany, Poland, and the Netherlands. The Dutch Language Union completes the list of founding members. CLARIN-ERIC will be hosted in Utrecht, the Netherlands. The “European Strategy Forum for Research Infrastructures” (ESFRI) has emphasised the importance of CLARIN by including the project in its Roadmap 2006. This also comprises four other important initiatives in the arts and humanities and social studies, including a similar proposal for a data infrastructure for the social sciences (CESSDA ERIC Major Upgrade) and for the arts and humanities (DARIAH). In the preparatory phase (2008-2010) CLARIN is funded by the EU through the 7th Framework ESFRI programme. 
One of the objectives of the preparatory phase is to come with cost estimations for the construction and exploitation phase. The main funders will then be the national governments, with a possible minor contribution from the EU for some generic costs of the infrastructure. Clarin members: https://www.clarin.eu/content/about-eric. // CLARIN offers two central resource discovery tools, (1) the metadata-based Virtual Language Observatory which lists hundreds of thousands of individual resources, not only at CLARIN centres, and (2) the Federated Content Search, which allows searching WITHIN resources at CLARIN centres.",2013-02-12,2019-01-08,944380,records
6,r3d100010221,EMBL-EBI,https://www.ebi.ac.uk/,RRID:SCR_004727|RRID:nlx_72386,"The European Bioinformatics Institute (EBI) has a long-standing mission to collect, organise and make available databases for biomolecular science. It makes available a collection of databases along with tools to search, download and analyse their content. These databases include DNA and protein sequences and structures, genome annotation, gene expression information, molecular interactions and pathways. Connected to these are linking and descriptive data resources such as protein motifs, ontologies and many others. In many of these efforts, the EBI is a European node in global data-sharing agreements involving, for example, the USA and Japan.",disciplinary|institutional,,,1994,⋯,FTP|REST|SOAP,none,unknown,,,"As part of the European Molecular Biology Laboratory EMBL, the largest part of our funding comes from the governments of EMBL's 20 member states. Other major funders include the European Commission, Wellcome Trust, US National Institutes of Health, UK Research Councils, our industry partners. In addition, the Wellcome Trust generously provides the facilities for the EMBL-EBI on its Genome Campus at Hinxton, and the UK Research Councils have also provided funds for our facilities in Hinxton.",2013-03-01,2019-02-01,,
7,r3d100010226,Alternative Fuels Data Center,https://afdc.energy.gov/,,"The Alternative Fuels Data Center (AFDC) is a comprehensive clearinghouse of information about advanced transportation technologies. The AFDC offers transportation decision makers unbiased information, data, and tools related to the deployment of alternative fuels and advanced vehicles. The AFDC launched in 1991 in response to the Alternative Motor Fuels Act of 1988 and the Clean Air Act Amendments of 1990. It originally served as a repository for alternative fuel performance data. The AFDC has since evolved to offer a broad array of information resources that support efforts to reduce petroleum use in transportation. The AFDC serves Clean Cities stakeholders, fleets regulated by the Energy Policy Act, businesses, policymakers, government agencies, and the general public.",disciplinary,,,1991-01-01,⋯,other,none,unknown,,,The AFDC is a resource of the U.S. Department of Energy's Clean Cities program.,2013-05-06,2021-04-30,,
8,r3d100010350,BioGRID,https://thebiogrid.org/,FAIRsharing_doi:10.25504/fairsharing.9d5f5r|MIR:00000058|OMICS_01901|RRID:SCR_007393|RRID:nif-0000-00432,The Biological General Repository for Interaction Datasets (BioGRID) is a public database that archives and disseminates genetic and protein interaction data from model organisms and humans. BioGRID is an online interaction repository with data compiled through comprehensive curation efforts. All interaction data are freely provided through our search index and available via download in a wide variety of standardized formats.,disciplinary,77.459 publications for 2.124752protein and genetic interactions; 29.417 chemical associations; 1.128.339 post translational modifications.,2021-07-26,2003,⋯,REST,none,yes,,,BioGRID is partner of the International Molecular Exchange Consortium (IMEx),2013-04-11,2021-07-26,77459|29417|1128339,publications for |chemical associations|post translational modifications
9,r3d100010363,Visual Arts Data Service,https://vads.ac.uk/,,"VADS is the online resource for visual arts. It has provided services to the academic community for 12 years and has built up a considerable portfolio of visual art collections comprising over 100,000 images that are freely available and copyright cleared for use in learning, teaching and research in the UK. VADS provides: expert guidance and help for digital projects in art education; resource development and hosting for art education; project management and consultancy for art education; leadership in the innovative use of ICT in education through its research and development activities. VADS offers advice and guidance to the visual arts research, teaching and learning communities on all aspects of digital resource management from funding, through delivery and use, to preservation.",disciplinary|institutional,over 140.000 images,2021-09-03,1997,⋯,,none,unknown,,,,2013-03-29,2021-09-03,140000,images
10,r3d100010393,International Mouse Strain Resource,http://www.findmice.org/,RRID:SCR_001526|RRID:nif-0000-09876,"The IMSR is a searchable online database of mouse strains, stocks, and mutant ES cell lines available worldwide, including inbred, mutant, and genetically engineered strains. The goal of the IMSR is to assist the international scientific community in locating and obtaining mouse resources for research. Note that the data content found in the IMSR is as supplied by strain repository holders. For each strain or cell line listed in the IMSR, users can obtain information about: Where that resource is available (Repository Site); What state(s) the resource is available as (e.g. live, cryopreserved embryo or germplasm, ES cells); Links to descriptive information about a strain or ES cell line; Links to mutant alleles carried by a strain or ES cell line; Links for ordering a strain or ES cell line from a Repository; Links for contacting the Repository to send a query",disciplinary|institutional,281.783 strains,2019-04-08,2008,⋯,,other,unknown,,,"The IMSR currently contains data from 21 different repositories, including The Jackson Laboratory, the Mutant Mouse Regional Resource Centers (MMRRCs), RIKEN BRC (RBRC), Taconic (TAC), European Mouse Mutant Archive (EMMA), Canadian Mouse Mutant Repository (CMMR), Texas A&M Institute for Genomic Medicine (TIGM) and the Knockout Mouse Project (KOMP) Repository. Computing repositories: http://www.findmice.org/repository",2013-04-26,2019-04-08,281783,strains


In [7]:
# List the current column names for reference
names(sample_data2)

# Desired column order: size_number and size_unit are placed directly
# after the raw size column; all other columns are listed explicitly.
col_order <- c(
  "X",
  "re3data.orgIdentifier",
  "repositoryName",
  "repositoryURL",
  "repositoryIdentifier",
  "description",
  "type",
  "size",
  "size_number",
  "size_unit",
  "updated",
  "startDate",
  "endDate",
  "subject",
  "contentType",
  "providerType",
  "keyword",
  "databaseAccessType",
  "dataUploadType",
  "softwareName",
  "api",
  "apiType",
  "pidSystem",
  "enhancedPublication",
  "certificate",
  "metadataStandardName",
  "remarks",
  "entryDate",
  "lastUpdate"
)

# Rearrange the columns and show the result
sample_data2 <- sample_data2[, col_order, drop = FALSE]
sample_data2

X,re3data.orgIdentifier,repositoryName,repositoryURL,repositoryIdentifier,description,type,size,size_number,size_unit,⋯,softwareName,api,apiType,pidSystem,enhancedPublication,certificate,metadataStandardName,remarks,entryDate,lastUpdate
<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<chr>,<chr>,⋯,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,r3d100010141,GAWSIS,https://gawsis.meteoswiss.ch/GAWSIS//index.html#/,,"GAWSIS is being developed and maintained by the Federal Office of Meteorology and Climatology MeteoSwiss in collaboration with the WMO GAW Secretariat, the GAW World Data Centres and other GAW representatives to improve the management of information about the GAW network of ground-based stations. The application is presently hosted by the Swiss Laboratories for Materials Testing and Research Empa. GAWSIS provides the GAW community and other interested people with an up-to-date, searchable data base of site descriptions, measurements programs and data available, contact people, bibliographic references. Linked data collections are hosted at the World Data Centers of the WMO Global Atmosphere Watch.",disciplinary|institutional,,,,⋯,unknown,https://github.com/wmo-im/docs/blob/master/XML%20station%20representation%20in%20OSCAR.ipynb,REST,other,unknown,,ISO 19115,"Currently GAW coordinates activities and data from 29 Global stations, more than 400 Regional stations, and around 100 Contributing stations operated by Contributing networks. More than 80 countries actively host GAW stations",2013-08-29,2021-04-28
2,r3d100010148,BABS,https://www.babs-muenchen.de/,,"BABS include digital reproductions from the digitization of the Munich Digitisation CenterMunich Digitization Center/Digital Library of the Bavarian State Library including digital reproductions from copyright-free works from the BSB collections created by cooperation partners or service providers, such as digital copies from the The google-ProjectGoogle project; official publications of authorities, departments and agencies of the State of Bavaria according to the ""Bavarian State Promulgation 2 December 2008 (Az.: B II 2-480-30)"" on the delivery of official publications to libraries, the Promulgation Platform Bavaria (Verkündungsplattform), as well as voluntary deliveries of electronic publications of different (mainly Bavarian scientific) publishing houses and other publishers; scientifically relevant literature (open access publications and websites) of national and international origin in the Areas of Collection Emphasis of the BSB (history including classical studies, Eastern Europe, history of France and Italy, music, library science, book studies and information science) as well as Bavarica; electronic publications produced by the BSB specialist departments, especially those of the Center for Electronic Publishing (ZEP); local/regional/national licensed or purchased electronic publications",disciplinary|institutional,2.115.339.113 fiiles in total,2115339113,fiiles in total,⋯,unknown,,,DOI|URN,unknown,,,Public-Private-Partnership of Bavarian State Library and Google,2013-09-05,2021-04-28
3,r3d100010153,VegBank,http://vegbank.org/vegbank/index.jsp,,"VegBank is the vegetation plot database of the Ecological Society of America's Panel on Vegetation Classification. VegBank consists of three linked databases that contain the actual plot records, vegetation types recognized in the U.S. National Vegetation Classification and other vegetation types submitted by users, and all plant taxa recognized by ITIS/USDA as well as all other plant taxa recorded in plot records. Vegetation records, community types and plant taxa may be submitted to VegBank and may be subsequently searched, viewed, annotated, revised, interpreted, downloaded, and cited. VegBank receives its data from the VegBank community of users.",disciplinary,108.062 plots; 293.165 plant concepts; 26.433 community concepts,108062|293165|26433,plots|plant concepts|community concepts,⋯,unknown,http://vegbank.org/vegdocs/vegbranch/vegbranch.html,other,none,unknown,,FGDC/CSDGM - Federal Geographic Data Committee Content Standard for Digital Geospatial Metadata,,2012-10-25,2021-07-30
4,r3d100010201,LMU-ifo Economics & Business Data Center,http://www.cesifo-group.de/ifoHome/facts/EBDC.html,,"The Economics & Business Data Center (EBDC) is a combined platform for empirical research in business administration and economics of the Ludwig–Maximilian University of Munich (LMU) and the Ifo Institute and aims at opening new fields for empirical research in business administration and economics. In this regard, the EBDC provides innovative datasets of German companies, containing both survey data of the Ifo Institute as well as external balance sheet data. Therefore, the tasks of the EBDC also include the procurement and administration of data sources for research and teaching, the central provision, updating and documentation of external databases, as well as the acquisition of corresponding support tools. Beyond that, the EBDC serves as a contact and central coordinator on licensing economic firm-level datasets for LMU’s Munich School of Management and LMU’s Department of Economics and supports researchers and guests of the LMU and the Ifo Institute on site. In the future, it will also conduct academic conferences on research with company data.",disciplinary|institutional,,,,⋯,unknown,,,DOI,unknown,RatSWD,,"LMU-ifo Economics & Business Data Center (EBDC) is covered by Thomson Reuters Data Citation Index. The Economics and Business Data Center is supported by the Exzellenzinitiative of the Federal Ministry of Education and Research (within the “LMUExcellent” program) and was founded in 2008. In spring 2011 it received accreditation as a research data centre of the Rat für Sozial- und Wirtschaftsdaten. // The CESifo Group, consisting of the Center for Economic Studies (CES), the Ifo Institute and the CESifo GmbH (Munich Society for the Promotion of Economic Research) is a research group unique in Europe in the area of economic research. 
It combines the theoretically oriented economic research of the university with the empirical work of a leading Economic research institute and places this combination in an international environment.",2013-02-11,2021-06-29
5,r3d100010209,CLARIN-ERIC,https://www.clarin.eu/,,"CLARIN is a European Research Infrastructure for the Humanities and Social Sciences, focusing on language resources (data and tools). It is being implemented and constantly improved at leading institutions in a large and growing number of European countries, aiming at improving Europe's multi-linguality competence. CLARIN provides several services, such as access to language data and tools to analyze data, and offers to deposit research data, as well as direct access to knowledge about relevant topics in relation to (research on and with) language resources. The main tool is the 'Virtual Language Observatory' providing metadata and access to the different national CLARIN centers and their data.",disciplinary,944.380 records,944380,records,⋯,unknown,https://www.clarin.eu/faq-page/275#t275n2858|https://www.clarin.eu/content/clarin-software-github,OAI-PMH|REST,hdl,unknown,,,"CLARIN-ERIC is a network memeber of ICSU World Data System. Letter of Agreement is pending (08.01.2019). In 2012 CLARIN gets ERIC status.CLARIN, the pan-European Common Language Resources and Technology Infrastructure, is the second European research infrastructure to be granted with ERIC status. Eight countries are committed to the setting up of CLARIN-ERIC: Austria, Bulgaria, the Czech Republic, Denmark, Estonia, Germany, Poland, and the Netherlands. The Dutch Language Union completes the list of founding members. CLARIN-ERIC will be hosted in Utrecht, the Netherlands. The “European Strategy Forum for Research Infrastructures” (ESFRI) has emphasised the importance of CLARIN by including the project in its Roadmap 2006. This also comprises four other important initiatives in the arts and humanities and social studies, including a similar proposal for a data infrastructure for the social sciences (CESSDA ERIC Major Upgrade) and for the arts and humanities (DARIAH). 
In the preparatory phase (2008-2010) CLARIN is funded by the EU through the 7th Framework ESFRI programme. One of the objectives of the preparatory phase is to come with cost estimations for the construction and exploitation phase. The main funders will then be the national governments, with a possible minor contribution from the EU for some generic costs of the infrastructure. Clarin members: https://www.clarin.eu/content/about-eric. // CLARIN offers two central resource discovery tools, (1) the metadata-based Virtual Language Observatory which lists hundreds of thousands of individual resources, not only at CLARIN centres, and (2) the Federated Content Search, which allows searching WITHIN resources at CLARIN centres.",2013-02-12,2019-01-08
6,r3d100010221,EMBL-EBI,https://www.ebi.ac.uk/,RRID:SCR_004727|RRID:nlx_72386,"The European Bioinformatics Institute (EBI) has a long-standing mission to collect, organise and make available databases for biomolecular science. It makes available a collection of databases along with tools to search, download and analyse their content. These databases include DNA and protein sequences and structures, genome annotation, gene expression information, molecular interactions and pathways. Connected to these are linking and descriptive data resources such as protein motifs, ontologies and many others. In many of these efforts, the EBI is a European node in global data-sharing agreements involving, for example, the USA and Japan.",disciplinary|institutional,,,,⋯,unknown,ftp://ftp.ebi.ac.uk/|https://www.ebi.ac.uk/ebisearch/overview.ebi/about|https://www.ebi.ac.uk/seqdb/confluence/display/JDSAT/EMBL-EBI+Web+Services+APIs+-+Data+Retrieval,FTP|REST|SOAP,none,unknown,,,"As part of the European Molecular Biology Laboratory EMBL, the largest part of our funding comes from the governments of EMBL's 20 member states. Other major funders include the European Commission, Wellcome Trust, US National Institutes of Health, UK Research Councils, our industry partners. In addition, the Wellcome Trust generously provides the facilities for the EMBL-EBI on its Genome Campus at Hinxton, and the UK Research Councils have also provided funds for our facilities in Hinxton.",2013-03-01,2019-02-01
7,r3d100010226,Alternative Fuels Data Center,https://afdc.energy.gov/,,"The Alternative Fuels Data Center (AFDC) is a comprehensive clearinghouse of information about advanced transportation technologies. The AFDC offers transportation decision makers unbiased information, data, and tools related to the deployment of alternative fuels and advanced vehicles. The AFDC launched in 1991 in response to the Alternative Motor Fuels Act of 1988 and the Clean Air Act Amendments of 1990. It originally served as a repository for alternative fuel performance data. The AFDC has since evolved to offer a broad array of information resources that support efforts to reduce petroleum use in transportation. The AFDC serves Clean Cities stakeholders, fleets regulated by the Energy Policy Act, businesses, policymakers, government agencies, and the general public.",disciplinary,,,,⋯,unknown,https://developer.nrel.gov/docs/transportation/transportation-incentives-laws-v1/,other,none,unknown,,,The AFDC is a resource of the U.S. Department of Energy's Clean Cities program.,2013-05-06,2021-04-30
8,r3d100010350,BioGRID,https://thebiogrid.org/,FAIRsharing_doi:10.25504/fairsharing.9d5f5r|MIR:00000058|OMICS_01901|RRID:SCR_007393|RRID:nif-0000-00432,The Biological General Repository for Interaction Datasets (BioGRID) is a public database that archives and disseminates genetic and protein interaction data from model organisms and humans. BioGRID is an online interaction repository with data compiled through comprehensive curation efforts. All interaction data are freely provided through our search index and available via download in a wide variety of standardized formats.,disciplinary,77.459 publications for 2.124752protein and genetic interactions; 29.417 chemical associations; 1.128.339 post translational modifications.,77459|29417|1128339,publications for |chemical associations|post translational modifications,⋯,,https://wiki.thebiogrid.org/doku.php/biogridrest,REST,none,yes,,,BioGRID is partner of the International Molecular Exchange Consortium (IMEx),2013-04-11,2021-07-26
9,r3d100010363,Visual Arts Data Service,https://vads.ac.uk/,,"VADS is the online resource for visual arts. It has provided services to the academic community for 12 years and has built up a considerable portfolio of visual art collections comprising over 100,000 images that are freely available and copyright cleared for use in learning, teaching and research in the UK. VADS provides: expert guidance and help for digital projects in art education; resource development and hosting for art education; project management and consultancy for art education; leadership in the innovative use of ICT in education through its research and development activities. VADS offers advice and guidance to the visual arts research, teaching and learning communities on all aspects of digital resource management from funding, through delivery and use, to preservation.",disciplinary|institutional,over 140.000 images,140000,images,⋯,unknown,,,none,unknown,,,,2013-03-29,2021-09-03
10,r3d100010393,International Mouse Strain Resource,http://www.findmice.org/,RRID:SCR_001526|RRID:nif-0000-09876,"The IMSR is a searchable online database of mouse strains, stocks, and mutant ES cell lines available worldwide, including inbred, mutant, and genetically engineered strains. The goal of the IMSR is to assist the international scientific community in locating and obtaining mouse resources for research. Note that the data content found in the IMSR is as supplied by strain repository holders. For each strain or cell line listed in the IMSR, users can obtain information about: Where that resource is available (Repository Site); What state(s) the resource is available as (e.g. live, cryopreserved embryo or germplasm, ES cells); Links to descriptive information about a strain or ES cell line; Links to mutant alleles carried by a strain or ES cell line; Links for ordering a strain or ES cell line from a Repository; Links for contacting the Repository to send a query",disciplinary|institutional,281.783 strains,281783,strains,⋯,unknown,,,other,unknown,,,"The IMSR currently contains data from 21 different repositories, including The Jackson Laboratory, the Mutant Mouse Regional Resource Centers (MMRRCs), RIKEN BRC (RBRC), Taconic (TAC), European Mouse Mutant Archive (EMMA), Canadian Mouse Mutant Repository (CMMR), Texas A&M Institute for Genomic Medicine (TIGM) and the Knockout Mouse Project (KOMP) Repository. Computing repositories: http://www.findmice.org/repository",2013-04-26,2019-04-08


## Normalize values

In [8]:
# softwareName: treat the placeholder "unknown" as "no value"
# frequency table before the cleanup
table(sample_data2$softwareName)

# swap "unknown" for the empty string on a character copy of the column,
# then store the result as a factor again so the old level disappears
sample_data2$softwareName <- factor(
  replace(
    as.character(sample_data2$softwareName),
    which(sample_data2$softwareName == "unknown"),
    ""
  )
)

# frequency table after the cleanup
table(sample_data2$softwareName)


                                     CKAN           CKAN|MySQL 
                 665                   87                    1 
          CKAN|other               DSpace            DataVerse 
                   3                  118                  152 
      DigitalCommons              EPrints               Fedora 
                   5                   34                   41 
Fedora|Nesstar|other         Fedora|other                MySQL 
                   1                    3                   85 
         MySQL|other              Nesstar        Nesstar|other 
                   2                   17                    1 
                Opus           Opus|other               dLibra 
                   1                    1                    2 
             eSciDoc                other              unknown 
                   5                  617                 1196 


                                     CKAN           CKAN|MySQL 
                1861                   87                    1 
          CKAN|other               DSpace            DataVerse 
                   3                  118                  152 
      DigitalCommons              EPrints               Fedora 
                   5                   34                   41 
Fedora|Nesstar|other         Fedora|other                MySQL 
                   1                    3                   85 
         MySQL|other              Nesstar        Nesstar|other 
                   2                   17                    1 
                Opus           Opus|other               dLibra 
                   1                    1                    2 
             eSciDoc                other 
                   5                  617 

In [9]:
# show the softwareName of every record whose pidSystem is "DOI|none"
with(sample_data2, softwareName[pidSystem == "DOI|none"])

In [10]:
# pidSystem: drop the placeholder "none"
# frequency table before the cleanup
table(sample_data2$pidSystem)

# recode on a character copy inside local() so no helper variable is left
# behind, then hand the column back as a factor
sample_data2$pidSystem <- local({
  pid <- as.character(sample_data2$pidSystem)
  # a bare "none" carries no information -> empty
  pid[pid %in% "none"] <- ""
  # "DOI|none" is contradictory (record 100010701) -> keep only the DOI part
  pid[pid %in% "DOI|none"] <- "DOI"
  factor(pid)
})

# frequency table after the cleanup
table(sample_data2$pidSystem)


                                      ARK              ARK|DOI 
                 450                   12                    9 
    ARK|DOI|PURL|URN ARK|DOI|PURL|URN|hdl      ARK|DOI|URN|hdl 
                   1                    1                    1 
         ARK|DOI|hdl        ARK|DOI|other              ARK|hdl 
                   2                    1                    1 
                 DOI             DOI|PURL         DOI|PURL|URN 
                 795                    7                    1 
    DOI|PURL|URN|hdl         DOI|PURL|hdl       DOI|PURL|other 
                   1                    2                    1 
             DOI|URN              DOI|hdl        DOI|hdl|other 
                  22                   93                    1 
            DOI|none            DOI|other                 PURL 
                   1                   22                   13 
            PURL|URN             PURL|hdl                  URN 
                   3                   


                                      ARK              ARK|DOI 
                1789                   12                    9 
    ARK|DOI|PURL|URN ARK|DOI|PURL|URN|hdl      ARK|DOI|URN|hdl 
                   1                    1                    1 
         ARK|DOI|hdl        ARK|DOI|other              ARK|hdl 
                   2                    1                    1 
                 DOI             DOI|PURL         DOI|PURL|URN 
                 796                    7                    1 
    DOI|PURL|URN|hdl         DOI|PURL|hdl       DOI|PURL|other 
                   1                    2                    1 
             DOI|URN              DOI|hdl        DOI|hdl|other 
                  22                   93                    1 
           DOI|other                 PURL             PURL|URN 
                  22                   13                    3 
            PURL|hdl                  URN              URN|hdl 
                   2                   

In [11]:
# enhancedPublication: treat the placeholder "unknown" as "no value"
# frequency table before the cleanup
table(sample_data2$enhancedPublication)

# blank out "unknown" on a character copy and rebuild the factor so that
# only the remaining values are kept as levels
sample_data2$enhancedPublication <- local({
  enhanced <- as.character(sample_data2$enhancedPublication)
  is_unknown <- !is.na(enhanced) & enhanced == "unknown"
  enhanced[is_unknown] <- ""
  factor(enhanced)
})

# frequency table after the cleanup
table(sample_data2$enhancedPublication)


     no unknown     yes 
    245    1814     978 


       no  yes 
1814  245  978 

In [12]:
# startDate / endDate were entered by hand; reduce them to the leading year
# show the raw values first
sample_data2[["startDate"]]
sample_data2[["endDate"]]

# cut everything from the first "-" onwards in both columns
# (the result is stored as character)
sample_data2[c("startDate", "endDate")] <- lapply(
  sample_data2[c("startDate", "endDate")],
  function(dates) gsub("-.*", "", dates)
)



In [13]:
# write the normalized table to disk as version 2 of the repository info
# (write.csv defaults apply, so row names are written as the first column)
write.csv(
  x = sample_data2,
  file = "repository_info_v2.csv",
  fileEncoding = "UTF8"
)