Conference paper Open Access

I'll take that to go: Big data bags and minimal identifiers for exchange of large, complex datasets

Chard, Kyle; D'Arcy, Mike; Heavner, Ben; Foster, Ian; Kesselman, Carl; Madduri, Ravi; Rodriguez, Alexis; Soiland-Reyes, Stian; Goble, Carole; Clark, Kristi; Deutsch, Eric W.; Dinov, Ivo; Price, Nathan; Toga, Arthur


MARC21 XML Export

<?xml version='1.0' encoding='UTF-8'?>
<record xmlns="http://www.loc.gov/MARC21/slim">
  <leader>00000nam##2200000uu#4500</leader>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">Big Data</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">data analysis</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">BDBags</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">Big Data analysis</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">Big Data bags</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">Big Data sharing</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">Minid</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">data assembling</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">data collections</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">data descriptions</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">datasets</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">identifiers</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">research objects</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">Encoding</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">Metadata</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">Payloads</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">Robustness</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">Software</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">Uniform resource locators</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">bdbag</subfield>
  </datafield>
  <controlfield tag="005">20190410041503.0</controlfield>
  <controlfield tag="001">820878</controlfield>
  <datafield tag="711" ind1=" " ind2=" ">
    <subfield code="d">2016-12-05 / 2016-12-08</subfield>
    <subfield code="g">Big Data</subfield>
    <subfield code="p">BigD491</subfield>
    <subfield code="a">2016 IEEE International Conference on Big Data</subfield>
    <subfield code="c">Washington, DC, USA</subfield>
    <subfield code="n">8</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">University of Southern California, Los Angeles, CA, USA</subfield>
    <subfield code="a">D'Arcy, Mike</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">Institute for Systems Biology, Seattle, WA, USA</subfield>
    <subfield code="a">Heavner, Ben</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">The University of Chicago and Argonne National Laboratory, Chicago IL, USA</subfield>
    <subfield code="a">Foster, Ian</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">University of Southern California, Los Angeles, CA, USA</subfield>
    <subfield code="a">Kesselman, Carl</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">The University of Chicago and Argonne National Laboratory, Chicago IL, USA</subfield>
    <subfield code="a">Madduri, Ravi</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">The University of Chicago and Argonne National Laboratory, Chicago IL, USA</subfield>
    <subfield code="a">Rodriguez, Alexis</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">The University of Manchester, Manchester, UK</subfield>
    <subfield code="a">Soiland-Reyes, Stian</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">The University of Manchester, Manchester, UK</subfield>
    <subfield code="a">Goble, Carole</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">University of Southern California, Los Angeles, CA, USA</subfield>
    <subfield code="a">Clark, Kristi</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">Institute for Systems Biology, Seattle, WA, USA</subfield>
    <subfield code="a">Deutsch, Eric W.</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">The University of Michigan, Ann Arbor, MI, USA</subfield>
    <subfield code="a">Dinov, Ivo</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">Institute for Systems Biology, Seattle, WA, USA</subfield>
    <subfield code="a">Price, Nathan</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">University of Southern California, Los Angeles, CA, USA</subfield>
    <subfield code="a">Toga, Arthur</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">The University of Chicago</subfield>
    <subfield code="4">oth</subfield>
    <subfield code="a">Jung, Segun</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">713184</subfield>
    <subfield code="z">md5:91195ab648922564b86d629e83ea88d8</subfield>
    <subfield code="u">https://zenodo.org/record/820878/files/bagminid.pdf</subfield>
  </datafield>
  <datafield tag="542" ind1=" " ind2=" ">
    <subfield code="l">open</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="y">Conference website</subfield>
    <subfield code="u">http://cci.drexel.edu/bigdata/bigdata2016/</subfield>
  </datafield>
  <datafield tag="260" ind1=" " ind2=" ">
    <subfield code="c">2016-12-05</subfield>
  </datafield>
  <datafield tag="909" ind1="C" ind2="O">
    <subfield code="p">openaire</subfield>
    <subfield code="p">user-bioexcel</subfield>
    <subfield code="p">user-linkeddata</subfield>
    <subfield code="o">oai:zenodo.org:820878</subfield>
  </datafield>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="u">The University of Chicago and Argonne National Laboratory, Chicago IL, USA</subfield>
    <subfield code="a">Chard, Kyle</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">I'll take that to go: Big data bags and minimal identifiers for exchange of large, complex datasets</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">user-bioexcel</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">user-linkeddata</subfield>
  </datafield>
  <datafield tag="536" ind1=" " ind2=" ">
    <subfield code="c">675728</subfield>
    <subfield code="a">Centre of Excellence for Biomolecular Research</subfield>
  </datafield>
  <datafield tag="540" ind1=" " ind2=" ">
    <subfield code="u">http://creativecommons.org/licenses/by/4.0/legalcode</subfield>
    <subfield code="a">Creative Commons Attribution 4.0 International</subfield>
  </datafield>
  <datafield tag="650" ind1="1" ind2="7">
    <subfield code="a">cc-by</subfield>
    <subfield code="2">opendefinition.org</subfield>
  </datafield>
  <datafield tag="520" ind1=" " ind2=" ">
    <subfield code="a">&lt;p&gt;&lt;em&gt;Big data workflows&lt;/em&gt; often require the assembly and exchange of complex, multi-element datasets. For example, in biomedical applications, the input to an analytic pipeline can be a dataset consisting of thousands of images and genome sequences assembled from diverse repositories, requiring a description of the contents of the dataset in a concise and unambiguous form. Typical approaches to creating datasets for big data workflows assume that all data reside in a single location, requiring costly data marshaling and permitting errors of omission and commission because dataset members are not explicitly specified.&lt;/p&gt;

&lt;p&gt;We address these issues by proposing simple methods and tools for assembling, sharing, and analyzing large and complex datasets that scientists can easily integrate into their daily workflows. These tools combine a simple and robust method for describing data collections (&lt;strong&gt;BDBags&lt;/strong&gt;), data descriptions (&lt;strong&gt;Research Objects&lt;/strong&gt;), and simple persistent identifiers (&lt;strong&gt;Minids&lt;/strong&gt;) to create a powerful ecosystem of tools and services for big data analysis and sharing.&lt;/p&gt;

&lt;p&gt;We present these tools and use biomedical case studies to illustrate their use for the rapid assembly, sharing, and analysis of large datasets.&lt;/p&gt;</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="n">url</subfield>
    <subfield code="i">isIdenticalTo</subfield>
    <subfield code="a">https://static.aminer.org/pdf/fa/bigdata2016/BigD418.pdf</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="n">url</subfield>
    <subfield code="i">isIdenticalTo</subfield>
    <subfield code="a">https://www.research.manchester.ac.uk/portal/files/45989205/bagminid.pdf</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="n">url</subfield>
    <subfield code="i">isSupplementedBy</subfield>
    <subfield code="a">http://bd2k.ini.usc.edu/tools/</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="n">url</subfield>
    <subfield code="i">isSupplementedBy</subfield>
    <subfield code="a">https://github.com/ini-bdds/bdbag</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="n">url</subfield>
    <subfield code="i">isPartOf</subfield>
    <subfield code="a">https://www.research.manchester.ac.uk/portal/en/publications/ill-take-that-to-go(8335e672-1d85-4649-a245-56fbdb1bd423).html</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="n">url</subfield>
    <subfield code="i">cites</subfield>
    <subfield code="a">https://w3id.org/ro/bagit</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="g">319-328</subfield>
    <subfield code="b">IEEE</subfield>
    <subfield code="z">978-1-4673-9005-7</subfield>
    <subfield code="t">2016 IEEE International Conference on Big Data (Big Data)</subfield>
  </datafield>
  <datafield tag="024" ind1=" " ind2=" ">
    <subfield code="a">10.1109/BigData.2016.7840618</subfield>
    <subfield code="2">doi</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">publication</subfield>
    <subfield code="b">conferencepaper</subfield>
  </datafield>
</record>
309 views
148 downloads
Views 309
Downloads 148
Data volume 105.6 MB
Unique views 295
Unique downloads 135

Share

Cite as