Dataset Open Access

bioRxiv 10k

Daniel Ecer


MARC21 XML Export

<?xml version='1.0' encoding='UTF-8'?>
<!-- MARC21 "slim" (MARCXML) export of Zenodo record 3873702, the "bioRxiv 10k" dataset. -->
<record xmlns="http://www.loc.gov/MARC21/slim">
  <!-- NOTE(review): the '#' characters presumably stand for blank leader positions; confirm with the exporter before schema validation. -->
  <leader>00000nmm##2200000uu#4500</leader>
  <!-- Control fields come directly after the leader and before every data field,
       which is the order the MARC21 slim record content model expects. -->
  <!-- 001: record identifier (matches the id in the file URLs and OAI identifier below). -->
  <controlfield tag="001">3873702</controlfield>
  <!-- 005: timestamp in yyyymmddhhmmss.f form. -->
  <controlfield tag="005">20200724005924.0</controlfield>
  <!-- 999 C5: free-text bibliographic reference attached to the record. -->
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="x">Constantin, A., Pettifer, S., Voronkov, A.: Fully-automated PDF-to-XML conversion of scientific literature. In: Proceedings of the ACM Symposium on Document Engineering, pp. 177–180. ACM, New York (2013). doi: 10.1145/2494266.2494271</subfield>
  </datafield>
  <!-- 041: language code. -->
  <datafield tag="041" ind1=" " ind2=" ">
    <subfield code="a">eng</subfield>
  </datafield>
  <!-- 653 (repeated): one keyword per field. -->
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">bioRxiv</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">PDF</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">XML</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">JATS</subfield>
  </datafield>
  <!-- 856 (repeated): one downloadable file per field.
       s = numeric value (presumably the file size in bytes; TODO confirm),
       z = MD5 checksum, u = download URL. -->
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">5738592606</subfield>
    <subfield code="z">md5:942cb97541b82440e74409e19e17d94d</subfield>
    <subfield code="u">https://zenodo.org/record/3873702/files/biorxiv-10k-test-2000.zip</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">17233894299</subfield>
    <subfield code="z">md5:e4fefd52a2d480951d8514360395c34a</subfield>
    <subfield code="u">https://zenodo.org/record/3873702/files/biorxiv-10k-train-6000.zip</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">5811669057</subfield>
    <subfield code="z">md5:83dbf0d9ee7da1617afc319a38fc07a7</subfield>
    <subfield code="u">https://zenodo.org/record/3873702/files/biorxiv-10k-validation-2000.zip</subfield>
  </datafield>
  <!-- 542: access status. -->
  <datafield tag="542" ind1=" " ind2=" ">
    <subfield code="l">open</subfield>
  </datafield>
  <!-- 260: date in ISO 8601 form (presumably the publication date; TODO confirm). -->
  <datafield tag="260" ind1=" " ind2=" ">
    <subfield code="c">2020-07-23</subfield>
  </datafield>
  <!-- 909 CO: OAI-PMH set name (p) and OAI identifier (o). -->
  <datafield tag="909" ind1="C" ind2="O">
    <subfield code="p">openaire_data</subfield>
    <subfield code="o">oai:zenodo.org:3873702</subfield>
  </datafield>
  <!-- 100: creator name (a) with ORCID identifier (0). -->
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="0">(orcid)0000-0003-0320-4300</subfield>
    <subfield code="a">Daniel Ecer</subfield>
  </datafield>
  <!-- 245: title. -->
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">bioRxiv 10k</subfield>
  </datafield>
  <!-- 540: licence name (a) and licence URL (u). -->
  <datafield tag="540" ind1=" " ind2=" ">
    <subfield code="u">https://creativecommons.org/licenses/by/4.0/legalcode</subfield>
    <subfield code="a">Creative Commons Attribution 4.0 International</subfield>
  </datafield>
  <!-- 650: licence identifier (a) with its source vocabulary (2). -->
  <datafield tag="650" ind1="1" ind2="7">
    <subfield code="a">cc-by</subfield>
    <subfield code="2">opendefinition.org</subfield>
  </datafield>
  <!-- 520: description, stored as entity-escaped HTML; whitespace inside the subfield is data. -->
  <datafield tag="520" ind1=" " ind2=" ">
    <subfield code="a">&lt;p&gt;This dataset is a CC-BY 4.0 subset of what bioRxiv kindly made available: &lt;a href="https://www.biorxiv.org/tdm"&gt;https://www.biorxiv.org/tdm&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;It is randomized and split into train (6,000), validation (2,000) and test (2,000) subsets - 10,000 PDF / XML pairs in total.&lt;/p&gt;

&lt;p&gt;The zip files further contain file lists of smaller subsets that used the subject area to potentially create a balanced subset.&lt;/p&gt;

&lt;p&gt;The zip is similar in structure to the &amp;quot;PMC sample 1943&amp;quot; dataset that was created as part of: &lt;a href="https://doi.org/10.1145/2494266.2494271"&gt;https://doi.org/10.1145/2494266.2494271&lt;/a&gt; (a working link is available from: &lt;a href="https://grobid.readthedocs.io/en/stable/End-to-end-evaluation/"&gt;https://grobid.readthedocs.io/en/stable/End-to-end-evaluation/&lt;/a&gt;).&lt;/p&gt;

&lt;p&gt;Therefore it is well suited for evaluation of &lt;a href="https://github.com/elifesciences/sciencebeam/wiki/Related-Projects"&gt;PDF to XML conversion tools&lt;/a&gt;, such as &lt;a href="https://github.com/kermitt2/grobid"&gt;GROBID&lt;/a&gt;. The dataset was created as part of &lt;a href="https://elifesciences.org/"&gt;eLife&lt;/a&gt;&amp;#39;s &lt;a href="https://github.com/elifesciences/sciencebeam"&gt;ScienceBeam&lt;/a&gt; project.&lt;/p&gt;</subfield>
  </datafield>
  <!-- 773 (repeated): related identifiers.
       n = identifier scheme, i = relation type, a = identifier value. -->
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="n">url</subfield>
    <subfield code="i">isDerivedFrom</subfield>
    <subfield code="a">http://biorxiv.org/tdm</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="n">doi</subfield>
    <subfield code="i">references</subfield>
    <subfield code="a">10.1145/2494266.2494271</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="n">doi</subfield>
    <subfield code="i">isVersionOf</subfield>
    <subfield code="a">10.5281/zenodo.3873701</subfield>
  </datafield>
  <!-- 024: the record's own DOI (a) and its scheme (2). -->
  <datafield tag="024" ind1=" " ind2=" ">
    <subfield code="a">10.5281/zenodo.3873702</subfield>
    <subfield code="2">doi</subfield>
  </datafield>
  <!-- 980: resource type. -->
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">dataset</subfield>
  </datafield>
</record>
131 views
19,738 downloads
All versions This version
Views 131 131
Downloads 19,738 19,738
Data volume 262.6 TB 262.6 TB
Unique views 118 118
Unique downloads 1,590 1,590

Share

Cite as