Dataset Open Access
{ "files": [ { "links": { "self": "https://zenodo.org/api/files/efa917d9-ab49-4256-a452-78ee459401fd/unarXive-2020.tar.bz2" }, "checksum": "md5:c66da9551fc3e3b7374d55726003552f", "bucket": "efa917d9-ab49-4256-a452-78ee459401fd", "key": "unarXive-2020.tar.bz2", "type": "bz2", "size": 19120394657 } ], "owners": [ 57099 ], "doi": "10.5281/zenodo.4313164", "stats": { "version_unique_downloads": 3086.0, "unique_views": 700.0, "views": 800.0, "version_views": 3549.0, "unique_downloads": 278.0, "version_unique_views": 2842.0, "volume": 8604177595650.0, "version_downloads": 24220.0, "downloads": 450.0, "version_volume": 494782966207942.0 }, "links": { "doi": "https://doi.org/10.5281/zenodo.4313164", "conceptdoi": "https://doi.org/10.5281/zenodo.2553522", "bucket": "https://zenodo.org/api/files/efa917d9-ab49-4256-a452-78ee459401fd", "conceptbadge": "https://zenodo.org/badge/doi/10.5281/zenodo.2553522.svg", "html": "https://zenodo.org/record/4313164", "latest_html": "https://zenodo.org/record/4313164", "badge": "https://zenodo.org/badge/doi/10.5281/zenodo.4313164.svg", "latest": "https://zenodo.org/api/records/4313164" }, "conceptdoi": "10.5281/zenodo.2553522", "created": "2020-12-09T16:42:31.655483+00:00", "updated": "2021-11-10T15:57:23.393220+00:00", "conceptrecid": "2553522", "revision": 6, "id": 4313164, "metadata": { "access_right_category": "success", "doi": "10.5281/zenodo.4313164", "description": "<p>In recent years, scholarly data sets have been used for various purposes, such as paper recommendation, citation recommendation, citation context analysis, and citation context-based document summarization. The evaluation of approaches to such tasks and their applicability in real-world scenarios heavily depend on the used data set. However, existing scholarly data sets are limited in several regards.</p>\n\n<p>Here, we propose a new <strong>data set based on all publications from all scientific disciplines available on arXiv.org</strong>. 
Apart from providing the <strong>papers' plain text</strong>, <strong>in-text citations were annotated</strong> via global identifiers. Furthermore, citing and cited publications were linked to the <strong>Microsoft Academic Graph</strong>, providing access to rich metadata. Our data set consists of <strong>over one million documents and 29.2 million citation contexts</strong>. The data set, which is made freely available for research purposes, not only can enhance the future evaluation of research paper-based and citation context-based approaches but also serve as a basis for new ways to analyze in-text citations.</p>\n\n<p>This <strong>updated version</strong> (v3) of our data set is based on all arXiv publications until 2020-07-31 and on the Microsoft Academic Graph as of 2020-08-18. As additional contribution, we included a table with the publication date and the scientific discipline for each paper for easier filtering.</p>\n\n<p>See <a href=\"https://github.com/IllDepence/unarXive\">https://github.com/IllDepence/unarXive</a> for the <strong>source code</strong> which has been used for creating the data set.</p>\n\n<p><strong>Usage examples</strong> for our data set are provided at <a href=\"https://github.com/IllDepence/unarXive#usage-examples\">https://github.com/IllDepence/unarXive#usage-examples</a>.</p>\n\n<p>For <strong>citing</strong> our data set and for further information we can refer to our journal article</p>\n\n<p><em>Tarek Saier, Michael Färber: \"<a href=\"https://www.aifb.kit.edu/images/f/f9/UnarXive_Scientometrics2020.pdf\">unarXive: A Large Scholarly Data Set with Publications’ Full-Text, Annotated In-Text Citations, and Links to Metadata</a>\", Scientometrics, 2020, <a href=\"http://dx.doi.org/10.1007/s11192-020-03382-z\">http://dx.doi.org/10.1007/s11192-020-03382-z</a>.</em></p>\n\n<p> </p>", "license": { "id": "other-at" }, "title": "unarXive: A Large Scholarly Data Set with Publications' Full-Text, Annotated In-Text Citations, and Links to 
Metadata", "relations": { "version": [ { "count": 4, "index": 3, "parent": { "pid_type": "recid", "pid_value": "2553522" }, "is_last": true, "last_child": { "pid_type": "recid", "pid_value": "4313164" } } ] }, "communities": [ { "id": "bibliometrics" }, { "id": "natural-language-processing" }, { "id": "scholarly-data" } ], "keywords": [ "scholarly data", "citations", "papers", "arXiv.org", "digital libraries", "dataset", "scientometrics", "full-text" ], "publication_date": "2020-12-09", "creators": [ { "orcid": "0000-0001-5028-0109", "affiliation": "University of Freiburg", "name": "Saier, Tarek" }, { "orcid": "0000-0001-5458-8645", "affiliation": "University of Freiburg", "name": "F\u00e4rber, Michael" } ], "access_right": "open", "resource_type": { "type": "dataset", "title": "Dataset" }, "related_identifiers": [ { "scheme": "url", "identifier": "https://link.springer.com/article/10.1007%2Fs11192-020-03382-z", "relation": "isDocumentedBy", "resource_type": "publication-article" }, { "scheme": "doi", "identifier": "10.5281/zenodo.2553522", "relation": "isVersionOf" } ] } }
| | All versions | This version |
---|---|---|
Views | 3,549 | 800 |
Downloads | 24,220 | 450 |
Data volume | 494.8 TB | 8.6 TB |
Unique views | 2,842 | 700 |
Unique downloads | 3,086 | 278 |