{ "access": { "embargo": { "active": false, "reason": null }, "files": "public", "record": "public", "status": "open" }, "created": "2021-07-27T22:02:53.969201+00:00", "custom_fields": {}, "deletion_status": { "is_deleted": false, "status": "P" }, "files": { "count": 4, "enabled": true, "entries": { "annotations.json": { "checksum": "md5:1064dca5664f99f7a859f0834b534397", "ext": "json", "id": "1912bc64-5faf-46fe-92c7-999dc485d385", "key": "annotations.json", "metadata": null, "mimetype": "application/json", "size": 226670666 }, "documents.json": { "checksum": "md5:78f4eeff780614d7dcbf080e914922fa", "ext": "json", "id": "ac83c5fc-6d99-4b28-a4be-ac8197633d8f", "key": "documents.json", "metadata": null, "mimetype": "application/json", "size": 477461239 }, "readme.md": { "checksum": "md5:83934947d0627b751416c17a6b3cd5a8", "ext": "md", "id": "12c17d1a-ff85-4da8-907c-9e8ef7cccf83", "key": "readme.md", "metadata": null, "mimetype": "application/octet-stream", "size": 4135 }, "references.json": { "checksum": "md5:08946fc3ca5b12ebb614a0715b3f30ad", "ext": "json", "id": "0875291d-12fa-42de-b9d8-23a63c05dbf4", "key": "references.json", "metadata": null, "mimetype": "application/json", "size": 47163372 } }, "order": [], "total_bytes": 751299412 }, "id": "5140437", "is_draft": false, "is_published": true, "links": { "access": "https://zenodo.org/api/records/5140437/access", "access_links": "https://zenodo.org/api/records/5140437/access/links", "access_request": "https://zenodo.org/api/records/5140437/access/request", "access_users": "https://zenodo.org/api/records/5140437/access/users", "archive": "https://zenodo.org/api/records/5140437/files-archive", "archive_media": "https://zenodo.org/api/records/5140437/media-files-archive", "communities": "https://zenodo.org/api/records/5140437/communities", "communities-suggestions": "https://zenodo.org/api/records/5140437/communities-suggestions", "doi": "https://doi.org/10.5281/zenodo.5140437", "draft": 
"https://zenodo.org/api/records/5140437/draft", "files": "https://zenodo.org/api/records/5140437/files", "latest": "https://zenodo.org/api/records/5140437/versions/latest", "latest_html": "https://zenodo.org/records/5140437/latest", "media_files": "https://zenodo.org/api/records/5140437/media-files", "parent": "https://zenodo.org/api/records/4784733", "parent_doi": "https://zenodo.org/doi/10.5281/zenodo.4784733", "parent_html": "https://zenodo.org/records/4784733", "requests": "https://zenodo.org/api/records/5140437/requests", "reserve_doi": "https://zenodo.org/api/records/5140437/draft/pids/doi", "self": "https://zenodo.org/api/records/5140437", "self_doi": "https://zenodo.org/doi/10.5281/zenodo.5140437", "self_html": "https://zenodo.org/records/5140437", "self_iiif_manifest": "https://zenodo.org/api/iiif/record:5140437/manifest", "self_iiif_sequence": "https://zenodo.org/api/iiif/record:5140437/sequence/default", "versions": "https://zenodo.org/api/records/5140437/versions" }, "media_files": { "count": 0, "enabled": false, "entries": {}, "order": [], "total_bytes": 0 }, "metadata": { "creators": [ { "affiliations": [ { "name": "science-miner" } ], "person_or_org": { "family_name": "Patrice Lopez", "identifiers": [ { "identifier": "0000-0002-9959-9441", "scheme": "orcid" } ], "name": "Patrice Lopez", "type": "personal" } }, { "affiliations": [ { "name": "University of Texas at Austin" } ], "person_or_org": { "family_name": "Caifan Du", "identifiers": [ { "identifier": "0000-0003-2538-607X", "scheme": "orcid" } ], "name": "Caifan Du", "type": "personal" } }, { "affiliations": [ { "name": "University of Texas at Austin" } ], "person_or_org": { "family_name": "Hannah Cohoon", "identifiers": [ { "identifier": "0000-0002-3352-9766", "scheme": "orcid" } ], "name": "Hannah Cohoon", "type": "personal" } }, { "affiliations": [ { "name": "University of Texas at Austin" } ], "person_or_org": { "family_name": "James Howison", "identifiers": [ { "identifier": 
"0000-0002-5702-149X", "scheme": "orcid" } ], "name": "James Howison", "type": "personal" } } ], "description": "
Softcite software mention extraction from the CORD-19 publications
\n\nThis dataset is the first result of the extraction of software mentions from the set of publications of the CORD-19 corpus (https://allenai.org/data/cord-19) by the Softcite software recognizer, see https://github.com/ourresearch/software-mentions.
\n\nThe CORD-19 version used for this dataset is the one dated 2021-03-22, using the metadata.csv file only. We re-harvested the PDF with https://github.com/kermitt2/article-dataset-builder in order to also extract coordinates of software mentions in the PDF and to take advantage of the latest version of GROBID to produce better full text extraction from PDF.
\n\nLast update: 2021-07-27, version 0.2.1, with some additional cleaning of the annotations.
\n\nA third version of this dataset is planned for August 2021.
\n\nData format
\n\nThe extraction consists of 3 JSON files:
\n\nannotations.json contains the individual software annotations including software name and possible attached attributes (publisher, URL and version). Each annotation is associated with coordinates expressed as bounding boxes in the original PDF. See Coordinates of structures in the original PDF for more details on the coordinate format.
\n\nThe context of citation is the sentence where the software name and its attributes are extracted. It is added to the JSON structure (field context), as well as the identifier of the document where the annotation belongs (field document, pointing to entries available in documents.json) and a list of bibliographical references attached to the software name (field references, pointing to entries available in references.json, with the used reference marker string). See https://github.com/ourresearch/software-mentions for more details on the extracted attributes.
\n\nIf the software name was successfully disambiguated against Wikidata (\"entity linking\"), it appears in the field wikidataId as a Wikidata entity identifier and in the field wikipediaExternalRef as a Wikipedia PageID from the English Wikipedia. Entity linking is realized with entity-fishing.
\n\ndocuments.json contains the metadata of all the CORD-19 documents containing at least one software annotation. The metadata are given as a CrossRef JSON structure. The abstract should be included in the metadata most of the time, as well as some complements extracted by GROBID directly from the PDF. In addition, the size of the pages and the unique file path to the PDF can be found to allow annotations directly on the PDF (see Coordinates of structures in the original PDF for more details on the PDF annotation display mechanism).
\n\nreferences.json contains the parsed reference entries associated with software mentions. These references are given in the field tei encoded in the XML TEI format of GROBID extraction. The extracted raw references have been matched against CrossRef to get a DOI and more complete metadata with biblio-glutton.
\n\nStatistics
\n\nCORD-19 version: 2021-03-22
\n\n- total Open Access full texts: 211,213
\n - with at least one software mention: 76,448
- total software name annotations: 295,609
\n - with linked Wikidata ID: 117,193
- associated field
\n - publisher: 61,804
\n - version: 104,199
\n - URL: 27,916
- associated bibliographical references: 49,184
\n - references with matched DOI: 15,931
\n - references with matched PMID: 10,611
\n - references with matched PMC ID: 6,421
License and acknowledgements
\n\nThis dataset is licensed under a Creative Commons Attribution 4.0 International License.
\n\nWe thank the Alfred P. Sloan Foundation for supporting this work.
", "publication_date": "2021-07-27", "publisher": "Zenodo", "resource_type": { "id": "dataset", "title": { "de": "Datensatz", "en": "Dataset" } }, "rights": [ { "description": { "en": "The Creative Commons Attribution license allows re-distribution and re-use of a licensed work on the condition that the creator is appropriately credited." }, "icon": "cc-by-icon", "id": "cc-by-4.0", "props": { "scheme": "spdx", "url": "https://creativecommons.org/licenses/by/4.0/legalcode" }, "title": { "en": "Creative Commons Attribution 4.0 International" } } ], "subjects": [ { "subject": "text mining, software, scholar literature, CORD-19" } ], "title": "Softcite software mention extraction from the CORD-19 publications", "version": "0.2.1" }, "parent": { "access": { "owned_by": { "user": 95856 } }, "communities": {}, "id": "4784733", "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.4784733", "provider": "datacite" } } }, "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.5140437", "provider": "datacite" }, "oai": { "identifier": "oai:zenodo.org:5140437", "provider": "oai" } }, "revision_id": 5, "stats": { "all_versions": { "data_volume": 32399449994.0, "downloads": 145, "unique_downloads": 89, "unique_views": 768, "views": 810 }, "this_version": { "data_volume": 8552033111.0, "downloads": 46, "unique_downloads": 26, "unique_views": 173, "views": 178 } }, "status": "published", "updated": "2021-08-23T14:19:36.509518+00:00", "versions": { "index": 3, "is_latest": false } }