{ "access": { "embargo": { "active": false, "reason": null }, "files": "public", "record": "public", "status": "open" }, "created": "2020-10-14T17:12:32.536698+00:00", "custom_fields": {}, "deletion_status": { "is_deleted": false, "status": "P" }, "files": { "count": 1, "enabled": true, "entries": { "corpus-17.webmsd.webm": { "checksum": "md5:6e1886d5ea2f2f330aea37faeb07f1a9", "ext": "webm", "id": "3fead1ac-8885-47ec-a3b6-fad380bdeddc", "key": "corpus-17.webmsd.webm", "metadata": null, "mimetype": "video/webm", "size": 30813626 } }, "order": [], "total_bytes": 30813626 }, "id": "4088669", "is_draft": false, "is_published": true, "links": { "access": "https://zenodo.org/api/records/4088669/access", "access_links": "https://zenodo.org/api/records/4088669/access/links", "access_request": "https://zenodo.org/api/records/4088669/access/request", "access_users": "https://zenodo.org/api/records/4088669/access/users", "archive": "https://zenodo.org/api/records/4088669/files-archive", "archive_media": "https://zenodo.org/api/records/4088669/media-files-archive", "communities": "https://zenodo.org/api/records/4088669/communities", "communities-suggestions": "https://zenodo.org/api/records/4088669/communities-suggestions", "doi": "https://doi.org/10.5281/zenodo.4088669", "draft": "https://zenodo.org/api/records/4088669/draft", "files": "https://zenodo.org/api/records/4088669/files", "latest": "https://zenodo.org/api/records/4088669/versions/latest", "latest_html": "https://zenodo.org/records/4088669/latest", "media_files": "https://zenodo.org/api/records/4088669/media-files", "parent": "https://zenodo.org/api/records/4088668", "parent_doi": "https://zenodo.org/doi/10.5281/zenodo.4088668", "parent_html": "https://zenodo.org/records/4088668", "requests": "https://zenodo.org/api/records/4088669/requests", "reserve_doi": "https://zenodo.org/api/records/4088669/draft/pids/doi", "self": "https://zenodo.org/api/records/4088669", "self_doi": 
"https://zenodo.org/doi/10.5281/zenodo.4088669", "self_html": "https://zenodo.org/records/4088669", "self_iiif_manifest": "https://zenodo.org/api/iiif/record:4088669/manifest", "self_iiif_sequence": "https://zenodo.org/api/iiif/record:4088669/sequence/default", "versions": "https://zenodo.org/api/records/4088669/versions" }, "media_files": { "count": 0, "enabled": false, "entries": {}, "order": [], "total_bytes": 0 }, "metadata": { "creators": [ { "affiliations": [ { "name": "Universit\u00e9s de Neuch\u00e2tel et de Gen\u00e8ve, Neuch\u00e2tel and Gen\u00e8ve, Switzerland" } ], "person_or_org": { "family_name": "Simon Gabay", "name": "Simon Gabay", "type": "personal" } }, { "affiliations": [ { "name": "\u00c9cole des Chartes, Paris, France" } ], "person_or_org": { "family_name": "Alexandre Bartz", "name": "Alexandre Bartz", "type": "personal" } }, { "affiliations": [ { "name": "Universit\u00e9 de Rennes, Rennes, France" } ], "person_or_org": { "family_name": "Yohann Deguin", "name": "Yohann Deguin", "type": "personal" } } ], "description": "
We investigate the creation of a 17th c. French literary corpus. We present the main options regarding available standards, the training data we created and the efficiency of the models produced for OCR, spelling normalization, and lemmatization – always with open-source solutions. We also present our encoding choices and the global logic of a corpus designed as a virtuous circle, enhancing automatically the tools that are used for its construction.
", "languages": [ { "id": "fra", "title": { "en": "French" } } ], "publication_date": "2020-10-14", "publisher": "Zenodo", "related_identifiers": [ { "identifier": "10.1145/3423603.3424002", "relation_type": { "id": "continues", "title": { "de": "Setzt fort", "en": "Continues" } }, "resource_type": { "id": "publication-conferencepaper", "title": { "de": "Konferenzbeitrag", "en": "Conference paper" } }, "scheme": "doi" } ], "resource_type": { "id": "video", "title": { "de": "Video/Audio", "en": "Video/Audio" } }, "rights": [ { "description": { "en": "The Creative Commons Attribution license allows re-distribution and re-use of a licensed work on the condition that the creator is appropriately credited." }, "icon": "cc-by-icon", "id": "cc-by-4.0", "props": { "scheme": "spdx", "url": "https://creativecommons.org/licenses/by/4.0/legalcode" }, "title": { "en": "Creative Commons Attribution 4.0 International" } } ], "subjects": [ { "subject": "17th c. French, OCR, normalisation, lemmatisation, POS-tagging, named entities, digital humanities, XML-TEI" } ], "title": "CORPUS17: a philological French corpus for 17th century" }, "parent": { "access": { "owned_by": { "user": 124694 } }, "communities": {}, "id": "4088668", "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.4088668", "provider": "datacite" } } }, "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.4088669", "provider": "datacite" }, "oai": { "identifier": "oai:zenodo.org:4088669", "provider": "oai" } }, "revision_id": 1, "stats": { "all_versions": { "data_volume": 27115990880.0, "downloads": 880, "unique_downloads": 738, "unique_views": 80, "views": 85 }, "this_version": { "data_volume": 27085177254.0, "downloads": 879, "unique_downloads": 737, "unique_views": 65, "views": 70 } }, "status": "published", "updated": "2020-10-14T17:12:33.517087+00:00", "versions": { "index": 1, "is_latest": true } }