{ "access": { "embargo": { "active": false, "reason": null }, "files": "public", "record": "public", "status": "open" }, "created": "2020-12-22T07:18:23.526895+00:00", "custom_fields": {}, "deletion_status": { "is_deleted": false, "status": "P" }, "files": { "count": 30, "enabled": true, "entries": { "HTR results 1.04.02 Oost-Indische Testamenten PAGE.zip": { "checksum": "md5:4e8e59402d4d07e95219e9714e49eac7", "ext": "zip", "id": "1adcb4d6-36f2-4cf3-a3c0-1aa25ab14cd3", "key": "HTR results 1.04.02 Oost-Indische Testamenten PAGE.zip", "metadata": null, "mimetype": "application/zip", "size": 726849272 }, "HTR results 1.04.02 Oost-Indische Testamenten TXT.zip": { "checksum": "md5:75f4cd6e6680ede52e80592e6502c90d", "ext": "zip", "id": "4abce71c-fc1f-43ca-b480-8ebb57b58c75", "key": "HTR results 1.04.02 Oost-Indische Testamenten TXT.zip", "metadata": null, "mimetype": "application/zip", "size": 42319747 }, "HTR results 1.05.01.01 PAGE.zip": { "checksum": "md5:646d1b9dc02990542ccf02d5c2dd76a3", "ext": "zip", "id": "f57c7261-e419-4dba-9c14-a0b99c81528f", "key": "HTR results 1.05.01.01 PAGE.zip", "metadata": null, "mimetype": "application/zip", "size": 887887415 }, "HTR results 1.05.01.01 TXT.zip": { "checksum": "md5:7fd6ffdcdfdba8202295ea21e77aef9f", "ext": "zip", "id": "9a588f8a-4dbd-460a-a269-9274ebef2331", "key": "HTR results 1.05.01.01 TXT.zip", "metadata": null, "mimetype": "application/zip", "size": 38190034 }, "HTR results 1.05.01.02 PAGE.zip": { "checksum": "md5:e03ed8dd91a29fe58ebbe71d57887de2", "ext": "zip", "id": "493f1fd9-41f3-4fad-a48f-cf98b4725820", "key": "HTR results 1.05.01.02 PAGE.zip", "metadata": null, "mimetype": "application/zip", "size": 9647473086 }, "HTR results 1.05.01.02 TXT.zip": { "checksum": "md5:f483930bcf6aa4e7664a567dc4e48bfb", "ext": "zip", "id": "33608000-377a-46cd-b48d-319cd008aed1", "key": "HTR results 1.05.01.02 TXT.zip", "metadata": null, "mimetype": "application/zip", "size": 369189176 }, "HTR results 1.05.02 PAGE.zip": { "checksum": 
"md5:aeb777b6dace9c7d2ebcb0bb30c3c12d", "ext": "zip", "id": "11326958-da21-4761-b313-04a1773ead22", "key": "HTR results 1.05.02 PAGE.zip", "metadata": null, "mimetype": "application/zip", "size": 955655738 }, "HTR results 1.05.02 TXT.zip": { "checksum": "md5:51300d9572875b6bfdf2bd871ba1a0c3", "ext": "zip", "id": "2c04734a-eb48-43ee-8280-bbe3937a22fd", "key": "HTR results 1.05.02 TXT.zip", "metadata": null, "mimetype": "application/zip", "size": 36457822 }, "HTR results 1.05.03 PAGE.zip": { "checksum": "md5:d782fb43c1eb54f6bb2a25ea39766563", "ext": "zip", "id": "76696d03-6d65-4a94-a0ae-92ba04e15ea9", "key": "HTR results 1.05.03 PAGE.zip", "metadata": null, "mimetype": "application/zip", "size": 4807576410 }, "HTR results 1.05.03 TXT.zip": { "checksum": "md5:565ffe019bd30a75972b23188fdad8ac", "ext": "zip", "id": "92741c9d-2807-4694-a4ad-5ad8ec0b2530", "key": "HTR results 1.05.03 TXT.zip", "metadata": null, "mimetype": "application/zip", "size": 179978654 }, "HTR results 1.05.05 PAGE.zip": { "checksum": "md5:4cd2f7bd53301ef7e5570f3fda467b09", "ext": "zip", "id": "fe9b5cb4-3c39-46e1-8ccf-d3c7785f0386", "key": "HTR results 1.05.05 PAGE.zip", "metadata": null, "mimetype": "application/zip", "size": 2873641574 }, "HTR results 1.05.05 TXT.zip": { "checksum": "md5:3fd4e387e03a5c51159af0b8a1d1b982", "ext": "zip", "id": "6b953a05-6c05-4720-9d0d-0352ada3110b", "key": "HTR results 1.05.05 TXT.zip", "metadata": null, "mimetype": "application/zip", "size": 109535641 }, "HTR results 1.05.06 PAGE.zip": { "checksum": "md5:512736ba7965b34782b832bd5ebad48f", "ext": "zip", "id": "fbd04734-62a7-4abe-8f15-826315e86404", "key": "HTR results 1.05.06 PAGE.zip", "metadata": null, "mimetype": "application/zip", "size": 432027441 }, "HTR results 1.05.06 TXT.zip": { "checksum": "md5:afbbf32b9e7e0118d24b5cae4260087e", "ext": "zip", "id": "985eb85b-fe0b-4f73-a5b2-be877ef35d94", "key": "HTR results 1.05.06 TXT.zip", "metadata": null, "mimetype": "application/zip", "size": 18431229 }, "HTR results 
1.05.21 PAGE.zip": { "checksum": "md5:478aaf4738570c01e3b81103c083d8c4", "ext": "zip", "id": "e00035b6-72f7-4608-966d-57c0a44c4dc3", "key": "HTR results 1.05.21 PAGE.zip", "metadata": null, "mimetype": "application/zip", "size": 2715261506 }, "HTR results 1.05.21 TXT.zip": { "checksum": "md5:d201d030163070c75cbf3d994eb8edff", "ext": "zip", "id": "0ffbc6f3-bf7e-4c89-8040-2c001e97e1b1", "key": "HTR results 1.05.21 TXT.zip", "metadata": null, "mimetype": "application/zip", "size": 111485619 }, "HTR results 2.01.28.01 PAGE.zip": { "checksum": "md5:634475ad53ea76350e17699f5e7a8c11", "ext": "zip", "id": "2f60f22a-1083-43e1-acf7-62dc33c0d43c", "key": "HTR results 2.01.28.01 PAGE.zip", "metadata": null, "mimetype": "application/zip", "size": 1374211740 }, "HTR results 2.01.28.01 TXT.zip": { "checksum": "md5:f12d5c2f22743563da543262d6441d56", "ext": "zip", "id": "b9159789-9e49-42be-8103-59c1d2091d1e", "key": "HTR results 2.01.28.01 TXT.zip", "metadata": null, "mimetype": "application/zip", "size": 53623451 }, "HTR results 2.01.28.02 PAGE.zip": { "checksum": "md5:84a01edcc9f2d87a9ac63bcf8673bc7e", "ext": "zip", "id": "9b96bebc-3fa0-42e3-ba84-fb13f9be5db4", "key": "HTR results 2.01.28.02 PAGE.zip", "metadata": null, "mimetype": "application/zip", "size": 1513562886 }, "HTR results 2.01.28.02 TXT.zip": { "checksum": "md5:0ca623f664b2955772d2ea11109a9ba3", "ext": "zip", "id": "7cc0d7c3-7fbf-43f2-beaa-9e37d5bc5b62", "key": "HTR results 2.01.28.02 TXT.zip", "metadata": null, "mimetype": "application/zip", "size": 60487403 }, "HTR results NHA Notarial 1617 PAGE.zip": { "checksum": "md5:46cef4b6dd3002fdd1a060edc2d013bc", "ext": "zip", "id": "6b4e8836-0dbc-433b-a4dd-31be2b8cdc89", "key": "HTR results NHA Notarial 1617 PAGE.zip", "metadata": null, "mimetype": "application/zip", "size": 3280149083 }, "HTR results NHA Notarial 1617 TXT.zip": { "checksum": "md5:d8ee4b47dd640d6cf9b264b800c914ca", "ext": "zip", "id": "6e720678-4a72-4941-acf4-e1e2266ca2be", "key": "HTR results NHA Notarial 
1617 TXT.zip", "metadata": null, "mimetype": "application/zip", "size": 133387379 }, "HTR results NHA Notarial 1972 PAGE.zip": { "checksum": "md5:98abdaafec31000590390d66d457be2f", "ext": "zip", "id": "a19ba6b2-ae88-47d9-8da1-0935c1145ffe", "key": "HTR results NHA Notarial 1972 PAGE.zip", "metadata": null, "mimetype": "application/zip", "size": 10224852802 }, "HTR results NHA Notarial 1972 TXT.zip": { "checksum": "md5:80716cb025b328e0309d6e2b00f1d289", "ext": "zip", "id": "3a83b478-6e3f-40e9-bea6-5e50d2d10dd4", "key": "HTR results NHA Notarial 1972 TXT.zip", "metadata": null, "mimetype": "application/zip", "size": 412936174 }, "HTR results VOC PAGE.zip": { "checksum": "md5:63765847db358429cdf8c12305085eb4", "ext": "zip", "id": "65d3387b-2788-4efa-9f29-ef2ca1828566", "key": "HTR results VOC PAGE.zip", "metadata": null, "mimetype": "application/zip", "size": 17887243231 }, "HTR results VOC TXT.zip": { "checksum": "md5:23431e2779e9cadce798adba47ff57ea", "ext": "zip", "id": "360c38ac-24c4-4e6c-b6bc-8531528ec886", "key": "HTR results VOC TXT.zip", "metadata": null, "mimetype": "application/zip", "size": 710830346 }, "Notarial deeds Ground Truths of the trainingset in PAGE xml.7z": { "checksum": "md5:8b8d2fa465c8d1dad71d0fd5817f93da", "ext": "7z", "id": "bcc5c35a-bbd1-4db7-8891-b5a00d928faf", "key": "Notarial deeds Ground Truths of the trainingset in PAGE xml.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 54461642 }, "Notarial deeds Images of the training set.7z": { "checksum": "md5:eaf0c2bf0033cb129d5c747a2be039ea", "ext": "7z", "id": "539c850a-4414-429b-8f26-49f69f5631bd", "key": "Notarial deeds Images of the training set.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 7203628715 }, "VOC Ground truths of the trainingset in PAGE xml.7z": { "checksum": "md5:ca68bf64bd40af7593090d1425766e94", "ext": "7z", "id": "54cce28c-f2ed-4449-8df0-4b9bf9c9cbe0", "key": "VOC Ground truths of the trainingset in PAGE xml.7z", "metadata": 
null, "mimetype": "application/octet-stream", "size": 144231651 }, "VOC Images of the trainingset.7z": { "checksum": "md5:84dcb6e819ad897e81ac26150c428e3e", "ext": "7z", "id": "ab67189c-a5ee-43d4-8006-0babb9abcf27", "key": "VOC Images of the trainingset.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 18485261890 } }, "order": [], "total_bytes": 85490828757 }, "id": "4383748", "is_draft": false, "is_published": true, "links": { "access": "https://zenodo.org/api/records/4383748/access", "access_links": "https://zenodo.org/api/records/4383748/access/links", "access_request": "https://zenodo.org/api/records/4383748/access/request", "access_users": "https://zenodo.org/api/records/4383748/access/users", "archive": "https://zenodo.org/api/records/4383748/files-archive", "archive_media": "https://zenodo.org/api/records/4383748/media-files-archive", "communities": "https://zenodo.org/api/records/4383748/communities", "communities-suggestions": "https://zenodo.org/api/records/4383748/communities-suggestions", "doi": "https://doi.org/10.5281/zenodo.4383748", "draft": "https://zenodo.org/api/records/4383748/draft", "files": "https://zenodo.org/api/records/4383748/files", "latest": "https://zenodo.org/api/records/4383748/versions/latest", "latest_html": "https://zenodo.org/records/4383748/latest", "media_files": "https://zenodo.org/api/records/4383748/media-files", "parent": "https://zenodo.org/api/records/3517776", "parent_doi": "https://zenodo.org/doi/10.5281/zenodo.3517776", "parent_html": "https://zenodo.org/records/3517776", "requests": "https://zenodo.org/api/records/4383748/requests", "reserve_doi": "https://zenodo.org/api/records/4383748/draft/pids/doi", "self": "https://zenodo.org/api/records/4383748", "self_doi": "https://zenodo.org/doi/10.5281/zenodo.4383748", "self_html": "https://zenodo.org/records/4383748", "self_iiif_manifest": "https://zenodo.org/api/iiif/record:4383748/manifest", "self_iiif_sequence": 
"https://zenodo.org/api/iiif/record:4383748/sequence/default", "versions": "https://zenodo.org/api/records/4383748/versions" }, "media_files": { "count": 0, "enabled": false, "entries": {}, "order": [], "total_bytes": 0 }, "metadata": { "creators": [ { "affiliations": [ { "name": "National Archive Netherlands" } ], "person_or_org": { "family_name": "Liesbeth Keijser", "name": "Liesbeth Keijser", "type": "personal" } } ], "description": "
The National Archives of the Netherlands and Noord-Hollands Archief conducted a project using the Transkribus HTR (Handwritten Text Recognition) platform. The aim was to semi-automatically transcribe 2 million pages of old Dutch texts.
\n\nThe transcribed archives are 17th- and 18th-century documents from the Dutch East India Company (VOC) and 19th-century notarial deeds from Noord-Hollands Archief and other archives in the provinces.
\n\nIn order to train the HTR software, a team produced transcriptions of approximately 6000 scans. The scans were randomly selected from the dataset. With these transcriptions, a model was trained that can recognize more than 90% of the characters correctly. Transkribus then transcribed the 2 million scans automatically using the trained model.
\n\nThe following Transkribus HTR+ model has been trained for the text recognition: \"IJsberg\". More information about the model can be found here. See the chapter \"Dutch Handwriting\". However, the Transkribus team retrained the model with PyLaia technology, which improved the HTR+ model. This PyLaia model is not publicly available.
\n\nLater on, 1 million extra scans concerning the West India Company (WIC) were transcribed automatically without adding extra ground truth or training. These archives are from the 17th and 18th centuries.
\n\nThe datasets published in Zenodo contain the ground truth (scans in JPG, transcription in PAGE XML) and the HTR results (in PAGE XML and TXT). See the overview below. Scroll to the bottom of the page to download the actual files.
\n\nFor more information on how the Dutch National Archive innovates on digital accessibility, click here.
\n\nFor open data access of scans and inventories of the National Archives click here.
\n\nDisclaimer: due to the variety of languages used and the bad state of the documents, the HTR results of \"1.05.21, Dutch series Guyana\" can be of poor quality.
\n\n--------------------------------------------------------------
\n\nDataset HTR
\n(Dataset, name archive, number archive, inventory numbers, link to inventory)
HTR results VOC, VOC, 1.04.02, 7527-9540, EAD
\nHTR results 1.04.02, Oost-Indische Testamenten, 1.04.02, 6847-6897, EAD
\nHTR results 1.05.01.01, Oude WIC, 1.05.01.01, 1-87, EAD
\nHTR results 1.05.01.02, Tweede WIC, 1.05.01.02, 1-1382, EAD
\nHTR results 1.05.02, Raad der Koloniën, 1.05.02, 1-192, EAD
\nHTR results 1.05.03, Sociëteit van Suriname, 1.05.03, 1-566, EAD
\nHTR results 1.05.05, Sociëteit van Berbice, 1.05.05, 1-445, EAD
\nHTR results 1.05.06, Verspreide West-Indische stukken, 1.05.06, 1-1413, EAD
\nHTR results 1.05.21, Dutch series Guyana, 1.05.21, AB.1.1-BB.7.1, EAD
\nHTR results 2.01.28.01, West-Indisch comité, 2.01.28.01, 1-254, EAD
\nHTR results 2.01.28.02, Raad der Amerikaanse Bezittingen, 2.01.28.02, 1-264, EAD
\nHTR results NHA Notarial 1617, Oud notarieel archief Haarlem, 1617, 5-813, EAD
\nHTR results NHA Notarial 1972, Nieuw notarieel archief Haarlem, 1972, 1593-1805, EAD
Dataset Ground Truth
\n(Name archive, number archive, inventory numbers, link to inventory, type of dataset)
Dataset: Notarial deeds Ground Truths of the trainingset
\n\nDataset: Notarial deeds Images of the trainingset,
\n\n
\nDataset: VOC Ground Truths of the trainingset,
\nVOC, 1.04.02, 4735 random scans from 7527-9540, EAD, GT Transcriptions
\nDataset: VOC Images of the trainingset,
\nVOC, 1.04.02, 4735 random scans from 7527-9540, EAD, GT Scans
--------------------------------------------------------------
\n\nVersion 3.0: The first HTR results from the VOC-collection are available in .txt format, Inventory numbers 7527-9540.
\n\nVersion 3.1: The HTR results from the VOC-collection are also available in PAGE xml format.
\n\nVersion 4.0: About 30 missing inventory numbers have been added to the VOC transcriptions. The HTR results of the Notarial Deeds from the NHA archives have been added. An example of research based on full-text search can be found here (Dutch): https://kia.pleio.nl/groups/view/55812425/htr-en-ocr/blog/view/55814752/reconstructie-van-een-verijdelde-slavenopstand-met-behulp-van-automatische-handschriftherkenning-en-text-mining
\n\nVersion 5.0: Around a million pages of HTR results of the following archives have been added.
\n\nVersion 6.0: The HTR results of Oost-Indische Testamenten have been added.
", "languages": [ { "id": "odt", "title": { "en": "Old Dutch" } } ], "publication_date": "2020-01-21", "publisher": "Zenodo", "resource_type": { "id": "dataset", "title": { "de": "Datensatz", "en": "Dataset" } }, "rights": [ { "description": { "en": "The Creative Commons Attribution license allows re-distribution and re-use of a licensed work on the condition that the creator is appropriately credited." }, "icon": "cc-by-icon", "id": "cc-by-4.0", "props": { "scheme": "spdx", "url": "https://creativecommons.org/licenses/by/4.0/legalcode" }, "title": { "en": "Creative Commons Attribution 4.0 International" } } ], "subjects": [ { "subject": "Transciptions" }, { "subject": "Verenigde Oost-Indische Compagnie" }, { "subject": "West-Indische Compagnie" }, { "subject": "Notarial deeds" }, { "subject": "Nationaal Archief" }, { "subject": "Noord-Hollands Archief" }, { "subject": "Transkribus" } ], "title": "6000 ground truth of VOC and notarial deeds 3.000.000 HTR of VOC, WIC and notarial deeds", "version": "6.0" }, "parent": { "access": { "owned_by": { "user": 80914 } }, "communities": {}, "id": "3517776", "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.3517776", "provider": "datacite" } } }, "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.4383748", "provider": "datacite" }, "oai": { "identifier": "oai:zenodo.org:4383748", "provider": "oai" } }, "revision_id": 10, "stats": { "all_versions": { "data_volume": 25614732287782.0, "downloads": 6540, "unique_downloads": 3373, "unique_views": 26753, "views": 28808 }, "this_version": { "data_volume": 580959206886.0, "downloads": 444, "unique_downloads": 301, "unique_views": 1831, "views": 2103 } }, "status": "published", "updated": "2022-10-10T07:37:28.311471+00:00", "versions": { "index": 9, "is_latest": false } }