{ "access": { "embargo": { "active": false, "reason": null }, "files": "public", "record": "public", "status": "open" }, "created": "2019-06-26T11:14:42.972981+00:00", "custom_fields": {}, "deletion_status": { "is_deleted": false, "status": "P" }, "files": { "count": 6, "enabled": true, "entries": { "corpus-entropy.pkl": { "checksum": "md5:683fe1c3d5c1b275c002248bddbb88e1", "ext": "pkl", "id": "40d89115-cb9c-4edd-9881-44e4cf138b4b", "key": "corpus-entropy.pkl", "metadata": null, "mimetype": "application/octet-stream", "size": 175051440 }, "corpus-language.pkl": { "checksum": "md5:014cabcad9e974174f2a590f702c63fc", "ext": "pkl", "id": "92f24a22-9042-4f07-a2f2-36b0122e24b2", "key": "corpus-language.pkl", "metadata": null, "mimetype": "application/octet-stream", "size": 198737314 }, "corpus.zip": { "checksum": "md5:11b23cddbf82cd6e0595ad367189db18", "ext": "zip", "id": "a2f100f2-8321-493f-ba19-30a314a73122", "key": "corpus.zip", "metadata": null, "mimetype": "application/zip", "size": 4173625252 }, "de_corpus.zip": { "checksum": "md5:7c9c9922dece2068252533e5b7be8536", "ext": "zip", "id": "e6755399-33a4-4084-9ff2-a99cca5a0fd3", "key": "de_corpus.zip", "metadata": null, "mimetype": "application/zip", "size": 2173534503 }, "selection_de.pkl": { "checksum": "md5:7afad26c0ab83601c6467dfd74039b97", "ext": "pkl", "id": "8b25d492-d238-48ab-8a00-b539b2d331de", "key": "selection_de.pkl", "metadata": null, "mimetype": "application/octet-stream", "size": 143317470 }, "xml2csv_alto.csv": { "checksum": "md5:0cf20919da1df6f67d634304e12c3a1a", "ext": "csv", "id": "3ee68cfe-683f-4b39-8bda-e966a550b9b3", "key": "xml2csv_alto.csv", "metadata": null, "mimetype": "text/csv", "size": 33116656376 } }, "order": [], "total_bytes": 39980922355 }, "id": "3257041", "is_draft": false, "is_published": true, "links": { "access": "https://zenodo.org/api/records/3257041/access", "access_links": "https://zenodo.org/api/records/3257041/access/links", "access_request": 
"https://zenodo.org/api/records/3257041/access/request", "access_users": "https://zenodo.org/api/records/3257041/access/users", "archive": "https://zenodo.org/api/records/3257041/files-archive", "archive_media": "https://zenodo.org/api/records/3257041/media-files-archive", "communities": "https://zenodo.org/api/records/3257041/communities", "communities-suggestions": "https://zenodo.org/api/records/3257041/communities-suggestions", "doi": "https://doi.org/10.5281/zenodo.3257041", "draft": "https://zenodo.org/api/records/3257041/draft", "files": "https://zenodo.org/api/records/3257041/files", "latest": "https://zenodo.org/api/records/3257041/versions/latest", "latest_html": "https://zenodo.org/records/3257041/latest", "media_files": "https://zenodo.org/api/records/3257041/media-files", "parent": "https://zenodo.org/api/records/3257040", "parent_doi": "https://zenodo.org/doi/10.5281/zenodo.3257040", "parent_html": "https://zenodo.org/records/3257040", "requests": "https://zenodo.org/api/records/3257041/requests", "reserve_doi": "https://zenodo.org/api/records/3257041/draft/pids/doi", "self": "https://zenodo.org/api/records/3257041", "self_doi": "https://zenodo.org/doi/10.5281/zenodo.3257041", "self_html": "https://zenodo.org/records/3257041", "self_iiif_manifest": "https://zenodo.org/api/iiif/record:3257041/manifest", "self_iiif_sequence": "https://zenodo.org/api/iiif/record:3257041/sequence/default", "versions": "https://zenodo.org/api/records/3257041/versions" }, "media_files": { "count": 0, "enabled": false, "entries": {}, "order": [], "total_bytes": 0 }, "metadata": { "creators": [ { "affiliations": [ { "name": "Berlin State Library" } ], "person_or_org": { "family_name": "Labusch", "given_name": "Kai", "name": "Labusch, Kai", "type": "personal" } }, { "affiliations": [ { "name": "Berlin State Library" } ], "person_or_org": { "family_name": "Zellh\u00f6fer", "given_name": "David", "identifiers": [ { "identifier": "0000-0002-0403-457X", "scheme": "orcid" } ], 
"name": "Zellh\u00f6fer, David", "type": "personal" } } ], "description": "
The digital collections of the SBB contain 153,942 digitized works from the time period of 1470 to 1945.
\n\nAt the time of publication, 28,909 works had been OCR-processed, resulting in 4,988,099 full-text pages.
\nFor each page with OCR text, the language has been determined by langid (Lui/Baldwin 2012).
corpus-entropy.pkl entropy rate per document page
\n\ncorpus-language.pkl language per document page
\n\ncorpus.zip fulltext corpus (extracts to .txt format)
\n\nde_corpus.zip German sub-corpus (extracts to .txt format)
\n\nselection_de.pkl Selection list of German documents
\n\nxml2csv_alto.csv fulltext corpus per document page (incl. OCR word confidences)
\n\n\n\n
Sources
\n\nMarco Lui and Timothy Baldwin. 2012. Langid.py: An off-the-shelf language identification tool. In Proceedings of the ACL 2012 System Demonstrations, ACL ’12, pages 25–30, Stroudsburg, PA, USA. Association for Computational Linguistics.
", "publication_date": "2019-06-26", "publisher": "Zenodo", "resource_type": { "id": "dataset", "title": { "de": "Datensatz", "en": "Dataset" } }, "rights": [ { "description": { "en": "The Creative Commons Attribution license allows re-distribution and re-use of a licensed work on the condition that the creator is appropriately credited." }, "icon": "cc-by-icon", "id": "cc-by-4.0", "props": { "scheme": "spdx", "url": "https://creativecommons.org/licenses/by/4.0/legalcode" }, "title": { "en": "Creative Commons Attribution 4.0 International" } } ], "subjects": [ { "subject": "OCR fulltext" }, { "subject": "historic texts" } ], "title": "OCR fulltexts of the Digital Collections of the Berlin State Library (DC-SBB)", "version": "1.0" }, "parent": { "access": { "owned_by": { "user": 61238 } }, "communities": { "default": "d3a7d86e-cfb2-4d89-a5ec-78279f7b181f", "entries": [ { "access": { "member_policy": "open", "members_visibility": "public", "record_policy": "open", "review_policy": "open", "visibility": "public" }, "children": { "allow": false }, "created": "2019-02-22T14:57:24.676819+00:00", "custom_fields": {}, "deletion_status": { "is_deleted": false, "status": "P" }, "id": "d3a7d86e-cfb2-4d89-a5ec-78279f7b181f", "links": {}, "metadata": { "curation_policy": "We invite everyone to contribute datasets, demonstrators, or publications based on this material.
", "description": "This repository provides datasets and data publications from Staatsbibliothek zu Berlin (Berlin State Library). In addition, you can find demos in the Stabi Lab (https://lab.sbb.berlin/) and trained models on HuggingFace (https://huggingface.co/SBB).", "title": "Datasets of Staatsbibliothek zu Berlin - Berlin State Library", "website": "https://www.staatsbibliothek-berlin.de/" }, "revision_id": 3, "slug": "stabi", "updated": "2024-03-06T16:10:53.590958+00:00" } ], "ids": [ "d3a7d86e-cfb2-4d89-a5ec-78279f7b181f" ] }, "id": "3257040", "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.3257040", "provider": "datacite" } } }, "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.3257041", "provider": "datacite" }, "oai": { "identifier": "oai:zenodo.org:3257041", "provider": "oai" } }, "revision_id": 4, "stats": { "all_versions": { "data_volume": 4144317526986.0, "downloads": 418, "unique_downloads": 240, "unique_views": 1010, "views": 1074 }, "this_version": { "data_volume": 4044450451634.0, "downloads": 412, "unique_downloads": 234, "unique_views": 1000, "views": 1064 } }, "status": "published", "updated": "2020-01-24T19:26:25.882416+00:00", "versions": { "index": 1, "is_latest": true } }