Dataset Open Access

OCR fulltexts of the Digital Collections of the Berlin State Library (DC-SBB)

Labusch, Kai; Zellhöfer, David


JSON Export

{
  "files": [
    {
      "links": {
        "self": "https://zenodo.org/api/files/efecb366-2b18-45c9-bd10-fd1ea88308f4/corpus-entropy.pkl"
      }, 
      "checksum": "md5:683fe1c3d5c1b275c002248bddbb88e1", 
      "bucket": "efecb366-2b18-45c9-bd10-fd1ea88308f4", 
      "key": "corpus-entropy.pkl", 
      "type": "pkl", 
      "size": 175051440
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/efecb366-2b18-45c9-bd10-fd1ea88308f4/corpus-language.pkl"
      }, 
      "checksum": "md5:014cabcad9e974174f2a590f702c63fc", 
      "bucket": "efecb366-2b18-45c9-bd10-fd1ea88308f4", 
      "key": "corpus-language.pkl", 
      "type": "pkl", 
      "size": 198737314
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/efecb366-2b18-45c9-bd10-fd1ea88308f4/corpus.zip"
      }, 
      "checksum": "md5:11b23cddbf82cd6e0595ad367189db18", 
      "bucket": "efecb366-2b18-45c9-bd10-fd1ea88308f4", 
      "key": "corpus.zip", 
      "type": "zip", 
      "size": 4173625252
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/efecb366-2b18-45c9-bd10-fd1ea88308f4/de_corpus.zip"
      }, 
      "checksum": "md5:7c9c9922dece2068252533e5b7be8536", 
      "bucket": "efecb366-2b18-45c9-bd10-fd1ea88308f4", 
      "key": "de_corpus.zip", 
      "type": "zip", 
      "size": 2173534503
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/efecb366-2b18-45c9-bd10-fd1ea88308f4/selection_de.pkl"
      }, 
      "checksum": "md5:7afad26c0ab83601c6467dfd74039b97", 
      "bucket": "efecb366-2b18-45c9-bd10-fd1ea88308f4", 
      "key": "selection_de.pkl", 
      "type": "pkl", 
      "size": 143317470
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/efecb366-2b18-45c9-bd10-fd1ea88308f4/xml2csv_alto.csv"
      }, 
      "checksum": "md5:0cf20919da1df6f67d634304e12c3a1a", 
      "bucket": "efecb366-2b18-45c9-bd10-fd1ea88308f4", 
      "key": "xml2csv_alto.csv", 
      "type": "csv", 
      "size": 33116656376
    }
  ], 
  "owners": [
    61238
  ], 
  "doi": "10.5281/zenodo.3257041", 
  "stats": {
    "version_unique_downloads": 150.0, 
    "unique_views": 423.0, 
    "views": 458.0, 
    "version_views": 458.0, 
    "unique_downloads": 150.0, 
    "version_unique_views": 423.0, 
    "volume": 3013524710385.0, 
    "version_downloads": 305.0, 
    "downloads": 305.0, 
    "version_volume": 3013524710385.0
  }, 
  "links": {
    "doi": "https://doi.org/10.5281/zenodo.3257041", 
    "conceptdoi": "https://doi.org/10.5281/zenodo.3257040", 
    "bucket": "https://zenodo.org/api/files/efecb366-2b18-45c9-bd10-fd1ea88308f4", 
    "conceptbadge": "https://zenodo.org/badge/doi/10.5281/zenodo.3257040.svg", 
    "html": "https://zenodo.org/record/3257041", 
    "latest_html": "https://zenodo.org/record/3257041", 
    "badge": "https://zenodo.org/badge/doi/10.5281/zenodo.3257041.svg", 
    "latest": "https://zenodo.org/api/records/3257041"
  }, 
  "conceptdoi": "10.5281/zenodo.3257040", 
  "created": "2019-06-26T11:14:42.972981+00:00", 
  "updated": "2020-01-24T19:26:25.882416+00:00", 
  "conceptrecid": "3257040", 
  "revision": 4, 
  "id": 3257041, 
  "metadata": {
    "access_right_category": "success", 
    "doi": "10.5281/zenodo.3257041", 
    "description": "<p>The digital collections of the SBB contain 153,942 digitized works from the time period of 1470 to 1945.</p>\n\n<p>At the time of publication, 28,909 works have been OCR-processed resulting in 4,988,099 full-text pages.<br>\nFor each page with OCR text, the language has been determined by <em>langid </em>(Lui/Baldwin 2012).</p>\n\n<p>corpus-entropy.pkl &nbsp; &nbsp;&nbsp; entropy rate per document page</p>\n\n<p>corpus-language.pkl&nbsp;&nbsp; language per document page</p>\n\n<p>corpus.zip &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp; fulltext corpus (extracts to .txt format)</p>\n\n<p>de_corpus.zip &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp; German sub-corpus (extracts to .txt format)</p>\n\n<p>selection_de.pkl&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Selection list of German documents</p>\n\n<p>xml2csv_alto.csv&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; fulltext corpus per document page (incl.OCR word confidences)</p>\n\n<p>&nbsp;</p>\n\n<p><em>Sources</em></p>\n\n<p>Marco Lui and Timothy Baldwin. 2012. Langid.py:</p>\n\n<p>An off-the-shelf language identification tool. In Proceedings of the ACL 2012 System Demonstrations,</p>\n\n<p>ACL &rsquo;12, pages 25&ndash;30, Stroudsburg, PA, USA. Association for Computational Linguistics</p>", 
    "license": {
      "id": "CC-BY-4.0"
    }, 
    "title": "OCR fulltexts of the Digital Collections of the Berlin State Library (DC-SBB)", 
    "relations": {
      "version": [
        {
          "count": 1, 
          "index": 0, 
          "parent": {
            "pid_type": "recid", 
            "pid_value": "3257040"
          }, 
          "is_last": true, 
          "last_child": {
            "pid_type": "recid", 
            "pid_value": "3257041"
          }
        }
      ]
    }, 
    "communities": [
      {
        "id": "stabi"
      }
    ], 
    "version": "1.0", 
    "keywords": [
      "OCR fulltext", 
      "historic texts"
    ], 
    "publication_date": "2019-06-26", 
    "creators": [
      {
        "affiliation": "Berlin State Library", 
        "name": "Labusch, Kai"
      }, 
      {
        "orcid": "0000-0002-0403-457X", 
        "affiliation": "Berlin State Library", 
        "name": "Zellh\u00f6fer, David"
      }
    ], 
    "access_right": "open", 
    "resource_type": {
      "type": "dataset", 
      "title": "Dataset"
    }, 
    "related_identifiers": [
      {
        "scheme": "doi", 
        "identifier": "10.5281/zenodo.3257040", 
        "relation": "isVersionOf"
      }
    ]
  }
}
458
305
views
downloads
All versions This version
Views 458 458
Downloads 305 305
Data volume 3.0 TB 3.0 TB
Unique views 423 423
Unique downloads 150 150

Share

Cite as