Dataset Open Access

Data set of the paper "Publishing an OCR ground truth data set for reuse in an unclear copyright setting"

David Lassner; Julius Coburger; Clemens Neudecker; Anne Baillot


JSON Export

{
  "files": [
    {
      "links": {
        "self": "https://zenodo.org/api/files/1509d4f9-3f1e-4ef4-8878-f7ea2dc533db/2021-05-7_v1.1_ocr-data.tgz"
      }, 
      "checksum": "md5:99a25e5a8cc8942e571cd908dfc61927", 
      "bucket": "1509d4f9-3f1e-4ef4-8878-f7ea2dc533db", 
      "key": "2021-05-7_v1.1_ocr-data.tgz", 
      "type": "tgz", 
      "size": 300004
    }
  ], 
  "owners": [
    103307
  ], 
  "doi": "10.5281/zenodo.4742068", 
  "stats": {
    "version_unique_downloads": 7.0, 
    "unique_views": 28.0, 
    "views": 35.0, 
    "version_views": 35.0, 
    "unique_downloads": 7.0, 
    "version_unique_views": 28.0, 
    "volume": 3000040.0, 
    "version_downloads": 10.0, 
    "downloads": 10.0, 
    "version_volume": 3000040.0
  }, 
  "links": {
    "doi": "https://doi.org/10.5281/zenodo.4742068", 
    "conceptdoi": "https://doi.org/10.5281/zenodo.4742067", 
    "bucket": "https://zenodo.org/api/files/1509d4f9-3f1e-4ef4-8878-f7ea2dc533db", 
    "conceptbadge": "https://zenodo.org/badge/doi/10.5281/zenodo.4742067.svg", 
    "html": "https://zenodo.org/record/4742068", 
    "latest_html": "https://zenodo.org/record/4742068", 
    "badge": "https://zenodo.org/badge/doi/10.5281/zenodo.4742068.svg", 
    "latest": "https://zenodo.org/api/records/4742068"
  }, 
  "conceptdoi": "10.5281/zenodo.4742067", 
  "created": "2021-05-07T08:50:12.350067+00:00", 
  "updated": "2021-05-12T10:45:46.244694+00:00", 
  "conceptrecid": "4742067", 
  "revision": 4, 
  "id": 4742068, 
  "metadata": {
    "access_right_category": "success", 
    "doi": "10.5281/zenodo.4742068", 
    "description": "<p>The data set consists of a METS file for each of the PDFs that were used for transcription and a directory data/page_xml that contains the transcriptions of the ground truth in PAGE-XML format. In parallel to the data set publication, a data paper will be published that contains a detailed description of the data set. As soon as it is published, we will link to it. The corresponding source code can be found here&nbsp;https://github.com/millawell/ocr-data/tree/1.1</p>", 
    "license": {
      "id": "CC-BY-4.0"
    }, 
    "title": "Data set of the paper \"Publishing an OCR ground truth data set for reuse in an unclear copyright setting\"", 
    "relations": {
      "version": [
        {
          "count": 1, 
          "index": 0, 
          "parent": {
            "pid_type": "recid", 
            "pid_value": "4742067"
          }, 
          "is_last": true, 
          "last_child": {
            "pid_type": "recid", 
            "pid_value": "4742068"
          }
        }
      ]
    }, 
    "version": "1.1", 
    "keywords": [
      "OCR ground-truth"
    ], 
    "publication_date": "2021-05-07", 
    "creators": [
      {
        "orcid": "0000-0001-9013-0834", 
        "affiliation": "TU Berlin", 
        "name": "David Lassner"
      }, 
      {
        "affiliation": "TU Berlin", 
        "name": "Julius Coburger"
      }, 
      {
        "orcid": "0000-0001-5293-8322", 
        "affiliation": "Staatsbibliothek zu Berlin - Preu\u00dfischer Kulturbesitz", 
        "name": "Clemens Neudecker"
      }, 
      {
        "orcid": "0000-0002-4593-059X", 
        "affiliation": "Le Mans Universit\u00e9", 
        "name": "Anne Baillot"
      }
    ], 
    "access_right": "open", 
    "resource_type": {
      "type": "dataset", 
      "title": "Dataset"
    }, 
    "related_identifiers": [
      {
        "scheme": "doi", 
        "identifier": "10.5281/zenodo.4742067", 
        "relation": "isVersionOf"
      }
    ]
  }
}
35 views
10 downloads
| | All versions | This version |
|---|---|---|
| Views | 35 | 35 |
| Downloads | 10 | 10 |
| Data volume | 3.0 MB | 3.0 MB |
| Unique views | 28 | 28 |
| Unique downloads | 7 | 7 |

Share

Cite as