Dataset Open Access

GT4HistOCR: Ground Truth for training OCR engines on historical documents in German Fraktur and Early Modern Latin

Springmann, Uwe; Reul, Christian; Dipper, Stefanie; Baiter, Johannes


DataCite XML Export

<?xml version='1.0' encoding='utf-8'?>
<!-- DataCite kernel-4 metadata record for the GT4HistOCR dataset (DOI 10.5281/zenodo.1344132). -->
<resource xmlns="http://datacite.org/schema/kernel-4"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.1/metadata.xsd">
  <identifier identifierType="DOI">10.5281/zenodo.1344132</identifier>
  <creators>
    <creator>
      <creatorName>Springmann, Uwe</creatorName>
      <givenName>Uwe</givenName>
      <familyName>Springmann</familyName>
      <affiliation>LMU</affiliation>
    </creator>
    <creator>
      <creatorName>Reul, Christian</creatorName>
      <givenName>Christian</givenName>
      <familyName>Reul</familyName>
      <affiliation>Universität Würzburg</affiliation>
    </creator>
    <creator>
      <creatorName>Dipper, Stefanie</creatorName>
      <givenName>Stefanie</givenName>
      <familyName>Dipper</familyName>
      <affiliation>Ruhr-Universität Bochum</affiliation>
    </creator>
    <creator>
      <creatorName>Baiter, Johannes</creatorName>
      <givenName>Johannes</givenName>
      <familyName>Baiter</familyName>
      <affiliation>Bayerische Staatsbibliothek München</affiliation>
    </creator>
  </creators>
  <titles>
    <title>GT4HistOCR: Ground Truth for training OCR engines on historical documents in German Fraktur and Early Modern Latin</title>
  </titles>
  <publisher>Zenodo</publisher>
  <publicationYear>2018</publicationYear>
  <subjects>
    <!-- One keyword per subject element, so each term is addressable on its own rather than packed into a comma-separated string. -->
    <subject>OCR</subject>
    <subject>historical documents</subject>
    <subject>digital humanities</subject>
    <subject>Fraktur</subject>
    <subject>Early Modern Latin</subject>
    <subject>Early New High German</subject>
  </subjects>
  <dates>
    <date dateType="Issued">2018-08-12</date>
  </dates>
  <language>en</language>
  <resourceType resourceTypeGeneral="Dataset"/>
  <alternateIdentifiers>
    <alternateIdentifier alternateIdentifierType="url">https://zenodo.org/record/1344132</alternateIdentifier>
  </alternateIdentifiers>
  <relatedIdentifiers>
    <relatedIdentifier relatedIdentifierType="DOI" relationType="IsVersionOf">10.5281/zenodo.1344131</relatedIdentifier>
  </relatedIdentifiers>
  <version>1.0</version>
  <rightsList>
    <rights rightsURI="http://creativecommons.org/licenses/by/4.0/legalcode">Creative Commons Attribution 4.0 International</rights>
    <rights rightsURI="info:eu-repo/semantics/openAccess">Open Access</rights>
  </rightsList>
  <descriptions>
    <!-- The abstract is HTML whose markup is escaped as entities; the line breaks inside it are part of the value, so its text is not re-indented. -->
    <description descriptionType="Abstract">&lt;p&gt;&lt;strong&gt;GT4HistOCR&lt;/strong&gt; contains ground truth for research in Optical Character Recognition (OCR) technology applied to historical printings in German Fraktur and Early Modern Latin.&lt;/p&gt;

&lt;p&gt;The ground truth comes in pairs of images of single printed lines as they appear in book pages (*.png) and their corresponding diplomatic transcriptions (*.gt.txt), which are UTF-8 strings preserving the character forms (glyphs) as much as possible within the UNICODE standard. These pairs of line images and their transcriptions can be directly used to train recognition models with, e.g., the open source OCR engines &lt;em&gt;OCRopy&lt;/em&gt; or &lt;em&gt;Tesseract&lt;/em&gt;. A total of 313,173 ground truth lines are provided.&lt;/p&gt;</description>
  </descriptions>
</resource>
861
216
views
downloads
All versions This version
Views 861 861
Downloads 216 216
Data volume 426.7 GB 426.7 GB
Unique views 804 804
Unique downloads 154 154

Share

Cite as