Dataset Open Access

GT4HistOCR: Ground Truth for training OCR engines on historical documents in German Fraktur and Early Modern Latin

Springmann, Uwe; Reul, Christian; Dipper, Stefanie; Baiter, Johannes


DCAT Export

<?xml version='1.0' encoding='utf-8'?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:dctype="http://purl.org/dc/dcmitype/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:duv="http://www.w3.org/ns/duv#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:frapo="http://purl.org/cerif/frapo/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:gsp="http://www.opengis.net/ont/geosparql#" xmlns:locn="http://www.w3.org/ns/locn#" xmlns:org="http://www.w3.org/ns/org#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <rdf:Description rdf:about="https://doi.org/10.5281/zenodo.1344132">
    <!-- Typing: the described resource is a dcat:Dataset and also carries the DCMI "Dataset" resource type. -->
    <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
    <dct:type rdf:resource="http://purl.org/dc/dcmitype/Dataset"/>
    <!-- The DOI URL appears twice: as an xsd:anyURI identifier literal and as the foaf:page resource. -->
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://doi.org/10.5281/zenodo.1344132</dct:identifier>
    <foaf:page rdf:resource="https://doi.org/10.5281/zenodo.1344132"/>
    <!-- Creator: Uwe Springmann, member of the organization named "LMU". -->
    <dct:creator>
      <!-- Typed node element: yields the same rdf:type foaf:Agent triple as an
           rdf:Description with an explicit rdf:type child. -->
      <foaf:Agent>
        <foaf:name>Springmann, Uwe</foaf:name>
        <foaf:givenName>Uwe</foaf:givenName>
        <foaf:familyName>Springmann</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>LMU</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Creator: Christian Reul, member of the organization named "Universität Würzburg". -->
    <dct:creator>
      <!-- Typed node element: yields the same rdf:type foaf:Agent triple as an
           rdf:Description with an explicit rdf:type child. -->
      <foaf:Agent>
        <foaf:name>Reul, Christian</foaf:name>
        <foaf:givenName>Christian</foaf:givenName>
        <foaf:familyName>Reul</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Universität Würzburg</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Creator: Stefanie Dipper, member of the organization named "Ruhr-Universität Bochum". -->
    <dct:creator>
      <!-- Typed node element: yields the same rdf:type foaf:Agent triple as an
           rdf:Description with an explicit rdf:type child. -->
      <foaf:Agent>
        <foaf:name>Dipper, Stefanie</foaf:name>
        <foaf:givenName>Stefanie</foaf:givenName>
        <foaf:familyName>Dipper</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Ruhr-Universität Bochum</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Creator: Johannes Baiter, member of the organization named
         "Bayerische Staatsbibliothek München" (spelling of the affiliation corrected). -->
    <dct:creator>
      <!-- Typed node element: yields the same rdf:type foaf:Agent triple as an
           rdf:Description with an explicit rdf:type child. -->
      <foaf:Agent>
        <foaf:name>Baiter, Johannes</foaf:name>
        <foaf:givenName>Johannes</foaf:givenName>
        <foaf:familyName>Baiter</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Bayerische Staatsbibliothek München</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Dataset title as a plain literal. -->
    <dct:title>GT4HistOCR: Ground Truth for training OCR engines on historical documents in German Fraktur and Early Modern Latin</dct:title>
    <!-- Publisher: a foaf:Agent without an IRI, described only by its foaf:name. -->
    <dct:publisher>
      <foaf:Agent>
        <foaf:name>Zenodo</foaf:name>
      </foaf:Agent>
    </dct:publisher>
    <!-- Year-only issue date (xsd:gYear); a full-date dct:issued also appears later in this description. -->
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2018</dct:issued>
    <!-- One dcat:keyword statement per term, so consumers can index and facet on
         individual keywords instead of a single comma-separated string. -->
    <dcat:keyword>OCR</dcat:keyword>
    <dcat:keyword>historical documents</dcat:keyword>
    <dcat:keyword>digital humanities</dcat:keyword>
    <dcat:keyword>Fraktur</dcat:keyword>
    <dcat:keyword>Early Modern Latin</dcat:keyword>
    <dcat:keyword>Early New High German</dcat:keyword>
    <!-- Full issue date (xsd:date). -->
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2018-08-12</dct:issued>
    <!-- Language given as an EU Publications Office authority IRI (code ENG). -->
    <dct:language rdf:resource="http://publications.europa.eu/resource/authority/language/ENG"/>
    <!-- The Zenodo record URL is declared equivalent to the DOI-identified resource. -->
    <owl:sameAs rdf:resource="https://zenodo.org/record/1344132"/>
    <!-- Alternate identifier: the record URL as an xsd:anyURI notation, with scheme agency "url". -->
    <adms:identifier>
      <adms:Identifier>
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/1344132</skos:notation>
        <adms:schemeAgency>url</adms:schemeAgency>
      </adms:Identifier>
    </adms:identifier>
    <!-- Points at a different DOI (…1344131) than this record's own (…1344132);
         NOTE(review): presumably the DOI covering all versions; confirm. -->
    <dct:isVersionOf rdf:resource="https://doi.org/10.5281/zenodo.1344131"/>
    <owl:versionInfo>1.0</owl:versionInfo>
    <!-- Description literal: HTML markup escaped as character entities. Kept on a single
         line because any whitespace added inside the literal would become part of the value. -->
    <dct:description>&lt;p&gt;&lt;strong&gt;GT4HistOCR&lt;/strong&gt; contains ground truth for research in Optical Character Recognition (OCR) technology applied to historical printings in German Fraktur and Early Modern Latin.&lt;/p&gt; &lt;p&gt;The ground truth comes in pairs of images of single printed lines as they appear in book pages (*.png) and their corresponding diplomatic transcriptions (*.gt.txt), which are UTF-8 strings preserving the character forms (glyphs) as much as possible within the UNICODE standard. These pairs of line images and their transcriptions can be directly used to train recognition models with, e.g., the open source OCR engines &lt;em&gt;OCRopy&lt;/em&gt; or &lt;em&gt;Tesseract&lt;/em&gt;. A total of 313,173 ground truth lines are provided.&lt;/p&gt; &lt;p&gt;&lt;strong&gt;Please note that the subcorpora making up this collection used different transcription guidelines, so it is a bad idea to train a recognition model on the total collection! Rather train individual models for each subcorpus.&lt;/strong&gt; For further information about the subcorpora, please see the README file and the accompanying publication.&lt;/p&gt; &lt;p&gt;If these data are useful for you, please cite the accompanying publication:&lt;/p&gt; &lt;pre&gt;@article{&lt;a href="http://springmann.net/publications.html#springmann2018gt4hist"&gt;springmann2018gt4hist&lt;/a&gt;, author = {Uwe Springmann and Christian Reul and Stefanie Dipper and Johannes Baiter}, title = {{Ground Truth for training {OCR} engines on historical documents in German Fraktur and Early Modern Latin}}, journal = {J. Lang. Technol. Comput. Linguistics}, volume = {33}, number = {1}, pages = {97--114}, year = {2018}, url = {https://jlcl.org/content/2-allissues/1-heft1-2018/jlcl_2018-1_5.pdf} }&lt;/pre&gt;</dct:description>
    <!-- Access rights are stated twice: once by reference to the EU authority value PUBLIC,
         once as an inline dct:RightsStatement identified by info:eu-repo/semantics/openAccess
         with the label "Open Access". -->
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <dct:accessRights>
      <dct:RightsStatement rdf:about="info:eu-repo/semantics/openAccess">
        <rdfs:label>Open Access</rdfs:label>
      </dct:RightsStatement>
    </dct:accessRights>
    <!-- Distribution that carries the licence (Creative Commons BY 4.0 legal code) and the DOI
         as access URL; it lists no file, size or media type. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dct:license rdf:resource="https://creativecommons.org/licenses/by/4.0/legalcode"/>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.1344132"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: the GT4HistOCR.tar archive (size in bytes, tar media type). -->
    <dcat:distribution>
      <dcat:Distribution>
        <!-- A property element that uses rdf:resource must be empty in RDF/XML: the IRI is
             given by the attribute alone. Repeating it as text content makes the document
             unparseable for conforming RDF/XML parsers. -->
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.1344132"/>
        <dcat:byteSize>4025354240</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/1344132/files/GT4HistOCR.tar"/>
        <dcat:mediaType>application/x-tar</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: the README file (size in bytes; no media type is given). -->
    <dcat:distribution>
      <dcat:Distribution>
        <!-- A property element that uses rdf:resource must be empty in RDF/XML: the IRI is
             given by the attribute alone. Repeating it as text content makes the document
             unparseable for conforming RDF/XML parsers. -->
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.1344132"/>
        <dcat:byteSize>2559</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/1344132/files/README"/>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
</rdf:RDF>
3,393 views
3,513 downloads
                   All versions   This version
Views              3,393          3,394
Downloads          3,513          3,512
Data volume        12.2 TB        12.2 TB
Unique views       3,087          3,088
Unique downloads   1,062          1,061

Share

Cite as