Conference paper Open Access

An Analysis of the Performance of Named Entity Recognition over OCRed Documents

Hamdi, Ahmed; Jean-Caurant, Axel; Sidere, Nicolas; Coustaty, Mickael; Doucet, Antoine


DCAT Export

<?xml version='1.0' encoding='utf-8'?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:dctype="http://purl.org/dc/dcmitype/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:duv="http://www.w3.org/ns/duv#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:frapo="http://purl.org/cerif/frapo/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:gsp="http://www.opengis.net/ont/geosparql#" xmlns:locn="http://www.w3.org/ns/locn#" xmlns:org="http://www.w3.org/ns/org#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <rdf:Description rdf:about="https://doi.org/10.5281/zenodo.3243344">
    <!-- Core identity: the record is typed as a DCAT dataset whose content type is text, and is identified by its DOI URL. -->
    <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
    <dct:type rdf:resource="http://purl.org/dc/dcmitype/Text"/>
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://doi.org/10.5281/zenodo.3243344</dct:identifier>
    <!-- The landing page points at the same DOI URL that is used as the identifier above. -->
    <foaf:page rdf:resource="https://doi.org/10.5281/zenodo.3243344"/>
    <!-- Authors: each creator is an anonymous foaf:Agent (written as a typed node) with full, given and family name plus an affiliation. -->
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Hamdi, Ahmed</foaf:name>
        <foaf:givenName>Ahmed</foaf:givenName>
        <foaf:familyName>Hamdi</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>L3i Laboratory, University of La Rochelle</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Jean-Caurant, Axel</foaf:name>
        <foaf:givenName>Axel</foaf:givenName>
        <foaf:familyName>Jean-Caurant</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>L3i Laboratory, University of La Rochelle</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Sidere, Nicolas</foaf:name>
        <foaf:givenName>Nicolas</foaf:givenName>
        <foaf:familyName>Sidere</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>L3i Laboratory, University of La Rochelle</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Coustaty, Mickael</foaf:name>
        <foaf:givenName>Mickael</foaf:givenName>
        <foaf:familyName>Coustaty</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>L3i Laboratory, University of La Rochelle</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Doucet, Antoine</foaf:name>
        <foaf:givenName>Antoine</foaf:givenName>
        <foaf:familyName>Doucet</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>L3i Laboratory, University of La Rochelle</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Title of the paper. -->
    <dct:title>An Analysis of the Performance of Named Entity Recognition over OCRed Documents</dct:title>
    <!-- Publisher is an anonymous agent that carries only a name. -->
    <dct:publisher>
      <foaf:Agent>
        <foaf:name>Zenodo</foaf:name>
      </foaf:Agent>
    </dct:publisher>
    <!-- Year-only issue date (xsd:gYear). NOTE(review): a second dct:issued typed xsd:date appears further down in this description; confirm consumers accept two values. -->
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2019</dct:issued>
    <!-- Free-text keywords, one per element; values carry no list punctuation. -->
    <dcat:keyword>Indexing</dcat:keyword>
    <dcat:keyword>OCR</dcat:keyword>
    <dcat:keyword>Named Entity</dcat:keyword>
    <dcat:keyword>Extraction</dcat:keyword>
    <dcat:keyword>Digital Libraries</dcat:keyword>
    <!-- Funding: reference to the grant resource by its info:eu-repo grant-agreement identifier. -->
    <frapo:isFundedBy rdf:resource="info:eu-repo/grantAgreement/EC/H2020/770299/"/>
    <!-- Funder as an anonymous organisation with a string identifier and a name. NOTE(review): the identifier looks like a funder-registry DOI; confirm the scheme. -->
    <schema:funder>
      <foaf:Organization>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">10.13039/501100000780</dct:identifier>
        <foaf:name>European Commission</foaf:name>
      </foaf:Organization>
    </schema:funder>
    <!-- Full issue date (xsd:date). -->
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-06-02</dct:issued>
    <!-- Language given as a controlled-vocabulary URI (ENG). -->
    <dct:language rdf:resource="http://publications.europa.eu/resource/authority/language/ENG"/>
    <!-- The Zenodo record URL is declared equivalent to the DOI resource and repeated below as an ADMS identifier whose scheme agency is "url". -->
    <owl:sameAs rdf:resource="https://zenodo.org/record/3243344"/>
    <adms:identifier>
      <adms:Identifier>
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/3243344</skos:notation>
        <adms:schemeAgency>url</adms:schemeAgency>
      </adms:Identifier>
    </adms:identifier>
    <!-- Relations: this record is a version of another DOI and belongs to the "newseye" community. NOTE(review): the isVersionOf target is presumably the all-versions DOI; confirm. -->
    <dct:isVersionOf rdf:resource="https://doi.org/10.5281/zenodo.3243343"/>
    <dct:isPartOf rdf:resource="https://zenodo.org/communities/newseye"/>
    <!-- Abstract as a plain-text literal: the escaped HTML paragraph wrapper and the undecoded nbsp entities were removed so consumers do not display markup residue. -->
    <dct:description>The use of digital libraries requires an easy accessibility to documents which is strongly impacted by the quality of document indexing. Named entities are among the most important information to index digital documents. According to a recent study, 80% of the top 500 queries sent to a digital library portal contained at least one named entity [2]. However most digitized documents are indexed through their OCRed version which includes numerous errors that may hinder the access to them. Named Entity Recognition (NER) is the task that aims to locate important names in a given text and to categorize them into a set of predefined classes (person, location, organization). This paper aims to estimate the performance of NER systems through OCRed data. It exhaustively discusses NER errors arising from OCR errors; we studied the correlation between NER accuracy and OCR error rates and estimated the cost of character insertion, deletion and substitution in named entities. Results show that even if the OCR engine does contaminate named entities with errors, NER systems can overcome this issue and correctly recognise some of them.</dct:description>
    <!-- Access rights are stated twice: once as a controlled-vocabulary URI (PUBLIC) and once as an info:eu-repo rights statement labelled "Open Access". -->
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <dct:accessRights>
      <dct:RightsStatement rdf:about="info:eu-repo/semantics/openAccess">
        <rdfs:label>Open Access</rdfs:label>
      </dct:RightsStatement>
    </dct:accessRights>
    <!-- Distribution carrying the licence (CC BY 4.0 legal code); its access URL is the record DOI. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dct:license rdf:resource="https://creativecommons.org/licenses/by/4.0/legalcode"/>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.3243344"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- Distribution for the PDF file: access URL, size, direct download URL and media type. -->
    <dcat:distribution>
      <dcat:Distribution>
        <!-- Access URL is a resource reference, consistent with the licence distribution. -->
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.3243344"/>
        <dcat:byteSize>320906</dcat:byteSize>
        <!-- A property element with rdf:resource must be empty, and an IRI may not contain raw spaces, so the file URL is given once and percent-encoded. -->
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/3243344/files/JCDL_2019_An%20Analysis%20of%20the%20Performance%20of%20Named%20Entity%20Recognition.pdf"/>
        <dcat:mediaType>application/pdf</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
  <!-- Grant resource referenced by the dataset's funding link: a foaf:Project with its grant number, title and awarding organisation. -->
  <rdf:Description rdf:about="info:eu-repo/grantAgreement/EC/H2020/770299/">
    <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Project"/>
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">770299</dct:identifier>
    <dct:title>NewsEye: A Digital Investigator for Historical Newspapers</dct:title>
    <frapo:isAwardedBy>
      <foaf:Organization>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">10.13039/501100000780</dct:identifier>
        <foaf:name>European Commission</foaf:name>
      </foaf:Organization>
    </frapo:isAwardedBy>
  </rdf:Description>
</rdf:RDF>
1,065 views
707 downloads
Statistic | All versions | This version
Views | 1,065 | 1,067
Downloads | 707 | 707
Data volume | 226.9 MB | 226.9 MB
Unique views | 824 | 826
Unique downloads | 665 | 665

Share

Cite as