Video/Audio Open Access

CORPUS17: a philological French corpus for 17thcentury

Simon Gabay; Alexandre Bartz; Yohann Deguin

MARC21 XML Export

<?xml version='1.0' encoding='UTF-8'?>
<!--
  MARC21 XML export of a single bibliographic record
  (control number 4088669, DOI 10.5281/zenodo.4088669).

  Every datafield and the root record element are explicitly closed so the
  document is well-formed.

  NOTE(review): the default namespace below is empty, and several "u", "o"
  and "2" subfields carry no value. Presumably URLs/identifiers were stripped
  from this copy; MARCXML records normally declare the MARC21 slim
  namespace on the root. Confirm against the original export before relying
  on namespace-aware processing.
-->
<record xmlns="">
  <datafield tag="041" ind1=" " ind2=" ">
    <subfield code="a">fra</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">17th c. French, OCR, normalisation, lemmatisation, POS-tagging,named entities, digital humanities, XML-TEI</subfield>
  </datafield>
  <controlfield tag="005">20201014171233.0</controlfield>
  <controlfield tag="001">4088669</controlfield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">École des Chartes, Paris, Franc</subfield>
    <subfield code="a">Alexandre Bartz</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">Université de Rennes, Rennes, France</subfield>
    <subfield code="a">Yohann Deguin</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">30813626</subfield>
    <subfield code="z">md5:6e1886d5ea2f2f330aea37faeb07f1a9</subfield>
    <subfield code="u"></subfield>
  </datafield>
  <datafield tag="542" ind1=" " ind2=" ">
    <subfield code="l">open</subfield>
  </datafield>
  <datafield tag="260" ind1=" " ind2=" ">
    <subfield code="c">2020-10-14</subfield>
  </datafield>
  <datafield tag="909" ind1="C" ind2="O">
    <subfield code="o"></subfield>
  </datafield>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="u">Universités de Neuchâtel et de Genève, Neuchâtel and Genève, Switzerland</subfield>
    <subfield code="a">Simon Gabay</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">CORPUS17: a philological French corpus for 17thcentury</subfield>
  </datafield>
  <datafield tag="540" ind1=" " ind2=" ">
    <subfield code="u"></subfield>
    <subfield code="a">Creative Commons Attribution 4.0 International</subfield>
  </datafield>
  <datafield tag="650" ind1="1" ind2="7">
    <subfield code="a">cc-by</subfield>
    <subfield code="2"></subfield>
  </datafield>
  <datafield tag="520" ind1=" " ind2=" ">
    <subfield code="a">&lt;p&gt;We investigate the creation of a 17th c. French literary corpus. We present the main options regarding available standards, the training data we created and the efficiency of the models produced for OCR, spelling normalization, and lemmatization &amp;ndash; always with open-source solutions. We also present our encoding choices and the global logic of a corpus designed as a virtuous circle, enhancing automatically the tools that are used for its construction.&lt;/p&gt;</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="n">doi</subfield>
    <subfield code="i">continues</subfield>
    <subfield code="a">10.1145/3423603.3424002</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="n">doi</subfield>
    <subfield code="i">isVersionOf</subfield>
    <subfield code="a">10.5281/zenodo.4088668</subfield>
  </datafield>
  <datafield tag="024" ind1=" " ind2=" ">
    <subfield code="a">10.5281/zenodo.4088669</subfield>
    <subfield code="2">doi</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">video</subfield>
  </datafield>
</record>
                    All versions    This version
Views               25              25
Downloads           718             718
Data volume         22.1 GB         22.1 GB
Unique views        23              23
Unique downloads    607             607


Cite as