There is a newer version of this record available.

Dataset Open Access

MESINESP2 Corpora: Annotated data for medical semantic indexing in Spanish

Gasco, Luis; Krallinger, Martin


DCAT Export

<?xml version='1.0' encoding='utf-8'?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:dctype="http://purl.org/dc/dcmitype/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:duv="http://www.w3.org/ns/duv#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:frapo="http://purl.org/cerif/frapo/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:gsp="http://www.opengis.net/ont/geosparql#" xmlns:locn="http://www.w3.org/ns/locn#" xmlns:org="http://www.w3.org/ns/org#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <rdf:Description rdf:about="https://doi.org/10.5281/zenodo.4612275">
    <!-- Resource typing: the record is declared both a dcat:Dataset and a DCMI "Dataset" type. -->
    <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
    <dct:type rdf:resource="http://purl.org/dc/dcmitype/Dataset"/>
    <!-- The DOI URL is given as an xsd:anyURI identifier literal and also as the foaf:page of the record. -->
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://doi.org/10.5281/zenodo.4612275</dct:identifier>
    <foaf:page rdf:resource="https://doi.org/10.5281/zenodo.4612275"/>
    <!-- Creator "Gasco, Luis": node identified by an ORCID URL, member of the Barcelona Supercomputing Center. -->
    <dct:creator>
      <!-- Typed node element: RDF/XML shorthand for rdf:Description plus rdf:type foaf:Agent. -->
      <foaf:Agent rdf:about="http://orcid.org/0000-0002-4976-9879">
        <foaf:familyName>Gasco</foaf:familyName>
        <foaf:givenName>Luis</foaf:givenName>
        <foaf:name>Gasco, Luis</foaf:name>
        <!-- Bare ORCID iD, kept as an explicitly typed xsd:string literal. -->
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0002-4976-9879</dct:identifier>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Barcelona Supercomputing Center</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Creator "Krallinger, Martin": node identified by an ORCID URL, member of the Barcelona Supercomputing Center. -->
    <dct:creator>
      <!-- Typed node element: RDF/XML shorthand for rdf:Description plus rdf:type foaf:Agent. -->
      <foaf:Agent rdf:about="http://orcid.org/0000-0002-2646-8782">
        <foaf:familyName>Krallinger</foaf:familyName>
        <foaf:givenName>Martin</foaf:givenName>
        <foaf:name>Krallinger, Martin</foaf:name>
        <!-- Bare ORCID iD, kept as an explicitly typed xsd:string literal. -->
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0002-2646-8782</dct:identifier>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Barcelona Supercomputing Center</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Human-readable title of the dataset. -->
    <dct:title>MESINESP2 Corpora: Annotated data for medical semantic indexing in Spanish</dct:title>
    <!-- Publisher: a blank-node foaf:Agent named "Zenodo". -->
    <dct:publisher>
      <foaf:Agent>
        <foaf:name>Zenodo</foaf:name>
      </foaf:Agent>
    </dct:publisher>
    <!-- Issue date is stated twice in this block: here as a year (xsd:gYear), below as a full xsd:date. -->
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2021</dct:issued>
    <!-- Free-text keywords, one dcat:keyword element per term. -->
    <dcat:keyword>NLP</dcat:keyword>
    <dcat:keyword>clinical NLP</dcat:keyword>
    <dcat:keyword>MESINESP</dcat:keyword>
    <dcat:keyword>BioASQ</dcat:keyword>
    <dcat:keyword>Indexing</dcat:keyword>
    <dcat:keyword>Semantic Indexing</dcat:keyword>
    <!-- Full issue date (xsd:date). -->
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2021-03-17</dct:issued>
    <!-- Language given as an IRI from the publications.europa.eu language authority list (code SPA). -->
    <dct:language rdf:resource="http://publications.europa.eu/resource/authority/language/SPA"/>
    <!-- The zenodo.org record URL is declared owl:sameAs the resource being described. -->
    <owl:sameAs rdf:resource="https://zenodo.org/record/4612275"/>
    <!-- Alternative identifier: the zenodo.org record URL as an xsd:anyURI skos:notation, with scheme agency "url". -->
    <adms:identifier>
      <adms:Identifier>
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/4612275</skos:notation>
        <adms:schemeAgency>url</adms:schemeAgency>
      </adms:Identifier>
    </adms:identifier>
    <!-- Links to a different DOI (ending in 4612274) that this record is a version of. -->
    <dct:isVersionOf rdf:resource="https://doi.org/10.5281/zenodo.4612274"/>
    <!-- The record is part of the "medicalnlp" collection under zenodo.org/communities. -->
    <dct:isPartOf rdf:resource="https://zenodo.org/communities/medicalnlp"/>
    <!-- Version label of this record. -->
    <owl:versionInfo>1.0.0</owl:versionInfo>
    <dct:description>&lt;p&gt;Annotated corpora for MESINESP2 shared-task (Spanish BioASQ track, see &lt;a href="https://temu.bsc.es/mesinesp2"&gt;https://temu.bsc.es/mesinesp2&lt;/a&gt;). BioASQ 2021 will be held at CLEF 2021 (scheduled in Bucharest, Romania in September)&amp;nbsp;&lt;a href="http://clef2021.clef-initiative.eu/"&gt;http://clef2021.clef-initiative.eu/&amp;nbsp;&lt;/a&gt;&lt;/p&gt; &lt;p&gt;&lt;strong&gt;Introduction:&lt;/strong&gt;&lt;br&gt; These corpora contain the data for each of the sub-tracks of MESINESP2 shared-task:&lt;/p&gt; &lt;ul&gt; &lt;li&gt;&lt;strong&gt;Track 1- Medical indexing&lt;/strong&gt;: &amp;nbsp; &lt;ul&gt; &lt;li&gt;&lt;em&gt;&lt;strong&gt;Training set: &lt;/strong&gt;&lt;/em&gt;It contains all Spanish records from LILACS and IBECS databases at the Virtual Health Library (VHL) with non-empty abstract written in Spanish.&amp;nbsp;We have filtered out empty abstracts and non-Spanish abstracts.&amp;nbsp;&amp;nbsp;We have built the training dataset with the data crawled on 01/29/2021. This means that the data is a snapshot of that moment and that may change over time since LILACS and IBECS usually add or modify indexes after the first inclusion in the database.&amp;nbsp;We distribute two different datasets: &lt;ul&gt; &lt;li&gt;&lt;strong&gt;Articles training set:&amp;nbsp;&lt;/strong&gt;This corpus contains the set of 237574 Spanish scientific papers in VHL that have at least one DeCS code assigned to them.&lt;/li&gt; &lt;li&gt;&lt;strong&gt;Full training set&lt;/strong&gt;: This corpus contains the whole set of 249474 Spanish documents from VHL that have at least one DeCS code assigned to them.&lt;/li&gt; &lt;/ul&gt; &lt;/li&gt; &lt;li&gt;&lt;strong&gt;Development set:&amp;nbsp;&lt;/strong&gt;We provide a development set manually indexed by expert annotators. This dataset includes 1065 articles annotated with DeCS by three expert indexers in this controlled vocabulary. 
The articles were initially indexed by 7 annotators, after analyzing the Inter-Annotator Agreement among their annotations we decided to select the 3 best ones, considering their annotations the valid ones to build the test set. From those 1065 records: &lt;ul&gt; &lt;li&gt;213 articles were annotated by more than one annotator. We have selected the union of the annotations.&lt;/li&gt; &lt;li&gt;852 articles were annotated by only one of the three selected annotators with better performance.&lt;/li&gt; &lt;/ul&gt; &lt;/li&gt; &lt;li&gt;&lt;strong&gt;Test set:&amp;nbsp;&lt;/strong&gt;To be published&amp;nbsp;&lt;/li&gt; &lt;/ul&gt; &lt;/li&gt; &lt;li&gt;&lt;strong&gt;Track 2- Clinical trials&lt;/strong&gt;: &amp;nbsp;&lt;br&gt; &lt;ul&gt; &lt;li&gt;&lt;strong&gt;Training set:&amp;nbsp;&lt;/strong&gt;The training dataset contains records from&amp;nbsp;&lt;a href="https://reec.aemps.es/reec/public/web.html"&gt;Registro Espa&amp;ntilde;ol de Estudios Cl&amp;iacute;nicos (REEC)&lt;/a&gt;. REEC doesn&amp;#39;t&amp;nbsp;provide documents with the structure title/abstract needed in BioASQ, for that reason we have built artificial abstracts based on the content available in the data crawled using the REEC&amp;nbsp;&lt;a href="https://github.com/luisgasco/REECapi"&gt;API&lt;/a&gt;.&amp;nbsp;Clinical trials are not indexed with DeCS terminology, we have used as training data a set of 3592 clinical trials that were automatically annotated in the first edition of MESINESP and that were published as a&amp;nbsp;&lt;a href="https://zenodo.org/record/3946558#.YFHyhZ1KiUk"&gt;Silver Standard outcome&lt;/a&gt;. Because the performance of the models used by the participants was variable, we have only selected predictions from runs with a MiF higher than 0.30, which corresponds with the submission of the best three teams. 
We have selected the union of all codes assigned by those teams.&lt;/li&gt; &lt;li&gt;&lt;strong&gt;Development set: &lt;/strong&gt;We provide a development set manually indexed by expert annotators. This dataset includes 147 clinical trials annotated with DeCS by seven expert indexers in this controlled vocabulary.&lt;/li&gt; &lt;/ul&gt; &lt;/li&gt; &lt;li&gt;&lt;strong&gt;Track 3- Patents:&amp;nbsp;&lt;/strong&gt;To be published&lt;/li&gt; &lt;/ul&gt; &lt;p&gt;&lt;strong&gt;Files structure:&lt;/strong&gt;&lt;/p&gt; &lt;p&gt;&lt;strong&gt;MESINESP2_corpus.zip&lt;/strong&gt; contains the corpora generated for the shared task. Content:&lt;/p&gt; &lt;ul&gt; &lt;li&gt;Subtrack1: &lt;ul&gt; &lt;li&gt;Train &lt;ul&gt; &lt;li&gt;training_set_track1_all.json: Full training set for sub-track 1.&lt;/li&gt; &lt;li&gt;training_set_track1_only_articles.json:&amp;nbsp;Articles training set for sub-track 1.&lt;/li&gt; &lt;/ul&gt; &lt;/li&gt; &lt;li&gt;Test &lt;ul&gt; &lt;li&gt;development_set_subtrack1.json: Manually annotated&amp;nbsp;development set for sub-track 1.&lt;/li&gt; &lt;/ul&gt; &lt;/li&gt; &lt;/ul&gt; &lt;/li&gt; &lt;li&gt;Subtrack2: &lt;ul&gt; &lt;li&gt;Train &lt;ul&gt; &lt;li&gt;training_set_subtrack2.json: Training set for sub-track 2.&lt;/li&gt; &lt;/ul&gt; &lt;/li&gt; &lt;li&gt;Test &lt;ul&gt; &lt;li&gt;development_set_subtrack2.json:&amp;nbsp;Manually annotated&amp;nbsp;development set for sub-track 2.&lt;/li&gt; &lt;/ul&gt; &lt;/li&gt; &lt;/ul&gt; &lt;/li&gt; &lt;li&gt;Subtrack3: This folder is empty. 
Data for sub-track&amp;nbsp;3 will be published soon.&lt;/li&gt; &lt;/ul&gt; &lt;p&gt;&amp;nbsp;&lt;/p&gt; &lt;p&gt;&lt;strong&gt;DeCS2020.tsv&lt;/strong&gt; contains a DeCS table with the following structure:&lt;/p&gt; &lt;ul&gt; &lt;li&gt;DeCS code&lt;/li&gt; &lt;li&gt;Preferred descriptor (the preferred label in the &lt;code&gt;Latin Spanish DeCS&amp;nbsp;&lt;/code&gt;2020 set)&lt;/li&gt; &lt;li&gt;List of synonyms (the descriptors and synonyms from&amp;nbsp;&lt;code&gt;Latin Spanish DeCS 2020, separated by pipes)&lt;/code&gt;&lt;/li&gt; &lt;/ul&gt; &lt;p&gt;&amp;nbsp;&lt;/p&gt; &lt;p&gt;&lt;strong&gt;DeCS2020.obo&amp;nbsp;&lt;/strong&gt;contains the *.obo file with the hierarchical relationships between DeCS descriptors.&lt;/p&gt; &lt;p&gt;&amp;nbsp;&lt;/p&gt; &lt;p&gt;&amp;nbsp;&lt;/p&gt; &lt;p&gt;For further information, please visit&amp;nbsp;&lt;a href="https://temu.bsc.es/mesinesp2/"&gt;https://temu.bsc.es/mesinesp2/&lt;/a&gt;&amp;nbsp;or email us at encargo-pln-life@bsc.es&lt;/p&gt;</dct:description>
    <!-- Second, plain-text dct:description carrying the funding acknowledgement. -->
    <dct:description>Funded by the Plan de Impulso de las Tecnologías del Lenguaje (Plan TL).</dct:description>
    <!-- Access rights are stated twice: as the PUBLIC value of the publications.europa.eu access-right authority list, -->
    <!-- and as an info:eu-repo/semantics/openAccess rights statement labelled "Open Access". -->
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <dct:accessRights>
      <dct:RightsStatement rdf:about="info:eu-repo/semantics/openAccess">
        <rdfs:label>Open Access</rdfs:label>
      </dct:RightsStatement>
    </dct:accessRights>
    <!-- Distribution that carries the licence (Creative Commons BY 4.0 legal code) and uses the DOI as its access URL. -->
    <!-- Both values are IRI references (rdf:resource), not literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dct:license rdf:resource="https://creativecommons.org/licenses/by/4.0/legalcode"/>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4612275"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- Per-file distributions. dcat:accessURL and dcat:downloadURL are resource-valued properties in DCAT, -->
    <!-- so they are written as IRI references (rdf:resource) rather than as plain string literals. -->
    <!-- File 1: DeCS2020.obo (no dcat:mediaType is stated for this file). -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4612275"/>
        <dcat:byteSize>21364231</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4612275/files/DeCS2020.obo"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File 2: DeCS2020.tsv, tab-separated values. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4612275"/>
        <dcat:byteSize>8058001</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4612275/files/DeCS2020.tsv"/>
        <dcat:mediaType>text/tab-separated-values</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File 3: the corpus archive; the file name in the URL is kept exactly as exported. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4612275"/>
        <dcat:byteSize>255195699</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4612275/files/MESINESP2_corpus.zip.zip"/>
        <dcat:mediaType>application/zip</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
</rdf:RDF>
2,171
481
views
downloads
All versions | This version
Views: 2,171 | 131
Downloads: 481 | 35
Data volume: 37.4 GB | 4.1 GB
Unique views: 1,617 | 108
Unique downloads: 187 | 15

Share

Cite as