Dataset Open Access

Swedish Test Data for SemEval 2020 Task 1: Unsupervised Lexical Semantic Change Detection

Tahmasebi, Nina; Hengchen, Simon; Schlechtweg, Dominik; McGillivray, Barbara; Dubossarsky, Haim


DCAT Export

<?xml version='1.0' encoding='utf-8'?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:dctype="http://purl.org/dc/dcmitype/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:duv="http://www.w3.org/ns/duv#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:frapo="http://purl.org/cerif/frapo/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:gsp="http://www.opengis.net/ont/geosparql#" xmlns:locn="http://www.w3.org/ns/locn#" xmlns:org="http://www.w3.org/ns/org#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <rdf:Description rdf:about="https://doi.org/10.5281/zenodo.3730550">
    <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
    <dct:type rdf:resource="http://purl.org/dc/dcmitype/Dataset"/>
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://doi.org/10.5281/zenodo.3730550</dct:identifier>
    <foaf:page rdf:resource="https://doi.org/10.5281/zenodo.3730550"/>
    <!-- Creators of the dataset, one dct:creator per person. Each person is a
         foaf:Agent carrying the full display name, the split given/family
         names, and the organization they are a member of. -->
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Tahmasebi, Nina</foaf:name>
        <foaf:givenName>Nina</foaf:givenName>
        <foaf:familyName>Tahmasebi</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Språkbanken, University of Gothenburg</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Hengchen, Simon</foaf:name>
        <foaf:givenName>Simon</foaf:givenName>
        <foaf:familyName>Hengchen</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>University of Helsinki</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Schlechtweg, Dominik</foaf:name>
        <foaf:givenName>Dominik</foaf:givenName>
        <foaf:familyName>Schlechtweg</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>IMS, University of Stuttgart</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:creator>
      <foaf:Agent>
        <foaf:name>McGillivray, Barbara</foaf:name>
        <foaf:givenName>Barbara</foaf:givenName>
        <foaf:familyName>McGillivray</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>The Alan Turing Institute</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Dubossarsky, Haim</foaf:name>
        <foaf:givenName>Haim</foaf:givenName>
        <foaf:familyName>Dubossarsky</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>University of Cambridge</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Core descriptive properties of the record: title, publisher, issue
         dates, keywords, language, alternate identifiers and versioning. -->
    <dct:title>Swedish Test Data for SemEval 2020 Task 1: Unsupervised Lexical Semantic Change Detection</dct:title>
    <dct:publisher>
      <foaf:Agent>
        <foaf:name>Zenodo</foaf:name>
      </foaf:Agent>
    </dct:publisher>
    <!-- The issue date is stated twice: once as a year, once as a full date. -->
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2020</dct:issued>
    <!-- One dcat:keyword per tag, so each keyword is an individually
         addressable value rather than a comma-separated list in one literal. -->
    <dcat:keyword>unsupervised lexical semantic change detection</dcat:keyword>
    <dcat:keyword>semantic change</dcat:keyword>
    <dcat:keyword>SemEval2020</dcat:keyword>
    <dcat:keyword>Kubhist2</dcat:keyword>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2020-02-19</dct:issued>
    <dct:language rdf:resource="http://publications.europa.eu/resource/authority/language/SWE"/>
    <!-- The Zenodo landing page identifies the same resource as the DOI. -->
    <owl:sameAs rdf:resource="https://zenodo.org/record/3730550"/>
    <adms:identifier>
      <adms:Identifier>
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/3730550</skos:notation>
        <adms:schemeAgency>url</adms:schemeAgency>
      </adms:Identifier>
    </adms:identifier>
    <dct:isVersionOf rdf:resource="https://doi.org/10.5281/zenodo.3672949"/>
    <dct:isPartOf rdf:resource="https://zenodo.org/communities/natural-language-processing"/>
    <owl:versionInfo>v2</owl:versionInfo>
    <dct:description>&lt;p&gt;This data collection contains the Swedish test data for &lt;a href="https://competitions.codalab.org/competitions/20948"&gt;SemEval 2020 Task 1: Unsupervised Lexical Semantic Change Detection:&lt;/a&gt;&lt;/p&gt; &lt;p&gt;- a Swedish text corpus pair (`corpus1/`, `corpus2/`)&lt;br&gt; - 31 lemmas which have been annotated for their lexical semantic change between the two corpora (`targets.txt`)&lt;br&gt; - the annotated binary change scores of the targets for subtask 1, and their annotated graded change scores for subtask 2 (`truth/`)&lt;/p&gt; &lt;p&gt;We sample from the KubHist2 corpus, digitized by the National Library of Sweden, and available through the Spr&amp;aring;kbanken corpus infrastructure Korp (&lt;a href="https://www.researchgate.net/profile/Markus_Forsberg/publication/266352576_Korp_-_the_corpus_infrastructure_of_Sprakbanken/links/55bf1ee008aed621de121ba3/Korp-the-corpus-infrastructure-of-Sprakbanken.pdf"&gt;Borin et al., 2012&lt;/a&gt;). The full corpus is available through a CC BY (attribution) license. Each word for which the lemmatizer in the Korp pipeline has found a lemma is replaced with the lemma. In cases where the lemmatizer cannot find a lemma, we leave the word as is (i.e., unlemmatized, no lower-casing). KubHist contains very frequent OCR errors, especially for the older data. More detail about the properties and quality of the Kubhist corpus can be found in (&lt;a href="https://www.diva-portal.org/smash/get/diva2:1358014/FULLTEXT01.pdf#page=28"&gt;Adesam et al., 2019&lt;/a&gt;).&lt;/p&gt; &lt;p&gt;Lars Borin, Markus Forsberg, and Johan Roxendal. &amp;quot;Korp-the corpus infrastructure of Spr&amp;aring;kbanken.&amp;quot; &lt;em&gt;LREC&lt;/em&gt;. 2012.&lt;/p&gt; &lt;p&gt;Adesam, Yvonne, Dana Dann&amp;eacute;lls, and Nina Tahmasebi. &amp;quot;Exploring the Quality of the Digital Historical Newspaper Archive KubHist.&amp;quot; &lt;em&gt;DHN&lt;/em&gt;. 
2019.&lt;/p&gt; &lt;p&gt;__Corpus 1__&lt;/p&gt; &lt;p&gt;- based on: &lt;a href="https://spraakbanken.gu.se/korp/?mode=kubhist"&gt;Kubhist2&lt;/a&gt;&lt;br&gt; - language: Swedish&lt;br&gt; - time covered: 1790-1830&lt;br&gt; - size: ~71 million tokens&lt;br&gt; - format: lemmatized, sentence length &amp;gt; 9 (before removal of punctuation), no punctuation, sentences randomly shuffled&lt;br&gt; - encoding: UTF-8&lt;br&gt; - note: contains frequent OCR errors&lt;/p&gt; &lt;p&gt;__Corpus 2__&lt;/p&gt; &lt;p&gt;- based on:&amp;nbsp;&lt;a href="https://spraakbanken.gu.se/korp/?mode=kubhist"&gt;Kubhist2&lt;/a&gt;&lt;br&gt; - language: Swedish&lt;br&gt; - time covered: 1895-1903&lt;br&gt; - size: ~111 million tokens&lt;br&gt; - format: lemmatized, sentence length &amp;gt; 9 (before removal of punctuation), no punctuation, sentences randomly shuffled&lt;br&gt; - encoding: UTF-8&lt;br&gt; - note: contains OCR errors&lt;/p&gt; &lt;p&gt;Besides the official lemma version of the corpora for SemEval-2020 Task 1 we also provide the raw token version (`corpus1/token/`, `corpus2/token/`). It contains the raw sentences in the same order as in the lemma version. Find more information on the data and SemEval-2020 Task 1 in the paper referenced below.&lt;/p&gt; &lt;p&gt;&amp;nbsp;&lt;/p&gt; &lt;p&gt;Reference:&lt;/p&gt; &lt;p&gt;Dominik Schlechtweg, Barbara McGillivray, Simon Hengchen, Haim Dubossarsky and Nina Tahmasebi. &lt;a href="https://competitions.codalab.org/competitions/20948"&gt;SemEval 2020 Task 1: Unsupervised Lexical Semantic Change Detection&lt;/a&gt;. To appear in SemEval@COLING2020.&lt;/p&gt;</dct:description>
    <dct:description>The creation of the data was supported by the project Towards Computational Lexical Semantic Change Detection funded by a project grant from the Swedish Research Council (2019–2022; dnr 2018-01184). It has also been created as part of the effort to construct and develop a Swedish national research infrastructure in support of research based on language data. This infrastructure -- Nationella språkbanken (the Swedish National Language Bank) -- is jointly funded for the period 2018--2024 by the Swedish Research Council (grant number 2017-00626) and its 10 partner institutions.</dct:description>
    <dct:description>{"references": ["Dominik Schlechtweg, Barbara McGillivray, Simon Hengchen, Haim Dubossarsky and Nina Tahmasebi. SemEval 2020 Task 1: Unsupervised Lexical Semantic Change Detection. To appear in SemEval@COLING2020."]}</dct:description>
    <!-- Access rights are stated twice: first as a reference to the
         publications.europa.eu access-right authority entry PUBLIC, then as an
         inline dct:RightsStatement identified by the info:eu-repo openAccess
         term and labelled "Open Access". -->
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <dct:accessRights>
      <dct:RightsStatement rdf:about="info:eu-repo/semantics/openAccess">
        <rdfs:label>Open Access</rdfs:label>
      </dct:RightsStatement>
    </dct:accessRights>
    <!-- Distribution 1: the licensed landing page, reached through the DOI. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dct:license rdf:resource="https://creativecommons.org/licenses/by/2.0/legalcode"/>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.3730550"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- Distribution 2: the downloadable zip archive. accessURL and
         downloadURL are given as resources (rdf:resource), matching the first
         distribution, so consumers see IRIs rather than plain string
         literals. byteSize carries an explicit numeric datatype. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.3730550"/>
        <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#nonNegativeInteger">1002486930</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/3730550/files/semeval2020_ulscd_swe.zip"/>
        <dcat:mediaType>application/zip</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
</rdf:RDF>
775 views
1,016 downloads
Statistic | All versions | This version
Views | 775 | 187
Downloads | 1,016 | 94
Data volume | 502.4 GB | 94.2 GB
Unique views | 687 | 155
Unique downloads | 600 | 81

Share

Cite as