Dataset Open Access

Webis-TLDR-17 Corpus

Syed, Shahbaz; Voelske, Michael; Potthast, Martin; Stein, Benno


DCAT Export

<?xml version='1.0' encoding='utf-8'?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:dctype="http://purl.org/dc/dcmitype/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:duv="http://www.w3.org/ns/duv#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:frapo="http://purl.org/cerif/frapo/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:gsp="http://www.opengis.net/ont/geosparql#" xmlns:locn="http://www.w3.org/ns/locn#" xmlns:org="http://www.w3.org/ns/org#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <rdf:Description rdf:about="https://doi.org/10.5281/zenodo.1043504">
    <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
    <dct:type rdf:resource="http://purl.org/dc/dcmitype/Dataset"/>
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://doi.org/10.5281/zenodo.1043504</dct:identifier>
    <foaf:page rdf:resource="https://doi.org/10.5281/zenodo.1043504"/>
    <!-- Creators of the dataset. Each is a foaf:Agent node carrying the full name,
         its given/family parts, and an org:memberOf affiliation. -->
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Syed, Shahbaz</foaf:name>
        <foaf:givenName>Shahbaz</foaf:givenName>
        <foaf:familyName>Syed</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Bauhaus-Universität Weimar</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Voelske, Michael</foaf:name>
        <foaf:givenName>Michael</foaf:givenName>
        <foaf:familyName>Voelske</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Bauhaus-Universität Weimar</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Potthast, Martin</foaf:name>
        <foaf:givenName>Martin</foaf:givenName>
        <foaf:familyName>Potthast</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Bauhaus-Universität Weimar</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Stein, Benno</foaf:name>
        <foaf:givenName>Benno</foaf:givenName>
        <foaf:familyName>Stein</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Bauhaus-Universität Weimar</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:title>Webis-TLDR-17 Corpus</dct:title>
    <dct:publisher>
      <foaf:Agent>
        <foaf:name>Zenodo</foaf:name>
      </foaf:Agent>
    </dct:publisher>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2017</dct:issued>
    <dcat:keyword>tl;dr</dcat:keyword>
    <dcat:keyword>Abstractive Summarization</dcat:keyword>
    <dcat:keyword>Social Media Dataset</dcat:keyword>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2017-11-07</dct:issued>
    <dct:language rdf:resource="http://publications.europa.eu/resource/authority/language/ENG"/>
    <owl:sameAs rdf:resource="https://zenodo.org/record/1043504"/>
    <adms:identifier>
      <adms:Identifier>
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/1043504</skos:notation>
        <adms:schemeAgency>url</adms:schemeAgency>
      </adms:Identifier>
    </adms:identifier>
    <foaf:page rdf:resource="https://www.uni-weimar.de/en/media/chairs/computer-science-and-media/webis/corpora/corpus-webis-tldr-17/"/>
    <dct:isVersionOf rdf:resource="https://doi.org/10.5281/zenodo.1043503"/>
    <dct:isPartOf rdf:resource="https://zenodo.org/communities/webis"/>
    <dct:description>&lt;p&gt;This corpus contains preprocessed posts from the Reddit dataset, suitable for abstractive summarization using deep learning. The format is a json file where each line is a JSON object representing a post. The schema of each post is shown below:&lt;/p&gt; &lt;ul&gt; &lt;li&gt;author: string (nullable = true)&lt;/li&gt; &lt;li&gt;body: string (nullable = true)&lt;/li&gt; &lt;li&gt;normalizedBody: string (nullable = true)&lt;/li&gt; &lt;li&gt;content: string (nullable = true)&lt;/li&gt; &lt;li&gt;content_len: long (nullable = true)&lt;/li&gt; &lt;li&gt;summary: string (nullable = true)&lt;/li&gt; &lt;li&gt;summary_len: long (nullable = true)&lt;/li&gt; &lt;li&gt;id: string (nullable = true)&lt;/li&gt; &lt;li&gt;subreddit: string (nullable = true)&lt;/li&gt; &lt;li&gt;subreddit_id: string (nullable = true)&lt;/li&gt; &lt;li&gt;title: string (nullable = true)&lt;/li&gt; &lt;/ul&gt; &lt;p&gt;Specifically, the &lt;strong&gt;content&lt;/strong&gt; and &lt;strong&gt;summary&lt;/strong&gt; fields can be directly used as inputs to a deep learning model (e.g. Sequence to Sequence model ). The dataset consists of 3,848,330 posts with an average length of 270 words for content, and 28 words for the summary. The dataset is a combination of both the Submissions and Comments merged on the common schema. As a result, most of the comments which do not belong to any submission have &lt;strong&gt;null&lt;/strong&gt; as their title.&lt;/p&gt; &lt;p&gt;&lt;strong&gt;Note :&amp;nbsp;&lt;/strong&gt;This corpus does not contain a separate test set. Thus it is up to the users to divide the corpus into appropriate training, validation and test sets.&lt;br&gt; &lt;br&gt; &amp;nbsp;&lt;/p&gt;</dct:description>
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <dct:accessRights>
      <dct:RightsStatement rdf:about="info:eu-repo/semantics/openAccess">
        <rdfs:label>Open Access</rdfs:label>
      </dct:RightsStatement>
    </dct:accessRights>
    <dcat:distribution>
      <dcat:Distribution>
        <dct:license rdf:resource="https://creativecommons.org/licenses/by/4.0/legalcode"/>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.1043504"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File-level distribution: the downloadable ZIP archive of the corpus.
         URL-valued properties are expressed as rdf:resource IRIs rather than string
         literals, matching the dcat:accessURL form used by the other distribution of
         this dataset; byteSize carries an explicit XSD datatype and mediaType points
         at the IANA media-type IRI. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.1043504"/>
        <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#nonNegativeInteger">3141854161</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/1043504/files/corpus-webis-tldr-17.zip"/>
        <dcat:mediaType rdf:resource="https://www.iana.org/assignments/media-types/application/zip"/>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
</rdf:RDF>
1,104
1,147
views
downloads
                  All versions   This version
Views             1,104          1,106
Downloads         1,147          1,147
Data volume       3.6 TB         3.6 TB
Unique views      993            995
Unique downloads  887            887

Share

Cite as