Dataset Open Access

Webis ChangeMyView Corpus 2020 (Webis-CMV-20)

Al-Khatib, Khalid; Völske, Michael; Syed, Shahbaz; Kolyada, Nikolay; Stein, Benno


JSON Export

{
  "files": [
    {
      "links": {
        "self": "https://zenodo.org/api/files/b82010cf-0103-4216-8474-fa6fe0252f25/author_entity_category.jsonl.bz2"
      }, 
      "checksum": "md5:e41bdc8e1a48b900e89d1b8f55c820a0", 
      "bucket": "b82010cf-0103-4216-8474-fa6fe0252f25", 
      "key": "author_entity_category.jsonl.bz2", 
      "type": "bz2", 
      "size": 1651067128
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b82010cf-0103-4216-8474-fa6fe0252f25/author_liwc.jsonl.bz2"
      }, 
      "checksum": "md5:3c5edd9ceeac9ddbe2d6d9e6695ec006", 
      "bucket": "b82010cf-0103-4216-8474-fa6fe0252f25", 
      "key": "author_liwc.jsonl.bz2", 
      "type": "bz2", 
      "size": 14720534
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b82010cf-0103-4216-8474-fa6fe0252f25/author_subreddit_category.jsonl.bz2"
      }, 
      "checksum": "md5:76ab3db22c805e22847362dcf680911d", 
      "bucket": "b82010cf-0103-4216-8474-fa6fe0252f25", 
      "key": "author_subreddit_category.jsonl.bz2", 
      "type": "bz2", 
      "size": 6040755
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b82010cf-0103-4216-8474-fa6fe0252f25/author_subreddit.jsonl.bz2"
      }, 
      "checksum": "md5:4dd316a68b35ae58ccf4b0d726f31817", 
      "bucket": "b82010cf-0103-4216-8474-fa6fe0252f25", 
      "key": "author_subreddit.jsonl.bz2", 
      "type": "bz2", 
      "size": 7229520
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b82010cf-0103-4216-8474-fa6fe0252f25/pairs.jsonl.bz2"
      }, 
      "checksum": "md5:f68d4129c7063488832f16e927dbfa1d", 
      "bucket": "b82010cf-0103-4216-8474-fa6fe0252f25", 
      "key": "pairs.jsonl.bz2", 
      "type": "bz2", 
      "size": 18093525
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b82010cf-0103-4216-8474-fa6fe0252f25/posts_malleability.jsonl.bz2"
      }, 
      "checksum": "md5:5a413ea0dd5c6ee391b623588ec71e00", 
      "bucket": "b82010cf-0103-4216-8474-fa6fe0252f25", 
      "key": "posts_malleability.jsonl.bz2", 
      "type": "bz2", 
      "size": 426602510
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b82010cf-0103-4216-8474-fa6fe0252f25/threads.jsonl.bz2"
      }, 
      "checksum": "md5:d9cd9aa21bf22d80dc298059af823310", 
      "bucket": "b82010cf-0103-4216-8474-fa6fe0252f25", 
      "key": "threads.jsonl.bz2", 
      "type": "bz2", 
      "size": 660888940
    }
  ], 
  "owners": [
    65747
  ], 
  "doi": "10.5281/zenodo.3778298", 
  "stats": {
    "version_unique_downloads": 4.0, 
    "unique_views": 42.0, 
    "views": 53.0, 
    "version_views": 53.0, 
    "unique_downloads": 4.0, 
    "version_unique_views": 42.0, 
    "volume": 5442919206.0, 
    "version_downloads": 12.0, 
    "downloads": 12.0, 
    "version_volume": 5442919206.0
  }, 
  "links": {
    "doi": "https://doi.org/10.5281/zenodo.3778298", 
    "conceptdoi": "https://doi.org/10.5281/zenodo.3778297", 
    "bucket": "https://zenodo.org/api/files/b82010cf-0103-4216-8474-fa6fe0252f25", 
    "conceptbadge": "https://zenodo.org/badge/doi/10.5281/zenodo.3778297.svg", 
    "html": "https://zenodo.org/record/3778298", 
    "latest_html": "https://zenodo.org/record/3778298", 
    "badge": "https://zenodo.org/badge/doi/10.5281/zenodo.3778298.svg", 
    "latest": "https://zenodo.org/api/records/3778298"
  }, 
  "conceptdoi": "10.5281/zenodo.3778297", 
  "created": "2020-05-08T07:49:38.431745+00:00", 
  "updated": "2020-05-13T20:20:42.137372+00:00", 
  "conceptrecid": "3778297", 
  "revision": 2, 
  "id": 3778298, 
  "metadata": {
    "access_right_category": "success", 
    "doi": "10.5281/zenodo.3778298", 
    "description": "<p>The Webis-CMV-20 dataset comprises all&nbsp;available posts and comments in the <a href=\"https://reddit.com/r/changemyview\">ChangeMyView</a>&nbsp;subreddit&nbsp;from the foundation of the subreddit&nbsp;in 2005, until September 2017. From these, we have derived two sub-datasets for the tasks of persuasiveness prediction, and opinion malleability prediction. In addition, the corpus comprises historical posts by CMV authors, and derived personal characteristics.</p>\n\n<p><strong>Dataset specification</strong></p>\n\n<p>All files are in bzip2-compressed <a href=\"http://jsonlines.org/\">JSON Lines</a> format.</p>\n\n<ul>\n\t<li><strong>threads.jsonl:</strong> contains all the selected discussion threads from CMV</li>\n\t<li><strong>pairs.jsonl:</strong> each record contains submission, delta-comment and nondelta-comment and the comments&#39;&nbsp;similarity score</li>\n\t<li><strong>posts_malleability.jsonl:</strong> contains&nbsp;posts&nbsp;for&nbsp;opinion malleability prediction,&nbsp;in the format provided in the original <a href=\"https://files.pushshift.io/reddit/\">Reddit Crawl</a> dataset</li>\n\t<li><strong>author_entity_category.jsonl:</strong> each record contains the author and a list of Wikipedia entities mentioned by the author in the messages across all subreddits. For each mentioned entity we provide the following data:&nbsp;</li>\n</ul>\n\n<pre><code class=\"language-json\">[title, wikidata_id, wikipedia_page_id, mentioned_entity_title, wikifier_score, subreddit_name, subreddit_id, subreddit_category_name, subreddit_topcategory_name]</code></pre>\n\n<ul>\n\t<li><strong>author_liwc.jsonl:</strong>&nbsp;personality trait features computed with <a href=\"https://liwc.wpengine.com/\">LIWC</a> for the authors from the pairs.jsonl and posts_malleability.jsonl datasets</li>\n\t<li><strong>author_subreddit.jsonl:</strong> for each author, statistics on the number of all posts (submissions/comments) across all subreddits are provided</li>\n\t<li><strong>author_subreddit_category.jsonl:</strong> similar to author_subreddit.jsonl, the statistics of all author posts are grouped by top-categories and categories according to <a href=\"https://snoopsnoo.com/subreddits/\">snoopsnoo.com</a><br>\n\t&nbsp;</li>\n</ul>", 
    "language": "eng", 
    "title": "Webis ChangeMyView Corpus 2020 (Webis-CMV-20)", 
    "license": {
      "id": "CC-BY-4.0"
    }, 
    "relations": {
      "version": [
        {
          "count": 1, 
          "index": 0, 
          "parent": {
            "pid_type": "recid", 
            "pid_value": "3778297"
          }, 
          "is_last": true, 
          "last_child": {
            "pid_type": "recid", 
            "pid_value": "3778298"
          }
        }
      ]
    }, 
    "communities": [
      {
        "id": "webis"
      }
    ], 
    "keywords": [
      "social media", 
      "argumentation", 
      "persuasiveness"
    ], 
    "publication_date": "2020-04-30", 
    "creators": [
      {
        "affiliation": "Bauhaus-Universit\u00e4t Weimar", 
        "name": "Al-Khatib, Khalid"
      }, 
      {
        "orcid": "0000-0002-9283-6846", 
        "affiliation": "Bauhaus-Universit\u00e4t Weimar", 
        "name": "V\u00f6lske, Michael"
      }, 
      {
        "orcid": "0000-0002-4821-1507", 
        "affiliation": "Leipzig University", 
        "name": "Syed, Shahbaz"
      }, 
      {
        "orcid": "0000-0002-6493-9557", 
        "affiliation": "Bauhaus-Universit\u00e4t Weimar", 
        "name": "Kolyada, Nikolay"
      }, 
      {
        "orcid": "0000-0001-9033-2217", 
        "affiliation": "Bauhaus-Universit\u00e4t Weimar", 
        "name": "Stein, Benno"
      }
    ], 
    "access_right": "open", 
    "resource_type": {
      "type": "dataset", 
      "title": "Dataset"
    }, 
    "related_identifiers": [
      {
        "scheme": "doi", 
        "identifier": "10.5281/zenodo.3778297", 
        "relation": "isVersionOf"
      }
    ]
  }
}
53 views
12 downloads
All versions This version
Views 53 53
Downloads 12 12
Data volume 5.4 GB 5.4 GB
Unique views 42 42
Unique downloads 4 4

Share

Cite as