{ "access": { "embargo": { "active": false, "reason": null }, "files": "restricted", "record": "public", "status": "restricted" }, "created": "2022-02-14T08:26:03.465027+00:00", "custom_fields": {}, "deletion_status": { "is_deleted": false, "status": "P" }, "files": { "enabled": true }, "id": "5996864", "is_draft": false, "is_published": true, "links": { "access": "https://zenodo.org/api/records/5996864/access", "access_links": "https://zenodo.org/api/records/5996864/access/links", "access_request": "https://zenodo.org/api/records/5996864/access/request", "access_users": "https://zenodo.org/api/records/5996864/access/users", "archive": "https://zenodo.org/api/records/5996864/files-archive", "archive_media": "https://zenodo.org/api/records/5996864/media-files-archive", "communities": "https://zenodo.org/api/records/5996864/communities", "communities-suggestions": "https://zenodo.org/api/records/5996864/communities-suggestions", "doi": "https://doi.org/10.5281/zenodo.5996864", "draft": "https://zenodo.org/api/records/5996864/draft", "files": "https://zenodo.org/api/records/5996864/files", "latest": "https://zenodo.org/api/records/5996864/versions/latest", "latest_html": "https://zenodo.org/records/5996864/latest", "media_files": "https://zenodo.org/api/records/5996864/media-files", "parent": "https://zenodo.org/api/records/5996863", "parent_doi": "https://zenodo.org/doi/10.5281/zenodo.5996863", "parent_html": "https://zenodo.org/records/5996863", "requests": "https://zenodo.org/api/records/5996864/requests", "reserve_doi": "https://zenodo.org/api/records/5996864/draft/pids/doi", "self": "https://zenodo.org/api/records/5996864", "self_doi": "https://zenodo.org/doi/10.5281/zenodo.5996864", "self_html": "https://zenodo.org/records/5996864", "self_iiif_manifest": "https://zenodo.org/api/iiif/record:5996864/manifest", "self_iiif_sequence": "https://zenodo.org/api/iiif/record:5996864/sequence/default", "versions": "https://zenodo.org/api/records/5996864/versions" }, 
"media_files": { "enabled": false }, "metadata": { "creators": [ { "affiliations": [ { "name": "Kempelen Institute of Intelligent Technologies" } ], "person_or_org": { "family_name": "Ivan Srba", "identifiers": [ { "identifier": "0000-0003-3511-5337", "scheme": "orcid" } ], "name": "Ivan Srba", "type": "personal" } }, { "affiliations": [ { "name": "Faculty of Information Technology, Brno University of Technology; Kempelen Institute of Intelligent Technologies" } ], "person_or_org": { "family_name": "Branislav Pecher", "identifiers": [ { "identifier": "0000-0003-0344-8620", "scheme": "orcid" } ], "name": "Branislav Pecher", "type": "personal" } }, { "affiliations": [ { "name": "Kempelen Institute of Intelligent Technologies" } ], "person_or_org": { "family_name": "Matus Tomlein", "identifiers": [ { "identifier": "0000-0002-9960-700X", "scheme": "orcid" } ], "name": "Matus Tomlein", "type": "personal" } }, { "affiliations": [ { "name": "Kempelen Institute of Intelligent Technologies" } ], "person_or_org": { "family_name": "Robert Moro", "identifiers": [ { "identifier": "0000-0002-3052-8290", "scheme": "orcid" } ], "name": "Robert Moro", "type": "personal" } }, { "affiliations": [ { "name": "Kempelen Institute of Intelligent Technologies" } ], "person_or_org": { "family_name": "Elena Stefancova", "identifiers": [ { "identifier": "0000-0001-8683-939X", "scheme": "orcid" } ], "name": "Elena Stefancova", "type": "personal" } }, { "affiliations": [ { "name": "Kempelen Institute of Intelligent Technologies" } ], "person_or_org": { "family_name": "Jakub Simko", "identifiers": [ { "identifier": "0000-0003-0239-4237", "scheme": "orcid" } ], "name": "Jakub Simko", "type": "personal" } }, { "affiliations": [ { "name": "Kempelen Institute of Intelligent Technologies" } ], "person_or_org": { "family_name": "Maria Bielikova", "identifiers": [ { "identifier": "0000-0003-4105-3494", "scheme": "orcid" } ], "name": "Maria Bielikova", "type": "personal" } } ], "description": "
Overview
\n\nThis dataset of medical misinformation was collected and is published by Kempelen Institute of Intelligent Technologies (KInIT). It consists of approx. 317k news articles and blog posts on medical topics published between January 1, 1998 and February 1, 2022 from a total of 207 reliable and unreliable sources. The dataset contains full-texts of the articles, their original source URL and other extracted metadata. If a source has a credibility score available (e.g., from Media Bias/Fact Check), it is also included in the form of annotation. Besides the articles, the dataset contains around 3.5k fact-checks and extracted verified medical claims with their unified veracity ratings published by fact-checking organisations such as Snopes or FullFact. Lastly and most importantly, the dataset contains 573 manually and more than 51k automatically labelled mappings between previously verified claims and the articles; mappings consist of two values: claim presence (i.e., whether a claim is contained in the given article) and article stance (i.e., whether the given article supports or rejects the claim or provides both sides of the argument).
\n\nThe dataset is primarily intended to be used as a training and evaluation set for machine learning methods for claim presence detection and article stance classification, but it enables a range of other misinformation related tasks, such as misinformation characterisation or analyses of misinformation spreading.
\n\nIts novelty and our main contributions lie in (1) focus on medical news articles and blog posts as opposed to social media posts or political discussions; (2) providing multiple modalities (besides full-texts of the articles, there are also images and videos), thus enabling research of multimodal approaches; (3) mapping of the articles to the fact-checked claims (with manual as well as predicted labels); (4) providing source credibility labels for 95% of all articles and other potential sources of weak labels that can be mined from the articles' content and metadata.
\n\nThe dataset is associated with the research paper "Monant Medical Misinformation Dataset: Mapping Articles to Fact-Checked Claims" accepted and presented at ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR '22).
\n\nThe accompanying GitHub repository provides a small static sample of the dataset and the dataset's descriptive analysis in the form of Jupyter notebooks.
\n\n\n\n
Options to access the dataset
\n\nThere are two ways to get access to the dataset:
\n\n1. Static dump of the dataset available in the CSV format
\n2. Continuously updated dataset available via REST API
In order to obtain access to the dataset (either to the full static dump or to the REST API), please request access by following the instructions provided below.
\n\nReferences
\n\nIf you use this dataset in any publication, project, tool or in any other form, please cite the following papers:
\n\n@inproceedings{SrbaMonantPlatform,\n author = {Srba, Ivan and Moro, Robert and Simko, Jakub and Sevcech, Jakub and Chuda, Daniela and Navrat, Pavol and Bielikova, Maria},\n booktitle = {Proceedings of Workshop on Reducing Online Misinformation Exposure (ROME 2019)},\n pages = {1--7},\n title = {Monant: Universal and Extensible Platform for Monitoring, Detection and Mitigation of Antisocial Behavior},\n year = {2019}\n}
\n\n@inproceedings{SrbaMonantMedicalDataset,\n author = {Srba, Ivan and Pecher, Branislav and Tomlein, Matus and Moro, Robert and Stefancova, Elena and Simko, Jakub and Bielikova, Maria},\n booktitle = {Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR '22)},\n numpages = {11},\n title = {Monant Medical Misinformation Dataset: Mapping Articles to Fact-Checked Claims},\n year = {2022},\n doi = {10.1145/3477495.3531726},\n publisher = {Association for Computing Machinery},\n address = {New York, NY, USA},\n url = {https://doi.org/10.1145/3477495.3531726},\n}\n
\n\n
\nDataset creation process
In order to create this dataset (and to continuously obtain new data), we used our research platform Monant. The Monant platform provides so-called data providers to extract news articles/blogs from news/blog sites as well as fact-checking articles from fact-checking sites. General parsers (from RSS feeds, Wordpress sites, Google Fact Check Tool, etc.) as well as custom crawlers and parsers were implemented (e.g., for the fact-checking site Snopes.com). All data is stored in a unified format in a central data storage.
\n
\n
\nEthical considerations
The dataset was collected and is published for research purposes only. We collected only publicly available content of news/blog articles. The dataset contains identities of authors of the articles if they were stated in the original source; we left this information, since the presence of an author's name can be a strong credibility indicator. However, we anonymised the identities of the authors of discussion posts included in the dataset.
\n\nThe main identified ethical issue related to the presented dataset lies in the risk of mislabelling of an article as supporting a false fact-checked claim and, to a lesser extent, in mislabelling an article as not containing a false claim or not supporting it when it actually does. To minimise these risks, we developed a labelling methodology and require an agreement of at least two independent annotators to assign a claim presence or article stance label to an article. It is also worth noting that we do not label an article as a whole as false or true. Nevertheless, we provide partial article-claim pair veracities based on the combination of claim presence and article stance labels.
\n\nAs to the veracity labels of the fact-checked claims and the credibility (reliability) labels of the articles' sources, we take these from the fact-checking sites and external listings such as Media Bias/Fact Check as they are and refer to their methodologies for more details on how they were established.
\n\nLastly, the dataset also contains automatically predicted labels of claim presence and article stance using our baselines described in the next section. These methods have their limitations and work with certain accuracy as reported in this paper. This should be taken into account when interpreting them.
\n
\n
\nReporting mistakes in the dataset
\nConsiderable mistakes in the raw collected data or in the manual annotations can be reported by creating a new issue in the accompanying GitHub repository. Alternatively, general enquiries or requests can be sent to info [at] kinit.sk.
\nDataset structure
Raw data
\n\nFirst, the dataset contains so-called raw data (i.e., data extracted by the Web monitoring module of the Monant platform and stored in exactly the same form as they appear on the original websites). Raw data consist of articles from news sites and blogs (e.g. naturalnews.com), discussions attached to such articles, and fact-checking articles from fact-checking portals (e.g. snopes.com). In addition, the dataset contains feedback (number of likes, shares, comments) provided by users on the social network Facebook, which is regularly extracted for all news/blog articles.
\n\nRaw data are contained in these CSV files (and corresponding REST API endpoints):
\n\nNote: Personal information about discussion posts' authors (name, website, gravatar) is anonymised.
\n\n
\nAnnotations
Secondly, the dataset contains so-called annotations. Entity annotations describe the individual raw data entities (e.g., article, source). Relation annotations describe the relation between two such entities.
\n\nEach annotation is described by the following attributes:
\n\n
\nAt the same time, annotations are associated with a particular object identified by:
\nThe dataset provides specifically these entity annotations:
The dataset provides specifically these relation annotations:
\n\n
\nAnnotations are contained in these CSV files (and corresponding REST API endpoints):
Note: The identity of human annotators (email provided in the annotation app) is anonymised.
\n\n\n\n
Enumerations
\n\nFinally, the dataset provides additional CSV files with enumerations:
\n\nIn order to share the dataset with you, please agree to the following terms:
\n\nIf you also require access to the REST API, please state it explicitly in your access request and elaborate in detail on how you intend to use the REST API access (please note that using the static dump is the preferable option for performance reasons).
", "allow_guest_requests": true, "allow_user_requests": true, "secret_link_expiration": 30 } }, "communities": {}, "id": "5996863", "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.5996863", "provider": "datacite" } } }, "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.5996864", "provider": "datacite" }, "oai": { "identifier": "oai:zenodo.org:5996864", "provider": "oai" } }, "revision_id": 10, "stats": { "all_versions": { "data_volume": 179709114111.0, "downloads": 61, "unique_downloads": 55, "unique_views": 905, "views": 1137 }, "this_version": { "data_volume": 176763063060.0, "downloads": 60, "unique_downloads": 54, "unique_views": 891, "views": 1118 } }, "status": "published", "updated": "2022-04-22T10:43:46.075931+00:00", "versions": { "index": 1, "is_latest": true } }