{ "access": { "embargo": { "active": false, "reason": null }, "files": "public", "record": "public", "status": "open" }, "created": "2017-07-27T14:20:34.178817+00:00", "custom_fields": { "meeting:meeting": { "acronym": "ICWSM2017", "dates": "15-18 May 2017", "place": "Montreal, Canada", "title": "11TH INTERNATIONAL AAAI CONFERENCE ON WEB AND SOCIAL MEDIA 2017", "url": "http://icwsm.org/2017/" } }, "deletion_status": { "is_deleted": false, "status": "P" }, "files": { "count": 28, "enabled": true, "entries": { "20161101-current_content-parts-1-50-pageids-12-117215.7z": { "checksum": "md5:8ca900ad3932434e24039901b55c58d0", "ext": "7z", "id": "67ba6e24-0325-4377-9042-6a58f987e3d6", "key": "20161101-current_content-parts-1-50-pageids-12-117215.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 1606972806 }, "20161101-current_content-parts-101-150-pageids-418317-1081580.7z": { "checksum": "md5:5bd0f0e8ff8334aac02a8297c8baf61c", "ext": "7z", "id": "fe5470d2-39f0-4a9f-9790-0be8e3902a48", "key": "20161101-current_content-parts-101-150-pageids-418317-1081580.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 2067239753 }, "20161101-current_content-parts-151-200-pageids-1081586-2203796.7z": { "checksum": "md5:4ca06f0ccf24ff75046f8ef40b260aaa", "ext": "7z", "id": "9d8329e8-e293-4414-ac9c-cbffe785fa5a", "key": "20161101-current_content-parts-151-200-pageids-1081586-2203796.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 2222868811 }, "20161101-current_content-parts-201-250-pageids-2203809-4051322.7z": { "checksum": "md5:9f896a2447f95aa0cda91f00bd44669a", "ext": "7z", "id": "85c3b686-9d96-4443-bce3-f8f820b03ade", "key": "20161101-current_content-parts-201-250-pageids-2203809-4051322.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 2321978845 }, "20161101-current_content-parts-251-300-pageids-4051356-7027309.7z": { "checksum": "md5:ee988500d7731b57c1f65d7ff2fd3ffd", "ext": "7z", "id": "d4e44fd0-fa70-42e3-a8b4-4225b43ac58c", "key": "20161101-current_content-parts-251-300-pageids-4051356-7027309.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 2429316947 }, "20161101-current_content-parts-301-350-pageids-7027310-11781922.7z": { "checksum": "md5:8c67e68ac1c252c02f4366cbd44a1fb6", "ext": "7z", "id": "cff43d98-7f14-43be-a83b-9b38bd2f091d", "key": "20161101-current_content-parts-301-350-pageids-7027310-11781922.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 2549071729 }, "20161101-current_content-parts-351-400-pageids-11781924-17443368.7z": { "checksum": "md5:6f082f6e57d03acc2129a22981ee6cd6", "ext": "7z", "id": "4823deb0-0c83-44a1-8463-a6d40847a716", "key": "20161101-current_content-parts-351-400-pageids-11781924-17443368.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 2683767121 }, "20161101-current_content-parts-401-450-pageids-17443414-23281466.7z": { "checksum": "md5:f6445d0b099dac02d9f1926e778711a1", "ext": "7z", "id": "d281b07a-504a-488c-bb98-8634d25a95c1", "key": "20161101-current_content-parts-401-450-pageids-17443414-23281466.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 2705700680 }, "20161101-current_content-parts-451-500-pageids-23281469-29590519.7z": { "checksum": "md5:d94e8c44658d7f9a1878355a48f3adad", "ext": "7z", "id": "8e0d8e8d-b9e2-4990-adb7-1c890fa9a775", "key": "20161101-current_content-parts-451-500-pageids-23281469-29590519.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 2853523418 }, "20161101-current_content-parts-501-550-pageids-29590554-36522618.7z": { "checksum": "md5:83d3cfbf6b0d3a21a7431667444de2e5", "ext": "7z", "id": "464fc4dc-ae9d-4b15-a59b-036c83a66a3e", "key": "20161101-current_content-parts-501-550-pageids-29590554-36522618.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 2965062988 }, "20161101-current_content-parts-51-100-pageids-117216-418311.7z": { "checksum": "md5:7e06d1a4a2e4467ab056e6ed719b955d", "ext": "7z", "id": "763bf22c-98cd-4f45-84b9-85e0df218a98", "key": "20161101-current_content-parts-51-100-pageids-117216-418311.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 1970307010 }, "20161101-current_content-parts-551-600-pageids-36522655-43525178.7z": { "checksum": "md5:579468afdf3848406f090c0b56809bcd", "ext": "7z", "id": "61cd53df-cd2f-4cfb-8e61-041c231e3bae", "key": "20161101-current_content-parts-551-600-pageids-36522655-43525178.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 3106646080 }, "20161101-current_content-parts-601-646-pageids-43525205-52158752.7z": { "checksum": "md5:8128ec72ed59a926a11cb4bcc8515cb9", "ext": "7z", "id": "b8da2c8f-472a-4b08-92bd-b7c576d949b3", "key": "20161101-current_content-parts-601-646-pageids-43525205-52158752.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 3025220899 }, "20161101-deleted_content-parts-1-50-pageids-12-117215.7z": { "checksum": "md5:d2720327ed6afb13c996072bda08fe64", "ext": "7z", "id": "859eca71-06e9-491e-995e-1cd61897d8c7", "key": "20161101-deleted_content-parts-1-50-pageids-12-117215.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 3815769938 }, "20161101-deleted_content-parts-101-150-pageids-418317-1081580.7z": { "checksum": "md5:9873fbdbf5cb41c6ce77b489cd52464c", "ext": "7z", "id": "7dd71258-9338-4c28-9f9e-5fbfd267c45d", "key": "20161101-deleted_content-parts-101-150-pageids-418317-1081580.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 3284102124 }, "20161101-deleted_content-parts-151-200-pageids-1081586-2203796.7z": { "checksum": "md5:af36d2cc05c4ade7df1e3fc5b91dbe28", "ext": "7z", "id": "b40d4173-fdee-4be7-ba78-bfc4f902bff0", "key": "20161101-deleted_content-parts-151-200-pageids-1081586-2203796.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 3069851432 }, "20161101-deleted_content-parts-201-250-pageids-2203809-4051322.7z": { "checksum": "md5:f2eb903001ff4737698a434a0141521a", "ext": "7z", "id": "b684de2d-a49d-4d84-8131-9711dcbd8b4b", "key": "20161101-deleted_content-parts-201-250-pageids-2203809-4051322.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 2923534210 }, "20161101-deleted_content-parts-251-300-pageids-4051356-7027309.7z": { "checksum": "md5:dfde922adedd029793ae4112737a48cd", "ext": "7z", "id": "3862801a-cb9e-4648-b5d8-3cb5fa2d2388", "key": "20161101-deleted_content-parts-251-300-pageids-4051356-7027309.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 2725493141 }, "20161101-deleted_content-parts-301-350-pageids-7027310-11781922.7z": { "checksum": "md5:3ece1cfa4326c624d043aebf280fe7f5", "ext": "7z", "id": "4c87499f-47e4-4dda-b607-7285f05c4c65", "key": "20161101-deleted_content-parts-301-350-pageids-7027310-11781922.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 2482457767 }, "20161101-deleted_content-parts-351-400-pageids-11781924-17443368.7z": { "checksum": "md5:32984e79e729a63b53f5558b7d7b8ff2", "ext": "7z", "id": "d741ffee-6454-4c93-b65f-e1bfa9b67518", "key": "20161101-deleted_content-parts-351-400-pageids-11781924-17443368.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 2183873820 }, "20161101-deleted_content-parts-401-450-pageids-17443414-23281466.7z": { "checksum": "md5:d4d2fcc456cc275eb136a491f2ede1b9", "ext": "7z", "id": "4fe14c60-187c-4c13-abab-87780fa107bd", "key": "20161101-deleted_content-parts-401-450-pageids-17443414-23281466.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 2132694007 }, "20161101-deleted_content-parts-451-500-pageids-23281469-29590519.7z": { "checksum": "md5:4f2a91ef02f2201b5ca08bb677101272", "ext": "7z", "id": "d2b2f412-f9dc-4ead-8dd1-6eec128e58b0", "key": "20161101-deleted_content-parts-451-500-pageids-23281469-29590519.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 1872315160 }, "20161101-deleted_content-parts-501-550-pageids-29590554-36522618.7z": { "checksum": "md5:36f4189a308b53d20e355ecf759f1768", "ext": "7z", "id": "7a19fdcb-5d23-4da2-b7a1-375caaef323f", "key": "20161101-deleted_content-parts-501-550-pageids-29590554-36522618.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 1664376552 }, "20161101-deleted_content-parts-51-100-pageids-117216-418311.7z": { "checksum": "md5:1b7d2f6e670b4f896fbd12cc85cbc120", "ext": "7z", "id": "fabdee0e-233e-44e4-a04d-5866f10728c8", "key": "20161101-deleted_content-parts-51-100-pageids-117216-418311.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 3427327627 }, "20161101-deleted_content-parts-551-600-pageids-36522655-43525178.7z": { "checksum": "md5:e00115390203eef125b045c93e41dfe3", "ext": "7z", "id": "17d992c8-df2a-4389-87cc-c4f234e0e42a", "key": "20161101-deleted_content-parts-551-600-pageids-36522655-43525178.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 1381843071 }, "20161101-deleted_content-parts-601-646-pageids-43525205-52158752.7z": { "checksum": "md5:c80884b1a83af575d66c033a4e0dcec3", "ext": "7z", "id": "26a97859-b0a5-496d-bc3b-55290453ee3a", "key": "20161101-deleted_content-parts-601-646-pageids-43525205-52158752.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 882081898 }, "dataset_readme.txt": { "checksum": "md5:222d935035ba5aec699c0210802668d9", "ext": "txt", "id": "c4c38ee7-e760-415b-b8fa-c62bdf51bcaa", "key": "dataset_readme.txt", "metadata": null, "mimetype": "text/plain", "size": 4894 }, "revisions.7z": { "checksum": "md5:7ba169bad9fd5c6778cf16f11707976c", "ext": "7z", "id": "04b05c68-858f-4755-8963-7d48fc4de663", "key": "revisions.7z", "metadata": null, "mimetype": "application/octet-stream", "size": 4550119100 } }, "order": [], "total_bytes": 68903521828 }, "id": "834557", "is_draft": false, "is_published": true, "links": { "access": "https://zenodo.org/api/records/834557/access", "access_links": "https://zenodo.org/api/records/834557/access/links", "access_request": "https://zenodo.org/api/records/834557/access/request", "access_users": "https://zenodo.org/api/records/834557/access/users", "archive": "https://zenodo.org/api/records/834557/files-archive", "archive_media": "https://zenodo.org/api/records/834557/media-files-archive", "communities": "https://zenodo.org/api/records/834557/communities", "communities-suggestions": "https://zenodo.org/api/records/834557/communities-suggestions", "doi": "https://doi.org/10.5281/zenodo.834557", "draft": "https://zenodo.org/api/records/834557/draft", "files": "https://zenodo.org/api/records/834557/files", "latest": "https://zenodo.org/api/records/834557/versions/latest", "latest_html": "https://zenodo.org/records/834557/latest", "media_files": "https://zenodo.org/api/records/834557/media-files", "parent": "https://zenodo.org/api/records/789289", "parent_doi": "https://zenodo.org/doi/10.5281/zenodo.789289", "parent_html": "https://zenodo.org/records/789289", "requests": "https://zenodo.org/api/records/834557/requests", "reserve_doi": "https://zenodo.org/api/records/834557/draft/pids/doi", "self": "https://zenodo.org/api/records/834557", "self_doi": "https://zenodo.org/doi/10.5281/zenodo.834557", "self_html": "https://zenodo.org/records/834557", "self_iiif_manifest": "https://zenodo.org/api/iiif/record:834557/manifest", "self_iiif_sequence": "https://zenodo.org/api/iiif/record:834557/sequence/default", "versions": "https://zenodo.org/api/records/834557/versions" }, "media_files": { "count": 0, "enabled": false, "entries": {}, "order": [], "total_bytes": 0 }, "metadata": { "additional_descriptions": [ { "description": "Please cite\u00a010.5281/zenodo.789289 for all versions of this dataset, which will always resolve to the latest.", "type": { "id": "notes", "title": { "de": "Anmerkungen", "en": "Notes" } } } ], "creators": [ { "affiliations": [ { "name": "GESIS - Leibniz Institute for the Social Sciences" } ], "person_or_org": { "family_name": "Fl\u00f6ck", "given_name": "Fabian", "name": "Fl\u00f6ck, Fabian", "type": "personal" } }, { "affiliations": [ { "name": "GESIS - Leibniz Institute for the Social Sciences" } ], "person_or_org": { "family_name": "Erdogan", "given_name": "Kenan", "name": "Erdogan, Kenan", "type": "personal" } }, { "affiliations": [ { "name": "Karlsruhe Institute of Technology" } ], "person_or_org": { "family_name": "Acosta", "given_name": "Maribel", "name": "Acosta, Maribel", "type": "personal" } } ], "description": "
Fixes in version 1.1 (= Zenodo's\u00a0\"version 2\")
\n\n*In 20161101-revisions-part1-12-1728.csv, missing first data line is added.
\n\n*In Current_content and Deleted_content files, some token values ('str' column) which contain regular quotes ('\"') are fixed.
\n\n*In Current_content and Deleted_content files, some wrong revision ID values for 'origin_rev_id', 'in' and 'out' columns are fixed.
\n\n\u00a0------
\n\nThis dataset contains every instance of all tokens (\u2248 words) ever written in undeleted, non-redirect English Wikipedia articles until October 2016, in total 13,545,349,787\u00a0instances. Each token is annotated with (i) the article revision it was originally created in, and (ii) lists with all the revisions in which the token was ever deleted and (potentially) re-added and re-deleted from its article, enabling a complete and straightforward tracking of its history.
\n\nThis data would be exceedingly hard to create by an average potential user as it is (i) very expensive to compute and as (ii) accurately tracking the history of each token in revisioned documents is a non-trivial task.\u00a0
\nAdapting a state-of-the-art algorithm, we have produced a dataset that allows for a range of analyses and metrics, already popular in research and going beyond, to be generated on complete-Wikipedia scale; ensuring quality and allowing researchers to forego expensive text-comparison computation, which so far has hindered scalable usage.
This dataset, its creation process and use cases are described in a dedicated dataset paper of the same name, published at the ICWSM 2017 conference. In this paper, we show how this data enables, on token level, computation of provenance, measuring survival of content over time, very detailed conflict metrics, and fine-grained interactions of editors like partial reverts, re-additions and other metrics.
\n\nTokenization used: https://gist.github.com/faflo/3f5f30b1224c38b1836d63fa05d1ac94
\n\nToy example for how the token metadata is generated:\u00a0
\nhttps://gist.github.com/faflo/8bd212e81e594676f8d002b175b79de8
Be sure to read the ReadMe.txt or - even more detailed - the supporting paper which is referenced under \"related identifiers\".
", "publication_date": "2017-07-27", "publisher": "Zenodo", "references": [ { "reference": "Fl\u00f6ck, Fabian, and Acosta, Maribel. \"WikiWho: Precise and efficient attribution of authorship of revisioned content.\" Proceedings of the 23rd international conference on World Wide Web. ACM, 2014." }, { "reference": "Fabian Fl\u00f6ck, Kenan Erdogan, Maribel Acosta. \"TokTrack: A Complete Token Provenance and Change Tracking Dataset for the English Wikipedia.\" Proceedings of ICWSM2017 (to appear). Preprint: https://arxiv.org/abs/1703.08244" } ], "related_identifiers": [ { "identifier": "https://arxiv.org/abs/1703.08244", "relation_type": { "id": "issupplementedby", "title": { "de": "Wird erg\u00e4nzt durch", "en": "Is supplemented by" } }, "scheme": "url" }, { "identifier": "10.5281/zenodo.439699", "relation_type": { "id": "issupplementedby", "title": { "de": "Wird erg\u00e4nzt durch", "en": "Is supplemented by" } }, "scheme": "doi" }, { "identifier": "10.5281/zenodo.345571", "relation_type": { "id": "isnewversionof", "title": { "de": "Ist eine neue Version von", "en": "Is new version of" } }, "scheme": "doi" } ], "resource_type": { "id": "dataset", "title": { "de": "Datensatz", "en": "Dataset" } }, "rights": [ { "description": { "en": "Permits almost any use subject to providing credit and license notice. Frequently used for media assets and educational materials. The most common license for Open Access scientific publications. Not recommended for software." }, "icon": "cc-by-sa-icon", "id": "cc-by-sa-4.0", "props": { "scheme": "spdx", "url": "https://creativecommons.org/licenses/by-sa/4.0/legalcode" }, "title": { "en": "Creative Commons Attribution Share Alike 4.0 International" } } ], "subjects": [ { "subject": "Wikipedia" }, { "subject": "Collaborative Writing" }, { "subject": "Provenance" }, { "subject": "Authorship" }, { "subject": "Controversy" }, { "subject": "Content Survival" }, { "subject": "Content Persistence" }, { "subject": "Conflict" }, { "subject": "Reverts" }, { "subject": "Dataset" }, { "subject": "Computational Linguistics" } ], "title": "TokTrack: A Complete Token Provenance and Change Tracking Dataset for the English Wikipedia" }, "parent": { "access": { "owned_by": { "user": 28781 } }, "communities": { "entries": [ { "access": { "member_policy": "open", "members_visibility": "public", "record_policy": "open", "review_policy": "open", "visibility": "public" }, "children": { "allow": false }, "created": "2017-03-12T15:54:19.766905+00:00", "custom_fields": {}, "deletion_status": { "is_deleted": false, "status": "P" }, "id": "9b71b9de-135a-41d5-b8d3-e83fa4042139", "links": {}, "metadata": { "curation_policy": "Don't mirror/upload datasets that you did not create yourself. Make sure which copyright to use. When in doubt, contact Wikimedia Legal.
\r\n", "description": "Resources related to research of Wikipedia in its different language editions. \n\nAlso consider the community \"Wikimedia\" for a broader spectrum.", "page": "", "title": "Wikipedia-related research" }, "revision_id": 0, "slug": "wikipedia_data", "updated": "2017-04-02T18:57:33.608030+00:00" }, { "access": { "member_policy": "open", "members_visibility": "public", "record_policy": "open", "review_policy": "open", "visibility": "public" }, "children": { "allow": false }, "created": "2013-05-15T10:37:41+00:00", "custom_fields": {}, "deletion_status": { "is_deleted": false, "status": "P" }, "id": "9bacf7b2-e324-4e1b-b74c-263dda076d35", "links": {}, "metadata": { "curation_policy": "", "description": "Collection of papers related to Wikimedia projects (i.e. Wikipedia), mass collaboration and online communities.", "page": "", "title": "Wikimedia" }, "revision_id": 0, "slug": "wikimedia", "updated": "2017-03-13T17:27:14.027088+00:00" } ], "ids": [ "9b71b9de-135a-41d5-b8d3-e83fa4042139", "9bacf7b2-e324-4e1b-b74c-263dda076d35" ] }, "id": "789289", "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.789289", "provider": "datacite" } } }, "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.834557", "provider": "datacite" }, "oai": { "identifier": "oai:zenodo.org:834557", "provider": "oai" } }, "revision_id": 13, "stats": { "all_versions": { "data_volume": 4699157572781.0, "downloads": 2070, "unique_downloads": 1457, "unique_views": 3289, "views": 3448 }, "this_version": { "data_volume": 3047888893593.0, "downloads": 1290, "unique_downloads": 811, "unique_views": 1551, "views": 1652 } }, "status": "published", "updated": "2020-01-24T19:25:54.507343+00:00", "versions": { "index": 2, "is_latest": true } }