{ "access": { "embargo": { "active": false, "reason": null }, "files": "public", "record": "public", "status": "open" }, "created": "2021-11-30T11:21:33.283482+00:00", "custom_fields": { "meeting:meeting": { "acronym": "EDDI2021", "dates": "November 30 + December 1, 2021", "place": "virtual (Paris)", "session": "Enhancing Metadata (1)", "title": "13th Annual European DDI User Conference", "url": "https://www.eddi-conferences.eu/eddi-2021/" } }, "deletion_status": { "is_deleted": false, "status": "P" }, "files": { "count": 1, "enabled": true, "entries": { "SDe EDDI Enhancing Metadata.pdf": { "checksum": "md5:6e848ad1cfa49c8a8971600d3b1fd3ef", "ext": "pdf", "id": "9bb5be68-b904-46d3-ac2f-4ea5e0551d2d", "key": "SDe EDDI Enhancing Metadata.pdf", "metadata": null, "mimetype": "application/pdf", "size": 1080533 } }, "order": [], "total_bytes": 1080533 }, "id": "5742916", "is_draft": false, "is_published": true, "links": { "access": "https://zenodo.org/api/records/5742916/access", "access_links": "https://zenodo.org/api/records/5742916/access/links", "access_request": "https://zenodo.org/api/records/5742916/access/request", "access_users": "https://zenodo.org/api/records/5742916/access/users", "archive": "https://zenodo.org/api/records/5742916/files-archive", "archive_media": "https://zenodo.org/api/records/5742916/media-files-archive", "communities": "https://zenodo.org/api/records/5742916/communities", "communities-suggestions": "https://zenodo.org/api/records/5742916/communities-suggestions", "doi": "https://doi.org/10.5281/zenodo.5742916", "draft": "https://zenodo.org/api/records/5742916/draft", "files": "https://zenodo.org/api/records/5742916/files", "latest": "https://zenodo.org/api/records/5742916/versions/latest", "latest_html": "https://zenodo.org/records/5742916/latest", "media_files": "https://zenodo.org/api/records/5742916/media-files", "parent": "https://zenodo.org/api/records/5742915", "parent_doi": "https://zenodo.org/doi/10.5281/zenodo.5742915", 
"parent_html": "https://zenodo.org/records/5742915", "requests": "https://zenodo.org/api/records/5742916/requests", "reserve_doi": "https://zenodo.org/api/records/5742916/draft/pids/doi", "self": "https://zenodo.org/api/records/5742916", "self_doi": "https://zenodo.org/doi/10.5281/zenodo.5742916", "self_html": "https://zenodo.org/records/5742916", "self_iiif_manifest": "https://zenodo.org/api/iiif/record:5742916/manifest", "self_iiif_sequence": "https://zenodo.org/api/iiif/record:5742916/sequence/default", "versions": "https://zenodo.org/api/records/5742916/versions" }, "media_files": { "count": 0, "enabled": false, "entries": {}, "order": [], "total_bytes": 0 }, "metadata": { "creators": [ { "affiliations": [ { "name": "Department of Computer Science, University of Surrey" } ], "person_or_org": { "family_name": "De", "given_name": "Suparna", "name": "De, Suparna", "type": "personal" } }, { "affiliations": [ { "name": "Centre for Advanced Research Computing, UCL" } ], "person_or_org": { "family_name": "Moss", "given_name": "Harry", "name": "Moss, Harry", "type": "personal" } }, { "affiliations": [ { "name": "Centre for Advanced Research Computing, UCL" } ], "person_or_org": { "family_name": "Jabbari", "given_name": "Sanaz", "name": "Jabbari, Sanaz", "type": "personal" } }, { "affiliations": [ { "name": "Department of Computer Science, University of Surrey" } ], "person_or_org": { "family_name": "Pereira", "given_name": "Haeron", "name": "Pereira, Haeron", "type": "personal" } }, { "affiliations": [ { "name": "CLOSER, UCL Institute of Education" } ], "person_or_org": { "family_name": "Johnson", "given_name": "Jon", "name": "Johnson, Jon", "type": "personal" } }, { "affiliations": [ { "name": "CLOSER, UCL Institute of Education" } ], "person_or_org": { "family_name": "Li", "given_name": "Jenny", "name": "Li, Jenny", "type": "personal" } } ], "description": "
Data Documentation Initiative-Lifecycle (DDI-L) introduced a robust metadata model to support the capture of questionnaire content and flow and, through support for versioning and provenance objects such as BasedOn, encouraged the reuse of existing question items. However, the dearth of questionnaire banks including both question text and response domains has meant that an ecosystem to support the development of DDI-ready CAI tools has been limited. Archives hold the information in PDFs associated with surveys, but extracting that in an efficient manner into DDI-Lifecycle is a significant challenge.
\n\nWhile CLOSER Discovery has been championing the provision of high-quality questionnaire metadata in DDI-Lifecycle, this has primarily been done manually. More automated methods need to be explored to ensure scalable metadata annotation and uplift.
\n\nThis paper presents initial results in engineering a machine learning (ML) pipeline to automate the extraction of questions from survey questionnaires as PDFs. Using CLOSER Discovery as a ‘training dataset’, a number of machine learning approaches have been explored to classify parsed text from questionnaires to be output as valid DDI items for inclusion in a DDI-L compliant repository.
\n\nThe developed ML pipeline adopts a continuous build and integrate approach, with processes in place to keep track of various combinations of the structured DDI-L input metadata, ML models and model parameters against the defined evaluation metrics, thus enabling reproducibility and comparative analysis of the experiments. Tangible outputs include a map of the various metadata and model parameters with the corresponding evaluation metrics' values, which enable model tuning as well as transparent management of data and experiments.
\n\nA recorded version of this presentation can be found here: https://youtu.be/klFsyCDE960
", "languages": [ { "id": "eng", "title": { "en": "English" } } ], "publication_date": "2021-11-30", "publisher": "Zenodo", "resource_type": { "id": "presentation", "title": { "de": "Pr\u00e4sentation", "en": "Presentation" } }, "rights": [ { "description": { "en": "The Creative Commons Attribution license allows re-distribution and re-use of a licensed work on the condition that the creator is appropriately credited." }, "icon": "cc-by-icon", "id": "cc-by-4.0", "props": { "scheme": "spdx", "url": "https://creativecommons.org/licenses/by/4.0/legalcode" }, "title": { "en": "Creative Commons Attribution 4.0 International" } } ], "title": "Engineering a Machine Learning Pipeline for Automating Metadata Extraction from Longitudinal Survey Questionnaires" }, "parent": { "access": { "owned_by": { "user": 22658 } }, "communities": { "default": "59a258c1-eaf1-43e9-95e4-9660a3000161", "entries": [ { "access": { "member_policy": "open", "members_visibility": "public", "record_policy": "open", "review_policy": "open", "visibility": "public" }, "children": { "allow": false }, "created": "2021-11-25T06:11:20.061706+00:00", "custom_fields": {}, "deletion_status": { "is_deleted": false, "status": "P" }, "id": "59a258c1-eaf1-43e9-95e4-9660a3000161", "links": {}, "metadata": { "page": "Information on EDDI 2021 can be found on the conference's webpage: https://www.eddi-conferences.eu/eddi-2021/
\n\nRecordings of presentations and tutorials can be found on the YouTube playlists: https://www.youtube.com/playlist?list=PLii5T1O4gQHk6xDePurzc6uMgOv4gzNHX
", "title": "13th Annual European DDI User Conference", "type": { "id": "event" }, "website": "https://www.eddi-conferences.eu" }, "revision_id": 1, "slug": "eddi21", "updated": "2023-12-05T10:34:16.201618+00:00" } ], "ids": [ "59a258c1-eaf1-43e9-95e4-9660a3000161" ] }, "id": "5742915", "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.5742915", "provider": "datacite" } } }, "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.5742916", "provider": "datacite" }, "oai": { "identifier": "oai:zenodo.org:5742916", "provider": "oai" } }, "revision_id": 3, "stats": { "all_versions": { "data_volume": 133986092.0, "downloads": 124, "unique_downloads": 116, "unique_views": 212, "views": 224 }, "this_version": { "data_volume": 133986092.0, "downloads": 124, "unique_downloads": 116, "unique_views": 210, "views": 222 } }, "status": "published", "updated": "2023-01-19T15:43:17.500245+00:00", "versions": { "index": 1, "is_latest": true } }