Conference paper Open Access

Text Simplification from Professionally Produced Corpora

Carolina Scarton; Gustavo Henrique Paetzold; Lucia Specia


DataCite XML Export

<?xml version='1.0' encoding='utf-8'?>
<!-- DataCite metadata record for the Zenodo deposit identified by DOI 10.5281/zenodo.1410451. -->
<!-- The schema hint targets kernel 4.4: resourceTypeGeneral="ConferencePaper" (used below) is not
     part of the 4.1 controlled vocabulary. The kernel-4 namespace itself is shared by all 4.x releases. -->
<resource xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xmlns="http://datacite.org/schema/kernel-4"
          xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.4/metadata.xsd">
  <identifier identifierType="DOI">10.5281/zenodo.1410451</identifier>
  <!-- Authors, in the order given by the record; all share the same affiliation. -->
  <creators>
    <creator>
      <creatorName>Carolina Scarton</creatorName>
      <affiliation>University of Sheffield</affiliation>
    </creator>
    <creator>
      <creatorName>Gustavo Henrique Paetzold</creatorName>
      <affiliation>University of Sheffield</affiliation>
    </creator>
    <creator>
      <creatorName>Lucia Specia</creatorName>
      <affiliation>University of Sheffield</affiliation>
    </creator>
  </creators>
  <titles>
    <title>Text Simplification from Professionally Produced Corpora</title>
  </titles>
  <publisher>Zenodo</publisher>
  <publicationYear>2018</publicationYear>
  <dates>
    <date dateType="Issued">2018-05-07</date>
  </dates>
  <resourceType resourceTypeGeneral="ConferencePaper"/>
  <!-- Landing page of this record, given as a non-DOI identifier. -->
  <alternateIdentifiers>
    <alternateIdentifier alternateIdentifierType="url">https://zenodo.org/record/1410451</alternateIdentifier>
  </alternateIdentifiers>
  <!-- Links to the DOI this record is a version of, and to the community collection it belongs to. -->
  <relatedIdentifiers>
    <relatedIdentifier relatedIdentifierType="DOI" relationType="IsVersionOf">10.5281/zenodo.1410450</relatedIdentifier>
    <relatedIdentifier relatedIdentifierType="URL" relationType="IsPartOf">https://zenodo.org/communities/h2020-simpatico-692819</relatedIdentifier>
  </relatedIdentifiers>
  <!-- Two rights statements: the licence itself and the access-level marker. -->
  <rightsList>
    <rights rightsURI="https://creativecommons.org/licenses/by/4.0/legalcode">Creative Commons Attribution 4.0 International</rights>
    <rights rightsURI="info:eu-repo/semantics/openAccess">Open Access</rights>
  </rightsList>
  <!-- Abstract; the text is wrapped in escaped HTML paragraph tags, kept exactly as in the source record. -->
  <descriptions>
    <description descriptionType="Abstract">&lt;p&gt;The lack of large and reliable datasets has been hindering progress in Text Simplification (TS). We investigate the application of the recently created Newsela corpus, the largest collection of professionally written simplifications available, in TS tasks. Using new alignment algorithms, we extract 550,644 complex-simple sentence pairs from the corpus. This data is explored in different ways: (i) we show that traditional readability metrics capture surprisingly well the different complexity levels in this corpus, (ii) we build machine learning models to classify sentences into complex vs. simple and to predict complexity levels that outperform their respective baselines, (iii) we introduce a lexical simplifier that uses the corpus to generate candidate simplifications and outperforms the state of the art approaches, and (iv) we show that the corpus can be used to learn sentence simplification patterns in more effective ways than corpora used in previous work.&lt;/p&gt;</description>
  </descriptions>
  <!-- Funder and grant; the award number matches the grant id embedded in awardURI. -->
  <fundingReferences>
    <fundingReference>
      <funderName>European Commission</funderName>
      <funderIdentifier funderIdentifierType="Crossref Funder ID">10.13039/501100000780</funderIdentifier>
      <awardNumber awardURI="info:eu-repo/grantAgreement/EC/H2020/692819/">692819</awardNumber>
      <awardTitle>SIMplifying the interaction with Public Administration Through Information technology for Citizens and cOmpanies</awardTitle>
    </fundingReference>
  </fundingReferences>
</resource>
56 views
31 downloads
Statistic | All versions | This version
Views | 56 | 56
Downloads | 31 | 31
Data volume | 7.5 MB | 7.5 MB
Unique views | 52 | 52
Unique downloads | 28 | 28

Share

Cite as