Software Open Access

On the Performance of Oversampling Techniques for Class Imbalance Problems

Jiawen Kong; Thiago Rios; Wojtek Kowalczyk; Stefan Menzel; Thomas Bäck


DCAT Export

<?xml version='1.0' encoding='utf-8'?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:dctype="http://purl.org/dc/dcmitype/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:duv="http://www.w3.org/ns/duv#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:frapo="http://purl.org/cerif/frapo/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:gsp="http://www.opengis.net/ont/geosparql#" xmlns:locn="http://www.w3.org/ns/locn#" xmlns:org="http://www.w3.org/ns/org#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <rdf:Description rdf:about="https://doi.org/10.5281/zenodo.3855094">
    <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
    <dct:type rdf:resource="http://purl.org/dc/dcmitype/Software"/>
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://doi.org/10.5281/zenodo.3855094</dct:identifier>
    <foaf:page rdf:resource="https://doi.org/10.5281/zenodo.3855094"/>
    <dct:creator>
      <rdf:Description>
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <foaf:name>Jiawen Kong</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>University of Leiden</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </rdf:Description>
    </dct:creator>
    <dct:creator>
      <rdf:Description>
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <foaf:name>Thiago Rios</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Honda Research Institute Europe GmBH</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </rdf:Description>
    </dct:creator>
    <dct:creator>
      <rdf:Description rdf:about="http://orcid.org/0000-0002-6973-1341">
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0002-6973-1341</dct:identifier>
        <foaf:name>Wojtek Kowalczyk</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>University of Leiden</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </rdf:Description>
    </dct:creator>
    <dct:creator>
      <rdf:Description>
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <foaf:name>Stefan Menzel</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Honda Research Institute Europe GmBH</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </rdf:Description>
    </dct:creator>
    <dct:creator>
      <rdf:Description rdf:about="http://orcid.org/0000-0001-6768-1478">
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0001-6768-1478</dct:identifier>
        <foaf:name>Thomas Bäck</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>University of Leiden</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </rdf:Description>
    </dct:creator>
    <dct:title>On the Performance of Oversampling Techniques for Class Imbalance Problems</dct:title>
    <dct:publisher>
      <foaf:Agent>
        <foaf:name>Zenodo</foaf:name>
      </foaf:Agent>
    </dct:publisher>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2020</dct:issued>
    <dcat:keyword>Class imbalance</dcat:keyword>
    <dcat:keyword>Minority class distribution</dcat:keyword>
    <dcat:keyword>Data complexity measures</dcat:keyword>
    <frapo:isFundedBy rdf:resource="info:eu-repo/grantAgreement/EC/H2020/766186/"/>
    <schema:funder>
      <foaf:Organization>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">10.13039/501100000780</dct:identifier>
        <foaf:name>European Commission</foaf:name>
      </foaf:Organization>
    </schema:funder>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2020-05-06</dct:issued>
    <dct:language rdf:resource="http://publications.europa.eu/resource/authority/language/ENG"/>
    <owl:sameAs rdf:resource="https://zenodo.org/record/3855094"/>
    <adms:identifier>
      <adms:Identifier>
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/3855094</skos:notation>
        <adms:schemeAgency>url</adms:schemeAgency>
      </adms:Identifier>
    </adms:identifier>
    <dct:isVersionOf rdf:resource="https://doi.org/10.5281/zenodo.3855093"/>
    <dct:isPartOf rdf:resource="https://zenodo.org/communities/ecole_itn"/>
    <dct:description>&lt;p&gt;This file is the source code used in the paper below:&lt;/p&gt; &lt;p&gt;Jiawen Kong, Thiago Rios, Wojtek Kowalczyk, Stefan Menzel and Thomas B&amp;auml;ck, &amp;ldquo;On the Performance of Oversampling Techniques for Class Imbalance Problems&amp;rdquo; in the 24th Pacific-Asia Conference on Knowledge Discovery and Data Mining, Singapore, 11-14 May 2020, doi: 10.1007/978-3-030-47436-2_7&lt;/p&gt; &lt;p&gt;Although over 90 oversampling approaches have been developed in the imbalance learning domain, most of the empirical study and application work are still based on the &amp;ldquo;classical&amp;rdquo; resampling techniques. In this paper, several experiments on 19 benchmark datasets are set up to study the efficiency of six powerful oversampling approaches, including both &amp;ldquo;classical&amp;rdquo; and new ones. According to our experimental results, oversampling techniques that consider the minority class distribution (new ones) perform better in most cases and RACOG gives the best performance among the six reviewed approaches. We further validate our conclusion on our real-world inspired vehicle datasets and also find applying oversampling techniques can improve the performance by around 10%. In addition, seven data complexity measures are considered for the initial purpose of investigating the relationship between data complexity measures and the choice of resampling techniques. Although no obvious relationship can be abstracted in our experiments, we find F1v value, a measure for evaluating the overlap which most researchers ignore, has a strong negative correlation with the potential AUC value (after resampling).&lt;/p&gt;</dct:description>
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <dct:accessRights>
      <dct:RightsStatement rdf:about="info:eu-repo/semantics/openAccess">
        <rdfs:label>Open Access</rdfs:label>
      </dct:RightsStatement>
    </dct:accessRights>
    <dcat:distribution>
      <dcat:Distribution>
        <dct:rights>
          <dct:RightsStatement rdf:about="https://opensource.org/licenses/GPL-3.0">
            <rdfs:label>GNU General Public License v3.0 or later</rdfs:label>
          </dct:RightsStatement>
        </dct:rights>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.3855094"/>
      </dcat:Distribution>
    </dcat:distribution>
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL>https://doi.org/10.5281/zenodo.3855094</dcat:accessURL>
        <dcat:byteSize>13228</dcat:byteSize>
        <dcat:downloadURL>https://zenodo.org/record/3855094/files/PAKDD_experiment.R</dcat:downloadURL>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
  <foaf:Project rdf:about="info:eu-repo/grantAgreement/EC/H2020/766186/">
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">766186</dct:identifier>
    <dct:title>Experience-based Computation: Learning to Optimise</dct:title>
    <frapo:isAwardedBy>
      <foaf:Organization>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">10.13039/501100000780</dct:identifier>
        <foaf:name>European Commission</foaf:name>
      </foaf:Organization>
    </frapo:isAwardedBy>
  </foaf:Project>
</rdf:RDF>
25
6
views
downloads
All versions This version
Views 2525
Downloads 66
Data volume 79.4 kB79.4 kB
Unique views 2323
Unique downloads 66

Share

Cite as