Dataset Open Access

Data and material for: "Content classification of development emails"

Alberto Bacchelli; Tommaso Dal Sasso; Marco D'Ambros; Michele Lanza


DCAT Export

<?xml version='1.0' encoding='utf-8'?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:dctype="http://purl.org/dc/dcmitype/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:duv="http://www.w3.org/ns/duv#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:frapo="http://purl.org/cerif/frapo/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:gsp="http://www.opengis.net/ont/geosparql#" xmlns:locn="http://www.w3.org/ns/locn#" xmlns:org="http://www.w3.org/ns/org#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <rdf:Description rdf:about="https://doi.org/10.5281/zenodo.1345172">
    <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
    <dct:type rdf:resource="http://purl.org/dc/dcmitype/Dataset"/>
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://doi.org/10.5281/zenodo.1345172</dct:identifier>
    <foaf:page rdf:resource="https://doi.org/10.5281/zenodo.1345172"/>
    <!-- Creators of the dataset. Each agent is written as a typed node (foaf:Agent),
         the RDF/XML shorthand for an rdf:Description that carries
         rdf:type foaf:Agent; the resulting triples are unchanged. Only the first
         creator has a URI (an ORCID); the others are blank nodes. -->
    <dct:creator>
      <foaf:Agent rdf:about="http://orcid.org/0000-0003-0193-6823">
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0003-0193-6823</dct:identifier>
        <foaf:name>Alberto Bacchelli</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>University of Zurich</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Tommaso Dal Sasso</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Università della Svizzera Italiana</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Marco D'Ambros</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Università della Svizzera Italiana</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Michele Lanza</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Università della Svizzera Italiana</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:title>Data and material for: "Content classification of development emails"</dct:title>
    <!-- Publisher of the record: a blank-node foaf:Agent identified only by its
         name, "Zenodo". -->
    <dct:publisher>
      <foaf:Agent>
        <foaf:name>Zenodo</foaf:name>
      </foaf:Agent>
    </dct:publisher>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2012</dct:issued>
    <dcat:keyword>Empirical software engineering</dcat:keyword>
    <dcat:keyword>Unstructured Data Mining</dcat:keyword>
    <dcat:keyword>Email</dcat:keyword>
    <!-- Funding. frapo:isFundedBy links to the grant by URI; that same URI is the
         subject of the foaf:Project node at the end of this document.
         schema:funder describes the funding organization inline, with the same
         identifier and name as the project's frapo:isAwardedBy organization. -->
    <frapo:isFundedBy rdf:resource="info:eu-repo/grantAgreement/SNSF/Project+funding/200020_132175/"/>
    <schema:funder>
      <foaf:Organization>
        <!-- NOTE(review): presumably a funder-registry DOI; confirm the scheme. -->
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">10.13039/501100001711</dct:identifier>
        <foaf:name>Schweizerischer Nationalfonds zur Förderung der Wissenschaftlichen Forschung</foaf:name>
      </foaf:Organization>
    </schema:funder>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2012-06-02</dct:issued>
    <dct:language rdf:resource="http://publications.europa.eu/resource/authority/language/ENG"/>
    <owl:sameAs rdf:resource="https://zenodo.org/record/1345172"/>
    <!-- Alternative identifier for the dataset: the Zenodo record URL (the same
         URL used by the owl:sameAs link), with the scheme agency given as the
         plain literal "url". -->
    <adms:identifier>
      <adms:Identifier>
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/1345172</skos:notation>
        <adms:schemeAgency>url</adms:schemeAgency>
      </adms:Identifier>
    </adms:identifier>
    <dct:relation rdf:resource="https://doi.org/10.1109/ICSE.2012.6227177"/>
    <dct:hasPart rdf:resource="https://github.com/ilredeitopi/mucca-dataset"/>
    <dct:isVersionOf rdf:resource="https://doi.org/10.5281/zenodo.1345171"/>
    <dct:description>&lt;p&gt;This data and material support the paper &amp;quot;Content classification of development emails&amp;quot; published in the proceedings of the&amp;nbsp;34th International Conference on Software Engineering (ICSE 2012).&lt;/p&gt; &lt;p&gt;Every software system has a history.&amp;nbsp;&lt;strong&gt;We find traces of a system&amp;#39;s history in software repositories&lt;/strong&gt;, which are used by developers when building and maintaining their systems. Each repository tells us a part of the history, from its perspective: Issue repositories murmur dark events involving defective and flawed entities; versioning system repositories narrate about restless artifacts and classes that nobody would ever touch; mailing list archives report of unexpected stories on developers&amp;rsquo; interactions and opinions.&lt;/p&gt; &lt;p&gt;But...&amp;nbsp;&lt;strong&gt;can we seriously trust these repositories?&lt;/strong&gt;&amp;nbsp;Can we just listen to what they tell us and behave accordingly? Many wise researchers warmly warned us about the risks of showing such a naive faith in data repositories: Versioning system repositories might seduce us with enchanting stories of always changing entities, but in reality many of these entities may just modify their make up and maintain their old behaviour; or issue repositories might tell us a partial truth about certain very special entities, or developers. We do agree with these researchers: Especially&amp;nbsp;&lt;strong&gt;natural language documents contain information in different languages, surrounded by much noise&lt;/strong&gt;. We must pay a special attention when using them.&lt;/p&gt; &lt;p&gt;We created&amp;nbsp;&lt;strong&gt;MUCCA&lt;/strong&gt;, a classification method to use when dealing with natural language documents. It recognizes source code fragments, patches, stack traces, noise, and natural language with significantly high accuracy. 
In this way, it allows one to subsequently apply ad hoc analysis techniques to exploit the peculiarities of each category, and extract reliable information.&lt;/p&gt; &lt;p&gt;This Zenodo upload&amp;nbsp;supports the paper that describe our work on this topic.&lt;/p&gt; &lt;p&gt;&amp;nbsp;&lt;/p&gt; &lt;p&gt;&lt;strong&gt;1. Source code &amp;amp; Virtual Image&lt;/strong&gt;&lt;/p&gt; &lt;p&gt;MUCCA is written in&amp;nbsp;&lt;a href="http://www.cincomsmalltalk.com/main/products/visualworks/overview/"&gt;Cincom VisualWorks Smalltalk&lt;/a&gt;&amp;nbsp;and is composed of several components.&lt;br&gt; You can download the source code of the following MUCCA components from this upload (&lt;code&gt;mucca-source_code&lt;/code&gt; folder):&lt;/p&gt; &lt;ul&gt; &lt;li&gt;Miler2, the core of MUCCA, including metamodels, importers, classification engine, etc.;&lt;/li&gt; &lt;li&gt;MailPeek, our web application for the manual classification of email content;&lt;/li&gt; &lt;li&gt;PetitIsland, our grammar to generate island parsers;&lt;/li&gt; &lt;li&gt;PetitJava, the grammar of Java, which we implemented for PetitParser;&lt;/li&gt; &lt;li&gt;PetitSTrace, our island parser for java stack traces.&lt;/li&gt; &lt;/ul&gt; &lt;p&gt;Note that, in order to make Miler2 run, you will also need the following external Smalltalk components:&amp;nbsp;&lt;a href="http://www.moosetechnology.org/"&gt;Moose&lt;/a&gt;,&amp;nbsp;&lt;a href="http://www.glorp.org/"&gt;Glorp&lt;/a&gt;,&amp;nbsp;&lt;a href="http://www.seaside.st/"&gt;Seaside&lt;/a&gt;, TwoFlower, MetaDB,&amp;nbsp;&lt;a href="http://www.lukas-renggli.ch/blog/petitparser-1"&gt;PetitParser&lt;/a&gt;.&lt;/p&gt; &lt;p&gt;In addition, we make use of the&amp;nbsp;&lt;a href="http://www.cs.waikato.ac.nz/ml/weka/"&gt;Weka&lt;/a&gt;&amp;nbsp;workbench, for the machine learning tasks. 
You can download the two trained classifiers that compose MUCCA (&lt;code&gt;mucca-classifiers&lt;/code&gt;&amp;nbsp;folder): Naive Bayes based&amp;nbsp;classifier (&lt;code&gt;classifier1-nb.model&lt;/code&gt;), Decision Tree based&amp;nbsp;classifier (&lt;code&gt;classifier2-dt.model&lt;/code&gt;).&lt;/p&gt; &lt;p&gt;Alternatively, we created a&amp;nbsp;&lt;a href="http://www.virtualbox.org/"&gt;VirtualBox&lt;/a&gt;&amp;nbsp;image with a pre-configured VisualWorks environment, which includes all the MUCCA components, and pre-requisites (both Smalltalk and Java): &lt;code&gt;MUCCA.ova&lt;/code&gt; (Both user and password are &lt;code&gt;muccauser&lt;/code&gt;).&lt;/p&gt; &lt;p&gt;&amp;nbsp;&lt;/p&gt; &lt;p&gt;&lt;strong&gt;2. Benchmark&lt;/strong&gt;&lt;/p&gt; &lt;p&gt;To train machine-learning classifiers and evaluate the effectiveness of the different approaches, we&amp;nbsp;&lt;strong&gt;manually&lt;/strong&gt;&amp;nbsp;create a benchmark, in which emails are classified at character granularity.&lt;/p&gt; &lt;p&gt;Given the time and effort needed to create such a benchmark, we humbly think it is a valuable contribution to the community. With the help of this benchmark, other researchers can reproduce our experiments and devise new classification methods, which can be immediately compared to ours.&lt;/p&gt; &lt;p&gt;You can download the dataset from the &lt;a href="https://github.com/ilredeitopi/mucca-dataset"&gt;GitHub repository&lt;/a&gt;&amp;nbsp;(a dump of the GitHub repository is uploaded here (&lt;code&gt;benchmark/githubDump.zip&lt;/code&gt;), or download the full database dump in PostgreSQL format (&lt;code&gt;benchmark/benchmarkDump.tar.bz2&lt;/code&gt;).&lt;/p&gt;</dct:description>
    <!-- Access rights are stated twice: once as a reference to the PUBLIC entry of
         the publications.europa.eu access-right authority, and once as an
         info:eu-repo openAccess rights statement labelled "Open Access". -->
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <dct:accessRights>
      <dct:RightsStatement rdf:about="info:eu-repo/semantics/openAccess">
        <rdfs:label>Open Access</rdfs:label>
      </dct:RightsStatement>
    </dct:accessRights>
    <!-- Distribution carrying the licence (CC BY 4.0 legal code URL). Its access
         URL is the dataset DOI. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dct:license rdf:resource="https://creativecommons.org/licenses/by/4.0/legalcode"/>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.1345172"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- Distribution describing the downloadable archive (mucca.zip): access URL,
         size in bytes, direct download URL and media type.
         accessURL and downloadURL are given as resources (rdf:resource), matching
         the licence distribution. A property element that carries rdf:resource
         must be empty in RDF/XML, so the URL is not repeated as text content. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.1345172"/>
        <dcat:byteSize>2915898057</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/1345172/files/mucca.zip"/>
        <dcat:mediaType>application/zip</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
  <!-- Grant referenced by frapo:isFundedBy on the dataset node (same
       info:eu-repo URI). It records the grant number, the project title and the
       awarding organization. -->
  <foaf:Project rdf:about="info:eu-repo/grantAgreement/SNSF/Project+funding/200020_132175/">
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">200020_132175</dct:identifier>
    <dct:title>SOSYA - Systems of Systems Analysis</dct:title>
    <frapo:isAwardedBy>
      <foaf:Organization>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">10.13039/501100001711</dct:identifier>
        <foaf:name>Schweizerischer Nationalfonds zur Förderung der Wissenschaftlichen Forschung</foaf:name>
      </foaf:Organization>
    </frapo:isAwardedBy>
  </foaf:Project>
</rdf:RDF>
110
22
views
downloads
All versions This version
Views 110 110
Downloads 22 22
Data volume 64.1 GB 64.1 GB
Unique views 102 102
Unique downloads 21 21

Share

Cite as