Report Open Access

Big Data Analysis and Machine Learning at Scale with Oracle Cloud Infrastructure

Michał Bień


DCAT Export

<?xml version='1.0' encoding='utf-8'?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:dctype="http://purl.org/dc/dcmitype/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:duv="http://www.w3.org/ns/duv#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:frapo="http://purl.org/cerif/frapo/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:gsp="http://www.opengis.net/ont/geosparql#" xmlns:locn="http://www.w3.org/ns/locn#" xmlns:org="http://www.w3.org/ns/org#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <rdf:Description rdf:about="https://doi.org/10.5281/zenodo.3550777">
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://doi.org/10.5281/zenodo.3550777</dct:identifier>
    <foaf:page rdf:resource="https://doi.org/10.5281/zenodo.3550777"/>
    <!-- Author of the record, written as a typed foaf:Agent node
         (equivalent to rdf:Description plus an explicit rdf:type). -->
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Michał Bień</foaf:name>
      </foaf:Agent>
    </dct:creator>
    <dct:title>Big Data Analysis and Machine Learning at Scale with Oracle Cloud Infrastructure</dct:title>
    <dct:publisher>
      <foaf:Agent>
        <foaf:name>Zenodo</foaf:name>
      </foaf:Agent>
    </dct:publisher>
    <!-- The issue date is stated twice: once as a year-only xsd:gYear value
         and once, after the keywords, as a full xsd:date value. -->
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2019</dct:issued>
    <!-- Free-text keywords attached to the record. -->
    <dcat:keyword>CERN openlab</dcat:keyword>
    <dcat:keyword>summer student programme</dcat:keyword>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-11-22</dct:issued>
    <owl:sameAs rdf:resource="https://zenodo.org/record/3550777"/>
    <adms:identifier>
      <adms:Identifier>
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/3550777</skos:notation>
        <adms:schemeAgency>url</adms:schemeAgency>
      </adms:Identifier>
    </adms:identifier>
    <dct:isVersionOf rdf:resource="https://doi.org/10.5281/zenodo.3550776"/>
    <dct:isPartOf rdf:resource="https://zenodo.org/communities/cernopenlab"/>
    <dct:description>&lt;p&gt;This work has successfully deployed two different use cases of interest for High Energy Physics&amp;nbsp;&lt;br&gt; using cloud resources:&amp;nbsp;&lt;br&gt;  CMS Big data reduction: This use case consists in running a data reduction workloads for&amp;nbsp;&lt;br&gt; physics data. The code and implementation has originally been developed by CERN openlab&amp;nbsp;&lt;br&gt; in collaboration with CMS and Intel in 2017-2018. It aims at demonstrating the scalability of a&amp;nbsp;&lt;br&gt; data reduction workflow, by processing ROOT files using Apache Spark&amp;nbsp;&lt;br&gt;  Spark DL Trigger: This use case consists in the deployment of a full data preparation and&amp;nbsp;&lt;br&gt; machine learning pipeline, starting from data ingestion (4.5 TB of ROOT data), to the training&amp;nbsp;&lt;br&gt; of classifier using neural networks. This use case is implemented using Apache Spark and&amp;nbsp;&lt;br&gt; the Keras API, following previous work in collaboration with CERN openlab.&amp;nbsp;&lt;br&gt; Resources for this work have been deployed using Oracle Cloud Infrastructure (OCI). In particular&amp;nbsp;&lt;br&gt; this project has allowed to complete:&amp;nbsp;&lt;br&gt;  Setup of the project using Oracle Container Engine for Kubernetes and Oracle Cloud&amp;nbsp;&lt;br&gt; resources&amp;nbsp;&lt;br&gt;  Troubleshooting of the oci-hdfs-connector to run Apache Spark at scale on OCI Object&amp;nbsp;&lt;br&gt; Storage&amp;nbsp;&lt;br&gt;  Measurements of OCI Object Storage performance for the selected use cases&amp;nbsp;&lt;br&gt;  Investigations and performance measurements of the resource utilisation on Oracle&amp;nbsp;&lt;br&gt; Container Engine for Kubernetes (OKE), when running the TensorFlow/Keras neural network&amp;nbsp;&lt;br&gt; model training at scale, using CPU resources, and when using GPU.&amp;nbsp;&lt;br&gt; Notable results of this project:&amp;nbsp;&lt;br&gt;  Produced several key improvements to the oci-hdfs-connector. 
The improvements are&amp;nbsp;&lt;br&gt; necessary to run the latest Spark version (Spark 2.4.x) on Oracle Cloud. The connector is&amp;nbsp;&lt;br&gt; distributed by Oracle with open source licensing, and the improvements will be fed back to&amp;nbsp;&lt;br&gt; Oracle.&amp;nbsp;&lt;br&gt;  Improved instrumentation infrastructure for measuring Spark workloads on cloud resources,&amp;nbsp;&lt;br&gt; by streamlining the deployment of Spark performance dashboard on Kubernetes and&amp;nbsp;&lt;br&gt; developing a Helm chart&amp;nbsp;&lt;br&gt;  Produced a solution for direct measurement of I/O latency for Spark workloads reading from&amp;nbsp;&lt;br&gt; OCI or S3 storage. The results are of general interest for Spark users, notably including the&amp;nbsp;&lt;br&gt; Spark service at CERN&amp;nbsp;&lt;br&gt;  Developed methods to parallelize TensorFlow/Keras on Kubernetes using TensorFlow 2.0&amp;nbsp;&lt;br&gt; new tf.distribute features. These are of general interest for ML practitioners, notably including&amp;nbsp;&lt;br&gt; the users of CERN cloud services.&lt;/p&gt;</dct:description>
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <dct:accessRights>
      <dct:RightsStatement rdf:about="info:eu-repo/semantics/openAccess">
        <rdfs:label>Open Access</rdfs:label>
      </dct:RightsStatement>
    </dct:accessRights>
    <dct:license rdf:resource="https://creativecommons.org/licenses/by/4.0/legalcode"/>
    <!-- Downloadable file for this record. In RDF/XML a property element
         that carries rdf:resource must be empty, so each URL is given only
         as the attribute value and not repeated as text content. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.3550777"/>
        <dcat:byteSize>1943637</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/3550777/files/Report_Michal_Bien.pdf"/>
        <dcat:mediaType>application/pdf</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
</rdf:RDF>
385 views
469 downloads
                   All versions   This version
Views              385            384
Downloads          469            469
Data volume        911.6 MB       911.6 MB
Unique views       359            358
Unique downloads   446            446

Share

Cite as