Software Open Access

Word Segmentation in Sanskrit Using Energy Based Models

Amrith Krishna; Bishal Santra; Sasi Prasanth Bandaru; Gaurav Sahu; Vishnu Dutt Sharma; Pavankumar Satuluri; Pawan Goyal


DCAT Export

<?xml version='1.0' encoding='utf-8'?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:dctype="http://purl.org/dc/dcmitype/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:duv="http://www.w3.org/ns/duv#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:frapo="http://purl.org/cerif/frapo/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:gsp="http://www.opengis.net/ont/geosparql#" xmlns:locn="http://www.w3.org/ns/locn#" xmlns:org="http://www.w3.org/ns/org#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <rdf:Description rdf:about="https://doi.org/10.5281/zenodo.1035413">
    <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
    <dct:type rdf:resource="http://purl.org/dc/dcmitype/Software"/>
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://doi.org/10.5281/zenodo.1035413</dct:identifier>
    <foaf:page rdf:resource="https://doi.org/10.5281/zenodo.1035413"/>
    <!-- Creator: a foaf:Agent together with the organization it is a member of. -->
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Amrith Krishna</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>IIT Kharagpur</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Creator: a foaf:Agent together with the organization it is a member of. -->
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Bishal Santra</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>IIT Kharagpur</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Creator: a foaf:Agent together with the organization it is a member of. -->
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Sasi Prasanth Bandaru</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>IIT Kharagpur</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Creator: a foaf:Agent together with the organization it is a member of. -->
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Gaurav Sahu</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>IIT Kharagpur</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Creator: a foaf:Agent together with the organization it is a member of. -->
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Vishnu Dutt Sharma</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>American Express</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Creator: a foaf:Agent together with the organization it is a member of. -->
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Pavankumar Satuluri</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <!-- Affiliation spelling corrected from "Chinmya Visvavidyapeeth". -->
            <foaf:name>Chinmaya Vishwavidyapeeth</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Creator: a foaf:Agent together with the organization it is a member of. -->
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Pawan Goyal</foaf:name>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>IIT Kharagpur</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:title>Word Segmentation in Sanskrit Using Energy Based Models</dct:title>
    <!-- Publisher of the record, given as a blank-node foaf:Agent identified only by name. -->
    <dct:publisher>
      <foaf:Agent>
        <foaf:name>Zenodo</foaf:name>
      </foaf:Agent>
    </dct:publisher>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2018</dct:issued>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2018-08-23</dct:issued>
    <owl:sameAs rdf:resource="https://zenodo.org/record/1035413"/>
    <!-- Alternative identifier: the Zenodo record URL, carried as the skos:notation of an adms:Identifier. -->
    <adms:identifier>
      <adms:Identifier>
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/1035413</skos:notation>
        <!-- The ADMS vocabulary defines adms:schemaAgency; "schemeAgency" is not a term in that namespace. -->
        <adms:schemaAgency>url</adms:schemaAgency>
      </adms:Identifier>
    </adms:identifier>
    <dct:isVersionOf rdf:resource="https://doi.org/10.5281/zenodo.1035412"/>
    <dct:isPartOf rdf:resource="https://zenodo.org/communities/cnerg"/>
    <dct:description>&lt;p&gt;This is the repository for word segmentation in sanskrit using energy based models.&lt;/p&gt; &lt;p&gt;&amp;nbsp;&lt;/p&gt; &lt;p&gt;# Word Segmentation in Sanskrit Using Energy Based Models&lt;br&gt; &lt;br&gt; &amp;nbsp;&lt;br&gt; ## Getting Started&lt;br&gt; &amp;nbsp;&lt;br&gt; Please download the 2 compressed files &amp;#39;dir.zip&amp;#39; and &amp;#39;wordsegmentation.rar&amp;#39; to your working directory and extract them into folders named &amp;#39;dir&amp;#39; and &amp;#39;wordsegmentation&amp;#39; respectively.&lt;br&gt; &amp;nbsp;&lt;br&gt; Your working directory should be as follows&lt;br&gt; * Working Directory&lt;br&gt; &amp;nbsp; * wordsegmentation&lt;br&gt; &amp;nbsp;&amp;nbsp;&amp;nbsp; * skt_dcs_DS.bz2_4K_bigram_mir_10K&lt;br&gt; &amp;nbsp;&amp;nbsp;&amp;nbsp; * skt_dcs_DS.bz2_4K_bigram_mir_heldout&lt;br&gt; &amp;nbsp; * dir&lt;br&gt; &amp;nbsp;&lt;br&gt; ## Prerequisites&lt;br&gt; * Python3&lt;br&gt; &amp;nbsp; * scipy&lt;br&gt; &amp;nbsp; * numpy&lt;br&gt; &amp;nbsp; * csv&lt;br&gt; &amp;nbsp; * pickle&lt;br&gt; &amp;nbsp; * multiprocessing&lt;br&gt; &amp;nbsp; * bz2&lt;br&gt; ## Instructions for Training&lt;br&gt; Change your current directory to &amp;#39;dir&amp;#39;&lt;br&gt; &amp;nbsp;&lt;br&gt; Run the file Train_clique.py by using the following command&lt;br&gt; &amp;nbsp;&lt;br&gt; * python Train_clique.py&lt;br&gt; &amp;nbsp;&lt;br&gt; To train on different input features like BM2,BM3,BR2,BR3,PM2,PM3,PR,PR3 please modify the bz2_input_folder value in the main function before beginning the training.&lt;br&gt; &amp;nbsp;&lt;br&gt; Feature&amp;nbsp; | bz2_input_folder&lt;br&gt; ------------- | -------------&lt;br&gt; BM2 | wordsegmentation/skt_dcs_DS.bz2_4K_bigram_mir_10K/&lt;br&gt; BM3 | wordsegmentation/skt_dcs_DS.bz2_1L_bigram_mir_10K&lt;br&gt; BR2 | wordsegmentation/skt_dcs_DS.bz2_4K_bigram_rfe_10K/&lt;br&gt; BR3 | wordsegmentation/skt_dcs_DS.bz2_1L_bigram_rfe_10K/&lt;br&gt; PM2 | 
wordsegmentation/skt_dcs_DS.bz2_4K_pmi_mir_10K/&lt;br&gt; PM3 | wordsegmentation/skt_dcs_DS.bz2_1L_pmi_mir_10K2/&lt;br&gt; PR2 | wordsegmentation/skt_dcs_DS.bz2_4K_pmi_rfe_10K/&lt;br&gt; PR3 | wordsegmentation/skt_dcs_DS.bz2_1L_pmi_rfe_10K/&lt;br&gt; &amp;nbsp;&lt;br&gt; ## Instructions for Testing&lt;br&gt; &amp;nbsp;&lt;br&gt; After training, please modify the &amp;#39;modelList&amp;#39; dictionary&amp;nbsp; in &amp;#39;test_clique.py&amp;#39; with the name of the neural network that has been saved during training. While testing for a feature, please provide the name of the neural net which was trained for the same feature.&lt;br&gt; &amp;nbsp;&lt;br&gt; We only provide the trained model for the feature BM2 which was our best performing feature. If the name of the neural net is not changed, then the testing will be performed on the pre-trained model for BM2 provided in outputs/train_t7978754709018&lt;br&gt; &amp;nbsp;&lt;br&gt; To test with a particular feature vector use the tag of the feature while execution&lt;br&gt; &amp;nbsp;&lt;br&gt; * python test_clique.py -t &amp;lt;tag&amp;gt;&lt;br&gt; &amp;nbsp;&lt;br&gt; For example: &amp;nbsp;&lt;br&gt; &amp;nbsp; * python test_clique.py -t BM2&lt;br&gt; &amp;nbsp;&lt;br&gt; After finishing the testing please run the following command to see the precision and recall values for both the word and word++ prediction tasks&lt;br&gt; &amp;nbsp;&lt;br&gt; * python evaluate.py &amp;lt;tag&amp;gt;&lt;br&gt; &amp;nbsp;&lt;br&gt; For example: &amp;nbsp;&lt;br&gt; &amp;nbsp; * python evaluate.py BM2&lt;/p&gt;</dct:description>
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <!-- Access rights as a labelled dct:RightsStatement identified by the info:eu-repo openAccess URI. -->
    <dct:accessRights>
      <dct:RightsStatement rdf:about="info:eu-repo/semantics/openAccess">
        <rdfs:label>Open Access</rdfs:label>
      </dct:RightsStatement>
    </dct:accessRights>
    <!-- Distribution that carries the licence; it points at the DOI landing page and has no download URL. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.1035413"/>
        <dct:license rdf:resource="https://creativecommons.org/licenses/by/4.0/legalcode"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- Distribution for the dir.zip archive. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.1035413"/>
        <!-- Typed literal: without rdf:datatype the size is a plain string rather than a number. -->
        <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#nonNegativeInteger">453229783</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/1035413/files/dir.zip"/>
        <dcat:mediaType>application/zip</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- Distribution for the README.md file. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.1035413"/>
        <!-- Typed literal: without rdf:datatype the size is a plain string rather than a number. -->
        <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#nonNegativeInteger">2418</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/1035413/files/README.md"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- Distribution for the wordsegmentation.rar archive. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.1035413"/>
        <!-- Typed literal: without rdf:datatype the size is a plain string rather than a number. -->
        <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#nonNegativeInteger">41733267455</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/1035413/files/wordsegmentation.rar"/>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
</rdf:RDF>
653 views
531 downloads
Metric | All versions | This version
Views | 653 | 654
Downloads | 531 | 531
Data volume | 13.0 TB | 13.0 TB
Unique views | 609 | 610
Unique downloads | 248 | 248

Share

Cite as