Other Open Access

segmented Sanskrit corpus (proof of concept)

Ligeia Lugli


DCAT Export

<?xml version='1.0' encoding='utf-8'?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:dctype="http://purl.org/dc/dcmitype/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:duv="http://www.w3.org/ns/duv#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:frapo="http://purl.org/cerif/frapo/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:gsp="http://www.opengis.net/ont/geosparql#" xmlns:locn="http://www.w3.org/ns/locn#" xmlns:org="http://www.w3.org/ns/org#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <rdf:Description rdf:about="https://doi.org/10.5281/zenodo.3903262">
    <!-- The DOI URL of this record, stored as an xsd:anyURI-typed literal identifier. -->
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://doi.org/10.5281/zenodo.3903262</dct:identifier>
    <!-- Landing page: the same DOI URL that is used as the subject IRI of this description. -->
    <foaf:page rdf:resource="https://doi.org/10.5281/zenodo.3903262"/>
    <!-- Creator: a foaf:Agent named by its ORCID IRI, with the bare ORCID iD as a string
         identifier and an anonymous organisation node for the affiliation. -->
    <dct:creator>
      <foaf:Agent rdf:about="http://orcid.org/0000-0003-0473-4290">
        <foaf:name>Ligeia Lugli</foaf:name>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0003-0473-4290</dct:identifier>
        <org:memberOf>
          <rdf:Description>
            <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Organization"/>
            <foaf:name>Mangalam Research Center</foaf:name>
          </rdf:Description>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Title of the record, followed by its publisher as an anonymous foaf:Agent node. -->
    <dct:title>segmented Sanskrit corpus (proof of concept)</dct:title>
    <dct:publisher>
      <rdf:Description>
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <foaf:name>Zenodo</foaf:name>
      </rdf:Description>
    </dct:publisher>
    <!-- Issue date is stated twice at different precision: once as a year, once as a full date. -->
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2019</dct:issued>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-09-23</dct:issued>
    <!-- Free-text keywords, one dcat:keyword statement per term. -->
    <dcat:keyword>corpus</dcat:keyword>
    <dcat:keyword>Sanskrit</dcat:keyword>
    <dcat:keyword>Buddhist Sanskrit</dcat:keyword>
    <!-- Sanskrit. The EU Publications Office language authority table keys its entries by
         three-letter codes, so the concept IRI ends in SAN rather than the two-letter "SA". -->
    <dct:language rdf:resource="http://publications.europa.eu/resource/authority/language/SAN"/>
    <!-- The Zenodo record URL is declared equivalent to the DOI-identified resource. -->
    <owl:sameAs rdf:resource="https://zenodo.org/record/3903262"/>
    <!-- The same record URL again, as a structured ADMS identifier (notation plus scheme agency). -->
    <adms:identifier>
      <rdf:Description>
        <rdf:type rdf:resource="http://www.w3.org/ns/adms#Identifier"/>
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/3903262</skos:notation>
        <adms:schemeAgency>url</adms:schemeAgency>
      </rdf:Description>
    </adms:identifier>
    <!-- Versioning: link to the DOI this record is a version of, and this record's version label. -->
    <dct:isVersionOf rdf:resource="https://doi.org/10.5281/zenodo.3457821"/>
    <owl:versionInfo>1.5</owl:versionInfo>
    <dct:description>&lt;p&gt;This is a proof-of-concept Sanskrit corpus developed for the study of Buddhist Sanskrit lexicology.&lt;/p&gt; &lt;p&gt;It comprises:&lt;/p&gt; &lt;ul&gt; &lt;li&gt;&amp;nbsp;172&amp;nbsp;metadata-enriched Buddhist&amp;nbsp;Sanskrit texts for a total of ~ 5&amp;nbsp;million words. The corpus contains all Mahāyāna and &amp;#39;mainstream&amp;#39; Buddhist based on Sanskrit editions texts available on GRETIL (reconstructed editions based on Tibetan translations have been filtered out).&lt;/li&gt; &lt;/ul&gt; &lt;p&gt;The corpus is in romanised Sanskrit (UTF-8 encoding) and is available in three&amp;nbsp;configurations:&lt;/p&gt; &lt;ol&gt; &lt;li&gt;&amp;nbsp;segmented (with dash-separated words)&lt;/li&gt; &lt;li&gt;&amp;nbsp;segmented and stemmed (with capitalised word stem and compounds separated by an @ sign).&lt;/li&gt; &lt;li&gt;segmented, stemmed and normalised (normalisation treats some spelling variation and&amp;nbsp;solves sandhi of stems&amp;#39; initials in most cases), recommended for Word Sketches.&lt;/li&gt; &lt;/ol&gt; &lt;p&gt;The latter version can be used to generate word sketches&amp;nbsp;in Sketch Engine if used in&amp;nbsp;conjunction with the included sketch grammar, which&amp;nbsp;infers likely syntactic dependencies from morphological cues.&lt;/p&gt; &lt;p&gt;**&lt;em&gt;avagraha&lt;/em&gt; has been replaced with &lt;em&gt;a&lt;/em&gt;** in the stemmed versions&lt;/p&gt; &lt;p&gt;&lt;strong&gt;Limitations&lt;/strong&gt;&lt;br&gt; As a proof of concept, this corpus suffers from several limitations. 
It is very small by contemporary standards, it has not been proof-read and it is currently only segmented and stemmed (not lemmatised or PoS tagged).&amp;nbsp;&lt;br&gt; A funding bid has been submitted to expand and lemmatise the corpus.&lt;/p&gt; &lt;p&gt;&lt;strong&gt;Data Quality&lt;/strong&gt;&lt;br&gt; The corpus has been segmented with Lugli&amp;#39;s Sanskrit segmenter (10.5281/zenodo.3459215).&amp;nbsp;The accuracy of this segmenter has been evaluated at 97% on a sample of Buddhist Sanskrit literature.&lt;/p&gt; &lt;p&gt;Please refer to the segmenter documentation stored at&amp;nbsp;10.5281/zenodo.3459215 for details on evaluation and stemming conventions.&lt;/p&gt; &lt;p&gt;&lt;strong&gt;Acknowledgments&lt;/strong&gt;&lt;br&gt; The corpus has been realised as part of the project &amp;#39;Lexis and Tradition: variation in the vocabulary of Sanskrit Mahāyāna literature&amp;#39;. This project was&amp;nbsp;funded by the British Academy through a Newton International Fellowship (NF161436) and hosted at the Department of Theology and Religious Studies at King&amp;#39;s College London&amp;nbsp;under the supervision of Prof. Henrietta Kate Crosby.&amp;nbsp;&lt;/p&gt; &lt;p&gt;Dr. Bruno Galasek-Hul has contributed to versions 1.4 &amp;amp; 1.5 thanks to funding from the Mangalam Research Center for Buddhist Languages.&lt;/p&gt; &lt;p&gt;Thanks to GRETIL, Dr. Vinita Tseng and Prof. Steinkellner for kindly giving their&amp;nbsp;permission to include automatically processed versions of some of their editions&amp;nbsp;in this corpus.&lt;/p&gt; &lt;p&gt;&amp;nbsp;&lt;/p&gt; &lt;p&gt;&lt;strong&gt;Changelog&lt;/strong&gt;&lt;/p&gt; &lt;p&gt;version 1.5&amp;nbsp;adds more&amp;nbsp;Buddhist texts, removes the reference corpus&amp;nbsp;and improves segmentation&lt;/p&gt; &lt;p&gt;version 1.4 adds 59 Buddhist texts and fixes some recurrent segmentation errors&lt;/p&gt; &lt;p&gt;version 1.4.1 corrects some spacing and sentence parsing errors&lt;/p&gt;</dct:description>
    <!-- Supplementary description listing the auxiliary files shipped with the corpus. -->
    <dct:description>Also included: bibliography cum metadata summary and a sketch grammar + corpus configuration file for use in Sketch Engine</dct:description>
    <!-- Access rights are given twice: as an EU authority-table concept and as a labelled
         rights statement identified by an info:eu-repo IRI. -->
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <dct:accessRights>
      <rdf:Description rdf:about="info:eu-repo/semantics/openAccess">
        <rdf:type rdf:resource="http://purl.org/dc/terms/RightsStatement"/>
        <rdfs:label>Open Access</rdfs:label>
      </rdf:Description>
    </dct:accessRights>
    <!-- Licence under which the record is released. -->
    <dct:license rdf:resource="https://creativecommons.org/licenses/by/4.0/legalcode"/>
    <!-- Distribution: the sketch grammar text file. -->
    <dcat:distribution>
      <dcat:Distribution>
        <!-- DCAT defines accessURL and downloadURL as resource-valued properties, so the
             targets are written as IRIs via rdf:resource instead of plain string literals. -->
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.3903262"/>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/3903262/files/Lugli2019_BuddhSktSketchGrammar.txt"/>
        <!-- File size, given an explicit XSD datatype so consumers read it as a number. -->
        <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#nonNegativeInteger">15085</dcat:byteSize>
        <dcat:mediaType>text/plain</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- Distribution: the corpus metadata table (CSV). -->
    <dcat:distribution>
      <dcat:Distribution>
        <!-- DCAT defines accessURL and downloadURL as resource-valued properties, so the
             targets are written as IRIs via rdf:resource instead of plain string literals. -->
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.3903262"/>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/3903262/files/Lugli_BuddhistSanskritCorpusMetadata2020-06-22.csv"/>
        <!-- File size, given an explicit XSD datatype so consumers read it as a number. -->
        <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#nonNegativeInteger">50788</dcat:byteSize>
        <dcat:mediaType>text/csv</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- Distribution: the segmented corpus archive (v1.5). -->
    <dcat:distribution>
      <dcat:Distribution>
        <!-- DCAT defines accessURL and downloadURL as resource-valued properties, so the
             targets are written as IRIs via rdf:resource instead of plain string literals. -->
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.3903262"/>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/3903262/files/Lugli_BuddhistSanskritCorpusSegmented_v1_5.zip"/>
        <!-- File size, given an explicit XSD datatype so consumers read it as a number. -->
        <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#nonNegativeInteger">12660562</dcat:byteSize>
        <dcat:mediaType>application/zip</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- Distribution: the stemmed and normalised corpus archive (v1.5). -->
    <dcat:distribution>
      <dcat:Distribution>
        <!-- DCAT defines accessURL and downloadURL as resource-valued properties, so the
             targets are written as IRIs via rdf:resource instead of plain string literals. -->
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.3903262"/>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/3903262/files/Lugli_BuddhistSanskritCorpusStemmedNormalisedForGramrels_v1_5.zip"/>
        <!-- File size, given an explicit XSD datatype so consumers read it as a number. -->
        <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#nonNegativeInteger">13768128</dcat:byteSize>
        <dcat:mediaType>application/zip</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- Distribution: the stemmed corpus archive (v1.5). -->
    <dcat:distribution>
      <dcat:Distribution>
        <!-- DCAT defines accessURL and downloadURL as resource-valued properties, so the
             targets are written as IRIs via rdf:resource instead of plain string literals. -->
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.3903262"/>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/3903262/files/Lugli_BuddhistSanskritCorpusStemmed_v1_5.zip"/>
        <!-- File size, given an explicit XSD datatype so consumers read it as a number. -->
        <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#nonNegativeInteger">13772551</dcat:byteSize>
        <dcat:mediaType>application/zip</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
</rdf:RDF>
240 views
125 downloads
Statistic | All versions | This version
Views | 240 | 86
Downloads | 125 | 26
Data volume | 683.7 MB | 147.6 MB
Unique views | 216 | 81
Unique downloads | 53 | 14

Share

Cite as