There is a newer version of this record available.

Software Open Access

Buddhist Sanskrit Segmenter

Ligeia Lugli


MARC21 XML Export

<?xml version='1.0' encoding='UTF-8'?>
<record xmlns="http://www.loc.gov/MARC21/slim">
  <leader>00000nmm##2200000uu#4500</leader>
  <datafield tag="041" ind1=" " ind2=" ">
    <subfield code="a">eng</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">Buddhist Sanskrit</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">Natural Language Processing</subfield>
  </datafield>
  <controlfield tag="005">20200125072651.0</controlfield>
  <controlfield tag="001">3459219</controlfield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">192850</subfield>
    <subfield code="z">md5:ebeceb54230207b55968c113486c979f</subfield>
    <subfield code="u">https://zenodo.org/record/3459219/files/Lugli_BuddhFoundCorpusNgramsRedux.csv</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">244573</subfield>
    <subfield code="z">md5:432313287a5a2d084ac64b70b14b9a2a</subfield>
    <subfield code="u">https://zenodo.org/record/3459219/files/Lugli_BuddhSktSegmenterLemmatiser2019.R</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">9160483</subfield>
    <subfield code="z">md5:394107767ce92f6be6b64e9c8cec9923</subfield>
    <subfield code="u">https://zenodo.org/record/3459219/files/Lugli_CL2019_BuddhistSanskritSegmenterPresentation.pptx</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">212507</subfield>
    <subfield code="z">md5:37ce1893b1f8a9f0aa55b3f6a850e3f0</subfield>
    <subfield code="u">https://zenodo.org/record/3459219/files/Lugli_FiveTextsSegmentedTokensDFWithCleanFreq.csv</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">473717</subfield>
    <subfield code="z">md5:5e169775fc20db5ea684bb35015fe11a</subfield>
    <subfield code="u">https://zenodo.org/record/3459219/files/Lugli_GretilBuddhRelLit_NgramsRedux.csv</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">311888</subfield>
    <subfield code="z">md5:63a22b8b1d18c08c6b12506d90c3fc16</subfield>
    <subfield code="u">https://zenodo.org/record/3459219/files/Lugli_GretilBuddhSastraSastra_NgramsRedux.csv</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">3037610</subfield>
    <subfield code="z">md5:baee76cc1ec672d92cdb8deb6ba52a51</subfield>
    <subfield code="u">https://zenodo.org/record/3459219/files/Lugli_NonStemmedWordlist.csv</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">75297</subfield>
    <subfield code="z">md5:56a7ab6ba81ceac3954c38c5ad6a7525</subfield>
    <subfield code="u">https://zenodo.org/record/3459219/files/Lugli_Segmenter_Eva_AllGoldSent.csv</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">34650</subfield>
    <subfield code="z">md5:4aa0a4e3672b4ce21e927a7420b07e5f</subfield>
    <subfield code="u">https://zenodo.org/record/3459219/files/Lugli_SegmenterEva_RawOneSentencePerLine.zip</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">1105346</subfield>
    <subfield code="z">md5:d64b2a4b9e10dd9e45e95d4f2f701648</subfield>
    <subfield code="u">https://zenodo.org/record/3459219/files/Lugli_WordlistNoA_June2019.csv</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">62929</subfield>
    <subfield code="z">md5:a48598508f02a794ee6fd021c937962c</subfield>
    <subfield code="u">https://zenodo.org/record/3459219/files/Lugli_Wordlist_ReadMe.html</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">25325</subfield>
    <subfield code="z">md5:5d507c0ac8219998e5150944db8461e5</subfield>
    <subfield code="u">https://zenodo.org/record/3459219/files/Seq2Seq_segmentertest-full-vocabulary_GeoffroyNoel.txt</subfield>
  </datafield>
  <datafield tag="542" ind1=" " ind2=" ">
    <subfield code="l">open</subfield>
  </datafield>
  <datafield tag="260" ind1=" " ind2=" ">
    <subfield code="c">2019-09-24</subfield>
  </datafield>
  <datafield tag="909" ind1="C" ind2="O">
    <subfield code="p">software</subfield>
    <subfield code="o">oai:zenodo.org:3459219</subfield>
  </datafield>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="u">King's College London</subfield>
    <subfield code="0">(orcid)0000-0003-0473-4290</subfield>
    <subfield code="a">Ligeia Lugli</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">Buddhist Sanskrit Segmenter</subfield>
  </datafield>
  <datafield tag="540" ind1=" " ind2=" ">
    <subfield code="u">https://creativecommons.org/licenses/by/4.0/legalcode</subfield>
    <subfield code="a">Creative Commons Attribution 4.0 International</subfield>
  </datafield>
  <datafield tag="650" ind1="1" ind2="7">
    <subfield code="a">cc-by</subfield>
    <subfield code="2">opendefinition.org</subfield>
  </datafield>
  <datafield tag="520" ind1=" " ind2=" ">
    <subfield code="a">&lt;p&gt;This folder contains R code for a rule-based Buddhist Sanskrit Segmenter and Lemmatiser, as well as data necessary to use and evaluate the Segmenter and explanatory materials.&lt;/p&gt;

&lt;p&gt;The segmenter has been tested on&amp;nbsp;639 sentences from 13 Buddhist texts (9 sūtras, 4 śāstras) and has been evaluated as achieving 97% accuracy.&lt;/p&gt;

&lt;p&gt;The code and materials contained in this folder have been developed as part of a Newton&amp;nbsp;International Fellowship at King&amp;#39;s College London, funded by the British Academy (NF161436).&lt;/p&gt;

&lt;p&gt;&amp;nbsp;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Contents&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;R code for segmentation, lemmatisation and evaluation (includes instructions to run code)&lt;/p&gt;

&lt;p&gt;powerpoint presentation with background and explanation of project&lt;/p&gt;

&lt;p&gt;Wordlists and Wordlists documentation&lt;/p&gt;

&lt;p&gt;ngrams and stems frequency tables necessary for segmentation&lt;/p&gt;

&lt;p&gt;gold standard set of manually segmented and stemmed sentences for evaluation&lt;/p&gt;

&lt;p&gt;set of raw sentences for evaluation&lt;/p&gt;

&lt;p&gt;evaluation of&amp;nbsp;Krishna et al. seq2seq segmenter on Buddhist sentences for reference purposes&lt;/p&gt;

&lt;p&gt;&amp;nbsp;&lt;/p&gt;

&lt;p&gt;This segmenter has been used to prepare the Sanskrit Corpus at DOI&amp;nbsp;10.5281/zenodo.3457822&lt;/p&gt;</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="n">doi</subfield>
    <subfield code="i">isVersionOf</subfield>
    <subfield code="a">10.5281/zenodo.3459218</subfield>
  </datafield>
  <datafield tag="024" ind1=" " ind2=" ">
    <subfield code="a">10.5281/zenodo.3459219</subfield>
    <subfield code="2">doi</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">software</subfield>
  </datafield>
</record>
111 views
557 downloads
| Metric | All versions | This version |
| Views | 111 | 33 |
| Downloads | 557 | 463 |
| Data volume | 577.2 MB | 383.3 MB |
| Unique views | 101 | 30 |
| Unique downloads | 448 | 406 |

Share

Cite as