Software Open Access

Buddhist Sanskrit Segmenter

Ligeia Lugli


JSON Export

{
  "files": [
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli2019_HorizontalNormalizer.R"
      }, 
      "checksum": "md5:9e2098321870bda7a82c0fc314449795", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Lugli2019_HorizontalNormalizer.R", 
      "type": "r", 
      "size": 4895
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_BuddhFoundCorpusNgramsRedux.csv"
      }, 
      "checksum": "md5:ebeceb54230207b55968c113486c979f", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Lugli_BuddhFoundCorpusNgramsRedux.csv", 
      "type": "csv", 
      "size": 192850
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_BuddhSktSegmenterLemmatiser2019.R"
      }, 
      "checksum": "md5:432313287a5a2d084ac64b70b14b9a2a", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Lugli_BuddhSktSegmenterLemmatiser2019.R", 
      "type": "r", 
      "size": 244573
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_CL2019_BuddhistSanskritSegmenterPresentation.pptx"
      }, 
      "checksum": "md5:394107767ce92f6be6b64e9c8cec9923", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Lugli_CL2019_BuddhistSanskritSegmenterPresentation.pptx", 
      "type": "pptx", 
      "size": 9160483
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_FiveTextsSegmentedTokensDFWithCleanFreq.csv"
      }, 
      "checksum": "md5:37ce1893b1f8a9f0aa55b3f6a850e3f0", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Lugli_FiveTextsSegmentedTokensDFWithCleanFreq.csv", 
      "type": "csv", 
      "size": 212507
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_GretilBuddhRelLit_NgramsRedux.csv"
      }, 
      "checksum": "md5:5e169775fc20db5ea684bb35015fe11a", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Lugli_GretilBuddhRelLit_NgramsRedux.csv", 
      "type": "csv", 
      "size": 473717
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_GretilBuddhSastraSastra_NgramsRedux.csv"
      }, 
      "checksum": "md5:63a22b8b1d18c08c6b12506d90c3fc16", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Lugli_GretilBuddhSastraSastra_NgramsRedux.csv", 
      "type": "csv", 
      "size": 311888
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_NonStemmedWordlist.csv"
      }, 
      "checksum": "md5:baee76cc1ec672d92cdb8deb6ba52a51", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Lugli_NonStemmedWordlist.csv", 
      "type": "csv", 
      "size": 3037610
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_Segmenter_Eva_AllGoldSent.csv"
      }, 
      "checksum": "md5:56a7ab6ba81ceac3954c38c5ad6a7525", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Lugli_Segmenter_Eva_AllGoldSent.csv", 
      "type": "csv", 
      "size": 75297
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_SegmenterEva_RawOneSentencePerLine.zip"
      }, 
      "checksum": "md5:4aa0a4e3672b4ce21e927a7420b07e5f", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Lugli_SegmenterEva_RawOneSentencePerLine.zip", 
      "type": "zip", 
      "size": 34650
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_StemmedWordlist.csv"
      }, 
      "checksum": "md5:dc9996dd5b97530e194f64add6f913e1", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Lugli_StemmedWordlist.csv", 
      "type": "csv", 
      "size": 1001881
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_WordlistNoA_June2019.csv"
      }, 
      "checksum": "md5:d64b2a4b9e10dd9e45e95d4f2f701648", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Lugli_WordlistNoA_June2019.csv", 
      "type": "csv", 
      "size": 1105346
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_Wordlist_ReadMe.html"
      }, 
      "checksum": "md5:a48598508f02a794ee6fd021c937962c", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Lugli_Wordlist_ReadMe.html", 
      "type": "html", 
      "size": 62929
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_WordlistWithStemmedAndNotStemmedLemmata.csv"
      }, 
      "checksum": "md5:fb8504f60355452420e87d2f9953fa1c", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Lugli_WordlistWithStemmedAndNotStemmedLemmata.csv", 
      "type": "csv", 
      "size": 4541833
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Seq2Seq_segmentertest-full-vocabulary_GeoffroyNoel.txt"
      }, 
      "checksum": "md5:5d507c0ac8219998e5150944db8461e5", 
      "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", 
      "key": "Seq2Seq_segmentertest-full-vocabulary_GeoffroyNoel.txt", 
      "type": "txt", 
      "size": 25325
    }
  ], 
  "owners": [
    76604
  ], 
  "doi": "10.5281/zenodo.3526469", 
  "stats": {
    "version_unique_downloads": 445.0, 
    "unique_views": 70.0, 
    "views": 76.0, 
    "version_views": 109.0, 
    "unique_downloads": 38.0, 
    "version_unique_views": 99.0, 
    "volume": 164610483.0, 
    "version_downloads": 549.0, 
    "downloads": 85.0, 
    "version_volume": 548771019.0
  }, 
  "links": {
    "doi": "https://doi.org/10.5281/zenodo.3526469", 
    "conceptdoi": "https://doi.org/10.5281/zenodo.3459218", 
    "bucket": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b", 
    "conceptbadge": "https://zenodo.org/badge/doi/10.5281/zenodo.3459218.svg", 
    "html": "https://zenodo.org/record/3526469", 
    "latest_html": "https://zenodo.org/record/3526469", 
    "badge": "https://zenodo.org/badge/doi/10.5281/zenodo.3526469.svg", 
    "latest": "https://zenodo.org/api/records/3526469"
  }, 
  "conceptdoi": "10.5281/zenodo.3459218", 
  "created": "2019-11-03T13:03:39.117702+00:00", 
  "updated": "2020-01-25T07:26:51.833768+00:00", 
  "conceptrecid": "3459218", 
  "revision": 2, 
  "id": 3526469, 
  "metadata": {
    "access_right_category": "success", 
    "doi": "10.5281/zenodo.3526469", 
    "description": "<p>This folder contains R code for a rule-based Buddhist Sanskrit Segmenter and Lemmatiser, as well as data necessary to use and evaluate the Segmenter and explanatory materials.</p>\n\n<p>The segmenter has been tested on&nbsp;639 sentences from 13 Buddhist text (9 s\u016btras, 4 \u015b\u0101stra) and has been evaluated as achieving 97% accuracy.</p>\n\n<p>The code and materials contained in this folder have been developed as part of a Newton&nbsp;International Fellowship at King&#39;s College London, funded by the British Academy (NF161436)</p>\n\n<p>&nbsp;</p>\n\n<p><strong>Contents</strong></p>\n\n<p>R code for segmentation, lemmatisation, normalization and evaluation (includes instructions to run code)</p>\n\n<p>powerpoint presentation with background and explanation of project</p>\n\n<p>Wordlists and Wordlists documentation</p>\n\n<p>ngrams and stems frequency tables necessary for segmentation</p>\n\n<p>gold standard set of manually segmented and stemmed sentences for evaluation</p>\n\n<p>set of raw sentences for evaluation</p>\n\n<p>evaluation of&nbsp;Krisha et al. seq2seq segmenter on Buddhist sentences for reference purposes</p>\n\n<p>&nbsp;</p>\n\n<p>This segmenter has been used to prepare the Sanskrit Corpus at DOI&nbsp;10.5281/zenodo.3457822 and&nbsp; its later version at 10.5281/zenodo.3526035</p>", 
    "language": "eng", 
    "title": "Buddhist Sanskrit Segmenter", 
    "license": {
      "id": "CC-BY-4.0"
    }, 
    "relations": {
      "version": [
        {
          "count": 2, 
          "index": 1, 
          "parent": {
            "pid_type": "recid", 
            "pid_value": "3459218"
          }, 
          "is_last": true, 
          "last_child": {
            "pid_type": "recid", 
            "pid_value": "3526469"
          }
        }
      ]
    }, 
    "version": "1", 
    "keywords": [
      "Buddhist Sanskrit", 
      "Natural Language Processing"
    ], 
    "publication_date": "2019-09-24", 
    "creators": [
      {
        "orcid": "0000-0003-0473-4290", 
        "affiliation": "King's College London", 
        "name": "Ligeia Lugli"
      }
    ], 
    "access_right": "open", 
    "resource_type": {
      "type": "software", 
      "title": "Software"
    }, 
    "related_identifiers": [
      {
        "scheme": "doi", 
        "identifier": "10.5281/zenodo.3459218", 
        "relation": "isVersionOf"
      }
    ]
  }
}
109 views
549 downloads
| | All versions | This version |
| Views | 109 | 76 |
| Downloads | 549 | 85 |
| Data volume | 548.8 MB | 164.6 MB |
| Unique views | 99 | 70 |
| Unique downloads | 445 | 38 |

Share

Cite as