Software Open Access
{ "files": [ { "links": { "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli2019_HorizontalNormalizer.R" }, "checksum": "md5:9e2098321870bda7a82c0fc314449795", "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Lugli2019_HorizontalNormalizer.R", "type": "r", "size": 4895 }, { "links": { "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_BuddhFoundCorpusNgramsRedux.csv" }, "checksum": "md5:ebeceb54230207b55968c113486c979f", "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Lugli_BuddhFoundCorpusNgramsRedux.csv", "type": "csv", "size": 192850 }, { "links": { "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_BuddhSktSegmenterLemmatiser2019.R" }, "checksum": "md5:432313287a5a2d084ac64b70b14b9a2a", "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Lugli_BuddhSktSegmenterLemmatiser2019.R", "type": "r", "size": 244573 }, { "links": { "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_CL2019_BuddhistSanskritSegmenterPresentation.pptx" }, "checksum": "md5:394107767ce92f6be6b64e9c8cec9923", "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Lugli_CL2019_BuddhistSanskritSegmenterPresentation.pptx", "type": "pptx", "size": 9160483 }, { "links": { "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_FiveTextsSegmentedTokensDFWithCleanFreq.csv" }, "checksum": "md5:37ce1893b1f8a9f0aa55b3f6a850e3f0", "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Lugli_FiveTextsSegmentedTokensDFWithCleanFreq.csv", "type": "csv", "size": 212507 }, { "links": { "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_GretilBuddhRelLit_NgramsRedux.csv" }, "checksum": "md5:5e169775fc20db5ea684bb35015fe11a", "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Lugli_GretilBuddhRelLit_NgramsRedux.csv", "type": "csv", "size": 473717 }, { "links": { "self": 
"https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_GretilBuddhSastraSastra_NgramsRedux.csv" }, "checksum": "md5:63a22b8b1d18c08c6b12506d90c3fc16", "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Lugli_GretilBuddhSastraSastra_NgramsRedux.csv", "type": "csv", "size": 311888 }, { "links": { "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_NonStemmedWordlist.csv" }, "checksum": "md5:baee76cc1ec672d92cdb8deb6ba52a51", "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Lugli_NonStemmedWordlist.csv", "type": "csv", "size": 3037610 }, { "links": { "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_Segmenter_Eva_AllGoldSent.csv" }, "checksum": "md5:56a7ab6ba81ceac3954c38c5ad6a7525", "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Lugli_Segmenter_Eva_AllGoldSent.csv", "type": "csv", "size": 75297 }, { "links": { "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_SegmenterEva_RawOneSentencePerLine.zip" }, "checksum": "md5:4aa0a4e3672b4ce21e927a7420b07e5f", "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Lugli_SegmenterEva_RawOneSentencePerLine.zip", "type": "zip", "size": 34650 }, { "links": { "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_StemmedWordlist.csv" }, "checksum": "md5:dc9996dd5b97530e194f64add6f913e1", "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Lugli_StemmedWordlist.csv", "type": "csv", "size": 1001881 }, { "links": { "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_WordlistNoA_June2019.csv" }, "checksum": "md5:d64b2a4b9e10dd9e45e95d4f2f701648", "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Lugli_WordlistNoA_June2019.csv", "type": "csv", "size": 1105346 }, { "links": { "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_Wordlist_ReadMe.html" }, "checksum": "md5:a48598508f02a794ee6fd021c937962c", 
"bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Lugli_Wordlist_ReadMe.html", "type": "html", "size": 62929 }, { "links": { "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Lugli_WordlistWithStemmedAndNotStemmedLemmata.csv" }, "checksum": "md5:fb8504f60355452420e87d2f9953fa1c", "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Lugli_WordlistWithStemmedAndNotStemmedLemmata.csv", "type": "csv", "size": 4541833 }, { "links": { "self": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b/Seq2Seq_segmentertest-full-vocabulary_GeoffroyNoel.txt" }, "checksum": "md5:5d507c0ac8219998e5150944db8461e5", "bucket": "b1808411-27f9-46c8-a353-94b6cd53fe2b", "key": "Seq2Seq_segmentertest-full-vocabulary_GeoffroyNoel.txt", "type": "txt", "size": 25325 } ], "owners": [ 76604 ], "doi": "10.5281/zenodo.3526469", "stats": { "version_unique_downloads": 445.0, "unique_views": 70.0, "views": 76.0, "version_views": 109.0, "unique_downloads": 38.0, "version_unique_views": 99.0, "volume": 164610483.0, "version_downloads": 549.0, "downloads": 85.0, "version_volume": 548771019.0 }, "links": { "doi": "https://doi.org/10.5281/zenodo.3526469", "conceptdoi": "https://doi.org/10.5281/zenodo.3459218", "bucket": "https://zenodo.org/api/files/b1808411-27f9-46c8-a353-94b6cd53fe2b", "conceptbadge": "https://zenodo.org/badge/doi/10.5281/zenodo.3459218.svg", "html": "https://zenodo.org/record/3526469", "latest_html": "https://zenodo.org/record/3526469", "badge": "https://zenodo.org/badge/doi/10.5281/zenodo.3526469.svg", "latest": "https://zenodo.org/api/records/3526469" }, "conceptdoi": "10.5281/zenodo.3459218", "created": "2019-11-03T13:03:39.117702+00:00", "updated": "2020-01-25T07:26:51.833768+00:00", "conceptrecid": "3459218", "revision": 2, "id": 3526469, "metadata": { "access_right_category": "success", "doi": "10.5281/zenodo.3526469", "description": "<p>This folder contains R code for a rule-based Buddhist Sanskrit Segmenter and Lemmatiser, as 
well as data necessary to use and evaluate the Segmenter and explanatory materials.</p>\n\n<p>The segmenter has been tested on 639 sentences from 13 Buddhist texts (9 s\u016btras, 4 \u015b\u0101stras) and has been evaluated as achieving 97% accuracy.</p>\n\n<p>The code and materials contained in this folder have been developed as part of a Newton International Fellowship at King's College London, funded by the British Academy (NF161436)</p>\n\n<p> </p>\n\n<p><strong>Contents</strong></p>\n\n<p>R code for segmentation, lemmatisation, normalization and evaluation (includes instructions to run code)</p>\n\n<p>powerpoint presentation with background and explanation of project</p>\n\n<p>Wordlists and Wordlists documentation</p>\n\n<p>ngrams and stems frequency tables necessary for segmentation</p>\n\n<p>gold standard set of manually segmented and stemmed sentences for evaluation</p>\n\n<p>set of raw sentences for evaluation</p>\n\n<p>evaluation of Krishna et al. seq2seq segmenter on Buddhist sentences for reference purposes</p>\n\n<p> </p>\n\n<p>This segmenter has been used to prepare the Sanskrit Corpus at DOI 10.5281/zenodo.3457822 and its later version at 10.5281/zenodo.3526035</p>", "language": "eng", "title": "Buddhist Sanskrit Segmenter", "license": { "id": "CC-BY-4.0" }, "relations": { "version": [ { "count": 2, "index": 1, "parent": { "pid_type": "recid", "pid_value": "3459218" }, "is_last": true, "last_child": { "pid_type": "recid", "pid_value": "3526469" } } ] }, "version": "1", "keywords": [ "Buddhist Sanskrit", "Natural Language Processing" ], "publication_date": "2019-09-24", "creators": [ { "orcid": "0000-0003-0473-4290", "affiliation": "King's College London", "name": "Ligeia Lugli" } ], "access_right": "open", "resource_type": { "type": "software", "title": "Software" }, "related_identifiers": [ { "scheme": "doi", "identifier": "10.5281/zenodo.3459218", "relation": "isVersionOf" } ] } }
| | All versions | This version |
---|---|---|
Views | 109 | 76 |
Downloads | 549 | 85 |
Data volume | 548.8 MB | 164.6 MB |
Unique views | 99 | 70 |
Unique downloads | 445 | 38 |