{
  "access": {
    "embargo": {
      "active": false,
      "reason": null
    },
    "files": "public",
    "record": "public",
    "status": "open"
  },
  "created": "2020-06-22T11:47:44.398113+00:00",
  "custom_fields": {},
  "deletion_status": {
    "is_deleted": false,
    "status": "P"
  },
  "files": {
    "count": 5,
    "enabled": true,
    "entries": {
      "Lugli2019_BuddhSktSketchGrammar.txt": {
        "checksum": "md5:c58791f712ac0174c2363523179d12f7",
        "ext": "txt",
        "id": "eebc24c6-8bad-404e-b24d-d0109a59b538",
        "key": "Lugli2019_BuddhSktSketchGrammar.txt",
        "metadata": null,
        "mimetype": "text/plain",
        "size": 15085
      },
      "Lugli_BuddhistSanskritCorpusMetadata2020-06-22.csv": {
        "checksum": "md5:aa2c9f2071623329468796ac90f39ce0",
        "ext": "csv",
        "id": "7287e54c-16ac-4129-aa56-e093869eddd9",
        "key": "Lugli_BuddhistSanskritCorpusMetadata2020-06-22.csv",
        "metadata": null,
        "mimetype": "text/csv",
        "size": 50788
      },
      "Lugli_BuddhistSanskritCorpusSegmented_v1_5.zip": {
        "checksum": "md5:43e8793746f43c4d86be12936c0d0c9c",
        "ext": "zip",
        "id": "245b9b61-e830-43aa-89c8-e416737df2c6",
        "key": "Lugli_BuddhistSanskritCorpusSegmented_v1_5.zip",
        "metadata": null,
        "mimetype": "application/zip",
        "size": 12660562
      },
      "Lugli_BuddhistSanskritCorpusStemmedNormalisedForGramrels_v1_5.zip": {
        "checksum": "md5:e63c00b914b6d5f62db1829ea39d4be5",
        "ext": "zip",
        "id": "fd1e2aaa-d3ce-4849-af4a-57292486a96d",
        "key": "Lugli_BuddhistSanskritCorpusStemmedNormalisedForGramrels_v1_5.zip",
        "metadata": null,
        "mimetype": "application/zip",
        "size": 13768128
      },
      "Lugli_BuddhistSanskritCorpusStemmed_v1_5.zip": {
        "checksum": "md5:5aa6c8d2acabc99cc387a2a6c544514a",
        "ext": "zip",
        "id": "15883a65-ffed-446e-8557-af42c54184e4",
        "key": "Lugli_BuddhistSanskritCorpusStemmed_v1_5.zip",
        "metadata": null,
        "mimetype": "application/zip",
        "size": 13772551
      }
    },
    "order": [],
    "total_bytes": 40267114
  },
  "id": "3903262",
  "is_draft": false,
  "is_published": true,
  "links": {
    "access": "https://zenodo.org/api/records/3903262/access",
    "access_grants": "https://zenodo.org/api/records/3903262/access/grants",
    "access_links": "https://zenodo.org/api/records/3903262/access/links",
    "access_request": "https://zenodo.org/api/records/3903262/access/request",
    "access_users": "https://zenodo.org/api/records/3903262/access/users",
    "archive": "https://zenodo.org/api/records/3903262/files-archive",
    "archive_media": "https://zenodo.org/api/records/3903262/media-files-archive",
    "communities": "https://zenodo.org/api/records/3903262/communities",
    "communities-suggestions": "https://zenodo.org/api/records/3903262/communities-suggestions",
    "doi": "https://doi.org/10.5281/zenodo.3903262",
    "draft": "https://zenodo.org/api/records/3903262/draft",
    "files": "https://zenodo.org/api/records/3903262/files",
    "latest": "https://zenodo.org/api/records/3903262/versions/latest",
    "latest_html": "https://zenodo.org/records/3903262/latest",
    "media_files": "https://zenodo.org/api/records/3903262/media-files",
    "parent": "https://zenodo.org/api/records/3457821",
    "parent_doi": "https://zenodo.org/doi/10.5281/zenodo.3457821",
    "parent_html": "https://zenodo.org/records/3457821",
    "requests": "https://zenodo.org/api/records/3903262/requests",
    "reserve_doi": "https://zenodo.org/api/records/3903262/draft/pids/doi",
    "self": "https://zenodo.org/api/records/3903262",
    "self_doi": "https://zenodo.org/doi/10.5281/zenodo.3903262",
    "self_html": "https://zenodo.org/records/3903262",
    "self_iiif_manifest": "https://zenodo.org/api/iiif/record:3903262/manifest",
    "self_iiif_sequence": "https://zenodo.org/api/iiif/record:3903262/sequence/default",
    "versions": "https://zenodo.org/api/records/3903262/versions"
  },
  "media_files": {
    "count": 0,
    "enabled": false,
    "entries": {},
    "order": [],
    "total_bytes": 0
  },
  "metadata": {
    "additional_descriptions": [
      {
        "description": "Also included: bibliography cum metadata summary and a sketch grammar + corpus configuration file for use in Sketch Engine",
        "type": {
          "id": "notes",
          "title": {
            "de": "Anmerkungen",
            "en": "Notes"
          }
        }
      }
    ],
    "contributors": [
      {
        "person_or_org": {
          "family_name": "Bruno Galasek-Hul",
          "name": "Bruno Galasek-Hul",
          "type": "personal"
        },
        "role": {
          "id": "projectmember",
          "title": {
            "de": "Projektmitglied",
            "en": "Project member"
          }
        }
      }
    ],
    "creators": [
      {
        "affiliations": [
          {
            "name": "Mangalam Research Center"
          }
        ],
        "person_or_org": {
          "family_name": "Ligeia Lugli",
          "identifiers": [
            {
              "identifier": "0000-0003-0473-4290",
              "scheme": "orcid"
            }
          ],
          "name": "Ligeia Lugli",
          "type": "personal"
        }
      }
    ],
    "description": "<p>This is a proof-of-concept Sanskrit corpus developed for the study of Buddhist Sanskrit lexicology.</p>\n\n<p>It comprises:</p>\n\n<ul>\n\t<li>&nbsp;172&nbsp;metadata-enriched Buddhist&nbsp;Sanskrit texts for a total of ~ 5&nbsp;million words. The corpus contains all Mah\u0101y\u0101na and &#39;mainstream&#39; Buddhist based on Sanskrit editions texts available on GRETIL (reconstructed editions based on Tibetan translations have been filtered out).</li>\n</ul>\n\n<p>The corpus is in romanised Sanskrit (UTF-8 encoding) and is available in three&nbsp;configurations:</p>\n\n<ol>\n\t<li>&nbsp;segmented (with dash-separated words)</li>\n\t<li>&nbsp;segmented and stemmed (with capitalised word stem and compounds separated by an @ sign).</li>\n\t<li>segmented, stemmed and normalised (normalisation treats some spelling variation and&nbsp;solves sandhi of stems&#39; initials in most cases), recommended for Word Sketches.</li>\n</ol>\n\n<p>The latter version can be used to generate word sketches&nbsp;in Sketch Engine if used in&nbsp;conjunction with the included sketch grammar, which&nbsp;infers likely syntactic dependencies from morphological cues.</p>\n\n<p>**<em>avagraha</em> has been replaced with <em>a</em>** in the stemmed versions</p>\n\n<p><strong>Limitations</strong><br>\nAs a proof of concept, this corpus suffers from several limitations. It is very small by contemporary standards, it has not been proof-read and it is currently only segmented and stemmed (not lemmatised or PoS tagged).&nbsp;<br>\nA funding bid has been submitted to expand and lemmatise the corpus.</p>\n\n<p><strong>Data Quality</strong><br>\nThe corpus has been segmented with Lugli&#39;s Sanskrit segmenter (10.5281/zenodo.3459215).&nbsp;The accuracy of this segmenter has been evaluated at 97% on a sample of Buddhist Sanskrit literature.</p>\n\n<p>Please refer to the segmenter documentation stored at&nbsp;10.5281/zenodo.3459215 for details on evaluation and stemming conventions.</p>\n\n<p><strong>Acknowledgments</strong><br>\nThe corpus has been realised as part of the project &#39;Lexis and Tradition: variation in the vocabulary of Sanskrit Mah\u0101y\u0101na literature&#39;. This project was&nbsp;funded by the British Academy through a Newton International Fellowship (NF161436) and hosted at the Department of Theology and Religious Studies at King&#39;s College London&nbsp;under the supervision of Prof. Henrietta Kate Crosby.&nbsp;</p>\n\n<p>Dr. Bruno Galasek-Hul has contributed to versions 1.4 &amp; 1.5 thanks to funding from the Mangalam Research Center for Buddhist Languages.</p>\n\n<p>Thanks to GRETIL, Dr. Vinita Tseng and Prof. Steinkellner for kindly giving their&nbsp;permission to include automatically processed versions of some of their editions&nbsp;in this corpus.</p>\n\n<p>&nbsp;</p>\n\n<p><strong>Changelog</strong></p>\n\n<p>version 1.5&nbsp;adds more&nbsp;Buddhist texts, removes the reference corpus&nbsp;and improves segmentation</p>\n\n<p>version 1.4 adds 59 Buddhist texts and fixes some recurrent segmentation errors</p>\n\n<p>version 1.4.1 corrects some spacing and sentence parsing errors</p>",
    "languages": [
      {
        "id": "san",
        "title": {
          "en": "Sanskrit"
        }
      }
    ],
    "publication_date": "2019-09-23",
    "publisher": "Zenodo",
    "resource_type": {
      "id": "other",
      "title": {
        "de": "Sonstige",
        "en": "Other"
      }
    },
    "rights": [
      {
        "description": {
          "en": "The Creative Commons Attribution license allows re-distribution and re-use of a licensed work on the condition that the creator is appropriately credited."
        },
        "icon": "cc-by-icon",
        "id": "cc-by-4.0",
        "props": {
          "scheme": "spdx",
          "url": "https://creativecommons.org/licenses/by/4.0/legalcode"
        },
        "title": {
          "en": "Creative Commons Attribution 4.0 International"
        }
      }
    ],
    "subjects": [
      {
        "subject": "corpus"
      },
      {
        "subject": "Sanskrit"
      },
      {
        "subject": "Buddhist Sanskrit"
      }
    ],
    "title": "segmented Sanskrit corpus (proof of concept)",
    "version": "1.5"
  },
  "parent": {
    "access": {
      "owned_by": {
        "user": "76604"
      }
    },
    "communities": {},
    "id": "3457821",
    "pids": {
      "doi": {
        "client": "datacite",
        "identifier": "10.5281/zenodo.3457821",
        "provider": "datacite"
      }
    }
  },
  "pids": {
    "doi": {
      "client": "datacite",
      "identifier": "10.5281/zenodo.3903262",
      "provider": "datacite"
    },
    "oai": {
      "identifier": "oai:zenodo.org:3903262",
      "provider": "oai"
    }
  },
  "revision_id": 17,
  "stats": {
    "all_versions": {
      "data_volume": 22223992057.0,
      "downloads": 1149,
      "unique_downloads": 892,
      "unique_views": 2001,
      "views": 2173
    },
    "this_version": {
      "data_volume": 522735517.0,
      "downloads": 73,
      "unique_downloads": 58,
      "unique_views": 278,
      "views": 297
    }
  },
  "status": "published",
  "updated": "2023-08-29T15:56:55.906289+00:00",
  "versions": {
    "index": 7,
    "is_latest": false
  }
}