{ "access": { "embargo": { "active": false, "reason": null }, "files": "public", "record": "public", "status": "open" }, "created": "2022-01-15T09:03:00.937617+00:00", "custom_fields": {}, "deletion_status": { "is_deleted": false, "status": "P" }, "files": { "count": 10, "enabled": true, "entries": { "HeLI.class": { "checksum": "md5:95657280ee492a6ab4844eeb4454a5c0", "ext": "bin", "id": "9a7797de-8058-476b-9af0-1d2c750e95b4", "key": "HeLI.class", "metadata": null, "mimetype": "application/octet-stream", "size": 13674 }, "HeLI.jar": { "checksum": "md5:8537531e1e6f74f67a58fcfc0ac302e3", "ext": "jar", "id": "bd610b67-b404-40d8-972d-d0383799f010", "key": "HeLI.jar", "metadata": null, "mimetype": "application/octet-stream", "size": 44050741 }, "HeLI.java": { "checksum": "md5:c71b0f3cd044bf424908d905fa0a7a97", "ext": "java", "id": "8aaa1922-4ea6-4dbe-aa67-8c5fd352b847", "key": "HeLI.java", "metadata": null, "mimetype": "application/octet-stream", "size": 22452 }, "HeLI.mf": { "checksum": "md5:bb91c0c41fd40f3fb8a7c4f98c9a7c87", "ext": "mf", "id": "e91396d1-7aab-4e7a-a98d-707a75fec69b", "key": "HeLI.mf", "metadata": null, "mimetype": "application/octet-stream", "size": 39 }, "LICENSE": { "checksum": "md5:bb0ae3b700049fd806e2a043e01265d6", "ext": "bin", "id": "bacfac4f-0a2d-453f-8536-1482d41bb598", "key": "LICENSE", "metadata": null, "mimetype": "application/octet-stream", "size": 11419 }, "LanguageModels.zip": { "checksum": "md5:efd3371472a6b3a93133773a6c09d87b", "ext": "zip", "id": "475d4f41-9fbd-401f-a3a4-d4ecc51e9415", "key": "LanguageModels.zip", "metadata": null, "mimetype": "application/zip", "size": 44132999 }, "README.md": { "checksum": "md5:e6f06930e25726624e53eb7a901e0874", "ext": "md", "id": "19084e7a-169b-43b0-95ee-3b18793cc0ed", "key": "README.md", "metadata": null, "mimetype": "application/octet-stream", "size": 2734 }, "languagelist": { "checksum": "md5:f44bcfe8a8a8108095b6bc35cea8e31d", "ext": "bin", "id": "7c196c9a-6136-41de-9f67-57a36e9b78b6", "key": 
"languagelist", "metadata": null, "mimetype": "application/octet-stream", "size": 884 }, "run_HeLI.py": { "checksum": "md5:fa3de39cf2e93085759e3f97cb9f4d0f", "ext": "py", "id": "210f4679-6459-4829-be2d-ddbf03e3dfd0", "key": "run_HeLI.py", "metadata": null, "mimetype": "text/x-python", "size": 1003 }, "supporting_functions.py": { "checksum": "md5:5d551dcb80653aaac5ecebae98842826", "ext": "py", "id": "b026ad0e-b923-41e5-acaf-7562089a69e8", "key": "supporting_functions.py", "metadata": null, "mimetype": "text/x-python", "size": 745 } }, "order": [], "total_bytes": 88236690 }, "id": "5853116", "is_draft": false, "is_published": true, "links": { "access": "https://zenodo.org/api/records/5853116/access", "access_links": "https://zenodo.org/api/records/5853116/access/links", "access_request": "https://zenodo.org/api/records/5853116/access/request", "access_users": "https://zenodo.org/api/records/5853116/access/users", "archive": "https://zenodo.org/api/records/5853116/files-archive", "archive_media": "https://zenodo.org/api/records/5853116/media-files-archive", "communities": "https://zenodo.org/api/records/5853116/communities", "communities-suggestions": "https://zenodo.org/api/records/5853116/communities-suggestions", "doi": "https://doi.org/10.5281/zenodo.5853116", "draft": "https://zenodo.org/api/records/5853116/draft", "files": "https://zenodo.org/api/records/5853116/files", "latest": "https://zenodo.org/api/records/5853116/versions/latest", "latest_html": "https://zenodo.org/records/5853116/latest", "media_files": "https://zenodo.org/api/records/5853116/media-files", "parent": "https://zenodo.org/api/records/4780897", "parent_doi": "https://zenodo.org/doi/10.5281/zenodo.4780897", "parent_html": "https://zenodo.org/records/4780897", "requests": "https://zenodo.org/api/records/5853116/requests", "reserve_doi": "https://zenodo.org/api/records/5853116/draft/pids/doi", "self": "https://zenodo.org/api/records/5853116", "self_doi": 
"https://zenodo.org/doi/10.5281/zenodo.5853116", "self_html": "https://zenodo.org/records/5853116", "self_iiif_manifest": "https://zenodo.org/api/iiif/record:5853116/manifest", "self_iiif_sequence": "https://zenodo.org/api/iiif/record:5853116/sequence/default", "versions": "https://zenodo.org/api/records/5853116/versions" }, "media_files": { "count": 0, "enabled": false, "entries": {}, "order": [], "total_bytes": 0 }, "metadata": { "creators": [ { "affiliations": [ { "name": "University of Helsinki" } ], "person_or_org": { "family_name": "Jauhiainen", "given_name": "Tommi", "identifiers": [ { "identifier": "0000-0002-6474-3570", "scheme": "orcid" } ], "name": "Jauhiainen, Tommi", "type": "personal" } }, { "affiliations": [ { "name": "University of Helsinki" } ], "person_or_org": { "family_name": "Jauhiainen", "given_name": "Heidi", "identifiers": [ { "identifier": "0000-0002-8227-5627", "scheme": "orcid" } ], "name": "Jauhiainen, Heidi", "type": "personal" } } ], "description": "
HeLI off-the-shelf language identifier with language models for 200 languages.
\n\nUsage:
\njava -jar HeLI.jar -r <infile> -w <outfile>
The program will read the <infile> and classify the language of each line as one of the 200 languages it knows
\nand write the results, one ISO 639-3 code per line, into the file <outfile>.
You can use the -c option to make the program print a confidence score for the identification after each language code.
\n\nUsage:
\njava -jar HeLI.jar -c -r <infile> -w <outfile>
You can give a comma-separated list of ISO 639-3 identifiers for the relevant languages after the -l option.
\n\nUsage:
\njava -jar HeLI.jar -r <infile> -w <outfile> -l fin,swe,eng
You can give the number of top-scoring languages to print after the -t option (this overrides the confidence option).
\n\nUsage:
\njava -jar HeLI.jar -r <infile> -w <outfile> -l fin,swe,eng -t 2
If you omit both of the filenames, the program will read the standard input one line at a time and write the result to standard output.
\n\nIt can identify c. 3000 sentences per second using one core on a 2021 laptop and around 3 gigabytes of memory.
\n\nIf you use this program in producing scientific publications, please refer to:
\n @inproceedings{jauhiainen-etal-2017-evaluation,
\n title = "Evaluation of language identification methods using 285 languages",
\n author = "Jauhiainen, Tommi and
\n Lind{\\'e}n, Krister and
\n Jauhiainen, Heidi",
\n booktitle = "Proceedings of the 21st Nordic Conference on Computational Linguistics",
\n month = may,
\n year = "2017",
\n address = "Gothenburg, Sweden",
\n publisher = "Association for Computational Linguistics",
\n url = "https://www.aclweb.org/anthology/W17-0221",
\n pages = "183--191",
\n }
Producing and publishing this software has been partly supported by the Finnish Research Impact Foundation's Tandem Industry Academia funding, in cooperation with Lingsoft.
", "languages": [ { "id": "eng", "title": { "en": "English" } } ], "publication_date": "2022-01-15", "publisher": "Zenodo", "references": [ { "reference": "Jauhiainen, Tommi et al. (2017). Evaluation of language identification methods using 285 languages. https://www.aclweb.org/anthology/W17-0221" } ], "resource_type": { "id": "software", "title": { "de": "Software", "en": "Software" } }, "rights": [ { "description": { "en": "The Creative Commons Attribution license allows re-distribution and re-use of a licensed work on the condition that the creator is appropriately credited." }, "icon": "cc-by-icon", "id": "cc-by-4.0", "props": { "scheme": "spdx", "url": "https://creativecommons.org/licenses/by/4.0/legalcode" }, "title": { "en": "Creative Commons Attribution 4.0 International" } } ], "subjects": [ { "subject": "language identification" } ], "title": "HeLI-OTS 1.2 with Python examples", "version": "1.2" }, "parent": { "access": { "owned_by": { "user": 189271 }, "settings": { "accept_conditions_text": null, "allow_guest_requests": false, "allow_user_requests": false, "secret_link_expiration": 0 } }, "communities": {}, "id": "4780897", "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.4780897", "provider": "datacite" } } }, "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.5853116", "provider": "datacite" }, "oai": { "identifier": "oai:zenodo.org:5853116", "provider": "oai" } }, "revision_id": 7, "stats": { "all_versions": { "data_volume": 32616597257.0, "downloads": 1337, "unique_downloads": 606, "unique_views": 6161, "views": 6513 }, "this_version": { "data_volume": 661482678.0, "downloads": 88, "unique_downloads": 45, "unique_views": 156, "views": 190 } }, "status": "published", "updated": "2022-09-10T09:57:39.580496+00:00", "versions": { "index": 4, "is_latest": false } }