{ "access": { "embargo": { "active": false, "reason": null }, "files": "public", "record": "public", "status": "open" }, "created": "2022-02-15T08:18:36.885189+00:00", "custom_fields": {}, "deletion_status": { "is_deleted": false, "status": "P" }, "files": { "count": 11, "enabled": true, "entries": { "HeLI.class": { "checksum": "md5:4fc85597dbbcf42273557a46f3cf874d", "ext": "bin", "id": "09063781-e821-4f4a-a4f9-baeb21249958", "key": "HeLI.class", "metadata": null, "mimetype": "application/octet-stream", "size": 14493 }, "HeLI.jar": { "checksum": "md5:bcf692e57c129230daee02542f2497a5", "ext": "jar", "id": "e637f40e-ac79-48c9-b048-7a87a0680621", "key": "HeLI.jar", "metadata": null, "mimetype": "application/octet-stream", "size": 44040040 }, "HeLI.java": { "checksum": "md5:30544a34f93cc4e29adc5e93464962bb", "ext": "java", "id": "36f6d051-87e4-45e4-b284-df99650a257d", "key": "HeLI.java", "metadata": null, "mimetype": "application/octet-stream", "size": 24983 }, "HeLI.mf": { "checksum": "md5:9116726e14375f413b758c1eeedcffba", "ext": "mf", "id": "833f365f-8ab0-4769-8c34-fb9ffc87a601", "key": "HeLI.mf", "metadata": null, "mimetype": "application/octet-stream", "size": 39 }, "LICENSE": { "checksum": "md5:bb0ae3b700049fd806e2a043e01265d6", "ext": "bin", "id": "aed071d5-080c-4960-bcf1-8647892578f9", "key": "LICENSE", "metadata": null, "mimetype": "application/octet-stream", "size": 11419 }, "LanguageModels.zip": { "checksum": "md5:804ea9827831d6eceb0b1713ab117923", "ext": "zip", "id": "6b9915eb-0bd1-49ba-bafe-0a8d055f6c37", "key": "LanguageModels.zip", "metadata": null, "mimetype": "application/zip", "size": 44121507 }, "README.md": { "checksum": "md5:6e061777b81257a4acf821ebf891a8ce", "ext": "md", "id": "30c099ad-befc-4656-ae82-cf492d45549d", "key": "README.md", "metadata": null, "mimetype": "application/octet-stream", "size": 2734 }, "createmodels.java": { "checksum": "md5:6812296c1a1397e778b674e66327feaf", "ext": "java", "id": "64413959-f0ad-48a0-ba7e-788f5ada0591", 
"key": "createmodels.java", "metadata": null, "mimetype": "application/octet-stream", "size": 6672 }, "languagelist": { "checksum": "md5:f44bcfe8a8a8108095b6bc35cea8e31d", "ext": "bin", "id": "be213046-422f-4d73-bb4c-49e388297499", "key": "languagelist", "metadata": null, "mimetype": "application/octet-stream", "size": 884 }, "run_HeLI.py": { "checksum": "md5:fa3de39cf2e93085759e3f97cb9f4d0f", "ext": "py", "id": "2eec9724-0c9a-40f4-88a4-c811f95a41fa", "key": "run_HeLI.py", "metadata": null, "mimetype": "text/x-python", "size": 1003 }, "supporting_functions.py": { "checksum": "md5:5d551dcb80653aaac5ecebae98842826", "ext": "py", "id": "7c9f8f44-3e8f-467d-abfa-bfb0f14eb65e", "key": "supporting_functions.py", "metadata": null, "mimetype": "text/x-python", "size": 745 } }, "order": [], "total_bytes": 88224519 }, "id": "6077089", "is_draft": false, "is_published": true, "links": { "access": "https://zenodo.org/api/records/6077089/access", "access_links": "https://zenodo.org/api/records/6077089/access/links", "access_request": "https://zenodo.org/api/records/6077089/access/request", "access_users": "https://zenodo.org/api/records/6077089/access/users", "archive": "https://zenodo.org/api/records/6077089/files-archive", "archive_media": "https://zenodo.org/api/records/6077089/media-files-archive", "communities": "https://zenodo.org/api/records/6077089/communities", "communities-suggestions": "https://zenodo.org/api/records/6077089/communities-suggestions", "doi": "https://doi.org/10.5281/zenodo.6077089", "draft": "https://zenodo.org/api/records/6077089/draft", "files": "https://zenodo.org/api/records/6077089/files", "latest": "https://zenodo.org/api/records/6077089/versions/latest", "latest_html": "https://zenodo.org/records/6077089/latest", "media_files": "https://zenodo.org/api/records/6077089/media-files", "parent": "https://zenodo.org/api/records/4780897", "parent_doi": "https://zenodo.org/doi/10.5281/zenodo.4780897", "parent_html": "https://zenodo.org/records/4780897", 
"requests": "https://zenodo.org/api/records/6077089/requests", "reserve_doi": "https://zenodo.org/api/records/6077089/draft/pids/doi", "self": "https://zenodo.org/api/records/6077089", "self_doi": "https://zenodo.org/doi/10.5281/zenodo.6077089", "self_html": "https://zenodo.org/records/6077089", "self_iiif_manifest": "https://zenodo.org/api/iiif/record:6077089/manifest", "self_iiif_sequence": "https://zenodo.org/api/iiif/record:6077089/sequence/default", "versions": "https://zenodo.org/api/records/6077089/versions" }, "media_files": { "count": 0, "enabled": false, "entries": {}, "order": [], "total_bytes": 0 }, "metadata": { "creators": [ { "affiliations": [ { "name": "University of Helsinki" } ], "person_or_org": { "family_name": "Jauhiainen", "given_name": "Tommi", "identifiers": [ { "identifier": "0000-0002-6474-3570", "scheme": "orcid" } ], "name": "Jauhiainen, Tommi", "type": "personal" } }, { "affiliations": [ { "name": "University of Helsinki" } ], "person_or_org": { "family_name": "Jauhiainen", "given_name": "Heidi", "identifiers": [ { "identifier": "0000-0002-8227-5627", "scheme": "orcid" } ], "name": "Jauhiainen, Heidi", "type": "personal" } } ], "description": "
HeLI off-the-shelf language identifier with language models for 200 languages.
\n\nUsage:
\njava -jar HeLI.jar -r <infile> -w <outfile>
The program will read the <infile> and classify the language of each line as one of the 200 languages it knows
\nand write the results, one ISO 639-3 code per line, into file <outfile>.
You can use the -c option to make the program print a confidence score for the identification after each language code.
\n\nUsage:
\njava -jar HeLI.jar -c -r <infile> -w <outfile>
You can give a list of comma-separated ISO 639-3 identifiers for the relevant languages after the -l option.
\n\nUsage:
\njava -jar HeLI.jar -r <infile> -w <outfile> -l fin,swe,eng
You can give the number of top-scored languages to print after the -t option (this overrides the confidence option).
\n\nUsage:
\njava -jar HeLI.jar -r <infile> -w <outfile> -l fin,swe,eng -t 2
If you omit both of the filenames, the program will read the standard input one line at a time and write the result to standard output.
\n\nIt can identify c. 3000 sentences per second using one core on a 2021 laptop and around 3 gigabytes of memory.
\n\nIf you use this program in producing scientific publications, please refer to:
\n @inproceedings{jauhiainen-etal-2017-evaluation,
\n title = "Evaluation of language identification methods using 285 languages",
\n author = "Jauhiainen, Tommi and
\n Lind{\\'e}n, Krister and
\n Jauhiainen, Heidi",
\n booktitle = "Proceedings of the 21st Nordic Conference on Computational Linguistics",
\n month = may,
\n year = "2017",
\n address = "Gothenburg, Sweden",
\n publisher = "Association for Computational Linguistics",
\n url = "https://www.aclweb.org/anthology/W17-0221",
\n pages = "183--191",
\n }
Producing and publishing this software has been partly supported by the Finnish Research Impact Foundation's Tandem Industry Academia funding, in cooperation with Lingsoft.
", "languages": [ { "id": "eng", "title": { "en": "English" } } ], "publication_date": "2022-02-15", "publisher": "Zenodo", "references": [ { "reference": "Jauhiainen, Tommi et al. (2017). Evaluation of language identification methods using 285 languages. https://www.aclweb.org/anthology/W17-0221" } ], "resource_type": { "id": "software", "title": { "de": "Software", "en": "Software" } }, "rights": [ { "description": { "en": "The Creative Commons Attribution license allows re-distribution and re-use of a licensed work on the condition that the creator is appropriately credited." }, "icon": "cc-by-icon", "id": "cc-by-4.0", "props": { "scheme": "spdx", "url": "https://creativecommons.org/licenses/by/4.0/legalcode" }, "title": { "en": "Creative Commons Attribution 4.0 International" } } ], "subjects": [ { "subject": "language identification" } ], "title": "HeLI-OTS 1.3", "version": "1.3" }, "parent": { "access": { "owned_by": { "user": 189271 }, "settings": { "accept_conditions_text": null, "allow_guest_requests": false, "allow_user_requests": false, "secret_link_expiration": 0 } }, "communities": {}, "id": "4780897", "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.4780897", "provider": "datacite" } } }, "pids": { "doi": { "client": "datacite", "identifier": "10.5281/zenodo.6077089", "provider": "datacite" }, "oai": { "identifier": "oai:zenodo.org:6077089", "provider": "oai" } }, "revision_id": 3, "stats": { "all_versions": { "data_volume": 32616597257.0, "downloads": 1337, "unique_downloads": 606, "unique_views": 6149, "views": 6501 }, "this_version": { "data_volume": 3216288198.0, "downloads": 170, "unique_downloads": 92, "unique_views": 1352, "views": 1401 } }, "status": "published", "updated": "2022-09-10T09:57:39.878021+00:00", "versions": { "index": 6, "is_latest": false } }