There is a newer version of this record available.

Software Open Access

HeLI-OTS 1.2 with Python examples

Jauhiainen, Tommi; Jauhiainen, Heidi


MARC21 XML Export

<?xml version='1.0' encoding='UTF-8'?>
<record xmlns="http://www.loc.gov/MARC21/slim">
  <leader>00000nmm##2200000uu#4500</leader>
  <datafield tag="999" ind1="C" ind2="5">
    <subfield code="x">Jauhiainen, Tommi et al. (2017). Evaluation of language identification methods using 285 languages. https://www.aclweb.org/anthology/W17-0221</subfield>
  </datafield>
  <datafield tag="041" ind1=" " ind2=" ">
    <subfield code="a">eng</subfield>
  </datafield>
  <datafield tag="653" ind1=" " ind2=" ">
    <subfield code="a">language identification</subfield>
  </datafield>
  <controlfield tag="005">20220215081838.0</controlfield>
  <controlfield tag="001">5853116</controlfield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="u">University of Helsinki</subfield>
    <subfield code="0">(orcid)0000-0002-8227-5627</subfield>
    <subfield code="a">Jauhiainen, Heidi</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">13674</subfield>
    <subfield code="z">md5:95657280ee492a6ab4844eeb4454a5c0</subfield>
    <subfield code="u">https://zenodo.org/record/5853116/files/HeLI.class</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">44050741</subfield>
    <subfield code="z">md5:8537531e1e6f74f67a58fcfc0ac302e3</subfield>
    <subfield code="u">https://zenodo.org/record/5853116/files/HeLI.jar</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">22452</subfield>
    <subfield code="z">md5:c71b0f3cd044bf424908d905fa0a7a97</subfield>
    <subfield code="u">https://zenodo.org/record/5853116/files/HeLI.java</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">39</subfield>
    <subfield code="z">md5:bb91c0c41fd40f3fb8a7c4f98c9a7c87</subfield>
    <subfield code="u">https://zenodo.org/record/5853116/files/HeLI.mf</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">884</subfield>
    <subfield code="z">md5:f44bcfe8a8a8108095b6bc35cea8e31d</subfield>
    <subfield code="u">https://zenodo.org/record/5853116/files/languagelist</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">44132999</subfield>
    <subfield code="z">md5:efd3371472a6b3a93133773a6c09d87b</subfield>
    <subfield code="u">https://zenodo.org/record/5853116/files/LanguageModels.zip</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">11419</subfield>
    <subfield code="z">md5:bb0ae3b700049fd806e2a043e01265d6</subfield>
    <subfield code="u">https://zenodo.org/record/5853116/files/LICENSE</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">2734</subfield>
    <subfield code="z">md5:e6f06930e25726624e53eb7a901e0874</subfield>
    <subfield code="u">https://zenodo.org/record/5853116/files/README.md</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">1003</subfield>
    <subfield code="z">md5:fa3de39cf2e93085759e3f97cb9f4d0f</subfield>
    <subfield code="u">https://zenodo.org/record/5853116/files/run_HeLI.py</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="s">745</subfield>
    <subfield code="z">md5:5d551dcb80653aaac5ecebae98842826</subfield>
    <subfield code="u">https://zenodo.org/record/5853116/files/supporting_functions.py</subfield>
  </datafield>
  <datafield tag="542" ind1=" " ind2=" ">
    <subfield code="l">open</subfield>
  </datafield>
  <datafield tag="260" ind1=" " ind2=" ">
    <subfield code="c">2022-01-15</subfield>
  </datafield>
  <datafield tag="909" ind1="C" ind2="O">
    <subfield code="p">software</subfield>
    <subfield code="o">oai:zenodo.org:5853116</subfield>
  </datafield>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="u">University of Helsinki</subfield>
    <subfield code="0">(orcid)0000-0002-6474-3570</subfield>
    <subfield code="a">Jauhiainen, Tommi</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">HeLI-OTS 1.2 with Python examples</subfield>
  </datafield>
  <datafield tag="540" ind1=" " ind2=" ">
    <subfield code="u">https://creativecommons.org/licenses/by/4.0/legalcode</subfield>
    <subfield code="a">Creative Commons Attribution 4.0 International</subfield>
  </datafield>
  <datafield tag="650" ind1="1" ind2="7">
    <subfield code="a">cc-by</subfield>
    <subfield code="2">opendefinition.org</subfield>
  </datafield>
  <datafield tag="520" ind1=" " ind2=" ">
    <subfield code="a">&lt;p&gt;HeLI off-the-shelf language identifier with language models for 200 languages.&lt;/p&gt;

&lt;p&gt;Usage:&lt;br&gt;
java -jar HeLI.jar -r &amp;lt;infile&amp;gt; -w &amp;lt;outfile&amp;gt;&lt;/p&gt;

&lt;p&gt;The program will read the &amp;lt;infile&amp;gt; and classify the language of each line as one of the 200 languages it knows&lt;br&gt;
and writes the results, one ISO 639-3 code per line, into file &amp;lt;outfile&amp;gt;.&lt;/p&gt;

&lt;p&gt;You can use the -c option to make the program print a confidence score for the identification after each language code.&lt;/p&gt;

&lt;p&gt;Usage:&lt;br&gt;
java -jar HeLI.jar -c -r &amp;lt;infile&amp;gt; -w &amp;lt;outfile&amp;gt;&lt;/p&gt;

&lt;p&gt;You can give the list of comma-separated ISO 639-3 identifiers for relevant languages after -l option.&lt;/p&gt;

&lt;p&gt;Usage:&lt;br&gt;
java -jar HeLI.jar -r &amp;lt;infile&amp;gt; -w &amp;lt;outfile&amp;gt; -l fin,swe,eng&lt;/p&gt;

&lt;p&gt;You can give the number of top-scored languages to print after the -t option. (overrides confidence)&lt;/p&gt;

&lt;p&gt;Usage:&lt;br&gt;
java -jar HeLI.jar -r &amp;lt;infile&amp;gt; -w &amp;lt;outfile&amp;gt; -l fin,swe,eng -t 2&lt;/p&gt;

&lt;p&gt;If you omit both of the filenames, the program will read the standard input one line at a time and write the result to standard output.&lt;/p&gt;

&lt;p&gt;It can identify c. 3000 sentences per second using one core on a 2021 laptop and around 3 gigabytes of memory.&lt;/p&gt;

&lt;p&gt;If you use this program in producing scientific publications, please refer to:&amp;nbsp;&lt;br&gt;
&amp;nbsp;@inproceedings{jauhiainen-etal-2017-evaluation,&lt;br&gt;
&amp;nbsp; &amp;nbsp; &amp;nbsp;title = &amp;quot;Evaluation of language identification methods using 285 languages&amp;quot;,&lt;br&gt;
&amp;nbsp; &amp;nbsp; &amp;nbsp;author = &amp;quot;Jauhiainen, Tommi &amp;nbsp;and&lt;br&gt;
&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;Lind{\&amp;#39;e}n, Krister &amp;nbsp;and&lt;br&gt;
&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;Jauhiainen, Heidi&amp;quot;,&lt;br&gt;
&amp;nbsp; &amp;nbsp; &amp;nbsp;booktitle = &amp;quot;Proceedings of the 21st Nordic Conference on Computational Linguistics&amp;quot;,&lt;br&gt;
&amp;nbsp; &amp;nbsp; &amp;nbsp;month = may,&lt;br&gt;
&amp;nbsp; &amp;nbsp; &amp;nbsp;year = &amp;quot;2017&amp;quot;,&lt;br&gt;
&amp;nbsp; &amp;nbsp; &amp;nbsp;address = &amp;quot;Gothenburg, Sweden&amp;quot;,&lt;br&gt;
&amp;nbsp; &amp;nbsp; &amp;nbsp;publisher = &amp;quot;Association for Computational Linguistics&amp;quot;,&lt;br&gt;
&amp;nbsp; &amp;nbsp; &amp;nbsp;url = &amp;quot;https://www.aclweb.org/anthology/W17-0221&amp;quot;,&lt;br&gt;
&amp;nbsp; &amp;nbsp; &amp;nbsp;pages = &amp;quot;183--191&amp;quot;,&lt;br&gt;
&amp;nbsp;}&lt;/p&gt;

&lt;p&gt;Producing and publishing this software has been partly supported by The Finnish Research Impact Foundation Tandem Industry Academia -funding in cooperation with Lingsoft.&lt;/p&gt;</subfield>
  </datafield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="n">doi</subfield>
    <subfield code="i">isVersionOf</subfield>
    <subfield code="a">10.5281/zenodo.4780897</subfield>
  </datafield>
  <datafield tag="024" ind1=" " ind2=" ">
    <subfield code="a">10.5281/zenodo.5853116</subfield>
    <subfield code="2">doi</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">software</subfield>
  </datafield>
</record>
972 views
236 downloads
All versions This version
Views 972 122
Downloads 236 58
Data volume 3.0 GB 573.2 MB
Unique views 724 98
Unique downloads 114 24

Share

Cite as