/*
Copyright 2020 Tommi Jauhiainen
Copyright 2022 University of Helsinki
Copyright 2022 Heidi Jauhiainen

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
 HeLI-OTS 1.4.
 
 If you use this program in producing scientific publications, please refer to:
 
 @inproceedings{heliots2022,
	 title = "{H}e{LI-OTS}, Off-the-shelf Language Identifier for Text",
	 author = "Jauhiainen, Tommi  and
	   Jauhiainen, Heidi  and
	   Lind{\'e}n, Krister",
	 booktitle = "Proceedings of the 13th Conference on Language Resources and Evaluation",
	 month = june,
	 year = "2022",
	 address = "Marseille, France",
	 publisher = "European Language Resources Association",
	 url = "http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.416.pdf",
	 pages = "3912--3922",
	 language = "English",
 }
 
 Producing and publishing this software has been partly supported by The Finnish Research Impact Foundation Tandem Industry Academia -funding in cooperation with Lingsoft.
 */

import java.io.*;
import java.util.*;

/**
 * Command-line tool that converts plain-text language models into two
 * Java-serialized dictionaries.
 *
 * <p>The tool reads the classpath resource {@code languagelist} (one language
 * code per line). For every language code it loads the resources
 * {@code LanguageModels/<code>.LowGramModel1} .. {@code LowGramModel6} and
 * {@code LanguageModels/<code>.LowWordModel}, and finally writes
 * {@code gramdict.ser} and {@code worddict.ser} to the current working
 * directory. Both output files contain a
 * {@code TreeMap<feature, TreeMap<languageCode, score>>}, where the score is
 * {@code -log10(count / sum of kept counts)} for that language and model.
 */
class ConvertModels {

    /** Character n-gram -> (language code -> score); created in {@link #main}. */
    private static TreeMap<String, TreeMap<String, Float>> gramDict;
    /** Space-padded word -> (language code -> score); created in {@link #main}. */
    private static TreeMap<String, TreeMap<String, Float>> wordDict;
    /** Language codes read from the {@code languagelist} resource. */
    private static final List<String> languageList = new ArrayList<String>();

// The following values are the ones used in Jauhiainen et al. 2017.

    // Relative-frequency cut-offs: a feature is kept only while
    // count / total (first line of the model file) is greater than the cut-off.
    private static final float usedmonos = (float) 0.0000005;
    private static final float usedbis = (float) 0.0000005;
    private static final float usedtris = (float) 0.0000005;
    private static final float usedquads = (float) 0.0000005;
    private static final float usedcinqs = (float) 0.0000005;
    private static final float usedsexts = (float) 0.0000005;
    private static final float usedwords = (float) 0.0000005;

    // NOTE(review): penaltyValue and maxNgram are not read anywhere in this
    // class; they are kept untouched in case they mirror settings used elsewhere.
    private static float penaltyValue = (float) 7.0;

    private static int maxNgram = 6;

    /**
     * Reads the language list, loads all models and writes both dictionaries.
     *
     * @param args ignored
     * @throws FileNotFoundException if the {@code languagelist} resource is missing
     * @throws IOException declared for backward compatibility
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {

        InputStream in = ConvertModels.class.getResourceAsStream("languagelist");
        if (in == null) {
            // getResourceAsStream signals a missing resource with null; fail with
            // a clear message instead of a NullPointerException further down.
            throw new FileNotFoundException("resource 'languagelist' not found on the classpath");
        }

        // Decode explicitly so the result does not depend on the platform default
        // charset of older JVMs.
        // NOTE(review): assumes the resource files are UTF-8 encoded - confirm.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8))) {
            String text;
            while ((text = reader.readLine()) != null) {
                String languageCode = text.trim();
                // A blank line would otherwise be treated as a language code.
                if (!languageCode.isEmpty()) {
                    languageList.add(languageCode);
                }
            }
        } catch (IOException e) {
            // Best effort, as before: report and continue with what was read.
            e.printStackTrace();
        }

        gramDict = new TreeMap<>();
        wordDict = new TreeMap<>();

        for (String languageCode : languageList) {
            loadModel(usedmonos, languageCode, "LowGramModel1");
            loadModel(usedbis, languageCode, "LowGramModel2");
            loadModel(usedtris, languageCode, "LowGramModel3");
            loadModel(usedquads, languageCode, "LowGramModel4");
            loadModel(usedcinqs, languageCode, "LowGramModel5");
            loadModel(usedsexts, languageCode, "LowGramModel6");
            loadModel(usedwords, languageCode, "LowWordModel");
        }

        writegramToFile();
        writewordToFile();
    }

    /**
     * Loads one model resource and merges its scores into {@code gramDict}, or
     * into {@code wordDict} when {@code modelType} is {@code "LowWordModel"}.
     *
     * <p>File format as parsed here: the first line is a number (the total used
     * as the denominator of the relative frequency); every following line is
     * {@code feature<TAB>count}. Reading stops at the first line whose relative
     * frequency is not greater than {@code usedFeatureRF}
     * (presumably the file is sorted by descending count - verify against the
     * model generator).
     *
     * <p>A missing or empty resource is reported on standard error and skipped.
     *
     * @param usedFeatureRF relative-frequency cut-off for keeping a feature
     * @param languageCode  language code used in the resource name and as the
     *                      inner map key
     * @param modelType     resource suffix, e.g. {@code "LowGramModel3"}
     */
    private static void loadModel(float usedFeatureRF, String languageCode, String modelType) {
        String resourceName = "LanguageModels/" + languageCode + "." + modelType;
        InputStream modelFile = ConvertModels.class.getResourceAsStream(resourceName);
        if (modelFile == null) {
            System.err.println("model resource not found, skipping: " + resourceName);
            return;
        }

        TreeMap<String, Float> tempDict = new TreeMap<>();

        // Arithmetic is intentionally kept in float so that the generated
        // values are identical to those produced by earlier versions.
        float totalFeatureNumber;
        float langamount = 0;

        // NOTE(review): assumes the resource files are UTF-8 encoded - confirm.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(modelFile, java.nio.charset.StandardCharsets.UTF_8))) {
            String text = reader.readLine();
            if (text == null) {
                // Float.parseFloat(null) would throw a NullPointerException.
                System.err.println("model resource is empty, skipping: " + resourceName);
                return;
            }
            totalFeatureNumber = Float.parseFloat(text);

            while ((text = reader.readLine()) != null) {
                String[] line = text.split("\t");
                String gram = line[0];
                long amount = Long.parseLong(line[1]);

                if (amount / totalFeatureNumber > usedFeatureRF) {
                    tempDict.put(gram, (float) amount);
                    langamount = langamount + (float) amount;
                } else {
                    // First feature at or below the cut-off ends the read.
                    break;
                }
            }
        } catch (IOException e) {
            // Best effort, as before: report and use whatever was read so far.
            e.printStackTrace();
        }

        boolean isWordModel = modelType.equals("LowWordModel");
        TreeMap<String, TreeMap<String, Float>> target = isWordModel ? wordDict : gramDict;

        for (Map.Entry<String, Float> entry : tempDict.entrySet()) {
            Float probability = (float) -Math.log10(entry.getValue() / langamount);
            // Words are stored with one leading and one trailing space.
            String key = isWordModel ? " " + entry.getKey() + " " : entry.getKey();

            TreeMap<String, Float> kiepro = target.get(key);
            if (kiepro == null) {
                kiepro = new TreeMap<>();
                target.put(key, kiepro);
            }
            kiepro.put(languageCode, probability);
        }
    }

    /**
     * Serializes {@code gramDict} to {@code gramdict.ser} in the working directory.
     *
     * @throws IOException declared for backward compatibility
     */
    private static void writegramToFile() throws FileNotFoundException, IOException {
        writeDictToFile("gramdict.ser", gramDict);
    }

    /**
     * Serializes {@code wordDict} to {@code worddict.ser} in the working directory.
     *
     * @throws IOException declared for backward compatibility
     */
    private static void writewordToFile() throws FileNotFoundException, IOException {
        writeDictToFile("worddict.ser", wordDict);
    }

    /**
     * Writes {@code dict} to {@code fileName} using Java object serialization.
     * Failures are reported on standard output and do not abort the program.
     */
    private static void writeDictToFile(String fileName, TreeMap<String, TreeMap<String, Float>> dict)
            throws IOException {
        // try-with-resources closes every stream that was actually opened, so a
        // failed open no longer ends in a NullPointerException while closing.
        try (OutputStream file = new FileOutputStream(fileName);
             OutputStream buffer = new BufferedOutputStream(file);
             ObjectOutputStream output = new ObjectOutputStream(buffer)) {
            output.writeObject(dict);
        } catch (IOException e) {
            System.out.println("cannot write to file:   "+e);
        }
    }
}
