# This R script reads the report files produced by ud-worder.py and writes the output-treebanks-full.csv summary file.
library("entropy")
library("plyr")
library("tidyverse")

# Dependency relations accepted for each modifier class.
# Articles and demonstratives share the two plain determiner relations; the
# "E" (extended) sets are the ones combined with lemma filters further down.
articles <- demonstratives <- list("det", "det:predet")
articlesE <- demonstrativesE <- list("det", "det:predet")
quantifiers <- list("det", "det:predet", "det:nummod", "det:numgov")
# Extended quantifier relations: the basic set plus adjectival/nominal modifiers.
quantifiersE <- c(quantifiers, list("amod", "nmod"))

# UPOS tags accepted for the modifier token of each class.
articleP <- articlePE <- list("DET")
demonstrativeP <- demonstrativePE <- list("DET", "PRON")
quantifierP <- list("DET", "PRON")
# Extended quantifier tags: the basic set plus adjectives and nouns.
quantifierPE <- c(quantifierP, list("ADJ", "NOUN"))


# Per-treebank lemma inventories. Each is a named list keyed by the language
# label that the main loop derives from the report file name
# ("report-<label>.<ext>" -> "<label>").
demonstrative <- list()
quantifier <- list()

# Article lemmas. NA marks a treebank for which no article lemmas are listed;
# the article measures are then reported as zero counts.
article <- list(
  "UD_Croatian-SET" = NA,
  "UD_Serbian-SET" = NA,
  "UD_Bulgarian-BTB" = NA,
  "UD_Czech-PDT" = NA,
  "UD_Danish-DDT" = list("en", "et"),
  "UD_Dutch-Alpino" = list("de", "het", "een"),
  "UD_English-EWT" = list("the", "a"),
  "UD_French-GSD" = list("le", "un"),
  "UD_German-GSD" = list("der", "ein"),
  "UD_Greek-GDT" = list("ο", "ένας"),
  "UD_Irish-IDT" = list("an"),
  # Registered under the "UD_"-prefixed label used by every other treebank;
  # the historical un-prefixed key is kept so existing report names still work.
  "UD_Lithuanian-ALKSNIS" = NA,
  "Lithuanian-ALKSNIS" = NA,
  # "Italian" = list("il", "uno"),
  "UD_Polish-PDB" = NA,
  "UD_Portuguese-Bosque" = list("o", "um", "O", "Um", "UM"),
  "UD_Romanian-RRT" = list("un"),
  "UD_Russian-SynTagRus" = NA,
  "UD_Spanish-AnCora" = list("el", "uno"),
  # The capitalised variant is a Latin "Y"; a Greek capital upsilon looks the
  # same on screen but never matches a Welsh lemma.
  "UD_Welsh-CCG" = list("y", "Y")
)

# Demonstrative lemmas per treebank, keyed by the language label derived from
# the report file name. Several inventories list inflected and capitalised
# forms next to the citation form.
demonstrative <- list(
  "UD_Croatian-SET" = list("ovaj", "taj", "onaj"),
  "UD_Serbian-SET" = list("ovaj", "taj", "onaj"),
  "UD_Bulgarian-BTB" = list("този"),
  "UD_Czech-PDT" = list("ten", "onen", "tenhle", "takový"), # Naughton 2005:82-88
  "UD_Danish-DDT" = list("denne", "dette", "disse", "den", "det", "de"),
  "UD_Dutch-Alpino" = list("die", "dit", "dat", "deze"),
  "UD_English-EWT" = list("this", "that"),
  "UD_French-GSD" = list("ce"),
  "UD_German-GSD" = list("dies", "jen", "solch", "derjenige"),
  "UD_Greek-GDT" = list(
    "αυτός", "εκείνος", "τούτος", "αυτά", "αυτές", "αυτή", "αυτήν", "αυτής",
    "αυτό", "αυτοί", "αυτόν", "αυτός", "αυτού", "αυτούς", "αυτών", "εκείνα",
    "εκείνες", "εκείνη", "εκείνο", "εκείνοι", "εκείνον", "εκείνος", "εκείνου",
    "εκείνους", "εκείνων", "τούτη", "τούτο", "τούτον", "τούτους",
    "Αυτός", "Εκείνος", "Τούτος", "Αυτά", "Αυτές", "Αυτή", "Αυτήν", "Αυτής",
    "Αυτό", "Αυτοί", "Αυτόν", "Αυτός", "Αυτού", "Αυτούς", "Αυτών", "Εκείνα",
    "Εκείνες", "Εκείνη", "Εκείνο", "Εκείνοι", "Εκείνον", "Εκείνος", "Εκείνου",
    "Εκείνους", "Εκείνων", "Τούτη", "Τούτο", "Τούτον", "Τούτους"
  ),
  "Lithuanian-ALKSNIS" = list("šis", "tas", "anas"),
  "UD_Irish-IDT" = list("seo", "sin"),
  # "Italian" = list("questo", "quello", "codesto"),
  "UD_Polish-PDB" = list("ów", "ten"),
  "UD_Portuguese-Bosque" = list(
    "este", "esse", "aquele", "esta", "essa", "aquela",
    "estes", "esses", "aqueles", "estas", "essas", "aquelas",
    "Este", "Esse", "Aquele", "Esta", "Essa", "Aquela",
    "Estes", "Esses", "Aqueles", "Estas", "Essas", "Aquelas"
  ),
  "UD_Romanian-RRT" = list("acest", "acesta", "acel", "acela"), # Giurgea 2013:160-163
  "UD_Russian-SynTagRus" = list("этот", "тот"), # Wade 2010
  "UD_Spanish-AnCora" = list("este", "ese", "aquel"),
  # Capitalised variants start with a Latin "H"; a Greek capital eta looks the
  # same on screen but never matches a Welsh lemma.
  "UD_Welsh-CCG" = list(
    "hwn", "hwnnw", "hon", "honno", "hyn", "hynny",
    "Hwn", "Hwnnw", "Hon", "Honno", "Hyn", "Hynny"
  )
)

# Every other treebank is keyed with the "UD_" prefix, so expose the Lithuanian
# inventory under that label too; the un-prefixed key is kept for existing
# report names.
demonstrative[["UD_Lithuanian-ALKSNIS"]] <- demonstrative[["Lithuanian-ALKSNIS"]]

# Quantifier lemmas per treebank, keyed by the language label derived from the
# report file name. Entries are compared verbatim against the report's lemma
# columns, so they must not carry surrounding whitespace.
quantifier <- list(
  "UD_Croatian-SET" = list(
    "neki", "nekakav", "ništa", "nikakav", "sve", "mnogo", "nijedan", "dosta",
    "nekoliko", "koliko", "koji", "puno", "previše", "dovoljno", "malo",
    "brojni", "različit", "razni", "svaki", "oboje", "pola", "polovina",
    "manje", "više"
  ),
  "UD_Serbian-SET" = list(
    "neki", "nekakav", "ništa", "nikakav", "sve", "mnogo", "nijedan", "dosta",
    "nekoliko", "koliko", "koji", "puno", "previše", "dovoljno", "malo",
    "brojni", "različit", "razni", "svaki", "oboje", "pola", "polovina",
    "manje", "više"
  ),
  "UD_Bulgarian-BTB" = list(
    "всяко", "всички", "няколко", "някакъв", "всеки", "половина", "много",
    "някой", "малко", "повече", "толкова", "никой", "никакъв", "николко"
  ),
  "UD_Czech-PDT" = list(
    "nějaký", "všechen", "veškerý", "celý", "každý", "žádný", "mnoho", "hodně",
    "málo", "několik", "tolik", "mnoho", "pár", "moc", "trocha", "trochu",
    "co", "něco"
  ),
  "UD_Danish-DDT" = list(
    "al", "alting", "alt", "alle", "begge", "enhver", "hver", "ingen", "megen",
    "meget", "nogen", "lidt", "få", "mange", "nogen", "noget", "nogle"
  ),
  "UD_Dutch-Alpino" = list(
    "alles", "allen", "al", "allemaal", "allebeide", "beide", "iedereen",
    "iemand", "niemand", "veel"
  ), # Shetter and Ham 2007
  "UD_English-EWT" = list(
    "some", "several", "no", "many", "few", "enough", "all", "every", "each",
    "most", "half", "none", "neither", "more", "less", "different", "much",
    "both"
  ), # Keenan 2012
  "UD_French-GSD" = list(
    "assez", "beaucoup", "bien", "chacun", "chaque", "moins", "peu", "pas",
    "plus", "quelque", "plusieurs", "trop", "tout", "combien"
  ), # Batchelor and M. Chebli-Saadi 2011, own knowledge
  "UD_German-GSD" = list(
    "manch", "einig", "kein", "wenig", "viel", "welch", "niemand", "nichts",
    "nirgendwo", "jeder", "jeglich", "alle", "sämtlich", "irgendein",
    "irgendwelch", "mehrere", "meist", "keine", "jeweils", "beide"
  ), # Kobele-Zimmermann 2012
  "UD_Greek-GDT" = list(
    "όλος", "ολόκληρος", "πολύς", "περισσότερος", "αρκετός", "κάμποσος",
    "μπόλικος", "μερικοί", "λίγος", "ελάχιστος", "υπόλοιπος", "σωρό", "ποσός",
    "κάποιος", "κάτι", "κάθε", "καθένας", "κανένας", "τίποτα", "αμφότερα"
  ), # Holton et al. 2004, Giannikidou 2012, own knowledge
  "UD_Irish-IDT" = list(
    "gach", "gach_uile", "neart", "mórán", "roinnt", "cúpla", "dóthain",
    "beagán", "tuilleadh", "cuid", "éigin", "uile", "eile", "céanna"
  ),
  # "Italian" = list("qualcuno", "diverso", "nessuno", "molto", "poco", "abbastanza", "tutto", "ciascuno", "metà", "mezzo", "meno", "più", "qualche", "alcuno", "quanto", "tanto", "troppo", "parecchio", "ogni", "entrambi", "ambo", "ambedue"), # Own knowledge, Crisma 2012
  "Lithuanian-ALKSNIS" = list(
    "kuris", "visi", "vienas", "nėra", "kas", "niekas", "viskas", "kažkas",
    "daug", "pakankamai", "mažai", "skirtingas", "įvairūs", "kiekvienas",
    "abu", "pusė", "joks", "mažiau"
  ),
  "UD_Polish-PDB" = list(
    "dużo", "więcej", "najwięcej", "wiele", "więcej", "najwięcej", "mało",
    "mniej", "najmniej", "niedużo", "niewiele", "dużo", "wiele", "mało",
    "parę", "sporo", "ciszy", "mnóstwo", "tłum", "gromada", "grupa",
    "większość", "reszta", "resztka", "trochę", "odrobina", "dość", "dosyć",
    "nic", "coś", "jeden", "kilka", "para"
  ), # Swan 2002
  "UD_Portuguese-Bosque" = list(
    "quanto", "quanta", "quantos", "quantas", "todo", "toda", "todos", "todas",
    "algo", "alguém", "cada", "vários", "várias", "bastante", "bastantes",
    "muito", "muita", "muitos", "muitas", "tudo", "pouco", "pouca", "poucos",
    "poucas", "nenhum", "nenhuma", "nenhuns", "nenhumas", "ninguém", "nada"
  ), # Whitlam 2011
  "UD_Romanian-RRT" = list(
    "mult", "câtva", "destul", "oricare", "atât", "cât", "nici_un", "vreun",
    "tot", "fiecare", "ceva", "oricât"
  ),
  "UD_Russian-SynTagRus" = list(
    "весь", "целый", "много", "мало", "более", "все", "всякий", "каждый",
    "всякий", "большинство", "меньшинство", "менее", "разный", "некоторый",
    "несколько", "сколько", "никто", "ничто", "немного", "немногие",
    "многочисленный", "достаточно", "половина", "оба"
  ), # Wade 2010, Paperno 2012
  "UD_Spanish-AnCora" = list(
    "alguien", "alguno", "ninguno", "cuanto", "poco", "mucho", "bastante",
    "suficiente", "vario", "demasiado", "todo", "cualquiera", "ambos", "cada",
    "demás"
  ), # Butt et al. 2019
  "UD_Welsh-CCG" = list(
    "faint", "sawl", "digon", "gormod", "llawer", "peth", "rhagor", "tamaid",
    "tipyn", "ychydig"
  ) # King 2003
)

# Every other treebank is keyed with the "UD_" prefix, so expose the Lithuanian
# inventory under that label too; the un-prefixed key is kept for existing
# report names.
quantifier[["UD_Lithuanian-ALKSNIS"]] <- quantifier[["Lithuanian-ALKSNIS"]]

# ---- Word-order statistics per report file ----------------------------------
#
# For every report file in `input_dir` this section counts, for eleven
# measures, how often the modifier precedes its head ("modx") and how often it
# follows it ("xmod"), derives the total frequency and the log2 entropy of the
# two counts, and writes one row per file to `output_file`.

input_dir <- "/Users/utw/Desktop/linguistics/saarbruecken/parallelcorpusbuilding/output/ltac/treebanks/"
output_file <- "/Users/utw/Desktop/linguistics/saarbruecken/papers/LTaC/data/output-treebanks-full.csv"

# Head categories accepted whenever a measure restricts the head to nominals.
nominal_upos <- c("NOUN", "PROPN")

# Order in which the measures appear as column groups in the output file
# (this differs from the order in which they are computed and logged).
measure_order <- c(
  "determ_raw", "quant_raw",
  "art_pos", "dem_pos", "quant_pos",
  "art_lemma", "dem_lemma", "quant_lemma",
  "art_synlemma", "dem_synlemma", "quant_synlemma"
)

# Return the lemmas registered for `language` in `inventory` as a character
# vector. A missing key and an NA placeholder both give character(0), so a
# language without an inventory can never match a lemma that read.csv()
# parsed as NA.
lemma_set <- function(inventory, language) {
  lemmas <- as.character(unlist(inventory[[language]]))
  lemmas[!is.na(lemmas)]
}

# Count the report rows matching a measure, once per linear order.
#
# report       data frame read from a report file; uses the columns
#              relfirst/relsecond, uposfirst/upossecond, lemmafirst/lemmasecond.
# rels         dependency relations the modifier token may bear.
# nominal_head if TRUE, the other token of the pair must be NOUN or PROPN.
# mod_upos     UPOS tags allowed for the modifier; NULL applies no filter.
# mod_lemmas   lemmas allowed for the modifier; NULL applies no filter, while
#              character(0) matches nothing.
# enabled      if FALSE both counts are 0 without inspecting the report.
#
# Returns an integer vector c(modx = , xmod = ). Rows with missing values in
# a filtered column are not counted. Both orders always get the same filters.
count_orders <- function(report, rels, nominal_head = FALSE, mod_upos = NULL,
                         mod_lemmas = NULL, enabled = TRUE) {
  count_one <- function(rel, mod_pos, mod_lemma, head_pos) {
    if (!enabled) {
      return(0L)
    }
    keep <- rel %in% rels
    if (nominal_head) {
      keep <- keep & head_pos %in% nominal_upos
    }
    if (!is.null(mod_upos)) {
      keep <- keep & mod_pos %in% mod_upos
    }
    if (!is.null(mod_lemmas)) {
      keep <- keep & mod_lemma %in% mod_lemmas
    }
    sum(keep)
  }

  c(
    # modifier is the first token of the pair, head the second
    modx = count_one(
      report[["relfirst"]], report[["uposfirst"]],
      report[["lemmafirst"]], report[["upossecond"]]
    ),
    # modifier is the second token of the pair, head the first
    xmod = count_one(
      report[["relsecond"]], report[["upossecond"]],
      report[["lemmasecond"]], report[["uposfirst"]]
    )
  )
}

# Turn the two counts of one measure into its four named output columns:
# <label>_modx, <label>_xmod, <label>_frequency and <label>_entropy.
measure_columns <- function(label, counts) {
  modx <- counts[["modx"]]
  xmod <- counts[["xmod"]]
  cols <- list(
    modx,
    xmod,
    modx + xmod,
    entropy(c(xmod, modx), method = "ML", unit = "log2")
  )
  names(cols) <- paste0(label, c("_modx", "_xmod", "_frequency", "_entropy"))
  cols
}

# Read one report file and return its one-row data frame of statistics.
summarise_report <- function(filename) {
  report <- read.csv(filename, sep = ",", quote = "\"")
  print(filename)

  # "report-<label>.<ext>" -> "<label>"
  language <- str_split(str_remove(basename(filename), "report-"), "[.]")[[1]][1]

  # Sum the distinct values of the `size` column, skipping literal "size"
  # entries (presumably repeated header lines in concatenated reports --
  # TODO confirm).
  size <- unique(report$size)
  no_sent <- sum(as.integer(size[!size %in% "size"]))

  art_lemmas <- lemma_set(article, language)
  dem_lemmas <- lemma_set(demonstrative, language)
  quant_lemmas <- lemma_set(quantifier, language)
  # Article measures are only counted when article lemmas are listed.
  has_articles <- length(art_lemmas) > 0

  # One entry per measure, in the order they are computed and logged.
  specs <- list(
    # relation only
    determ_raw = list(rels = demonstratives),
    # relation + POS
    dem_pos = list(
      rels = demonstratives, nominal_head = TRUE, mod_upos = demonstrativeP
    ),
    art_pos = list(
      rels = articles, nominal_head = TRUE, mod_upos = "DET",
      enabled = has_articles
    ),
    # extended relation + POS + lemma
    dem_lemma = list(
      rels = demonstrativesE, nominal_head = TRUE,
      mod_upos = demonstrativePE, mod_lemmas = dem_lemmas
    ),
    art_lemma = list(
      rels = articlesE, nominal_head = TRUE,
      mod_upos = "DET", mod_lemmas = art_lemmas
    ),
    # extended relation + lemma (no POS filter on the modifier); the nominal
    # head filter is applied to both orders so the two counts are comparable
    dem_synlemma = list(
      rels = demonstrativesE, nominal_head = TRUE, mod_lemmas = dem_lemmas
    ),
    art_synlemma = list(
      rels = articlesE, nominal_head = TRUE, mod_lemmas = art_lemmas
    ),
    # quantifiers
    quant_raw = list(rels = quantifiers),
    quant_pos = list(
      rels = quantifiers, nominal_head = TRUE, mod_upos = quantifierP
    ),
    quant_lemma = list(
      rels = quantifiersE, nominal_head = TRUE,
      mod_upos = quantifierPE, mod_lemmas = quant_lemmas
    ),
    # NOTE(review): identical to quant_lemma (the modifier POS filter is kept),
    # unlike dem_synlemma/art_synlemma -- confirm this is intended.
    quant_synlemma = list(
      rels = quantifiersE, nominal_head = TRUE,
      mod_upos = quantifierPE, mod_lemmas = quant_lemmas
    )
  )

  counts <- lapply(names(specs), function(label) {
    print(label)
    do.call(count_orders, c(list(report = report), specs[[label]]))
  })
  names(counts) <- names(specs)

  columns <- c(
    list(language = language, source = basename(filename), no_sent = no_sent),
    do.call(c, lapply(measure_order, function(label) {
      measure_columns(label, counts[[label]])
    }))
  )
  do.call(data.frame, columns)
}

setwd(input_dir)

# Collect one row per report and bind them once, instead of growing a table
# inside the loop.
rows <- lapply(list.files(), summarise_report)
if (length(rows) == 0) {
  stop("No report files found in ", input_dir, call. = FALSE)
}
df <- do.call(rbind, rows)

write.csv(x = df, file = output_file, row.names = FALSE)
