#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Import the classla library and prepare the NLP pipeline for the 'bg' language code.
import classla as cl
# Requests the 'bg' resources; as a top-level statement this runs on every execution
# of the module. NOTE(review): presumably fetches the Bulgarian models - confirm
# against the classla docs whether an existing download is reused.
cl.download('bg')
# Shared pipeline object; the example at the bottom of the file calls it on raw text.
proc = cl.Pipeline('bg')
import matplotlib.pyplot as plt


# In[2]:


# Import other libraries needed
import queue
import pandas as pd


# classla version - 1.1.0
# python version - 3.11.4
# pandas version - 1.5.3

# In[3]:


#DETECTS WHETHER A SIMPLE SENTENCE CONTAINS A RENARRATIVE FORM

def is_past_part(word_dict):
    """Tell whether a token dictionary describes a past participle.

    A token qualifies when its ``upos`` is ``VERB`` and its ``feats`` string
    contains ``VerbForm=Part`` together with ``Tense=Past`` or ``Tense=Imp``.
    Tokens lacking the ``upos`` or ``feats`` key never qualify.

    :param word_dict: token dictionary with optional ``upos`` / ``feats`` keys
    :return: True for a past participle, False otherwise
    """
    if 'upos' not in word_dict or word_dict['upos'] != 'VERB':
        return False
    if 'feats' not in word_dict:
        return False
    feats = word_dict['feats']
    has_past_tense = 'Tense=Past' in feats or 'Tense=Imp' in feats
    return has_past_tense and 'VerbForm=Part' in feats

def past_part_type(word_dict):
    """Classify a past participle by the aspect recorded in its features.

    :param word_dict: token dictionary (see ``is_past_part`` for required keys)
    :return: 0 when the token is not a past participle or carries neither
             aspect marker, 1 for ``Aspect=Imp``, 2 for ``Aspect=Perf``
    """
    if not is_past_part(word_dict):
        return 0
    feats = word_dict['feats']
    # Checked in this order, so Aspect=Imp wins if both markers are present.
    aspect_codes = (
        ('Aspect=Imp', 1),   # original author's example: ходил
        ('Aspect=Perf', 2),  # original author's example: ходел
    )
    for marker, code in aspect_codes:
        if marker in feats:
            return code
    return 0

def be_type(word_dict):
    """Detect a form of the auxiliary verb 'съм' and classify it.

    Participle forms recognised by ``is_bil`` are excluded up front and
    yield 0, as does any token that is not an ``AUX`` with lemma 'съм'.

    :param word_dict: token dictionary; ``text`` is read once the token is
                      known to be the auxiliary
    :return: 0 - not the auxiliary;
             2 - surface form is in the list the original author labelled е/са;
             1 - any other form (labelled съм/си/сме/сте by the original author)
    """
    if is_bil(word_dict):
        return 0
    tagged_as_aux = 'upos' in word_dict and word_dict['upos'] == 'AUX'
    lemma_is_sam = 'lemma' in word_dict and word_dict['lemma'] == 'съм'
    if not (tagged_as_aux and lemma_is_sam):
        return 0
    third_person_forms = ['е', 'са', 'бе', 'би', 'беше', 'бяха', 'биха']
    surface = word_dict['text'].lower()
    return 2 if surface in third_person_forms else 1

def is_bil(word_dict):
    """Detect a participle form of the auxiliary 'бил'.

    A token matches when it is tagged ``AUX`` or ``VERB``, its ``feats``
    contain ``VerbForm=Part``, and its surface text is at most four
    characters long and contains 'бил'.

    The surface text is lower-cased before matching so that capitalised
    tokens (for example a sentence-initial "Бил") are recognised; this is
    consistent with ``be_type``, which also lower-cases the text.

    :param word_dict: token dictionary; ``text`` is read only after the
                      ``upos`` and ``feats`` conditions hold
    :return: True when the token is such a form, False otherwise
    """
    if 'upos' not in word_dict or word_dict['upos'] not in ('AUX', 'VERB'):
        return False
    if 'feats' not in word_dict or 'VerbForm=Part' not in word_dict['feats']:
        return False
    text = word_dict['text'].lower()
    # The length cap keeps longer words that merely contain 'бил' from matching.
    return 'бил' in text and len(text) <= 4

def _combine_type_codes(seen, found):
    """Fold one token's type code into the running code for the sentence.

    0 means nothing seen, 1/2 a single type, 3 that both types occurred.
    """
    if found <= 0 or seen == 3:
        return seen
    if seen > 0 and seen != found:
        return 3  # both types are present
    return found


def proc_sentence(sent_dict):
    """Compute the three markers (a, b, c) for a sequence of tokens.

    a (pp)  - past-participle type seen: 0 if none, 1 or 2 for a single
              type, 3 if both types occur
    b (be)  - type of the auxiliary 'съм' seen: 0 if none, 1 or 2 for a
              single type, 3 if both types occur
    c (bil) - whether any token is a form of the auxiliary 'бил'

    :param sent_dict: iterable of token dictionaries
    :return: tuple ``(pp, be, bil)``
    """
    pp_code = 0
    be_code = 0
    bil_seen = False
    for token in sent_dict:
        pp_code = _combine_type_codes(pp_code, past_part_type(token))
        be_code = _combine_type_codes(be_code, be_type(token))
        if is_bil(token):
            bil_seen = True
    return (pp_code, be_code, bil_seen)

def simp_sent_detect(a,b,c):
    """Decide from the a, b and c markers whether a renarrative form is present.

    The answer is True exactly when a participle was found (``a > 0``) or a
    'бил' form was found (``c``), and no auxiliary 'съм' was found (``b``
    is not positive).

    :param a: past-participle marker (int)
    :param b: auxiliary 'съм' marker (int)
    :param c: 'бил' marker (bool)
    :return: True or False
    """
    has_candidate = a > 0 or c == True
    if not has_candidate:
        return False
    # Any detected auxiliary rules the structure out.
    return not b > 0


# In[4]:


class Vertex:
    """Tree node wrapping one payload value (``cm``) with parent/child links."""

    def __init__(self, cm):
        self.cm = cm          # payload carried by this node
        self.parent = None    # set later through set_parent
        self.children = []    # filled through add_children

    def get_cm(self):
        """Return the payload stored in this node."""
        return self.cm

    def get_parent(self):
        """Return the parent node, or the string "none" when there is none."""
        return "none" if self.parent is None else self.parent

    def get_children(self):
        """Return the list of child nodes (the live list, not a copy)."""
        return self.children

    def set_parent(self, parent):
        """Record ``parent`` as this node's parent."""
        self.parent = parent

    def add_children(self, vert):
        """Append a single node ``vert`` to this node's children."""
        self.children.append(vert)

    def has_children(self):
        """Return True when at least one child has been added."""
        return bool(self.children)


# In[5]:


#Partitions a sentence

def build_tree(doc):
    """Build a tree of ``Vertex`` nodes from the tokens' ``head`` values.

    Each token's ``head`` is converted to int and treated as the 1-based
    position of its parent within ``doc``; a value below 1 marks the root.

    :param doc: sequence of token dictionaries, each with a ``head`` entry
    :return: the root node (the last one found if several tokens qualify),
             or None when no token qualifies
    """
    nodes = [Vertex(token) for token in doc]
    root = None
    for node in nodes:
        parent_pos = int(node.get_cm()['head']) - 1
        if parent_pos >= 0:
            parent = nodes[parent_pos]
            node.set_parent(parent)
            parent.add_children(node)
        else:
            root = node
    return root

def get_structs(root):
    """Collect one local structure per participle-like node in the tree.

    The tree is walked breadth-first. For every node that is a past
    participle or a 'бил' form, the structure consists of:
    the parent's children (which include the node itself), then the parent,
    then the node's own children. A node without a parent contributes
    itself followed by its children.

    :param root: root ``Vertex`` of the tree
    :return: list of structures, each a list of ``Vertex`` nodes
    """
    pending = queue.Queue()
    pending.put(root)
    collected = []
    while not pending.empty():
        node = pending.get()
        own_children = node.get_children()
        for child in own_children:
            pending.put(child)

        payload = node.get_cm()
        if not (is_past_part(payload) or is_bil(payload)):
            continue

        parent = node.get_parent()
        if parent != "none":
            group = list(parent.get_children())
            group.append(parent)
        else:
            group = [node]
        group.extend(own_children)
        collected.append(group)
    return collected

def get_sent_iter(struct):
    """Convert a structure of nodes into the list of their payloads.

    :param struct: iterable of objects exposing ``get_cm()``
    :return: list with the ``get_cm()`` result of each node, in order
    """
    payloads = []
    for node in struct:
        payloads.append(node.get_cm())
    return payloads


# In[6]:


def detect_ren(doc):
    """Count the renarrative forms found in a parsed document.

    Sentences with at most one token are evaluated directly as a simple
    sentence. Longer sentences are turned into a tree, broken into local
    structures, and each structure is evaluated as a simple sentence.

    DISCLAIMER: the count is not accurate when more than one renarrative
    form exists.

    :param doc: iterable of sentences; the token list is read from index 0
                of each sentence (presumably the shape produced by the
                pipeline's ``to_dict()`` - confirm)
    :return: number of structures judged to contain a renarrative form
    """
    total = 0
    for sent in doc:
        tokens = sent[0]
        if len(tokens) <= 1:
            candidates = [tokens]
        else:
            tree_root = build_tree(tokens)
            candidates = [get_sent_iter(group) for group in get_structs(tree_root)]
        for candidate in candidates:
            if simp_sent_detect(*proc_sentence(candidate)):
                total += 1
    return total

def detect_ren_boolean(doc):
    """Report whether the document contains at least one renarrative form.

    :param doc: parsed document in the form accepted by ``detect_ren``
    :return: True when ``detect_ren`` counts one or more forms
    """
    found = detect_ren(doc)
    return found > 0


# In[7]:


# Preprocessing - urls, mentions, hashtags, RT and emojis removal
# RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE) #legacy
import re
# Built once at import time instead of on every remove_emoji() call.
# NOTE: the ranges overlap heavily. In particular U+24C2-U+1F251 spans most of
# the Basic Multilingual Plane above U+24C2, so CJK and other scripts in that
# span are stripped as well. Cyrillic (U+0400-U+04FF) is below it and is kept.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # wide catch-all range (see NOTE above)
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``string`` with every run of emoji/symbol characters removed.

    Matching runs are deleted outright (replaced by the empty string), so
    the text on either side of an emoji is joined without a separator.

    :param string: text to clean
    :return: the text without characters matched by ``_EMOJI_PATTERN``
    """
    return _EMOJI_PATTERN.sub(r'', string)


def text_preproc(x):
    """Strip URLs, retweet markers, mentions, hashtags and emoji from text.

    Every regex match is replaced by a single space; emoji are then removed
    via ``remove_emoji``. The cleaned text is printed to stdout before being
    returned.

    :param x: raw text
    :return: the cleaned text
    """
    # Applied in this order: "RT @user" has to go before bare "@user",
    # otherwise the "RT" prefix would be left behind.
    noise_patterns = (
        r'https*\S+',
        r'RT @\S+',
        r'@\S+',
        r'#\S+',
    )
    cleaned = x
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)
    cleaned = remove_emoji(cleaned)
    print(cleaned)
    return cleaned
  


# In[8]:


# EXAMPLE
# Sample Bulgarian sentence ("He has taken great photos during his trip.").
TEXT = "Той е направил страхотни снимки по време на пътуването си."
# Clean the text; text_preproc also prints the cleaned string.
TEXT = text_preproc(TEXT)
# Run the pipeline and check for a renarrative form. This is a bare expression:
# its value is not stored or printed here, so it is only visible where the
# environment echoes the last expression (e.g. a notebook cell).
detect_ren_boolean(proc(TEXT).to_dict())

