import sqlite3
import string

import pandas as pd
from spiral import ronin
from textstat import textstat
from tqdm.auto import tqdm
from nltk.tokenize import word_tokenize

# SQLite file that is read here and written back to at the end of the script.
database_file = "C:\\Users\\sheha\\Downloads\\_Output_AssertData.sqlite"
db_connection = sqlite3.connect(database_file)

# One row per annotation, joined to its assert_method record on the method id.
annotated_asserts_query = "select AssertMethodId,AssertMessage,Category,SubCategory from annotation inner join assert_method on assert_method.Id = annotation.AssertMethodId"
input_data = pd.read_sql_query(annotated_asserts_query, db_connection)

# Output columns, in the order they are added to the frame. Every one starts
# out as None and is filled in per row by the processing loop below.
derived_columns = (
    # Cleaned message text.
    'message',
    # Readability scores.
    'score_flesch_reading_ease',
    'score_flesch_kincaid_grade',
    'score_gunning_fog',
    'score_automated_readability_index',
    'score_coleman_liau_index',
    'score_linsear_write_formula',
    'score_dale_chall_readability_score',
    'score_text_standard',
    'score_spache_readability',
    'score_mcalpine_eflaw',
    # Token lists and leading-token summaries.
    'tokens',
    'tokens_count',
    'tokens_text',
    'tokens_text_count',
    'first_token',
    'first2_token',
    'first3_token',
    # Raw text counts.
    'syllable_count',
    'lexicon_count',
    'sentence_count',
    'char_count',
    'letter_count',
    'polysyllabcount',
    'monosyllabcount',
)
for column_name in derived_columns:
    input_data[column_name] = None

def _is_float_literal(token):
    """Return True when *token* looks like a numeric literal (0.3, -0.5, 0.4f).

    Leading '-' / '+' signs and trailing f/F/D/d suffix characters are
    stripped, then one '.', one 'e-' and one 'e' are removed; whatever is
    left must consist of digits only.
    """
    unsigned = token.lstrip('-').lstrip('+')
    unsuffixed = unsigned.rstrip('f').rstrip('F').rstrip('D').rstrip('d')
    digits = unsuffixed.replace('.', '', 1).replace('e-', '', 1).replace('e', '', 1)
    return digits.isdigit()


def _split_token(token):
    """Break one raw token into its sub-tokens.

    A token without a '.' is handed straight to ``ronin.split``. A dotted
    token that passes ``_is_float_literal`` is kept whole. Any other dotted
    token (e.g. employee.getName()) is cut at every '.' and each piece is
    handed to ``ronin.split`` (e.g. getName is split into 'get' & 'name').
    """
    if "." not in token:
        return ronin.split(token)
    if _is_float_literal(token):
        return [token]
    pieces = []
    for term in token.split("."):
        pieces.extend(ronin.split(term))
    return pieces


def _is_punctuation(token):
    """Return True when every character of *token* is in ``string.punctuation``.

    A plain ``token in string.punctuation`` is a substring test: it only
    matches multi-character tokens that happen to be contiguous in that
    constant ('<=' matches, '==' and '...' do not). Checking character by
    character treats all punctuation-only tokens the same way. An empty
    token still counts as punctuation, as it did with the substring test.
    """
    return all(char in string.punctuation for char in token)


# textstat metrics that take only the text, keyed by the column they fill.
# Looked up once here instead of on every row.
_TEXTSTAT_COLUMNS = {
    'score_flesch_reading_ease': textstat.flesch_reading_ease,
    'score_flesch_kincaid_grade': textstat.flesch_kincaid_grade,
    'score_gunning_fog': textstat.gunning_fog,
    'score_automated_readability_index': textstat.automated_readability_index,
    'score_coleman_liau_index': textstat.coleman_liau_index,
    'score_linsear_write_formula': textstat.linsear_write_formula,
    'score_dale_chall_readability_score': textstat.dale_chall_readability_score,
    'score_spache_readability': textstat.spache_readability,
    'score_mcalpine_eflaw': textstat.mcalpine_eflaw,
    'syllable_count': textstat.syllable_count,
    'lexicon_count': textstat.lexicon_count,
    'sentence_count': textstat.sentence_count,
    'char_count': textstat.char_count,
    'letter_count': textstat.letter_count,
    'polysyllabcount': textstat.polysyllabcount,
    'monosyllabcount': textstat.monosyllabcount,
}

for index, row in tqdm(input_data.iterrows(), desc="Text Preprocessing rows", total=len(input_data)):
    # Double quotes are removed before tokenizing; the cleaned text is stored
    # in 'message'.
    # NOTE(review): this raises AttributeError if AssertMessage is NULL --
    # confirm the column never is.
    item_message = row['AssertMessage'].replace('"', '')
    input_data.at[index, 'message'] = item_message

    # Only 'Identifier' messages without a '(' are split by ronin directly;
    # every other message goes through word_tokenize first.
    if row['Category'] == 'Identifier' and '(' not in item_message:
        raw_tokens = ronin.split(item_message)
    else:
        raw_tokens = word_tokenize(item_message)

    tokens = []
    for raw_token in raw_tokens:
        tokens.extend(_split_token(raw_token))
    tokens_text = [token for token in tokens if not _is_punctuation(token)]

    input_data.at[index, 'tokens'] = ",".join(tokens)
    input_data.at[index, 'tokens_count'] = len(tokens)
    input_data.at[index, 'tokens_text'] = ",".join(tokens_text)
    input_data.at[index, 'tokens_text_count'] = len(tokens_text)

    # Leading one, two and three tokens; the longer prefixes stay None when
    # the message has too few tokens.
    input_data.at[index, 'first_token'] = ' '.join(tokens[0:1])
    input_data.at[index, 'first2_token'] = ' '.join(tokens[0:2]) if len(tokens) >= 2 else None
    input_data.at[index, 'first3_token'] = ' '.join(tokens[0:3]) if len(tokens) >= 3 else None

    # All metrics are computed on the re-joined token stream (punctuation
    # tokens included), not on the original message text.
    item_message = " ".join(tokens)
    for column_name, metric in _TEXTSTAT_COLUMNS.items():
        input_data.at[index, column_name] = metric(item_message)
    # text_standard is the one metric that needs an extra argument.
    input_data.at[index, 'score_text_standard'] = textstat.text_standard(item_message, float_output=True)



# Persist the enriched frame as the 'readability_score' table, replacing any
# existing table of that name, and release the SQLite connection afterwards
# even if the write fails.
try:
    input_data.to_sql('readability_score', db_connection, if_exists='replace', index=False)
finally:
    db_connection.close()
