'''
poemtool.py
A Python script to create wordlevel and poemlevel metadata for poems of the childPoeDE corpus 
(Lehmann, Heumann, Kuijpers, Lauer & Lüdtke, 2023)

Requirements:
- Python 3
- Dependencies: pandas, rhymetagger, espeak
- An input directory containing the raw poem text files. Most convenient if input directory and poemtool.py exist on the same level.

i.e.
.
.
| - Some Directory
    | - input folder with poem texts
    | - poemtool.py
    | - output folder
|
.
.

How to run the script:

1. Open Command Line Interface, e.g. Terminal, CMD, PowerShell, etc
2. Navigate to "Some Directory" ("cd" - change directory command) or type "cd" and drag-and-drop "Some Directory" from Finder/FileExplorer.
3. Run this script with the following command: "python poemtool.py --in_dir <name_of_input_directory> --out_dir <name_of_output_directory>"

Note:
The output directory is created automatically if it doesn't exist already.

'''


import argparse
import os
import re
import glob
import pandas as pd
from rhymetagger import RhymeTagger
from datetime import datetime

##### 1. Paths and variables

# NOTE: this validator is defined *before* the argument parser is built,
# because the parser references it by name (type=does_dir_exist) at module
# execution time; defining it further down raises a NameError on start-up.
def does_dir_exist(string):
    '''
    Validates a directory path given on the command line.

    Returns the unchanged path if it is an existing directory.
    Otherwise prints an error message and raises NotADirectoryError.
    '''
    if os.path.isdir(string):
        return string
    print('Error: Directory does not exist.')
    raise NotADirectoryError(string)


# Parse command line arguments
# (both options are required: without them the path handling below would
# fail with an unhelpful TypeError on None)
parser = argparse.ArgumentParser(
    description='Get Input and Output directories from CL.')
parser.add_argument('--in_dir', type=does_dir_exist, required=True,
                    help='The directory containing the raw poems.')
parser.add_argument('--out_dir', required=True,
                    help='The directory where the output files should be placed.')
args = parser.parse_args()

# Create output directory if it does not exist already
if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

# set path variables
CWD = os.getcwd()
INPUT_DIR = os.path.join(CWD, args.in_dir)
OUTPUT_DIR = os.path.join(CWD, args.out_dir)

# store all file names from input folder
# os.path.join keeps the pattern portable (a hard-coded backslash only works
# on Windows); sorting makes the processing order - and therefore the word
# ids assigned at the end - reproducible across runs and platforms
poemsfolder = sorted(glob.glob(os.path.join(INPUT_DIR, '*.txt')))

# create list for collecting dataframes with metadata for one poem (dfs in list are concatenated in end)
df_single_poem_list = []


##### 2. Helper functions

### os functions
# (does_dir_exist lives at the top of section 1, see note there)

### cleaning functions

def clean_poem_line_list(poem_line_list):
    '''
    Returns the text lines of a poem without the title line.

    The first element (title) is skipped, line breaks are removed from every
    remaining line, and lines that are empty or whitespace-only (e.g. the
    blank lines between stanzas) are left out.
    '''
    remaining = iter(poem_line_list)
    # skip the title line (no-op for an empty input)
    next(remaining, None)

    without_breaks = (raw.replace('\n', '') for raw in remaining)
    return [text for text in without_breaks if text.strip() != '']

def clean_line(line):
    """
    Returns the line with all line breaks removed and without leading or
    trailing whitespace.
    """
    return line.replace('\n', '').strip()

### sonority functions

# Sonority lookup table - ranking based on Jacobs (2017) and Stenneken et al. (2005).
# Built once at import time instead of on every call.
_SONORITY_BY_LETTER = {
    'a': 7,
    'e': 6, 'o': 6,
    'i': 5, 'u': 5,
    'j': 4, 'w': 4, 'y': 4,
    'l': 3, 'r': 3,
    'm': 2, 'n': 2,
}
# all remaining consonants share the lowest rank
_SONORITY_BY_LETTER.update(dict.fromkeys('bcdfghkpqstvxz', 1))
# accented vowels score like their base vowel
_SONORITY_BY_LETTER['é'] = _SONORITY_BY_LETTER['è'] = _SONORITY_BY_LETTER['e']
_SONORITY_BY_LETTER['à'] = _SONORITY_BY_LETTER['á'] = _SONORITY_BY_LETTER['a']
# umlauts score as the mean of their base vowel and 'e'
_SONORITY_BY_LETTER['ä'] = (_SONORITY_BY_LETTER['a'] + _SONORITY_BY_LETTER['e']) / 2
_SONORITY_BY_LETTER['ö'] = (_SONORITY_BY_LETTER['o'] + _SONORITY_BY_LETTER['e']) / 2
_SONORITY_BY_LETTER['ü'] = (_SONORITY_BY_LETTER['u'] + _SONORITY_BY_LETTER['e']) / 2
_SONORITY_BY_LETTER['ß'] = _SONORITY_BY_LETTER['s']


def _letter_sonority(letter):
    '''
    Returns the sonority rank of a single letter, or None if it cannot be ranked.

    Letters missing from the table (e.g. 'ç', 'ñ', 'ï') are reduced to their
    base letter via Unicode decomposition before the lookup is retried.
    '''
    import unicodedata

    key = letter.lower()
    rank = _SONORITY_BY_LETTER.get(key)
    if rank is None:
        # first code point of the decomposed form is the base letter
        base_letter = unicodedata.normalize('NFD', key)[:1]
        rank = _SONORITY_BY_LETTER.get(base_letter)
    return rank


def son_score_g(word):
    '''
    Calculates Sonority Score (min score: 1, max score: 7)

    The score is the average sonority rank of the letters in the word,
    rounded to two digits. Non-alphabetic characters (punctuation, digits)
    are ignored. Alphabetic characters that cannot be ranked even after
    stripping their diacritics are ignored as well instead of raising a
    KeyError. Returns 0 if the word contains no rankable letter.
    '''
    rank_sum = 0      # sum of the sonority ranks of all scored letters
    scored_letters = 0  # number of letters that contributed to rank_sum

    for letter in word:
        if not letter.isalpha():
            continue     # ignore punctuation and non-words
        rank = _letter_sonority(letter)
        if rank is None:
            continue     # letter outside the ranking (e.g. 'ø'), skip it
        rank_sum += rank
        scored_letters += 1

    if scored_letters == 0:
        # nothing to average, avoid zero division
        return 0

    # total sonority = average sonority score of letters in word (two digits)
    return round(rank_sum / scored_letters, 2)

def calc_average_son(son_score_list, poem_length_words):
    """
    Calculates the average sonority score of a poem.

    Adds up the scores in son_score_list, leaving out every position that is
    flagged as a title word in the global data_dict['Word_In_Title'], and
    divides the sum by poem_length_words (rounded to two digits).
    Returns 0 if poem_length_words is 0.
    """
    body_score_total = 0
    for position, score in enumerate(son_score_list):
        # title words do not count towards the poem average
        if data_dict['Word_In_Title'][position] != 1:
            body_score_total += score

    if poem_length_words == 0:
        return 0
    return round(body_score_total / poem_length_words, 2)

### rhyme processing functions

def get_rhyme_structure(poemtext):
    '''
    Tags the poem with rhymetagger and returns the tagger's result.

    A fresh RhymeTagger is created, the 'de' model is loaded (non-verbose)
    and the poem is tagged with output_format=3.
    '''
    tagger = RhymeTagger()
    tagger.load_model(model='de', verbose=False)
    return tagger.tag(poemtext, output_format=3)

def check_for_rhymes(rhyme_struct):
    '''
    Takes rhyme structure list (output of rhymetagger) as input.
    Returns 1 if at least one element of the list is an integer
    (i.e. the poem contains rhymes), otherwise 0.
    '''
    return int(any(isinstance(entry, int) for entry in rhyme_struct))

def get_nr_rhyming_lines(rhyme_struct):
    '''
    Takes rhyme structure list (output of rhymetagger) as input.
    Returns how many elements of the list are integers,
    i.e. the number of rhyming lines.
    '''
    return sum(1 for entry in rhyme_struct if isinstance(entry, int))
        
def get_rhyming_degree(nr_rhyming_lines, nr_lines_total):
    '''
    Returns the share of rhyming lines in the poem (rhyming lines divided by
    total lines, rounded to two digits). Returns 0 for a poem without lines.
    '''
    if nr_lines_total == 0:
        return 0
    return round(nr_rhyming_lines / nr_lines_total, 2)

def get_rhyme_info(poemfile):
    '''
    Reads the poem file and determines its rhyme info.

    Returns a tuple of
    (rhyme structure as string, has_rhyme, nr_rhyming_lines, rhyming_degree).
    The rhyming degree relates the rhyming lines to the number of entries in
    the rhyme structure.
    '''
    with open(poemfile, 'r', encoding='utf8') as poem_handle:
        raw_text = poem_handle.read()

    # text lines only: title and empty lines are removed
    text_lines = clean_poem_line_list(raw_text.split('\n'))

    structure = get_rhyme_structure(text_lines)
    rhyming_line_count = get_nr_rhyming_lines(structure)

    return (
        str(structure),
        check_for_rhymes(structure),
        rhyming_line_count,
        get_rhyming_degree(rhyming_line_count, len(structure)),
    )

### layout functions

def get_layout_data(lines):
    '''
    Determines layout flags for a list of lines.

    Returns the tuple (special_layout, has_punct, has_uc, has_lc, has_tc),
    each value being 1 if the feature occurs in at least one line, else 0:
    - special_layout: a line contains a tab
    - has_punct: a line contains one of the punctuation characters
      . , ; : ! ? – - * ( ) [ ] { } · … „ “
    - has_uc / has_lc: a line contains an uppercase / lowercase letter
      (Unicode-aware, so Ä Ö Ü and ä ö ü ß count as well)
    - has_tc: a line contains a space-separated token in title case
    '''
    # raw string: the former non-raw pattern relied on invalid escape sequences
    punct_pattern = re.compile(r'[.,;:!?–\-*()\[\]{}·…„“]')

    special_layout = has_punct = has_uc = has_lc = has_tc = 0

    for line in lines:
        # get SPECIAL_LAYOUT (special layout = poem contains tabs)
        if '\t' in line:
            special_layout = 1

        # get HAS_PUNCT
        if punct_pattern.search(line):
            has_punct = 1

        # get Has_UPPERCASE
        if any(char.isupper() for char in line):
            has_uc = 1

        # get Has_LOWERCASE
        if any(char.islower() for char in line):
            has_lc = 1

        # get HAS_TITLECASE
        # every token of the line is inspected; previously a stray break
        # ended the token loop after the first token of each line
        if any(token.istitle() for token in line.split(' ')):
            has_tc = 1

    return special_layout, has_punct, has_uc, has_lc, has_tc

### data functions

def create_data_dict():
    '''
    Returns a new dictionary that maps every column header of the output csv
    to its own empty list (the lists are filled while iterating over tokens).
    The key order defines the column order.
    '''
    column_headers = (
        "Poem_Id",
        "Title_Txt_File",
        "Title_Poem",
        "Has_Title",
        "Special_Layout",
        "Has_Punct",
        "Has_Uppercase",
        "Has_Lowercase",
        "Has_Titlecase",
        "Has_Sentence_Like_Structure",
        "Poem_Length_Stanzas",
        "Poem_Length_Lines",
        "Poem_Length_Words",
        "Stanza_Nr",
        "Line_Nr",
        "Line_Text",
        "Line_Length_Words",
        "Word_Text_With_Punct",
        "Word_Text",
        "Word_Text_LC",
        "Word_Nr_In_Poem",
        "Word_Nr_In_Stanza",
        "Word_Nr_In_Line",
        "Word_Length",
        "Word_In_Title",
        "Sonority_Score",
        "Average_Poem_Sonority",
        "Rhyme_Structure",
        "Rhyme",
        "Nr_Rhyming_Lines",
        "Rhyming_Degree",
    )
    return {header: [] for header in column_headers}

def create_stanza_list(lines):
    '''
    Splits poem into title and stanzas, saves data in nested list
    Format:
    [
        [title],
        [line1, line2, line3, ...],   # stanza 1
        [line1, line2, ...],          # stanza 2
        ...
    ]

    The first line is always treated as the title (stored without line
    break). All following lines are grouped into stanzas; empty or
    whitespace-only lines act as stanza separators. Stanza lines are kept
    unchanged (including their line break).

    Consecutive or trailing blank lines do not create empty stanzas, and a
    missing blank line after the title does not swallow the first text line.
    For an empty input the result is [['']] (empty title, no stanzas).
    '''
    if not lines:
        return [['']]

    # add title (assumes first line is always title)
    stanzas = [[lines[0].replace('\n', '')]]

    current_stanza = []
    for line in lines[1:]:
        if line.strip() == '':
            # blank line: close the stanza collected so far (if any)
            if current_stanza:
                stanzas.append(current_stanza)
                current_stanza = []
        else:
            current_stanza.append(line)

    # add last stanza (poem text usually does not end with a blank line)
    if current_stanza:
        stanzas.append(current_stanza)

    return stanzas

# single-character tokens that are punctuation, not words
_PUNCT_TOKENS = frozenset('.,;:!?–-*()[]{}·…„“')
# token consisting of dots only, e.g. '..' or '...'
_DOTS_ONLY = re.compile(r'\.+')


def create_word_list(line):
    """
    Splits line at single spaces into tokens and returns the tokens that are
    not pure punctuation.

    Dropped are tokens that are exactly one punctuation character and tokens
    that consist of dots only. A token that merely starts with dots
    (e.g. '...und') is a word with attached punctuation and is kept.
    Empty tokens (caused by consecutive spaces) are kept as well.
    """
    return [
        token
        for token in line.split(' ')
        # fullmatch: the whole token has to be dots, a dotted prefix is not enough
        if token not in _PUNCT_TOKENS and not _DOTS_ONLY.fullmatch(token)
    ]

def contains_only_allowed_characters(word):
    """
    Checks if word contains at least one allowed character.
    Allowed characters are:
    - all alphanumeric characters
    - &
    - €
    & and € are the only additional allowed characters apart from alphanumerics,
    since they are used as synonyms for the words "und"/"Euro" in all cases

    Note: despite the function name, a single allowed character is enough
    for the result to be True (so 'Hallo,' passes, '...' and '' do not).
    """
    return any(char.isalnum() or char in ('&', '€') for char in word)


#### 3. Process poem files
"""   
Steps:
    - read file (by stanza, by line, by word)
    - calculate scores, save them in a data dict
    - transform data dicts into dataframes per poem
    - concat dataframes
    - save concatenated dataframes as csv
"""

# Builds one word-level dataframe per poem file and collects it in
# df_single_poem_list. Every row describes one word (title words included,
# flagged via Word_In_Title); poem-level values are repeated in every row.
for poemfile in poemsfolder:
    # setup filename variables
    txt_filename = os.path.split(poemfile)[1]
    filename_parts = txt_filename.split('_')

    # get POEM ID (file name part before the first underscore)
    poem_id = f'p_{filename_parts[0]}'
    # file name without the leading id part
    txt_filename_no_id = '_'.join(filename_parts[1:])

    # get rhyme info
    rhyme_structure_str, has_rhyme, nr_rhyming_lines, rhyming_degree = get_rhyme_info(poemfile)

    with open(poemfile, 'r', encoding='utf8') as f:
        lines = f.readlines()

    ########## CREATE STANZA LIST AND DATA DICT ############

    # create nested list with stanzas (title and lines separated)
    stanzas = create_stanza_list(lines)
    # create data dict with empty list as values (to be filled below)
    # NOTE: calc_average_son reads this module-level name, do not rename it
    data_dict = create_data_dict()

    ########## GENERATE DATA FOR DATA DICT (poem level) ##########

    # get TITLE
    poem_title = ' '.join(stanzas[0]).strip()

    # get HAS_TITLE (compared on the stripped title, so trailing whitespace
    # after the placeholder does not turn it into a "real" title)
    has_title = 0 if poem_title == 'Kein Titel' else 1

    # get layout data
    special_layout, has_punct, has_uc, has_lc, has_tc = get_layout_data(lines)

    # has_sentence_like_structure
    has_sent_struct = 1 if (has_punct == 1 and has_uc == 1 and has_lc == 1 and has_tc == 1) else 0

    # values that are identical for every row of this poem
    poem_level_values = {
        'Poem_Id': poem_id,
        'Title_Txt_File': txt_filename_no_id,
        'Title_Poem': poem_title,
        'Has_Title': has_title,
        'Special_Layout': special_layout,
        'Has_Punct': has_punct,
        'Has_Uppercase': has_uc,
        'Has_Lowercase': has_lc,
        'Has_Titlecase': has_tc,
        'Has_Sentence_Like_Structure': has_sent_struct,
        'Poem_Length_Stanzas': len(stanzas) - 1,  # first entry is the title
        'Rhyme_Structure': rhyme_structure_str,
        'Rhyme': has_rhyme,
        'Nr_Rhyming_Lines': nr_rhyming_lines,
        'Rhyming_Degree': rhyming_degree,
    }

    # setup counting variables for loop over stanzas
    word_count_poem = line_nr = title_length = 0

    # read poem stanza by stanza (stanza 0 is the title)
    for i_s, stanza in enumerate(stanzas):
        is_title = i_s == 0
        word_in_stanza_counter = 0

        # read stanza line by line
        for line in stanza:
            # get LINE_NR (line with title gets 0)
            if not is_title:
                line_nr += 1

            # create word list (no punctuation)
            line = clean_line(line)
            # keep only real words: no empty tokens (consecutive spaces) and
            # no tokens without any allowed character (alphanum, & or €).
            # Filtering up front keeps Line_Length_Words and Word_Nr_In_Line
            # consistent with the rows that are actually written.
            words = [
                token for token in create_word_list(line)
                if token != '' and contains_only_allowed_characters(token)
            ]

            # read word list
            for i_w, word in enumerate(words):
                ########### GENERATE DATA FOR DATA_DICT (word level) ############

                # get WORD_TEXT
                word_text_with_punct = word.strip()
                word_text_no_punct = word.strip('.,;:!?–-*()[]{}·…„“\t')
                word_text_no_punct_lc = word_text_no_punct.lower()

                # update counters (title words are counted separately and
                # keep word numbers 0)
                if is_title:
                    title_length += 1
                else:
                    word_in_stanza_counter += 1
                    word_count_poem += 1

                word_level_values = {
                    'Stanza_Nr': i_s,
                    'Line_Nr': line_nr,
                    'Line_Text': line,
                    'Line_Length_Words': len(words),
                    'Word_Text_With_Punct': word_text_with_punct,
                    'Word_Text': word_text_no_punct,
                    'Word_Text_LC': word_text_no_punct_lc,
                    'Word_Nr_In_Poem': word_count_poem,
                    'Word_Nr_In_Stanza': word_in_stanza_counter,
                    'Word_Nr_In_Line': i_w + 1,
                    'Word_Length': len(word_text_no_punct_lc),
                    'Word_In_Title': 1 if is_title else 0,
                    'Sonority_Score': son_score_g(word_text_no_punct_lc),
                }

                ########### APPEND DATA TO DATA_DICT ############
                for column, value in {**poem_level_values, **word_level_values}.items():
                    data_dict[column].append(value)

    # calculate AVERAGE SONORITY SCORE
    # word_count_poem equals the last Word_Nr_In_Poem entry, but is also
    # defined (0) for a poem without any usable word, where indexing the
    # empty column would raise an IndexError
    av_son_score = calc_average_son(data_dict['Sonority_Score'], word_count_poem)

    ########## APPEND DATA TO DATA_DICT ##########
    # poem totals are only known now; fill them in for every row written
    nr_rows = title_length + word_count_poem
    data_dict['Poem_Length_Words'].extend([word_count_poem] * nr_rows)
    data_dict['Poem_Length_Lines'].extend([line_nr] * nr_rows)
    data_dict['Average_Poem_Sonority'].extend([av_son_score] * nr_rows)

    ### create df for single poem
    # data dict to df
    df_single_poem = pd.DataFrame.from_dict(data_dict)

    # append df to list of dfs for all poems
    df_single_poem_list.append(df_single_poem)

    print(f'Processed - poem {txt_filename}')


########## CREATE OUTPUT DATAFRAME ###########

# pd.concat raises an unspecific ValueError on an empty list, so stop with a
# clear message if the input folder did not contain any poem file
if not df_single_poem_list:
    raise SystemExit(f'Error: No .txt poem files found in {INPUT_DIR}')

# concat dfs from list
df_wordlevel_file_concat = pd.concat(df_single_poem_list, axis=0, ignore_index=True)

# exclude rows for words "Kein" and "Titel" (first line of poems without title)
# Only title rows of poems whose title is the placeholder "Kein Titel" are
# dropped. The former query expression was evaluated as
# (Word_In_Title == 1 & Word_Text == "Kein") | Word_Text == "Titel"
# and therefore also removed every word "Titel" inside poem texts and the
# word "Kein" from real titles.
is_placeholder_title_word = (
    (df_wordlevel_file_concat['Word_In_Title'] == 1)
    & (df_wordlevel_file_concat['Title_Poem'] == 'Kein Titel')
)
df_wordlevel_file_concat_reduced = df_wordlevel_file_concat.loc[~is_placeholder_title_word].copy()

# add word ids
cols = list(df_wordlevel_file_concat_reduced.columns)
cols.insert(0, 'Word_Id') # word id as first column
df_wordlevel_file_concat_reduced = df_wordlevel_file_concat_reduced.reindex(columns=cols)
df_wordlevel_file_concat_reduced = df_wordlevel_file_concat_reduced.assign(Word_Id=lambda x: ['w_{:06d}'.format(i) for i in range(1, len(x) + 1)])

# df to csv
date_today = datetime.today().strftime('%Y-%m-%d')
df_wordlevel_file_concat_reduced.to_csv(os.path.join(OUTPUT_DIR, f"{date_today}_wordlevel_all_cols.csv"), encoding='utf-8', sep='|', index=False)

print("Created - summary wordlevel file")