import os
import pandas as pd
import re
import argparse
import datetime
import sklearn.metrics

""" Program that 
    - uses the whole accumulated corpus of IOB files
    - goes through each document and
        - counts annotations
            - each B- tag (together with the I- tags that follow it) is one annotation
            - each annotation is counted for its category
        - counts sentences (one per empty line)
        - counts tokens (one per non-empty line in the IOB file)

- creates pretty pictures from the statistics
"""

def run_through_folder_structure(path):
    """Collect, print and return corpus statistics for the IOB files in ``path``.

    Every regular file directly inside ``path`` is read as an IOB file: one
    ``token tag`` pair per line, separated by a single space, with an empty
    line closing each sentence. Sub-directories are ignored.

    Counting rules:
        - every ``B-<category>`` tag starts a new annotation and is counted
          for ``<category>``
        - every ``B-`` or ``I-`` tagged line is one annotated token
        - every empty line is one sentence
        - every non-empty line is one token (including malformed lines that
          are skipped for the annotation counts)

    Args:
        path: directory that contains the IOB files.

    Returns:
        dict with the keys ``overall_num_docs``, ``overall_num_annotations``,
        ``overall_annotated_tokens``, ``category_annotations``,
        ``overall_num_sentences`` and ``overall_num_tokens`` (the same values
        that are printed).
    """
    files = os.listdir(path)
    print(files)
    overall_num_docs = 0
    overall_num_annotations = 0  # number of annotations (one per B- tag)
    overall_annotated_tokens = 0  # number of tokens inside annotations
    category_annotations = {}  # category name -> number of annotations
    overall_num_sentences = 0  # one per empty line
    overall_num_tokens = 0  # one per non-empty line
    for file in files:
        file_path = os.path.join(path, file)
        if not os.path.isfile(file_path):
            # os.listdir also lists sub-directories; open() would fail on them.
            continue
        overall_num_docs += 1
        with open(file_path, 'r') as f:
            lines = f.readlines()
        num_newlines = 0
        for line in lines:
            if line == "\n":
                num_newlines += 1
                continue
            # Strip the line terminator explicitly instead of slicing off the
            # last character of the tag: the final line of a file may not end
            # with a newline, which used to truncate the category name.
            line_splitted = line.rstrip("\n").split(" ")
            if len(line_splitted) > 2:
                print("more than 2 tokens; skipped")
                continue
            if len(line_splitted) < 2:
                # No tag column at all; indexing [1] would raise IndexError.
                print("less than 2 tokens; skipped")
                continue
            annotation = line_splitted[1]
            if annotation.startswith("B-"):
                annotation_name = annotation[2:]  # remove the "B-" prefix
                overall_num_annotations += 1
                overall_annotated_tokens += 1
                category_annotations[annotation_name] = (
                    category_annotations.get(annotation_name, 0) + 1
                )
            elif annotation.startswith("I-"):
                overall_annotated_tokens += 1
        overall_num_sentences += num_newlines
        overall_num_tokens += (len(lines) - num_newlines)
    print(
        f"""
 Statistics: 
    overall_num_docs: {overall_num_docs}\n
    overall_num_annotations: {overall_num_annotations}\n
    overall_annotated_tokens: {overall_annotated_tokens}\n
    category_annotations: {category_annotations}\n
    overall_num_sentences: {overall_num_sentences}\n
    overall_num_tokens: {overall_num_tokens}\n"""
    )
    return {
        "overall_num_docs": overall_num_docs,
        "overall_num_annotations": overall_num_annotations,
        "overall_annotated_tokens": overall_annotated_tokens,
        "category_annotations": category_annotations,
        "overall_num_sentences": overall_num_sentences,
        "overall_num_tokens": overall_num_tokens,
    }


if __name__ == '__main__':
    # Command-line entry point: the only option is the (mandatory) directory
    # that holds the IOB files to analyse.
    cli = argparse.ArgumentParser(
        prog='calculate_statistics.py.py',
        description='',
        epilog='May the Force be with you!',
    )
    cli.add_argument(
        '-i',
        '--input_dir',
        required=True,
        help="path for input directory",
    )
    options = cli.parse_args()
    # Echo the parsed arguments so the run can be traced in the console output.
    print(options)

    # Compute and print the statistics for every file in the given directory.
    run_through_folder_structure(options.input_dir)
