
import logging.config
import logging
import os
from pathlib import Path
from datetime import datetime, timedelta, date
from collections import OrderedDict
from dataclasses import dataclass
from datetime import date
from utils.helpers import get_repo_metadata
import pandas as pd


# Working directory the script was launched from.
cwd = os.getcwd()
# Parent of the working directory (presumably the repository root when the
# script is started from one of its sub-folders -- TODO confirm).
root_path = Path(cwd).parents[0]

# Start from a clean slate: loggers created before this point are disabled.
logging.config.dictConfig(
    {
        "version": 1,
        "disable_existing_loggers": True,
    }
)

# Root logger setup: every record is written both to a log file in the current
# working directory and to the console. ``logging.StreamHandler()`` without an
# argument writes to sys.stderr (not stdout).
logFormatter = logging.Formatter(
    "%(asctime)s [%(processName)-s] [%(levelname)-s] %(message)s"
)
rootLogger = logging.getLogger()
fileHandler = logging.FileHandler("commits-message-preprocess.log")
fileHandler.setFormatter(logFormatter)
rootLogger.addHandler(fileHandler)
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
rootLogger.addHandler(consoleHandler)
# Use the public setter instead of assigning ``.level`` directly: setLevel()
# validates the value and invalidates the logger's cached "is enabled" results.
rootLogger.setLevel(logging.DEBUG)

# Location of the classified-commits CSV, relative to the working directory.
filename = "fasttext-commit-classification/notebooks/commits_classified.csv"

# Collects the comma-separated output rows built by analyze_data().
data = []

# Load all classified commits and bucket the rows by their "project" column.
df = pd.read_csv(filename)
df_grouped = df.groupby(by="project")

print(df_grouped["project"])

def incubation_months(start_date, end_date):
    """
    Lists the calendar months touched by the days from start_date up to,
    but not including, end_date

    Parameters
    ----------
    start_date: str
        The start date of the project as a string %Y-%m-%d

    end_date: str
        The end date of the project as a string %Y-%m-%d

    Returns
    -------
    OrderedDict
        One entry per calendar month, in chronological order. Keys are the
        first day of the month formatted as %Y-%m-01 and every value is 0.
        Empty when end_date is not after start_date.
    """
    first_parts = start_date.split("-")
    last_parts = end_date.split("-")

    def to_date(parts):
        # Only the first three dash-separated fields (year, month, day) are read.
        return date(int(parts[0]), int(parts[1]), int(parts[2]))

    first_day = to_date(first_parts)
    last_day = to_date(last_parts)
    day_count = (last_day - first_day).days

    # Walk the range one day at a time; days of the same month collapse onto
    # a single key, so each month appears once in order of first occurrence.
    months = OrderedDict()
    for offset in range(day_count):
        month_key = (first_day + timedelta(offset)).strftime("%Y-%m-01")
        months[month_key] = 0
    return months

def analyze_data(repos_info):
    """
    Counts the classified commits per project and incubation month and writes
    the result to ``commits_classified_processed.csv``

    For every project in the module-level ``df_grouped`` one row is produced
    per incubation month that has commits, holding the number of commits for
    each predicted label. Months listed in ``repos_info`` for which the
    project has no commits are added as all-zero rows. Rows are appended to
    the module-level ``data`` list, so calling this function more than once
    in the same process repeats the earlier rows in the output file.

    Parameters
    ----------
    repos_info: dict
        Maps a project name to the collection of its incubation months.
        Projects missing from this mapping are reported and skipped.
    """
    row_template = "{},{},{},{},{},{},{},{}"
    # Labels in output-column order; a label absent from a month counts as 0.
    labels = (
        "__label__corrective",
        "__label__features",
        "__label__nonfunctional",
        "__label__perfective",
        "__label__unknown",
    )

    for repo, project_commits in df_grouped:
        # Guard only the metadata lookup. Any other failure is a real error
        # and must not be reported as "project not found".
        try:
            all_inc_months = repos_info[repo]
        except KeyError:
            print(
                "{} git project not found in the projects/git folder. "
                "Skipping it".format(repo)
            )
            continue

        status = project_commits["status"].values[0]
        commits_inc_months = []
        for month, month_commits in project_commits.groupby("inc_month"):
            counts = month_commits.groupby("labels_predicted").size().to_dict()
            per_label = [int(counts.get(label, 0)) for label in labels]
            commits_inc_months.append(month)
            data.append(row_template.format(repo, status, month, *per_label))

        # Incubation months without a single commit still get a (zero) row.
        missing_months = set(all_inc_months).difference(commits_inc_months)
        if repo == "Nutch":
            print(all_inc_months)
            print(commits_inc_months)
            print(missing_months)
        for month in missing_months:
            data.append(row_template.format(repo, status, month, 0, 0, 0, 0, 0))

    with open("commits_classified_processed.csv", "w") as f:
        # NOTE(review): the last header field is " unknown" (leading space);
        # kept as-is because downstream readers may depend on it -- confirm.
        f.write("project,status,incubation_month,corrective,features,non_functional,perfective, unknown\n")
        f.write("\n".join(data))


def analyze(data) -> tuple:
    """
    Builds the (project name, incubation months) pair for one repository

    Parameters
    ----------
    data: object
        Repository metadata record exposing the attributes ``repo``,
        ``start_date`` and ``end_date``. Both dates are passed unchanged to
        incubation_months, which expects %Y-%m-%d strings. Note that this
        parameter shadows the module-level ``data`` list inside the function.

    Returns
    -------
    tuple
        ``(repo_name, inc_months)`` where ``repo_name`` is ``data.repo`` with
        the "incubator-retired-" / "incubator-" markers removed and
        surrounding whitespace stripped, and ``inc_months`` is the list
        ``[1, 2, ..., N]`` with N the number of months returned by
        incubation_months.
    """
    repo_name = (
        data.repo.replace("incubator-retired-", "").replace("incubator-", "").strip()
    )
    month_count = len(incubation_months(data.start_date, data.end_date))
    inc_months = list(range(1, month_count + 1))

    return (repo_name, inc_months)


def run():
    """
    Runs the pipeline sequentially: fetches the repository metadata, derives
    the incubation months of every repository and hands the resulting
    name -> months mapping to analyze_data
    """
    logging.info("Using single core")
    all_repos = get_repo_metadata()

    # analyze() yields (name, months) pairs, which dict() turns into a mapping.
    name_month_pairs = [analyze(repo) for repo in all_repos]
    metadata = dict(name_month_pairs)

    analyze_data(metadata)




def main():
    """Script entry point; delegates to run()."""
    run()


# Call main() only when this file is executed as a script.
if __name__ == "__main__":

    main()