from ietfdata.mailarchive2 import *
import email.utils
import json
import re
import ietfdata.rfcindex
from rfcerrata import *

# Data sources: the errata collection and the RFC index.
rfc_errata = RfcErrata()
rfc_index = ietfdata.rfcindex.RFCIndex()

# Two lookup tables over the errata:
#   errata_by_rfc: doc_id    -> list of errata_id values filed against it
#   errata_by_id:  errata_id -> the erratum object itself
errata_by_rfc = {}
errata_by_id = {}

for erratum in rfc_errata.errata:
    errata_by_rfc.setdefault(erratum.doc_id, []).append(erratum.errata_id)
    errata_by_id[erratum.errata_id] = erratum

# English month name -> calendar month number (January is 1, December is 12).
month_dict = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        start=1,
    )
}

# Mail archive handle; MailArchive comes from one of the star imports above
# (presumably ietfdata.mailarchive2 -- confirm).
archive = MailArchive()

# maillist_area_type_status: mailing list name -> record that is read below
# through its 'area' and 'type' keys.
# emailID_pid_dict: sender e-mail address -> person id (pid).
# Both files are opened with context managers so the handles are closed as
# soon as the JSON has been parsed instead of being left to the garbage
# collector.
with open('data/maillist_area_type_status.json') as area_status_file:
    maillist_area_type_status = json.load(area_status_file)
with open('data/emailID_pid_dict.json') as email_pid_file:
    emailID_pid_dict = json.load(email_pid_file)

# Primary address pattern. Besides plain "user@host.tld" it also accepts
# obfuscated separators such as "user at host.tld" or "user (at) host.tld".
# The full address is capture group 1, so re.findall() yields tuples.
ren = r'(?:\.?)([\w\-_+#~!$&\'\.]+(?<!\.)(@|[ ]?\(?[ ]?(at|AT)[ ]?\)?[ ]?)(?<!\.)[\w]+[\w\-\.]*\.[a-zA-Z-]{2,3})(?:[^\w])'

# Simpler fallback pattern with a single capture group, so re.findall()
# yields plain strings.
ren2 = r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)'

# Accumulators filled while walking the mail archive:
#   pid_area_counts: pid -> {"total": n, <area>: n, ...} message counts
#   draft_pids:      draft name -> list of pids that posted about it
pid_area_counts = dict()
draft_pids = dict()

# Pass 1 over the archive: for every mailing list whose type is "wg", count
# the messages sent by each known person (pid), both per area of the list and
# as a running "total".
for mailing_list_name in archive.mailing_list_names():
    ml = archive.mailing_list(mailing_list_name)
    m_area = maillist_area_type_status[mailing_list_name]['area']
    if maillist_area_type_status[mailing_list_name]['type'] != "wg":
        continue

    for message in ml.messages():
        from_addr = message.header_from()
        if from_addr is None or isinstance(from_addr, list):
            continue

        # Apostrophes are masked while matching and restored afterwards.
        masked_from = from_addr.replace("'", "__apostrophe__")

        matches = re.findall(ren, masked_from)
        if matches:
            # `ren` has several groups; group 1 holds the whole address.
            email = matches[0][0]
        else:
            matches = re.findall(ren2, masked_from)
            if not matches:
                # No address could be extracted from this From header.
                # Skip the message: falling through would reuse the address
                # left over from the previous message (attributing this
                # message to the wrong sender), or fail outright on the very
                # first message because `email` would still be the imported
                # module.
                continue
            email = matches[0]

        email = email.replace("__apostrophe__", "'").lower()
        if '@' not in email:
            # Obfuscated "user at host" form: rebuild a normal address.
            email = email.replace(' at ', '@')
        from_addr = email

        if from_addr not in emailID_pid_dict:
            continue
        from_pid = emailID_pid_dict[from_addr]

        # "total" is created first so it is always present for this pid.
        area_counts = pid_area_counts.setdefault(from_pid, {"total": 0})
        area_counts[m_area] = area_counts.get(m_area, 0) + 1
        area_counts["total"] += 1

    # Progress output: one line per processed working-group list.
    print(f"{mailing_list_name}, {m_area}")


# Pass 2 over the archive: for every mailing list whose type is "wg", record
# which known persons (pids) sent a message whose subject mentions a draft.
# draft_pids maps the draft name (without a trailing "-NN" version suffix)
# to the list of sender pids, one entry per matching message.
for mailing_list_name in archive.mailing_list_names():
    ml = archive.mailing_list(mailing_list_name)
    m_area = maillist_area_type_status[mailing_list_name]['area']
    if maillist_area_type_status[mailing_list_name]['type'] != "wg":
        continue

    for message in ml.messages():
        from_addr = message.header_from()
        if from_addr is None or isinstance(from_addr, list):
            continue

        # Apostrophes are masked while matching and restored afterwards.
        masked_from = from_addr.replace("'", "__apostrophe__")

        matches = re.findall(ren, masked_from)
        if matches:
            # `ren` has several groups; group 1 holds the whole address.
            email = matches[0][0]
        else:
            matches = re.findall(ren2, masked_from)
            if not matches:
                # No address could be extracted from this From header.
                # Skip the message: falling through would reuse the address
                # left over from an earlier message and credit the draft
                # discussion to the wrong person.
                continue
            email = matches[0]

        email = email.replace("__apostrophe__", "'").lower()
        if '@' not in email:
            # Obfuscated "user at host" form: rebuild a normal address.
            email = email.replace(' at ', '@')
        from_addr = email

        if from_addr not in emailID_pid_dict:
            continue
        from_pid = emailID_pid_dict[from_addr]

        # A missing or non-text subject cannot name a draft. This explicit
        # check replaces a blanket `except Exception` that hid every error.
        subject = message.header_subject()
        if not isinstance(subject, str):
            continue

        # Raw string, and no '|' inside the class: within [...] a '|' is a
        # literal pipe character, not alternation.
        draft_match = re.search(r"draft-[A-Za-z0-9-]*", subject)
        if draft_match is None:
            continue

        draft_name = draft_match.group(0)
        if re.search(r"-[0-9][0-9]$", draft_name):
            # Drop the two-digit version suffix, e.g. "-07".
            draft_name = draft_name[:-3]
        draft_pids.setdefault(draft_name, []).append(from_pid)

    # Progress output: one line per processed working-group list.
    print(f"{mailing_list_name}, {m_area}")

# pid_areas: pid -> (area, fraction) for the area holding the largest share
# of that person's counted messages. The winning area is also written to
# data/pid_area_map.csv as "pid,area", one line per pid.
pid_areas = {}
with open("data/pid_area_map.csv", "w") as pidAreaMapFile:
    for pid, counts in pid_area_counts.items():
        print(pid, counts)

        message_total = counts["total"]
        area_fractions = {
            area: area_total / message_total
            for area, area_total in counts.items()
            if area != "total"
        }

        # Ascending stable sort; the last element is the dominant area and,
        # on ties, the one that was inserted last.
        ranked_areas = sorted(area_fractions.items(), key=lambda item: item[1])
        dominant = ranked_areas[-1]

        print(pid, counts, dominant)
        pid_areas[pid] = dominant
        print(f"{pid},{dominant[0]}", file=pidAreaMapFile)

# Write data/draft_pid_discuss_map.csv: one line per draft, holding the draft
# name followed by every pid recorded for it, comma separated.
with open("data/draft_pid_discuss_map.csv", "w") as draftPidDiscussFile:
    for draft, discussing_pids in draft_pids.items():
        pid_columns = ",".join(map(str, discussing_pids))
        print(f"{draft},{pid_columns}", file=draftPidDiscussFile)

# Parallel lists, appended to only for RFCs whose draft had at least one
# recorded discussant: total errata count and number of distinct areas.
errata_counts = []
areas_counts = []

# Write data/draft-area-discussion-counts.csv with one line per RFC:
#   doc_id, draft, errata within two years of publication, all errata,
#   number of distinct dominant areas among the draft's discussants.
with open("data/draft-area-discussion-counts.csv", "w") as draftDiscussionCountFile:
    for rfc in rfc_index.rfcs(since="2001-01", until="2020-12"):
        all_errata_ids = errata_by_rfc.get(rfc.doc_id, [])

        # Only year and month are used, so publication is pinned to the
        # first day of the month.
        rfc_pub_datetime = datetime(rfc.year, month_dict[rfc.month], 1)
        errata_within_two_years_of_pub = [
            errata_id
            for errata_id in all_errata_ids
            if (errata_by_id[errata_id].submit_date - rfc_pub_datetime).days <= 365*2
        ]

        # The last three characters of the draft string are dropped to get
        # the key used in draft_pids (which has no "-NN" version suffix).
        discussant_pids = draft_pids.get(str(rfc.draft)[:-3], [])
        areas = [pid_areas[pid][0] for pid in discussant_pids]
        distinct_area_count = len(set(areas))

        if distinct_area_count > 0:
            errata_counts.append(len(all_errata_ids))
            areas_counts.append(distinct_area_count)

        print(
            f"{rfc.doc_id},{rfc.draft},{len(errata_within_two_years_of_pub)},{len(all_errata_ids)},{distinct_area_count}",
            file=draftDiscussionCountFile,
        )