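# Writes data/errata-location-percentiles.csv and data/errata-location-sections.csv
# (presumably the inputs for the location-percent.pdf and section-title-counts.pdf plots).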

from rfcerrata import RfcErrata
# Errata database object from the project-local rfcerrata module. The code
# below iterates Errata.errata and reads errata_id, errata_type_code and
# section from each entry.
Errata = RfcErrata()

import glob
import html2text
from html2txt import converters
from typing import List
import pandas as pd
# fetch texts and offsets

class ErratumLocation:
    """Position of one erratum link inside a converted RFC text.

    Instances are created empty and filled in by attribute assignment.
    """
    # Numeric id parsed from the ".../errata/eid<N>" link target.
    erratum_id : int
    # Character offset of that link within the owning document's text.
    erratum_offset: int

class TextWithErrata:
    """One inline-errata RFC document: converted text plus its erratum links.

    Instances are created empty and filled in by attribute assignment.
    """
    # NOTE(review): annotated as int, but the loader in this file stores the
    # digits sliced out of the filename, i.e. a str -- confirm which is wanted.
    rfc_id : int
    # Markdown-converted body of the document.
    text : str
    # One entry per erratum link found in `text`.
    errata_locations: List[ErratumLocation]


# One TextWithErrata per inline-errata HTML file found on disk.
data = []

# Markdown link fragment that precedes every erratum id in the converted text,
# e.g. "...](https://www.rfc-editor.org/errata/eid1234)".
str_to_find = "](https://www.rfc-editor.org/errata/eid"

for filename in glob.glob("data/inline-errata/r*.html"):
    # Context manager so the handle is closed even if reading fails.
    with open(filename, "r") as html_file:
        file_html = html_file.read()

    # The RFC number sits between the last "rfc" in the path and the ".html"
    # extension.  rfind keeps this correct no matter how many times "rfc"
    # appears in the directory part of the path.
    rfc_ind = filename.rfind("rfc")
    rfc_dot_ind = filename.rfind(".html")
    rfc_id = filename[rfc_ind + 3: rfc_dot_ind]

    # Only the part from the first <pre> onwards is converted; the leading
    # part of the page is dropped.
    first_pre = file_html.find("<pre>")
    rest_of_html = file_html[first_pre:]
    cleaned_text = converters.Html2Markdown().convert("<html><body>" + rest_of_html).replace("<pre>","").replace("</pre>","").strip()

    # Walk over every erratum link in the converted text, in document order.
    eid_location_list = []
    search_from = 0
    while True:
        ind = cleaned_text.find(str_to_find, search_from)
        if ind == -1:
            break
        eid_start_ind = ind + len(str_to_find)
        eid_end_ind = cleaned_text.find(")", eid_start_ind)
        if eid_end_ind == -1:
            # Unterminated link at the end of the text: nothing left to parse
            # (and continuing would never advance the search position).
            break
        search_from = eid_end_ind + 1

        eid_text = cleaned_text[eid_start_ind:eid_end_ind]
        # A non-numeric id raises ValueError here, as it always did.
        eid = int(eid_text)

        loc = ErratumLocation()
        loc.erratum_id = eid
        # Offset of the FIRST link to this erratum.  The closing ")" is part
        # of the needle so that e.g. eid12 cannot match inside eid123.
        loc.erratum_offset = cleaned_text.find(str_to_find + eid_text + ")")
        eid_location_list.append(loc)

    t = TextWithErrata()
    t.rfc_id = rfc_id
    t.text = cleaned_text
    t.errata_locations = eid_location_list
    data.append(t)

# Erratum id -> type code, for every erratum in the database.
eid2type = {
    erratum.errata_id: erratum.errata_type_code
    for erratum in Errata.errata
}
print(len(eid2type))


# Ids of all errata whose link was located in one of the loaded texts
# (the list keeps duplicates, the set does not).
found_list = [
    location.erratum_id
    for document in data
    for location in document.errata_locations
]
found_set = set(found_list)

# Erratum id -> raw section string as stored in the errata database.
eid2sectionnumjson = {}

def isValidSectionNumberElement(e):
    """Return True if *e* is a valid dot-separated part of a section number.

    A part is valid when it is purely numeric ("12") or a single letter ("A").
    """
    return e.isnumeric() or (len(e) == 1 and e.isalpha())

def isValidSectionNumber(n):
    """Return True if the section string *n* starts with a section number.

    Only the leading token of *n* is examined: everything after the first
    whitespace, "," or ";" is ignored, and one trailing "." is dropped, so
    "9.1", "9.1.", "9.1. something" and "3, 4" are all accepted.  Every
    dot-separated part of that token must satisfy
    isValidSectionNumberElement.

    None, blank strings and strings starting with "99" are rejected.
    """
    if n is None or n.strip() == "":
        return False
    if n.startswith("99"):
        return False
    n = n.split()[0]     # drop anything after the first whitespace
    n = n.split(",")[0]  # drop anything after the first comma
    n = n.split(";")[0]  # drop anything after the first semicolon
    # The trailing dot is removed only AFTER truncation; doing it first (on
    # the whole string) left "9.1." from "9.1. something" with an empty last
    # part and wrongly rejected it, and crashed on the input ".".
    if n.endswith("."):
        n = n[:-1]
    return all(isValidSectionNumberElement(part) for part in n.split("."))

# Record the raw section string of every erratum and count how many of them
# already look like a plain section number.  Entries whose section is "None"
# (around 40 out of 6090) are deliberately not filtered out here.
total = 0
ok = 0
for erratum in Errata.errata:
    section = erratum.section
    eid2sectionnumjson[erratum.errata_id] = section
    total = total + 1
    if isValidSectionNumber(section):
        ok = ok + 1
print("%d / %d" % (ok, total))

# Erratum id -> simplified top-level section number:
#   0..15  the leading numeric component of the section string
#   16     bucket for appendix sections and any alphabetic leading component
#          (this also catches "glob..." section strings)
# Errata with a larger top-level number or an unparseable section are left out.
eid2sectionnumsimplified = {}
ok, total = 0,0
print(found_set)
for eid in found_set:
    total += 1
    if eid not in eid2sectionnumjson:
        continue
    s = eid2sectionnumjson[eid]
    lowered = s.lower()
    is_appendix = "appendix" in lowered
    if not (is_appendix or lowered.startswith("glob") or isValidSectionNumber(s)):
        print(s)
        continue

    # Decide appendix sections before tokenising, so strings such as
    # ", Appendix A" cannot fail on an empty leading token.
    if is_appendix:
        eid2sectionnumsimplified[eid] = 16
        ok += 1
        continue

    # Leading component of the section string.  The delimiters (whitespace,
    # ",", ";", ".") match the ones isValidSectionNumber accepts; without the
    # ";" split a section like "3; 4.1" reached int("3;") and raised.
    number_simplified = s.split()[0].split(",")[0].split(";")[0].split(".")[0]
    if number_simplified.isalpha():
        eid2sectionnumsimplified[eid] = 16
        ok += 1
        continue
    try:
        number_simplified = int(number_simplified)
    except ValueError:
        # Not a usable number after all (e.g. "glob-1"): report it like any
        # other rejected section string.
        print(s)
        continue
    if number_simplified <= 15:
        eid2sectionnumsimplified[eid] = number_simplified
        ok += 1
print("%d / %d" % (ok, total))
# Guard against an empty found_set, which would otherwise divide by zero.
print("%d / %d = %.3f" % (ok, total, ok/total if total else 0.0))

# Drop every erratum that has no simplified section number; per the original
# author this leaves 83% of the errata.
ok, total = 0, 0
for document in data:
    kept_locations = []
    for location in document.errata_locations:
        if location.erratum_id in eid2sectionnumsimplified:
            kept_locations.append(location)
    document.errata_locations = kept_locations


# (position bucket, erratum type) for every located erratum.
data_for_plot = []

ok, notok = 0, 0
for t in data:
    text_length = len(t.text)
    for errata_loc in t.errata_locations:
        # Only the two expected failures are handled: an empty document text
        # and an erratum id that is missing from the type table.
        try:
            relative_position = errata_loc.erratum_offset / text_length
            et = eid2type[errata_loc.erratum_id]
        except (ZeroDivisionError, KeyError):
            print(errata_loc.erratum_id)
            notok += 1
            continue
        # Map the relative position in the text to a bucket 10, 20, ..., 100.
        perc = min(100, int((relative_position + 0.1)*10)*10)
        data_for_plot.append((perc, et))
        ok += 1

print("Ok for %d / %d." % (ok, ok + notok))

data_for_plot = sorted(data_for_plot, key = lambda x:x[0])
e_percentiles = [x[0] for x in data_for_plot]
e_types = [x[1] for x in data_for_plot]

df = pd.DataFrame(zip(e_percentiles, e_types), columns = ["percentile","type"])
df.to_csv("data/errata-location-percentiles.csv", index = False)

# ******************************************** NEXT PLOT *******************************************

def get_chapter_for_erratum(text, erloc):
    """Return (chapter number, chapter title) for the erratum at *erloc*.

    The chapter number comes from the module-level eid2sectionnumsimplified
    table.  Number 16 always yields the title "Appendix".  Otherwise the text
    before the erratum offset is scanned backwards for the nearest line of
    the form "<number>. <title words>" whose number equals the chapter
    number; the words after the number are the title.  If no such line
    exists the chapter number is printed and the title is "FAILED".
    """
    chapter_num = eid2sectionnumsimplified[erloc.erratum_id]
    if chapter_num == 16:
        return chapter_num, "Appendix"

    preceding_lines = text[:erloc.erratum_offset].split("\n")
    for line in reversed(preceding_lines):
        tokens = line.split()
        if len(tokens) < 2:
            continue
        label = tokens[0]
        # Top-level headings only: "3." qualifies, "3.1." does not, because
        # the part before the final dot must be purely numeric.
        if not (label.endswith(".") and label[:-1].isnumeric()):
            continue
        if int(label[:-1]) == chapter_num:
            return chapter_num, " ".join(tokens[1:])

    print(chapter_num)
    return chapter_num, "FAILED"

# Erratum id -> (chapter number, chapter title, rfc id of the document).
eid2chapterinfo = {}
# Number of errata whose chapter title resolved to "Appendix".
appendix_count = 0
for document in data:
    for erloc in document.errata_locations:
        chapter = get_chapter_for_erratum(document.text, erloc)
        eid2chapterinfo[erloc.erratum_id] = (chapter[0], chapter[1], document.rfc_id)
        if chapter[1] == "Appendix":
            appendix_count = appendix_count + 1


# Find the most popular chapter titles.
title_counts = {}   # title -> number of errata carrying that title
title_doclist = {}  # title -> set of rfc ids in which the title occurs
for number, title, rid in eid2chapterinfo.values():
    # Skip entries whose chapter number is not a usable integer or is above
    # the appendix bucket (16).
    try:
        if int(number) > 16:
            continue
    except (TypeError, ValueError):
        continue
    title_counts[title] = title_counts.get(title, 0) + 1
    title_doclist.setdefault(title, set()).add(rid)

# (title, erratum count, document count), most frequent title first.
titles_sorted = sorted(
    ((k, title_counts[k], len(title_doclist[k])) for k in title_counts),
    key = lambda x:x[1],
    reverse = True,
)

# Titles attached to at least 10 errata.
top_titles = {x[0] for x in titles_sorted if x[1] >= 10}

# Titles occurring in at least 10 different documents.
titles_sorted_bydocfreq = sorted(titles_sorted, key = lambda x:x[2], reverse = True)
top_titles_doc = {x[0] for x in titles_sorted_bydocfreq if x[2] >= 10}

# (chapter title, erratum type, erratum id) for every erratum that falls
# under one of the popular chapter titles.
data_for_plot = []

ok, notok = 0, 0
for t in data:
    for errata_loc in t.errata_locations:
        # Only the lookups and the int conversion are expected to fail, so
        # only those are guarded, and only for the matching exception types.
        try:
            chapter_info = eid2chapterinfo[errata_loc.erratum_id]
            et = eid2type[errata_loc.erratum_id]
            chapter_num = int(chapter_info[0])
        except (KeyError, TypeError, ValueError):
            notok += 1
            continue

        chapter_title = chapter_info[1]
        # Fold every "... References" heading into a single "References"
        # title before it is checked against the popular-title sets.
        if "References" in chapter_title:
            chapter_title = "References"

        if (chapter_num > 16
                or chapter_title == "FAILED"
                or chapter_title not in top_titles
                or chapter_title not in top_titles_doc):
            notok += 1
            continue
        data_for_plot.append((chapter_title, et, errata_loc.erratum_id))
        ok += 1

print("Ok for %d / %d." % (ok, ok + notok))

data_for_plot = sorted(data_for_plot, key = lambda x:x[1])
e_chaptertitles = [x[0] for x in data_for_plot]
e_types = [x[1] for x in data_for_plot]
e_ids = [x[2] for x in data_for_plot]

df = pd.DataFrame(zip(e_chaptertitles, e_types, e_ids), columns = ["titles","Type","ErratumID"])
# Shuffle the rows; the fixed seed keeps the output file reproducible.
df = df.sample(frac=1, random_state=69)
df.to_csv("data/errata-location-sections.csv", index = False)


