import csv
import os
import argparse
import re

from pathlib import Path

# Command-line interface: one required positional argument (the path of the
# requests log to clean) and one optional boolean switch.
parser = argparse.ArgumentParser(
    prog='cleanrequests',
    description='Clean http requests logs for statistics',
    epilog='Run using a file containing requests logs',
)
parser.add_argument('logfilepath')
# push payloads to open project
# NOTE(review): -p is parsed but not read anywhere in the visible script.
parser.add_argument('-p', action='store_true')

args = parser.parse_args()
# Bind the path to a module-level name first; the rest of the script uses it.
logfilepath = args.logfilepath
print('logfilepath set to ' + logfilepath)
# Folder (relative to the current working directory) that receives the
# cleaned CSV file.
outputfolderpath = 'cleaned/'
# exist_ok=True makes the call a no-op when the folder already exists, so no
# separate existence check (and no check-then-create race) is needed.
os.makedirs(outputfolderpath, exist_ok=True)

# All patterns are raw strings so the regex escapes (\/, \s, \d, \%, \?) reach
# the re module unchanged instead of being treated as (invalid) string escapes.

# Captures the first path segment after /Conversions/ (the input type).
inputregex = re.compile(r'.*\/ege-webservice\/?\/Conversions\/([^\/]+)\/')
# Captures the path segment directly before "/?properties" or
# "/conversion?properties" (the output type).
outputregex = re.compile(r'.*\/ege-webservice\/?\/Conversions\/[^\s]*\/([^\/]+)\/(?:conversion)?\?properties')
# Fallback without a query string: captures the last path segment before the
# whitespace that ends the URL (an optional trailing slash is allowed).
secondoutputregex = re.compile(r'.*\/ege-webservice\/?\/Conversions\/[^\s]*\/([^\/]+)\/?\s')
# Fallback for URLs that end in "conversion": captures the segment before it.
thirdoutputregex = re.compile(r'.*\/ege-webservice\/?\/Conversions\/[^\s]*\/([^\/]+)\/?conversion\s')
# Captures the leading alphanumeric token that precedes the first '%' of a
# percent-encoded type string, e.g. "docx" from "docx%3Aapplication%3A...".
cleaningregex = re.compile(r'(^[A-Za-z\d]+)\%')
# Ordered (old, new) substitutions that turn a cleaned format token into the
# label written to the *_matched columns. The order matters: "placeholder" is
# a temporary marker that lets two spellings (docx/doc, csv/tsv, xlsx/xls)
# collapse into one label without the shorter spelling matching inside the
# longer one.
_FORMAT_LABEL_STEPS = (
    ("TEI", "TEI P5"),
    ("P4", "TEI P4"),
    ("docx", "placeholder"),
    ("doc", "placeholder"),
    ("placeholder", "doc(x)"),
    ("Tite", "TEI Tite"),
    ("Simple", "Simple TEI"),
    ("csv", "placeholder"),
    ("tsv", "placeholder"),
    ("placeholder", "csv/tsv"),
    ("xlsx", "placeholder"),
    ("xls", "placeholder"),
    ("placeholder", "xls(x)"),
)

# Highest column index read below is 15, so a usable row needs 16 fields.
_MIN_FIELDS = 16


def _first_match(pattern, text):
    """Return the first capture of *pattern* in *text*, or '' when none."""
    found = pattern.findall(text)
    return found[0] if found else ''


def _format_label(token):
    """Apply the ordered label substitutions to a cleaned format token."""
    for old, new in _FORMAT_LABEL_STEPS:
        token = token.replace(old, new)
    return token


def _output_type(request):
    """Extract the output type segment from a conversion request string.

    Tries the "?properties" pattern first, then the plain-URL pattern; when
    the captured segment is the literal "conversion" path element, retries
    with the pattern that captures the segment before it.
    """
    found = outputregex.findall(request)
    if not found:
        found = secondoutputregex.findall(request)
    if found and found[0] == 'conversion':
        found = thirdoutputregex.findall(request)
    return found[0] if found else ''


# Read the log as space-delimited CSV and write one output row for every
# successful (HTTP 200) ege-webservice conversion request.
#
# request e.g. POST /ege-webservice/Conversions/ODDC%3Atext%3Axml/oddjson%3Aapplication%3Ajson/conversion?properties=... HTTP/2.0
# input_type e.g. docx%3Aapplication%3Avnd.openxmlformats-officedocument.wordprocessingml.document
# input e.g. docx
# input_matched e.g. doc(x)
skipped_rows = 0
# newline='' is what the csv module asks for on both reader and writer files;
# on the writer side it prevents doubled line endings on Windows.
with open(logfilepath, 'r', newline='') as in_file:
    csvrows = csv.reader(in_file, delimiter=' ')
    out_path = Path(outputfolderpath) / (Path(logfilepath).stem + '.csv')
    with open(out_path, 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        # add headers
        writer.writerow((
            'url', 'date', 'request', 'http_code', 'referrer',
            'input_type', 'output_type', 'input', 'output',
            'input_matched', 'output_matched',
        ))
        for row in csvrows:
            # Blank or truncated lines lack the columns indexed below; skip
            # them instead of aborting the whole run with an IndexError.
            if len(row) < _MIN_FIELDS:
                skipped_rows += 1
                continue

            # Column positions pair up with the header names written above.
            url, date, request, http_code, referrer = (
                row[6], row[10], row[12], row[13], row[15]
            )
            if http_code != '200':
                continue
            if 'ege-webservice' not in request or 'Conversions' not in request:
                continue

            input_type = _first_match(inputregex, request)
            output_type = _output_type(request)
            input_token = _first_match(cleaningregex, input_type)
            output_token = _first_match(cleaningregex, output_type)

            writer.writerow((
                url, date, request, http_code, referrer,
                input_type, output_type,
                input_token, output_token,
                _format_label(input_token), _format_label(output_token),
            ))

if skipped_rows:
    print('skipped ' + str(skipped_rows) + ' line(s) with fewer than '
          + str(_MIN_FIELDS) + ' fields')

