'''
@author: Andrew
'''
import subprocess
import os
import sys
import time
import re
from nltk.stem import PorterStemmer
import string

def cloneProject(url, dir):
    """Clone the git repo at url into the local folder dir for analysis.

    url: location of the repository to clone
    dir: local path the clone is written to
    Returns True when git exits with status 0 and False otherwise
    (including when the git executable cannot be started).
    """
    # An argument list (no shell) keeps spaces and shell metacharacters in
    # url/dir from being interpreted; "--" stops git from reading either
    # value as a command-line option.
    cmd = ["git", "clone", "--", url, dir]
    print("executing command " + " ".join(cmd))
    try:
        retCode = subprocess.call(cmd)
    except OSError:
        # git is not installed / not on PATH: report it as a failed clone
        retCode = None
    if retCode == 0:
        print("clone successful")
        return True
    print("clone failed")
    return False
    
def getRevisionNumbers(dir):
    """Return the list of commit hashes of the git repo at local path dir.

    The hashes are returned in the order "git log" prints them.
    Side effect: the process working directory is changed to dir.
    Raises subprocess.CalledProcessError when git exits with a non-zero
    status.
    """
    # The chdir is kept on purpose: other code in this file runs git
    # relative to the current working directory.
    os.chdir(dir)
    cmd = ["git", "log", "--format=%H"]
    print("executing command " + " ".join(cmd))
    # universal_newlines=True makes check_output return text rather than
    # bytes on Python 3 as well; no shell is needed for a fixed command.
    log = subprocess.check_output(cmd, universal_newlines=True)
    candidates = (line.strip() for line in log.splitlines())
    # skip blank lines so only real hashes are returned
    return [rev for rev in candidates if rev]
    
def getDiff(commitID, dir):
    """Return the output of "git show --format=%b" for commitID in the repo at dir.

    commitID: revision whose change (against its parent) is shown
    dir: local path of the git repo; git is run with this as its working
         directory, so the result no longer depends on the process cwd
    Returns the raw output of git exactly as subprocess.check_output
    delivers it.
    Raises subprocess.CalledProcessError when git exits with a non-zero
    status.
    """
    # An argument list avoids passing commitID through a shell.
    cmd = ["git", "show", "--format=%b", commitID]
    print("executing command " + " ".join(cmd))
    return subprocess.check_output(cmd, cwd=dir)
    
def _scanLogs(revNumbers, log_keywords, remove_patterns, outputFile):
    """Check the subject of every commit in revNumbers and write the interesting ones to outputFile.

    Returns a tuple (logsChecked, interestingLogs, highlyInterestingLogs).
    """
    stemmer = PorterStemmer()
    punctuation = set(string.punctuation)
    logsChecked = 0
    interestingLogs = 0
    highlyInterestingLogs = 0
    for rev in revNumbers:
        # git check log of a revision: git show -s --format=%s revId
        cmd = ["git", "show", "-s", "--format=%s", rev]
        print("executing command " + " ".join(cmd))
        logMessage = subprocess.check_output(cmd)
        logsChecked += 1
        # The punctuation-free message is both the basis for matching and
        # the text that ends up in the report.
        logMessage = ''.join(ch for ch in logMessage if ch not in punctuation)
        # "replace" keeps a commit message that is not valid UTF-8 from
        # aborting the whole analysis with a UnicodeDecodeError.
        words = logMessage.decode("utf-8", "replace").lower().split()
        normalized = " ".join(stemmer.stem(word) for word in words)
        # the "leak" check deliberately runs before remove_patterns is applied
        if "leak" in normalized:
            highlyInterestingLogs += 1
        # The patterns do not depend on the keyword, so they are removed
        # once per message instead of once per keyword.
        for pattern in remove_patterns:
            normalized = re.sub(pattern, '', normalized)
        if any(keyword in normalized for keyword in log_keywords):
            print("**************found an interesting log*****************")
            print(normalized)
            interestingLogs += 1
            outputFile.write("############################################interesting logs###########################################\n")
            outputFile.write("revision " + rev + ": " + logMessage + "\n\n")
    return logsChecked, interestingLogs, highlyInterestingLogs


def _scanDiffs(revNumbers, dir, diff_keywords, skipRevs, diffSizeLimit):
    """Collect a report entry for every commit whose added lines mention one of diff_keywords.

    Returns a tuple (diffsChecked, highPriorityDiff, lowPriorityDiff) where
    the two lists hold the formatted report entries.
    """
    diffsChecked = 0
    highPriorityDiff = []
    lowPriorityDiff = []
    # The last entry of revNumbers is never diffed (same as the original
    # loop bound) - presumably because the oldest commit has no parent to
    # compare against; confirm before changing.
    for rev in revNumbers[:-1]:
        if rev in skipRevs:
            continue
        diff = getDiff(rev, dir)
        diffsChecked += 1
        print("##########diff of " + rev + "#########\n")
        # matching is done on the lower-cased diff, so diff_keywords are
        # expected to be lower case
        diffLines = diff.lower().splitlines()
        # we are only interested in lines added since fixing resource leak
        # mostly requires adding a resource releasing method call;
        # any() records a line once even when it matches several keywords.
        # NOTE(review): "+++ ..." file header lines also start with "+" and
        # are therefore matched as well.
        interestingDiffLines = [
            line for line in diffLines
            if line.startswith("+") and any(keyword in line for keyword in diff_keywords)
        ]
        for line in interestingDiffLines:
            print(line)
        if not interestingDiffLines:
            continue
        parts = ["############################################interesting diffs##########################################\n"]
        parts.extend(line + "\n" for line in interestingDiffLines)
        parts.append("original diff of commit " + rev + "\n\n")
        # the unmodified (not lower-cased) diff goes into the report
        parts.append(diff)
        outputString = "".join(parts)
        # huge diffs get low priority and are written after the small ones
        if len(diffLines) < diffSizeLimit:
            highPriorityDiff.append(outputString)
        else:
            lowPriorityDiff.append(outputString)
    return diffsChecked, highPriorityDiff, lowPriorityDiff


def findLogsAndDiffs(dir, log_keywords, diff_keywords, remove_patterns, outputPath, skipRevs, diffSizeLimit):
    """Mine a local git repo for interesting commit logs and diffs and write a summary file.

    dir (string): path to the local repo; the process working directory is
        changed to it
    log_keywords (list): keywords searched in the lower-cased, stemmed,
        punctuation-free commit subjects
    diff_keywords (list): keywords searched in the lower-cased added lines
        of each diff
    remove_patterns (list): regular expressions whose matches are deleted
        from a commit subject before log_keywords are searched
    outputPath (string): path to the output file; it is opened after the
        working directory has been switched to the repo
    skipRevs (list): revisions whose diffs are not checked
    diffSizeLimit (int): diffs with at least this many lines get low
        priority and are written after the smaller ones

    Nothing is returned; the findings and statistics are written to
    outputPath and the statistics are printed as well.

    NOTE: written for Python 2, where the git output handled here is a
    byte string.
    """
    # Resolve the path before changing directory, so a relative dir still
    # names the same folder when it is handed to the helpers below.
    repoDir = os.path.abspath(dir)
    # switch the working directory to the git repo
    os.chdir(repoDir)
    # measure the analysis time
    startTime = time.time()
    revNumbers = getRevisionNumbers(repoDir)
    # "with" closes the output file even when one of the git calls raises
    with open(outputPath, 'w') as outputFile:
        numberOfLogsChecked, numberOfInterestingLogsFound, numberOfHighlyInterestingLogs = _scanLogs(
            revNumbers, log_keywords, remove_patterns, outputFile)
        numberOfDiffsChecked, highPriorityDiff, lowPriorityDiff = _scanDiffs(
            revNumbers, repoDir, diff_keywords, skipRevs, diffSizeLimit)
        # small diffs first, huge ones afterwards
        for output in highPriorityDiff:
            outputFile.write(output)
        for output in lowPriorityDiff:
            outputFile.write(output)
        elapsedTime = time.time() - startTime
        # every interesting diff is in exactly one of the two priority lists
        numberOfInterestingDiffsFound = len(highPriorityDiff) + len(lowPriorityDiff)
        statistics = [
            "----------------------analysis statistics-------------------------\n",
            "analysis time: " + str(elapsedTime) + "seconds\n",
            "checked " + str(numberOfLogsChecked) + " commit logs checked\n",
            "found " + str(numberOfInterestingLogsFound) + " interesting logs\n",
            "found " + str(numberOfHighlyInterestingLogs) + " highly interesting logs (containing leak keywords)\n",
            "checked " + str(numberOfDiffsChecked) + " diffs checked \n",
            "found " + str(numberOfInterestingDiffsFound) + " interesting diffs\n",
            "high priority: " + str(len(highPriorityDiff)) + "\n",
            "low priority: " + str(len(lowPriorityDiff)) + "\n",
        ]
        for line in statistics:
            outputFile.write(line)
    # the same statistics are echoed to stdout
    for line in statistics:
        print(line)
