import numpy as np
import pandas as pd
import os
import re
import sys

import matplotlib.pyplot as plt

import statsmodels.api as sm

from statsmodels.sandbox.regression.predstd import wls_prediction_std

import datetime

import math

from matplotlib.patches import Rectangle

def normalize(name):
    """Turn a column title into a token that contains no spaces.

    The first " (" and everything after it up to the end of the line is
    dropped, then every remaining run of spaces is collapsed into a single
    hyphen.
    """
    without_suffix = re.sub(r' \(.*', '', name)
    return re.sub(r' +', '-', without_suffix)

# Names given, in order, to the columns of the header-less, tab-separated
# input files when they are loaded with pandas.read_csv.  The first two
# entries are the timestamp columns; every entry from the third one onward
# is a metric that is regressed against the number of days passed and
# plotted, with the entry itself used as the y-axis label and (after
# normalize()) as the name of the output PDF.
column_names = [
    'Unix epoch time',
    'ISO date',
    'Number of statements',
    'Number of lines',
    'Number of functions',
    'Statement density',
    'Mean line length (characters)',
    'Average median line length (characters)',
    'Mean file length (lines)',
    'Mean file functionality (statements)',
    'Mean function length (lines)',
    'register keyword density',
    'restrict keyword density',
    'enum keyword density',
    'void keyword density',
    'signed keyword density',
    'unsigned keyword density',
    'const keyword density',
    'volatile keyword density',
    'inline keyword density',
    'noalias keyword density',
    'goto keyword density',
    'C preprocessor include statement density',
    'C preprocessor non-include statement density',
    'C preprocessor conditional statement density',
    'Internally visible declaration density',
    'Comment density',
    'Comment character density',
    'Mean comment size',
    'Kludge word density',
    'Mean statement nesting',
    'Average median statement nesting',
    'Mean identifier length',
    'Average median identifier length',
    'Mean indentation spaces',
    'Average median indentation spaces',
    'Formatting inconsistency',
    'Indentation spaces standard deviation'
]

# Use the ggplot look for every figure produced by this script.
plt.style.use('ggplot')

# Command line: <input directory with metric files> <output directory>.
# Fail with a usage message instead of a bare IndexError traceback.
if len(sys.argv) < 3:
    sys.exit('usage: {0} <input-dir> <output-dir>'.format(sys.argv[0]))

path = sys.argv[1]
output = sys.argv[2]

# os.listdir() yields entries in arbitrary order; sort them so that repeated
# runs over the same directory load the files in the same order.
file_list = sorted(os.listdir(path))

# Load every regular file in the input directory as a header-less,
# tab-separated table whose columns are named after column_names.
dfs = []
for item in file_list:
    file_path = os.path.join(path, item)
    # Sub-directories and other non-regular entries cannot be parsed by
    # read_csv, so leave them out.
    if not os.path.isfile(file_path):
        continue
    dfs.append(pd.read_csv(file_path, header=None,
                           parse_dates=['ISO date'],
                           names=column_names, sep='\t'))

# pd.concat() raises an opaque ValueError on an empty list; report the
# actual problem instead.
if not dfs:
    sys.exit('no input files found in ' + path)

df = pd.concat(dfs, axis=0)

# DataFrame.sort() has been removed from pandas; sort_values() is its
# replacement.  A stable sort keeps rows that share a timestamp in the
# order in which they were loaded.
df = df.sort_values('Unix epoch time', kind='mergesort')

# Number the rows 0..n-1 in chronological order and use that number as the
# index.  (np.arange replaces the Python-2-only xrange.)
indx = np.arange(len(df.index))

df['Revision'] = indx
df.set_index(['Revision'], inplace=True)
min_date = df['ISO date'].min()
max_date = df['ISO date'].max()

# Whole days elapsed since the earliest row; this is the regressor used for
# every metric.
df['Days passed'] = ((df['ISO date'] - min_date)
                     / np.timedelta64(1, 'D')).astype(int)

total_days = df['Days passed'].max()

# Every metric is regressed on the same design matrix (an intercept plus the
# days elapsed since the first row), so build it once instead of once per
# metric.
days_passed = df['Days passed']
x = sm.add_constant(days_passed)

# The context manager guarantees that the report is flushed and closed even
# when a fit or a plot raises part-way through the loop.
with open(os.path.join(output, 'results.txt'), 'w') as results_file:
    # The first two columns are timestamps; the remaining ones are metrics.
    for column in column_names[2:]:
        y = df[column]

        model = sm.OLS(y, x)
        results = model.fit()

        # Append the full statsmodels summary for this metric to the report.
        results_file.write(
            "Examining " + column + " ~ intercept + Days passed\n")
        results_file.write("=" * 80 + "\n")
        results_file.write(str(results.summary()))
        results_file.write("\n\n")

        # Lower and upper prediction bounds; the standard error itself is
        # not plotted.
        _, iv_l, iv_u = wls_prediction_std(results)

        fig, ax = plt.subplots(figsize=(8, 6))

        ax.plot(days_passed, y, 'bo', label="Data")
        ax.plot(days_passed, results.fittedvalues, 'r--.',
                label="Predicted")
        ax.plot(days_passed, iv_u, 'r--.', markersize=5, linewidth=1)
        ax.plot(days_passed, iv_l, 'r--.', markersize=5)

        # Relabel the numeric "days passed" ticks as calendar months.  The
        # automatically chosen ticks do not, in general, start at day 0 or
        # end at the last day, so each label is derived from its own tick
        # position rather than from an assumed even split of the date range.
        xticks = ax.get_xticks().tolist()
        # Pin the tick positions so the fixed labels stay attached to them;
        # set_xticks() may widen the view to include every tick, so restore
        # the original limits afterwards.
        xlim = ax.get_xlim()
        ax.set_xticks(xticks)
        ax.set_xlim(xlim)
        date_xticks = [
            (min_date + datetime.timedelta(days=xtick)).strftime('%Y-%m')
            for xtick in xticks
        ]
        ax.set_xticklabels(date_xticks)
        ax.set_xlabel('Date')
        ax.set_ylabel(column)

        # Add an invisible handle so that the R^2 value appears as an extra
        # legend entry.
        handles, labels = ax.get_legend_handles_labels()
        extra = Rectangle((0, 0), 1, 1, fc="w", fill=False,
                          edgecolor='none', linewidth=0)
        handles.append(extra)
        labels.append('$R^2={0:.3f}$'.format(results.rsquared))
        ax.legend(handles, labels, loc="best")

        # One PDF per metric, named after the space-free form of the column.
        fig.savefig(os.path.join(output, normalize(column) + '.pdf'))

        # Release the figure so memory does not grow with the number of
        # metrics.
        plt.close(fig)
