#!/usr/bin/env python

import argparse
import json
import os
import re
import sys

# Name of the per-directory JSON statistics file that the script looks for.
FILE_DATA_SET_STATS = "data_set_sizes.json"

# Command-line interface. The parser lives at module level so that
# parse_args() can use it both for parsing and for reporting errors.
parser = argparse.ArgumentParser(
    description="Recursively collect the '%s' files below a directory and "
                "write the number of problems and samples per directory to "
                "a text file." % FILE_DATA_SET_STATS)
parser.add_argument("directory", type=str, action="store",
                    help="Directory in which to process recursively")
# May be given multiple times; every occurrence is appended to the list.
parser.add_argument("-r", "--regex", type=str, action="append", default=[],
                    help="Regex to filter which files to process")
parser.add_argument("--suffix", type=str, action="store", default=None,
                    help="Only data with the given suffix (or all if suffix is not "
                         "given) are kept.")
parser.add_argument("-o", "--output", action="store", type=str, default="data_set_sizes.txt",
                    help="File to which the summary is written")


def natural_sort(l):
    """Return the strings of ``l`` sorted in natural (human) order.

    Runs of ASCII digits are compared by their numeric value and all other
    text is compared case-insensitively, so ``"u2"`` sorts before ``"u10"``.

    :param l: iterable of strings to sort
    :return: a new sorted list; the input is not modified
    """
    def alphanum_key(key):
        # Splitting on a capturing group puts the digit runs at the odd
        # indices and the surrounding text (possibly empty) at the even ones.
        tokens = re.split(r'([0-9]+)', key)
        # Convert by position instead of str.isdigit(): isdigit() is also
        # true for characters such as "²" that int() rejects or that the
        # pattern never isolates, which would break the str/int alternation
        # the comparison of keys relies on.
        return [int(token) if index % 2 else token.lower()
                for index, token in enumerate(tokens)]

    return sorted(l, key=alphanum_key)


def parse_args(argv):
    """Parse and validate the command-line arguments.

    :param argv: list of argument strings (without the program name)
    :return: the parsed options; ``options.regex`` holds compiled patterns
    :raises SystemExit: via ``parser.error`` if the directory is not an
        existing directory or one of the regexes does not compile
    """
    options = parser.parse_args(argv)
    # Explicit check instead of ``assert``: assertions are stripped when
    # Python runs with -O. A directory is required because the path is later
    # passed to os.listdir.
    if not os.path.isdir(options.directory):
        parser.error("not a directory: %s" % options.directory)

    compiled = []
    for pattern in options.regex:
        try:
            compiled.append(re.compile(pattern))
        except re.error as err:
            parser.error("invalid regex %r: %s" % (pattern, err))
    options.regex = compiled

    return options


def _find_stats_files(directory, regexes):
    """Return the paths of all statistics files below ``directory``.

    A file is kept if it is named FILE_DATA_SET_STATS and every pattern in
    ``regexes`` matches its path (``match`` anchors at the start of the path).
    """
    found = []
    pending = [directory]
    while pending:
        current = pending.pop()
        for entry in os.listdir(current):
            path_entry = os.path.join(current, entry)
            if os.path.isdir(path_entry):
                pending.append(path_entry)
            elif entry == FILE_DATA_SET_STATS:
                if all(regex.match(path_entry) for regex in regexes):
                    found.append(path_entry)
    return found


def _summarize_stats_file(universe_name, path_stats, suffix):
    """Return the report entry for one statistics file.

    The counts of all data files whose name ends with ``suffix`` (all data
    files if ``suffix`` is None) are summed up.
    """
    with open(path_stats, "r") as f:
        all_stats = json.load(f)
    nb_problems = 0
    nb_samples = 0
    for data_file, stats in all_stats.items():
        if suffix is not None and not data_file.endswith(suffix):
            continue
        nb_samples += stats["#samples"]
        nb_problems += stats["#problems"]
    return "%s\n\tProblems: %i\tTotal samples: %i\n" % (
        universe_name,
        nb_problems,
        nb_samples)


def run(argv):
    """Collect the data set statistics and write the summary file.

    Searches ``options.directory`` recursively for statistics files, sums the
    number of problems and samples per file and writes one entry per
    directory, in natural sort order, to ``options.output``.

    :param argv: list of command-line argument strings (without the program
        name)
    """
    options = parse_args(argv)
    paths = _find_stats_files(options.directory, options.regex)

    # Entries are keyed by the name of the directory containing the file; if
    # two directories share a name, the one found later replaces the earlier.
    files = {os.path.basename(os.path.dirname(path_stats)): path_stats
             for path_stats in paths}

    # Build the whole report before opening the output, so nothing is
    # written if reading one of the statistics files fails.
    sizes = "".join(
        _summarize_stats_file(universe_name, files[universe_name],
                              options.suffix)
        for universe_name in natural_sort(files))

    with open(options.output, "w") as f:
        f.write(sizes)
# Script entry point: pass the command-line arguments (without the program
# name) to run() when the file is executed directly.
if __name__ == "__main__":
    run(sys.argv[1:])
