#!/usr/bin/env python3

__copyright__ = "Copyright (C) 2021 Davide Rossi"
__license__ = "GPL-3.0-or-later"

import re
import csv
import sys
import datetime
import pytz
import click
import logging
from collections import namedtuple
from collections import defaultdict
from multiprocessing import Process, Queue, current_process, Manager

# Sentinel circulated on the work and result queues to signal "no more items".
DEATH_PILL = -1
# Maximum number of pending items for the work queue and for the result queue.
QUEUE_SIZE = 1000

# Patterns used by advanced_splitter() to break a full name into tokens:
# runs of non-word characters (the actual split points)
WORD_RE = re.compile(r'\W+')
# an upper-case letter followed by lower-case letters (a CamelCase hump)
CAMEL_RE = re.compile(r'([A-Z][a-z]+)')
# a run of upper-case letters
UPPER_RE = re.compile(r'([A-Z]+)')
# an underscore
UNDER_RE = re.compile(r'(_)')
# a trailing two-digit signed suffix such as "+01"; not referenced by the code in this file section
STRIP_TZ_RE = re.compile(r'[\+-]\d\d$')
# leading whitespace
LEADING_BLANKS_RE = re.compile(r'^\s*')
# strptime() layout of the date/time part of a commit timestamp
DATETIME_FORMAT = r'%Y-%m-%d %H:%M:%S'

# One input row: a commit with its author data.
CommitRecord = namedtuple('CommitRecord', 'commit_id datetime timezone fullname email')
# One row of the places file.
PlaceRecord = namedtuple('PlaceRecord', 'utczone offset country region flags zone_name population zone')
# One row of the names file.
NameRecord = namedtuple('NameRecord', 'name kind place frequency')
# The score obtained by a candidate place, with its optional explanation.
ScoreRecord = namedtuple('ScoreRecord', 'place score explain')
# Value stored in places_tz_cache: the candidate places and their total population.
PlaceCacheRecord = namedtuple("PlaceCacheRecord", "places people")

def advanced_splitter(seq):
    """Break *seq* into words separated by blanks, underscores or CamelNotation.

    Underscores are replaced by blanks, then a blank is inserted in front of
    every upper-case run and every capitalised hump; after dropping the
    leading whitespace the text is split on runs of non-word characters.
    The returned list may contain empty strings (for instance for an empty
    input or when the text ends with a separator).
    """
    spaced = UNDER_RE.sub(r' ', seq)
    spaced = UPPER_RE.sub(r' \1', spaced)
    spaced = CAMEL_RE.sub(r' \1', spaced)
    trimmed = LEADING_BLANKS_RE.sub(r'', spaced)
    return WORD_RE.split(trimmed)
 
def read_names(filename, skip_header=True):
    """Read a tab-separated names file.

    Returns a pair ``(names, names_by_name)``:

    * ``names`` is the list of NameRecord tuples, with the name lower-cased;
    * ``names_by_name`` maps each lower-cased name to a dictionary
      place -> list of the NameRecord tuples for that name in that place.

    The first row is discarded when *skip_header* is true.
    """
    with open(filename, "r") as names_file:
        rows = csv.reader(names_file, delimiter='\t')
        if skip_header:
            next(rows)
        names = []
        names_by_name = defaultdict(dict)
        for row in rows:
            raw = NameRecord._make(row)
            record = NameRecord(raw.name.lower(), raw.kind, raw.place, raw.frequency)
            names.append(record)
            # group the records first by name, then by place
            names_by_name[record.name].setdefault(record.place, []).append(record)
        return names, names_by_name

def read_places(filename, skip_header=True):
    """Read a tab-separated places file and index the places by UTC offset.

    Only rows whose ``flags`` field is empty or ``'STD'`` are kept.

    Returns a pair ``(places, places_by_tz)``:

    * ``places`` is the list of the PlaceRecord tuples that were kept;
    * ``places_by_tz`` maps a UTC offset, expressed in minutes and stored as
      a string (e.g. ``'60'``), to the list of places that were at that
      offset at any of the sampled instants.

    The first row is discarded when *skip_header* is true.
    """
    with open(filename, "r") as file:
        csv_reader = csv.reader(file, delimiter='\t')
        if skip_header:
            next(csv_reader)
        all_places = [PlaceRecord._make(item) for item in csv_reader]

    # Build a real list: a lazy filter() object would be consumed by the loop
    # below and handed back to the caller already exhausted.
    places = [place for place in all_places if place.flags in ('', 'STD')]

    # Map tz offset -> places that have been at that offset, sampling the first
    # day of January, April, July and October of the years 2000 through 2019.
    places_by_tz = defaultdict(list)
    for place in places:
        local_timezone = pytz.timezone(place.zone_name)
        for year in range(2000, 2020):
            for month in range(1, 12, 3): #sample each three months
                local_time = datetime.datetime(year, month, 1)
                datetime_offset = str(int(local_timezone.utcoffset(local_time, is_dst=True).total_seconds()/60))
                places_at_offset = places_by_tz[datetime_offset]
                if place not in places_at_offset:
                    places_at_offset.append(place)

    return places, places_by_tz

places_tz_cache = {} # module-level memo: "<date> 12:00:00_<offset>" -> PlaceCacheRecord
def find_places_at_timezone_approximate(all_places, places_by_tz, datetime_string, offset):
    """Return the places at the given offset on the given day, and their total population.

    Only the date (first ten characters) of *datetime_string* is used: the
    time of day is replaced by 12:00:00. Results are memoised in
    ``places_tz_cache`` per (date, offset) pair, so the precise lookup runs
    at most once for each of them.
    """
    noon = datetime_string[0:10]+" 12:00:00" #keep only the date and use a fake time
    cache_key = f"{noon}_{offset}"
    cached = places_tz_cache.get(cache_key)
    if cached is not None:
        return cached.places, cached.people

    places, people = find_places_at_timezone_precise(all_places, places_by_tz, noon+"+00", offset)
    places_tz_cache[cache_key] = PlaceCacheRecord(places, people)
    return places, people

def find_places_at_timezone_precise(all_places, places_by_tz, datetime_string, offset):
    """Return the places at the given offset at the given date and time, and their total population.

    *offset* is the UTC offset in minutes; it is used as-is as the key into
    *places_by_tz* and converted with ``int()`` for the comparison.
    Only the first 19 characters of *datetime_string* (date and time) are
    parsed. A candidate is kept when its UTC offset at that local time equals
    *offset*; an offset of 0 keeps every candidate registered under it.
    *all_places* is not used.

    Returns a pair ``(places, people)`` where ``people`` is the sum of the
    populations of the returned places.
    """
    datetime_string = datetime_string[0:19] #keep only date and time
    candidate_places = places_by_tz[offset]
    if not candidate_places:
        # nothing to check: skip the parsing below, exactly as an empty loop would
        return [], 0

    # Both values are the same for every candidate, so compute them once.
    local_time = datetime.datetime.strptime(datetime_string, DATETIME_FORMAT)
    wanted_offset = int(offset)

    places = []
    people = 0
    for place in candidate_places:
        local_timezone = pytz.timezone(place.zone_name)
        datetime_offset = int(local_timezone.utcoffset(local_time, is_dst=True).total_seconds()/60)
        if datetime_offset == wanted_offset or wanted_offset == 0:
            logging.debug(f'Compatible place: {place.country} ({place.zone_name}) at {datetime_string} has utc offset of {datetime_offset}')
            people += int(place.population)
            places.append(place)
    return places, people

# Lower-case name parts that mark an automated account; such names are never scored.
name_backlist = {'jenkins', 'bot'}
def calculate_score(names, names_by_name, fullname, email, place, people_in_candidate_places, include_explanation):
    """Calculate a numerical score representing the likelihood of the given name to represent an inhabitant of the given place.

    *fullname* is split into parts that are lower-cased and looked up in
    *names_by_name* (name -> place -> list of name records) for
    ``place.country``. Every matching record adds
    ``frequency * population / 100 / people_in_candidate_places`` to the score.

    The score is only returned when at least two parts of the name were found
    for the country and at least one matching record is of kind ``'surname'``;
    names made of a single part, or containing a blacklisted part, are never
    scored. *names* and *email* are not used.

    Returns ``(score, explain)`` where ``explain`` is a list of strings when
    *include_explanation* is true and ``None`` otherwise, or ``(0, 'filtered')``
    when the name does not qualify.
    """
    score = 0
    explain = [] if include_explanation else None
    surname_found = False
    items_found = 0
    # Lower-case before the blacklist check: the blacklist entries are
    # lower-case, so "Jenkins" must be compared as "jenkins".
    name_parts = [name_part.lower() for name_part in advanced_splitter(fullname)]
    if len(name_parts) > 1 and name_backlist.isdisjoint(name_parts):
        for name_part in name_parts:
            # .get() instead of indexing: names_by_name may be a defaultdict,
            # and indexing would add an empty entry for every unknown token.
            name_records = names_by_name.get(name_part, {}).get(place.country) #list of same names with different kinds [name, surname]
            if name_records is None:
                logging.debug(f'{name_part} not in {place.country}')
                continue
            items_found += 1
            for name_record in name_records:
                logging.debug(f'Scoring {name_part} in {place.country} - name_record: {name_record}')
                score += (float(name_record.frequency)*int(place.population)/100)/people_in_candidate_places
                if name_record.kind == 'surname':
                    surname_found = True
                if include_explanation:
                    explain.append(f'POP:{name_part}:{name_record.kind}:{name_record.place}:{place.population}:{people_in_candidate_places}:{name_record.frequency}')
    if items_found >= 2 and surname_found:
        return score, explain
    logging.debug(f'Filtering - score: {score} - items_found: {items_found} - surname_found: {surname_found}')
    return 0, 'filtered'

def calculate_score_orig(names, names_by_name, fullname, email, place, people_in_candidate_places, include_explanation):
    """Unfiltered variant of the scoring: likelihood that the given name belongs to an inhabitant of the given place.

    Every part of *fullname* (lower-cased) that is known for
    ``place.country`` contributes
    ``frequency * population / 100 / people_in_candidate_places`` for each of
    its name records. Returns ``(score, explain)``; ``explain`` is a list of
    strings when *include_explanation* is true, ``None`` otherwise.
    *names* and *email* are not used.
    """
    explain = [] if include_explanation else None
    score = 0
    for name_part in advanced_splitter(fullname):
        name_part = name_part.lower()
        records_by_place = names_by_name[name_part]
        if place.country not in records_by_place:
            logging.debug(f'{name_part} not in {place.country}')
            continue
        # same name, possibly with different kinds [name, surname]
        for name_record in records_by_place[place.country]:
            logging.debug(f'Scoring {name_part} in {place.country}')
            score += (float(name_record.frequency)*int(place.population)/100)/people_in_candidate_places
            if include_explanation:
                explain.append(f'POP:{name_part}:{name_record.kind}:{name_record.place}:{place.population}:{people_in_candidate_places}:{name_record.frequency}')
    return score, explain

def pretty_scores(scores):
    """Render a sequence of score records as a single bracketed string.

    Each record contributes ``country, score, explain`` (or
    ``country(region), score, explain`` when the place has a region); the
    items are separated by ``', '`` and wrapped in square brackets.
    An empty sequence is rendered as ``'[]'``.
    """
    rendered = []
    for score in scores:
        place = score.place
        label = place.country if place.region == '' else f'{place.country}({place.region})'
        rendered.append(f'{label}, {score.score}, {score.explain}')
    # join() always emits both brackets, also when there is nothing to show
    return '[' + ', '.join(rendered) + ']'

def assign_and_explain(scores_by_zone):
    """Pick the zone with the highest accumulated score and describe how it was chosen.

    *scores_by_zone* maps a zone to the list of score records of the places
    in that zone. Returns a triple ``(assigned_zone, accumulated_scores, msg)``:
    the winning zone (``''`` when there is none), the per-zone totals ordered
    from the highest to the lowest, and a readable explanation listing, for
    every zone, its total and its individual scores.
    """
    totals = {zone: sum(score.score for score in zone_scores) for zone, zone_scores in scores_by_zone.items()}
    ranking = sorted(totals.items(), key=lambda item: item[1], reverse=True)
    accumulated_scores = dict(ranking)
    assigned_zone = ranking[0][0] if ranking else ''
    # the explanation keeps the zones in their original order, not in ranking order
    fragments = [f'{zone}:{accumulated_scores[zone]}:{pretty_scores(scores_by_zone[zone])}' for zone in scores_by_zone]
    return assigned_zone, accumulated_scores, ', '.join(fragments)

def assign_zone(find_places_at_timezone, places, places_by_tz, names, names_by_name, commit, include_explanation, zone_assignment_cache):
    """Given a commit tuple returns an assigned zone (and, optionally, a readable explanation).

    *find_places_at_timezone* is the function used to find the candidate
    places for the commit date and timezone; every candidate is scored
    against the author name and the scores greater than zero are accumulated
    by zone. The zone with the highest total wins; ``''`` is returned when no
    place scored.

    When *zone_assignment_cache* is not ``None`` it is used as a mapping
    ``"fullname/email" -> (zone, "cached")``: an author that has already been
    assigned a zone gets the same zone again, regardless of the date and
    timezone of the current commit.

    Returns ``(assigned_zone, explanation)``. ``explanation`` is ``None``
    unless *include_explanation* is true; for a cache hit it is the string
    ``"cached"``.
    """
    zone_cache_key = None
    if zone_assignment_cache is not None:
        # The same key must be used to read and to write the cache; it does
        # not include the timezone because an assignment is reused for every
        # commit of the same author.
        zone_cache_key = commit.fullname+"/"+commit.email
        cached = zone_assignment_cache.get(zone_cache_key)
        if cached is not None:
            cached_zone, cached_explanation = cached
            # no explanation column unless one was asked for
            return cached_zone, (cached_explanation if include_explanation else None)

    candidate_places, people_in_candidate_places = find_places_at_timezone(places, places_by_tz, commit.datetime, commit.timezone)
    scores = []
    for candidate_place in candidate_places:
        score, explain = calculate_score(names, names_by_name, commit.fullname, commit.email, candidate_place, people_in_candidate_places, include_explanation)
        scores.append(ScoreRecord(candidate_place, score, explain))
    #put all scores in a dict zone -> list of scores > 0 for places in that zone if you need an explanation,
    #or simply a dict zone -> score otherwise
    scores_by_zone = defaultdict(list) if include_explanation else defaultdict(int)
    for score in scores:
        if score.score > 0:
            if include_explanation:
                scores_by_zone[score.place.zone].append(score)
            else:
                scores_by_zone[score.place.zone] += score.score
    if include_explanation:
        assigned_zone, _, explanation = assign_and_explain(scores_by_zone)
    else:
        assigned_zone = max(scores_by_zone, key=scores_by_zone.get) if scores_by_zone else ''
        explanation = None
    if zone_cache_key is not None:
        zone_assignment_cache[zone_cache_key] = (assigned_zone, "cached")
    return assigned_zone, explanation

def write_line(commit, assigned_zone, explanation):
    """Print one tab-separated output row for *commit* on standard output.

    The row holds the commit id, datetime, timezone, author name, author
    e-mail and the assigned zone; *explanation* is appended as a further
    field unless it is ``None``.
    """
    columns = (commit.commit_id, commit.datetime, commit.timezone, commit.fullname, commit.email, assigned_zone)
    line = '\t'.join(f'{column}' for column in columns)
    if explanation is not None:
        line = line + '\t' + explanation
    print(line)

def worker(work_queue, res_queue, find_places_at_timezone, places, places_by_tz, names, names_by_name, zone_assignment_cache):
    """Body of a parallel worker process.

    Takes ``(commit, include_explanation)`` items from *work_queue*, makes
    the zone assignment and puts ``(commit, assigned_zone, explanation)`` on
    *res_queue*. Stops when the death pill shows up, or at the first
    exception (which is logged and printed); in both cases a death pill is
    finally put on *res_queue*.
    """
    try:
        while True:
            work_item = work_queue.get()
            if work_item == DEATH_PILL:
                # put the pill back so that the next worker finds it too
                work_queue.put(DEATH_PILL)
                break
            commit, include_explanation = work_item
            assigned_zone, explanation = assign_zone(find_places_at_timezone, places, places_by_tz, names, names_by_name, commit, include_explanation, zone_assignment_cache)
            res_queue.put((commit, assigned_zone, explanation))
    except Exception as e:
        logging.warning(f'Worker failure: {e}')
        import traceback
        traceback.print_exc()
    # tell the consumer of the result queue that this worker is done
    res_queue.put(DEATH_PILL)

def writer(res_queue, n_processes):
    """Body of the writer process: the single place where output rows are printed, to avoid interleaving issues.

    Prints every ``(commit, assigned_zone, explanation)`` item read from
    *res_queue* and returns once *n_processes* death pills have been received.
    """
    still_running = n_processes
    while still_running > 0:
        item = res_queue.get()
        if item == DEATH_PILL:
            still_running -= 1
            continue
        commit, assigned_zone, explanation = item
        write_line(commit, assigned_zone, explanation)

@click.command(
    help = """
    A script to make zone assignments for a set of commits for which we know datetime, name and e-mail of the author.
    Uses a heuristic based on the population and the frequency of forenames and surenames of various countries/territories
    to determine a likehood score.
    Parses an input composed by tab-separated lines with fields: commit_id, commit_date, commit_tz, author_name, author_email.
    Produces an output composed by tab-separated lines with fields: commit_id, commit_date, commit_tz, author_name, assigned_zone[, explanation]
    Needs a tab-separated zones files with fields: timezone(UTC[+-]HH:MM), offset(minutes), country/territory, region, timezone_flags(STD|DST_NS|DTS_SW), IANA_timezone, population, zone;
    and a tab-separated names file with fields: name, kind(forename|surname), country, frequency.
    """
)
@click.option(
    "-f",
    "--file",
    "filename",
    type = str,
    help = "name of the input tab file (reads from stdin if omitted)"
)
@click.option(
    "-s",
    "--skip_header",
    is_flag = True,
    help = "skip the first line of the input (useful if it contains a header)"
)
@click.option(
    "-e",
    "--include_explanation",
    is_flag = True,
    help = "include an explanation field in the output"
)
@click.option(
    "-a",
    "--approx_zones",
    is_flag = True,
    help = "use a corse-grained algorithm to find candidate places for a commit time"
)
@click.option(
    "-n",
    "--names_file",
    "names_filename",
    type = str,
    required = True,
    help = "names tabfile"
)
@click.option(
    "-p",
    "--places_file",
    "places_filename",
    type = str,
    required = True,
    help = "people tabfile"
)
@click.option(
    "-w",
    "--workers",
    type = click.IntRange(2, 4096),
    required = False,
    help = "number of parallel workers"
)
@click.option(
    "-x",
    "--reuse_zone_assignments",
    is_flag = True,
    help = "use a zone assignment cache: when an assignment has been made for a name,email pair reuse it regardless of current commit time/timezone"
)
@click.option(
    "-y",
    "--share_zone_assignments",
    is_flag = True,
    help = "share the zone assignment cache between parallel workers"
)
@click.option(
    "--loglevel",
    type = str,
    help = "set the log level (allowed values: DEBUG, INFO, WARNING, ERROR, CRITICAL"
)
def main(filename, loglevel, skip_header, names_filename, places_filename, approx_zones, include_explanation, workers, reuse_zone_assignments, share_zone_assignments):
    """Read the commits, assign a zone to each of them and print the result.

    The commits are processed in this process, one after the other, unless a
    number of *workers* is given: in that case they are dispatched through a
    queue to that many worker processes, and a dedicated writer process
    prints the results (whose order is then not guaranteed to match the
    input).
    """
    if loglevel is not None:
        logging.basicConfig(level=loglevel)

    find_places_at_timezone = find_places_at_timezone_approximate if approx_zones else find_places_at_timezone_precise

    names, names_by_name = read_names(names_filename)
    places, places_by_tz = read_places(places_filename)

    parallel_processing = workers is not None
    worker_processes = []

    if parallel_processing:
        #setup work item and output queues then start the writer and the worker processes
        zone_assignment_cache = None
        if reuse_zone_assignments:
            if share_zone_assignments:
                manager = Manager()
                zone_assignment_cache = manager.dict()
            else:
                # a plain dict is copied into each worker: every worker has its own cache
                zone_assignment_cache = {}
        work_queue = Queue(QUEUE_SIZE)
        res_queue = Queue(QUEUE_SIZE)
        writer_process = Process(target=writer, args=(res_queue, workers))
        writer_process.start()
        for _ in range(workers):
            process = Process(target=worker, args=(work_queue, res_queue, find_places_at_timezone, places, places_by_tz, names, names_by_name, zone_assignment_cache))
            process.start()
            worker_processes.append(process)
    else:
        zone_assignment_cache = {} if reuse_zone_assignments else None

    def stop_workers():
        """Send the death pill, then wait for the writer and the workers to finish."""
        work_queue.put(DEATH_PILL)
        # the writer returns only after every worker has reported its termination
        writer_process.join()
        for process in worker_processes:
            process.join()

    try:
        with sys.stdin if filename is None else open(filename, mode="r") as file:
            csv_reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
            if skip_header:
                # the default keeps an empty input from raising StopIteration
                next(csv_reader, None)
            for csv_record in csv_reader:
                commit = CommitRecord._make(csv_record)
                # NOTE(review): the name is lower-cased before being scored, so the
                # CamelNotation splitting of advanced_splitter() cannot trigger for
                # names coming through here - confirm this is intended.
                commit = commit._replace(fullname=commit.fullname.lower())
                logging.info(f'Processing {commit}')

                if parallel_processing:
                    work_queue.put((commit, include_explanation))
                else:
                    assigned_zone, explanation = assign_zone(find_places_at_timezone, places, places_by_tz, names, names_by_name, commit, include_explanation, zone_assignment_cache)
                    logging.info(f'assigned_zone: {assigned_zone} - include_explanation: {include_explanation} - explanation: {explanation}')
                    write_line(commit, assigned_zone, explanation)
    except Exception:
        # The input could not be read or parsed (e.g. a row with the wrong
        # number of fields). The child processes are not daemons: without the
        # death pill they would wait on the work queue forever and keep the
        # program from terminating.
        if parallel_processing:
            stop_workers()
        raise

    if parallel_processing:
        stop_workers()

# Run the command-line interface only when the file is executed as a script;
# main() is a click command, so its arguments are taken from the command line.
if __name__ == "__main__":
    main() #pylint: disable=no-value-for-parameter
