#!/usr/bin/env python3

__copyright__ = "Copyright (C) 2021  Stefano Zacchiroli"
__license__ = "GPL-3.0-or-later"

import click
import csv
import logging
import sys


# Default path of the ccTLD table (CSV file), used when --cctld-db is not given.
CCTLD_DB = "cctld.csv"
# Wikipedia page the ccTLD table is downloaded from (shown in the CLI help).
CCTLD_WP_URL = "https://en.wikipedia.org/wiki/Country_code_top-level_domain"
# Separator between the account name and the domain of an email address.
EMAIL_SEP = "@"
# Separator between the components of a domain name.
DOMAIN_SEP = "."

# Placeholder for the type of the ccTLD table; the intended type is dict[str, str].
CcTldDb = {}  # dict[str, str]


class InvalidEmail(ValueError):
    """Error raised when a string cannot be parsed as an email address."""


def load_cctld_db(fname) -> "dict[str, str]":
    """Load the ccTLD table from a CSV file.

    The CSV file must have a header row with (at least) the columns "Name" (the
    ccTLD, used as key of the returned mapping) and "Entity" (the associated
    entity name, used as value). If the same "Name" occurs more than once, the
    last row wins.

    Return a mapping ".TLD" -> "entity name".

    Raises OSError if the file cannot be opened and KeyError if a required column
    is missing.

    """
    # Explicit encoding: entity names can be non-ASCII, so do not depend on the
    # locale; "utf-8-sig" also skips a leading BOM that would otherwise end up in
    # the first header name. newline="" is what the csv module asks for, so that
    # newlines embedded in quoted fields are handled properly.
    with open(fname, encoding="utf-8-sig", newline="") as f:
        return {row["Name"]: row["Entity"] for row in csv.DictReader(f)}


def parse_email(raw_email):
    """Parse an email address into a pair <account name, domain parts>.

    Domain parts is the list of DOMAIN_SEP-separated domain components, in the
    order they appear in the address.

    Raises InvalidEmail if the address does not contain exactly one EMAIL_SEP,
    if the account name is empty, if the domain has fewer than two components,
    or if any domain component is empty (leading, trailing, or doubled
    separator).

    """
    # XXX TODO: bare bone parsing, needs to implement email RFC properly...
    message = f"invalid email address: {raw_email}"

    # partition() never raises, so the error below is not chained to an
    # unrelated "not enough/too many values to unpack" exception
    name, sep, domain = raw_email.partition(EMAIL_SEP)
    if not sep or not name or EMAIL_SEP in domain:
        raise InvalidEmail(message)

    domain_parts = domain.split(DOMAIN_SEP)
    if len(domain_parts) < 2 or not all(domain_parts):
        raise InvalidEmail(message)

    return name, domain_parts


def guess_country(cctlds, raw_email):
    """Return the entity name associated to the ccTLD of an email address.

    cctlds is a mapping ".TLD" -> "entity name"; the last domain component of
    raw_email, prefixed by ".", is looked up in it as is (no case
    normalization is performed here).

    Raises InvalidEmail (a ValueError subclass) if raw_email cannot be parsed,
    and a plain ValueError if its TLD is not in cctlds.

    """
    _, domain_parts = parse_email(raw_email)
    tld = domain_parts[-1]
    try:
        return cctlds["." + tld]
    except KeyError:
        # the KeyError is an implementation detail: keep it out of the traceback
        raise ValueError(f"ccTLD not found for email address: {raw_email}") from None


@click.command(
    help="""UNIX filter that detect the country of an email address based on ccTLD

    Input is considered to be a line-based database, with tab as default
    separator. Output is added to the input DB as additional columns.

    """
)
@click.option(
    "-d",
    "--cctld-db",
    "cctld_db_fname",
    type=click.Path(exists=True),
    default=CCTLD_DB,
    help=f"CSV table containing ccTLD information, downloaded from {CCTLD_WP_URL}."
    f" Default: {CCTLD_DB}",
)
@click.option(
    "-f",
    "--field",
    # 1-based: reject 0 and negative values, which would otherwise silently
    # index fields from the end of the line
    type=click.IntRange(min=1),
    default=1,
    help="1-based field number where to read the email address from (default: 1)",
)
@click.option(
    "-s", "--separator", type=str, default="\t", help="field separator (default: TAB)"
)
def main(cctld_db_fname, field, separator):
    """Filter stdin to stdout, appending to each line the country of its email.

    Each input line is split on separator and the email address is read
    (lowercased) from the given 1-based field. Lines whose country can be
    determined are printed with the country appended as an extra field; all
    other lines are dropped and counted. Non-zero error counts are logged at the
    end, one log entry per kind of error.

    """
    cctlds = load_cctld_db(cctld_db_fname)
    errors = {"missing_field": 0, "invalid_email": 0, "cctld_not_found": 0}

    for line in sys.stdin:
        fields = line.rstrip("\r\n").split(separator)
        if len(fields) < field:
            # short line: count it instead of crashing with IndexError
            errors["missing_field"] += 1
            continue

        raw_email = fields[field - 1].lower()
        try:
            country = guess_country(cctlds, raw_email)
        except InvalidEmail:  # must come first: it is a ValueError subclass
            errors["invalid_email"] += 1
        except ValueError:
            errors["cctld_not_found"] += 1
        else:
            # outside of the try body, so that an error while printing is never
            # mistaken for a lookup failure
            print(separator.join(fields + [country]))

    for error, count in errors.items():
        if count:
            logging.error("errors: %s: %s", error, count)

# Run the command line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
