#!/usr/bin/env python3

"""filter authors list to keep only rows that correspond to plausible full names

plausibility criteria are:

- being non-empty
- having enough (unicode) letter characters, as opposed to non-letters
- not being an email
- not being too long

as a side effect: normalize blank sequences to single spaces

"""

__copyright__ = "Copyright (C) 2020-2021  Stefano Zacchiroli"
__license__ = "GPL-3.0-or-later"

import csv
import logging
import sys
import unicodedata
import re

from collections import defaultdict
from typing import Dict

from pgcsv2utf8 import CSV_SEP as CSV_SEP_IN


MAX_NAME_LEN = 100  # maximum accepted name length, in characters
MIN_ALPHA = 0.90  # minimum share of letter characters in names (fraction, 0-1)
# email address: no blanks, exactly one "@" with text on both sides of it
EMAIL_re = re.compile(r"^[^\s@]+@[^\s@]+$")
CSV_SEP_OUT = "\t"  # field delimiter used for the output CSV


def is_alpha(s: str, min_alpha: float = MIN_ALPHA) -> bool:
    """Tell whether a large enough share of the characters of *s* are letters.

    A character counts as a letter when its Unicode general category starts
    with "L" (Lu, Ll, Lt, Lm, Lo); blanks, digits and punctuation do not.

    Args:
        s: the string to inspect.
        min_alpha: minimum letters/length ratio (between 0 and 1, inclusive)
            for the string to be accepted.

    Returns:
        True if the ratio of letters to total characters is at least
        *min_alpha*. The empty string has no letters and always yields False.
    """
    if not s:
        # The ratio is undefined for an empty string; answer False rather
        # than failing with ZeroDivisionError.
        return False

    alpha_chars = sum(1 for c in s if unicodedata.category(c).startswith("L"))
    return alpha_chars / len(s) >= min_alpha


def filter_names(csv_reader, csv_writer) -> Dict[str, int]:
    """Copy the rows that carry a plausible full name to *csv_writer*.

    Every input row must have exactly three fields, unpacked as person id,
    full name and email; a row of any other width raises ValueError. The
    email field is not inspected, only passed through.

    The full name is first normalized: each run of blanks is collapsed to a
    single space and leading/trailing blanks are dropped. The normalized name
    is what gets checked and what gets written out. A row is skipped when its
    name is, in order of evaluation: empty, made of too few letter
    characters, an email address, or longer than MAX_NAME_LEN.

    Args:
        csv_reader: iterable of 3-field rows, e.g. a ``csv.reader``.
        csv_writer: object with a ``writerow`` method, e.g. a ``csv.writer``.

    Returns:
        Row counters keyed by "total_in", "total_out", "skipped" and, for each
        rejection reason that occurred at least once, "empty", "nonalpha",
        "email" and "toolong".
    """
    stats: Dict[str, int] = defaultdict(int)

    for person_id, fullname, email in csv_reader:
        stats["total_in"] += 1

        # str.split() with no argument splits on any run of whitespace and
        # ignores leading/trailing whitespace, so re-joining on " " yields
        # the normalized name; blank-only names become empty.
        fullname = " ".join(fullname.split())

        if not fullname:
            stats["empty"] += 1
            continue
        elif not is_alpha(fullname):
            stats["nonalpha"] += 1
            continue
        elif EMAIL_re.match(fullname):
            stats["email"] += 1
            continue
        elif len(fullname) > MAX_NAME_LEN:
            stats["toolong"] += 1
            continue

        csv_writer.writerow((person_id, fullname, email))
        stats["total_out"] += 1

    # Reading "total_out" here also creates it (as 0) when no row was kept.
    stats["skipped"] = stats["total_in"] - stats["total_out"]

    return stats


def main():
    """Filter the CSV read from stdin onto stdout, then log the counters.

    Input fields are separated by CSV_SEP_IN, output fields by CSV_SEP_OUT.
    Each counter returned by filter_names is logged on its own line at error
    level (presumably so that it is emitted without any logging setup).
    """
    writer = csv.writer(sys.stdout, delimiter=CSV_SEP_OUT)
    reader = csv.reader(sys.stdin, delimiter=CSV_SEP_IN)

    for k, v in filter_names(reader, writer).items():
        logging.error(f"{k}: {v}")


if __name__ == "__main__":
    main()
