#!/usr/bin/env python3

"""Smudge exported authors into a more analyzable format.

Specifically, apply the following changes:

- convert hex-encoded byte sequences to UTF-8 strings (the number of non-decodable,
  and hence skipped, rows is recorded and reported at the end of the conversion)
- collapse every run of blanks in the decoded fields to a single space and strip
  leading/trailing blanks

"""

__copyright__ = "Copyright (C) 2020  Stefano Zacchiroli"
__license__ = "GPL-3.0-or-later"

import csv
import logging
import sys
import re

from typing import List

from pgcsv2utf8 import csv_filter, CSV_SEP, pghex_to_str


BLANKS_re = re.compile(r"\s+")  # one or more consecutive whitespace characters


def normalize_spaces(s: str) -> str:
    """Collapse each run of blanks in *s* to one space and trim both ends.

    A string made only of blanks (or the empty string) yields "".
    """
    # Splitting on blank runs leaves empty strings at the edges when *s* starts
    # or ends with blanks; dropping them is what trims the result.
    words = [chunk for chunk in BLANKS_re.split(s) if chunk]
    return " ".join(words)

def smudge_row(row: List[str]) -> List[str]:
    """Return a cleaned-up copy of one exported author row.

    The first column (person id) is passed through untouched; the second
    (fullname) and third (email) columns are decoded with pghex_to_str and
    then have their blanks normalized. Columns past the third are dropped.
    """
    # Sample input fields:   16879509 | \x4c617572612042656e6e657474 | <hex email>
    # Sample output fields:  16879509 | Laura Bennett                | <decoded email>
    #
    # NOTE(review): the email column is read by the code below, but no sample
    # value for it is available here -- confirm against the actual export.
    person_id = row[0]
    fullname = normalize_spaces(pghex_to_str(row[1]))
    email = normalize_spaces(pghex_to_str(row[2]))

    return [person_id, fullname, email]


def main():
    """Filter CSV rows from stdin to stdout, passing each through smudge_row.

    Both streams use CSV_SEP as the field delimiter. The value returned by
    csv_filter is treated as the count of rows that could not be converted
    (inferred from how it is used here) and, when non-zero, is logged as an
    error.
    """
    reader = csv.reader(sys.stdin, delimiter=CSV_SEP)
    writer = csv.writer(sys.stdout, delimiter=CSV_SEP)

    errors = csv_filter(reader, writer, smudge_row)
    if not errors:
        return  # nothing was skipped, so there is nothing to report
    logging.error(f"skipped {errors} row(s) due to conversion errors")


# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
