#!/usr/bin/env python3
#

"""Change text corpus pair to character token format with _ spaces."""

from sys import argv

def char_tokens(words):
    """Return *words* joined by "_" with every character space-separated.

    For example ``["ab", "c"]`` becomes ``"a b _ c"``; an empty list
    becomes the empty string.
    """
    return " ".join("_".join(words))


def split_pair(words1, words2, size=5):
    """Yield aligned (source, target) character-token lines.

    Both word lists are cut into consecutive chunks of *size* words. The
    number of chunks is driven by the longer list, so the shorter side
    yields empty strings once it runs out and the two outputs stay
    line-aligned.
    """
    longest = max(len(words1), len(words2))
    # Ceiling division, so an exact multiple of `size` does not produce a
    # trailing empty pair. At least one pair is always emitted so that an
    # empty input line still maps to one (empty) output line.
    chunks = max(1, -(-longest // size))
    for i in range(chunks):
        lo, hi = i * size, (i + 1) * size
        yield char_tokens(words1[lo:hi]), char_tokens(words2[lo:hi])


def main():
    """Read SRC and TGT from the command line and write the .chars files.

    Exits with status 1 on wrong usage or when TGT has fewer lines than
    SRC. Surplus TGT lines are ignored with a warning.
    """
    if len(argv) != 3:
        print("Usage:", argv[0], "SRC TGT")
        raise SystemExit(1)

    src_path, tgt_path = argv[1], argv[2]
    src_out_path = src_path + ".chars"
    tgt_out_path = tgt_path + ".chars"

    print("Reading from", src_path, tgt_path)
    # Context managers guarantee all four files are flushed and closed,
    # even when processing aborts part-way through.
    with open(src_path) as src, open(tgt_path) as tgt:
        print("Writing to", src_out_path, tgt_out_path)
        with open(src_out_path, "w") as src_out, \
                open(tgt_out_path, "w") as tgt_out:
            for src_line in src:
                tgt_line = next(tgt, None)
                if tgt_line is None:
                    # A parallel corpus must be line-aligned; stop with a
                    # clear message rather than a StopIteration traceback.
                    raise SystemExit(
                        "Error: " + tgt_path + " has fewer lines than "
                        + src_path
                    )
                pairs = split_pair(src_line.split(), tgt_line.split())
                for src_chars, tgt_chars in pairs:
                    print(src_chars, file=src_out)
                    print(tgt_chars, file=tgt_out)
            if next(tgt, None) is not None:
                print("Warning:", tgt_path, "has more lines than", src_path)


if __name__ == "__main__":
    main()
