#!/bin/bash
# Build the gtbound text files in ./data from a boundcorpus checkout.
#
# Required environment:
#   GTBOUND  path to a boundcorpus working copy
#
# Required tools on PATH: convert2xml, ccat.
#
# Files written to ./data (relative to the directory the script is started in):
#   src-gtbound.text, tgt-gtbound.text, gtbound.stuff
#   src-gtbound-valid.text / src-gtbound-train.text
#   tgt-gtbound-valid.text / tgt-gtbound-train.text
#
# Exit status: 1 if GTBOUND is unset/empty, if ./data cannot be created,
# or if the GTBOUND directory cannot be entered.

if [[ -z "${GTBOUND:-}" ]]; then
    # Usage help is a diagnostic, so it goes to stderr.
    cat >&2 <<'EOF'
export GTBOUND=path to boundcorpus and continue
to get boundcorpus:
svn checkout https://gtsvn.uit.no/boundcorpus
EOF
    exit 1
fi

# Resolve the output directory before changing directory below.
DATA="$(pwd)/data"
mkdir -p -- "$DATA" || exit 1

# Number of leading lines that go into the *-valid.text files; the rest
# of each file goes into the matching *-train.text file.
VALID_LINES=1000

# Print the sorted, NUL-delimited list of converted XML files.
# NUL delimiters keep file names containing whitespace intact for xargs.
list_converted_xml() {
    find converted/sme correct-no-gs/converted/sme -name '*.xml' -print0 \
        | sort -z
}

# Without this check a failed pushd would let everything below run in the
# caller's directory and clobber the files in $DATA with wrong content.
pushd "$GTBOUND" || exit 1

# NOTE(review): a convert2xml failure is not treated as fatal, matching the
# previous behaviour -- confirm whether a partial conversion is acceptable.
convert2xml --goldstandard orig/sme

# NOTE(review): the meaning of the ccat flags (-a, -c, -S) is not visible in
# this script; consult the ccat documentation before changing them.
list_converted_xml | xargs -0 ccat -a -c > "$DATA/src-gtbound.text"
list_converted_xml | xargs -0 ccat -a > "$DATA/tgt-gtbound.text"
list_converted_xml | xargs -0 ccat -S > "$DATA/gtbound.stuff"

# Split src and tgt at the same line so the two sides use identical
# line ranges for validation and training.
for side in src tgt; do
    head -n "$VALID_LINES" "$DATA/$side-gtbound.text" \
        > "$DATA/$side-gtbound-valid.text"
    tail -n "+$((VALID_LINES + 1))" "$DATA/$side-gtbound.text" \
        > "$DATA/$side-gtbound-train.text"
done

popd
