#!/bin/bash
# Refuse to run unless GTFREE names the freecorpus working copy.
# The expansion is quoted (and defaulted) so an unset value, or a path
# containing whitespace, is tested as a single word instead of relying on
# the one-argument form of `test` or tripping "too many arguments".
if [[ -z "${GTFREE:-}" ]]; then
    # Usage hints are diagnostics, so they go to stderr.
    printf '%s\n' \
        'export GTFREE=path to freecorpus and continue' \
        'to get freecorpus:' \
        'svn checkout https://gtsvn.uit.no/freecorpus' >&2
    exit 1
fi
# Full-corpus pass: convert the sme originals under $GTFREE to XML, dump every
# converted document through ccat into $DATA three times (flags "-a -c",
# "-a" and "-S"), then split the first two dumps into a 1000-line validation
# part and a training part holding the remaining lines.
DATA=$(pwd)/data
# Every redirection below targets this directory; create it up front so a
# missing directory does not surface only after the conversion has run.
mkdir -p "$DATA" || exit 1
# Stop here if the corpus directory is unusable; otherwise convert2xml and
# find would run relative to whatever directory the caller was in.
pushd "$GTFREE" || exit 1
convert2xml --goldstandard orig/sme

# ccat_all OUTFILE CCAT_FLAG...
# Run ccat with the given flags over all converted sme XML files, in sorted
# path order, writing the combined output to OUTFILE.
# File names travel NUL-delimited so that spaces, quotes or backslashes in a
# path are not reinterpreted by xargs.
ccat_all() {
    local outfile=$1
    shift
    find converted/sme correct-no-gs/converted/sme -name '*.xml' -print0 \
        | sort -z \
        | xargs -0 ccat "$@" > "$outfile"
}

ccat_all "$DATA/src-gtfree.text" -a -c
ccat_all "$DATA/tgt-gtfree.text" -a
ccat_all "$DATA/gtfree.stuff" -S

# Same split point for both sides: lines 1-1000 -> valid, 1001.. -> train.
for side in src tgt; do
    head -n 1000 "$DATA/$side-gtfree.text" > "$DATA/$side-gtfree-valid.text"
    tail -n +1001 "$DATA/$side-gtfree.text" > "$DATA/$side-gtfree-train.text"
done
popd

# Per-genre pass: dump each genre of the converted sme corpus through ccat
# into $DATA/{src,tgt}-gtfree-GENRE.text, then concatenate the genre files
# into the partial train set (admin, bible, blogs, facta) and the partial
# valid set (ficti, laws, science, news) for each side.
# Stop here if the corpus directory is unusable; otherwise find would run
# relative to whatever directory the caller was in.
pushd "$GTFREE" || exit 1

# ccat_genre SIDE GENRE
# Write the ccat dump of one genre to $DATA/SIDE-gtfree-GENRE.text.
# SIDE "src" runs `ccat -a -c`; any other SIDE (here: "tgt") runs `ccat -a`.
# File names travel NUL-delimited so that spaces, quotes or backslashes in a
# path are not reinterpreted by xargs.
ccat_genre() {
    local side=$1 genre=$2
    local -a roots=("converted/sme/$genre")
    local -a flags=(-a)
    # Only these genres are also read from the correct-no-gs tree.
    case $genre in
        admin | facta | ficti | news)
            roots+=("correct-no-gs/converted/sme/$genre")
            ;;
    esac
    if [[ $side == src ]]; then
        flags+=(-c)
    fi
    find "${roots[@]}" -name '*.xml' -print0 \
        | sort -z \
        | xargs -0 ccat "${flags[@]}" > "$DATA/$side-gtfree-$genre.text"
}

# build_partial SIDE SPLIT GENRE...
# Dump every listed genre for SIDE, then concatenate the per-genre files, in
# the order given, into $DATA/SIDE-gtfree-partial-SPLIT.text.
build_partial() {
    local side=$1 split=$2
    shift 2
    local genre
    local -a parts=()
    for genre in "$@"; do
        ccat_genre "$side" "$genre"
        parts+=("$DATA/$side-gtfree-$genre.text")
    done
    cat "${parts[@]}" > "$DATA/$side-gtfree-partial-$split.text"
}

# Genre order is identical for src and tgt so both sides are concatenated
# in the same sequence.
for side in src tgt; do
    build_partial "$side" train admin bible blogs facta
    build_partial "$side" valid ficti laws science news
done
popd
