# Script used for RepEnrich2 analysis of preprocessed total RNA-seq data
# In case of questions contact Jens Johansen - jens.johansen@bric.ku.dk

########################################################################
### SETUP REPEAT DATABASE
########################################################################

ORG=mm10 
RE_DIR=/home/jens/programs/RepEnrich2
ANN_DIR=~/data/ann/RepEnrich2/${ORG}

#1. get repeatmasker annotation
#http://www.repeatmasker.org/species/hg.html
#Download version - mm10 - RepeatMasker open-4.0.5 - Repeat Library 20140131
mkdir -p $ANN_DIR && cd $ANN_DIR
wget -N http://www.repeatmasker.org/genomes/mm10/RepeatMasker-rm405-db20140131/mm10.fa.out.gz
gunzip mm10.fa.out.gz
ln -s mm10.fa.out mm10.rm4.0.5.fa.out

#2. Run setup for each repeat annotation file
mkdir -p ${ANN_DIR}/re2_index
nice python ${RE_DIR}/RepEnrich2_setup.py --threads 5 ${ANN_DIR}/mm10.rm4.0.5.fa.out /k/genomes/${ORG}/fa/merged/${ORG}_full.fa ${ANN_DIR}/re2_index >${ANN_DIR}/re2_index.log 2>&1 

#RepEnrich2_setup.py : 
# 1) For each repeat extracts all instances and concatenates them to a single fasta file incl 25 bp flanking seq (individual instances delimited by 200 N's). 
# 2) Create bowtie2 index for repeat type fasta file. 

########################################################################
### RUN RE2
########################################################################
  
ORG=mm10
RE_DIR=/home/jens/programs/RepEnrich2
ANN_DIR=~/data/ann/RepEnrich2/${ORG}

FASTQ_DIR=/home/jens/BRIC/aditya_sankar/rnaseq_nov2018/fastq_trim
BASE_DIR=/home/jens/BRIC/aditya_sankar/rnaseq_nov2018/repenrich2
CORES=8

###1. Run bowtie mapping with --max option to capture the multimapping reads
#bowtie map unique
mkdir -p ${BASE_DIR}/bowtie2
for flab in $(ls ${FASTQ_DIR}/*R1.trim.fastq.gz | xargs -r -L1 basename | perl -pe "s/_R1.trim.fastq.gz//"); do
  f1=${FASTQ_DIR}/${flab}_R1.trim.fastq.gz
  f2=${FASTQ_DIR}/${flab}_R2.trim.fastq.gz
  echo "nice bowtie2 -q -p $CORES -x /k/genomes/${ORG}/index/bowtie2_full/${ORG} -1 $f1 -2 $f2 -S /dev/stdout 2>${BASE_DIR}/bowtie2/${flab}_bowtie2.log | samtools view -Sb - > ${BASE_DIR}/bowtie2/${flab}.bam"
done | nice parallel -j 3 

#split uniquely and multi mapping reads
cd ${BASE_DIR}/bowtie2
for f in `ls *.bam | grep -v "unique"`; do
  flab=`basename $f .bam`
  echo "python ${RE_DIR}/RepEnrich2_subset.py --pairedend TRUE $f 30 ${flab}_uniq" 
done | parallel -j 12

#echo "nice samtools index $f"
#make bowtie2 mapping summary file
for f in `ls *_bowtie2.log`; do
  sed -e '1d' -e '$d'  $f | perl -pe "s/^ +(\d+) .+\) /\\1\t/" | perl -pe "s/aligned //" | perl -pe "s/were unpaired; of these:/input/" | perl -pe "s/\>1/gt1/" | tr " " "_" | cuto -f2,1 | cat <(echo -e "file\t$f") - | transpose_table
done | sed '3~2d' > bowtie2_summary.log

###2. Run RepEnrich main module
cd ${BASE_DIR}
mkdir -p ${BASE_DIR}/re2out
for flab in $(ls ${BASE_DIR}/bowtie2/*_multimap_R[12].fastq | xargs -r -L1 basename | perl -pe "s/_multimap_R[12].fastq//" | sort -u); do
  f1=${BASE_DIR}/bowtie2/${flab}_multimap_R1.fastq
  f2=${BASE_DIR}/bowtie2/${flab}_multimap_R2.fastq
  echo "nice python ${RE_DIR}/RepEnrich2.py --pairedend TRUE --fastqfile2 $f2 ${ANN_DIR}/mm10.rm4.0.5.fa.out ${BASE_DIR}/re2out/${flab} $flab ${ANN_DIR}/re2_index $f1 ${BASE_DIR}/bowtie2/${flab}_unique.bam --cpus $CORES >${BASE_DIR}/re2out/${flab}.RepEnrich2.log 2>&1"
done > RepEnrich2_py_cmds.txt
cat RepEnrich2_py_cmds.txt | parallel -j 3   #tycho : bedtools v2.26.0-92-g88cd6c5,bowtie version 1.1.2

#rename to have same labels as in 'run2'
rename "s/uniq/subset/" *

# optional args for RepEnrich2.py 
# (--fastqfile2) <multimapping_read_2.fastq> -- second pair if applicable
# (--is_bed TRUE) -- if using a custom .bed file for repeat annotation
# (--cpus N) #nb:only specify even N for paired end data! 
# (--pairedend TRUE) -- if using paired end data