# Script used for RepEnrich2 analysis of preprocessed total RNA-seq data # In case of questions contact Jens Johansen - jens.johansen@bric.ku.dk ######################################################################## ### SETUP REPEAT DATABASE ######################################################################## ORG=mm10 RE_DIR=/home/jens/programs/RepEnrich2 ANN_DIR=~/data/ann/RepEnrich2/${ORG} #1. get repeatmasker annotation #http://www.repeatmasker.org/species/hg.html #Download version - mm10 - RepeatMasker open-4.0.5 - Repeat Library 20140131 mkdir -p $ANN_DIR && cd $ANN_DIR wget -N http://www.repeatmasker.org/genomes/mm10/RepeatMasker-rm405-db20140131/mm10.fa.out.gz gunzip mm10.fa.out.gz ln -s mm10.fa.out mm10.rm4.0.5.fa.out #2. Run setup for each repeat annotation file mkdir -p ${ANN_DIR}/re2_index nice python ${RE_DIR}/RepEnrich2_setup.py --threads 5 ${ANN_DIR}/mm10.rm4.0.5.fa.out /k/genomes/${ORG}/fa/merged/${ORG}_full.fa ${ANN_DIR}/re2_index >${ANN_DIR}/re2_index.log 2>&1 #RepEnrich2_setup.py : # 1) For each repeat extracts all instances and concatenates them to a single fasta file incl 25 bp flanking seq (individual instances delimited by 200 N's). # 2) Create bowtie2 index for repeat type fasta file. ######################################################################## ### RUN RE2 ######################################################################## ORG=mm10 RE_DIR=/home/jens/programs/RepEnrich2 ANN_DIR=~/data/ann/RepEnrich2/${ORG} FASTQ_DIR=/home/jens/BRIC/aditya_sankar/rnaseq_nov2018/fastq_trim BASE_DIR=/home/jens/BRIC/aditya_sankar/rnaseq_nov2018/repenrich2 CORES=8 ###1. Run bowtie mapping with --max option to capture the multimapping reads #bowtie map unique mkdir -p ${BASE_DIR}/bowtie2 for flab in $(ls ${FASTQ_DIR}/*R1.trim.fastq.gz | xargs -r -L1 basename | perl -pe "s/_R1.trim.fastq.gz//"); do f1=${FASTQ_DIR}/${flab}_R1.trim.fastq.gz f2=${FASTQ_DIR}/${flab}_R2.trim.fastq.gz echo "nice bowtie2 -q -p $CORES -x /k/genomes/${ORG}/index/bowtie2_full/${ORG} -1 $f1 -2 $f2 -S /dev/stdout 2>${BASE_DIR}/bowtie2/${flab}_bowtie2.log | samtools view -Sb - > ${BASE_DIR}/bowtie2/${flab}.bam" done | nice parallel -j 3 #split uniquely and multi mapping reads cd ${BASE_DIR}/bowtie2 for f in `ls *.bam | grep -v "unique"`; do flab=`basename $f .bam` echo "python ${RE_DIR}/RepEnrich2_subset.py --pairedend TRUE $f 30 ${flab}_uniq" done | parallel -j 12 #echo "nice samtools index $f" #make bowtie2 mapping summary file for f in `ls *_bowtie2.log`; do sed -e '1d' -e '$d' $f | perl -pe "s/^ +(\d+) .+\) /\\1\t/" | perl -pe "s/aligned //" | perl -pe "s/were unpaired; of these:/input/" | perl -pe "s/\>1/gt1/" | tr " " "_" | cuto -f2,1 | cat <(echo -e "file\t$f") - | transpose_table done | sed '3~2d' > bowtie2_summary.log ###2. Run RepEnrich main module cd ${BASE_DIR} mkdir -p ${BASE_DIR}/re2out for flab in $(ls ${BASE_DIR}/bowtie2/*_multimap_R[12].fastq | xargs -r -L1 basename | perl -pe "s/_multimap_R[12].fastq//" | sort -u); do f1=${BASE_DIR}/bowtie2/${flab}_multimap_R1.fastq f2=${BASE_DIR}/bowtie2/${flab}_multimap_R2.fastq echo "nice python ${RE_DIR}/RepEnrich2.py --pairedend TRUE --fastqfile2 $f2 ${ANN_DIR}/mm10.rm4.0.5.fa.out ${BASE_DIR}/re2out/${flab} $flab ${ANN_DIR}/re2_index $f1 ${BASE_DIR}/bowtie2/${flab}_unique.bam --cpus $CORES >${BASE_DIR}/re2out/${flab}.RepEnrich2.log 2>&1" done > RepEnrich2_py_cmds.txt cat RepEnrich2_py_cmds.txt | parallel -j 3 #tycho : bedtools v2.26.0-92-g88cd6c5,bowtie version 1.1.2 #rename to have same labels as in 'run2' rename "s/uniq/subset/" * # optional args for RepEnrich2.py # (--fastqfile2) -- second pair if applicable # (--is_bed TRUE) -- if using a custom .bed file for repeat annotation # (--cpus N) #nb:only specify even N for paired end data! # (--pairedend TRUE) -- if using paired end data