# Script used for preprocessing of total RNA-seq data - from QC to counts per sample per gene # In case of questions contact Jens Johansen - jens.johansen@bric.ku.dk # 76bp PE - total RNA BASE_DIR=/home/jens/BRIC/aditya_sankar/rnaseq_nov2018 ############################################################################################ ### FASTQC & FASTQ SCREEN QUALITY CONTROL - BEFORE TRIMMING AND FILTERING ############################################################################################ cd $BASE_DIR mkdir -p fastqc fastq_screen #fastqc & fastq_screen for f in `ls ${BASE_DIR}/fastq/*fastq.gz`;do echo "nice fastqc --quiet -o ${BASE_DIR}/fastqc -f fastq $f" echo "nice fastq_screen --quiet --subset 2500000 --outdir ${BASE_DIR}/fastq_screen $f" done | parallel --will-cite -j 4 ############################################################################################ ### TRIM ############################################################################################ BASE_DIR=/home/jens/BRIC/aditya_sankar/rnaseq_nov2018 CORES=5 #TruSeq protocol used (also checked with Trim Galores auto-detect) ADAPTER_FILE=/usr/local/lib/Trimmomatic-0.32/adapters/TruSeq2-PE.fa mkdir -p ${BASE_DIR}/fastq_trim && cd ${BASE_DIR}/fastq_trim for mylab in $(ls ${BASE_DIR}/fastq/*.fastq.gz | xargs -r -L1 basename | perl -pe "s/_R[12].fastq.gz//" | sort -u); do echo "nice java -Xmx5g -jar /usr/local/lib/Trimmomatic-0.32/trimmomatic-0.32.jar PE -threads $CORES -phred33 ${BASE_DIR}/fastq/${mylab}_R1.fastq.gz ${BASE_DIR}/fastq/${mylab}_R2.fastq.gz ${mylab}_R1.trim.fastq.gz ${mylab}_R1.unpair.fastq.gz ${mylab}_R2.trim.fastq.gz ${mylab}_R2.unpair.fastq.gz ILLUMINACLIP:${ADAPTER_FILE}:2:30:10:1:true HEADCROP:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:25 > ${mylab}_trimmomatic.log 2>&1" done | parallel -j 4 ############################################################################ ### ALIGN WITH STAR ############################################################################ ORG=mm10 GTF_FILE=/k/genomes/${ORG}/ann/2018_08_05/derived/refGene_${ORG}_all_incl_multi.gtf READ_LENGTH=65 #set to read length - 1 = 66 (max length after trimming)-1 = 65. used for --sjdbOverhang database STAR_INDEX=/k/genomes/${ORG}/index/STAR_canonical STAR_MismatchNmax=5 STAR_MismatchNoverLmax=0.1 STAR_MatchNmin=16 CORES=5 #STAR genomeLoad=LoadAndExit --genomeDir $STAR_INDEX --readFilesIn $(ls ${BASE_DIR}/fastq_trim/*fastq.gz | head -n 1) mkdir -p ${BASE_DIR}/alignment && cd ${BASE_DIR}/alignment for mylab in $(ls ${BASE_DIR}/fastq_trim/*R1.trim.fastq.gz | xargs -r -L1 basename | perl -pe "s/_R1.trim.fastq.gz//"); do #mylab=`basename $f .fastq.gz` f1=${BASE_DIR}/fastq_trim/${mylab}_R1.trim.fastq.gz f2=${BASE_DIR}/fastq_trim/${mylab}_R2.trim.fastq.gz echo "nice STAR-2.5.1a --runThreadN $CORES --genomeDir $STAR_INDEX --readFilesIn $f1 $f2 --readFilesCommand zcat --sjdbGTFfile $GTF_FILE --sjdbOverhang $READ_LENGTH --twopassMode Basic --outSAMtype BAM SortedByCoordinate --outSAMattributes All --outSAMunmapped Within --outFileNamePrefix $mylab --outFilterMismatchNoverLmax $STAR_MismatchNoverLmax --outFilterMatchNmin $STAR_MatchNmin --outFilterMismatchNmax $STAR_MismatchNmax >${BASE_DIR}/alignment/${mylab}.log 2>&1" done | parallel --will-cite -j 3 #index files for f in $(ls *.bam); do echo "samtools index $f"; done | nice parallel -j 5 #cleanup rm -r *_STARtmp *_STARpass1 *_STARgenome ############################################################################ ### RSEQC ############################################################################ mkdir -p ${BASE_DIR}/rseqc && cd ${BASE_DIR}/rseqc #make list of bam files to analyze ls -1 ${BASE_DIR}/alignment/*.bam > bam_files.txt ANN_FILE=/k/genomes/mm10/ann/2018_08_05/derived/refGene_mm10_all_incl_multi.bed12 infer_experiment.py -r $ANN_FILE -i $(head -n 1 bam_files.txt) > infer_experiment.log # This is PairEnd Data # Fraction of reads failed to determine: 0.0049 # Fraction of reads explained by "1++,1--,2+-,2-+": 0.0426 # Fraction of reads explained by "1+-,1-+,2++,2--": 0.9525 #RESULT - stranded, reverse #estimate genebody coverage shuf -n 1000 $ANN_FILE > small_ann.bed12 nice geneBody_coverage.py -r small_ann.bed12 -i bam_files.txt -o totscRNAseq.rseqc >genebody_coverage.log 2>&1 #calculate TIN score for samples tin.py -r $ANN_FILE -i bam_files.txt ############################################################################ ### COUNTS ############################################################################ mkdir -p ${BASE_DIR}/counts && cd ${BASE_DIR}/counts ORG=mm10 GTF_FILE=/k/genomes/${ORG}/ann/2018_08_05/derived/refGene_${ORG}_all_incl_multi.gtf nice featureCounts -T 10 --primary -p -B -O -M --fraction -s 2 -J -a $GTF_FILE -o feaureCounts_sctotRNAseq ../alignment/*.bam >feaureCounts_sctotRNAseq.log 2>&1 ############################################################################ ### GEO ############################################################################ #md5sum cat md5sum_fastq.txt | perl -pe "s/ +/\t/" | cuto -f2,1 | perl -pe "s/^(.+-TotRSq-.+-Pool4_S(\\d+)_R.+)\t/\\1\t\\2\t/" | sort -k2,2n | cut -f1,3 > junk.txt cut -f1 junk.txt | while : do read -r r1 || break read -r r2 : do echo -e "$r1\t$r2" done #colleact insert size metrics for each alignment bam file cd ~/BRIC/aditya_sankar/kdm4a/rnaseq_nov2018/repenrich2/bowtie2 for f in $(ls *bam | grep -v uniq); do #echo $f echo "samtools view -h -f 0x2 $f | nice java -XX:ParallelGCThreads=4 -jar /usr/local/bin/picard.jar CollectInsertSizeMetrics I=/dev/stdin O=$(basename $f .bam)_insert.txt H=$(basename $f .bam)_insert.pdf M=0.5" done | nice parallel -j 5 # for f in $(ls *insert.txt); do isize=$(grep -A 1 "MEAN_INSERT_SIZE" $f | sed '1d' | cut -f5); echo -e "$f\t$isize"; done | perl -pe "s/_insert.txt//" | perl -pe "s/^(.+-TotRSq-.+-Pool4_S(\\d+))\t/\\1\t\\2\t/" | sort -k2,2n | cut -f1,3 > mean_insert_size.txt #NB. '2c' cells have shorter mean insert size than other cell stages