#!/usr/bin/perl
use warnings;
use strict;
use Getopt::Std;
use File::Basename;
use threads;
use List::Util qw(sum);

## last updated April 7 2016
## version 0.15
##################################################################################
# a script to clean up reads (adaptor/duplicate/contamination removal), trimming # 
#  Sonal Singhal, sonal.singhal1 [at] gmail.com                                  # 
#  Ke Bi, kebi [at] berkeley.edu                                                 # 
##################################################################################


&main;
exit;

sub main {
        &usage if (@ARGV<1);
        my $command = shift(@ARGV);
        my %TRANS = (cleanSE=>\&cleanSE,  cleanPE=>\&cleanPE);
        die("Unknown command \"$command\"\n") if (!defined($TRANS{$command}));
        &{$TRANS{$command}};
      }

sub usage {
  die(qq/

Usage: scrubReads <command> [<arguments>]\n

Command: 

cleanPE:          clean paired end reads 

cleanSE:          clean single end reads

External dependencies: 
"bowtie2", "fastQC", "flash"
"trimmomatic", "super_deduper"
"cutadapt"

If read error correction is desired, 
"bfc" + "seqtk"(1.1-r91) 
or "Recorrector" is also needed

If run rescale qualty scores for museum DNA:
mapDamage2.0
novoalign (>=V3.04)
SAMtools (>1.3)

\n/);
}

sub cleanPE {
  
  die (qq/

Usage: scrubReads cleanPE  [options]
Options:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-f     DIR      Path to the folder of raw fastq files
-o     DIR      Path to the results folder 
-t     PATH     Path to trimmoatic executable
-c     FILE     Contaminant file
-m     FLOAT    percent similarity to contaminant file to
                consider a read to be a contaminant
                [0.90]
-R     INT      1=remove duplicates; 0 = keep all duplicates [1] 
                Note: for DGE analysis using RNAseq data, keeping 
                or removing duplicates is still on debate. 
-k     INT      Number of threads needed for bowtie2 alignment 
                (remove contamination), skewer(trimming adapters),
                trimmomatic (trimming adapters and low qual), read 
                error correction (for de novo assemblies) [4]
-h     INT      Trimmomatic trimming length cutoff [36]
-q     INT      Quality trimming threhold [20]
-d     CHAR     The particular library that you like to process, 
                if "all" is specified, all libraries in the folder
                (-f) will be processed sequenctially [all]
-M     INT      Also keep unmerged overlapping PE reads? 
                1 = yes (for DGE)
                0 = no (for SNP calling) [0]
-g     FLOAT    Maximum allowed ratio between the number of 
                mismatched base pairs and the overlap length [0.1]
-z              If z is supplied, use fastQC to evaluate 
                cleaned sequence reads [null]
-v              print version

#### The above assumes trimming Truseq adapters only
#### For trimming Nextera PE transposase adapters
-n              If n is supplied assuming trimming Nextera PE transposase Adapters ONLY [null]

####  Error correction is only recommended for de novo assemblies  ####
####  DO NOT run error correction for alignment based analyses ####

-N     INT      error correction
                0 = no error correction -- default
                1 = using bfc (when number of reads is < 30 million)
                2 = using Rcorrector (when number of reads is >= 30 million)
                [0]
-T     CHAR     if N=2 provide full path to the Rcorrector folder
-G     INT      if N=1 approx genome size (k\/m\/g allowed)
                for transcriptomic data set this to 50m [50m] 

#### rescale qualty scores for museum DNA:
-A              Sequence are from museum samples [null]
-B     FILE     a reference genome for alignment (for popgen projects)
-C     FOLDER   a folder with individual assemblies as references (for phylogenetic projects)
-D     INT      Maximum alignment score acceptable for the best alignment. Recommend use 150-180  
                (roughly 5-6 mismatches are allowed per read) [180].
-S              save DNA damage stats report? [null]                                

Note: 1) do not do error correction (-N\/-T) for museum DNA samples!
      2) I strongly recommend use Novoalign for all downstream alignments!
      3) require bedtools in path! 
      

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


\n/) if !@ARGV;
  
  my %opts = (f=>undef, o=>undef,  t=>undef, c=>undef,  g=>0.1, d=>"all", h=>36, M=>0,  R=>1, w=>1,  k=>4, q=>20, M=>0, N=>0, T=>undef, G =>"50m", m=>0.9, B=>undef, C=>undef,D=>180);  
  getopts('B:C:D:f:o:t:c:g:d:i:h:R:w:k:q:N:T:G:m:M:zvnAS', \%opts);
  
  if ($opts{v}) {
    print "version 0.15\n";
    exit;
  } 
  
  my $cpu = $opts{k};
  my $rd = $opts{R};
  my $quality = $opts{q};
  my $index = $opts{w};
  my $mismat = $opts{m};
  my $nomerge = $opts{M};
  
  my $rawdir = redir ($opts{f});
  my $outdir = redir ($opts{o});
  mkdir ($outdir) unless -d ($outdir); 
 
  my $contam = $opts{c};
  die(qq/\nHmm...I could not find where the contaminant file is.. Check "-c"? \n\n/) unless -e $contam;
  
  my $uniad = 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT';
  
  
  my $P1 = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'; ##TruSeq3_IndexedAdapter READ1
  my $P2 = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA'; ##TruSeq3_UniversalAdapter READ2

  my $pe1 = 'TACACTCTTTCCCTACACGACGCTCTTCCGATCT';
  my $pe2 = 'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT';
  my $pe1_rc = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA';
  my $pe2_rc = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC';

  my $nextpe = "AGATGTGTATAAGAGACAG";
  my $nextse1 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG";
  my $nextse2 = "CTGTCTCTTATACACATCTGACGCTGCCGACGA";
  my $nextse3 = "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG";
  my $nextse4 = "CTGTCTCTTATACACATCTCCGAGCCCACGAGAC";

  my $P1rc = 'TACACTCTTTCCCTACACGACGCTCTTCCGATCT';
  my $P2rc = 'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT';

  
  my $trimmomatic = $opts{t};
  die(qq/\nHmm...I could not locate where the trimmomatic executable is.. Check "-t"? \n\n/) unless -e $trimmomatic;
  
  my $cutadapt = 'cutadapt';
  my $flash = 'flash';
  my $bowtie = 'bowtie2';
  my $cope = 'cope';##not used by this script
  my $minLength = $opts{h};

  my $rr = $opts{N};
  my $gz = $opts{G};
  my $rc = '';
  if ($rr == 2) {
    $rc = redir ($opts{T}) || die "can not locate Rcorrector!\n";
  } 
  
  my $dir = $rawdir . 'pre-clean/';
  
  if (! -s $dir) {
    rawclean ($rawdir);   
  }

  my @files;
  if ($opts{d} eq "all") {
    @files = <$dir*_R1.fq>; 
    die(qq/\nHmm...I do not know where you are hiding your raw sequence files... Check "-d" and\/or "-f"? \n\n/) if (scalar(@files) == 0);
    print "\n","OK! Now processing all data files!", "\n";
  }
  
  else {
    @files = <$dir*$opts{d}_R1.fq>; 
    die(qq/\nHmm...I do not know where you are hiding your raw sequence files... Check "-d" and\/or "-f"? \n\n/) if (scalar(@files) == 0);
    print "\n","OK! Now processing library $opts{d}!", "\n";
    
  }
  
  foreach my $file1 (@files) {
  
    open (IN,"<",$file1);
    my $firstline = <IN>;
    #$InID = $1 if ($firstline =~ m/^\@([^:]+):/ );
    #print "can not find instrument ID!\n" if ($firstline !~ m/^\@[^:]+:/);
    #exit if ($firstline !~ m/^\@[^:]+:/);
    #close IN;
    my $seqline = <IN>;
    my $readLength = length $seqline;
    close IN;


    my $file2 = $file1;
    $file2 =~ s/_R1.f/_R2.f/;
    my $lib = $1 if basename($file1) =~ m/(\S+)_R[1|2]\.f/i;   
    
    ### use bfc to do error correction 
    if ($rr == 1) {
      my $inter = $dir . $lib . "_inter.fq";
      system ("seqtk mergepe $file1 $file2 > $inter");
      
      my $bfc_cor = $dir . $lib . "_inter_bfccor.fq";
      system ("bfc -s $gz -t $cpu $inter > $bfc_cor");
      my $r1 =  $dir . $lib . "_R1.fq1";
      my $r2 =  $dir . $lib . "_R2.fq1";
      open (IN, "<", $bfc_cor);
      open (OUT1, ">", $r1);
      open (OUT2, ">", $r2);
      unlink ($inter, $bfc_cor);
      while (<IN>) {
	chomp (my $line = $_);
	if ($line =~ m/^\@/) {
	  chomp (my @a = split /\s+/, $line);
	  chomp (my $seq1 = <IN>);
	  chomp (my $quali1 = <IN>);
	  chomp (my $qual1 = <IN>);
	  
	  chomp (my $next = <IN>);
	  chomp (my @b = split /\s+/, $next);
	  chomp (my $seq2 = <IN>);
	  chomp (my $quali2 = <IN>);
	  chomp (my $qual2 = <IN>);
	  
	  print OUT1 $a[0], "\n";
	  print OUT1 $seq1 , "\n";
	  print OUT1 $quali1, "\n";
	  print OUT1 $qual1, "\n";
	  
	  print OUT2 $b[0], "\n";
	  print OUT2 $seq2, "\n";
	  print OUT2 $quali2, "\n";
	  print OUT2 $qual2, "\n";
	  
	}
      }
      close IN;
      close OUT1;
      close OUT2;
      system ("mv $r1 $file1");
      system ("mv $r2 $file2");
    }


    ## use Rcorrector for error correction
    if ($rr == 2) {
      my $run = $rc . "run_rcorrector.pl";
      system ("perl $run -1 $file1 -2 $file2 -k 31 -t $cpu -od $dir");
      my $r1 =  $dir . $lib . "_R1.cor.fq";
      my $r2 =  $dir . $lib . "_R2.cor.fq";
      open (IN1, "<", $r1);
      open (IN2, "<", $r2);
       
      my $R1 =  $dir . $lib . "_R1.cor.fq1";
      my $R2 =  $dir . $lib . "_R2.cor.fq1";
      open (OUT1, ">", $R1);
      open (OUT2, ">", $R2);
      
      unlink ($r1, $r2);
      while (<IN1>) {
	chomp (my $line = $_);
	if ($line =~ m/^\@/) {
	  chomp (my @a = split /\s+/, $line);
	  chomp (my $seq1 = <IN1>);
	  chomp (my $quali1 = <IN1>);
	  chomp (my $qual1 = <IN1>);
	  
	  print OUT1 $a[0], "\n";
	  print OUT1 $seq1 , "\n";
	  print OUT1 $quali1, "\n";
	  print OUT1 $qual1, "\n";
	}
      }
      close IN1;
      close OUT1;

      while (<IN2>) {
	chomp (my $line = $_);
	if ($line =~ m/^\@/) {
	  chomp (my @a = split /\s+/, $line);
	  chomp (my $seq2 = <IN2>);
	  chomp (my $quali2 = <IN2>);
	  chomp (my $qual2 = <IN2>);
	  
	  print OUT2 $a[0], "\n";
	  print OUT2 $seq2 , "\n";
	  print OUT2 $quali2, "\n";
	  print OUT2 $qual2, "\n";
	}
      }
      close IN2;
      close OUT2;
      
      system ("mv $R1 $file1");
      system ("mv $R2 $file2");
    }
   
    my $start1 = time;	
    if ($rd == 1) {
      my $rawdr = $dir . "orginal/";
      mkdir $rawdr unless -e $rawdr;
      
      my ($file1, $file2) = remove_dup($file1, $file2, $lib, $dir, $rawdr);
    }
    
    my $time1 = int((time - $start1)/60);  
    if ($rd == 0 ) {
      print "All duplicates are kept...\n";
    }
    if ($rd == 1 ) {
      print "Found duplicates in $lib in $time1 minutes! All duplicates will be removed...\n";
    }
    
   # my $start2 = time;
   # my $low = $outdir . $lib . '.lowComplexity.out';
   # removeLowComplexity($file1,$low,$nper,$aper);
   # removeLowComplexity($file2,$low,$nper,$aper);
   # my $time2 = int((time - $start2)/60);
   # print "Found low complexity reads in $lib in $time2 minutes! Whew, almost there!\n";
    
    my $start3 = time;
    my %reads = ('1' => $file1, '2' => $file2);
    #my $ad = getAdaptors($lib,$adaptorFile,$libInfo,$index,$uniad); 

    my ($outp1, $outp2);
    my ($outpair1, $outpair2, $merge);
    my ($out1, $out2);
    
    if ($opts{n} && $opts{b}) {
      print "can only use -b or -n, not both!\n";
      exit;
    }
      
    
    ###TruseqPE only
    if (!$opts{n}) {
      #($outp1, $outp2) = skewer ($dir, $lib, $file1, $file2, $P1, $P2, $minLength, $cpu);
      ($outp1, $outp2) = cutadatpair ($cutadapt, $dir, $lib, $file1, $file2, "i");
      ($outpair1, $outpair2, $merge) = trimmomaticpair ($lib, $pe1, $pe2, $pe1_rc, $pe2_rc, $pe1, $pe2, $outp1,  $outp2, 'trim1', $trimmomatic, $minLength, $cpu, $quality);
    }


    ###NexteraPE only
    if ($opts{n}) {
      #($outp1, $outp2) = skewer ($dir, $lib, $file1, $file2, $nextpe, $nextpe, $minLength, $cpu);
      ($outp1, $outp2) = cutadatpair ($cutadapt, $dir, $lib, $file1, $file2, "n");
      ($outpair1, $outpair2, $merge) = trimmomaticpair ($lib, $nextpe, $nextpe, $nextse1, $nextse2, $nextse3,$nextse4,$outp1, $outp2, 'trim1', $trimmomatic, $minLength, $cpu, $quality);
    }

    ###both NexteraPE and TruseqPE ##
    #if (!$opts{n} && $opts{b}) {  
    #  $outp1 = skewersingle ($dir, $lib, $file1, $P1, $P2, $P1rc,  $P2rc,  $nextse1, $nextse2, $nextse3,$nextse4, $minLength, $cpu, "1");
    #  $out1 = trimmomatic ($lib, $P1, $P2, $P1rc,  $P2rc, $nextse1, $nextse2, $nextse3,$nextse4, $outp1, $outp1,'trim1',$trimmomatic,$minLength, $cpu, $quality);     
    #  $outp2 = skewersingle ($dir, $lib, $file2, $P1, $P2, $P1rc,  $P2rc,  $nextse1, $nextse2, $nextse3,$nextse4, $minLength, $cpu,"2");
    #  $out2 = trimmomatic ($lib, $P1, $P2, $P1rc,  $P2rc, $nextse1, $nextse2, $nextse3,$nextse4, $outp2, $outp2,'trim2',$trimmomatic,$minLength, $cpu, $quality);
    #  $outpair1 = $file1 . ".paired";
    #  $outpair2 = $file2 . ".paired";
    #  $merge = $file1 . ".u";
    #  fixmatepair ($out1, $out2, $outpair1, $outpair2, $merge);      
    #}    

    %reads = ('1' => $outpair1, '2' => $outpair2, 'u' => $merge);
    
    my $unmerged1;
    my $unmerged2;
    if ($nomerge == 1) {
      $unmerged1 = $outdir . $lib ."_unmerged_R1.fastq";
      $unmerged2 = $outdir . $lib ."_unmerged_R2.fastq";
      system ("cp $outpair1 $unmerged1 ");
      system ("cp $outpair2 $unmerged2 ");     
    }
    
    my $reads4 = mergeReads($lib,\%reads,\%reads,"merge",$readLength,$flash, $opts{g},$opts{e});
    
    if ($opts{A}) { ###MUSEUM SAMPLES!
      my %newreads1 = %{$reads4};
      my $read1 = $newreads1{'1'};
      my $read2 = $newreads1{'2'};
      my $readu = $newreads1{'u'};
      
      my $ref = '';
      if ($opts{B}) { ##popgen projects!!
	$ref = $opts{B} || die "can not find reference genome for alignment!\n";
      }
      if ($opts{C}) { ##phylogenetics projects!!
	my $refsdir = redir ($opts{C});
	$ref = $refsdir . $lib . "_targetedRegionAndFlanking.fasta"  || die 'can not find reference genome for alignment! The name must be ended with "_targetedRegionAndFlanking.fasta"!\n';
      }   
      my $refname = $1 if basename ($ref) =~ /(\S+)\.fa/;
      my $nix = $refname  . ".nix";
      system ("novoindex $nix $ref");

      my $pairedSam =  $outdir . $lib ."_paired.sam";
      my $seSam = $outdir . $lib ."_unpaired.sam";
      ###paired
      system ("novoalign -o FullNW -t $opts{D} -d $nix -f $read1 $read2 -F STDFQ -n $readLength -i PE 150, 100 -o SAM  > $pairedSam");
      ###unpaired
      my $rr = $readLength  * 2;
      system ("novoalign -o FullNW -t $opts{D} -d $nix -f $readu  -F STDFQ -n $rr  -o SAM  > $seSam");
      
      my $rescaledBamPe = $outdir . $lib ."_rescaled_paired.bam";
      my $rescaledBamSe = $outdir . $lib ."_rescaled_se.bam";
      
      
      system ("mapDamage --forward --rescale  --rescale-out=$rescaledBamPe --seq-length=50  -i $pairedSam -r $ref");
      system ("mapDamage --rescale  --rescale-out=$rescaledBamSe --seq-length=50  -i $seSam -r $ref");

      $reads4 = extractreads(\%newreads1, $rescaledBamPe, $rescaledBamSe,  $ref);
      
      unlink ($nix, $pairedSam,$seSam, $rescaledBamPe, $rescaledBamSe);
      my $resultfolder =  "results_" . $lib . "_combined/";
      system ("rm -r $resultfolder") unless $opts{S};	
    }
         
    my $start4 = time;
    my $contaminants = $outdir . $lib . '.contam.out';
    removeContamination($reads4,$contam,$contaminants,$bowtie,'pe', $cpu,  $mismat);
    my $time4 = int((time - $start4)/60);
    print "Removed contamination in $lib in $time4 minutes! It's going...\n";	
    
    makeFinal($outdir,$lib,$reads4,$contaminants);

    if ($nomerge == 1) {     
      my %junk;
      
      #open(IN, "<$low");
      #while(<IN>){
#	chomp(my $line = $_);
#	$junk{$line}++;
 #     }
  #    close(IN);
      open(IN, "<$contam");
      while(<IN>) {
	chomp(my $line = $_);
	$junk{$line}++;
      }
      close(IN);
      
      my $new1 = $outdir . $lib . "_unmerged_1_final.fq";
      my $new2 = $outdir . $lib . "_unmerged_2_final.fq";
  
      my %new = ('1' => $new1, '2' => $new2);
      my %reads =  ('1' => $unmerged1, '2' => $unmerged2);
      foreach my $type (keys %reads) {
	open(OUT, ">$new{$type}");
	open(IN, "<$reads{$type}");
	while(<IN>) {
	  chomp(my $line = $_);
	  my @line =split (/\t+/, $line);
	  if (scalar(@line) == 1) {
	    if ($line[0] =~ m/^@(\S+)\/[1|2]$/) {
	      my $id = $1;
	      my $seq = <IN>; my $qualid = <IN>; my $qual = <IN>;
	      unless($junk{$id}){
		print OUT $line, "\n", $seq,$qualid,$qual;
	      }
	    }
	  }	    
	}
	close(IN); close(OUT);
      }
      unlink ( $unmerged1,  $unmerged2);      
    }    
    system("rm $dir*$lib*paired*");
    system("rm $dir*$lib*.u");
    system("rm $dir*$lib*trimmed*");
  }
  
  if ($opts{z}) {
    print "\n","start evaluating ... OH WOW! Your sequence data are looking much better now!" , "\n\n";
    my @clean;
    
    if ($opts{d} eq "all") {
      @clean = < $outdir*_final.fq> ;
    }
    
    else {
      @clean = <$outdir$opts{d}*_final.fq>; 
    }
    
    my $resdir =  $outdir.'evaluation/';
    mkdir $resdir unless -e $resdir;
    foreach (<@clean>) {
      my $lib = $1 if basename($_) =~ m/(\S+)_[1|2|u]_final.fq/; 
      my $call1 = system("fastqc -t 2 $_ -o $resdir");
      die(qq/\nThe program "fastQC" is not in path! \n\n/) if ($call1 == -1);
    
      system ("rm $resdir$lib*fastqc.zip");
    }
    
  }	
  print "\n","Congratulations! The cleanup step is done... Now what?" , "\n\n"; 
}

sub cutadatpair {
  my ($cutadapt, $dir, $lib, $file1, $file2, $type) = @_;
  my $outp1round1 = $dir . $lib . "_read1_cutadapt1.fq";
  my $outp2round1 = $dir . $lib . "_read2_cutadapt1.fq";
  my $outp1round2 = $dir . $lib . "_read1_cutadapt2.fq";
  my $outp2round2 = $dir . $lib . "_read2_cutadapt2.fq";
  if ($type eq "i") {
    system ("$cutadapt -b AGATCGGAAGAGC -B AGATCGGAAGAGC -n 5 -f fastq -o  $outp1round1 -p $outp2round1 $file1 $file2");
    system ("$cutadapt -b GCTCTTCCGATCT -B GCTCTTCCGATCT -n 5 -f fastq -o  $outp1round2 -p $outp2round2 $outp1round1 $outp2round1");
  }
  if ($type eq "n") {
    system ("$cutadapt -b AGATGTGTATAAGAGACAG -B AGATGTGTATAAGAGACAG -n 5 -f fastq -o  $outp1round1 -p $outp2round1 $file1 $file2");
    system ("$cutadapt -b CTGTCTCTTATACACATCT -B CTGTCTCTTATACACATCT -n 5 -f fastq -o  $outp1round2 -p $outp2round2 $outp1round1 $outp2round1");
  }
  
  unlink ($outp1round1, $outp2round1);
  return ($outp1round2, $outp2round2);
}


sub  extractreads {
  my ($newreads1, $rescaledBamPe, $rescaledBamSe, $ref) = @_;
  ##$reads4 = extractreads(\%newreads1, $rescaledBamPe, $rescaledBamSe,);
  my %newreads1 = %{$newreads1};
  my $read1 = $newreads1{'1'};
  my $read2 = $newreads1{'2'};
  my $readu = $newreads1{'u'};
  system (" bedtools bamtofastq -i $rescaledBamPe -fq $read1 -fq2 $read2");
  system (" bedtools bamtofastq -i $rescaledBamSe -fq $readu");
  my %reads = ('1' => $read1,'2' => $read2, 'u' => $readu); 
 
  return (\%reads);
}

sub skewer {
 my ($outdir, $lib, $file1, $file2,$P1, $P2, $minLength, $cpu) = @_;
 my $outp1 = $outdir . $lib . "_R1-trimmed-pair1.fastq";
 my $outp2 = $outdir . $lib . "_R1-trimmed-pair2.fastq";

 system ("skewer -t $cpu  -x $P1 -y $P2 $file1 $file2 -m pe -l $minLength -f sanger ");
 #system ("rm $outdir*$lib*masked*");
 return ($outp1, $outp2);
}

sub skewersingle {
  my ($outdir, $lib, $file1, $P1,$P2, $P1rc,$P2rc, $n1,$n2,$n3,$n4, $minLength, $cpu, $ps) = @_;
  my $outp1 = $outdir . $lib . "_R". $ps. "-trimmed.fastq";

  my $adfile = $file1 . ".adfile.fa";
  
  open(OUT, ">" ,$adfile);
  print OUT  ">adapter1", "\n";
  print OUT $P1, "\n";
  print OUT ">adapter2", "\n";
  print OUT $P2, "\n";
  print OUT  ">adapter3", "\n";
  print OUT $P1rc, "\n";
  print OUT ">adapter4", "\n";
  print OUT $P2rc, "\n";
  if ($n1 ne '0') {
    print OUT  ">adapter5", "\n";
    print OUT $n1, "\n";
    print OUT ">adapter6", "\n";
    print OUT $n2, "\n";
    print OUT  ">adapter7", "\n";
    print OUT $n3, "\n";
    print OUT ">adapter8", "\n";
    print OUT $n4, "\n";
  }
  close OUT;
  
  my $outp1round1 = $outdir . $lib . "_read1_cutadapt1.fq";

  system ("cutadapt -b AGATCGGAAGAGC -n 5 -f fastq -o  $outp1round1 $file1 ");
  system ("cutadapt -b GCTCTTCCGATCT -n 5 -f fastq -o  $outp1 $outp1round1");

  #system ("skewer -t $cpu  -x $adfile  $file1  -m any -l $minLength -f sanger ");
  system ("rm  $adfile");
  
  unlink ($outp1round1);
  
  return ($outp1);
}

sub makeFinal {
  my ($outdir, $lib,$reads,$contam) = @_;
  
  my %junk;
  
  #open(IN, "<$low");
  #while(<IN>){
  #  chomp(my $line = $_);
  #  $junk{$line}++;
  #}
  #close(IN);
  open(IN, "<$contam");
  while(<IN>) {
    chomp(my $line = $_);
    $junk{$line}++;
  }
  close(IN);
  
  my $new1 = $outdir . $lib . "_1_final.fq";
  my $new2 = $outdir . $lib . "_2_final.fq";
  my $newu = $outdir . $lib . "_u_final.fq";
  my %new = ('1' => $new1, '2' => $new2, 'u' => $newu);
  my %reads = %{$reads};
  foreach my $type (keys %reads) {
    open(OUT, ">$new{$type}");
    open(IN, "<$reads{$type}");
    while(<IN>) {
      chomp(my $line = $_);
      my @line =split (/\t+/, $line);
      if (scalar(@line) == 1) {
	if ($line[0] =~ m/^@(\S+)\/[1|2]$/) {
	  my $id = $1;
	  my $seq = <IN>; my $qualid = <IN>; my $qual = <IN>;
	  unless($junk{$id}){
	    print OUT $line, "\n", $seq,$qualid,$qual;
	  }
	}
      }	    
    }
    close(IN); close(OUT);
  }
}

sub getAdaptors {
  my ($lib,$adaptorFile,$libInfo,$index,$uniad) = @_;
  
  my %lib;
  open(IN, "<$libInfo");
  
  my $header = <IN>;
  
  while(<IN>) {
    chomp(my $line = $_);
    my @d = split(/\s+/,$line);
    next if $d[0] =~ /^$/;
    if ($d[2]) {
      $lib{$d[0]} = {'P7'=>$d[1], 'P5'=>$d[2]};
    }
    if (!$d[2]) {
      $lib{$d[0]} = {'P7'=>$d[1]};
    }
  }
  close(IN);
  
  my %P7;
  my %P5;
  
  
  open(IN, "<$adaptorFile");
  while(<IN>) {
    chomp(my $line = $_);
    next if $line =~ /^$/;
    if ($line =~ m/>P7_index(\d+)/) {
      my $bc = $1;
      chomp(my $seq = <IN>);
      $P7{$bc} = $seq;
    }
    if ($line =~ m/>P5_index(\d+)/) {
      my $bc2 = $1;
      chomp(my $seq2 = <IN>);
      $P5{$bc2} = $seq2;
    }
    
  } 
  close(IN);	  
  my %ad;
  if ($index == 1) {
    %ad = ("uni" => $uniad, "uni_rc" => rc($uniad), "index" => $P7{$lib{$lib}{'P7'}}, "index_rc" => rc($P7{$lib{$lib}{'P7'}}));
    die(qq/\nHmm...I could not find the adapter sequences for $lib ... Check the naming of the adapter sequences in "-a". \n\n/) if (!$ad{"index"});
  }
  if ($index == 2) {
    %ad = ("uni" =>$P5{$lib{$lib}{'P5'}}, "uni_rc" => rc($P5{$lib{$lib}{'P5'}}), "index" => $P7{$lib{$lib}{'P7'}}, "index_rc" => rc($P7{$lib{$lib}{'P7'}}));
    die(qq/\nHmm...I could not find the adapter sequences for $lib ... Check the naming of the adapter sequences in "-a". \n\n/) if (!$ad{"index"} || !$ad{"uni"});
    
  } 
  
  return(\%ad);
}

sub rc {
  my ($seq) = @_;
  my $rc = $seq;
  $rc = reverse($rc);
  $rc =~ tr/ATGCatgc/TACGtacg/;
  return($rc);
}

sub removeContamination {
  my ($reads,$contam,$contaminants, $bowtie, $suf, $cpu,  $mismat) = @_;
  my %reads = %{$reads};
  unless (-f $contam . ".3.bt2") {
    my $bw2build = $bowtie . "-build";
    my $call1 = system("$bw2build $contam $contam");
  }
  if ($suf eq 'pe')  {    
    my $contamout1 = $reads{'1'} . ".contam.sam1";
    my $call2 = system("$bowtie -x $contam -1 $reads{'1'} -2 $reads{'2'} --fast -S $contamout1 -p $cpu --sam-nohead --sam-nosq");
    die(qq/\nThe program "bowtie2" is not in path! \n\n/) if ($call2 == -1 );
    my $contamout2 = $reads{'1'} . ".contam.sam2";
    my $call3 = system("$bowtie -x $contam -U $reads{'u'} --fast -S $contamout2 -p $cpu --sam-nohead --sam-nosq");
    my $contamout_all = $reads{'1'} . ".contam_all.sam";
    system ("cat $contamout1 $contamout2 > $contamout_all");
   
    parseSAM($contaminants,$contamout_all, $mismat);
    system ("rm $contamout1 $contamout2");  
  }
  if ($suf eq 'se') {
    
    my $contamout1 = $reads{'1'} . "contam.sam1";  
    my $call3 = system("$bowtie -x $contam -U $reads{'1'} --fast -S $contamout1 -p $cpu --sam-nohead --sam-nosq");
    parseSAM ($contaminants,$contamout1, $mismat);
    system ("rm $contamout1 ");
  }
  
}
#1H56M1D43M

sub parseSAM {
  my ($contaminants,$contam,  $mismat) = @_;
  open(OUT, ">$contaminants");
  open(IN, "<$contam");
  
  while(<IN>) {
    chomp(my $line = $_);
    my @d = split(/\t/,$line);
    if ($d[2] !~ m/\*/) {
      if ($d[5] =~ m/\d+M/) {
	my @sum = ($d[5] =~ m/(\d+)/);
	my $length = sum (@sum);
	my $md = $1 if $line =~ m/(MD\:Z\S+)/;
	my @a = ($md =~ m/([ATGC])/g);
	if (scalar(@a)/$length < 1 - $mismat) {
	  $d[0] =~ s/\/[1|2]$//;
	  print OUT $d[0], "\n";
	}
      }
    }
  }
  close(IN); close(OUT);  
  #unlink($contam);
}       


sub remove_dup {
  my ($file1, $file2,$lib,$dir, $newdir) = @_;
  
  system ("super_deduper -1 $file1 -2 $file2 -p $dir" . "$lib -s 5 -l 10");
  system ("mv $file1 $file2 $newdir");
  my $newfile1 = $dir . $lib . "_nodup_PE1.fastq";
  my $newfile2 = $dir . $lib . "_nodup_PE2.fastq";
  system ("mv $newfile1 $file1");
  system ("mv $newfile2 $file2");
  return ($file1, $file2);
}


sub removeLowComplexity {
  my ($file,$low,$nper,$aper) = @_;
  open(IN, "<$file");
  open(OUT, ">>$low");
  while(<IN>) {
    chomp(my $line = $_);		
    if ($line =~ m/^@(\S+)\/[1|2]$/) {
      my $id = $1;
      chomp(my $seq = <IN>);
      my $n = int($nper*length($seq));
      my $a = int($aper*length($seq));
      my $ncounter = ($seq =~ m/N/g);
      if ($seq =~ m/[A]{$a}/i || $seq =~ m/[T]{$a}/i || $seq =~ m/[G]{$a}/i || $seq =~ m/[C]{$a}/i || $ncounter >= $n) {
	print OUT $id, "\n";
      }
    }
  }	
  close(IN); close(OUT);	
}

sub mergeReads {
  my ($lib,$orig,$reads,$base, $readLength, $flash,$g, $e) = @_;
  #($lib,\%reads,\%reads,"trim2",$readLength,$flash, $opts{g},$opts{e});
  my %reads = %{$reads};
  
  my $newread1 = $orig->{'1'} . '_' . $base . '_p1';
  my $newread2 = $orig->{'2'} . '_' .$base .'_p2';
  my $newreadu = $orig->{'1'} . '_' .$base .'_u';
  
  my $call1 = system("$flash $reads{'1'} $reads{'2'} -M $readLength -m 10 -x $g  --allow-outies  -o $lib");
  die(qq/\nThe program "flash" is not in path! \n\n/) if ($call1 == -1 );
  
  open (EXTEND,"<",  $lib . ".extendedFrags.fastq");
  open (NEW, ">",  $lib . ".extendedFrags.fastq1");
  while (<EXTEND>) {
    chomp(my $line = $_);	
    if ($line =~ m/^@(\S+)/) {
      my $new_line =  $line . "/1";
      my $seq =<EXTEND>;
      my $qualid = <EXTEND>;
      my $qual = <EXTEND>;
      print NEW $new_line , "\n" , $seq , $qualid , $qual;
    } 
  }
  close EXTEND;
  close NEW;
  my $call2 = system("cat $reads{'u'} $lib\.extendedFrags.fastq1 > $newreadu");
  my $call3 = system("mv $lib\.notCombined_1.fastq $newread1");
  my $call4 = system("mv $lib\.notCombined_2.fastq $newread2");
  my $call5 = system("rm $lib\.extendedFrags.fastq $lib\.extendedFrags.fastq1  $lib\.hist*");
  
  my %newreads = ('1' => $newread1,'2' => $newread2, 'u' => $newreadu);
  return(\%newreads);
}

sub reallyMergeReads {
  my ($lib,$orig,$reads,$base1,$base2) = @_;	
  
  my %reads = %{$reads};	
  my $newread1 = $orig->{'1'} . '_' . $base1 . '_p1';
  my $newread2 = $orig->{'2'} . '_' .$base1 .'_p2';
  my $newreadu = $orig->{'1'} . '_' .$base1 .'_u';
  my $call1 = system("cope -a $reads{'1'} -b $reads{'2'} -o $lib\.copemerged -2 $newread1 -3 $newread2 -m 0 -l 10 -c 0.99 -s 33");
  die(qq/\nThe program "cope" is not in path! \n\n/) if ($call1 == -1 );
  
  my $newerread1 = $orig->{'1'} . '_' . $base2 . '_p1';
  my $newerread2 = $orig->{'2'} . '_' .$base2 .'_p2';
  my $newerreadu = $orig->{'1'} . '_' .$base2 .'_u';
  my %newerreads = ('1' => $newerread1,'2' => $newerread2, 'u' => $newerreadu);
  
  open (COPE, "<", $lib . ".copemerged");
  open (COPE_NEW, ">", $lib . ".copemerged1");  
  while (<COPE>) {
    chomp(my $line = $_);
    my @f = split /\s+/,$line;
    if (scalar(@f) == 4) {
      if ($f[1] =~ m/^(\S+)(\/1)(_\S+)\/[1|2]$/) {
	my $full_id = $1 . $2 . $3;
	my $seq = <COPE>; my $qualid = <COPE>; my $qual = <COPE>;
	print COPE_NEW "@",$1,$2,"\n", $seq,$qualid,$qual;
	
      }
    }
    if (scalar(@f) == 2) {
      if ($f[0] =~ m/^@(\S+)(\/1)(_\S+)\/[1|2]$/) {
	my $full_id = $1 . $2 . $3;
	my $seq = <COPE>; my $qualid = <COPE>; my $qual = <COPE>;
	print COPE_NEW "@",$1,$2,"\n", $seq,$qualid,$qual;	
      }
    }
  }
  close COPE;
  close COPE_NEW;
  
  my $call3 = system("cat $lib\.copemerged1 $reads{'u'} > $newerreadu");
  my $call4 = system("mv $newread1 $newerread1");
  my $call5 = system("mv $newread2 $newerread2");
  my $call6 = system("rm $lib\.copemerged $lib\.copemerged1 ");
  return(\%newerreads);
}

sub fixmatepair {
  my ($out1, $out2, $outpair1, $outpair2, $merge) = @_;
  
  open (IN1, "<",$out1);
  open (IN2, "<",$out2);
  open(OUT1, ">$outpair1"); 
  open(OUT2, ">$outpair2"); 
  open(OUTU, ">$merge");

  my %pair;
  
  while(<IN1>) {
    chomp(my $line = $_);
    if ($line =~ m/^@(\S+)\/[1|2]$/) {
      $pair{$1}++;
      chomp(my $seq = <IN1>); chomp(my $qualid = <IN1>); chomp(my $qual = <IN1>);
    }
  }
  seek IN1, 0,0;	
  while(<IN2>) {
    chomp(my $line = $_);
    if ($line =~ m/^@(\S+)\/[1|2]$/) {
      $pair{$1}++;
      chomp(my $seq = <IN2>); chomp(my $qualid = <IN2>); chomp(my $qual = <IN2>);
    }
  }
  seek IN2, 0,0;

  while(<IN1>) {
    chomp(my $line = $_);
    if ($line =~ m/^@(\S+)\/[1|2]$/) {
      chomp(my $seq = <IN1>); chomp(my $qualid = <IN1>); chomp(my $qual = <IN1>);
      if ($pair{$1} == 2) { 
	print OUT1 $line, "\n",  $seq, "\n",  $qualid, "\n", $qual, "\n";
      }
      else {
	print OUTU $line, "\n",  $seq, "\n",  $qualid, "\n", $qual, "\n";

      }  
    }
  }
  close IN1;
  close OUT1;

  while(<IN2>) {
    chomp(my $line = $_);
    if ($line =~ m/^@(\S+)\/[1|2]$/) {
      chomp(my $seq = <IN2>); chomp(my $qualid = <IN2>); chomp(my $qual = <IN2>);
      if ($pair{$1} == 2) { 
	print OUT2 $line, "\n",  $seq, "\n",  $qualid, "\n", $qual, "\n";
      }
      else {
	print OUTU $line, "\n",  $seq, "\n",  $qualid, "\n", $qual, "\n";
	
      }  
    }
  }
  close IN2;
  close OUT2;
  close OUTU;
  unlink ($out1, $out2);

}

sub fixMatePair1 {
  my ($lib,$read,$readarray,$base) = @_;
  my @trim = @{$readarray};
  my %pair;	
  foreach my $reads (@trim) {
    open(IN, "<$reads");
    while(<IN>) {
      chomp(my $line = $_);
      if ($line =~ m/^@(\S+)\/[1|2]$/) {
	$pair{$1}++;
	chomp(my $seq = <IN>); chomp(my $qualid = <IN>); chomp(my $qual = <IN>);
      }
    }
    close(IN);	
  }
  my %reads = %{$read};
  my $out1 = $reads{'1'} . '_' . $base . '_p1';
  my $out2 = $reads{'2'} . '_' . $base . '_p2';
  my $outu = $reads{'1'} . '_' . $base . '_u';
  print $out1, "\n"; 
  print $out2, "\n";
  print $outu, "\n";
  open(OUT1, ">$out1"); 
  open(OUT2, ">$out2"); 
  open(OUTU, ">$outu"); 
  my %newpairs = ('1' => $out1, '2' => $out2, 'u' => $outu);
  foreach my $reads (@trim) {
    open(IN, "<$reads");
    my $file = $1 if $reads =~ m/_R(\d+)\./;
    while(<IN>) {
      chomp(my $line = $_);	
      if ($line =~ m/^@(\S+)\/[1|2]$/) {
	my $id = $1;
	my $seq = <IN>;
	my $qualid = <IN>;
	my $qual = <IN>;
	if ($pair{$id} == 2) {
	  if ($file == 1) {
	    print OUT1 $line . "\n" . $seq . $qualid . $qual;
	  }
	  else {
	    print OUT2 $line . "\n" . $seq . $qualid . $qual;
	  }
	}
	else {
	  print OUTU $line . "\n" . $seq . $qualid . $qual;
	}
      }	
    }
    close(IN);	
  }	
  close(OUT1); close(OUT2); close(OUTU);	
  return(\%newpairs);
}


sub trimmomaticpair {
  my ($lib, $P1, $P2, $a, $b, $c, $d, $read1, $read2, $base, $trimmomatic,$minLength,$cpu, $quality) = @_;
 
  my $outpair1 = $read1 . ".paired";
  my $outpair2 = $read2 . ".paired";
  my $outunpair1 = $read1 . ".unpaired";
  my $outunpair2 = $read2 . ".unpaired";
  my $merge = $read1 . ".u";
  
  my $adfile = $read1 . "_adfile.fa";
  open(OUT, ">" ,$adfile);
  print OUT  ">adapter/1", "\n";
  print OUT $P1, "\n";
  print OUT ">adapter/2", "\n";
  print OUT $P2, "\n";
  print OUT ">adapters1", "\n";
  print OUT $c, "\n";
  print OUT ">adapters2", "\n";
  print OUT $a, "\n";
  print OUT ">adapters3", "\n";
  print OUT $d, "\n";
  print OUT ">adapters4", "\n";
  print OUT $b, "\n";
  
  close OUT;
  
  my $call2 = system ("java -jar $trimmomatic PE -threads $cpu -phred33 $read1 $read2 $outpair1 $outunpair1 $outpair2 $outunpair2 ILLUMINACLIP:$adfile:2:30:10 SLIDINGWINDOW:4:$quality MINLEN:$minLength LEADING:3 TRAILING:3");
 
  system ("cat $outunpair1 $outunpair2 > $merge");
  
  unlink($adfile,$read1, $read2, $outunpair1, $outunpair2);
  return($outpair1, $outpair2, $merge);
}

sub trimmomatic {
  my ($lib,$P1,$P2, $P1rc,$P2rc, $in,$base,$suffix, $trimmomatic,$minLength, $cpu, $quality) = @_;
  my $out = $base . '_' . $suffix;	
  my $adfile = $in . "_adfile.fa";
  open(OUT, ">" ,$adfile);
  print OUT  ">single1", "\n";
  print OUT $P1, "\n";
  print OUT ">single2", "\n";
  print OUT $P2, "\n";
  print OUT ">single3", "\n";
  print OUT $P1rc, "\n";
  print OUT ">single4", "\n";
  print OUT $P2rc, "\n";
  
  close OUT;
  
  my $call1 = system("java -jar $trimmomatic SE -threads $cpu  -phred33 $in $out ILLUMINACLIP:$adfile:2:30:10 SLIDINGWINDOW:4:$quality MINLEN:$minLength LEADING:3 TRAILING:3");
  unlink($adfile);
  return($out);
}

sub cutadapt {
  my ($ad,$in,$base,$suffix, $cutadapt,$minLength) = @_;
  my $out  = $base . '_' . $suffix;	
  my $curRead = $in;
  my $tracker = 1;
  my %ad = %{$ad};
  foreach my $key (keys %ad) {
    my $out = $curRead . $tracker;
    my $call = system("$cutadapt -b $ad{$key} -O 4 -n 5 -e 0.15 -f fastq $curRead -o $out -m $minLength");
    die(qq/\nThe program "cutadapt" is not in path! \n\n/) if ($call == -1 );
    unlink($curRead) unless($curRead eq $in);
    $curRead = $out;
    $tracker++;
  }
  my $call2 = system("mv $curRead $out");
  return($out);
}

sub bowtie {
  my ($lib, $ad,$in,$base,$suffix,$bowtie,$minLength, $cpu) = @_;
  my $out  = $base . '_' . $suffix;	
  my $file = $lib."out.sam";
  
  my $adfile = $lib . "_adfile_norev.fa";
  open(OUT, ">$adfile");
  foreach my $name (keys %{$ad}) {
    print OUT ">", $name, "\n", $ad->{$name}, "\n" unless $name =~ m/rc/;
  }
  close(OUT);	
  
  my $bw2build = $bowtie . "-build";
  my $call1 = system("$bw2build $adfile $adfile");
  my $call2 = system("$bowtie --local -D 15 -R 2 -N 1 -L 10 -i S,1,0.75 -p $cpu -k 1 -x $adfile -U $in -S $file");
  die(qq/\nThe program "bowtie2" is not in path! \n\n/) if ($call2 == -1 );
  my $call3 = system("rm $adfile" . "*");
  
  open(IN, "<$file");
  open(OUT, ">$out");
  
  while(<IN>) {
    chomp(my $line1 = $_);
    my @d1 = split(/\t/,$line1);
    
    my $seq1 = $d1[9];
    my $qual1 = $d1[10];
    
    unless($line1 =~ m/^@/) {
      if ($line1 !~ m/\*/) {			
	if ($d1[5] =~ m/^(\d+)S\d+M$/) {
	  my $l = $1;
	  $seq1 = substr $seq1, 0, $l;
	  $qual1 = substr $qual1, 0, $l;
	}
	elsif ($d1[5] =~ m/^(\d+)M\d+S$/) {
	  my $start = $1;
	  $seq1 = substr $seq1, $start;
	  $qual1 = substr $qual1, $start;
	}
	else {
	  my @s;
	  while ($d1[5] =~ m/(\d+)S/g) {
	    push(@s,$1);
	  }
	  @s = sort {$a <=> $b} @s;
	  if ($s[$#s] >= $minLength) {	
	    if ($d1[5] =~ m/^(\S*)$s[$#s]/) {
	      my $match = $1;
	      my $length = $s[$#s];
	      my $start = 0;
	      while ($match =~ m/(\d+)/g) {
		$start += $1;
	      }
	      $seq1 = substr $seq1, $length;
	      $qual1 = substr $qual1, $length;	
	    }													
	  }
	  else {
	    $seq1 = 'N'; $qual1 = 'N';
	  }
	}
      }
      
      if (length($seq1) >= $minLength) {	
	print OUT "@" . $d1[0] . "\n" . $seq1 . "\n" . '+' . "\n" . $qual1 . "\n";	
      }
    }	
  }	
  unlink($file);
  close(IN); close(OUT);	
  return($out);
}


sub redir {
  my ($dir) = @_;
  my $out;
  if ($dir =~ m/\/$/ ){
    $out = $dir; 
  }
  else {
    $out = $dir . "/";
  }
  return ($out);
}


#now convert the raw data files to format compatible to downstream analyses
sub rawclean {
  my ($rawdir) = @_;
  my $Result_dir1 = $rawdir . 'combined/';
  mkdir $Result_dir1 unless -e  $Result_dir1;
  my $dir = $rawdir . 'pre-clean/';
  mkdir $dir unless -e  $dir;
  
  my @orig_files = <$rawdir*.gz>; 
  
  die(qq/\nHmm...Is it the right folder? \n\n/) if (scalar (@orig_files) == 0);
  my %lib;
  
  foreach (<@orig_files>) {   
    my $file = basename($_) =~ /(\S+)_L\d+_(R[1|2])_\d+\.fastq\.gz/;
    my $libs = $1. $2;
    $lib{$libs}++; 
  }
  
  @orig_files = <$rawdir*.gz>;

 foreach (<@orig_files>) {   
   my $file = basename($_) =~ /(\S+)_L\d+_(R[1|2])_\d+\.fastq\.gz/;
   my $lib_name = $1;
   my $d = $2;
   my $libs = $lib_name . $d;
   if ($lib{$libs}) {
     
     my $file2 = $rawdir . $lib_name .  '*'.  $d . '*' . '.fastq.gz';
     my $combined = $Result_dir1 . $lib_name . "_". $d . '.fastq.gz'; 

     system ("cat $file2 > $combined ");
     
   }
   delete $lib{$libs} if $lib{$libs};
 }
  
  my @merged_files = < $Result_dir1*.fastq.gz> ;
  
  foreach my $file (<@merged_files>) {
    my $out = $dir .  $1 .".fq" if basename($file)  =~ /(\S+_R[1|2])\.fastq\.gz/;
    my $redundancy = '^--$' ; 
    print "cleaning","\t",$file,"\n";
    if ($file =~ m/_R([1|2])\.fastq\.gz/) {
      if ($1 == 1) {
	system ("gunzip -c $file | grep -A 3 \'^@.* [^:]*:N:[^:]*:\'  | grep -v $redundancy | sed \'s/ 1:N:0:.*/\\/1/g\' > $out");
      }
      if ($1 == 2) {
	system ("gunzip -c $file | grep -A 3 \'^@.* [^:]*:N:[^:]*:\' | grep -v $redundancy | sed  \'s/ 2:N:0:.*/\\/2/g\' > $out");
      }  
    }
  }
  return ($dir);
}


sub cleanSE {
  die (qq/

Usage: scrubReads cleanPE [options]

External dependencies:
"skewer", "bowtie2", "fastQC","trimmomatic"

If read error correction is desired, 
"bfc" + "seqtk"(1.1-r91) 
or "Recorrector" is also needed

Options:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-f     DIR      Path to the folder of raw fastq files
-o     DIR      Path to the results folder
-t     FILE     Path to trimmoatic executable
-c     FILE     Contaminant file
-m     FLOAT    percent similarity to contaminant file to
                consider a read to be a contaminant
                [0.90]
-k     INT      Number of threads needed for bowtie2 alignment 
                (remove contamination), skewer(trimming adapters),
                trimmomatic (trimming adapters and low qual), read 
                error correction (for de novo assemblies) [4]
-q     INT      Quality trimming threhold [20]
-d     CHAR     The particular library that you like to process,
                if "all" is specified, all libraries in the folder
                (-f) will be processed sequenctially [all]
-r     FLOAT    Will get rid of reads with any runs of bases
                longer than r*read_length [0.5]
-h     INT      Trimmomatic trimming length cutoff [36]
-z              If z is supplied, use fastQC to evaluate
                cleaned sequence reads [null]
-v              print version

####  Error correction is only recommended for de novo assemblies  ####
####  DO NOT run error correction for alignment based analyses ####

-N     INT      error correction
                0 = no error correction -- default
                1 = using bfc (when number of reads is < 30 million)
                2 = using Rcorrector (when number of reads is >= 30 million)
                [0]
-T     CHAR     if N=2 provide full path to the Rcorrector folder
-G     INT      if N=1 approx genome size (k\/m\/g allowed)
                for transcriptomic data set this to 50m [50m] 


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


\n/) if !@ARGV;

my %opts = (f=>undef, o=>undef,t=>undef, c=>undef,  k=> 4, q=> 20,  d=>"all", h=>36, N=>0, T=>undef, G =>"50m", m=>0.9);
getopts('f:o:t:c:d:h:k:q:N:T:G:zv', \%opts);

  if ($opts{v}) {
    print "version 0.15\n";
    exit;
  } 
 
  my $rawdir = redir ($opts{f});
  my $outdir = redir ($opts{o});
 
  mkdir ($outdir) unless -d ($outdir);
  
  
  my $contam = $opts{c};
  die(qq/\nHmm...I could not find where the contaminant file is.. Check "-c"? \n\n/) unless -e $contam;
  
  #universal adapter P5 if no barcode is used in it
  my $uniad = 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT';
  
  my $P1 = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA';
  my $P2 = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC';
  my $P1rc = 'TACACTCTTTCCCTACACGACGCTCTTCCGATCT';
  my $P2rc = 'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT';
  
  
  my $trimmomatic = $opts{t};
  die(qq/\nHmm...I could not locate where the trimmomatic executable is.. Check "-t"? \n\n/) unless -e $trimmomatic;
  my $cpu = $opts{k};
  my $quality = $opts{q};
  my $cutadapt = 'cutadapt';
  my $bowtie = 'bowtie2';
  my $cope = 'cope';
  my $minLength = $opts{h}; 
  my $index = $opts{w};
  my $maxmis = $opts{g};
  my $mismat = $opts{m};
  
  my $rr = $opts{N};
  my $gz = $opts{G};
  my $rc = '';
  if ($rr == 2) {
    $rc = redir ($opts{T}) || die "can not locate Rcorrector!\n";
  } 
  
  my $dir = $rawdir . 'pre-clean/';
  if (! -s $dir) {
    rawclean ($rawdir);   
  }
 
  my @files;
  if ($opts{d} eq "all") {
    @files = <$dir*_R1.fq>; 
    die(qq/\nHmm...I do not know where you are hiding your raw sequence files... Check "-d" and\/or "-f"? \n\n/) if (scalar(@files) == 0);
    print "\n","OK! Now processing all data files!", "\n";
  }
  
  else {
    @files = <$dir*$opts{d}_R1.fq>; 
    die(qq/\nHmm...I do not know where you are hiding your raw sequence files... Check "-d" and\/or "-f"? \n\n/) if (scalar(@files) == 0);
    print "\n","OK! Now processing library $opts{d}!", "\n";
  }
  
  
  foreach my $file1 (@files) {
    open (IN,"<",$file1);
    my $firstline = <IN>;
    #$InID = $1 if ($firstline =~ m/^\@([^:]+):/ );
    #print "can not find instrument ID!\n" if ($firstline !~ m/^\@[^:]+:/);
    #exit if ($firstline !~ m/^\@[^:]+:/);
    #close IN;
    my $seqline = <IN>;
    my $readLength = length $seqline;
    close IN;
    
    my $lib = $1 if basename($file1) =~ m/(\S+)_R1\./i;   


    ### use bfc to do error correction 
    if ($rr == 1) {      
      my $bfc_cor = $dir . $lib . "_bfccor.fq";
      system ("bfc -s $gz -t $cpu $file1 > $bfc_cor");
      my $r1 =  $dir . $lib . "_R1.fq1";

      open (IN, "<", $bfc_cor);
      open (OUT1, ">", $r1);

      unlink ($bfc_cor);
      while (<IN>) {
	chomp (my $line = $_);
	if ($line =~ m/^\@/) {
	  chomp (my @a = split /\s+/, $line);
	  chomp (my $seq1 = <IN>);
	  chomp (my $quali1 = <IN>);
	  chomp (my $qual1 = <IN>);
	  
	  print OUT1 $a[0], "\n";
	  print OUT1 $seq1 , "\n";
	  print OUT1 $quali1, "\n";
	  print OUT1 $qual1, "\n";	  
	}
      }
      close IN;
      close OUT1;    
      system ("mv $r1 $file1");
    }

    ## use Rcorrector for error correction
    if ($rr == 2) {
      my $run = $rc . "run_rcorrector.pl";
      system ("perl $run -s $file1  -k 31 -t $cpu -od $dir");
      my $r1 =  $dir . $lib . "_R1.cor.fq";
      open (IN1, "<", $r1);    
      my $R1 =  $dir . $lib . "_R1.cor.fq1";
      open (OUT1, ">", $R1);

      unlink ($r1);
      
      while (<IN1>) {
	chomp (my $line = $_);
	if ($line =~ m/^\@/) {
	  chomp (my @a = split /\s+/, $line);
	  chomp (my $seq1 = <IN1>);
	  chomp (my $quali1 = <IN1>);
	  chomp (my $qual1 = <IN1>);
	  
	  print OUT1 $a[0], "\n";
	  print OUT1 $seq1 , "\n";
	  print OUT1 $quali1, "\n";
	  print OUT1 $qual1, "\n";
	}
      }
      close IN1;
      close OUT1;
      
      system ("mv $R1 $file1");

    }
    
    #################find low complexity#####################
    #my $start2 = time;
    #my $low = $outdir . $lib . '.lowComplexity.out';
    #removeLowComplexity($file1,$low,$nper,$aper); 
    #my $time2 = int((time - $start2)/60);
    #print "Found low complexity reads in $lib in $time2 minutes! Whew, almost there!\n";
    #########################################################
    
    ##############trim adapter and merging###################
    my $start3 = time;
    my %reads = ('1' => $file1);
    #my $ad = getAdaptors($lib,$adaptorFile,$libInfo,$index,$uniad);
    my $final1;  
    #my ($outp1) = skewersingle ($dir, $lib, $file1, $P1, $P2, $P1rc,  $P2rc, '0','0','0','0', $minLength, $cpu, "1");
    my $outp1 = cutadaptsingle ($cutadapt, $dir, $lib, $file1);
    my $out1 = trimmomatic($lib, $P1, $P2, $P1rc,  $P2rc, $outp1, $outp1, 'trim1',$trimmomatic,$minLength, $cpu, $quality);
                     
    $final1 = $file1 . '_cleaned1';
    my $call = system("mv $out1 $final1");
    
    
    my $time3 = int((time - $start3)/60);
    print "Trimmed in $lib in $time3 minutes! Whew, almost there!\n";
    #########################################################
    
    
    #################remove contaminaitons###################
    my $start4 = time;
    my $contaminants = $outdir . $lib . '.contam.out';
    my $suf = 'se';
    %reads = ('1' => $final1);
    removeContamination(\%reads,$contam,$contaminants,$bowtie,'se', $cpu, $mismat);
    my $time4 = int((time - $start4)/60);
    print "Removed contamination in $lib in $time4 minutes! It's going...\n";	
    #########################################################
    
    
    ######################make final files###################
    makeFinalSE ($outdir,$lib,$final1,$contaminants);
    
    my $call_rm1 = system("rm $dir*$lib*clean*");
   # my $call_rm2 = system("rm $dir*$lib*trim*");
   
  }
  
  if ($opts{z}) {
    print "\n","start evaluating ... OH WOW! Your sequence data are looking much better now!" , "\n\n";
    my @clean;
    
    if ($opts{d} eq "all") {
      @clean = < $outdir*_final.fq> ;
    }
    
    else {
      @clean = <$outdir$opts{d}*_final.fq>; 
    }
    
    my $resdir =  $outdir.'evaluation/';
    mkdir $resdir unless -e $resdir;
    foreach (<@clean>) {
      my $lib = $1 if basename($_) =~ m/(\S+)_[1|2|u]_final.fq/; 
      my $call1 = system("fastqc -t 2 $_ -o $resdir");
      die(qq/\nThe program "fastQC" is not in path! \n\n/) if ($call1 == -1 );
      system ("rm $resdir$lib*fastqc.zip");
    }
    
  }	
  
  print "\n","Congratulations! The cleanup step is done... Now what?" , "\n\n"; 
  #############################################################
}


sub cutadaptsingle {
  my ($cutadapt, $dir, $lib, $file1) = @_;
  my $outp1round1 = $dir . $lib . "_read1_cutadapt1.fq";
  my $outp1round2 = $dir . $lib . "_read1_cutadapt2.fq";

  system ("$cutadapt -b AGATCGGAAGAGC -n 5 -f fastq -o  $outp1round1 $file1 ");
  system ("$cutadapt -b GCTCTTCCGATCT -n 5 -f fastq -o  $outp1round2 $outp1round1");

  unlink ($outp1round1);
  return ($outp1round2);
}

sub makeFinalSE {
  my ($outdir, $lib,$reads,$contam) = @_;
  
  my %junk;
 
 # open(IN, "<$low");
 # while(<IN>){
 #   chomp(my $line = $_);
 #   $junk{$line}++;
 # }
 # close(IN);
  
  open(IN, "<$contam");
  while(<IN>) {
    chomp(my $line = $_);
    $junk{$line}++;
  }
  close(IN);
  
  my $new1 = $outdir . $lib . "_1_final.fq";
      
      open(IN, "<", $reads);
      open(OUT, ">", $new1);
      while(<IN>) {
	chomp(my $line = $_);
	my @line =split (/\t+/, $line);
	if (scalar(@line) == 1) {
	  if ($line[0] =~ m/^@(\S+)\/1$/) {
	    my $id = $1;
	    my $seq = <IN>; my $qualid = <IN>; my $qual = <IN>;
	    unless($junk{$id}){
	      print OUT "@", $id , "/1", "\n", $seq,$qualid,$qual;
	    }
	  }
	}	    
      }
      close(IN); close(OUT);
}