############################################################################ # Copyright (c) 2011-2015 BIOLS, CAS # All Rights Reserved # See file LICENSE for details. ############################################################################ use strict; use Getopt::Long; $| = 1; open STDERR, ">>./CIRI_AS_error.log"; my $version = 'version 1.2'; my ($circ_list, $AS_out, $sam, $anno_gtf, $ref_dir, $ref_1file, $log, $help, $output_all, $mapq_uni, $mapq_both, $mapq_thres, $high_strigency, $low_strigency); Getopt::Long::GetOptions( 'ciri|C=s' => \$circ_list, 'out|O=s' => \$AS_out, 'sam|S=s' => \$sam, 'anno|A=s' => \$anno_gtf, 'ref_dir|R=s' => \$ref_dir, 'ref_file|F=s' => \$ref_1file, 'log|G=s' => \$log, 'output_all|D=s' => \$output_all, 'help|H!' => \$help, ); if(!defined($circ_list) and !defined($AS_out) and !defined($sam) and !defined($anno_gtf) and !defined($ref_dir) and !defined($ref_1file) and !defined($log) and !defined($help)){ # print "Please use the --help or -H option to get usage information.\n"; die; }elsif(defined($help)){ print "This is CIRI-AS $version, a detection tool for circRNA internal components and alternative splicing events. Welcome!\n\n"; print "Written by Yuan Gao. Any questions please mail to gaoyuan06\@mails.ucas.ac.cn.\n\n"; print "Usage of CIRI-AS:\n"; print "with annotation as input:\n perl CIRI_AS.pl -S in.sam -C in.ciri -O output -F ref.fa (-R ref_dir/) -A anno.gtf\n"; print "without annotation as input:\n perl CIRI_AS.pl -S in.sam -C in.ciri -O output -F ref.fa (-R ref_dir/)\n\n"; print "Arguments:\n"; print " --sam/-S\t\tinput SAM file (required; generated by BWA-MEM using PAIRED END mode)\n"; print " --ciri/-C\t\tinput circRNA list (required; generated by CIRI)\n"; print " --out/-O\t\tprefix of output files (required)\n"; print " --ref_dir/-R\t\tdirectory of reference sequences (Please make sure FASTA files in this directory are the same ones provided to CIRI. Either this argument or --ref-file/-F is required.)\n"; print " --ref_file/-F\t\tFASTA file of all reference sequences (Please make sure this file is the same one provided to CIRI. Either this argument or --ref-dir/-R is required.)\n"; print " --anno/-A\t\tGTF formatted annotation file (optional)\n"; print " --output_all/-D\tif output all processing info (Choose 'yes' would require more disk space. default: no)\n"; print " --log/-G\t\toutput log file name (optional)\n"; print " --help/-H\t\tshow help information\n"; die; } my $strigency; my $read_length; my @die_reason; my $if_PE = 1; my $exon_len4lib_len_est = 1000; my $if_die; my $min_intron = 70; my $U_test_strigency = 1; my $Z_alpha; if ($U_test_strigency == 1){ $Z_alpha = 1.6449; }elsif ($U_test_strigency == 2){ $Z_alpha = 2.3263; } if(!defined($sam)){ push @die_reason, "Please use --sam or -S option to designate input SAM alignment file!\n"; }elsif(!-e $sam or !-f $sam){ push @die_reason, "No SAM alignment file found at designated directory!\n"; }else{ my (%read_length_types, %read_12); open SAM, "<", $sam or die "CIRI-AS cannot open the SAM file: $!"; while (){ chomp; next if /^\[/ or /^[@]/; my @line = split /\t/; my $if_first_read = &ten2b($line[1],7); $read_12{$if_first_read}{$line[0]}++; if ( scalar(keys %{$read_12{$if_first_read}}) > 100 ){ delete $read_12{$if_first_read}{$line[0]}; last; } if ( $read_12{$if_first_read}{$line[0]} == 1){ $read_length_types{$if_first_read}{length($line[9])} ++; } } unless ( scalar(keys %{$read_12{'1'}}) >= 100 and scalar(keys %{$read_12{'0'}}) >= 100 ){ $if_PE = 0; } if(exists $read_length_types{'0'} and exists $read_length_types{'1'}){ my @read_length0 = keys %{$read_length_types{'0'}}; my @read_length1 = keys %{$read_length_types{'1'}}; #print scalar(keys %{$read_length_types{'0'}}),"\n"; #print scalar(keys %{$read_length_types{'1'}}),"\n"; if(@read_length0 == 1 and @read_length1 == 1 and $read_length0[0] == $read_length1[0]){ $read_length = $read_length0[0]; } } close SAM; } if(!defined($circ_list)){ push @die_reason, "Please use --circ_list or -C option to designate input circRNA list generated by CIRI!\n"; }elsif(!-e $circ_list or !-f $circ_list){ push @die_reason, "No circRNA list found at designated directory!\n"; } if(!defined($read_length) or $read_length < 40){ #print $read_length, "\n"; push @die_reason, "Please make sure the SAM is the same one used for circRNA detection by CIRI without modification!\n"; } if($if_PE == 0){ push @die_reason, "Please make sure the SAM was generated by BWA-MEM using PE mode and was not modified!\n"; } my ($AS_out_sam, $AS_out_list, $AS_out_java, $AS_out_sp, $AS_out_cov, $AS_out_clu, $AS_out_lib_len, $AS_out_cat, $log2); if(!defined($AS_out)){ push @die_reason, "Please use --out or -O option to designate output prefix!\n"; }elsif(-e $AS_out.'.list' and -f $AS_out.'.list'){ push @die_reason, "Output file $AS_out.list already exists!\n"; }else{ $AS_out_sam = $AS_out.'.sam'; $AS_out_list = $AS_out.'.list'; $AS_out_java = $AS_out.'_jav.list'; $AS_out_sp = $AS_out.'_splice.list'; $AS_out_cov = $AS_out.'_coverage.list'; $AS_out_clu = $AS_out.'_cluster.list'; $AS_out_lib_len = $AS_out.'_library_length.list'; $log = $AS_out.'.log' unless defined $log; $log2 = $AS_out.'.log2'; $AS_out_cat = $AS_out.'_AS.list'; } if( !defined($ref_dir) and !defined($ref_1file) ){ push @die_reason, "Please use --ref_dir/-R option to designate the directory of reference files, or use --ref_file/-F to designate one FASTA file recording all references!\n"; }elsif( defined($ref_dir) and (!-e $ref_dir or !-d $ref_dir) ){ push @die_reason, "Reference directory $ref_dir does not exist!\n"; }elsif( defined($ref_dir) and !-r $ref_dir ){ push @die_reason, "Reference directory $ref_dir is not readable!\n"; }elsif( defined($ref_1file) and (!-e $ref_1file or !-f $ref_1file) ){ push @die_reason, "Reference file $ref_1file does not exist!\n"; }elsif( defined($ref_1file) and !-r $ref_1file ){ push @die_reason, "Reference file $ref_1file is not readable!\n"; } my ($if_PE, $gff, $gtf); if (defined $ref_dir){ $ref_dir = substr($ref_dir, 0, length($ref_dir) - 1) if rindex($ref_dir, "/") == length($ref_dir) - 1; my @ref_file = <$ref_dir/*.fa $ref_dir/*.fasta>; if(@ref_file == 0){ push @die_reason, "No fasta file is found in designated refenece directory.\n"; $if_die = 1; } } if (defined $output_all and ($output_all !~ /^yes$/i or $output_all !~ /^y$/i) ){ $output_all = 'yes'; }elsif (defined $output_all and $output_all !~ /^no$/i or $output_all !~ /^n$/i){ $output_all = 'no'; }elsif (!defined $output_all){ $output_all = 'no'; }else{ push @die_reason, "CIRI-AS doesn't understand a value of $output_all for --output_all/-D.\n"; } if(defined($high_strigency) and defined($low_strigency)){ push @die_reason, "Sensitivity cannot be both high and low.\n"; }elsif(defined($high_strigency)){ $strigency = 3; }else{ $strigency = 1; } if(!defined($mapq_thres)){ $mapq_thres = 5; } if(!defined($mapq_uni)){ $mapq_uni = 0; } if(!defined($mapq_both)){ $mapq_both = 0; } my $if_anno; if(defined($anno_gtf) and (!-e $anno_gtf or !-f $anno_gtf)){ push @die_reason, "No annotation file found at designated directory!\n"; }elsif(defined($anno_gtf) and !($anno_gtf =~ /\.gff/ or $anno_gtf =~ /\.gtf/)){ push @die_reason, "Please provide .gff or .gtf format as annotation file!\n"; }elsif(defined($anno_gtf) and $anno_gtf =~ /\.gff/){ $gff = 1; $if_anno = 1; }elsif(defined($anno_gtf) and $anno_gtf =~ /\.gtf/){ $gtf = 1; $if_anno = 1; } if($gff == 1){ my $test_line = 1000; my $line_count = 0; open ANNO, "<", $anno_gtf or die "CIRI-AS cannot open the annotation file $anno_gtf: $!"; while(){ chomp; $line_count ++; my @line = split /\t/; if ($line[2] eq 'exon'){ if($line[8] =~ /^Parent=\w+\.\w*.*/){ $gff ++; last; }elsif($line[8] =~ /gene=\w+;.*transcript_id=\w+\.*/){ $gff += 2; last; } } if ($line_count >= $test_line and $gff == 1){ last; } } close ANNO; if ($gff == 1){ push @die_reason, "The GFF file provided cannot be understood by CIRI-AS! Please refer to manual for details of required GFF formats.\n"; } } if(@die_reason >= 1){ print @die_reason; }else{ #no if $] >= 5.018, "experimental::smartmatch"; my $timeA = time; ##[HASH:circ_range{chromosome}{base_coordinate}->ref(ARRAY:circ_id)]record each base within circRNA ranges from output of ciri; #[HASH:junction_read{read_id}->circ_id]record each junction read of circRNAs; #[HASH:circ_junc_reads{circ_id}{read_id}]record each ircRNA with its junction reads; #[HASH:circ_info{circ_id}->ref(ARRAY:circ_info)]record other info of each circRNA; #[HASH:circ_chr{chromosome}{circ_id}->#junction_read]record each circRNA according to its chromosome; open ASOUT, ">", $AS_out_list or die "CIRI-AS cannot write cirexon detection output $AS_out_list: $!"; open ASCAT, ">", $AS_out_cat or die "CIRI-AS cannot write alternative splicing detection output $AS_out_cat: $!"; open LOG, ">", $log or die "CIRI-AS cannot write log $log: $!"; if($output_all eq 'yes'){ open LOG2, ">", $log2 or die; #open ASSAM, ">", $AS_out_sam or die; open ASJAV, ">", $AS_out_java or die; open COV, ">", $AS_out_cov or die; open SPL, ">", $AS_out_sp or die; #open CLU, ">", $AS_out_clu or die; open LIBLEN, ">", $AS_out_lib_len or die; } my (%start_exon, %end_exon, %gene_exon, %length_exon, $min_exon_length, $max_exon_length, %long_exon4lib_len, $sample_chr_1st, %long_exon_range, $if_exp_mod, @library_lengths_all); ($min_exon_length, $max_exon_length) = (20, 2000); if($gtf == 1 or $gff >= 2){ print "Loading annotation from $anno_gtf...\n"; my $total_exon_num = 0; if($gtf == 1){ open ANNO, "<", $anno_gtf or die "CIRI-AS cannot open the annotation file $anno_gtf: $!"; while(){ chomp; my @line = split /\t/; if ($line[2] eq 'exon'){ my $locus = "$line[0]:$line[3]:$line[4]"; $start_exon{$line[0]}{$locus} = $line[3]; $end_exon{$line[0]}{$locus} = $line[4]; $length_exon{$locus} = $line[4] - $line[3] + 1; $total_exon_num ++; if ($line[-1] =~ /gene_id \"([^"]+)\"/){ $gene_exon{$line[0]}{$locus} = $1; }else { print LOG "ERROR: no gene ID was found for exon $locus:\n$_\n"; } } } close ANNO; }elsif($gff == 2){ open ANNO, "<", $anno_gtf or die "CIRI-AS cannot open the annotation file $anno_gtf: $!"; while(){ chomp; my @line = split /\t/; if ($line[2] eq 'exon'){ my $locus = "$line[0]:$line[3]:$line[4]"; $start_exon{$line[0]}{$locus} = $line[3]; $end_exon{$line[0]}{$locus} = $line[4]; $length_exon{$locus} = $line[4] - $line[3] + 1; $total_exon_num ++; if ($line[8] =~ /^Parent=(\w+\.\w*).*/){ $gene_exon{$line[0]}{$locus} = $1; }else { print LOG "ERROR: no gene ID was found for exon $locus:\n$_\n"; } } } close ANNO; }elsif($gff == 3){ open ANNO, "<", $anno_gtf or die "CIRI-AS cannot open the annotation file $anno_gtf: $!"; while(){ chomp; my @line = split /\t/; if ($line[2] eq 'exon'){ my $locus = "$line[0]:$line[3]:$line[4]"; $start_exon{$line[0]}{$locus} = $line[3]; $end_exon{$line[0]}{$locus} = $line[4]; $length_exon{$locus} = $line[4] - $line[3] + 1; $total_exon_num ++; if ($line[8] =~ /;gene=(\w+)/){ $gene_exon{$line[0]}{$locus} = $1; }else { print LOG "ERROR: no gene ID was found for exon $locus:\n$_\n"; } } } close ANNO; } print "$total_exon_num exons are successfully loaded.\n"; my @exon_sort_by_length = sort {$length_exon{$b} <=> $length_exon{$a}} (keys %length_exon); my @chr_sort_by_exon = sort {scalar(keys %{$start_exon{$b}}) <=> scalar(keys %{$start_exon{$a}})} (keys %start_exon); for my $chr(@chr_sort_by_exon){ $sample_chr_1st = $chr; my @exon_sort_by_loci_sample1 = sort {$start_exon{$sample_chr_1st}{$a} <=> $start_exon{$sample_chr_1st}{$b} or $end_exon{$sample_chr_1st}{$a} <=> $end_exon{$sample_chr_1st}{$b}} (keys %{$start_exon{$sample_chr_1st}}); for my $i(1 .. $#exon_sort_by_loci_sample1-1){ if ($start_exon{$sample_chr_1st}{$exon_sort_by_loci_sample1[$i+1]} > $end_exon{$sample_chr_1st}{$exon_sort_by_loci_sample1[$i]}){ if ($start_exon{$sample_chr_1st}{$exon_sort_by_loci_sample1[$i]} > $end_exon{$sample_chr_1st}{$exon_sort_by_loci_sample1[$i-1]}){ if ($length_exon{$exon_sort_by_loci_sample1[$i]} >= $exon_len4lib_len_est){ $long_exon4lib_len{$exon_sort_by_loci_sample1[$i]} = 1; for my $j($start_exon{$sample_chr_1st}{$exon_sort_by_loci_sample1[$i]} .. $end_exon{$sample_chr_1st}{$exon_sort_by_loci_sample1[$i]}){ $long_exon_range{$j} = $exon_sort_by_loci_sample1[$i]; } last if scalar(keys %long_exon4lib_len) >= 1000; } } } } if(scalar(keys %long_exon4lib_len) >= 100 or scalar(keys %long_exon4lib_len) >= .01*$total_exon_num){ print scalar(keys %long_exon4lib_len), " long exons (>=", $exon_len4lib_len_est, " bp) from chromosome $sample_chr_1st are recorded for library insert length estimation.\n"; last; }else{ %long_exon4lib_len = (); $sample_chr_1st = ''; } } if(scalar(keys %long_exon4lib_len) == 0){ print "No enough long exons were annotated in a single chromosome or contig. Library insert length estimation and correction for psi fail.\n"; } } my $timeB = time; print "Loading circRNAs from $circ_list...\n"; my (%circ_range, %junction_read, %circ_junc_reads, %circ_info, %circ_chr); open CIRC, "<", $circ_list or die "CIRI-AS cannot open $circ_list: $!"; while(){ chomp; my @line = split /\t/; next if $line[0] eq 'circRNA_ID'; my @reads = split ',', $line[-1]; for my $read (@reads){ $junction_read{$read} = $line[0]; $circ_junc_reads{$line[0]}{$read} = 1; } @{$circ_info{$line[0]}} = @line[1..9]; $circ_chr{$line[1]}{$line[0]} = $line[4]; } close CIRC; if(scalar(keys %circ_info) == 0){ print LOG "No circRNAs were loaded. Please make sure $circ_list is the circRNA list generated by CIRI!\nPrediction failed.\n"; print "No circRNAs were loaded. Please make sure $circ_list is the circRNA list generated by CIRI!\nPrediction failed.\n"; die; }elsif(scalar(keys %junction_read) == 0){ print LOG "No circular junction reads in circRNA list were loaded. Please make sure $circ_list is the circRNA list generated by CIRI!\nPrediction failed.\n"; print "No circular junction reads in circRNA list were loaded. Please make sure $circ_list is the circRNA list generated by CIRI!\nPrediction failed.\n"; die; }else{ print scalar(keys %circ_info), " circRNAs are loaded.\n"; } my $timeC = time; print "Clustering circRNAs according to their locations...\n"; #[HASH:circ_cluster_com{cluster_chr:cluster_start:cluster_end}->ref(ARRAY:circRNA)]record circRNAs into their loci clusters calculated according to range overlaps; #[HASH:circ_cluster_loci{cluster_chr:cluster_start:cluster_end}{'chr'},{'start'},{'end'}]record chr,loci_start, loci_end of loci clusters described above; #[HASH:circ_cluster_range{chromosome}{base_coordinate}->cluster_id]record each loci of each clusters described above; my (%circ_cluster_com, %circ_cluster_loci, %circ_cluster_range, %circ_cluster_id); my (%read_range, %read_mapq); my $circ_num_now = 0; &proc_bar($circ_num_now, scalar(keys %circ_info)); my $circ_cluster_id_total = 0; while( my ($chr, $circ_exp) = each %circ_chr ){ my @sort_circ = sort{$circ_info{$a}[1] <=> $circ_info{$b}[1] or $circ_info{$a}[2] <=> $circ_info{$b}[2]} keys %{$circ_exp}; my ($circ_cluster_end) = $circ_info{$sort_circ[0]}[2]; my ($circ_cluster_start) = $circ_info{$sort_circ[0]}[1]; my @putative_cluster; push @putative_cluster, $sort_circ[0]; for my $i(1 .. $#sort_circ){ #modified on 030715 from sort_circ[$i]}[1] < circ_cluster_end to sort_circ[$i]}[1] < circ_cluster_end+30 if ($circ_info{$sort_circ[$i]}[1] < $circ_cluster_end+$min_intron*2+1){ $circ_cluster_end = $circ_info{$sort_circ[$i]}[2] if $circ_info{$sort_circ[$i]}[2] > $circ_cluster_end; push @putative_cluster, $sort_circ[$i]; }else{ if (@putative_cluster>0){ $circ_cluster_id_total ++; my $circ_cluster = $chr.':'.$circ_cluster_start.':'.$circ_cluster_end; $circ_cluster_id{$circ_cluster_id_total} = $circ_cluster; @{$circ_cluster_com{$circ_cluster}} = @putative_cluster; print CLU "$circ_cluster\t@putative_cluster\n"; $circ_cluster_loci{$circ_cluster}{'chr'} = $chr; $circ_cluster_loci{$circ_cluster}{'start'} = $circ_cluster_start; $circ_cluster_loci{$circ_cluster}{'end'} = $circ_cluster_end; #modified on 030715 from circ_cluster_start to circ_cluster_start-30, circ_cluster_end to circ_cluster_end+30 for my $i($circ_cluster_start-$min_intron .. $circ_cluster_end+$min_intron){ $circ_cluster_range{$chr}{$i} = $circ_cluster_id_total; } } @putative_cluster = (); $circ_cluster_start = $circ_info{$sort_circ[$i]}[1]; $circ_cluster_end = $circ_info{$sort_circ[$i]}[2]; push @putative_cluster, $sort_circ[$i]; } } $circ_cluster_id_total ++; my $circ_cluster = $chr.':'.$circ_cluster_start.':'.$circ_cluster_end; $circ_cluster_id{$circ_cluster_id_total} = $circ_cluster; @{$circ_cluster_com{$circ_cluster}} = @putative_cluster; print CLU "$circ_cluster\t@putative_cluster\n"; $circ_cluster_loci{$circ_cluster}{'chr'} = $chr; $circ_cluster_loci{$circ_cluster}{'start'} = $circ_cluster_start; $circ_cluster_loci{$circ_cluster}{'end'} = $circ_cluster_end; #modified on 030715 from circ_cluster_start to circ_cluster_start-30, circ_cluster_end to circ_cluster_end+30 for my $i($circ_cluster_start-$min_intron .. $circ_cluster_end+$min_intron){ $circ_cluster_range{$chr}{$i} = $circ_cluster_id_total; } $circ_num_now += @sort_circ; &proc_bar($circ_num_now, scalar(keys %circ_info)); } print "\n"; $circ_num_now = 0; print scalar(keys %circ_info), " circRNAs are divided into $circ_cluster_id_total clusters.\n"; my $timeD = time; print "Scanning SAM file for splicing detection within circRNAs...\n"; #[HASH:]according to circRNA ranges, detect splice sites within each circRNA; ##[HASH:circ_cluster_mapping{cluster_id}{loci_id}{'start'}->mapping_start,{'end'}->mapping_end]record each local mapping in a circRNA loci cluster; ##[HASH:coverage{chromosome}{base_coordinate}]calculate sequencing depth in each base within circRNA loci cluster; #[HASH:chr_size{chromosome}->length of reference chromosome]record length of each reference provided; my (%positive_reads, %sense_strand, %strand_circ); my @loci_validated; my (%group, %cluster, %chr_cluster, $cluster_num, $cluster_num2, %site1_cluster, %site2_cluster); my (%strand_long_exon, @long_exon_mapping); my (%final_cluster, %cigars_cluster); my (@PE_reads, $pre_read); my %positive_reads_count; my (%coverage, %cluster_read, @pre_clusters); my ($detected, $detected_add) = (0, 0); open SAM, "<", $sam or die "CIRI-AS cannot open $sam: $!"; &proc_bar($circ_num_now, scalar(%junction_read)); my $one_200th_total = int(scalar(%junction_read)/200); unless ($one_200th_total > 0){ $one_200th_total = 1; } while(){ chomp; my @line = split /\t/; my $read_name = $line[0]; if($pre_read ne $read_name){ if (exists $junction_read{$pre_read}){ &record_mapping_detail($pre_read, @PE_reads); &mapping_check1($pre_read, @PE_reads); ## sub mapping_check1 print ASSAM "1:\t$_\n" for @PE_reads; $circ_num_now ++; if ($circ_num_now % $one_200th_total == 0){ &proc_bar($circ_num_now, scalar(keys %junction_read)); } }elsif (@pre_clusters > 0 and @PE_reads > 2){ ## $cluster_read{$pre_cluster} == @PE_reads &mapping_check1_add($pre_read, scalar(@pre_clusters), @PE_reads); ## sub mapping_check1 print ASSAM "0:\t$_\n" for @PE_reads; } if(scalar (keys %strand_long_exon) == 2){ if( @{$strand_long_exon{'1'}} == 1 and @{$strand_long_exon{'0'}} == 1 and ${$strand_long_exon{'1'}}[0] eq ${$strand_long_exon{'0'}}[0] ){ &exon_length_estimation($pre_read, ${$strand_long_exon{'1'}}[0], @long_exon_mapping); } } @pre_clusters = (); @PE_reads = (); %strand_long_exon = (); @long_exon_mapping = (); } if($line[4] >= $mapq_thres){ my $msid = &MSID($line[5]); next if ${$msid}[-1] < 0; my $map_end = $line[3] + ${$msid}[-1] - 1; my ($loci_start, $loci_end) = (0, -1); if (exists $circ_cluster_range{$line[2]}{$line[3]}){ my $circ_cluster = $circ_cluster_id{$circ_cluster_range{$line[2]}{$line[3]}}; $loci_start = $line[3]; if($map_end <= $circ_cluster_loci{ $circ_cluster }{'end'}){ $loci_end = $map_end; }else{ $loci_end = $circ_cluster_loci{ $circ_cluster }{'end'}; } push @pre_clusters, $circ_cluster; }elsif(exists $circ_cluster_range{$line[2]}{$map_end}){ my $circ_cluster = $circ_cluster_id{$circ_cluster_range{$line[2]}{$map_end}}; $loci_end = $map_end; $loci_start = $circ_cluster_loci{ $circ_cluster }{'start'}; push @pre_clusters, $circ_cluster; } for my $i($loci_start .. $loci_end){ $coverage{$line[2]}{$i} ++; } if ( $line[2] eq $sample_chr_1st and (exists $long_exon_range{$line[3]} or exists $long_exon_range{$map_end}) ){ my $ten2b7 = &ten2b($line[1],7); if( exists $long_exon_range{$line[3]} ){ push @{$strand_long_exon{$ten2b7}}, $long_exon_range{$line[3]}; push @long_exon_mapping, $_; }else{ push @{$strand_long_exon{$ten2b7}}, $long_exon_range{$map_end}; push @long_exon_mapping, $_; } } } push @PE_reads, $_; $pre_read = $read_name; } close SAM; if (exists $junction_read{$pre_read}){ &record_mapping_detail($pre_read, @PE_reads); &mapping_check1($pre_read, @PE_reads); ## sub mapping_check1 print ASSAM "1:\t$_\n" for @PE_reads; $circ_num_now ++; }elsif (@PE_reads > 2 and @pre_clusters > 0){ &mapping_check1_add($pre_read, scalar(@pre_clusters), @PE_reads); ## sub mapping_check1 print ASSAM "0:\t$_\n" for @PE_reads; } %cluster_read = (); &proc_bar($circ_num_now, scalar(keys %junction_read)); print "\n"; if($circ_num_now == 0){ print LOG "No circular junction reads of circRNAs were found from SAM. Please make sure $sam is the SAM used for circRNA detection by CIRI!\nPrediction failed.\n"; print "No circular junction reads of circRNAs were found from SAM. Please make sure $sam is the SAM used for circRNA detection by CIRI!\nPrediction failed.\n"; die; }elsif($circ_num_now < .8*scalar(keys %junction_read)){ print LOG "A large proportion of circular junction reads of circRNAs were not found from SAM. Please make sure $sam is the SAM used for circRNA detection by CIRI without modification!\nPrediction failed.\n"; print "A large proportion of circular junction reads of circRNAs were found from SAM. Please make sure $sam is the SAM used for circRNA detection by CIRI without modification!\nPrediction failed.\n"; die; }elsif($circ_num_now < scalar(keys %junction_read)){ print "Some circular junction reads of circRNAs were not found from SAM. It seems that SAM was modified. Anyway, prediction continues...\n"; } $circ_num_now = 0; my $timeE = time; my ($total_lib_length, $ave_lib_length, %insert_length_distribution, $max_insert_length, $min_insert_length, @sort_insert_length); if(@library_lengths_all>=2_000){ ##@library_lengths_all records all sampled insert lengths; %insert_length_distribution records all insert length distribution. @sort_insert_length = sort {$a <=> $b} @library_lengths_all; ($max_insert_length, $min_insert_length) = ($sort_insert_length[-1], $sort_insert_length[0]); for my $length(@library_lengths_all){ print LIBLEN "library_length:\t", $length+$read_length,"\n"; $insert_length_distribution{$length} ++; $total_lib_length += $length; } print "Insert length estimation from ", scalar(@library_lengths_all); printf " PE reads: %.1f bp\n", $total_lib_length/@library_lengths_all+$read_length; }elsif(!defined $anno_gtf){ print "No annotation file is provided. Insert length estimation and correction of psi are skipped.\n"; }elsif(scalar(keys %long_exon4lib_len) == 0){ 1; }else{ print "As no enough PE reads are detected, insert length estimation and correction of psi estimation for alternative spliced are skipped.\n"; } my @sort_circ_cluster = sort { $circ_cluster_loci{$a}{'chr'} cmp $circ_cluster_loci{$b}{'chr'} or $circ_cluster_loci{$a}{'start'} <=> $circ_cluster_loci{$b}{'start'} or $circ_cluster_loci{$a}{'end'} <=> $circ_cluster_loci{$b}{'end'} } keys %circ_cluster_loci; for my $circ_cluster(@sort_circ_cluster){ my $chr = $circ_cluster_loci{$circ_cluster}{'chr'}; my $start = $circ_cluster_loci{$circ_cluster}{'start'}; my $end = $circ_cluster_loci{$circ_cluster}{'end'}; for my $i($start .. $end){ if (exists $coverage{$chr}{$i}){ print COV "$chr\t$i\t$coverage{$chr}{$i}\n"; }else{ print COV "$chr\t$i\t0\n"; } } } my @positive_read_names = keys %positive_reads; print ($detected + $detected_add); print " candidate splice junctions within circRNAs from ", scalar(@positive_read_names), " reads are recognized, "; print "of which $detected are from known BSJ reads.\n"; print LOG ($detected + $detected_add); print LOG " candidate splice junctions within circRNAs from ", scalar(@positive_read_names), " reads are recognized, "; print LOG "of which $detected are from known BSJ reads.\n"; print "Splicing signals checking starts...\n"; my (%chr_seq_1file, $this_chr); if(defined $ref_1file){ open REF, "<", $ref_1file or die "CIRI-AS cannot open $ref_1file: $!"; while(){ chomp; if(/^>(\S+)/){ $this_chr = $1; }else{ $chr_seq_1file{$this_chr} .= $_; } } } &splice_loci_check(@positive_read_names); my $timeF = time; print scalar(@loci_validated)," candidate splice junctions have splicing signals.\n"; print LOG scalar(@loci_validated)," candidate splice junctions have splicing signals.\n"; print "Clustering starts...\n"; &cluster_reads(@loci_validated); my $timeG = time; for my $j(0 .. $cluster_num-1){ my $clusterID = 'cluster'.$j; my %cigar_cluster; my %CIGAR_count; for (@{$cluster{$clusterID}}){ for my $n(0 .. 2){ #push @{$cigar_cluster{$n}}, $positive_reads{${$_}[0]}{${$_}[1]}{"cigar"}[$n] unless (!defined $positive_reads{${$_}[0]}{${$_}[1]}{"cigar"}[$n] or $positive_reads{${$_}[0]}{${$_}[1]}{"cigar"}[$n] ~~ @{$cigar_cluster{$n}}); $cigar_cluster{$n}{$positive_reads{${$_}[0]}{${$_}[1]}{"cigar"}[$n]} ++ if defined $positive_reads{${$_}[0]}{${$_}[1]}{"cigar"}[$n]; } } for my $cigar_position(0 .. 2){ if (defined ($cigar_cluster{$cigar_position}) and scalar(keys %{$cigar_cluster{$cigar_position}}) > 0){ $CIGAR_count{$cigar_position} = scalar(keys %{$cigar_cluster{$cigar_position}}); }else{ $CIGAR_count{$cigar_position} = 0; } } if ($CIGAR_count{"0"}+$CIGAR_count{"1"}+$CIGAR_count{"2"} >= $strigency){ $cluster_num2 ++; $final_cluster{$cluster_num2} = $clusterID; $cigars_cluster{$clusterID} = $CIGAR_count{"0"}."_".$CIGAR_count{"1"}."_".$CIGAR_count{"2"}; } } print SPL "ID\tchr\tsplice_start\tsplice_end\t#supporting_reads\tSM_MS_SMS\tjunction_reads_ID\n"; for my $i(1 .. $cluster_num2){ my $clusterID = $final_cluster{$i}; print SPL "$chr_cluster{$clusterID}:$site1_cluster{$clusterID}|$site2_cluster{$clusterID}\t$chr_cluster{$clusterID}\t$site1_cluster{$clusterID}\t$site2_cluster{$clusterID}\t".scalar( @{$cluster{$clusterID}} )."\t$cigars_cluster{$clusterID}\t"; print SPL ${$_}[0],'(',$junction_read{${$_}[0]},'),' for @{$cluster{$clusterID}}; print SPL "\n"; } print "In sum, $cluster_num2 non-redundant splice junctions within circRNAs are detected.\n"; print LOG "In sum, $cluster_num2 non-redundant splice junctions within circRNAs are detected.\n"; my $timeH = time; print "Cirexon prediction according to splice junctions and sequencing depths...\n"; &proc_bar($circ_num_now, scalar(keys %circ_info)); my @sort_chr = sort {$a cmp $b} (keys %circ_chr); my $cluster_num_start = 1; print ASOUT "circRNA_id\tchr\tstart\tend\tstrand\t#junction_reads\tPCC(MS_SM_SMS)\t#non_junction_reads\tjunction_reads_ratio\tcircRNA_type\tgene_id\t"; #predicted_cirexon\t#predicted_cirexon\n print ASOUT "cirexon_id\tcirexon_start\tcirexon_end\t#start_supporting_BSJ_read\t#end_supporting_BSJ_read\tsequencing_depth_median\tif_ICF\n"; print ASJAV "circRNA_id\tchr\tstart\tend\tknown_linear_exon\tpredicted_cirexon\tsequencing_depth\tjunction_read_mapping\n"; print ASCAT "circRNA_id\talternatively_spliced_exon\tAS_type\tpsi_estimation_without_correction\tpsi_estimation_after_correction\n"; for my $chr (@sort_chr){ my @circ_chr_sort = sort {$circ_info{$a}[1] <=> $circ_info{$b}[1] or $circ_info{$a}[2] <=> $circ_info{$b}[2]} (keys %{$circ_chr{$chr}}); my @chr_exon_sort; my $chr_exon_start = 0; if($gtf == 1 or $gff >= 2){ @chr_exon_sort = sort { $start_exon{$chr}{$a} <=> $start_exon{$chr}{$b} or $end_exon{$chr}{$a} <=> $end_exon{$chr}{$b} } (keys %{$start_exon{$chr}}); } ##added 0704 for strand of circ my $chr_seq; unless (scalar (keys %strand_circ) == scalar (keys %circ_info)){ if(defined $ref_dir){ open CHR, "<", $ref_dir."/".$chr.".fa" or open CHR, "<", $ref_dir."/".$chr.".fasta" or die "CIRI-AS cannot open chromosome file of $chr in directory $ref_dir: $!"; while(){ chomp; unless(/^>/){ $chr_seq .= $_; } } close CHR; }elsif(length($chr_seq_1file{$chr})>0 ){ $chr_seq = $chr_seq_1file{$chr}; }else{ die "Vital errors met when reading reference of $chr!"; } } CIRC: for my $circ (@circ_chr_sort){ my (%cirexon, %icf); my (@validated_exon, @validated_intron_retained); my ($circ_chr, $circ_start, $circ_end, $circ_exp, $circ_PCC, $non_circ_exp, $circ_rel_exp, $circ_type, $circ_gene_id) = @{$circ_info{$circ}}; my @splice_in_circ; my (%supporting_read_start, %supporting_read_end, %supporting_read_in_circ, %candidate_exons, %junction_read_splicing, @sort_junction_read_mapping); #%splice_cluster, %splices_inside_candidates, my (%splice_read_start, %splice_read_end, %splice2junction); my (%validated_exon_start, %validated_exon_end); my (%junction_read_mapping, %junction_read_coverage, %splice_across, @candidate_exon_sort_length, %validation_exon_loci); #print ASOUT "$circ\t$circ_chr\t$circ_start\t$circ_end\t$circ_exp\t$circ_PCC\t$non_circ_exp\t$circ_rel_exp\t$circ_type\t$circ_gene_id\t"; print ASJAV "$circ\t$circ_chr\t$circ_start\t$circ_end\t"; unless (exists $strand_circ{$circ}){ my $start_2bp = substr($chr_seq, $circ_start-3, 2); my $end_2bp = substr($chr_seq, $circ_end, 2); if($start_2bp =~ /AG/i and $end_2bp =~ /GT/i){ $strand_circ{$circ} = '+'; }elsif($start_2bp =~ /AC/i and $end_2bp =~ /CT/i){ $strand_circ{$circ} = '-'; }elsif($start_2bp =~ /AG/i or $end_2bp =~ /GT/i){ $strand_circ{$circ} = '+'; }elsif($start_2bp =~ /AC/i or $end_2bp =~ /CT/i){ $strand_circ{$circ} = '-'; }else{ if($gtf == 1 or $gff >= 2){ open ANNO, "<", $anno_gtf or die "CIRI-AS cannot open the annotation file $anno_gtf: $!"; while(){ chomp; my @line = split /\t/; if ($line[0] eq $circ_chr){ if ($line[3] eq $circ_start and $line[2] eq 'exon'){ if ($line[6] eq '+' or $line[6] eq '-'){ $strand_circ{$circ} = $line[6]; last; } }elsif ($line[4] eq $circ_end and $line[2] eq 'exon'){ if ($line[6] eq '+' or $line[6] eq '-'){ $strand_circ{$circ} = $line[6]; last; } } } } close ANNO; } unless ( exists($strand_circ{$circ}) ){ print "CIRI-AS doesn't know strand of $circ.\n"; print LOG "CIRI-AS doesn't know strand of $circ.\n"; print ASJAV "\n"; next CIRC; } } } CLST: for my $i($cluster_num_start .. $cluster_num2){ my $clusterID = $final_cluster{$i}; if($chr_cluster{$clusterID} lt $circ_chr){ $cluster_num_start = $i; next CLST; }elsif($chr_cluster{$clusterID} gt $circ_chr){ last CLST; }elsif($site2_cluster{$clusterID} < $circ_start){ $cluster_num_start = $i; next CLST; }elsif($site2_cluster{$clusterID} > $circ_end){ #modified on 030615 from site2_cluster to site1_cluster last CLST; }elsif($site1_cluster{$clusterID} < $circ_end and $site2_cluster{$clusterID} > $circ_start){ push @splice_in_circ, $clusterID; } } my %strand_read_num; while(my ($read, undef) = each %{$circ_junc_reads{$circ}} ){ #if(exists $sense_strand{$read}){ # $strand_read_num{$sense_strand{$read}} ++; #} for my $if_reverse(0 .. 1){ next unless exists $read_range{$read}{$if_reverse}; for my $i (0 .. $#{$read_range{$read}{$if_reverse}{'start'}}){ my @ranges = ($read_range{$read}{$if_reverse}{'chr'}[$i], $read_range{$read}{$if_reverse}{'start'}[$i], $read_range{$read}{$if_reverse}{'end'}[$i]); if($ranges[0] eq $chr and $ranges[1] >= $circ_start-6 and $ranges[2] <= $circ_end+6){ $junction_read_mapping{$ranges[1].'-'.$ranges[2]}{'start'} = $ranges[1]; $junction_read_mapping{$ranges[1].'-'.$ranges[2]}{'end'} = $ranges[2]; push @{$junction_read_mapping{$ranges[1].'-'.$ranges[2]}{'reads'}}, [$read, $if_reverse, $i]; for my $j($ranges[1] .. $ranges[2]){ $junction_read_coverage{$j} ++; } } } } } #my @strand_sort = sort {$strand_read_num{$b} <=> $strand_read_num{$a}} keys %strand_read_num; #$strand_circ{$circ} = $strand_sort[0]; @sort_junction_read_mapping = sort { $junction_read_mapping{$a}{'start'} <=> $junction_read_mapping{$b}{'start'} or $junction_read_mapping{$a}{'end'} <=> $junction_read_mapping{$b}{'end'} } keys (%junction_read_mapping); my $sort_junction_read_mapping_start = 0; for my $clusterID (@splice_in_circ){ print LOG "splice:",($site2_cluster{$clusterID}.':'.$site1_cluster{$clusterID}),"in_$circ:\t"; $supporting_read_start{$site1_cluster{$clusterID}} += 0; $supporting_read_end{$site2_cluster{$clusterID}} += 0; $supporting_read_in_circ{$site2_cluster{$clusterID}.':'.$site1_cluster{$clusterID}} += 0; $splice_read_start{$site1_cluster{$clusterID}} += scalar (@{$cluster{$clusterID}}); $splice_read_end{$site2_cluster{$clusterID}} += scalar (@{$cluster{$clusterID}}); for my $read (@{$cluster{$clusterID}}){ if ($junction_read{${$read}[0]} eq $circ){ print LOG "${$read}[0],"; $splice2junction{$site1_cluster{$clusterID}}{$site2_cluster{$clusterID}.':'.$site1_cluster{$clusterID}} = $site2_cluster{$clusterID}; $splice2junction{$site2_cluster{$clusterID}}{$site2_cluster{$clusterID}.':'.$site1_cluster{$clusterID}} = $site1_cluster{$clusterID}; $supporting_read_in_circ{$site2_cluster{$clusterID}.':'.$site1_cluster{$clusterID}} ++; $supporting_read_start{$site1_cluster{$clusterID}} ++; $supporting_read_end{$site2_cluster{$clusterID}} ++; push @{$junction_read_splicing{$site2_cluster{$clusterID}.':'.$site1_cluster{$clusterID}}}, $read; #only read_name, no info on if first read or not } } print LOG "\n"; unless (exists $splice_across{$site2_cluster{$clusterID}}){ STMP1: for my $i($sort_junction_read_mapping_start .. $#sort_junction_read_mapping){ if($junction_read_mapping{$sort_junction_read_mapping[$i]}{'end'} < $site2_cluster{$clusterID}){ $sort_junction_read_mapping_start = $i; next STMP1; }elsif($junction_read_mapping{$sort_junction_read_mapping[$i]}{'start'} > $site2_cluster{$clusterID}){ #modified on 030615 from site2_cluster to site1_cluster last STMP1; }elsif($junction_read_mapping{$sort_junction_read_mapping[$i]}{'start'} <= $site2_cluster{$clusterID}-6 and $junction_read_mapping{$sort_junction_read_mapping[$i]}{'end'} >= $site2_cluster{$clusterID}+6){ push @{$splice_across{$site2_cluster{$clusterID}}}, @{$junction_read_mapping{$sort_junction_read_mapping[$i]}{'reads'}}; } } } unless (exists $splice_across{$site1_cluster{$clusterID}}){ STMP2: for my $i($sort_junction_read_mapping_start .. $#sort_junction_read_mapping){ if($junction_read_mapping{$sort_junction_read_mapping[$i]}{'end'} < $site1_cluster{$clusterID}){ next STMP2; }elsif($junction_read_mapping{$sort_junction_read_mapping[$i]}{'start'} > $site1_cluster{$clusterID}){ #modified on 030615 from site2_cluster to site1_cluster last STMP2; }elsif($junction_read_mapping{$sort_junction_read_mapping[$i]}{'start'} <= $site1_cluster{$clusterID}-6 and $junction_read_mapping{$sort_junction_read_mapping[$i]}{'end'} >= $site1_cluster{$clusterID}+6){ push @{$splice_across{$site1_cluster{$clusterID}}}, @{$junction_read_mapping{$sort_junction_read_mapping[$i]}{'reads'}}; } } } } while (my ($site, $reads_ref) = each %splice_across){ print LOG "$circ:across_site:$site\t"; print LOG "${$_}[0]," for @{$reads_ref}; } print LOG "\n"; $supporting_read_start{$circ_start} = 'start'; $supporting_read_end{$circ_end} = 'end'; my @sorted_start_loci = sort {$a <=> $b} (keys %supporting_read_start); my @sorted_end_loci = sort {$a <=> $b} (keys %supporting_read_end); my (%candidate_exons); for my $i(0 .. $#sorted_start_loci){ for my $j(0 .. $#sorted_end_loci){ #modified on 030915 from min_exon_length to min_exon_length-1, from max_exon_length to max_exon_length-1, from 1 to $sorted_end_loci[$j]-$sorted_start_loci[$i]+1 if ($sorted_end_loci[$j] >= $sorted_start_loci[$i] + $min_exon_length-1 and ($supporting_read_start{$sorted_start_loci[$i]}+$supporting_read_end{$sorted_end_loci[$j]}>0 or $i == 0 or $j == $#sorted_end_loci) and ($sorted_end_loci[$j] <= $sorted_start_loci[$i]+$max_exon_length-1)){ $candidate_exons{$sorted_start_loci[$i].':'.$sorted_end_loci[$j]} = $sorted_end_loci[$j]-$sorted_start_loci[$i]+1; } } } my @exon_bingo; if($gtf == 1 or $gff >= 2){ EXON: for my $i($chr_exon_start .. $#chr_exon_sort){ if($start_exon{$chr}{$chr_exon_sort[$i]} >= $circ_start and $end_exon{$chr}{$chr_exon_sort[$i]} <= $circ_end){ push @exon_bingo, $i; }elsif($end_exon{$chr}{$chr_exon_sort[$i]} < $circ_start){ $chr_exon_start = $i; }elsif($start_exon{$chr}{$chr_exon_sort[$i]} > $circ_end){ last EXON; } } print ASJAV "$start_exon{$chr}{$chr_exon_sort[$_]}:$end_exon{$chr}{$chr_exon_sort[$_]}," for @exon_bingo; }else{ print ASJAV "n/a"; } print ASJAV "\t"; if(scalar (keys %candidate_exons) >0 ){ my @candidate_exon_sort_length = sort {$candidate_exons{$a} <=> $candidate_exons{$b}} keys (%candidate_exons); my (%validated_intron_retained_info); CAND: for my $cand_exon(@candidate_exon_sort_length){ my ($if_exon_validated, $if_intron_retention) = ([0,0], [0,0]); my ($cand_exon_start, $cand_exon_end) = split ':', $cand_exon; print LOG "$cand_exon(exon_$circ)"; if (exists $validated_exon_start{$cand_exon_start} and exists $validated_exon_end{$cand_exon_end}){ ${$if_exon_validated}[0] = 0; print LOG "retained_intron\t"; my (%exon_end_across_count); my ($last_exon_end, $foremost_exon_start) = ($cand_exon_start, $cand_exon_end); for my $exon (@{$validated_exon_start{$cand_exon_start}}){ my ($exon_start, $exon_end) = split ':', $exon; if( $splice_read_end{$exon_end} >= 2 ){ $last_exon_end = $exon_end if $last_exon_end < $exon_end; } if (exists $splice_across{$exon_end} and @{$splice_across{$exon_end}} > 0){ $exon_end_across_count{$exon_end} = scalar(@{$splice_across{$exon_end}}); }else{ $exon_end_across_count{$exon_end} = 0; } } for my $exon (@{$validated_exon_end{$cand_exon_end}}){ my ($exon_start, $exon_end) = split ':', $exon; if( $splice_read_start{$exon_start} >= 2 ){ $foremost_exon_start = $exon_start if $foremost_exon_start > $exon_start; } if (exists $splice_across{$exon_start} and @{$splice_across{$exon_start}} > 0){ $exon_end_across_count{$exon_start} = scalar(@{$splice_across{$exon_start}}); }else{ $exon_end_across_count{$exon_start} = 0; } } if($exon_end_across_count{$last_exon_end}>0 and $exon_end_across_count{$foremost_exon_start}>0){ if(exists $junction_read_splicing{$last_exon_end.':'.$foremost_exon_start} and @{$junction_read_splicing{$last_exon_end.':'.$foremost_exon_start}}>=2){ $if_intron_retention = &intron_retention_validation($circ_chr, $cand_exon_start, $cand_exon_end, $exon_end_across_count{$last_exon_end}, $exon_end_across_count{$foremost_exon_start}, scalar(@{$junction_read_splicing{$last_exon_end.':'.$foremost_exon_start}}), $last_exon_end, $foremost_exon_start, \%junction_read_coverage); }else{ ${$if_intron_retention}[0] = 0; } }else{ ${$if_intron_retention}[0] = -1; } print LOG ${$if_intron_retention}[0]; if (${$if_intron_retention}[0] >= 1){ my $if_inner_intron_validated; PRE: for my $pre_intron(@validated_intron_retained){ my ($intron_start, $intron_end) = split ':', $pre_intron; if ($cand_exon_start <= $intron_start and $cand_exon_end >= $intron_end){ $if_inner_intron_validated = "\t${pre_intron}_validated!\n"; last PRE; } } if (defined $if_inner_intron_validated){ print LOG $if_inner_intron_validated; }else{ #print ASCAT "\t$cand_exon(IL)"; print ASJAV "$cand_exon(intron_retention),"; #print ASOUT "$cand_exon,"; print LOG "\n"; push @validated_intron_retained, $cand_exon; $validated_intron_retained_info{$cand_exon}{'splice'} = $last_exon_end.':'.$foremost_exon_start; $validated_intron_retained_info{$cand_exon}{'raw_psi'} = sprintf("%.3f", ($exon_end_across_count{$last_exon_end}+$exon_end_across_count{$foremost_exon_start})/($exon_end_across_count{$last_exon_end}+$exon_end_across_count{$foremost_exon_start}+2*@{$junction_read_splicing{$last_exon_end.':'.$foremost_exon_start}})); } }else{ print LOG "\n"; } }else{ my ($start_across, $end_across); my ($start_tag2, $end_tag2) = (0, 0); if (exists $splice_across{$cand_exon_start}){ $start_across = scalar(@{$splice_across{$cand_exon_start}}); }else{ $start_across = 0; } if (exists $splice_across{$cand_exon_end}){ $end_across = scalar(@{$splice_across{$cand_exon_end}}); }else{ $end_across = 0; } if (exists $splice_read_start{$cand_exon_start}){ $start_tag2 = $splice_read_start{$cand_exon_start}; }if (exists $splice_read_end{$cand_exon_end}){ $end_tag2 = $splice_read_end{$cand_exon_end}; } while(my ($other_start, undef) = each %validated_exon_start){ if ($other_start == $cand_exon_start and $supporting_read_end{$cand_exon_end} == 0){ print LOG "exon\tno_reads_supporting_alternative_splicing_site\t-11\t"; #print ASOUT "$cand_exon(0_0),"; next CAND; } } while(my ($other_end, undef) = each %validated_exon_end){ if ($other_end == $cand_exon_end and $supporting_read_start{$cand_exon_start} == 0){ print LOG "exon\tno_reads_supporting_alternative_splicing_site\t-12\t"; #print ASOUT "$cand_exon(0_0),"; next CAND; } } print LOG "exon\t"; #will add filters here $if_exon_validated = &exon_coverage_validation_single($circ_chr, $cand_exon_start, $cand_exon_end, $supporting_read_start{$cand_exon_start}, $supporting_read_end{$cand_exon_end}, $start_tag2, $end_tag2, $start_across, $end_across, \%junction_read_coverage); print LOG "${$if_exon_validated}[0]\n"; } if (${$if_exon_validated}[0] >= 1){ push @validated_exon, $cand_exon; $validation_exon_loci{$cand_exon}{'start'} = $cand_exon_start; $validation_exon_loci{$cand_exon}{'end'} = $cand_exon_end; push @{$validated_exon_start{$cand_exon_start}}, $cand_exon; push @{$validated_exon_end{$cand_exon_end}}, $cand_exon; my $if_ICF = 1; for my $i(@exon_bingo){ if ($start_exon{$chr}{$chr_exon_sort[$i]} <= $cand_exon_start+6 and $end_exon{$chr}{$chr_exon_sort[$i]} >= $cand_exon_end-6){ $if_ICF = 0; last; } } $cirexon{$cand_exon}{'start'} = $cand_exon_start; $cirexon{$cand_exon}{'end'} = $cand_exon_end; $cirexon{$cand_exon}{'st_sp'} = $supporting_read_start{$cand_exon_start}; $cirexon{$cand_exon}{'ed_sp'} = $supporting_read_end{$cand_exon_end}; $cirexon{$cand_exon}{'cov_md'} = ${$if_exon_validated}[1]; if ($if_ICF == 1 and defined $anno_gtf){ print ASJAV "$cand_exon(ICF),"; #print ASOUT "$cand_exon(ICF),"; $icf{$cand_exon} = 1; }else{ print ASJAV "$cand_exon,"; #print ASOUT "$cand_exon,"; } }elsif (${$if_intron_retention}[0] >= 1){ my $if_ICF = 1; for my $i(@exon_bingo){ if ($start_exon{$chr}{$chr_exon_sort[$i]} <= $cand_exon_start+6 and $end_exon{$chr}{$chr_exon_sort[$i]} >= $cand_exon_end-6){ $if_ICF = 0; last; } } $cirexon{$cand_exon}{'start'} = $cand_exon_start; $cirexon{$cand_exon}{'end'} = $cand_exon_end; $cirexon{$cand_exon}{'st_sp'} = $supporting_read_start{$cand_exon_start}; $cirexon{$cand_exon}{'ed_sp'} = $supporting_read_end{$cand_exon_end}; $cirexon{$cand_exon}{'cov_md'} = ${$if_intron_retention}[1]; if ($if_ICF == 1 and defined $anno_gtf){ #print ASOUT "$cand_exon(ICF),"; $icf{$cand_exon} = 1; } } #print ASOUT "$cand_exon(${if_exon_validated}_$if_intron_retention),"; } if (@validated_exon > 0){ #print ASOUT "\t", scalar(@validated_exon)+scalar(@validated_intron_retained),"\n"; my @cirexon_sort = sort {$cirexon{$a}{'start'} <=> $cirexon{$b}{'start'} or $cirexon{$a}{'end'} <=> $cirexon{$b}{'end'}} keys %cirexon; my (%cirexon_id, @id_cirexon); $cirexon_id{$cirexon_sort[0]} = 0; $id_cirexon[0] = [$cirexon_sort[0]]; for my $i (1 .. $#cirexon_sort){ if ($cirexon{$cirexon_sort[$i]}{'start'} > $cirexon{$cirexon_sort[$i-1]}{'end'}){ $cirexon_id{$cirexon_sort[$i]} = $cirexon_id{$cirexon_sort[$i-1]}+1; $id_cirexon[$cirexon_id{$cirexon_sort[$i]}] = [$cirexon_sort[$i]]; }else{ $cirexon_id{$cirexon_sort[$i]} = $cirexon_id{$cirexon_sort[$i-1]}; push @{$id_cirexon[$cirexon_id{$cirexon_sort[$i-1]}]}, $cirexon_sort[$i]; } } if($strand_circ{$circ} eq '+'){ for my $i (0 .. $#id_cirexon){ for my $cand_exon(@{$id_cirexon[$i]}){ print ASOUT "$circ\t$circ_chr\t$circ_start\t$circ_end\t$strand_circ{$circ}\t$circ_exp\t$circ_PCC\t$non_circ_exp\t$circ_rel_exp\t$circ_type\t$circ_gene_id\t"; print ASOUT "cirexon", $i+1, "\t", $cirexon{$cand_exon}{'start'}, "\t", $cirexon{$cand_exon}{'end'}, "\t", $cirexon{$cand_exon}{'st_sp'}, "\t", $cirexon{$cand_exon}{'ed_sp'}, "\t", $cirexon{$cand_exon}{'cov_md'}, "\t"; if (exists $icf{$cand_exon}){ print ASOUT "ICF\n"; }else{ print ASOUT "non_ICF\n"; } } } }else{ for my $i (0 .. $#id_cirexon){ for my $cand_exon(@{$id_cirexon[$#id_cirexon-$i]}){ print ASOUT "$circ\t$circ_chr\t$circ_start\t$circ_end\t$strand_circ{$circ}\t$circ_exp\t$circ_PCC\t$non_circ_exp\t$circ_rel_exp\t$circ_type\t$circ_gene_id\t"; print ASOUT "cirexon", $i+1, "\t", $cirexon{$cand_exon}{'start'}, "\t", $cirexon{$cand_exon}{'end'}, "\t", $cirexon{$cand_exon}{'st_sp'}, "\t", $cirexon{$cand_exon}{'ed_sp'}, "\t", $cirexon{$cand_exon}{'cov_md'}, "\t"; if (exists $icf{$cand_exon}){ print ASOUT "ICF\n"; }else{ print ASOUT "non_ICF\n"; } } } } print ASJAV "\t"; my @validated_exon_length_sort = sort {$validation_exon_loci{$a}{'start'} <=> $validation_exon_loci{$b}{'start'} or $validation_exon_loci{$a}{'end'} <=> $validation_exon_loci{$b}{'end'}} @validated_exon; my (%SE_exon, %AS_exon, %record_skipping_exon); my (%link, %search_exon_id_on_path); #%exon_across my %exon_sort_id; for my $i(0 .. $#validated_exon_length_sort){ my $exon = $validated_exon_length_sort[$i]; $exon_sort_id{$exon} = $i; } ##reconstruct all possible paths from circ_start to circ_end according to splices or neighbouring relation; my (%path, $path_num); for my $i(0 .. $#validated_exon_length_sort){ my $exon = $validated_exon_length_sort[$i]; my ($exon_start, $exon_end) = split ':', $exon; my ($exon_neighbour_start, $exon_neighbour_end, @neighbouring_exons_start); my ($tag_next, $tag_linked) = (0, 0); my $y = 0; my (@group_linked); ##initiate paths with all start exons if($exon_start == $circ_start){ $path_num ++; $path{$path_num} = [$i]; } ##add all possible linked exons to corresponding paths according to splices ##also record the links if(exists $splice2junction{$exon_end}){ while (my (undef, $other_exon_start) = each %{$splice2junction{$exon_end}}){ if(exists $validated_exon_start{$other_exon_start}){ for my $other_exon(@{$validated_exon_start{$other_exon_start}}){ $link{$i}{$exon_sort_id{$other_exon}} = $supporting_read_in_circ{$exon_end.':'.$other_exon_start}; #supporting junction read counts as value push @group_linked, $exon_sort_id{$other_exon}; } } } } ##find all neighboured exons for my $j($i+1 .. $#validated_exon_length_sort){ my $exon_next = $validated_exon_length_sort[$j]; my ($exon_next_start, $exon_next_end) = split ':', $exon_next; if($tag_next == 1 and $exon_next_start != $exon_neighbour_start and $exon_next_end != $exon_neighbour_end){ last; #stop if not a neighboured exon }elsif($tag_next == 1){ push @neighbouring_exons_start, $j; $tag_linked ++ if exists $link{$i}{$j}; }elsif($tag_next == 0 and $exon_next_start > $exon_end ){ push @neighbouring_exons_start, $j; ($exon_neighbour_start, $exon_neighbour_end) = ($exon_next_start, $exon_next_end); #record the ends as criterion of a neighboured exon $tag_next = 1; $tag_linked ++ if exists $link{$i}{$j}; } } ##add all neighboured exons to corresponding paths if no splice available ##also record the links if($tag_linked == 0 and $tag_next == 1){ for my $exon_next_start_id(@neighbouring_exons_start){ $link{$i}{$exon_next_start_id} = 0; push @group_linked, $exon_next_start_id; } } for my $exon_linked_id(@group_linked){ my @group_bingo; for my $m(1 .. $path_num){ if ($path{$m}[-1] == $i){ push @group_bingo, $m; } } for my $m (@group_bingo){ $y ++; $path{$path_num+$y} = [@{$path{$m}}, $exon_linked_id]; } } $path_num += $y; } ##find paths with circ_end as end and delete others for my $i(1 .. $path_num){ my ($exon1_start, $exon1_end) = split ':', $validated_exon_length_sort[$path{$i}[-1]]; if ($exon1_end == $circ_end){ print LOG2 "$circ->path:$i\t"; for my $exon_id_on_path(0 .. $#{$path{$i}}){ $search_exon_id_on_path{$path{$i}[$exon_id_on_path]}{$i} = $exon_id_on_path; print LOG2 "$exon_id_on_path->$path{$i}[$exon_id_on_path]=>$validated_exon_length_sort[$path{$i}[$exon_id_on_path]],"; } print LOG2 "\n"; }else{ delete $path{$i}; } } if(scalar(keys %path) >= 2){ my (%AS_type_exon, %SE_type_exon, %rel_abu_exon); my (%all_units, %key_units, %path_key_unit, %key_unit_in_path, %key_unit_splice_backward_path, %key_unit_splice_forward_path); my (%key_unit_group, $key_unit_group_num, %key_unit_from); # %key_AS_end_unit my (%se_group_id, %key_SE_end_unit); #, %splice_out_path my (%insert_length_clockwise, %insert_length_anti); my (%important_splice, %splice_supporting_read_count_clockwise, %splice_supporting_read_count_anti); my (%rel_exp, %rel_exp_splice); ##find all units in all paths while(my ($i, $units_ref) = each %path){ print LOG2 "all_unit_on_$i:"; for my $j(@{$units_ref}){ $all_units{$j}{$i} = 1; print LOG2 "$j,"; } print LOG2 "\t"; } print LOG2 "\n"; ##find key units that does not exist in all paths print LOG2 "key_unit:"; while(my ($j, $i_ref) = each %all_units){ if(scalar (keys %{$i_ref}) < scalar (keys %path)){ $key_units{$j} = 1; print LOG2 "$j,"; } } print LOG2 "\n"; ##record key units that exists in each path ##record key units with their spliced units in each path while(my ($i, $units_ref) = each %path){ for my $j_id(0 .. $#{$units_ref}){ if(exists $key_units{${$units_ref}[$j_id]}){ $path_key_unit{$i}{${$units_ref}[$j_id]} = 1; if($j_id == 0){ $key_unit_in_path{${$units_ref}[$j_id]}{$i} = [undef, ${$units_ref}[$j_id+1]]; push @{$key_unit_splice_forward_path{${$units_ref}[$j_id]}{${$units_ref}[$j_id+1]}}, $i; }elsif($j_id == $#{$units_ref}){ $key_unit_in_path{${$units_ref}[$j_id]}{$i} = [${$units_ref}[$j_id-1], undef]; push @{$key_unit_splice_backward_path{${$units_ref}[$j_id]}{${$units_ref}[$j_id-1]}}, $i; }else{ $key_unit_in_path{${$units_ref}[$j_id]}{$i} = [${$units_ref}[$j_id-1], ${$units_ref}[$j_id+1]]; push @{$key_unit_splice_backward_path{${$units_ref}[$j_id]}{${$units_ref}[$j_id-1]}}, $i; push @{$key_unit_splice_forward_path{${$units_ref}[$j_id]}{${$units_ref}[$j_id+1]}}, $i; } } } } ##record other units while(my ($j, undef) = each %key_units){ while(my ($i, $units_ref) = each %path){ unless (exists $path_key_unit{$i}{$j}){ $path_key_unit{$i}{$j} = 0; } } } ##cluster related exons with alternative splice sites into groups; my @key_unit_total = keys %key_units; for my $i(0 .. $#key_unit_total){ #from $#key_unit_total-1 to $#key_unit_total my ($exon1_start, $exon1_end) = split ':', $validated_exon_length_sort[$key_unit_total[$i]]; unless(exists $key_unit_from{$key_unit_total[$i]}){ $key_unit_group_num ++; $key_unit_from{$key_unit_total[$i]} = $key_unit_group_num; $key_unit_group{$key_unit_group_num} = [$key_unit_total[$i]]; } for my $j($i+1 .. $#key_unit_total){ my ($exon2_start, $exon2_end) = split ':', $validated_exon_length_sort[$key_unit_total[$j]]; if ( $exon1_start == $exon2_start or $exon1_end == $exon2_end ){ if (exists $key_unit_from{$key_unit_total[$j]} and $key_unit_from{$key_unit_total[$j]} != $key_unit_from{$key_unit_total[$i]}){ my @tentative_group = @{$key_unit_group{$key_unit_from{$key_unit_total[$i]}}}; delete $key_unit_group{$key_unit_from{$key_unit_total[$i]}}; for my $x(@tentative_group){ $key_unit_from{$x} = $key_unit_from{$key_unit_total[$j]}; } push @{$key_unit_group{$key_unit_from{$key_unit_total[$j]}}}, @tentative_group; }elsif(!exists $key_unit_from{$key_unit_total[$j]}){ $key_unit_from{$key_unit_total[$j]} = $key_unit_from{$key_unit_total[$i]}; push @{$key_unit_group{$key_unit_from{$key_unit_total[$i]}}}, $key_unit_total[$j]; } } } } ##find critical splices for each key unit ##record corresponding insert lengths of each critical splice my (%splice_unique_group_unit); #, %splice_out_group_unit); while(my ($id, $group_ref) = each %key_unit_group){ print LOG2 "alternative_spliced_exon_group:$id\t"; my (%start_loci, %end_loci); for my $j(@{$group_ref}){ my ($exon1_start, $exon1_end) = split ':', $validated_exon_length_sort[$j]; print LOG2 "$j=$validated_exon_length_sort[$j],"; $start_loci{$exon1_start} ++; $end_loci{$exon1_end} ++; } if(scalar (keys %start_loci) == 1){ print LOG2 "\tsame_start_loci"; for my $j(@{$group_ref}){ if( scalar (keys %end_loci) >= 2 ){ if ($strand_circ{$circ} eq '+'){ $AS_type_exon{$j} = 1; }else{ $AS_type_exon{$j} = 2; } } print LOG2 "\t$j=$validated_exon_length_sort[$j]"; my ($exon1_start, $exon1_end) = split ':', $validated_exon_length_sort[$j]; while(my ($j2, $i_ref) = each %{$key_unit_splice_forward_path{$j}}){ my ($exon2_start, $exon2_end) = split ':', $validated_exon_length_sort[$j2]; $important_splice{$exon1_end.':'.$exon2_start}{$id}{$j} = 1; $splice_unique_group_unit{$id}{$exon1_end.':'.$exon2_start}{'right:'.$j} = 1; for my $i(@{$i_ref}){ my ($insert_length1, $insert_length2, @key_units_path1, @key_units_path2); for my $j2i(@{$path{$i}}){ if ($j2i <= $j){ $insert_length1 += $candidate_exons{ $validated_exon_length_sort[$j2i] }; push @key_units_path1, $j2i; }else{ $insert_length2 += $candidate_exons{ $validated_exon_length_sort[$j2i] }; push @key_units_path2, $j2i; } } print LOG2 "\tsplice=$exon1_end:$exon2_start=>"; print LOG2 "clockwise_length=$insert_length1;anti_clockwise_length=$insert_length2"; @{$insert_length_clockwise{$exon1_end.':'.$exon2_start}{$insert_length1}} = @key_units_path1; @{$insert_length_anti{$exon1_end.':'.$exon2_start}{$insert_length2}} = @key_units_path2; } } } } if(scalar (keys %end_loci) == 1){ print LOG2 "\tsame_end_loci"; for my $j(@{$group_ref}){ if( scalar (keys %start_loci) >= 2 ){ if ($strand_circ{$circ} eq '-'){ $AS_type_exon{$j} = 1; }else{ $AS_type_exon{$j} = 2; } } print LOG2 "\t$j=$validated_exon_length_sort[$j]"; my ($exon1_start, $exon1_end) = split ':', $validated_exon_length_sort[$j]; while(my ($j2, $i_ref) = each %{$key_unit_splice_backward_path{$j}}){ my ($exon2_start, $exon2_end) = split ':', $validated_exon_length_sort[$j2]; $important_splice{$exon2_end.':'.$exon1_start}{$id}{$j} = 2; $splice_unique_group_unit{$id}{$exon2_end.':'.$exon1_start}{'left:'.$j} = 1; for my $i(@{$i_ref}){ my ($insert_length1, $insert_length2, @key_units_path1, @key_units_path2); for my $j2i(@{$path{$i}}){ if ($j2i <= $j2){ $insert_length1 += $candidate_exons{ $validated_exon_length_sort[$j2i] }; push @key_units_path1, $j2i; }else{ $insert_length2 += $candidate_exons{ $validated_exon_length_sort[$j2i] }; push @key_units_path2, $j2i; } } print LOG2 "\tsplice=$exon2_end:$exon1_start=>"; print LOG2 "clockwise_length=$insert_length1;anti_clockwise_length=$insert_length2"; @{$insert_length_clockwise{$exon2_end.':'.$exon1_start}{$insert_length1}} = @key_units_path1; @{$insert_length_anti{$exon2_end.':'.$exon1_start}{$insert_length2}} = @key_units_path2; } } } } print LOG2 "\n"; } ##find skipping exon group from above groups; ##record corresponding insert lengths of each critical splice outside while(my ($id, $group_ref) = each %key_unit_group){ my $out_num = 0; my %splice_out_id; PATH: while(my ($i, $key_unit_ref) = each %path_key_unit){ my $group_in_path_tag = 0; for my $j(@{$group_ref}){ if (${$key_unit_ref}{$j} == 1){ $group_in_path_tag ++; } } if ($group_in_path_tag == 0){ print LOG2 "splice_out:"; for my $j2(0 .. $#{$path{$i}}-1){ if($path{$i}[$j2] < ${$group_ref}[0] and $path{$i}[$j2+1] > ${$group_ref}[0]){ my ($exon1_start, $exon1_end) = split ':', $validated_exon_length_sort[$path{$i}[$j2]]; my ($exon2_start, $exon2_end) = split ':', $validated_exon_length_sort[$path{$i}[$j2+1]]; print LOG2 "\troute:$i=>$exon1_end:$exon2_start\t"; my ($insert_length1, $insert_length2, @key_units_path1, @key_units_path2); for my $j2i (@{$path{$i}}){ if ($j2i <= $path{$i}[$j2]){ $insert_length1 += $candidate_exons{ $validated_exon_length_sort[$j2i] }; push @key_units_path1, $j2i; }else{ $insert_length2 += $candidate_exons{ $validated_exon_length_sort[$j2i] }; push @key_units_path2, $j2i; } } print LOG2 "clockwise_length=$insert_length1;anti_clockwise_length=$insert_length2\n"; @{$insert_length_clockwise{$exon1_end.':'.$exon2_start}{$insert_length1}} = @key_units_path1; @{$insert_length_anti{$exon1_end.':'.$exon2_start}{$insert_length2}} = @key_units_path2; unless (exists $splice_out_id{$exon1_end.':'.$exon2_start}){ $out_num ++; $splice_out_id{$exon1_end.':'.$exon2_start} = $out_num; $important_splice{$exon1_end.':'.$exon2_start}{$id}{'all_out'.$out_num} = 3; $splice_unique_group_unit{$id}{$exon1_end.':'.$exon2_start}{'all_out:all'} = 1; #.$splice_out_id{$exon1_end.':'.$exon2_start} } next PATH; } } print LOG2 "\n"; } } } ##count junction read pairs of clockwise or anti_clockwise for each splice ##connect each splice with other important splice of key unit group my (%IL_clockwise_splice_type, %IL_anti_splice_type); while (my ($splice, undef) = each %important_splice){ print LOG2 "splice:$splice\n"; $splice_supporting_read_count_clockwise{$splice} = 0; $splice_supporting_read_count_anti{$splice} = 0; for my $read_info(@{$junction_read_splicing{$splice}}){ my @paired_read_end_loci = &paired_read_end_mapping($read_info, $circ); if($paired_read_end_loci[0] == 0){ print LOG2 "direct_split_reads\n"; }elsif($paired_read_end_loci[0] == -1){ print LOG2 "non_pe_reads\n"; }elsif($paired_read_end_loci[0] == 2){ if($paired_read_end_loci[1] == 1){ $splice_supporting_read_count_clockwise{$splice} ++; }else{ $splice_supporting_read_count_anti{$splice} ++; } } } print LOG2 "total_read_count_of_splice=", scalar(@{$junction_read_splicing{$splice}}), ";splice_supporting_read_count_clockwise=$splice_supporting_read_count_clockwise{$splice};splice_supporting_read_count_anti=$splice_supporting_read_count_anti{$splice}\n"; my %length_id_j; while(my ($insert_length1,$unit_path_ref) = each %{$insert_length_clockwise{$splice}}){ print LOG2 "insert_length_clockwise=$insert_length1"; for my $j_id(0 .. $#{$unit_path_ref}-1){ my ($exon1_start, $exon1_end) = split ':', $validated_exon_length_sort[${$unit_path_ref}[$j_id]]; my ($exon2_start, $exon2_end) = split ':', $validated_exon_length_sort[${$unit_path_ref}[$j_id+1]]; if(exists $important_splice{$exon1_end.':'.$exon2_start}){ print LOG2 "\tother_splice=$exon1_end:$exon2_start=>"; while(my ($id, $type_ref) = each %{$important_splice{$exon1_end.':'.$exon2_start}}){ while (my ($type, undef) = each %{$type_ref}){ print LOG2 "$id=$type,"; $length_id_j{$id}{$type}{$exon1_end.':'.$exon2_start} = $insert_length1; } } } } print LOG2 "\n"; } while(my ($id, $type_ref) = each %length_id_j){ if(scalar(keys %{$type_ref})>=2){ while (my ($type, $splice_ref) = each %{$type_ref}){ while(my ($splice2, $insert_length1) = each %{$splice_ref}){ print LOG2 "$id\t$type\t$splice2\t$insert_length1\n"; $IL_clockwise_splice_type{$splice}{$insert_length1}{$id}{$splice2} = $type; } } } } my %length_id_j2; while(my ($insert_length1,$unit_path_ref) = each %{$insert_length_anti{$splice}}){ print LOG2 "insert_length_anti=$insert_length1"; for my $j_id(0 .. $#{$unit_path_ref}-1){ my ($exon1_start, $exon1_end) = split ':', $validated_exon_length_sort[${$unit_path_ref}[$j_id]]; my ($exon2_start, $exon2_end) = split ':', $validated_exon_length_sort[${$unit_path_ref}[$j_id+1]]; if(exists $important_splice{$exon1_end.':'.$exon2_start}){ print LOG2 "\tother_splice=$exon1_end:$exon2_start=>"; while(my ($id, $type_ref) = each %{$important_splice{$exon1_end.':'.$exon2_start}}){ while (my ($type, undef) = each %{$type_ref}){ print LOG2 "$id=$type,"; $length_id_j2{$id}{$type}{$exon1_end.':'.$exon2_start} = $insert_length1; } } } } print LOG2 "\n"; } while(my ($id, $type_ref) = each %length_id_j2){ if(scalar(keys %{$type_ref})>=2){ while (my ($type, $splice_ref) = each %{$type_ref}){ while(my ($splice2, $insert_length1) = each %{$splice_ref}){ print LOG2 "$id\t$type\t$splice2\t$insert_length1\n"; $IL_anti_splice_type{$splice}{$insert_length1}{$id}{$splice2} = $type; } } } } } my @sort_key_unit_group = sort {$key_unit_group{$a}[0] <=> $key_unit_group{$b}[0]} (keys %key_unit_group); if(@sort_key_unit_group == 1){ G_ID: for my $id(@sort_key_unit_group){ my %splice_type4unit; while(my ($splice, $type_ref) = each %{$splice_unique_group_unit{$id}}){ while(my ($type, undef) = each %{$type_ref}){ my @types = split ':', $type; push @{$splice_type4unit{$types[1]}{$types[0]}}, $splice; } } my ($splice_out_normalized, $total_normalized); my ($splice_out_count, $total_count); my %splice_in_normalized4unit; my %splice_in_count; my $if_out_available = 1; if(exists $splice_type4unit{'all'}){ for my $splice (@{$splice_type4unit{'all'}{'all_out'}}){ print LOG2 "out=$splice\t"; my $normalized_count = &normalize_count4splice($splice_supporting_read_count_clockwise{$splice}, $splice_supporting_read_count_anti{$splice}, \%{$insert_length_clockwise{$splice}}, \%{$insert_length_anti{$splice}}); $splice_out_count += $splice_supporting_read_count_clockwise{$splice}+$splice_supporting_read_count_anti{$splice}; $total_count += $splice_supporting_read_count_clockwise{$splice}+$splice_supporting_read_count_anti{$splice}; if($normalized_count ne 'n/a'){ $splice_out_normalized += $normalized_count; $total_normalized += $normalized_count; }else{ $if_out_available = 0; } } } for my $j(@{$key_unit_group{$id}}){ my (%count_type,%normalized_count_type); my (%count_without_normalization); while (my ($type, $splice_ref) = each %{$splice_type4unit{$j}}){ my $splice_available_count=0; my $total_normalized_count_type=0; for my $splice (@{$splice_ref}){ print LOG2 "$j=$type=$splice\t"; my $normalized_count = &normalize_count4splice($splice_supporting_read_count_clockwise{$splice}, $splice_supporting_read_count_anti{$splice}, \%{$insert_length_clockwise{$splice}}, \%{$insert_length_anti{$splice}}); if($normalized_count ne 'n/a'){ $total_normalized_count_type += $normalized_count; #$normalized_count_type{$type} += $normalized_count; $splice_available_count++; } $count_without_normalization{$type} += $splice_supporting_read_count_clockwise{$splice}+$splice_supporting_read_count_anti{$splice}; } if($splice_available_count == scalar(@{$splice_ref})){ $normalized_count_type{$type} = $total_normalized_count_type; } } if(exists $count_without_normalization{'right'} and exists $count_without_normalization{'left'}){ $total_count += ($count_without_normalization{'right'}+$count_without_normalization{'left'})/2; $splice_in_count{$j} = ($count_without_normalization{'right'}+$count_without_normalization{'left'})/2; }elsif(exists $count_without_normalization{'right'}){ $total_count += $count_without_normalization{'right'}; $splice_in_count{$j} = $count_without_normalization{'right'}; }elsif(exists $count_without_normalization{'left'}){ $total_count += $count_without_normalization{'left'}; $splice_in_count{$j} = $count_without_normalization{'left'}; } if(exists $normalized_count_type{'right'} and exists $normalized_count_type{'left'}){ $splice_in_normalized4unit{$j} = ($normalized_count_type{'right'}+$normalized_count_type{'left'})/2; $total_normalized += ($normalized_count_type{'right'}+$normalized_count_type{'left'})/2; }elsif(exists $normalized_count_type{'right'}){ $splice_in_normalized4unit{$j} = $normalized_count_type{'right'}; $total_normalized += $normalized_count_type{'right'}; }elsif(exists $normalized_count_type{'left'}){ $splice_in_normalized4unit{$j} = $normalized_count_type{'left'}; $total_normalized += $normalized_count_type{'left'}; }#else{ # print LOG2 "rel_abundance\t$circ\t$j\t",$validated_exon_length_sort[$j],"\tneither_end_available\n"; # next G_ID; #} } if(exists $splice_type4unit{'all'}){ for my $j(@{$key_unit_group{$id}}){ $SE_type_exon{$j} = 1; } } #if(@{$key_unit_group{$id}} >= 2){ # for my $j(@{$key_unit_group{$id}}){ # $AS_type_exon{$j} = 1; # } #} if($total_normalized > 0 and $if_out_available == 1){ for my $j(@{$key_unit_group{$id}}){ $rel_abu_exon{$j}{'raw'} = sprintf("%.3f", $splice_in_count{$j}/$total_count); $rel_abu_exon{$j}{'mod'} = sprintf("%.3f", $splice_in_normalized4unit{$j}/$total_normalized); printf LOG2 "rel_abundance\t$circ\t$validated_exon_length_sort[$j]\t$j\t%.3f\t%.3f\n",$splice_in_count{$j}/$total_count, $splice_in_normalized4unit{$j}/$total_normalized; } }elsif($total_count > 0){ for my $j(@{$key_unit_group{$id}}){ $rel_abu_exon{$j}{'raw'} = sprintf("%.3f", $splice_in_count{$j}/$total_count); $rel_abu_exon{$j}{'mod'} = 'n/a'; printf LOG2 "rel_abundance\t$circ\t$validated_exon_length_sort[$j]\t$j\t%.3f\tn/a\n",$splice_in_count{$j}/$total_count; } }else{ for my $j(@{$key_unit_group{$id}}){ $rel_abu_exon{$j}{'raw'} = 'n/a'; $rel_abu_exon{$j}{'mod'} = 'n/a'; print LOG2 "rel_abundance\t$circ\t$validated_exon_length_sort[$j]\t$j\tn/a\tn/a\n"; } } } }else{ for my $id(@sort_key_unit_group){ my %splice_type4unit; while(my ($splice, $type_ref) = each %{$splice_unique_group_unit{$id}}){ while(my ($type, undef) = each %{$type_ref}){ my @types = split ':', $type; push @{$splice_type4unit{$types[1]}{$types[0]}}, $splice; } } my ($splice_out_count, $total_count); my %splice_in_count; if(exists $splice_type4unit{'all'}){ for my $splice (@{$splice_type4unit{'all'}{'all_out'}}){ print LOG2 "out=$splice\t"; $splice_out_count += $splice_supporting_read_count_clockwise{$splice}+$splice_supporting_read_count_anti{$splice}; $total_count += $splice_supporting_read_count_clockwise{$splice}+$splice_supporting_read_count_anti{$splice}; } } for my $j(@{$key_unit_group{$id}}){ my (%count_without_normalization); while (my ($type, $splice_ref) = each %{$splice_type4unit{$j}}){ for my $splice (@{$splice_ref}){ print LOG2 "$j=$type=$splice\t"; $count_without_normalization{$type} += $splice_supporting_read_count_clockwise{$splice}+$splice_supporting_read_count_anti{$splice}; } } if(exists $count_without_normalization{'right'} and exists $count_without_normalization{'left'}){ $total_count += ($count_without_normalization{'right'}+$count_without_normalization{'left'})/2; $splice_in_count{$j} = ($count_without_normalization{'right'}+$count_without_normalization{'left'})/2; }elsif(exists $count_without_normalization{'right'}){ $total_count += $count_without_normalization{'right'}; $splice_in_count{$j} = $count_without_normalization{'right'}; }elsif(exists $count_without_normalization{'left'}){ $total_count += $count_without_normalization{'left'}; $splice_in_count{$j} = $count_without_normalization{'left'}; } } print LOG2 "\n"; if(exists $splice_type4unit{'all'}){ for my $j(@{$key_unit_group{$id}}){ $SE_type_exon{$j} = 1; } } #if(@{$key_unit_group{$id}} >= 2){ # for my $j(@{$key_unit_group{$id}}){ # $AS_type_exon{$j} = 1; # } #} if($total_count > 0){ for my $j(@{$key_unit_group{$id}}){ $rel_abu_exon{$j}{'raw'} = sprintf("%.3f", $splice_in_count{$j}/$total_count); $rel_abu_exon{$j}{'mod'} = 'n/a'; printf LOG2 "rel_abundance\t$circ\t$validated_exon_length_sort[$j]\t$j\t%.3f\tn/a\n",$splice_in_count{$j}/$total_count; } }else{ for my $j(@{$key_unit_group{$id}}){ $rel_abu_exon{$j}{'raw'} = 'n/a'; $rel_abu_exon{$j}{'mod'} = 'n/a'; print LOG2 "rel_abundance\t$circ\t$validated_exon_length_sort[$j]\t$j\tn/a\tn/a\n"; } } } } for my $j(@key_unit_total){ unless(exists $SE_type_exon{$j}){ $SE_type_exon{$j} = 0; } unless(exists $AS_type_exon{$j}){ $AS_type_exon{$j} = 0; } #print ASCAT "$circ\t$validated_exon_length_sort[$j]\t$SE_type_exon{$j}\t$AS_type_exon{$j}\t0\t",$rel_abu_exon{$j}{'raw'},"\t",$rel_abu_exon{$j}{'mod'},"\n"; print ASCAT "$circ\t$validated_exon_length_sort[$j]\t"; if($SE_type_exon{$j} == 1){ print ASCAT "ES,"; }if($AS_type_exon{$j} == 1){ print ASCAT "A5SS,"; }if($AS_type_exon{$j} == 2){ print ASCAT "A3SS,"; } print ASCAT "\t",$rel_abu_exon{$j}{'raw'},"\t",$rel_abu_exon{$j}{'mod'},"\n"; } } if(@validated_intron_retained >= 1){ for my $exon_with_intron_retention(@validated_intron_retained){ print LOG2 "$circ\texon_with_intron_retention=$exon_with_intron_retention\n"; my ($exon_with_intron_start, $exon_with_intron_end) = split ':', $exon_with_intron_retention; my $splice = $validated_intron_retained_info{$exon_with_intron_retention}{'splice'}; my ($intron_start, $intron_end) = split ':', $splice; my $intron_length = $intron_end-$intron_start-1; my $start_unit_id = $exon_sort_id{$exon_with_intron_start.':'.$intron_start}; my $end_unit_id = $exon_sort_id{$intron_end.':'.$exon_with_intron_end}; if(scalar(keys %path) > 1){ print ASCAT "$circ\t$exon_with_intron_retention\tIR,\t", $validated_intron_retained_info{$exon_with_intron_retention}{'raw_psi'}, "\tn/a\n"; printf LOG2 "rel_abundance\t$circ\t$exon_with_intron_retention\t$start_unit_id->$end_unit_id\t%.3f\tn/a\n", $validated_intron_retained_info{$exon_with_intron_retention}{'raw_psi'}; ##altered after debug Oct.28th.15 }elsif(scalar(keys %path) == 1){ print LOG2 "start_exon=$exon_with_intron_start:$intron_start\n"; print LOG2 "end_exon=$intron_end:$exon_with_intron_end\n"; my ($insert_length_clockwise, $insert_length_anti); my %normalized_count; while(my ($i, $units_ref) = each %path){ for my $j(@{$units_ref}){ if($j <= $start_unit_id){ $insert_length_clockwise += $candidate_exons{ $validated_exon_length_sort[$j] }; }elsif($j >= $end_unit_id){ $insert_length_anti += $candidate_exons{ $validated_exon_length_sort[$j] }; } } } my ($splice_supporting_read_count_clockwise, $splice_supporting_read_count_anti) = (0, 0); for my $read_info(@{$junction_read_splicing{$splice}}){ my @paired_read_end_loci = &paired_read_end_mapping($read_info, $circ); if($paired_read_end_loci[0] == 2){ if($paired_read_end_loci[1] == 1){ $splice_supporting_read_count_clockwise ++; }else{ $splice_supporting_read_count_anti ++; } } } $normalized_count{'splice'} = &normalize_count4splice_intron_retention($splice_supporting_read_count_clockwise, $splice_supporting_read_count_anti, $insert_length_clockwise, $insert_length_anti); print LOG2 "splice:$splice:\tsplice_supporting_read_count_clockwise:$splice_supporting_read_count_clockwise\tsplice_supporting_read_count_anti:$splice_supporting_read_count_anti\tnormalized_count:",$normalized_count{'splice'},"\n"; my (%across_supporting_read_count_clockwise, %across_supporting_read_count_anti); for my $end($intron_start, $intron_end){ $across_supporting_read_count_clockwise{$end} = 0; $across_supporting_read_count_anti{$end} = 0; for my $read_info(@{$splice_across{$intron_start}}){ my @paired_read_end_loci = &paired_read_end_mapping([${$read_info}[0], ${$read_info}[2]], $circ, ${$read_info}[1]); if($paired_read_end_loci[0] == 2){ if($paired_read_end_loci[1] == 1){ $across_supporting_read_count_clockwise{$end} ++; }else{ $across_supporting_read_count_anti{$end} ++; } } } if($end == $intron_start){ $normalized_count{$end} = &normalize_count4splice_intron_retention($across_supporting_read_count_clockwise{$end}, $across_supporting_read_count_anti{$end}, $insert_length_clockwise, $insert_length_anti+$intron_length); }else{ $normalized_count{$end} = &normalize_count4splice_intron_retention($across_supporting_read_count_clockwise{$end}, $across_supporting_read_count_anti{$end}, $insert_length_clockwise+$intron_length, $insert_length_anti); } print LOG2 "end:$end:\tacross_supporting_read_count_clockwise:$across_supporting_read_count_clockwise{$end}\tcross_supporting_read_count_anti:$across_supporting_read_count_anti{$end}\tnormalized_count:$normalized_count{$end}\n"; } if($normalized_count{'splice'} > 0){ if($normalized_count{$intron_start} > 0 and $normalized_count{$intron_end} > 0 ){ printf ASCAT "$circ\t$exon_with_intron_retention\tIR,\t%.3f\t%.3f\n", $validated_intron_retained_info{$exon_with_intron_retention}{'raw_psi'}, ($normalized_count{$intron_start}+$normalized_count{$intron_end})/($normalized_count{$intron_start}+$normalized_count{$intron_end}+2*$normalized_count{'splice'}); printf LOG2 "rel_abundance\t$circ\t$exon_with_intron_retention\t$start_unit_id->$end_unit_id\t%.3f\t%.3f\n", $validated_intron_retained_info{$exon_with_intron_retention}{'raw_psi'}, ($normalized_count{$intron_start}+$normalized_count{$intron_end})/($normalized_count{$intron_start}+$normalized_count{$intron_end}+2*$normalized_count{'splice'}); }elsif($normalized_count{$intron_start} > 0){ printf ASCAT "$circ\t$exon_with_intron_retention\tIR,\t%.3f\t%.3f\n", $validated_intron_retained_info{$exon_with_intron_retention}{'raw_psi'}, $normalized_count{$intron_start}/($normalized_count{$intron_start}+$normalized_count{'splice'}); printf LOG2 "rel_abundance\t$circ\t$exon_with_intron_retention\t$start_unit_id->$end_unit_id\t%.3f\t%.3f\n", $validated_intron_retained_info{$exon_with_intron_retention}{'raw_psi'}, $normalized_count{$intron_start}/($normalized_count{$intron_start}+$normalized_count{'splice'}); }elsif($normalized_count{$intron_end} > 0){ printf ASCAT "$circ\t$exon_with_intron_retention\tIR,\t%.3f\t%.3f\n", $validated_intron_retained_info{$exon_with_intron_retention}{'raw_psi'}, $normalized_count{$intron_end}/($normalized_count{$intron_end}+$normalized_count{'splice'}); printf LOG2 "rel_abundance\t$circ\t$exon_with_intron_retention\t$start_unit_id->$end_unit_id\t%.3f\t%.3f\n", $validated_intron_retained_info{$exon_with_intron_retention}{'raw_psi'}, $normalized_count{$intron_end}/($normalized_count{$intron_end}+$normalized_count{'splice'}); }else{ print ASCAT "$circ\t$exon_with_intron_retention\tIR,\t", $validated_intron_retained_info{$exon_with_intron_retention}{'raw_psi'}, "\tn/a\n"; printf LOG2 "rel_abundance\t$circ\t$exon_with_intron_retention\t$start_unit_id->$end_unit_id\t%.3f\tn/a\n", $validated_intron_retained_info{$exon_with_intron_retention}{'raw_psi'}; } }else{ print ASCAT "$circ\t$exon_with_intron_retention\tIR,\t", $validated_intron_retained_info{$exon_with_intron_retention}{'raw_psi'}, "\tn/a\n"; printf LOG2 "rel_abundance\t$circ\t$exon_with_intron_retention\t$start_unit_id->$end_unit_id\t%.3f\tn/a\n", $validated_intron_retained_info{$exon_with_intron_retention}{'raw_psi'}; } ##altered after debug Oct.28th.15 }else{ print LOG2 "no path\n"; } } } }else{ #print ASOUT "n/a\t0\n"; print ASJAV "n/a\t"; } }else{ #print ASOUT "n/a\t0\n"; print ASJAV "n/a\t"; } for my $i($circ_start .. $circ_end){ if (exists $coverage{$chr}{$i}){ print ASJAV "$coverage{$chr}{$i},"; }else{ print ASJAV "0,"; } } print ASJAV "\t"; my @sort_splicing = sort {$a <=> $b or $a cmp $b} keys %junction_read_splicing; my %record_reads; for my $splice(@sort_splicing){ print ASJAV '<',$splice,"::"; for my $read_in_splice_info( @{$junction_read_splicing{$splice}}){ my $read_in_splice = ${$read_in_splice_info}[0]; $record_reads{$read_in_splice} ++; print ASJAV $read_in_splice,"("; for my $i (0 .. $#{$read_range{$read_in_splice}{'1'}{'start'}}){ #modified on 030715 according to modification in sub record_mapping_detail my @ranges = ($read_range{$read_in_splice}{'1'}{'chr'}[$i], $read_range{$read_in_splice}{'1'}{'start'}[$i], $read_range{$read_in_splice}{'1'}{'end'}[$i]); if($ranges[0] eq $chr and $ranges[1] >= $circ_start-6 and $ranges[2] <= $circ_end+6){ print ASJAV "$ranges[1]-$ranges[2]!",${$read_mapq{$read_in_splice}{'1'}}[$i],"|"; }else{ print ASJAV "NA|"; } } print ASJAV ")("; for my $i (0 .. $#{$read_range{$read_in_splice}{'0'}{'start'}}){ #modified on 030715 according to modification in sub record_mapping_detail my @ranges = ($read_range{$read_in_splice}{'0'}{'chr'}[$i], $read_range{$read_in_splice}{'0'}{'start'}[$i], $read_range{$read_in_splice}{'0'}{'end'}[$i]); if($ranges[0] eq $chr and $ranges[1] >= $circ_start-6 and $ranges[2] <= $circ_end+6){ print ASJAV "$ranges[1]-$ranges[2]!",${$read_mapq{$read_in_splice}{'0'}}[$i],"|"; }else{ print ASJAV "NA|"; } } print ASJAV ")"; } print ASJAV ">"; } print ASJAV "= $circ_start-6 and $ranges[2] <= $circ_end+6){ print ASJAV "$ranges[1]-$ranges[2]!",${$read_mapq{$read}{'1'}}[$i],"|"; }else{ print ASJAV "NA|"; } } print ASJAV ")("; for my $i (0 .. $#{$read_range{$read}{'0'}{'start'}}){ #modified on 030715 according to modification in sub record_mapping_detail my @ranges = ($read_range{$read}{'0'}{'chr'}[$i], $read_range{$read}{'0'}{'start'}[$i], $read_range{$read}{'0'}{'end'}[$i]); if($ranges[0] eq $chr and $ranges[1] >= $circ_start-6 and $ranges[2] <= $circ_end+6){ print ASJAV "$ranges[1]-$ranges[2]!",${$read_mapq{$read}{'0'}}[$i],"|"; }else{ print ASJAV "NA|"; } } print ASJAV ")"; } print ASJAV ">\n"; } $circ_num_now += @circ_chr_sort; &proc_bar($circ_num_now, scalar(keys %circ_info)); } &proc_bar($circ_num_now, scalar(keys %circ_info)); my $timeI = time; print "\n"; print "Time Usage:\n"; if($gtf == 1 or $gff >= 2){ printf "scanning annotation and recording known exons: Running time: %.1f minutes(s)\n", ($timeB-$timeA)/60; } printf "scanning circRNA list: Running time: %.1f minutes(s)\n", ($timeC-$timeB)/60; printf "clustering circRNAs: Running time: %.1f minutes(s)\n", ($timeD-$timeC)/60; printf "scanning SAM and calculate sequencing depths: Running time: %.1f minutes(s)\n", ($timeE-$timeD)/60; printf "splicing signal checking: Running time: %.1f minutes(s)\n", ($timeF-$timeE)/60; printf "clustering splice junctions: Running time: %.1f minutes(s)\n", ($timeH-$timeF)/60; #printf "filtering splice junctions: Running time: %.1f minutes(s)\n", ($timeH-$timeG)/60; printf "cirexon and alternative splicing prediction: Running time: %.1f minutes(s)\n", ($timeI-$timeH)/60; print LOG "Time Usage:\n"; if($gtf == 1 or $gff >= 2){ printf LOG "scanning annotation and recording known exons: Running time: %.1f minutes(s)\n", ($timeB-$timeA)/60; } printf LOG "scanning ciri_list: Running time: %.1f minutes(s)\n", ($timeC-$timeB)/60; printf LOG "clustering circRNAs: Running time: %.1f minutes(s)\n", ($timeD-$timeC)/60; printf LOG "scanning SAM and calculate sequencing depths: Running time: %.1f minutes(s)\n", ($timeE-$timeD)/60; printf LOG "splicing signal checking: Running time: %.1f minutes(s)\n", ($timeF-$timeE)/60; printf LOG "clustering splice junctions: Running time: %.1f minutes(s)\n", ($timeH-$timeF)/60; #printf LOG "filtering splice junctions: Running time: %.1f minutes(s)\n", ($timeH-$timeG)/60; printf LOG "cirexon and alternative splicing prediction: Running time: %.1f minutes(s)\n", ($timeI-$timeH)/60; sub normalize_count4splice_intron_retention{ #&normalize_count4splice_intron_retention($across_supporting_read_count_clockwise{$end}, $across_supporting_read_count_anti{$end}, $insert_length_clockwise, $insert_length_anti+$intron_length); my @read_count = ($_[1], $_[0]); my @IL_splice_type = ($_[3], $_[2]); my %prob_distribution_ori; for my $i(0 .. 1){ my $circle_length = $IL_splice_type[1-$i] + $IL_splice_type[$i]; my $circle_num = 0; my $prob_distribution; while(1){ ##altered after debug Oct.28th.15 if($circle_num > 5){ print LOG2 "error:seems infinite circles in script!\n"; last; } ##altered after debug Oct.28th.15 my $IL_length = $IL_splice_type[$i] + $circle_num*$circle_length; print LOG2 "clockwise:$i"; if($IL_length >= $sort_insert_length[int(@sort_insert_length*.975)-1]){ print LOG2 "=IL_length:$IL_length>=",$sort_insert_length[int(@sort_insert_length*.975)-1],"\n"; last; }elsif($IL_length <= $sort_insert_length[int(@sort_insert_length*.025)-1]){ print LOG2 "=IL_length:$IL_length<=",$sort_insert_length[int(@sort_insert_length*.025)-1],"\n"; $circle_num ++; next; }else{ for my $j(-2 .. 2){ $prob_distribution += $insert_length_distribution{$IL_length+$j}; print LOG2 "=IL_length:$IL_length=circle_num:$circle_num;deviation:$j;prob_distribution:",$insert_length_distribution{$IL_length+$j},","; } print LOG2 "\n"; ##altered after debug Oct.28th.15 $circle_num ++; } } $prob_distribution_ori{$i} = $prob_distribution if (defined $prob_distribution and $prob_distribution>0); } if(exists $prob_distribution_ori{'0'} and exists $prob_distribution_ori{'1'}){ if($read_count[0] >= 20*$read_count[1] and $prob_distribution_ori{'0'}>0){ return $read_count[0]/$prob_distribution_ori{'0'}; }elsif($read_count[1] >= 20*$read_count[0] and $prob_distribution_ori{'1'}>0){ return $read_count[1]/$prob_distribution_ori{'1'}; }elsif($prob_distribution_ori{'0'}>0 and $prob_distribution_ori{'1'}>0){ return ($read_count[0]/$prob_distribution_ori{'0'}+$read_count[1]/$prob_distribution_ori{'1'})/2; }else{ return 'n/a'; } }elsif(exists $prob_distribution_ori{'0'} and $prob_distribution_ori{'0'}>0){ return $read_count[0]/$prob_distribution_ori{'0'}; }elsif(exists $prob_distribution_ori{'1'} and $prob_distribution_ori{'1'}>0){ return $read_count[1]/$prob_distribution_ori{'1'}; }else{ return 'n/a'; } } sub normalize_count4splice{ #&normalize_count4splice($splice_supporting_read_count_clockwise{$splice}, $splice_supporting_read_count_anti{$splice}, \%{$insert_length_clockwise{$splice}}, \%{$insert_length_anti{$splice}}) my @read_count = ($_[1], $_[0]); my @IL_splice_type = ($_[3], $_[2]); my %prob_distribution_ori; for my $i(0 .. 1){ if(scalar(keys %{$IL_splice_type[$i]}) == 1){ my @IL_clockwise = keys %{$IL_splice_type[$i]}; my %circle_length; while (my ($IL_anti, undef) = each %{$IL_splice_type[1-$i]}){ $circle_length{$IL_clockwise[0]+$IL_anti} = 1; } my @circ_length_sort = sort {$a <=> $b}(keys %circle_length); if(@circ_length_sort == 1){ my $circle_num = 0; my $prob_distribution; while(1){ ##altered after debug Oct.28th.15 if($circle_num > 5){ print LOG2 "error:seems infinite circles in script!\n"; last; } ##altered after debug Oct.28th.15 my $IL_length = $IL_clockwise[0] + $circle_num*$circ_length_sort[0]; print LOG2 "clockwise:$i"; if($IL_length >= $sort_insert_length[int(@sort_insert_length*.975)-1]){ print LOG2 "=IL_length:$IL_length>=",$sort_insert_length[int(@sort_insert_length*.975)-1],"\n"; last; }elsif($IL_length <= $sort_insert_length[int(@sort_insert_length*.025)-1]){ print LOG2 "=IL_length:$IL_length<=",$sort_insert_length[int(@sort_insert_length*.025)-1],"\n"; $circle_num ++; next; }else{ for my $j(-2 .. 2){ $prob_distribution += $insert_length_distribution{$IL_length+$j}; print LOG2 "=IL_length:$IL_length=circle_num:$circle_num;deviation:$j;prob_distribution:",$insert_length_distribution{$IL_length+$j},","; } print LOG2 "\n"; ##altered after debug Oct.28th.15 $circle_num ++; } } $prob_distribution_ori{$i} = $prob_distribution if (defined $prob_distribution and $prob_distribution>0); #last; }elsif($circ_length_sort[0]+$IL_clockwise[0] >= $sort_insert_length[int(@sort_insert_length*.975)-1]){ my $prob_distribution; for my $j(-2 .. 2){ $prob_distribution += $insert_length_distribution{$IL_clockwise[0]+$j}; print LOG2 "clockwise:$i=IL_length:$IL_clockwise[0]=circle_num:0;deviation:$j;prob_distribution:",$insert_length_distribution{$IL_clockwise[0]+$j},","; } print LOG2 "\n"; $prob_distribution_ori{$i} = $prob_distribution if (defined $prob_distribution and $prob_distribution>0); } } } if(exists $prob_distribution_ori{'0'} and exists $prob_distribution_ori{'1'}){ #my @ori_sort = sort {$read_count[$a] <=> $read_count[$b]} (0,1); if($read_count[0] >= 20*$read_count[1] and $prob_distribution_ori{'0'}>0){ return $read_count[0]/$prob_distribution_ori{'0'}; }elsif($read_count[1] >= 20*$read_count[0] and $prob_distribution_ori{'1'}>0){ return $read_count[1]/$prob_distribution_ori{'1'}; }elsif($prob_distribution_ori{'0'}>0 and $prob_distribution_ori{'1'}>0){ return ($read_count[0]/$prob_distribution_ori{'0'}+$read_count[1]/$prob_distribution_ori{'1'})/2; }else{ return 'n/a'; } }elsif(exists $prob_distribution_ori{'0'} and $prob_distribution_ori{'0'}>0){ return $read_count[0]/$prob_distribution_ori{'0'}; }elsif(exists $prob_distribution_ori{'1'} and $prob_distribution_ori{'1'}>0){ return $read_count[1]/$prob_distribution_ori{'1'}; }else{ return 'n/a'; } } ##debugged sub route_comparison{ my %route_in_path_length = %{$_[0]}; my (%all_unit, %key_units); my @units_in_route = values %route_in_path_length; my %route_comparison_length; for my $i (0 .. $#units_in_route){ for my $x (@{$units_in_route[$i]}){ $all_unit{$x} ++; } } while(my ($x, $times) = each %all_unit){ if ($times < @units_in_route){ $key_units{$x} = 1; } } while(my ($length, $route_ref) = each %route_in_path_length){ for my $x (@{$route_ref}){ if(exists $key_units{$x}){ $route_comparison_length{$length}{$x} = 1; } } } while(my ($x, $times) = each %key_units){ while(my ($length, $route_ref) = each %route_in_path_length){ if (!exists $route_comparison_length{$length}{$x}){ $route_comparison_length{$length}{$x} = 0; } } } \%route_comparison_length; } sub paired_read_end_mapping{ #my @paired_read_end_loci = &paired_read_end_mapping($read_info, $circ); my ($read_info, $circ, $if_reverse, $read, $seg_id); if(@_ == 2){ ($read_info, $circ) = @_; ($read, $seg_id) = @{$read_info}; $if_reverse = $positive_reads{$read}{$seg_id}{'if_reverse'}; }else{ ($read_info, $circ, $if_reverse) = @_; ($read, $seg_id) = @{$read_info}; } my $not_reverse; if($if_reverse == 0){ $not_reverse = 1; }else{ $not_reverse = 0; } my @back_splice; my ($chr, $circ_start, $circ_end) = split /[:|]/, $circ; my (@exon0_id_group, @last_seg, @first_seg, @results, @result0); #@insert_length, print LOG2 "$read\tif_reverse:$if_reverse\t";#exons:"; for my $i(0 .. 1){ my @sort_read_mapping = sort {$read_range{$read}{$i}{'read_st'}[$a] <=> $read_range{$read}{$i}{'read_st'}[$b] or $read_range{$read}{$i}{'read_ed'}[$a] <=> $read_range{$read}{$i}{'read_ed'}[$b]} (0 .. $#{$read_range{$read}{$i}{'read_st'}}); for my $j(0 .. $#sort_read_mapping){ my $l = $sort_read_mapping[$j]; if($read_range{$read}{$i}{'chr'}[$l] eq $chr and $read_range{$read}{$i}{'start'}[$l] >= $circ_start-6 and $read_range{$read}{$i}{'end'}[$l] <= $circ_end+6){ $first_seg[$i] = $l; last; } } for my $j(0 .. $#sort_read_mapping){ my $l = $sort_read_mapping[$#sort_read_mapping-$j]; if($read_range{$read}{$i}{'chr'}[$l] eq $chr and $read_range{$read}{$i}{'start'}[$l] >= $circ_start-6 and $read_range{$read}{$i}{'end'}[$l] <= $circ_end+6){ $last_seg[$i] = $l; last; } } print LOG2 "first_seg($i):$first_seg[$i];last_seg($i):$last_seg[$i]\t"; print LOG2 "start_loci($i):".$read_range{$read}{$i}{'start'}[$first_seg[$i]].";end_loci($i):".$read_range{$read}{$i}{'end'}[$last_seg[$i]]."\t"; } if (@last_seg == 2 and @first_seg == 2){ my @insert_length_calculation; for my $i (0 .. 1){ if($read_range{$read}{$i}{'start'}[$first_seg[$i]] >= $read_range{$read}{$i}{'end'}[$last_seg[$i]]){ $back_splice[$i] = 1; }elsif(abs($read_range{$read}{$i}{'start'}[$first_seg[$i]] - $circ_start)<=6){ $back_splice[$i] = 2; }elsif(abs($read_range{$read}{$i}{'end'}[$last_seg[$i]] - $circ_end)<=6){ $back_splice[$i] = 3; }else{ $back_splice[$i] = 0; } } print LOG2 "read0:".$read_range{$read}{'0'}{'start'}[$first_seg[0]].','.$read_range{$read}{'0'}{'end'}[$last_seg[0]]."-$back_splice[0]\t"; print LOG2 "read1:".$read_range{$read}{'1'}{'start'}[$first_seg[1]].','.$read_range{$read}{'1'}{'end'}[$last_seg[1]]."-$back_splice[1]\t"; ### [ ------- ] $back_splice[$i] = 0 ### [-> <----] $back_splice[$i] = 1 ### [<------ ] $back_splice[$i] = 2 ### [ ------>] $back_splice[$i] = 3 my @read_PE_mapping = ($read_range{$read}{$if_reverse}{'start'}[$first_seg[$if_reverse]], $read_range{$read}{$if_reverse}{'end'}[$last_seg[$if_reverse]], $read_range{$read}{$not_reverse}{'start'}[$first_seg[$not_reverse]], $read_range{$read}{$not_reverse}{'end'}[$last_seg[$not_reverse]]); ### [ 0---*---1 ] ### [ 2-------3 ] [->3 2<----] 2[<------3 ] [ 2------>]3 print LOG2 "read_PE_mapping:"; print LOG2 "$_," for @read_PE_mapping; print LOG2 "\t"; if($back_splice[$if_reverse] == 0 and $back_splice[$not_reverse] >= 1){ ### [ 0---*---1 ] [ 0---*---1 ] [ 0---*---1 ] [ 0-----*-----1 ] [ 0---*---1] [ 0---*---1 ] [ 0---*---1 ] [ 0---*---1 ] ### 2[<------3 ] [ 2------>]3 [->3 2<----] [--->3 2<--------] [--->3 2<--] [ 2------>]3 [--->3 2<--] 2[<------3 ] if( $read_PE_mapping[0] >= $read_PE_mapping[3] and $back_splice[$not_reverse] == 2){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[2]-1, '+']; push @insert_length_calculation, [$read_PE_mapping[3], $read_PE_mapping[0]-1, '+']; }elsif( $read_PE_mapping[1] <= $read_PE_mapping[2] and $back_splice[$not_reverse] == 3){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[2]-1, '+']; push @insert_length_calculation, [$read_PE_mapping[3], $read_PE_mapping[0]-1, '+']; }elsif( $read_PE_mapping[0] >= $read_PE_mapping[3] and $read_PE_mapping[1] <= $read_PE_mapping[2] ){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[2]-1, '+']; push @insert_length_calculation, [$read_PE_mapping[3], $read_PE_mapping[0]-1, '+']; }elsif( $read_PE_mapping[0] <= $read_PE_mapping[3] and $read_PE_mapping[1] >= $read_PE_mapping[2] and $back_splice[$not_reverse] == 1){ push @insert_length_calculation, [$read_PE_mapping[2], $read_PE_mapping[1]+1, '-']; push @insert_length_calculation, [$read_PE_mapping[0], $read_PE_mapping[3]+1, '-']; }elsif( $read_PE_mapping[0] <= $read_PE_mapping[2] and $read_PE_mapping[1] >= $read_PE_mapping[2] and $read_PE_mapping[0] >= $read_PE_mapping[3] and $back_splice[$not_reverse] == 1){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[3], 0]; push @insert_length_calculation, [$read_PE_mapping[3], $read_PE_mapping[0]-1, '+']; }elsif( $read_PE_mapping[0] <= $read_PE_mapping[2] and $read_PE_mapping[1] >= $read_PE_mapping[2] and $back_splice[$not_reverse] == 3){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[3], 0]; push @insert_length_calculation, [$read_PE_mapping[3], $read_PE_mapping[0]-1, '+']; }elsif( $read_PE_mapping[0] <= $read_PE_mapping[3] and $read_PE_mapping[1] >= $read_PE_mapping[3] and $read_PE_mapping[1] <= $read_PE_mapping[2] and $back_splice[$not_reverse] == 1 ){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[2]-1, '+']; push @insert_length_calculation, [$read_PE_mapping[2], $read_PE_mapping[0], 0]; }elsif( $read_PE_mapping[0] <= $read_PE_mapping[3] and $read_PE_mapping[1] >= $read_PE_mapping[3] and $back_splice[$not_reverse] == 2){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[2]-1, '+']; push @insert_length_calculation, [$read_PE_mapping[2], $read_PE_mapping[0], 0]; }else{ print LOG2 "0,n/a,-10\n"; return (0, 'n/a'); } print LOG2 "insert_length_calculation(clockwise):"; print LOG2 "$_," for @{$insert_length_calculation[0]}; print LOG2 ";insert_length_calculation(anti):"; print LOG2 "$_," for @{$insert_length_calculation[1]}; print LOG2 "\n"; (scalar(@insert_length_calculation), $if_reverse, @insert_length_calculation); }elsif($back_splice[$if_reverse] >= 1 and $back_splice[$not_reverse] >= 1){ if($back_splice[$if_reverse] == 1 and $back_splice[$not_reverse] == 1){ ### [-*---1 0--] [--1 0---*-] ### [->3 2<----] [---->3 2<-] if($read_PE_mapping[0] >= $read_PE_mapping[2] and $read_PE_mapping[1] <= $read_PE_mapping[2]){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[2]-1, '+']; push @insert_length_calculation, [$read_PE_mapping[2], $read_PE_mapping[0], 0]; }elsif($read_PE_mapping[0] <= $read_PE_mapping[2] and $read_PE_mapping[0] >= $read_PE_mapping[3]){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[3], 0]; push @insert_length_calculation, [$read_PE_mapping[3], $read_PE_mapping[0]-1, '+']; }else{ print LOG2 "0,n/a,-20\n"; return (0, 'n/a'); } }elsif($back_splice[$if_reverse] == 1 and $back_splice[$not_reverse] == 2){ ### [-*---1 0--] ### 2[<----->3 ] if($read_PE_mapping[0] >= $read_PE_mapping[3] and $read_PE_mapping[3] >= $read_PE_mapping[1]){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[3], 0]; push @insert_length_calculation, [$read_PE_mapping[3], $read_PE_mapping[0]-1, '+']; }else{ print LOG2 "0,n/a,-30\n"; return (0, 'n/a'); } }elsif($back_splice[$if_reverse] == 1 and $back_splice[$not_reverse] == 3){ ### [-*---1 0--] ### [ 2<----->]3 if($read_PE_mapping[2] >= $read_PE_mapping[1] and $read_PE_mapping[2] <= $read_PE_mapping[0]){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[2]-1, '+']; push @insert_length_calculation, [$read_PE_mapping[2], $read_PE_mapping[0], 0]; }else{ print LOG2 "0,n/a,-40\n"; return (0, 'n/a'); } }elsif($back_splice[$if_reverse] == 2 and $back_splice[$not_reverse] == 1){ ### 0[---*---1 ] ### [->3 2<----] if($read_PE_mapping[1] >= $read_PE_mapping[3] and $read_PE_mapping[2] >= $read_PE_mapping[1]){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[2]-1, '+']; push @insert_length_calculation, [$read_PE_mapping[2], $read_PE_mapping[0], 0]; }else{ print LOG2 "0,n/a,-50\n"; return (0, 'n/a'); } }elsif($back_splice[$if_reverse] == 3 and $back_splice[$not_reverse] == 1){ ### [ 0---*---]1 ### [->3 2<----] if($read_PE_mapping[0] >= $read_PE_mapping[3] and $read_PE_mapping[2] >= $read_PE_mapping[0]){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[3], 0]; push @insert_length_calculation, [$read_PE_mapping[3], $read_PE_mapping[0]-1, '+']; }else{ print LOG2 "0,n/a,-60\n"; return (0, 'n/a'); } }elsif($back_splice[$if_reverse] == 2 and $back_splice[$not_reverse] == 2){ ### 0[---*--1 ] ### 2[<----->3 ] if($read_PE_mapping[1] <= $read_PE_mapping[3]){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[3], 0]; push @insert_length_calculation, [$read_PE_mapping[3], $read_PE_mapping[0]-1, '+']; }else{ ### 0[---*---1 ] ### 2[<---->3 ] push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[2]-1, '+']; push @insert_length_calculation, [$read_PE_mapping[3], $read_PE_mapping[1], 0]; } }elsif($back_splice[$if_reverse] == 3 and $back_splice[$not_reverse] == 3){ ### [ 0--*---]1 ### [ 2<----->]3 if($read_PE_mapping[0] >= $read_PE_mapping[2]){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[2]-1, '+']; push @insert_length_calculation, [$read_PE_mapping[2], $read_PE_mapping[0], 0]; }else{ ### [ 0---*---]1 ### [ 2<---->]3 push @insert_length_calculation, [$read_PE_mapping[0], $read_PE_mapping[2], 0]; push @insert_length_calculation, [$read_PE_mapping[3], $read_PE_mapping[0]-1, '+']; } }elsif($back_splice[$if_reverse] == 3 and $back_splice[$not_reverse] == 2){ ### [ 0---*---]1 ### 2[<----->3 ] if($read_PE_mapping[0] >= $read_PE_mapping[3]){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[2]-1, '+']; push @insert_length_calculation, [$read_PE_mapping[3], $read_PE_mapping[0]-1, '+']; }else{ print LOG2 "0,n/a,-70\n"; return (0, 'n/a'); } }elsif($back_splice[$if_reverse] == 2 and $back_splice[$not_reverse] == 3){ ### 0[---*---1 ] ### [ 2<----->]3 if($read_PE_mapping[1] <= $read_PE_mapping[2]){ push @insert_length_calculation, [$read_PE_mapping[1], $read_PE_mapping[2]-1, '+']; push @insert_length_calculation, [$read_PE_mapping[3], $read_PE_mapping[0]-1, '+']; }else{ print LOG2 "0,n/a,-80\n"; return (0, 'n/a'); } }else{ print LOG2 "0,n/a,-100\n"; return (0, 'n/a'); } print LOG2 "insert_length_calculation(proximal):"; print LOG2 "$_," for @{$insert_length_calculation[0]}; print LOG2 ";insert_length_calculation(remote):"; print LOG2 "$_," for @{$insert_length_calculation[1]}; print LOG2 "\n"; (scalar(@insert_length_calculation), $if_reverse, @insert_length_calculation); }elsif($back_splice[$if_reverse] >= 1){ print LOG2 "0,n/a,-1\n"; (0, 'n/a'); }elsif($back_splice[$if_reverse] == 0){ print LOG2 "0,n/a,-2\n"; (0, 'n/a'); }else{ print LOG2 "0,n/a,-3\n"; (0, 'n/a'); } }else{ print LOG2 "-1,n/a,-4\n"; (-1, 'n/a'); } } sub exon_length_estimation{ my $read = shift; my $long_exon = shift; my ($exon_chr, $exon_start, $exon_end) = split ':', $long_exon; my (%line, %start, %end, %length); my $library_length; for my $i(0 .. 1){ @{$line{$i}} = split /\t/, $_[$i]; my $ten2b7 = &ten2b(${$line{$i}}[1],7); my $msid = &MSID(${$line{$i}}[5]); $start{$i} = ${$line{$i}}[3]; $end{$i} = ${$line{$i}}[3] + ${$msid}[-1] - 1; $length{$i} = ${$msid}[-1]; print LIBLEN "$read\t$ten2b7\t$start{$i}\t$end{$i}\t$length{$i}\t$exon_chr\t$exon_start\t$exon_end\n"; } my @sort_mapping_index = sort {$start{$a} <=> $start{$b} or $end{$a} <=> $end{$b}} (0,1); if( abs($length{$sort_mapping_index[1]} - $read_length) <= 5 and abs($length{$sort_mapping_index[0]} - $read_length) <= 5 and $start{$sort_mapping_index[0]} >= $exon_start-4 and $end{$sort_mapping_index[1]} <= $exon_end+4 ){ $library_length = ($start{$sort_mapping_index[1]} - $start{$sort_mapping_index[0]} + $end{$sort_mapping_index[1]} - $end{$sort_mapping_index[0]})/2; print LIBLEN "0:\t$start{$sort_mapping_index[0]},$start{$sort_mapping_index[1]}\t$end{$sort_mapping_index[0]},$end{$sort_mapping_index[1]}\t$library_length\n"; #push @library_lengths_all, $library_length; }elsif( abs($start{$sort_mapping_index[0]}-$exon_start) <= 4 and abs($start{$sort_mapping_index[1]}-$exon_start) <= 4 ){ $library_length = $end{$sort_mapping_index[1]} - $end{$sort_mapping_index[0]}; print LIBLEN "1:\t$start{$sort_mapping_index[0]},$start{$sort_mapping_index[1]}\t$end{$sort_mapping_index[0]},$end{$sort_mapping_index[1]}\t$library_length\n"; #push @library_lengths_all, $library_length; }elsif( abs($end{$sort_mapping_index[1]}-$exon_end) <= 4 and abs($end{$sort_mapping_index[0]}-$exon_end) <= 4 ){ $library_length = $start{$sort_mapping_index[1]} - $start{$sort_mapping_index[0]}; print LIBLEN "2:\t$start{$sort_mapping_index[0]},$start{$sort_mapping_index[1]}\t$end{$sort_mapping_index[0]},$end{$sort_mapping_index[1]}\t$library_length\n"; #push @library_lengths_all, $library_length; }elsif( abs($start{$sort_mapping_index[0]}-$exon_start) <= 4 and $length{$sort_mapping_index[1]} == $read_length ){ $library_length = $end{$sort_mapping_index[1]} - $end{$sort_mapping_index[0]}; print LIBLEN "3:\t$start{$sort_mapping_index[0]},$start{$sort_mapping_index[1]}\t$end{$sort_mapping_index[0]},$end{$sort_mapping_index[1]}\t$library_length\n"; #push @library_lengths_all, $library_length; }elsif( abs($end{$sort_mapping_index[1]}-$exon_end) <= 4 and $length{$sort_mapping_index[0]} == $read_length ){ $library_length = $start{$sort_mapping_index[1]} - $start{$sort_mapping_index[0]}; print LIBLEN "4:\t$start{$sort_mapping_index[0]},$start{$sort_mapping_index[1]}\t$end{$sort_mapping_index[0]},$end{$sort_mapping_index[1]}\t$library_length\n"; #push @library_lengths_all, $library_length; } push @library_lengths_all, int($library_length); } sub record_mapping_detail{ my $read = shift; for (@_){ my @line = split /\t/; my $msid = &MSID_start($line[5]); my $ten2b5 = &ten2b($line[1], 5); next unless ${$msid}[-1] > 0; #modified on 030715 from string of range to two mapping ends of the read push @{$read_range{$read}{$ten2b5}{'chr'}}, $line[2]; push @{$read_range{$read}{$ten2b5}{'start'}}, $line[3]; push @{$read_range{$read}{$ten2b5}{'end'}}, $line[3]+${$msid}[-1]-1; push @{$read_range{$read}{$ten2b5}{'read_st'}}, ${$msid}[1]; push @{$read_range{$read}{$ten2b5}{'read_ed'}}, ${$msid}[1]+${$msid}[2]-1; push @{$read_mapq{$read}{$ten2b5}}, $line[4]; } } sub MSID_start{ #my($M_sum, $S_num, $S_num1, $S_num2, @I_num, @D_num, $I_sum, $D_sum, $pos_adjustment, $length); my @counts = split /[MSIDH]/, $_[0]; if (@counts == 1){ if ($_[0] eq "${read_length}M"){ [0, 1, $read_length, $read_length]; }else{ [0, undef, undef, -1]; } }else{ $_[0] =~ s/H/S/g; my @styles = split /\d+/, $_[0]; shift @styles; if (@counts == 2){ if ($styles[0] eq 'M' and $styles[1] eq 'S'){ [1, 1, $counts[0], $counts[0]]; }elsif ($styles[0] eq 'S' and $styles[1] eq 'M'){ [-1, $counts[0]+1, $counts[1], $counts[1]]; } }elsif (@counts == 3){ if ($styles[0] eq 'S' and $styles[2] eq 'S'){ [10, $counts[0]+1, $counts[1], $counts[1]]; }elsif ($styles[0] eq 'M' and $styles[1] eq 'D' and $styles[2] eq 'M'){ [0, 1, $read_length, $read_length+$counts[1]]; }elsif ($styles[0] eq 'M' and $styles[1] eq 'I' and $styles[2] eq 'M'){ [0, 1, $read_length, $read_length-$counts[1]]; } }elsif ($styles[0] eq 'M' and $styles[-1] eq 'S'){ my ($M_sum, $D_sum, $I_sum); for my $i(0 .. $#styles-1){ if ($styles[$i] eq 'M'){ $M_sum += $counts[$i]; }elsif($styles[$i] eq 'D'){ $D_sum += $counts[$i]; }elsif($styles[$i] eq 'I'){ $I_sum += $counts[$i]; } } [1, 1, $M_sum+$I_sum, $M_sum+$D_sum]; }elsif ($styles[0] eq 'S' and $styles[-1] eq 'M'){ my ($M_sum, $D_sum, $I_sum); for my $i(1 .. $#styles){ if ($styles[$i] eq 'M'){ $M_sum += $counts[$i]; }elsif($styles[$i] eq 'D'){ $D_sum += $counts[$i]; }elsif($styles[$i] eq 'I'){ $I_sum += $counts[$i]; } } [-1, $counts[0]+1, $M_sum+$I_sum, $M_sum+$D_sum]; }elsif ($styles[0] eq 'M' and $styles[-1] eq 'M'){ my ($M_sum, $D_sum); for my $i(0 .. $#styles){ if ($styles[$i] eq 'M'){ $M_sum += $counts[$i]; }elsif($styles[$i] eq 'D'){ $D_sum += $counts[$i]; } } [0, 1, $read_length, $M_sum+$D_sum]; }elsif ($styles[0] eq 'S' and $styles[-1] eq 'S'){ my ($M_sum, $D_sum, $I_sum); for my $i(1 .. $#styles-1){ if ($styles[$i] eq 'M'){ $M_sum += $counts[$i]; }elsif($styles[$i] eq 'D'){ $D_sum += $counts[$i]; }elsif($styles[$i] eq 'I'){ $I_sum += $counts[$i]; } } [10, $counts[0]+1, $M_sum+$I_sum, $M_sum+$D_sum]; } } #\@cigar_units; } sub intron_retention_validation { #&intron_retention_validation($circ_chr, $cand_exon_start, $cand_exon_end, $exon_end_across_count{$last_exon_end}, $exon_end_across_count{$foremost_exon_start}, scalar(@{$junction_read_splicing{$last_exon_end.':'.$foremost_exon_start}}), $last_exon_end, $foremost_exon_start, \%junction_read_coverage); my ($chr, $start, $end, $tag_start, $tag_end, $inner_supporting_read_count, $last_exon_end, $foremost_exon_start, $ref_junction_cov) = @_; #exon_end_mt2_num, $exon_start_mt2_num, my (@U_pre, @U_after, $sample_size4U_test); my ($U_pre_total, $U_after_total, $Z_pre_total, $Z_after_total); my (@Z_pre, @Z_after, $decline_H1_count_pre, $decline_H1_count_after); #$Z_alpha, my ($total_cov0, $max_junc_gap, $cont_junc_cov0, $junc_cov0_count) = (0, 0, 0, 0); my $last_group = int(($foremost_exon_start-$last_exon_end-2)/($min_intron-5)); my @cov_all; for my $i($start .. $end){ my $cov; my $group = -1; if ($i > $last_exon_end and $i < $foremost_exon_start){ $group = int(($i-$last_exon_end-1)/($min_intron-5)); } if(exists $coverage{$chr}{$i}){ print LOG "$coverage{$chr}{$i}"; $cov = $coverage{$chr}{$i}; }else{ $total_cov0 ++; print LOG "0"; $cov = 0; } if(exists ${$ref_junction_cov}{$i}){ print LOG "(${$ref_junction_cov}{$i}),"; $cont_junc_cov0 = 0 if ${$ref_junction_cov}{$i} >0; }else{ $junc_cov0_count ++; $cont_junc_cov0 ++; $max_junc_gap = $cont_junc_cov0 if $cont_junc_cov0 > $max_junc_gap; print LOG "(0),"; } if($group>=0){ for my $j(6 .. $min_intron){ if($cov > $coverage{$chr}{$start-$j}){ $U_pre[$group] ++; $U_pre_total ++; }elsif($cov == $coverage{$chr}{$start-$j}){ $U_pre[$group] += .5; $U_pre_total += .5; } if($cov > $coverage{$chr}{$end+$j}){ $U_after[$group] ++; $U_after_total ++; }elsif($cov == $coverage{$chr}{$end+$j}){ $U_after[$group] += .5; $U_after_total += .5; } } } push @cov_all, $cov; } my @cov_sort = sort {$a <=> $b} @cov_all; if($last_group > 0){ for my $i($foremost_exon_start-$min_intron+5 .. $foremost_exon_start-($foremost_exon_start-$last_exon_end-2)%($min_intron-5)-2){ for my $j(6 .. $min_intron){ if($coverage{$chr}{$i} > $coverage{$chr}{$start-$j}){ $U_pre[-1] ++; }elsif($coverage{$chr}{$i} == $coverage{$chr}{$start-$j}){ $U_pre[-1] += .5; } if($coverage{$chr}{$i} > $coverage{$chr}{$end+$j}){ $U_after[-1] ++; }elsif($coverage{$chr}{$i} == $coverage{$chr}{$end+$j}){ $U_after[-1] += .5; } } } $sample_size4U_test = $min_intron-5; }else{ $sample_size4U_test = $foremost_exon_start-$last_exon_end-1; } print LOG "\tpre:"; for my $i($start-$min_intron .. $start-6){ if (exists $coverage{$chr}{$i}){ print LOG "$coverage{$chr}{$i},"; }else{ print LOG "0,"; } } printf LOG "\tafter:"; for my $i($end+6 .. $end+$min_intron){ if (exists $coverage{$chr}{$i}){ print LOG "$coverage{$chr}{$i},"; }else{ print LOG "0,"; } } printf LOG "\t"; for my $i(0 .. $last_group){ if($sample_size4U_test>=$min_intron-5){ $Z_pre[$i] = &Z_calculation($U_pre[$i], $sample_size4U_test, $min_intron-5); $Z_after[$i] = &Z_calculation($U_after[$i], $sample_size4U_test, $min_intron-5); }else{ $Z_pre[$i] = 'n/a'; $Z_after[$i] = 'n/a'; } $decline_H1_count_pre ++ if $Z_pre[$i] <= $Z_alpha; $decline_H1_count_after ++ if $Z_after[$i] <= $Z_alpha; print LOG "U_pre:$U_pre[$i];Z_pre:$Z_pre[$i];U_after:$U_after[$i];Z_after:$Z_after[$i],"; } print LOG "\ttag_start:$tag_start\ttag_end:$tag_end\tlast_exon_end:$last_exon_end\tforemost_exon_start:$foremost_exon_start\ttotal_cov0:$total_cov0;junc_cov0_count:$junc_cov0_count;max_junc_gap:$max_junc_gap\tZ_alpha:$Z_alpha\t"; #U_pre:$U_pre;U_after:$U_after;Z_pre:$Z_pre;Z_after:$Z_after\t"; #.05:1.6449, .01:2.3263 if($foremost_exon_start-$last_exon_end-1 > $min_intron){ $Z_pre_total = &Z_calculation($U_pre_total, $foremost_exon_start-$last_exon_end-1, $min_intron-5); $Z_after_total = &Z_calculation($U_after_total, $foremost_exon_start-$last_exon_end-1, $min_intron-5); }else{ $Z_pre_total = 'n/a'; $Z_after_total = 'n/a'; } print LOG "U_pre_total:$U_pre_total;U_after_total:$U_after_total;Z_pre_total:$Z_pre_total;Z_after_total:$Z_after_total\t"; if ($tag_start == 0) { 0; [0, $cov_sort[int($#cov_sort/2)]]; }elsif ($tag_end == 0) { -1; [-1, $cov_sort[int($#cov_sort/2)]]; }elsif ($inner_supporting_read_count <= 2) { -2; [-2, $cov_sort[int($#cov_sort/2)]]; }elsif ($foremost_exon_start-$last_exon_end-1 < $min_intron) { -4; [-4, $cov_sort[int($#cov_sort/2)]]; }elsif ($total_cov0>0) { -5; [-5, $cov_sort[int($#cov_sort/2)]]; }elsif ($decline_H1_count_pre > 0) { -6; [-6, $cov_sort[int($#cov_sort/2)]]; }elsif ($decline_H1_count_after > 0) { -7; [-7, $cov_sort[int($#cov_sort/2)]]; }elsif ($Z_pre_total <= $Z_alpha) { -8; [-8, $cov_sort[int($#cov_sort/2)]]; }elsif ($Z_after_total <= $Z_alpha) { -9; [-9, $cov_sort[int($#cov_sort/2)]]; }else{ 1; [1, $cov_sort[int($#cov_sort/2)]]; } } sub Z_calculation{ my ($Wxy, $n, $m) = @_; my $Wy = $Wxy+$n*($n+1)/2; my $N = $n+$m; my $Z = ($Wy-$n*($N+1)/2)/sqrt($m*$n*($N+1)/12); #$Z; } sub exon_coverage_validation_single{ #$circ_chr, $cand_exon_start, $cand_exon_end, $supporting_read_start{$cand_exon_start}, $supporting_read_end{$cand_exon_end}, $start_across, $end_across, \%junction_read_coverage #$circ_chr, $cand_exon_start, $cand_exon_end, $supporting_read_start{$cand_exon_start}, $supporting_read_end{$cand_exon_end}, $splice_read_start{$cand_exon_start}, $splice_read_end{$cand_exon_end}, $start_across, $end_across, \%junction_read_coverage; my ($chr, $start, $end, $tag_start, $tag_end, $tag_start2, $tag_end2, $start_across, $end_across, $ref_junction_cov) = @_; my ($total_cov0, $max_junc_gap, $cont_junc_cov0, $junc_cov0_count) = (0, 0, 0, 0); my (@U_pre, @U_after, $sample_size4U_test); my ($U_pre_total, $U_after_total, $Z_pre_total, $Z_after_total); my (@Z_pre, @Z_after, $decline_H1_count_pre, $decline_H1_count_after); #$Z_alpha, my @cov_all; my $last_group = int(($end-$start)/($min_intron-5)); for my $i($start .. $end){ my $cov; my $group; $group = int(($i-$start)/($min_intron-5)); if(exists $coverage{$chr}{$i}){ print LOG "$coverage{$chr}{$i}"; $cov = $coverage{$chr}{$i}; }else{ $total_cov0 ++; print LOG "0"; $cov = 0; } if(exists ${$ref_junction_cov}{$i}){ print LOG "(${$ref_junction_cov}{$i}),"; $cont_junc_cov0 = 0 if ${$ref_junction_cov}{$i} >0; }else{ $junc_cov0_count ++; $cont_junc_cov0 ++; $max_junc_gap = $cont_junc_cov0 if $cont_junc_cov0 > $max_junc_gap; print LOG "(0),"; } for my $j(6 .. $min_intron){ if($cov > $coverage{$chr}{$start-$j}){ $U_pre[$group] ++; $U_pre_total ++; }elsif($cov == $coverage{$chr}{$start-$j}){ $U_pre[$group] += .5; $U_pre_total += .5; } if($cov > $coverage{$chr}{$end+$j}){ $U_after[$group] ++; $U_after_total ++; }elsif($cov == $coverage{$chr}{$end+$j}){ $U_after[$group] += .5; $U_after_total += .5; } } push @cov_all, $cov; } my @cov_sort = sort {$a <=> $b} @cov_all; if($last_group > 0){ for my $i($end-$min_intron+6 .. $end-($end-$start)%($min_intron-5)-1){ for my $j(6 .. $min_intron){ if($coverage{$chr}{$i} > $coverage{$chr}{$start-$j}){ $U_pre[-1] ++; }elsif($coverage{$chr}{$i} == $coverage{$chr}{$start-$j}){ $U_pre[-1] += .5; } if($coverage{$chr}{$i} > $coverage{$chr}{$end+$j}){ $U_after[-1] ++; }elsif($coverage{$chr}{$i} == $coverage{$chr}{$end+$j}){ $U_after[-1] += .5; } } } $sample_size4U_test = $min_intron-5; }else{ $sample_size4U_test = $end-$start+1; } printf LOG "\t"; for my $i($start-$min_intron .. $start-6){ if (exists $coverage{$chr}{$i}){ print LOG "$coverage{$chr}{$i},"; }else{ print LOG "0,"; } } printf LOG "\t"; for my $i($end+6 .. $end+$min_intron){ if (exists $coverage{$chr}{$i}){ print LOG "$coverage{$chr}{$i},"; }else{ print LOG "0,"; } } printf LOG "\t"; for my $i(0 .. $last_group){ $Z_pre[$i] = &Z_calculation($U_pre[$i], $sample_size4U_test, $min_intron-5); $Z_after[$i] = &Z_calculation($U_after[$i], $sample_size4U_test, $min_intron-5); $decline_H1_count_pre ++ if $Z_pre[$i] <= $Z_alpha; $decline_H1_count_after ++ if $Z_after[$i] <= $Z_alpha; print LOG "U_pre:$U_pre[$i];Z_pre:$Z_pre[$i];U_after:$U_after[$i];Z_after:$Z_after[$i],"; } print LOG "\ttag_start:$tag_start;tag_start2:$tag_start2;start_across:$start_across\ttag_end:$tag_end;tag_end2:$tag_end2;end_across:$end_across\ttotal_cov0:$total_cov0;junc_cov0_count:$junc_cov0_count;max_junc_gap:$max_junc_gap\tZ_alpha:$Z_alpha\t"; #U_pre:$U_pre;U_after:$U_after\t;Z_pre:$Z_pre;Z_after:$Z_after $Z_pre_total = &Z_calculation($U_pre_total, $end-$start+1, $min_intron-5); $Z_after_total = &Z_calculation($U_after_total, $end-$start+1, $min_intron-5); print LOG "U_pre_total:$U_pre_total;U_after_total:$U_after_total;Z_pre_total:$Z_pre_total;Z_after_total:$Z_after_total\t"; if ($tag_start eq '0' and $start_across > 0) { 0; [0, $cov_sort[int($#cov_sort/2)]]; }elsif ($tag_end eq '0' and $end_across > 0) { -1; [-1, $cov_sort[int($#cov_sort/2)]]; }elsif ($total_cov0 > 0) { -2; [-2, $cov_sort[int($#cov_sort/2)]]; }elsif ($tag_start2 <2 and $tag_start ne 'start' and $decline_H1_count_pre > 0) { -3; [-3, $cov_sort[int($#cov_sort/2)]]; }elsif ($tag_end2 <2 and $tag_end ne 'end' and $decline_H1_count_after > 0) { -4; [-4, $cov_sort[int($#cov_sort/2)]]; }elsif ($tag_start2 <2 and $tag_start ne 'start' and $Z_pre_total <= $Z_alpha) { -8; [-8, $cov_sort[int($#cov_sort/2)]]; }elsif ($tag_end2 <2 and $tag_end ne 'end' and $Z_after_total <= $Z_alpha) { -9; [-9, $cov_sort[int($#cov_sort/2)]]; }else{ if ($junc_cov0_count/($end-$start+1) <= .2){ 1; [1, $cov_sort[int($#cov_sort/2)]]; }elsif ($tag_start ne '0' and $tag_end ne '0' and $max_junc_gap<$min_intron){ 2; [2, $cov_sort[int($#cov_sort/2)]]; }else{ -10; [-10, $cov_sort[int($#cov_sort/2)]]; } } } sub exon_coverage_validation{ my ($chr, $start, $end, $tag_start, $tag_end) = @_; my ($sum_inside, $sum_pre, $sum_after, $min_inside, $ave_inside, $ave_pre, $ave_after); my ($result_pre, $result_after) = (1, 1); $min_inside = $coverage{$chr}{$start}; for my $i($start .. $end){ if(exists $coverage{$chr}{$i}){ $sum_inside += $coverage{$chr}{$i}; $min_inside = $coverage{$chr}{$i} if $min_inside > $coverage{$chr}{$i}; print LOG "$coverage{$chr}{$i},"; }else{ $min_inside = 0; print LOG "0,"; } } $ave_inside = $sum_inside/($end - $start + 1); printf LOG "\t%.2f\t", $ave_inside, $min_inside; if($tag_start eq 'start'){ print LOG "start\tstart\t"; }else{ for my $i($start-$min_intron .. $start-6){ if (exists $coverage{$chr}{$i}){ $sum_pre += $coverage{$chr}{$i}; print LOG "$coverage{$chr}{$i},"; if ($coverage{$chr}{$i} >= $min_inside){ $result_pre --; #last; } }else{ print LOG "0,"; } } $ave_pre = $sum_pre/($min_intron-5); printf LOG "\t%.2f\t", $ave_pre; if ($ave_pre*2 > $ave_inside or $min_inside == 0){ $result_pre --; } } print LOG "\t"; if($tag_end eq 'end'){ print LOG "end\tend\t"; }else{ for my $i($end+6 .. $end+$min_intron){ if (exists $coverage{$chr}{$i}){ $sum_after += $coverage{$chr}{$i}; print LOG "$coverage{$chr}{$i},"; if ($coverage{$chr}{$i} >= $min_inside){ $result_after --; } }else{ print LOG "0,"; } } $ave_after = $sum_after/($min_intron-5); printf LOG "\t%.2f\t", $ave_after; if ($ave_after*2 > $ave_inside or $min_inside == 0){ $result_after --; } } print LOG "\t$result_pre,$result_after\t"; if($result_pre == 1 and $result_after == 1){ 1; }else{ 0; } } sub mapping_check1{ my $read_name = shift; my (@read1, @read2); for(@_){ my @line = split (/\t/, $_, 3); if(&ten2b($line[1],5) == 1){ push @read1, $_; }else{ push @read2, $_; } } &mapping_check2($read_name, 1, @read1); &mapping_check2($read_name, 0, @read2); } sub mapping_check1_add{ my $read_name = shift; my $cluster_num = shift; my (@read1, @read2); for(@_){ my @line = split (/\t/, $_, 3); if(&ten2b($line[1],5) == 1){ push @read1, $_; }else{ push @read2, $_; } } &mapping_check2_add($read_name, $cluster_num, 1, @read1); &mapping_check2_add($read_name, $cluster_num, 0, @read2); } sub mapping_check2{ my $read_name = shift; my $if_reverse = shift; my (@strand, @chr, @pos, @MAPQ, @cigar, @match, $z, @attributes, $na_tag); my $seq = ''; my ($circ_chr, $circ_start, $circ_end) = @{$circ_info{ $junction_read{$read_name} }}[0 .. 2]; for(@_){ my @line = split /\t/; push @strand, &ten2b($line[1], 5); push @chr, $line[2]; push @pos, $line[3]; push @MAPQ, $line[4]; push @cigar, $line[5]; my $MSID = &MSID($line[5]); push @match, $MSID; if($seq eq '' and length($line[9]) == $read_length){ $seq = $line[9]; $z = $strand[-1]; } my @length = @$MSID; if($length[-1] > 0){ push @attributes, [$strand[-1], $line[2], $line[3], $line[3]+$length[-1]-1, $read_name]; }else{ push @attributes, [0, 0, 0, 0, $read_name]; } } READ: for my $i(0 .. $#_-1){ for my $j($i+1 .. $#_){ if($chr[$i] eq $chr[$j] and $chr[$i] eq $circ_chr and $strand[$i] == $strand[$j]){ if ($strand[$i] != $z){ $seq = &comp_rev($seq); $z = $strand[$i]; } if(${$match[$i]}[0]*${$match[$j]}[0] == -1){ my @match1 = @{$match[$i]}; my @match2 = @{$match[$j]}; my $cir_scale = $match1[0]*($pos[$i]+$match1[2])+$match2[0]*($pos[$j]+$match2[2]); if(abs($match1[1]-$match2[1])<=6 and $cir_scale < 0 and $pos[$i] >= $circ_start-6 and $pos[$i]+$match1[-1]-1 <= $circ_end+6 and $pos[$j] >= $circ_start-6 and $pos[$j]+$match2[-1]-1 <= $circ_end+6 and $MAPQ[$i] >= $mapq_uni and $MAPQ[$j] >= $mapq_uni and $MAPQ[$i]+$MAPQ[$j] >= $mapq_both){ my @pos_com = sort{$a <=> $b}($pos[$i]+$match1[2], $pos[$j]+$match2[2]); my $end_adjustment1 = int(($match1[1]*$match1[0]+$match2[1]*$match2[0])/2); my $end_adjustment2 = $match1[1]*$match1[0]+$match2[1]*$match2[0] - $end_adjustment1; my($x, $y) = sort {${$match[$a]}[0] <=> ${$match[$b]}[0]} ($i, $j); my $CIGAR = [$cigar[$x], $cigar[$y], undef]; #larger position is first, while smaller position is second. if (exists $positive_reads_count{$read_name} and $chr[$i] ne $positive_reads{$read_name}{"chr"}){ $na_tag = 1; last READ; } $detected ++; $positive_reads{$read_name}{"chr"} = $chr[$i]; $positive_reads_count{$read_name} ++; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"site2"} = $pos_com[0]-$end_adjustment2; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"site1"} = $pos_com[1]+$end_adjustment1; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"adjust1"} = $end_adjustment1; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"adjust2"} = $end_adjustment2; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"cigar"} = $CIGAR; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"if_reverse"} = $if_reverse; } }elsif(abs(${$match[$i]}[0]*${$match[$j]}[0]) == 10){ my($x, $y) = sort {${$match[$a]}[0] <=> ${$match[$b]}[0]} ($i, $j); if(${$match[$x]}[0] == -1){ my $cir_scale = $pos[$y]+${$match[$y]}[-1]-1-$pos[$x]; if(abs($read_length-${$match[$y]}[2]-${$match[$x]}[1])<=6 and $cir_scale < 0 and $pos[$y] >= $circ_start-6 and $pos[$x]+${$match[$x]}[-1]-1 <= $circ_end+6 and $MAPQ[$i] >= $mapq_uni and $MAPQ[$j] >= $mapq_uni and $MAPQ[$i]+$MAPQ[$j] >= $mapq_both){ my $end_adjustment1 = int((${$match[$y]}[1]+${$match[$y]}[-1]-${$match[$x]}[1])/2); my $end_adjustment2 = ${$match[$y]}[1]+${$match[$y]}[-1]-${$match[$x]}[1]-$end_adjustment1; my $CIGAR = [$cigar[$x], undef, $cigar[$y]]; if (exists $positive_reads_count{$read_name} and $chr[$x] ne $positive_reads{$read_name}{"chr"}){ $na_tag = 1; last READ; } $detected ++; $positive_reads{$read_name}{"chr"} = $chr[$x]; $positive_reads_count{$read_name} ++; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"site2"} = $pos[$y]+${$match[$y]}[-1]-1-$end_adjustment2; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"site1"} = $pos[$x]+$end_adjustment1; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"adjust1"} = $end_adjustment1; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"adjust2"} = $end_adjustment2; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"cigar"} = $CIGAR; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"if_reverse"} = $if_reverse; } }else{ my $cir_scale = $pos[$x]+${$match[$x]}[-1]-1-$pos[$y]; if(abs(${$match[$x]}[1]-${$match[$y]}[1])<=6 and $cir_scale < 0 and $pos[$x] >= $circ_start-6 and $pos[$y]+${$match[$y]}[-1]-1 <= $circ_end+6 and $MAPQ[$i] >= $mapq_uni and $MAPQ[$j] >= $mapq_uni and $MAPQ[$i]+$MAPQ[$j] >= $mapq_both){ my $end_adjustment1 = int((${$match[$x]}[1]-${$match[$y]}[1])/2); my $end_adjustment2 = ${$match[$x]}[1]-${$match[$y]}[1] - $end_adjustment1; my $CIGAR = [undef, $cigar[$x], $cigar[$y]]; if (exists $positive_reads_count{$read_name} and $chr[$x] ne $positive_reads{$read_name}{"chr"}){ $na_tag = 1; last READ; } $detected ++; $positive_reads{$read_name}{"chr"} = $chr[$x]; $positive_reads_count{$read_name} ++; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"site2"} = $pos[$x]+${$match[$x]}[-1]-1-$end_adjustment2; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"site1"} = $pos[$y]+$end_adjustment1; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"adjust1"} = $end_adjustment1; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"adjust2"} = $end_adjustment2; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"cigar"} = $CIGAR; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"if_reverse"} = $if_reverse; } } } } } } if ($na_tag == 1){ $detected -= scalar (%{$positive_reads{$read_name}}); delete $positive_reads{$read_name}; delete $positive_reads_count{$read_name}; } } sub mapping_check2_add{ my $read_name = shift; my $cluster_num = shift; my $if_reverse = shift; my (@strand, @chr, @pos, @MAPQ, @cigar, @match, $z, @attributes, $na_tag); my $seq = ''; my $detected = 0; for(@_){ my @line = split /\t/; push @strand, &ten2b($line[1], 5); push @chr, $line[2]; push @pos, $line[3]; push @MAPQ, $line[4]; push @cigar, $line[5]; my $MSID = &MSID($line[5]); push @match, $MSID; if($seq eq '' and length($line[9]) == $read_length){ $seq = $line[9]; $z = $strand[-1]; } my @length = @$MSID; if($length[-1] > 0){ push @attributes, [$strand[-1], $line[2], $line[3], $line[3]+$length[-1]-1, $read_name]; }else{ push @attributes, [0, 0, 0, 0, $read_name]; } } READ: for my $i(0 .. $#_-1){ for my $j($i+1 .. $#_){ if($chr[$i] eq $chr[$j] and $strand[$i] == $strand[$j]){ if ($strand[$i] != $z){ $seq = &comp_rev($seq); $z = $strand[$i]; } if(${$match[$i]}[0]*${$match[$j]}[0] == -1){ my @match1 = @{$match[$i]}; my @match2 = @{$match[$j]}; my $cir_scale = $match1[0]*($pos[$i]+$match1[2])+$match2[0]*($pos[$j]+$match2[2]); if(abs($match1[1]-$match2[1])<=6 and $cir_scale < 0 and $MAPQ[$i] >= $mapq_uni and $MAPQ[$j] >= $mapq_uni and $MAPQ[$i]+$MAPQ[$j] >= $mapq_both){ my @pos_com = sort{$a <=> $b}($pos[$i]+$match1[2], $pos[$j]+$match2[2]); my $end_adjustment1 = int(($match1[1]*$match1[0]+$match2[1]*$match2[0])/2); my $end_adjustment2 = $match1[1]*$match1[0]+$match2[1]*$match2[0] - $end_adjustment1; my($x, $y) = sort {${$match[$a]}[0] <=> ${$match[$b]}[0]} ($i, $j); my $CIGAR = [$cigar[$x], $cigar[$y], undef]; #larger position is first, while smaller position is second. if (exists $positive_reads_count{$read_name} and $chr[$i] ne $positive_reads{$read_name}{"chr"}){ $na_tag = 1; last READ; } $detected_add ++; $positive_reads{$read_name}{"chr"} = $chr[$i]; $positive_reads_count{$read_name} ++; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"site2"} = $pos_com[0]-$end_adjustment2; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"site1"} = $pos_com[1]+$end_adjustment1; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"adjust1"} = $end_adjustment1; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"adjust2"} = $end_adjustment2; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"cigar"} = $CIGAR; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"if_reverse"} = $if_reverse; } }elsif(abs(${$match[$i]}[0]*${$match[$j]}[0]) == 10){ my($x, $y) = sort {${$match[$a]}[0] <=> ${$match[$b]}[0]} ($i, $j); if(${$match[$x]}[0] == -1){ my $cir_scale = $pos[$y]+${$match[$y]}[-1]-1-$pos[$x]; if(abs($read_length-${$match[$y]}[2]-${$match[$x]}[1])<=6 and $cir_scale < 0 and $MAPQ[$i] >= $mapq_uni and $MAPQ[$j] >= $mapq_uni and $MAPQ[$i]+$MAPQ[$j] >= $mapq_both){ my $end_adjustment1 = int((${$match[$y]}[1]+${$match[$y]}[-1]-${$match[$x]}[1])/2); my $end_adjustment2 = ${$match[$y]}[1]+${$match[$y]}[-1]-${$match[$x]}[1]-$end_adjustment1; my $CIGAR = [$cigar[$x], undef, $cigar[$y]]; if (exists $positive_reads_count{$read_name} and $chr[$x] ne $positive_reads{$read_name}{"chr"}){ $na_tag = 1; last READ; } $detected_add ++; $positive_reads{$read_name}{"chr"} = $chr[$x]; $positive_reads_count{$read_name} ++; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"site2"} = $pos[$y]+${$match[$y]}[-1]-1-$end_adjustment2; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"site1"} = $pos[$x]+$end_adjustment1; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"adjust1"} = $end_adjustment1; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"adjust2"} = $end_adjustment2; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"cigar"} = $CIGAR; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"if_reverse"} = $if_reverse; } }else{ my $cir_scale = $pos[$x]+${$match[$x]}[-1]-1-$pos[$y]; if(abs(${$match[$x]}[1]-${$match[$y]}[1])<=6 and $cir_scale < 0 and $MAPQ[$i] >= $mapq_uni and $MAPQ[$j] >= $mapq_uni and $MAPQ[$i]+$MAPQ[$j] >= $mapq_both){ my $end_adjustment1 = int((${$match[$x]}[1]-${$match[$y]}[1])/2); my $end_adjustment2 = ${$match[$x]}[1]-${$match[$y]}[1] - $end_adjustment1; my $CIGAR = [undef, $cigar[$x], $cigar[$y]]; if (exists $positive_reads_count{$read_name} and $chr[$x] ne $positive_reads{$read_name}{"chr"}){ $na_tag = 1; last READ; } $detected_add ++; $positive_reads{$read_name}{"chr"} = $chr[$x]; $positive_reads_count{$read_name} ++; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"site2"} = $pos[$x]+${$match[$x]}[-1]-1-$end_adjustment2; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"site1"} = $pos[$y]+$end_adjustment1; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"adjust1"} = $end_adjustment1; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"adjust2"} = $end_adjustment2; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"cigar"} = $CIGAR; $positive_reads{$read_name}{$positive_reads_count{$read_name}}{"if_reverse"} = $if_reverse; } } } } } } if ($na_tag == 1){ $detected_add -= scalar (%{$positive_reads{$read_name}}); delete $positive_reads{$read_name}; delete $positive_reads_count{$read_name}; } } sub cluster_reads{ my $pre; my @sort_reads1 = sort { $positive_reads{${$a}[0]}{"chr"} cmp $positive_reads{${$b}[0]}{"chr"} or $positive_reads{${$a}[0]}{${$a}[1]}{"site2"} <=> $positive_reads{${$b}[0]}{${$b}[1]}{"site2"} } @_; my %group_read; my %cluster_read; my $group_num = 0; for (@sort_reads1){ if ( $positive_reads{${$_}[0]}{${$_}[1]}{"site2"} - $positive_reads{${$pre}[0]}{${$pre}[1]}{"site2"} <= 3 and $positive_reads{${$_}[0]}{"chr"} eq $positive_reads{${$pre}[0]}{"chr"} ){ push @{$group{$group_read{$pre}}}, $_; $group_read{$_} = $group_read{$pre}; }else{ my $group_ID = 'group'.$group_num; $group_read{$_} = $group_ID; push @{$group{$group_ID}}, $_; $group_num++; } $pre = $_; } for my $i(0 .. $group_num-1){ my $group_ID = 'group'.$i; my @sort_reads2 = sort {$positive_reads{${$a}[0]}{${$a}[1]}{"site1"} <=> $positive_reads{${$b}[0]}{${$b}[1]}{"site1"}} @{$group{$group_ID}}; my $pre2; for (@sort_reads2){ if ($positive_reads{${$_}[0]}{${$_}[1]}{"site1"} - $positive_reads{${$pre2}[0]}{${$pre2}[1]}{"site1"} <= 3 and $positive_reads{${$_}[0]}{"chr"} eq $positive_reads{${$pre2}[0]}{"chr"}){ push @{$cluster{ $cluster_read{$pre2} }}, $_; $cluster_read{$_} = $cluster_read{$pre2}; }else{ my $clusterID = 'cluster'. $cluster_num; $cluster_read{$_} = $clusterID; push @{$cluster{$clusterID}}, $_; $cluster_num++; $chr_cluster{$clusterID} = $positive_reads{${$_}[0]}{"chr"}; } $pre2 = $_; } } for my $j(0 .. $cluster_num-1){ my $clusterID = 'cluster'.$j; my $x = int($#{$cluster{$clusterID}}/2); my $chromo = $chr_cluster{$clusterID}; $site1_cluster{$clusterID} = $positive_reads{$cluster{$clusterID}[$x][0]}{$cluster{$clusterID}[$x][1]}{"site1"}; $site2_cluster{$clusterID} = $positive_reads{$cluster{$clusterID}[$x][0]}{$cluster{$clusterID}[$x][1]}{"site2"}; } } sub splice_loci_check{ my @chr_type; my ($positive_num1, $positive_num2); my %read_in_chr; #my %sense_strand; my $pos_circ_num_now = 0; my @sort_reads = sort { $positive_reads{$a}{"chr"} cmp $positive_reads{$b}{"chr"} } @_; for my $read(@sort_reads){ push @chr_type, $positive_reads{$read}{"chr"} unless $positive_reads{$read}{"chr"} eq $chr_type[-1]; push @{ $read_in_chr{$positive_reads{$read}{"chr"}} }, $read; } &proc_bar($pos_circ_num_now, scalar(@sort_reads)); for my $chr(@chr_type){ my $chr_seq; if(defined $ref_dir){ open CHR, "<", $ref_dir."/".$chr.".fa" or open CHR, "<", $ref_dir."/".$chr.".fasta" or die "CIRI-AS cannot open chromosome file of $chr in directory $ref_dir: $!"; while(){ chomp; unless(/^>/){ $chr_seq .= $_; } } close CHR; }elsif(length($chr_seq_1file{$chr})>0 ){ $chr_seq = $chr_seq_1file{$chr}; }else{ die "Vital errors met when reading reference of $chr!"; } for my $read(@{$read_in_chr{$chr}}){ for my $i(1 .. $positive_reads_count{$read}){ my ($end_string1, $end_string2); my $diff_adjt; my $pinhead = 0; my $total_adjustment = $positive_reads{$read}{$i}{"adjust1"} + $positive_reads{$read}{$i}{"adjust2"}; if($positive_reads{$read}{$i}{"adjust2"}>=0){ $end_string1 = substr($chr_seq, $positive_reads{$read}{$i}{"site1"}-$positive_reads{$read}{$i}{"adjust1"}-4, 4+$total_adjustment); $end_string2 = substr($chr_seq, $positive_reads{$read}{$i}{"site2"}-$positive_reads{$read}{$i}{"adjust1"}-1, 4+$total_adjustment); }else{ $end_string1 = substr($chr_seq, $positive_reads{$read}{$i}{"site1"}+$positive_reads{$read}{$i}{"adjust2"}-4, 4-$total_adjustment); $end_string2 = substr($chr_seq, $positive_reads{$read}{$i}{"site2"}+$positive_reads{$read}{$i}{"adjust2"}-1, 4-$total_adjustment); } my @index_compare_result = &index_compare($end_string1, $end_string2); if ( $index_compare_result[0] != 0 ){ if ($index_compare_result[0] > 0){ $positive_num1 ++; #$sense_strand{$read} = $index_compare_result[2]; }else{ $positive_num2 ++; #$sense_strand{$read} = $index_compare_result[2]; } if ($positive_reads{$read}{$i}{"adjust2"} >= 0){ $diff_adjt = $index_compare_result[1] - 1 - $positive_reads{$read}{$i}{"adjust1"}; }else{ $diff_adjt = $index_compare_result[1] - 1 + $positive_reads{$read}{$i}{"adjust2"}; } push @loci_validated, [$read, $i]; $pinhead = 1; } if($pinhead == 1){ $positive_reads{$read}{$i}{"adjust1"} = $diff_adjt + $positive_reads{$read}{$i}{"adjust1"}; $positive_reads{$read}{$i}{"adjust2"} = $total_adjustment - $positive_reads{$read}{$i}{"adjust1"}; $positive_reads{$read}{$i}{"site1"} = $positive_reads{$read}{$i}{"site1"} + $diff_adjt; $positive_reads{$read}{$i}{"site2"} = $positive_reads{$read}{$i}{"site2"} + $diff_adjt; #$positive_reads{$read}{$i}{"strand"} = $positive_reads{$read}{$i}{"site2"} + $diff_adjt; } } #if($positive_num1 >0 and $positive_num1 > $positive_num2){ # $sense_strand{$read} = '-'; #}elsif($positive_num2 >0 and $positive_num2 > $positive_num1){ # $sense_strand{$read} = '+'; #} } $pos_circ_num_now += @{$read_in_chr{$chr}}; &proc_bar($pos_circ_num_now, scalar(@sort_reads)); } print "\n"; } sub index_compare{ my %base_index; my @bibase1 = ('AC', 'AG'); my @bibase2 = ('CT', 'GT'); my @result; my $strand_gtf; my $hint = 0; for my $i(@bibase1){ my $pre_index = -1; while(1){ my $index = index("\U$_[0]", $i, $pre_index+1); last if $index == -1; #push @{$base_index{$i}}, $index; $base_index{$i}{$index} ++; $pre_index = $index; } } for my $i(@bibase2){ my $pre_index = -1; while(1){ my $index = index("\U$_[1]", $i, $pre_index+1); last if $index == -1; #push @{$base_index{$i}}, $index; $base_index{$i}{$index} ++; $pre_index = $index; } } if( defined $base_index{'AC'} and defined $base_index{'CT'} ){ #@result = grep {$_ ~~ @{$base_index{'CT'}} } @{$base_index{'AC'}}; @result = grep { exists $base_index{'CT'}{$_} } keys %{$base_index{'AC'}}; $hint = 1 if @result>0; $strand_gtf = '-'; }elsif( defined $base_index{'AG'} and defined $base_index{'GT'} ){ #@result = grep {$_ ~~ @{$base_index{'GT'}} } @{$base_index{'AG'}}; @result = grep { exists $base_index{'GT'}{$_} } keys %{$base_index{'AG'}}; $hint = -1 if @result>0; $strand_gtf = '+'; } ($hint, $result[0], $strand_gtf); } sub comp_rev{ my $seq = reverse($_[0]); $seq =~ tr/ATCG/TAGC/; $seq; } sub MSID{ #my($M_sum, $S_num, $S_num1, $S_num2, @I_num, @D_num, $I_sum, $D_sum, $pos_adjustment, $length); my @counts = split /[MSIDH]/, $_[0]; if (@counts == 1){ if ($_[0] eq "${read_length}M"){ [0, 0, 0, $read_length]; }else{ [0, undef, undef, -1]; } }else{ $_[0] =~ s/H/S/g; my @styles = split /\d+/, $_[0]; shift @styles; if (@counts == 2){ if ($styles[0] eq 'M' and $styles[1] eq 'S'){ [1, $counts[0], $counts[0]-1, $counts[0]]; }elsif ($styles[0] eq 'S' and $styles[1] eq 'M'){ [-1, $counts[0], 0, $counts[1]]; }else{ [0, undef, undef, -2]; } }elsif (@counts == 3){ if ($styles[0] eq 'S' and $styles[2] eq 'S'){ [10, $counts[0], $counts[2], $counts[1]]; }elsif ($styles[0] eq 'M' and $styles[1] eq 'D' and $styles[2] eq 'M'){ [0, 0, 0, $read_length+$counts[1]]; }elsif ($styles[0] eq 'M' and $styles[1] eq 'I' and $styles[2] eq 'M'){ [0, 0, 0, $read_length-$counts[1]]; }else{ [0, undef, undef, -2]; } }elsif ($styles[0] eq 'M' and $styles[-1] eq 'S'){ my ($M_sum, $D_sum); for my $i(0 .. $#styles-1){ if ($styles[$i] eq 'M'){ $M_sum += $counts[$i]; }elsif($styles[$i] eq 'D'){ $D_sum += $counts[$i]; } } [1, $read_length-$counts[-1], $M_sum+$D_sum-1, $M_sum+$D_sum]; }elsif ($styles[0] eq 'S' and $styles[-1] eq 'M'){ my ($M_sum, $D_sum); for my $i(1 .. $#styles){ if ($styles[$i] eq 'M'){ $M_sum += $counts[$i]; }elsif($styles[$i] eq 'D'){ $D_sum += $counts[$i]; } } [-1, $counts[0], 0, $M_sum+$D_sum]; }elsif ($styles[0] eq 'M' and $styles[-1] eq 'M'){ my ($M_sum, $D_sum); for my $i(0 .. $#styles){ if ($styles[$i] eq 'M'){ $M_sum += $counts[$i]; }elsif($styles[$i] eq 'D'){ $D_sum += $counts[$i]; } } [0, 0, 0, $M_sum+$D_sum]; }elsif ($styles[0] eq 'S' and $styles[-1] eq 'S'){ my ($M_sum, $D_sum); for my $i(1 .. $#styles-1){ if ($styles[$i] eq 'M'){ $M_sum += $counts[$i]; }elsif($styles[$i] eq 'D'){ $D_sum += $counts[$i]; } } [10, $counts[0], $counts[-1], $M_sum+$D_sum]; }else{ [0, undef, undef, -2]; } } #\@cigar_units; } sub ten2b{ my $b_string = sprintf("%b", $_[0]); if ($_[1] <= length($b_string)){ substr(reverse($b_string), $_[1]-1, 1); }else{ 0; } } sub proc_bar{ local $| = 1; my $block = $_[0]/$_[1]*50; if ($block > 1){ print "\r[".("-" x int($block-1)).'>'.(" " x (50 - int($block)))."] "; }else{ print "\r[".'>'.(" " x 49)."] "; } printf "%5.1f %%",$block*2; local $| = 0; } }