#!/usr/bin/perl use strict; use warnings; ######################################################### ######################################################### ###Opens Genomic Alignment files #Query (Genomic) Subject (protein) %ID Length QueryS QueryE SubS SubE Evalue #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 96.36 55 463427 463263 886 940 4e-22 0 #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 86.89 61 467532 467350 826 886 2e-22 0 #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 91.23 57 468242 468072 771 827 2e-22 0 #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 75.32 77 468606 468376 722 784 5e-22 0 #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 79.57 93 470087 469809 642 734 3e-35 0 #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 86.21 58 472552 472379 586 641 2e-18 0 #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 41.96 112 476666 476337 489 556 1e-06 0 #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 79.49 39 479038 478922 435 473 1e-08 0 #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 87.5 32 479657 479562 403 434 1e-07 0 #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 92.45 53 494678 494520 304 356 1e-22 0 #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 91.67 48 496023 495880 256 303 2e-18 0 #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 86.89 61 496358 496176 187 247 1e-22 0 #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 87.5 40 498551 498432 152 191 4e-10 0 #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 97.92 48 499332 499189 105 152 2e-18 0 #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 81.58 76 0406 0179 30 105 3e-29 0 &read_file($ARGV[0]); sub read_file { open (IN, "<$_[0]") || die "Can't open genomic alignment file\n"; my $hit_name = 0; my @lines = (); while (){ chomp $_; my @line = split (/\t/, $_); if (eof){ if ($hit_name ne "$line[0]\t$line[1]"){ my ($regions, $score) = &protein_region(\@lines); my %regions = %{$regions}; my %score = %{$score}; foreach my $number (keys %regions){ foreach my $protein_region (keys %{$regions{$number}}){ my @length = split (/\t/, $protein_region); my $length = $length[1] - $length[0] + 1; print "$hit_name\t$number\t$protein_region\t$regions{$number}{$protein_region}\t$length\t$score{$number}{$protein_region}\n"; } } ###################################### @lines = (); push (@lines, $_); $hit_name = "$line[0]\t$line[1]"; ($regions, $score) = &protein_region(\@lines); %regions = %{$regions}; %score = %{$score}; foreach my $number (keys %regions){ foreach my $protein_region (keys %{$regions{$number}}){ my @length = split (/\t/, $protein_region); my $length = $length[1] - $length[0] + 1; print "$hit_name\t$number\t$protein_region\t$regions{$number}{$protein_region}\t$length\t$score{$number}{$protein_region}\n"; } } }else{ push (@lines, $_); my ($regions, $score) = &protein_region(\@lines); my %regions = %{$regions}; my %score = %{$score}; foreach my $number (keys %regions){ foreach my $protein_region (keys %{$regions{$number}}){ my @length = split (/\t/, $protein_region); my $length = $length[1] - $length[0] + 1; print "$hit_name\t$number\t$protein_region\t$regions{$number}{$protein_region}\t$length\t$score{$number}{$protein_region}\n"; } } } }else{ if ($hit_name ne "0"){ if ($hit_name ne "$line[0]\t$line[1]"){ my ($regions, $score) = &protein_region(\@lines); my %regions = %{$regions}; my %score = %{$score}; foreach my $number (keys %regions){ foreach my $protein_region (keys %{$regions{$number}}){ my @length = split (/\t/, $protein_region); my $length = $length[1] - $length[0] + 1; print "$hit_name\t$number\t$protein_region\t$regions{$number}{$protein_region}\t$length\t$score{$number}{$protein_region}\n"; } } @lines = (); } } $hit_name = "$line[0]\t$line[1]"; push (@lines, $_); } } close (IN); } sub protein_region { ####################################################### ####################################################### ###This part removes overlapping segments (at first at the protein and genomic levels, and then only at the genomic) and returns on the best ################################################ ################################################ ###Returns the best overlap (at protein and genomic level) #print "####################\n"; my @alignments = @{$_[0]}; my %region = (); ###number => Gene_region => Genomic_region my %score = (); ###number => Gene_region => e_value my %length = (); ###start_position_protein => alignment_information my %id = (); ### Gene_region => Genomic_region => %id foreach (@alignments){ my @line_info = split (/\t/, $_); $id{"$line_info[6]\t$line_info[7]"}{"$line_info[4]\t$line_info[5]"} = $line_info[2]; my $length = $line_info[6]; $length{$_} = $length; } @alignments = sort {$length{$a} <=> $length{$b}} keys %length; %length = (); my $number_count = 0; foreach (@alignments){ #Query (Genomic) Subject (protein) %ID Length QueryS QueryE SubS SubE Evalue #ccf2000000113_0-0_ssa19 UNC5B_NP_001104619.1 97.92 48 499332 499189 105 152 2e-18 0 my @line_info = split (/\t/, $_); my $new_region_bin = 0; my @new_region = ($line_info[6], $line_info[7]); my @new_genomic = ($line_info[4], $line_info[5]); my $e_value = $line_info[8]; my $new_orientation = 1; if ($new_genomic[0] > $new_genomic[1]){ $new_orientation = 0; } my @overlaps = (); foreach my $number (keys %region){ foreach my $old_region (keys %{$region{$number}}){ my @old_region = split (/\t/, $old_region); my @old_genomic = split (/\t/, $region{$number}{$old_region}); my $old_e_value = $score{$number}{$old_region}; my $old_orientation = 1; if ($old_genomic[0] > $old_genomic[1]){ $old_orientation = 0; } if ($new_region[0] >= $old_region[0] and $new_region[1] <= $old_region[1]){ ###The new seq is completely within old seq if ($old_orientation == $new_orientation){ if ($new_orientation > 0){ if ($old_genomic[1] < $new_genomic[0] or $new_genomic[1] < $old_genomic[0]){ ###Doesn't overlap Genomic }else{ ###Does overlap if ($old_e_value < $e_value){ $new_region_bin = 1; }else{ delete $region{$number}{"$old_region[0]\t$old_region[1]"}; delete $score{$number}{"$old_region[0]\t$old_region[1]"}; } } }else{ if ($old_genomic[1] > $new_genomic[0] or $new_genomic[1] > $old_genomic[0]){ }else{ if ($old_e_value < $e_value){ $new_region_bin = 1; }else{ delete $region{$number}{"$old_region[0]\t$old_region[1]"}; delete $score{$number}{"$old_region[0]\t$old_region[1]"}; } } } } }elsif($new_region[0] >= $old_region[0] and $new_region[0] <= $old_region[1]){ ###The new seq overhangs old seq => if ($old_orientation == $new_orientation){ if ($new_orientation > 0){ if ($old_genomic[1] < $new_genomic[0] or $new_genomic[1] < $old_genomic[0]){ ###Doesn't overlap Genomic }else{ ###Does overlap if ($old_e_value < $e_value){ $new_region_bin = 1; }else{ delete $region{$number}{"$old_region[0]\t$old_region[1]"}; delete $score{$number}{"$old_region[0]\t$old_region[1]"}; } } }else{ if ($old_genomic[1] > $new_genomic[0] or $new_genomic[1] > $old_genomic[0]){ }else{ if ($old_e_value < $e_value){ $new_region_bin = 1; }else{ delete $region{$number}{"$old_region[0]\t$old_region[1]"}; delete $score{$number}{"$old_region[0]\t$old_region[1]"}; } } } } }elsif($new_region[1] >= $old_region[0] and $new_region[1] <= $old_region[1]){ if ($old_orientation == $new_orientation){ if ($new_orientation > 0){ if ($old_genomic[1] < $new_genomic[0] or $new_genomic[1] < $old_genomic[0]){ }else{ if ($old_e_value < $e_value){ $new_region_bin = 1; }else{ delete $region{$number}{"$old_region[0]\t$old_region[1]"}; delete $score{$number}{"$old_region[0]\t$old_region[1]"}; } } }else{ if ($old_genomic[1] > $new_genomic[0] or $new_genomic[1] > $old_genomic[0]){ }else{ if ($old_e_value < $e_value){ $new_region_bin = 1; }else{ delete $region{$number}{"$old_region[0]\t$old_region[1]"}; delete $score{$number}{"$old_region[0]\t$old_region[1]"}; } } } } }elsif($new_region[0] <= $old_region[0] and $new_region[1] >= $old_region[1]){ if ($old_orientation == $new_orientation){ if ($new_orientation > 0){ if ($old_genomic[1] < $new_genomic[0] or $new_genomic[1] < $old_genomic[0]){ }else{ if ($old_e_value < $e_value){ $new_region_bin = 1; }else{ delete $region{$number}{"$old_region[0]\t$old_region[1]"}; delete $score{$number}{"$old_region[0]\t$old_region[1]"}; } } }else{ if ($old_genomic[1] > $new_genomic[0] or $new_genomic[1] > $old_genomic[0]){ }else{ if ($old_e_value < $e_value){ $new_region_bin = 1; }else{ delete $region{$number}{"$old_region[0]\t$old_region[1]"}; delete $score{$number}{"$old_region[0]\t$old_region[1]"}; } } } } } } } if ($new_region_bin == 0){ $region{$number_count}{"$line_info[6]\t$line_info[7]"} = "$line_info[4]\t$line_info[5]"; $score{$number_count}{"$line_info[6]\t$line_info[7]"} = $e_value; $number_count ++; } } ######################################### ######################################### ###Returns remaining best hit (at the genomic level only--Cannot have multiple overlapping) #my %region = (); ###number => Gene_region => Genomic_region #my %score = (); ###number => Gene_region => e_value my %genomic_region = (); ### number => Gene_region => Genomic_region (only the best for overlapping genomic regions) my %new_score = (); ### number => Gene_region => e_value $number_count = 0; foreach my $number (keys %region){ foreach my $new_gene_region (keys %{$region{$number}}){ my @new_genomic_region = split (/\t/, $region{$number}{$new_gene_region}); my $new_region_bin = 0; my $new_orientation = 1; my $e_value = $score{$number}{$new_gene_region}; if ($new_genomic_region[0] > $new_genomic_region[1]){ $new_orientation = 0; } foreach my $old_number (keys %genomic_region){ foreach my $old_gene_region (keys %{$genomic_region{$old_number}}){ my @old_genomic_region = split (/\t/, $genomic_region{$old_number}{$old_gene_region}); my $old_e_value = $new_score{$old_number}{$old_gene_region}; my $old_orientation = 1; if ($old_genomic_region[0] > $old_genomic_region[1]){ $old_orientation = 0; } if ($old_orientation == $new_orientation and $new_orientation == 1){ if ($old_genomic_region[1] < $new_genomic_region[0] or $new_genomic_region[1] < $old_genomic_region[0]){ ###Doesn't overlap Genomic }else{ ###Does overlap if ($old_e_value < $e_value){ $new_region_bin = 1; }else{ delete $genomic_region{$old_number}{$old_gene_region}; delete $new_score{$old_number}{$old_gene_region}; } } }elsif($old_orientation == $new_orientation and $new_orientation == 0){ if ($old_genomic_region[1] > $new_genomic_region[0] or $new_genomic_region[1] > $old_genomic_region[0]){ }else{ if ($old_e_value < $e_value){ $new_region_bin = 1; }else{ delete $genomic_region{$old_number}{$old_gene_region}; delete $new_score{$old_number}{$old_gene_region}; } } } } } if ($new_region_bin == 0){ $genomic_region{$number_count}{$new_gene_region} = "$new_genomic_region[0]\t$new_genomic_region[1]"; $new_score{$number_count}{$new_gene_region} = $e_value; $number_count ++; } } } %score = ();### number => gene_region => % id foreach my $number (keys %genomic_region){ foreach my $gene_region (keys %{$genomic_region{$number}}){ my $genomic_region = $genomic_region{$number}{$gene_region}; my $percent_id = $id{$gene_region}{$genomic_region}; $score{$number}{$gene_region} = $percent_id; } } return (\%genomic_region, \%score); }