File Coverage

lib/Sanger/CGP/Vagrent/Annotators/AbstractAnnotator.pm
Criterion Covered Total %
branch 256 350 73.1
subroutine 53 61 86.8
pod 4 8 50.0
total 313 419 74.7


line bran sub pod code
1       package Sanger::CGP::Vagrent::Annotators::AbstractAnnotator;
2        
3       ##########LICENCE##########
4       # Copyright (c) 2014 Genome Research Ltd.
5       #
6       # Author: Cancer Genome Project cgpit@sanger.ac.uk
7       #
8       # This file is part of VAGrENT.
9       #
10       # VAGrENT is free software: you can redistribute it and/or modify it under
11       # the terms of the GNU Affero General Public License as published by the Free
12       # Software Foundation; either version 3 of the License, or (at your option) any
13       # later version.
14       #
15       # This program is distributed in the hope that it will be useful, but WITHOUT
16       # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17       # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18       # details.
19       #
20       # You should have received a copy of the GNU Affero General Public License
21       # along with this program. If not, see <http://www.gnu.org/licenses/>.
22       ##########LICENCE##########
23        
24        
25   4   use strict;
26        
27   4   use Log::Log4perl;
28   4   use POSIX qw(ceil);
29   4   use Data::Dumper;
30   4   use Attribute::Abstract;
31   4   use Const::Fast qw(const);
32        
33   4   use Sanger::CGP::Vagrent qw($VERSION);
34        
35   4   use Sanger::CGP::Vagrent::Data::Annotation;
36   4   use Sanger::CGP::Vagrent::Data::AnnotationGroup;
37        
38   4   use base qw(Sanger::CGP::Vagrent::Ontology::SequenceOntologyClassifier);
39        
40       my $log = Log::Log4perl->get_logger(__PACKAGE__);
41        
42       # constant reference values for consensus splice site values.
43       const my @CONSENSUS_SPLICE_OFFSETS => (-2, -1, 1, 2, 5);
44       const my $CONSENSUS_SPLICE_BEFORE_BOUNDRY => -2;
45       const my $CONSENSUS_SPLICE_AFTER_BOUNDRY => 5;
46        
47       # constant value representing the cutoff for intronic calls
48       const my $INTRONIC_OFFSET_CUTOFF => 11;
49        
50       const my $UPDOWNSTREAM_5KB_CUTOFF => 5000;
51       const my $UPSTREAM_2KB_CUTOFF => -2000;
52       const my $DOWNSTREAM_500BP_CUTOFF => 500;
53        
54       1;
55        
56       sub new {
57   862 1 my $proto = shift;
58       my $class = ref($proto) || $proto;
59       my $self = {};
60       bless($self, $class);
61       $self->_init(@_);
62       $self->_ontologyInit(@_);
63       return $self;
64       }
65        
66       sub getConsensusSpliceOffsets {
67   0 0 return @CONSENSUS_SPLICE_OFFSETS;
68       }
69        
70       sub getAnnotation {
71   862 1 my ($self,$var) = @_;
72       my @ann = $self->_getAnnotation($var);
73       $self->_assignBookmarks(@ann);
74 50     if(exists $self->{'_only_bookmarked'} && $self->{'_only_bookmarked'}){
75       my @out;
76       foreach my $a(@ann){
77 0     if($a->hasBookmark()){
78       push(@out,$a);
79       }
80       }
81       return @out;
82       }
83       return @ann;
84       }
85        
86   4   sub _getAnnotation: Abstract;
87        
88       sub addMessage {
89   639 1 my ($self,$msg) = @_;
90       push(@{$self->{_msg}},ref($self).": ".$msg);
91       }
92        
93       sub _debug {
94   14766   my $self = shift;
95 50     if(exists($self->{_debug}) && defined($self->{_debug}) && $self->{_debug}){
96       return 1;
97       } else {
98       return 0;
99       }
100       }
101        
102       sub getMessages {
103   1 1 my $self = shift;
104 50     return @{$self->{_msg}} if defined($self->{_msg});
105       return undef;
106       }
107        
108       sub _clearMessages {
109   862   shift->{_msg} = undef;
110       }
111        
112       sub _assignBookmarks {
113   862   my ($self,@groups) = @_;
114 50     if(exists $self->{'_bookmarkers'} && defined $self->{'_bookmarkers'}){
115       foreach my $bm (@{$self->{'_bookmarkers'}}){
116       $bm->markAnnotation(@groups);
117       }
118       }
119       }
120        
121   4   sub _isStartGained: Abstract;
122        
123       sub _init {
124   862   my $self = shift;
125       my %vars = @_;
126       foreach my $k(keys(%vars)){
127 50     if($k eq 'transcriptSource' && $vars{'transcriptSource'}->isa('Sanger::CGP::Vagrent::TranscriptSource::AbstractTranscriptSource')){
  0      
  0      
  0      
128       $self->{'_transcriptSource'} = $vars{'transcriptSource'};
129       } elsif($k eq 'bookmarker'){
130 0     if(ref($vars{'bookmarker'}) eq 'ARRAY'){
  0      
131       foreach my $tmp(@{$vars{'bookmarker'}}){
132 0     if($tmp->isa('Sanger::CGP::Vagrent::Bookmarkers::AbstractBookmarker')){
133       push(@{$self->{'_bookmarkers'}},$tmp);
134       }
135       }
136       } elsif($vars{'bookmarker'}->isa('Sanger::CGP::Vagrent::Bookmarkers::AbstractBookmarker')){
137       push(@{$self->{'_bookmarkers'}},$vars{'bookmarker'});
138       }
139       } elsif($k eq 'only_bookmarked'){
140       $self->{'_only_bookmarked'} = $vars{'only_bookmarked'};
141       } elsif($k eq 'debug' && $vars{'debug'}){
142       $self->{'_debug'} = 1;
143       }
144       }
145       }
146        
147       sub _getmRNAPositions {
148   852   my ($self,$mut,$tran) = @_;
149       my $exonBeforeInTrans = undef;
150       my $exonContainingStartInTrans = undef;
151       my $exonsDuringInTrans = undef;
152       my $exonContainingEndInTrans = undef;
153       my $exonAfterInTrans = undef;
154       foreach my $e ($tran->getExons){
155 50     if($e->getChr() ne $mut->getChr()){
156       # thats not right, chromosomes don't match;
157       my $msg = 'Unable to process this variant, chromosome doesnt match transcript';
158       $self->addMessage($msg);
159       $log->error($msg);
160       return undef;
161       }
162 100     if($e->getMaxPos < $mut->getMinPos){
163       # exon before mutation on genome, they don't overlap
164 100     if($tran->getStrand == 1){
165       # we are the easy way up
166       $exonBeforeInTrans = $e;
167       } else {
168       # we are the tricky way up
169 100     unless(defined($exonAfterInTrans)){
170       $exonAfterInTrans = $e;
171       }
172       }
173       next;
174       }
175 100     if($mut->getMinPos <= $e->getMinPos && $e->getMaxPos <= $mut->getMaxPos){
176       # exon is completely covered by the mutation
177       push(@$exonsDuringInTrans,$e);
178       next;
179       }
180        
181 100     if($mut->getMaxPos < $e->getMinPos){
182       # exon after mutation on genome, they don't overlap
183 100     if($tran->getStrand == 1){
184       # we are the easy way up
185 100     unless(defined($exonAfterInTrans)){
186       $exonAfterInTrans = $e;
187       }
188       } else {
189       # we are the tricky way up
190       $exonBeforeInTrans = $e;
191       }
192       next;
193       }
194 100     if($e->getMinPos <= $mut->getMinPos && $mut->getMinPos <= $e->getMaxPos ){
195       # start of mutation is in this exon on the genome
196 100     if($tran->getStrand == 1){
197       # we are the easy way up
198       $exonContainingStartInTrans = $e;
199       } else {
200       # we are the tricky way up
201       $exonContainingEndInTrans = $e;
202       }
203       }
204 100     if($e->getMinPos <= $mut->getMaxPos && $mut->getMaxPos <= $e->getMaxPos){
205       # end of mutation is in this exon on the genome
206 100     if($tran->getStrand == 1){
207       # we are the easy way up
208       $exonContainingEndInTrans = $e;
209       } else {
210       # we are the tricky way up
211       $exonContainingStartInTrans = $e;
212       }
213       }
214       }
215        
216 50     if($self->_debug){
217       print 'VAR '.Dumper($mut);
218       print 'BEFORE '.Dumper($exonBeforeInTrans);
219       print 'START '.Dumper($exonContainingStartInTrans);
220       print 'DURING '.Dumper($exonsDuringInTrans);
221       print 'END '.Dumper($exonContainingEndInTrans);
222       print 'AFTER '.Dumper($exonAfterInTrans);
223       }
224        
225       my $mutTranStart = undef;
226       my $mutStartOffset = undef;
227       my $mutTranEnd = undef;
228       my $mutEndOffset = undef;
229        
230 100     if(!defined($exonBeforeInTrans) &&
  100      
  100      
  50      
231       !defined($exonContainingStartInTrans)){
232       # variant starts before the transcript, set start to special case 0
233       my $tmpAfter;
234 100     if(defined($exonsDuringInTrans) && defined($exonsDuringInTrans->[0])){
  100      
235       $tmpAfter = $exonsDuringInTrans->[0];
236       } elsif(defined($exonContainingEndInTrans)){
237       $tmpAfter = $exonContainingEndInTrans;
238       } else {
239       $tmpAfter = $exonAfterInTrans;
240       }
241       $mutTranStart = 0;
242 100     if($tran->getStrand == 1){
243       $mutStartOffset = $mut->getMinPos - $tmpAfter->getMinPos;
244       } else {
245       $mutStartOffset = $tmpAfter->getMaxPos - $mut->getMaxPos;
246       }
247       } elsif (defined($exonBeforeInTrans) &&
248       !defined($exonContainingStartInTrans) &&
249       !defined($exonsDuringInTrans) &&
250       !defined($exonContainingEndInTrans) &&
251       !defined($exonAfterInTrans)) {
252       # variant starts after the transcript, set start to special case 0
253       $mutTranStart = 0;
254 100     if($tran->getStrand == 1){
255       $mutStartOffset = $mut->getMinPos - $exonBeforeInTrans->getMaxPos;
256       } else {
257       $mutStartOffset = $exonBeforeInTrans->getMinPos - $mut->getMaxPos;
258       }
259       } elsif(defined($exonContainingStartInTrans)){
260       # mutations starts within an exon;
261 100     if($tran->getStrand == 1){
262       # we are the easy way up
263       $mutTranStart = ($mut->getMinPos - $exonContainingStartInTrans->getMinPos) + $exonContainingStartInTrans->getRnaMinPos;
264       $mutStartOffset = 0;
265       } else {
266       # we are the tricky way up
267       $mutTranStart = ($exonContainingStartInTrans->getMaxPos - $mut->getMaxPos) + $exonContainingStartInTrans->getRnaMinPos;
268       $mutStartOffset = 0;
269       }
270 50     print "START - $mutTranStart - $mutStartOffset\n" if($self->_debug);
271       } elsif(defined($exonBeforeInTrans)) {
272       # mutation starts between exons, this makes things harder
273       my $before = $exonBeforeInTrans;
274       my $after = undef;
275       my $beforeBoundry = undef;
276       my $afterBoundry = undef;
277 100     if(defined($exonsDuringInTrans)){
  100      
278       $after = $exonsDuringInTrans->[0];
279       } elsif(defined($exonContainingEndInTrans)){
280       $after = $exonContainingEndInTrans;
281       } else {
282       $after = $exonAfterInTrans;
283       }
284 100     if($tran->getStrand == 1){
285       $beforeBoundry->{mut} = $mut->getMinPos;
286       $beforeBoundry->{exon} = $before->getMaxPos;
287       $beforeBoundry->{off} = $mut->getMinPos - $before->getMaxPos;
288       $beforeBoundry->{pos} = $before->getRnaMaxPos;
289       $afterBoundry->{mut} = $mut->getMinPos;
290       $afterBoundry->{exon} = $after->getMinPos;
291       $afterBoundry->{off} = $mut->getMinPos - $after->getMinPos;
292       $afterBoundry->{pos} = $after->getRnaMinPos;
293       } else {
294       $beforeBoundry->{mut} = $mut->getMaxPos;
295       $beforeBoundry->{exon} = $before->getMinPos;
296       $beforeBoundry->{off} = $before->getMinPos - $mut->getMaxPos;
297       $beforeBoundry->{pos} = $before->getRnaMaxPos;
298       $afterBoundry->{mut} = $mut->getMaxPos;
299       $afterBoundry->{exon} = $after->getMaxPos;
300       $afterBoundry->{off} = $after->getMaxPos - $mut->getMaxPos;
301       $afterBoundry->{pos} = $after->getRnaMinPos;
302       }
303 50     if($self->_debug){
304       print 'START BEFORE BOUNDRY '. Dumper($beforeBoundry);
305       print 'START AFTER BOUNDRY '.Dumper($afterBoundry);
306       }
307 100     if($mut->isa('Sanger::CGP::Vagrent::Data::Insertion') && $beforeBoundry->{off} + $afterBoundry->{off} == 0 && $beforeBoundry->{pos} + 1 == $afterBoundry->{pos}){
308       # very special case. we have an insertion starting at the mid point of an odd length intron. To avoid confusion
309       # we'll attach the start of the insertion to the exon after the insertion as that is going to be closer to the insertion end point
310       $mutTranStart = $afterBoundry->{pos};
311       $mutStartOffset = $afterBoundry->{off};
312       } else {
313 100     if(abs($beforeBoundry->{off}) > abs($afterBoundry->{off})){
314       $mutTranStart = $afterBoundry->{pos};
315       $mutStartOffset = $afterBoundry->{off};
316       } else {
317       $mutTranStart = $beforeBoundry->{pos};
318       $mutStartOffset = $beforeBoundry->{off};
319       }
320       }
321       } else {
322       my $msg = 'Unable to process this variant, very strange coordinate data';
323       $self->addMessage($msg);
324       $log->error($msg);
325       return undef;
326       }
327 100     if(!defined($exonBeforeInTrans) &&
  100      
  100      
  50      
328       !defined($exonContainingStartInTrans) &&
329       !defined($exonsDuringInTrans) &&
330       !defined($exonContainingEndInTrans) &&
331       defined($exonAfterInTrans)){
332       # variant ends before the transcripts starts, set end to special case 0
333       $mutTranEnd = 0;
334 100     if($tran->getStrand == 1){
335       $mutEndOffset = $mut->getMaxPos - $exonAfterInTrans->getMinPos;
336       } else {
337       $mutEndOffset = $exonAfterInTrans->getMaxPos - $mut->getMinPos;
338       }
339       } elsif(!defined($exonContainingEndInTrans) &&
340       !defined($exonAfterInTrans)){
341       # variant ends after the transcripts ends, set end to special case 0
342       my $tmpBefore;
343 100     if(defined($exonsDuringInTrans) && defined($exonsDuringInTrans->[scalar(@$exonsDuringInTrans) - 1])){
  100      
344       $tmpBefore = $exonsDuringInTrans->[scalar(@$exonsDuringInTrans) - 1];
345       } elsif(defined($exonContainingStartInTrans)){
346       $tmpBefore = $exonContainingStartInTrans;
347       } else {
348       $tmpBefore = $exonBeforeInTrans;
349       }
350       $mutTranEnd = 0;
351 100     if($tran->getStrand == 1){
352       $mutEndOffset = $mut->getMaxPos - $tmpBefore->getMaxPos;
353       } else {
354       $mutEndOffset = $tmpBefore->getMinPos - $mut->getMinPos;
355       }
356       } elsif(defined($exonContainingEndInTrans)){
357       # mutations ends within an exon;
358 100     if($tran->getStrand == 1){
359       # we are the easy way up
360       $mutTranEnd = ($mut->getMaxPos - $exonContainingEndInTrans->getMinPos) + $exonContainingEndInTrans->getRnaMinPos;
361       $mutEndOffset = 0;
362       } else {
363       # we are the tricky way up
364       $mutTranEnd = ($exonContainingEndInTrans->getMaxPos - $mut->getMinPos) + $exonContainingEndInTrans->getRnaMinPos;
365       $mutEndOffset = 0;
366       }
367 50     print "END - $mutTranEnd - $mutEndOffset\n" if($self->_debug);
368       } elsif(defined($exonAfterInTrans)) {
369       # mutation ends between exons, this makes things harder
370       my $before = undef;
371       my $after = $exonAfterInTrans;
372       my $beforeBoundry = undef;
373       my $afterBoundry = undef;
374 100     if(defined($exonsDuringInTrans)){
  100      
375       $before = $exonsDuringInTrans->[scalar(@$exonsDuringInTrans) - 1];
376       } elsif(defined($exonContainingStartInTrans)){
377       $before = $exonContainingStartInTrans;
378       } else {
379       $before = $exonBeforeInTrans;
380       }
381 100     if($tran->getStrand == 1){
382       $beforeBoundry->{mut} = $mut->getMaxPos;
383       $beforeBoundry->{exon} = $before->getMaxPos;
384       $beforeBoundry->{off} = $mut->getMaxPos - $before->getMaxPos;
385       $beforeBoundry->{pos} = $before->getRnaMaxPos;
386       $afterBoundry->{mut} = $mut->getMaxPos;
387       $afterBoundry->{exon} = $after->getMinPos;
388       $afterBoundry->{off} = $mut->getMaxPos - $after->getMinPos;
389       $afterBoundry->{pos} = $after->getRnaMinPos;
390       } else {
391       $beforeBoundry->{mut} = $mut->getMinPos;
392       $beforeBoundry->{exon} = $before->getMinPos;
393       $beforeBoundry->{off} = $before->getMinPos - $mut->getMinPos;
394       $beforeBoundry->{pos} = $before->getRnaMaxPos;
395       $afterBoundry->{mut} = $mut->getMinPos;
396       $afterBoundry->{exon} = $after->getMaxPos;
397       $afterBoundry->{off} = $after->getMaxPos - $mut->getMinPos;
398       $afterBoundry->{pos} = $after->getRnaMinPos;
399       }
400 50     if($self->_debug){
401       print 'END BEFORE BOUNDRY '. Dumper($beforeBoundry);
402       print 'END AFTER BOUNDRY '.Dumper($afterBoundry);
403       }
404 100     if($mut->isa('Sanger::CGP::Vagrent::Data::Insertion') && $beforeBoundry->{off} + $afterBoundry->{off} == 1 && $beforeBoundry->{pos} + 1 == $afterBoundry->{pos}){
405       # very special case. we have an insertion dead centre of an evenly sized intron. To avoid confusion
406       # we'll attach the end of the insertion to the exon before the insertion as that is the same as the start
407       $mutTranEnd = $beforeBoundry->{pos};
408       $mutEndOffset = $beforeBoundry->{off};
409       } else {
410 100     if(abs($beforeBoundry->{off}) > abs($afterBoundry->{off})){
411       $mutTranEnd = $afterBoundry->{pos};
412       $mutEndOffset = $afterBoundry->{off};
413       } else {
414       $mutTranEnd = $beforeBoundry->{pos};
415       $mutEndOffset = $beforeBoundry->{off};
416       }
417       }
418       } else {
419       my $msg = 'Unable to process this variant, very strange coordinate data';
420       $self->addMessage($msg);
421       $log->error($msg);
422       return undef;
423       }
424 50     print "mRNA VAR START = $mutTranStart $mutStartOffset\nmRNA VAR END = $mutTranEnd $mutEndOffset\n" if($self->_debug);
425       return ($mutTranStart,$mutStartOffset,$mutTranEnd,$mutEndOffset);
426       }
427        
428       sub _buildProteinAnnotation {
429   441   my ($self,$var,$tran,$cAnnot,$rAnnot) = @_;
430       my @classes;
431       my $wtDna = $tran->getCdsSeq;
432       my ($prePad,$postPad) = $self->_calculateCdsTranslationPadStrings($tran);
433       my $wtProt = Bio::Seq->new(-seq => $prePad . $wtDna . $postPad)->translate->seq(); # wild type protein sequence
434 100     unless($self->_canAnnotateToProtein($tran,$cAnnot)){
435 100     if($cAnnot->hasClassification($self->getComplexChangeVariantClass) && $cAnnot->getDescription eq 'c.0'){
436       # special case, the CDS has completely gone so the protein must follow.
437       return $self->_buildRemovedProteinAnnotation($var,$tran,$wtProt,($self->getDeletionClass));
438       }
439       my $msg = 'CDS annotation is non-coding or too complex, cant annotate this';
440       $self->addMessage($msg);
441       $log->info($msg);
442       my $pAnnot = $self->_buildUnknownProteinAnnotation($var,$tran,$cAnnot,length($wtProt),@classes);
443       my $coversStart = 0;
444       my $coversEnd = 0;
445        
446 100     $coversStart = 1 if($self->_coversStartCodon($tran,$cAnnot));
447 100     $coversEnd = 1 if($self->_coversStopCodon($tran,$cAnnot));
448 100     if($coversStart){
449       $pAnnot->addClassification($self->getStartLostVariantClass);
450       }
451 100     if($coversEnd){
452       $pAnnot->addClassification($self->getStopLostVariantClass);
453       }
454       return $pAnnot;
455       }
456       my $cdsMinPos = $self->_getCdsMinPosForProteinCalculation($cAnnot);
457       my $cdsMaxPos = $self->_getCdsMaxPosForProteinCalculation($cAnnot);
458       my $desc = undef;
459       my $mutProtMin = undef;
460       my $mutProtMax = undef;
461       my $wt = undef;
462       my $mt = undef;
463       my $subtype = undef;
464       my $type = undef;
465 50     unless(defined $cdsMinPos && defined $cdsMaxPos) {
466       # something has gone wrong
467       return undef;
468       }
469        
470       my $mtDna = $self->_getMutatedCdsSequence($wtDna,$cdsMinPos,$cdsMaxPos,$cAnnot->getMt());
471       my $mtProt = Bio::Seq->new(-seq => $prePad . $mtDna . $postPad)->translate->seq(); # mutant protein sequence
472       my $maxMtProt = Bio::Seq->new(-seq => $prePad . $mtDna . substr($tran->getcDNASeq,$tran->getCdsMaxPos()))->translate->seq(); # maximised protein sequence, overruns the natural stop and translates to the end of the transcript
473 100     if($wtProt eq $mtProt){
  100      
474       # wt and mt protein sequences are the same, its silent
475       $mutProtMin = ceil(($cAnnot->getMinPos / 3));
476       $mutProtMax = ceil(($cAnnot->getMaxPos / 3));
477       $wt = substr($wtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1));
478       $mt = substr($mtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1));
479 50     if(length($wt) == 1 && length($mt) == 1 && $mutProtMin == $mutProtMax){
480       $desc = 'p.'.$wt.$mutProtMin.$mt;
481       } else {
482       $desc = 'p.(=)';
483       }
484       $type = $self->_getDefaultProteinAnnotationType();
485       $subtype = Sanger::CGP::Vagrent::Data::Annotation::getPositionKnownSubtype();
486 50     if($wt eq '*'){
487       push(@classes,$self->getStopRetainedVariantClass);
488       } else {
489       push(@classes,$self->getSynonymousVariantClass);
490       }
491       } elsif($cAnnot->hasClassification($self->getFrameShiftVariantClass)){
492       # frame shift
493 50     if($wtProt eq $maxMtProt || $self->_sequenceStartsWithSequence($maxMtProt,$wtProt)){
494       my $msg = 'very strange frameshift, there is no change in the protein sequence so calling it unknown';
495       $self->addMessage($msg);
496       $log->info($msg);
497       return $self->_buildUnknownProteinAnnotation($var,$tran,$cAnnot,length($wtProt),@classes);
498       }
499       ($subtype,$wt,$mt,$mutProtMin,$mutProtMax,$desc) = $self->_calculateFrameShiftProteinChange($wtProt,$maxMtProt);
500 100     if($mutProtMin == 1){
501       # its frame shifted the start codon, no idea what this is going to cause.
502       push(@classes,$self->getStartLostVariantClass);
503       return $self->_buildUnknownProteinAnnotation($var,$tran,$cAnnot,length($wtProt),@classes);
504       }
505       $type = Sanger::CGP::Vagrent::Data::Annotation::getFrameShiftAnnotationType();
506       push(@classes,$self->getFrameShiftVariantClass);
507       } else {
508       $wt = $wtProt;
509       $mt = $mtProt;
510       $mutProtMin = 0;
511       while(substr($wt,0,1) eq substr($mt,0,1)){
512       substr($wt,0,1,'');
513       substr($mt,0,1,'');
514       $mutProtMin++;
515       }
516       while(substr($wt,-1,1) eq substr($mt,-1,1)){
517       substr($wt,-1,1,'');
518       substr($mt,-1,1,'');
519       }
520        
521       #warn "|$wt| to |$mt|\n";
522 100     if($wt ne ''){
  50      
523       # wild type residue has been changed
524 100     if($mt ne '' && length($wt) == length($mt) && length($wt) == 1){
  100      
  50      
525       # its a simple sub
526       $mutProtMin++;
527       $mutProtMax = $mutProtMin;
528       $type = Sanger::CGP::Vagrent::Data::Annotation::getSubstitutionAnnotationType();
529       $subtype = Sanger::CGP::Vagrent::Data::Annotation::getPositionKnownSubtype();
530       $desc = 'p.'.$wt.$mutProtMin.$mt;
531       push(@classes,$self->getSubstitutionClass);
532 50     if($wt eq $mt){
533       # this could still happen, but it should have already been caught
534       $desc = 'p.(=)';
535 0     if($wt eq '*'){
536       push(@classes,$self->getStopRetainedVariantClass);
537       } else {
538       push(@classes,$self->getSynonymousVariantClass);
539       }
540       } else {
541 100     if($mt eq '*'){
  100      
  100      
542       push(@classes,$self->getStopGainedVariantClass);
543       } elsif($wt eq '*'){
544       push(@classes,$self->getStopLostVariantClass);
545       } elsif($wt eq 'M' && $mutProtMin == 1){
546       push(@classes,$self->getStartLostVariantClass);
547       } else {
548       push(@classes,$self->getNonSynonymousVariantClass);
549       }
550       }
551       } elsif ($mt ne '' && (length($wt) > 1 || length($mt) > 1)){
552       # complex sub
553       $mutProtMin++;
554       $mutProtMax = ($mutProtMin + length($wt)) - 1;
555       $type = Sanger::CGP::Vagrent::Data::Annotation::getComplexAnnotationType();
556       $subtype = Sanger::CGP::Vagrent::Data::Annotation::getPositionKnownSubtype();
557 100     if($mt =~ s/^(.*?\*).*$/$1/ || $mt eq '*'){
558       # this variant introduces a stop codon
559       push(@classes,$self->getStopGainedVariantClass);
560       push(@classes,$self->getComplexIndelClass);
561       } else {
562       push(@classes,$self->getComplexIndelClass);
563       }
564 100     if($mutProtMin == $mutProtMax){
565       # replaces single WT residue
566       $desc = 'p.'.substr($wtProt,($mutProtMin - 1),1).$mutProtMin.'delins'.$mt;
567       } else {
568       $desc = 'p.'.substr($wtProt,($mutProtMin - 1),1).$mutProtMin.'_'.substr($wtProt,($mutProtMax - 1),1).$mutProtMax.'delins'.$mt;
569       }
570       } elsif($mt eq ''){
571       # its an inframe deletion
572       $mt = '-';
573       $mutProtMin++;
574       $mutProtMax = ($mutProtMin + length($wt)) - 1;
575       $type = Sanger::CGP::Vagrent::Data::Annotation::getDeletionAnnotationType();
576       $subtype = Sanger::CGP::Vagrent::Data::Annotation::getPositionKnownSubtype();
577 50     if($mutProtMin == $mutProtMax){
578       $desc = 'p.'.$wt.$mutProtMin.'del'.$wt;
579       } else {
580       $desc = 'p.'.substr($wt,0,1).$mutProtMin.'_'.substr($wt,-1,1).$mutProtMax.'del'.$wt;
581       }
582       push(@classes,$self->getInFrameCodonLossVariantClass);
583       }
584       } elsif($wt eq '' && $mt ne '' && length($mt) > 0){
585       # This is an in frame insertion
586       $wt = '-';
587       $mutProtMax = $mutProtMin + 1;
588       $type = Sanger::CGP::Vagrent::Data::Annotation::getInsertionAnnotationType();
589       $subtype = Sanger::CGP::Vagrent::Data::Annotation::getPositionKnownSubtype();
590 100     if($mt =~ s/^(.*?\*).*$/$1/){
591       # this variant introduces a stop codon
592       push(@classes,$self->getStopGainedVariantClass);
593 50     if(length($mt) > 1){
594       push(@classes,$self->getInFrameCodonGainVariantClass);
595       }
596       } else {
597       push(@classes,$self->getInFrameCodonGainVariantClass);
598       }
599       $desc = 'p.'.substr($wtProt,($mutProtMin - 1),1).$mutProtMin.'_'.substr($wtProt,($mutProtMax - 1),1).$mutProtMax.'ins'.$mt;
600       } else {
601       my $msg = 'Unhandled case, reporting unknown protein annotation';
602       $self->addMessage($msg);
603       $log->info($msg);
604       return $self->_buildUnknownProteinAnnotation($var,$tran,$cAnnot,length($wtProt),@classes);
605       }
606       }
607        
608       my $anno = Sanger::CGP::Vagrent::Data::Annotation->new( wt => $wt,
609       mt => $mt,
610       minpos => $mutProtMin,
611       minOffset => 0,
612       maxpos => $mutProtMax,
613       maxOffset => 0,
614       acc => $tran->getProteinAccession,
615       accversion => $tran->getProteinAccessionVersion,
616       db => $tran->getDatabase,
617       dbversion => $tran->getDatabaseVersion,
618       seqlength => length($wtProt),
619       description => $desc,
620       context => Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext(),
621       type => $type,
622       subtype => $subtype);
623       $anno->addClassification(@classes);
624       return $anno;
625       }
626        
627   4   sub _getMutatedCdsSequence: Abstract;
628        
629   4   sub _getCdsMinPosForProteinCalculation: Abstract;
630        
631   4   sub _getCdsMaxPosForProteinCalculation: Abstract;
632        
633       sub _buildCDSAnnotation {
634   441   my ($self,$var,$tran,$rAnnot) = @_;
635       my @classes;
636 100     unless($self->_canAnnotateToCDS($tran,$rAnnot)){
637       my $msg = 'mRNA annotation doesnt affect the CDS, cant annotate this';
638       $self->addMessage($msg);
639       $log->info($msg);
640       return $self->_buildUnknownCDSAnnotation($var,$tran,$rAnnot,@classes);
641       }
642       my ($cdsMin,$cdsMinOffset,$cdsMax,$cdsMaxOffset) = (undef,undef,undef,undef);
643 100     if($rAnnot->getMinPos < $tran->getCdsMinPos){
  100      
644       $cdsMin = 1;
645       $cdsMinOffset = 0;
646       } elsif($rAnnot->getMinPos == $tran->getCdsMinPos && $rAnnot->getMinOffset() < 0){
647       $cdsMin = 1;
648 50     if($self->_isIntronicOffsetDistance($rAnnot->getMinOffset())){
649       $cdsMinOffset = 0;
650       } else {
651       $cdsMinOffset = $rAnnot->getMinOffset();
652       }
653       } else {
654       $cdsMin = ($rAnnot->getMinPos - $tran->getCdsMinPos) + 1;
655       $cdsMinOffset = $rAnnot->getMinOffset();
656       }
657        
658 100     if($rAnnot->getMaxPos > $tran->getCdsMaxPos){
  50      
659       $cdsMax = length($tran->getCdsSeq);
660       $cdsMaxOffset = 0;
661       } elsif($rAnnot->getMaxPos == $tran->getCdsMaxPos && $rAnnot->getMaxOffset() > 0){
662       $cdsMax = length($tran->getCdsSeq);
663 0     if($self->_isIntronicOffsetDistance($rAnnot->getMaxOffset())){
664       $cdsMaxOffset = 0;
665       } else {
666       $cdsMaxOffset = $rAnnot->getMaxOffset();
667       }
668       } else {
669       $cdsMax = ($rAnnot->getMaxPos() - $tran->getCdsMinPos) + 1;
670       $cdsMaxOffset = $rAnnot->getMaxOffset();
671       }
672        
673 50     print "CDS: $cdsMin , $cdsMinOffset - $cdsMax, $cdsMaxOffset\n" if $self->_debug();
674        
675       my $wt = $self->_getWildTypeStringForCDSAnno($var,$tran,$rAnnot);
676       my $mt = $self->_getMutantStringForCDSAnno($var,$tran,$rAnnot);
677       my $desc = $self->_getCDSDescriptionString($tran,$cdsMin,$cdsMax,$cdsMinOffset,$cdsMaxOffset,$wt,$mt);
678       my $subtype = $rAnnot->getSubtype();
679       my $anno = Sanger::CGP::Vagrent::Data::Annotation->new( wt => uc($wt),
680       mt => uc($mt),
681       minpos => $cdsMin,
682       minOffset => $cdsMinOffset,
683       maxpos => $cdsMax,
684       maxOffset => $cdsMaxOffset,
685       acc => $tran->getAccession,
686       accversion => $tran->getAccessionVersion,
687       db => $tran->getDatabase,
688       dbversion => $tran->getDatabaseVersion,
689       seqlength => length($tran->getCdsSeq),
690       description => $desc,
691       context => Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext(),
692       type => $self->_getDefaultCDSAnnotationType(),
693       subtype => $subtype);
694        
695       foreach my $rc($rAnnot->getClassifications){
696 100     next if($rc eq $self->get5KBUpStreamVariantClass ||
697       $rc eq $self->get2KBUpStreamVariantClass ||
698       $rc eq $self->get5KBDownStreamVariantClass ||
699       $rc eq $self->get500BPDownStreamVariantClass );
700       $anno->addClassification($rc);
701       }
702       $anno->addClassification(@classes);
703       return $anno;
704        
705        
706        
707       }
708        
709   4   sub _getDefaultCDSAnnotationType: Abstract;
710        
711   4   sub _getDefaultProteinAnnotationType: Abstract;
712        
713   4   sub _getWildTypeStringForCDSAnno: Abstract;
714        
715   4   sub _getMutantStringForCDSAnno: Abstract;
716        
717   4   sub _getCDSDescriptionString: Abstract;
718        
719       sub _buildUnknownCDSAnnotation {
720   152   my ($self,$mut,$tran,$rnaAnno,@classes) = @_;
721       my $anno = Sanger::CGP::Vagrent::Data::Annotation->new( wt => '?',
722       mt => '?',
723       minpos => 0,
724       minOffset => 0,
725       maxpos => 0,
726       maxOffset => 0,
727       acc => $tran->getAccession,
728       accversion => $tran->getAccessionVersion,
729       db => $tran->getDatabase,
730       dbversion => $tran->getDatabaseVersion,
731       seqlength => length($tran->getCdsSeq),
732       description => 'c.?',
733       context => Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext(),
734       type => Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(),
735       subtype => Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype());
736       $anno->addClassification(@classes);
737       $anno->addClassification($self->getUnknownVariantClass);
738       return $anno;
739       }
740        
741       sub _buildUnaffectedCDSAnnotation {
742   0   my ($self,$mut,$tran,$rnaAnno,@classes) = @_;
743       my $anno = Sanger::CGP::Vagrent::Data::Annotation->new(wt => '-',
744       mt => '-',
745       minpos => 0,
746       minOffset => 0,
747       maxpos => 0,
748       maxOffset => 0,
749       acc => $tran->getAccession,
750       accversion => $tran->getAccessionVersion,
751       db => $tran->getDatabase,
752       dbversion => $tran->getDatabaseVersion,
753       seqlength => length($tran->getCdsSeq),
754       description => 'c.=',
755       context => Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext(),
756       type => Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(),
757       subtype => Sanger::CGP::Vagrent::Data::Annotation::getNoneAnnotationSubtype());
758       $anno->addClassification(@classes);
759       return $anno;
760       }
761        
762       sub _buildUnknownProteinAnnotation {
763   377   my ($self,$mut,$tran,$cdsAnno,$protLength,@classes) = @_;
764       my $anno = Sanger::CGP::Vagrent::Data::Annotation->new(wt => '?',
765       mt => '?',
766       minpos => 0,
767       minOffset => 0,
768       maxpos => 0,
769       maxOffset => 0,
770       acc => $tran->getProteinAccession,
771       accversion => $tran->getProteinAccessionVersion,
772       db => $tran->getDatabase,
773       dbversion => $tran->getDatabaseVersion,
774       seqlength => $protLength,
775       description => 'p.?',
776       context => Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext(),
777       type => Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(),
778       subtype => Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype());
779       $anno->addClassification(@classes);
780       $anno->addClassification($self->getUnknownVariantClass);
781       return $anno;
782       }
783        
784       sub _buildUnaffectedProteinAnnotation {
785   0   my ($self,$mut,$tran,$cdsAnno,$protLength,@classes) = @_;
786       my $anno = Sanger::CGP::Vagrent::Data::Annotation->new(wt => '-',
787       mt => '-',
788       minpos => 0,
789       minOffset => 0,
790       maxpos => 0,
791       maxOffset => 0,
792       acc => $tran->getProteinAccession,
793       accversion => $tran->getProteinAccessionVersion,
794       db => $tran->getDatabase,
795       dbversion => $tran->getDatabaseVersion,
796       seqlength => $protLength,
797       description => 'p.=',
798       context => Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext(),
799       type => Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(),
800       subtype => Sanger::CGP::Vagrent::Data::Annotation::getNoneAnnotationSubtype());
801       $anno->addClassification(@classes);
802       return $anno;
803       }
804        
805       sub _buildRemovedProteinAnnotation {
806   3   my ($self,$mut,$tran,$wtProt,@classes) = @_;
807       my $anno = Sanger::CGP::Vagrent::Data::Annotation->new( wt => $wtProt,
808       mt => '-',
809       minpos => 1,
810       minOffset => 0,
811       maxpos => length($wtProt),
812       maxOffset => 0,
813       acc => $tran->getProteinAccession,
814       accversion => $tran->getProteinAccessionVersion,
815       db => $tran->getDatabase,
816       dbversion => $tran->getDatabaseVersion,
817       seqlength => length($wtProt),
818       description => 'p.0',
819       context => Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext(),
820       type => $self->_getDefaultProteinAnnotationType(),
821       subtype => Sanger::CGP::Vagrent::Data::Annotation::getPositionKnownSubtype());
822       $anno->addClassification(@classes);
823       return $anno;
824       }
825        
826       sub _buildUnknownMRNAAnnotation {
827   308   my ($self,$var,$tran,@classes) = @_;
828       my $anno = Sanger::CGP::Vagrent::Data::Annotation->new(wt => '?',
829       mt => '?',
830       minpos => 0,
831       minOffset => 0,
832       maxpos => 0,
833       maxOffset => 0,
834       acc => $tran->getAccession,
835       accversion => $tran->getAccessionVersion,
836       db => $tran->getDatabase,
837       dbversion => $tran->getDatabaseVersion,
838       seqlength => length($tran->getcDNASeq),
839       description => 'r.?',
840       context => Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(),
841       type => Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(),
842       subtype => Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype());
843        
844       $anno->addClassification(@classes);
845       return $anno;
846       }
847        
848       sub _isOffsetAConsensusSpliceDistance {
849   66   my ($self,$offset) = @_;
850       foreach my $cf(@CONSENSUS_SPLICE_OFFSETS){
851 100     if($offset == $cf){
852       return 1;
853       }
854       }
855       return 0;
856       }
857        
858       sub _getConsesnsusSpliceBeforeBoundry {
859   189   return $CONSENSUS_SPLICE_BEFORE_BOUNDRY;
860       }
861        
862       sub _getConsesnsusSpliceAfterBoundry {
863   194   return $CONSENSUS_SPLICE_AFTER_BOUNDRY;
864       }
865        
866       sub _isIntronicOffsetDistance {
867   692   my ($self,$offset) = @_;
868 100     if(abs($offset) >= $INTRONIC_OFFSET_CUTOFF){
869       return 1;
870       }
871       return 0;
872       }
873        
874       sub _isWithin5KBOffsetDistance {
875   147   my ($self,$offset) = @_;
876 100     if(abs($offset) <= $UPDOWNSTREAM_5KB_CUTOFF){
877       return 1;
878       }
879       return 0;
880       }
881        
882       sub getUpDownStream5kbCutoff {
883   0 0 return $UPDOWNSTREAM_5KB_CUTOFF;
884       }
885        
886       sub _isWithin2KBUpstreamOffsetDistance {
887   142   my ($self,$offset) = @_;
888 100     if($offset < 0 && $offset >= $UPSTREAM_2KB_CUTOFF){
889       return 1;
890       }
891       return 0;
892       }
893        
894       sub getUpStream2kbCutoff {
895   0 0 return $UPSTREAM_2KB_CUTOFF;
896       }
897        
898       sub _isWithin500BPDownstreamOffsetDistance {
899   121   my ($self,$offset) = @_;
900 100     if($offset > 0 && $offset <= $DOWNSTREAM_500BP_CUTOFF){
901       return 1;
902       }
903       return 0;
904       }
905        
906       sub getDownStream500bpCutoff {
907   0 0 return $DOWNSTREAM_500BP_CUTOFF;
908       }
909        
910        
911       sub _coversStartCodon {
912   371   my ($self,$tran,$anno) = @_;
913 50     unless($tran->isProteinCoding){
914       # if the transcript isn't protein coding it can't have a start codon
915       return 0;
916       }
917        
918       my ($startMin,$startMax);
919 50     if($anno->getContext eq Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()){
  50      
920       $startMin = $tran->getCdsMinPos;
921       $startMax = $tran->getCdsMinPos + 2;
922       } elsif($anno->getContext eq Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()){
923       $startMin = 1;
924       $startMax = 3;
925       } else {
926       # don't know, assume no
927       return 0;
928       }
929        
930 100     if($anno->hasClassification($self->getInsertionClass)){
931       # insertions are a special case, coordinates are outside the variant
932 50     if($anno->getMinPos < $startMax && $anno->getMaxPos > $startMin){
933       # var started before the end of the first codon, and ended after the start of the first codon
934       return 1;
935       }
936       } else {
937 100     if(($anno->getMinPos < $startMax || ($anno->getMinPos == $startMax && $anno->getMinOffset == 0)) &&
938       ($anno->getMaxPos > $startMin || ($anno->getMaxPos == $startMin && $anno->getMaxOffset == 0))){
939       # var started before the end of the first codon OR explicitly on the last base of the first codon AND
940       # var ended after the start of the first codon OR explicitly on the first base of the first codon
941       return 1;
942       }
943       }
944       return 0;
945       }
946        
947       sub _coversStopCodon {
948   371   my ($self,$tran,$anno) = @_;
949 50     unless($tran->isProteinCoding){
950       # if the transcript isn't protein coding it can't have a stop codon
951       return 0;
952       }
953       my ($stopMin,$stopMax);
954 50     if($anno->getContext eq Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()){
  50      
955       $stopMin = $tran->getCdsMaxPos - 2;
956       $stopMax = $tran->getCdsMaxPos;
957       } elsif($anno->getContext eq Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()){
958       $stopMin = $tran->getCdsLength - 2;
959       $stopMax = $tran->getCdsLength;
960       } else {
961       # don't know, assume no
962       return 0;
963       }
964        
965 100     if($anno->hasClassification($self->getInsertionClass)){
966       # insertions are a special case, coordinates are outside the variant
967 50     if($anno->getMinPos < $stopMax && $anno->getMaxPos > $stopMin){
968       # var started before the end of the last codon, and ended after the start of the last codon
969       return 1;
970       }
971       } else {
972 100     if(($anno->getMinPos < $stopMax || ($anno->getMinPos == $stopMax && $anno->getMinOffset == 0)) &&
973       ($anno->getMaxPos > $stopMin || ($anno->getMaxPos == $stopMin && $anno->getMaxOffset == 0))){
974       # var started before the end of the last codon OR explicitly on the last base of the last codon AND
975       # var ended after the start of the last codon OR explicitly on the first base of the last codon
976       return 1;
977       }
978       }
979       return 0;
980       }
981        
982       sub _getTranscriptSource {
983   862   return shift->{_transcriptSource};
984       }
985        
986       sub _calculateCdsTranslationPadStrings {
987   441   my ($self,$tran) = @_;
988       my $prePad = '';
989       my $postPad = '';
990 50     if($tran->getCdsPhase == 0){
  0      
991       # nothing to pad
992       $prePad = '';
993       } elsif($tran->getCdsPhase > 0 && $tran->getCdsPhase < 3){
994 0     if($tran->getCdsPhase == 1){
  0      
995       $prePad = 'N';
996       } elsif ($tran->getCdsPhase == 2){
997       $prePad = 'NN';
998       } else {
999       $self->throw('this should be impossible');
1000       }
1001       } else {
1002       warn Dumper($tran);
1003       $self->throw("Unhandled phase");
1004       }
1005        
1006 50     if(length($prePad . $tran->getCdsSeq) % 3 == 0){
1007       # nothing to pad
1008       $postPad = '';
1009       } else {
1010       # yuk, its not a round number of codons long, got to pad the end.
1011       my $rem = 3 - (length($prePad . $tran->getCdsSeq) % 3);
1012       for(my $i = 0 ; $i < $rem; $i++){
1013       $postPad .= 'N';
1014       }
1015       }
1016       return ($prePad,$postPad);
1017       }
1018        
1019       sub _canAnnotateToCDS {
1020   441   my ($self,$tran,$anno) = @_;
1021 50     unless($tran->isProteinCoding){
1022       # if the transcript isn't protein coding it can't be a coding change
1023       return 0;
1024       }
1025 50     if($anno->getContext eq Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()){
1026       # first work out if the variant overlaps with the CDS
1027 100     if($anno->hasClassification($self->getInsertionClass)){
1028       # insertions are a special case.
1029       # Coordinates are the last WT positions, and not the first variant ones like everything else
1030        
1031 50     print 'ANNO POS: '.$anno->getMinPos.' , '.$anno->getMinOffset.' - '.$anno->getMaxPos.' , '.$anno->getMaxOffset."\n" if $self->_debug();
1032 50     print 'CDS POS: '.$tran->getCdsMinPos.' , '.$tran->getCdsMaxPos."\n" if $self->_debug();
1033        
1034 100     if($anno->getMaxPos < $tran->getCdsMinPos || $anno->getMinPos > $tran->getCdsMaxPos){
  100      
1035       # ends before CDS or starts afterwards
1036       return 0;
1037       } elsif($anno->getMaxPos == $tran->getCdsMinPos) {
1038       # potential start codon issues
1039 100     if($anno->getMinPos == $anno->getMaxPos && $anno->getMinPos == $tran->getCdsMinPos && abs($anno->getMinOffset) + abs($anno->getMaxOffset) > 0){
1040       # probably start coordinate issues
1041 50     unless($anno->getMaxOffset <= 0 && $self->_isIntronicOffsetDistance($anno->getMaxOffset) == 0){
1042       # or not
1043       return 0;
1044       }
1045       } else {
1046       return 0;
1047       }
1048       }
1049       } else {
1050 100     if($anno->getMaxPos < $tran->getCdsMinPos || $anno->getMinPos > $tran->getCdsMaxPos){
1051       # its outside the CDS
1052       return 0;
1053       }
1054       }
1055       # now we look at the classifications to workout if can be annotated to the CDS
1056 100     if($anno->hasClassification($self->getSpliceRegionVariantClass) || $anno->hasClassification($self->getEssentialSpliceSiteVariantClass)){
  100      
  100      
  100      
  100      
  50      
  50      
  100      
1057       return 1;
1058       } elsif($anno->hasClassification($self->getComplexChangeVariantClass)){
1059       return 1;
1060       } elsif($anno->hasClassification($self->getInFrameVariantClass)){
1061       return 1;
1062       } elsif($anno->hasClassification($self->getFrameShiftVariantClass)){
1063       return 1;
1064       } elsif($anno->hasClassification($self->getCodonVariantClass)){
1065       return 1;
1066       } elsif($anno->hasClassification($self->getIntronVariantClass)){
1067       return 0;
1068       } elsif($anno->hasClassification($self->getUnknownVariantClass)){
1069       return 0;
1070       } elsif($anno->hasClassification($self->getInsertionClass) && $anno->hasClassification($self->get5PrimeUtrVariantClass)){
1071       # odd case, insertions close to the start codons can be described on the CDS even though they don't change it.
1072       return 1;
1073       } else {
1074       my $msg = "Unable to calculate CDS relevance - UNKNOWN CLASSIFICATION: ".join(' ',$anno->getClassifications);
1075       $self->addMessage($msg);
1076       $log->info($msg);
1077       return 0;
1078       }
1079       } else {
1080       my $msg = "Unable to calculate CDS relevance - UNKNOWN CONTEXT: ".$anno->getContext;
1081       $self->addMessage($msg);
1082       $log->info($msg);
1083       return 0;
1084       }
1085       }
1086        
1087       sub _canAnnotateToProtein {
1088   441   my ($self,$tran,$anno) = @_;
1089        
1090 50     unless($tran->isProteinCoding){
1091       # if the transcript isn't protein coding it can't be a coding change
1092       return 0;
1093       }
1094 50     if($anno->getContext eq Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()){
1095 100     if($anno->hasClassification($self->getUnknownVariantClass)){
  100      
  100      
  100      
  100      
  100      
  100      
1096       return 0;
1097       } elsif($anno->hasClassification($self->getEssentialSpliceSiteVariantClass)){
1098       return 0;
1099       } elsif($anno->hasClassification($self->getSpliceRegionVariantClass)){
1100       return 0;
1101       } elsif($anno->hasClassification($self->getComplexChangeVariantClass)){
1102       return 0;
1103       } elsif($anno->hasClassification($self->getInFrameVariantClass)){
1104       return 1;
1105       } elsif($anno->hasClassification($self->getFrameShiftVariantClass)){
1106       return 1;
1107       } elsif($anno->hasClassification($self->getCodonVariantClass)){
1108       return 1;
1109       } else {
1110       my $msg = "Unable to calculate protein relevance - UNKNOWN CLASSIFICATION: ".join(' ',$anno->getClassifications);
1111       $self->addMessage($msg);
1112       $log->info($msg);
1113       return 0;
1114       }
1115       } else {
1116       my $msg = "Unable to calculate protein relevance - UNKNOWN CONTEXT: ".$anno->getContext;
1117       $self->addMessage($msg);
1118       $log->info($msg);
1119       return 0;
1120       }
1121       }
1122        
1123       sub _revcompSeq {
1124   487   my ($self,$seqIn) = @_;
1125       my $seqOut;
1126       foreach my $b(reverse(split('',$seqIn))){
1127       $b =~ tr/atcgATCG/tagcTAGC/;
1128       $seqOut .= $b;
1129       }
1130       return $seqOut;
1131       }
1132        
1133       sub _sequenceStartsWithSequence {
1134   26   my ($self,$seqRef,$seqCheck) = @_;
1135 50     warn "CHECKING IF\n$seqRef\nSTARTS WITH\n$seqCheck\n" if ($self->_debug);
1136 50     if(length($seqRef) < length($seqCheck)){
1137 0     warn "FALSE\n" if ($self->_debug);
1138       return 0;
1139       }
1140 50     warn substr($seqRef,0,length($seqCheck))."\n" if ($self->_debug);
1141 50     warn $seqCheck."\n" if ($self->_debug);
1142        
1143 50     if(substr($seqRef,0,length($seqCheck)) eq $seqCheck){
1144 0     warn "TRUE\n" if ($self->_debug);
1145       return 1;
1146       }
1147 50     warn "FALSE\n" if ($self->_debug);
1148       return 0;
1149       }
1150        
1151       sub _calculateFrameShiftProteinChange {
1152   26   my ($self,$wtProt,$mtProt) = @_;
1153       my $subtype = Sanger::CGP::Vagrent::Data::Annotation::getPositionKnownSubtype();
1154       my $wt = undef;
1155       my $mt = undef;
1156       my $mutProtMin = undef;
1157       my $mutProtMax = undef;
1158       my $desc = undef;
1159 50     warn "\n$wtProt\n$mtProt\n" if ($self->_debug);
1160       for(my $i = 0 ; $i < length($mtProt); $i++){
1161 50     warn "$wt, ".substr($wtProt,$i,1)." ne ".substr($mtProt,$i,1)."\n" if ($self->_debug);
1162 100     if(!defined($wt) && substr($wtProt,$i,1) ne substr($mtProt,$i,1)){
  50      
  50      
  100      
1163       $wt = substr($wtProt,$i,1);
1164       $mt = substr($mtProt,$i,1);
1165       $mutProtMin = $i + 1;
1166       $mutProtMax = $mutProtMin;
1167 100     if($mt eq '*'){
1168       # variation actually creates the stop codon
1169       $desc = 'p.'.$wt.$mutProtMin.'fs*1';
1170       last;
1171       } else {
1172       # we'll reserve an I don't know description, should get overridden in further iterations.
1173       $desc = 'p.'.$wt.$mutProtMin.'fs?';
1174       }
1175       } elsif(!defined($wt) && length($mtProt) == $i + 1 && length($wtProt) > $i + 1 && substr($wtProt,$i,1) eq substr($mtProt,$i,1)){
1176       # interesting case, the current position is the last position of the mutant protein
1177       # no differences have been found between mt and wt, but the wt sequence is longer, ie we have lost residues from the end
1178       $wt = substr($wtProt,($i + 1),1);
1179       $mt = '?';
1180       $mutProtMin = $i + 2;
1181       $mutProtMax = $mutProtMin;
1182       $desc = 'p.'.$wt.$mutProtMin.'fs?';
1183       last;
1184       } elsif(defined($wt) && length($mtProt) == $i + 1 && substr($mtProt,$i,1) ne '*'){
1185       # this is the last base of the mutant protein and we never reach a stop.
1186       $mt .= substr($mtProt,$i,1);
1187       $desc = 'p.'.$wt.$mutProtMin.'fs*>'.length($mt);
1188       last;
1189       } elsif(defined($wt)){
1190       $mt .= substr($mtProt,$i,1);
1191 100     if(substr($mtProt,$i,1) eq '*'){
1192       $desc = 'p.'.$wt.$mutProtMin.'fs*'.length($mt);
1193       last;
1194       }
1195       }
1196       }
1197 50     warn "$subtype,$wt,$mt,$mutProtMin,$mutProtMax,$desc\n" if ($self->_debug);
1198       return ($subtype,$wt,$mt,$mutProtMin,$mutProtMax,$desc);
1199       }
1200        
1201       sub _calculateGenomicCdsPosition {
1202   14   my ($self,$tran) = @_;
1203       my @exons = $tran->getExonsGenomicOrder;
1204       my $genoCdsMin = undef;
1205       my $genoCdsMax = undef;
1206       foreach my $e(@exons){
1207 100     if($tran->getStrand == 1){
1208 100     if(!defined($genoCdsMin)){
1209 100     if($e->getRnaMinPos <= $tran->getCdsMinPos && $e->getRnaMaxPos >= $tran->getCdsMinPos){
1210       $genoCdsMin = $e->getMinPos + ($tran->getCdsMinPos - $e->getRnaMinPos);
1211       }
1212       }
1213 50     if(!defined($genoCdsMax)){
1214 100     if($e->getRnaMinPos <= $tran->getCdsMaxPos && $e->getRnaMaxPos >= $tran->getCdsMaxPos){
1215       $genoCdsMax = $e->getMinPos + ($tran->getCdsMaxPos - $e->getRnaMinPos);
1216       }
1217       }
1218       } else {
1219 100     if(!defined($genoCdsMin)){
1220 50     if($e->getRnaMinPos <= $tran->getCdsMaxPos && $e->getRnaMaxPos >= $tran->getCdsMaxPos){
1221       $genoCdsMin = $e->getMaxPos - ($tran->getCdsMaxPos - $e->getRnaMinPos);
1222       }
1223       }
1224 100     if(!defined($genoCdsMax)){
1225 100     if($e->getRnaMinPos <= $tran->getCdsMinPos && $e->getRnaMaxPos >= $tran->getCdsMinPos){
1226       $genoCdsMax = $e->getMaxPos - ($tran->getCdsMinPos - $e->getRnaMinPos);
1227       }
1228       }
1229       }
1230       }
1231       return ($genoCdsMin,$genoCdsMax);
1232       }
1233        
1234       sub _arrayHasString {
1235   605   my ($self,$val,@arr) = @_;
1236       foreach my $a (@arr){
1237 100     return 1 if $a eq $val;
1238       }
1239       return 0;
1240       }
1241        
1242       sub _defaultTranscriptSort {
1243   0   my ($self,@trans) = @_;
1244       my $customSort = sub {
1245   0   my $accds = 0;
1246       my $bccds = 0;
1247 0     $accds = 1 if(defined $a->getCCDS && length($a->getCCDS) > 0);
1248 0     $bccds = 1 if(defined $b->getCCDS && length($b->getCCDS) > 0);
1249       my $chk = $bccds <=> $accds;
1250 0     if($chk == 0){
1251       $chk = $b->getCdsLength <=> $a->getCdsLength;
1252 0     if($chk == 0){
1253       $chk = length($b->getcDNASeq()) <=> length($a->getcDNASeq());
1254       }
1255       }
1256       return $chk;};
1257       return sort $customSort @trans;
1258       }
1259        
1260       __END__
1261        
1262       =head1 NAME
1263        
1264       Sanger::CGP::Vagrent::Annotators::AbstractAnnotator - Abstract base class for the annotation generators
1265        
1266       =head1 DESCRIPTION
1267        
1268       This is an abstract template class for the mutation annotators, it provides a lot of shared behind the scenes functionality. All
1269       subclasses must implement the _getAnnotation method.
1270        
1271       =head1 METHODS
1272        
1273       =head2 Constructor
1274        
1275       =head3 new
1276        
1277       =over
1278        
1279       =item Usage :
1280        
1281       my $source = Sanger::CGP::Vagrent::Annotators::AbstractAnnotatorSubClass->new(%params);
1282        
1283       =item Function :
1284        
1285       Builds a new Sanger::CGP::Vagrent::Annotators::AbstractAnnotator inheriting object
1286        
1287       =item Returns :
1288        
1289       Sanger::CGP::Vagrent::Annotators::AbstractAnnotator object initialized with parameter values
1290        
1291       =item Params :
1292        
1293       Hash of parameter values
1294        
1295       transcriptSource => A Sanger::CGP::Vagrent::TranscriptSource::AbstractTranscriptSource inheriting object
1296       bookmarker => (Optional) An array reference of, or single, Sanger::CGP::Vagrent::Bookmarkers::AbstractBookmarker inheriting object
1297       only_bookmarked => (Optional) Boolean, only return annotations that get bookmarked
1298        
1299       =back
1300        
1301       =head2 Functions
1302        
1303       =head3 getAnnotation
1304        
1305       =over
1306        
1307       =item Usage :
1308        
1309       my @annoGrps = $annotator->getAnnotation($variation);
1310        
1311       =item Function :
1312        
1313       Annotates the supplied L<Variation|Sanger::CGP::Vagrent::Data::AbstractVariation> object and returns a list of L<AnnotationGroups|Sanger::CGP::Vagrent::Data::AnnotationGroup> objects.
1314       If L<Bookmarkers|Sanger::CGP::Vagrent::Bookmarkers::AbstractBookmarker> have been set, the AnnotationGroups will have been marked before being returned.
1315       If 'only_bookmarked' was set to true, only AnnotationGroups that match Bookmarkers will be returned.
1316        
1317       =item Returns :
1318        
1319       An array of L<Sanger::CGP::Vagrent::Data::AnnotationGroup|Sanger::CGP::Vagrent::Data::AnnotationGroup> objects
1320        
1321       =item Params :
1322        
1323       A L<Sanger::CGP::Vagrent::Data::AbstractVariation|Sanger::CGP::Vagrent::Data::AbstractVariation> implementing object
1324        
1325       =back
1326        
1327       =head3 addMessage
1328        
1329       =over
1330        
1331       =item Usage :
1332        
1333       $annotator->addMessage("Interesting event found");
1334        
1335       =item Function :
1336        
1337       Adds a text message to the message list. All messages are reset every time C<getAnnotation> is called
1338        
1339       =item Returns :
1340        
1341       Nothing
1342        
1343       =item Params :
1344        
1345       String
1346        
1347       =back
1348        
1349       =head3 getMessages
1350        
1351       =over
1352        
1353       =item Usage :
1354        
1355       my @mess = $annotator->getMessages();
1356        
1357       =item Function :
1358        
1359       Retrieves a list of message strings about the most recent annotation attempt
1360        
1361       =item Returns :
1362        
1363       Array of String
1364        
1365       =back
1366        
1367       =head2 Abstract
1368        
1369       =head3 _getAnnotation
1370        
1371       =over
1372        
1373       =item Usage :
1374        
1375       my $type = $annotator->_getAnnotation($variation);
1376        
1377       =item Function :
1378        
1379       Abstract internal function, must be implemented in subclass. Returns a list of L<AnnotationGroups|Sanger::CGP::Vagrent::Data::AnnotationGroup>
1380        
1381       =item Returns :
1382        
1383       An array of L<Sanger::CGP::Vagrent::Data::AnnotationGroup|Sanger::CGP::Vagrent::Data::AnnotationGroup> objects
1384        
1385       =item Params :
1386        
1387       A L<Sanger::CGP::Vagrent::Data::AbstractVariation|Sanger::CGP::Vagrent::Data::AbstractVariation> implementing object
1388        
1389       =back
1390        
1391       =head3 _getDefaultCDSAnnotationType
1392        
1393       =over
1394        
1395       =item Usage :
1396        
1397       my $type = $annotator->_getDefaultCDSAnnotationType();
1398        
1399       =item Function :
1400        
1401       Abstract internal function, must be implemented in subclass. Returns default CDS annotation (L<Annotation|Sanger::CGP::Vagrent::Data::Annotation> type constant) type for the Annotator
1402        
1403       =item Returns :
1404        
1405       String - L<Annotation|Sanger::CGP::Vagrent::Data::Annotation> type constant
1406        
1407       =back
1408        
1409       =head3 _getMutatedCdsSequence
1410        
1411       =over
1412        
1413       =item Usage :
1414        
1415       my $varSeq = $annotator->_getMutatedCdsSequence();
1416        
1417       =item Function :
1418        
1419       Abstract internal function, must be implemented in subclass. Returns the full variant form of the CDS sequence
1420        
1421       =item Returns :
1422        
1423       String, DNA sequence
1424        
1425       =item Params :
1426        
1427       String - Full wildtype CDS DNA sequence
1428       Integer - Minimum position of the variant on the wildtype CDS sequence
1429       Integer - Maximum position of the variant on the wildtype CDS sequence
1430       String - Mutant DNA sequence for the variant
1431        
1432       =back
1433        
1434       =head3 _getWildTypeStringForCDSAnno
1435        
1436       =over
1437        
1438       =item Usage :
1439        
1440       my $wtseq = $annotator->_getWildTypeStringForCDSAnno($var,$tran,$mrnaAnno);
1441        
1442       =item Function :
1443        
1444       Abstract internal function, must be implemented in subclass. Generates the CDS wildtype string from the mRNA annotation
1445        
1446       =item Returns :
1447        
1448       String, DNA sequence
1449        
1450       =item Params :
1451        
1452       A Sanger::CGP::Vagrent::Data::AbstractVariation implementing object
1453       A Sanger::CGP::Vagrent::Data::Transcript object
1454       A Sanger::CGP::Vagrent::Data::Annotation object
1455        
1456       =back
1457        
1458       =head3 _getMutantStringForCDSAnno
1459        
1460       =over
1461        
1462       =item Usage :
1463        
1464       my $varseq = $annotator->_getMutantStringForCDSAnno($var,$tran,$mrnaAnno);
1465        
1466       =item Function :
1467        
1468       Abstract internal function, must be implemented in subclass. Generates the CDS variant string from the mRNA annotation
1469        
1470       =item Returns :
1471        
1472       String, DNA sequence
1473        
1474       =item Params :
1475        
1476       A Sanger::CGP::Vagrent::Data::AbstractVariation implementing object
1477       A Sanger::CGP::Vagrent::Data::Transcript object
1478       A Sanger::CGP::Vagrent::Data::Annotation object
1479        
1480       =back
1481        
1482       =head3 _getCDSDescriptionString
1483        
1484       =over
1485        
1486       =item Usage :
1487        
1488       my $desc = $annotator->_getCDSDescriptionString($tran,$cdsMin,$cdsMax,$cdsMinOffset,$cdsMaxOffset,$wt,$mt);
1489        
1490       =item Function :
1491        
1492       Abstract internal function, must be implemented in subclass. Takes the plotted CDS variation data and returns the HGVS syntax describing the change
1493        
1494       =item Returns :
1495        
1496       String, HGVS description
1497        
1498       =item Params :
1499        
1500       A Sanger::CGP::Vagrent::Data::Transcript object
1501       Integer - CDS minimum position
1502       Integer - CDS maximum position
1503       Integer - Offset from the CDS minimum position (signed)
1504       Integer - Offset from the CDS maximum position (signed)
1505       String - Wildtype cDNA sequence of variant
1506       String - Variant cDNA sequence of variant
1507        
1508       =back
1509        
1510       =head3 _getDefaultProteinAnnotationType
1511        
1512       =over
1513        
1514       =item Usage :
1515        
1516       my $type = $annotator->_getDefaultProteinAnnotationType();
1517        
1518       =item Function :
1519        
1520       Abstract internal function, must be implemented in subclass. Returns default Protein annotation (L<Annotation|Sanger::CGP::Vagrent::Data::Annotation> type constant) type for the Annotator
1521        
1522       =item Returns :
1523        
1524       String - L<Annotation|Sanger::CGP::Vagrent::Data::Annotation> type constant
1525        
1526       =back
1527        
1528       =head3 _getCdsMinPosForProteinCalculation
1529        
1530       =over
1531        
1532       =item Usage :
1533        
1534       my $max = $annotator->_getCdsMinPosForProteinCalculation($cDNAAnnot);
1535        
1536       =item Function :
1537        
1538       Abstract internal function, must be implemented in subclass. Returns the minimum cDNA location of the variant that can be used for protein annotation
1539        
1540       =item Returns :
1541        
1542       Integer - cDNA position
1543        
1544       =item Params :
1545        
1546       A L<Annotation|Sanger::CGP::Vagrent::Data::Annotation> representing the cDNA annotation
1547        
1548       =back
1549        
1550       =head3 _getCdsMaxPosForProteinCalculation
1551        
1552       =over
1553        
1554       =item Usage :
1555        
1556       my $max = $annotator->_getCdsMaxPosForProteinCalculation($cDNAAnnot);
1557        
1558       =item Function :
1559        
1560       Abstract internal function, must be implemented in subclass. Returns the maximum cDNA location of the variant that can be used for protein annotation
1561        
1562       =item Returns :
1563        
1564       Integer - cDNA position
1565        
1566       =item Params :
1567        
1568       A L<Annotation|Sanger::CGP::Vagrent::Data::Annotation> representing the cDNA annotation
1569        
1570       =back
1571        
1572       =head3 _isStartGained
1573        
1574       =over
1575        
1576       =item Usage :
1577        
1578       if($annotator->_isStartGained($var,$tran,$mRNAmin,$mRNAmax,$wt,$mt)){
1579       .......
1580       }
1581        
1582       =item Function :
1583        
1584       Abstract internal function, must be implemented in subclass. Returns returns true if there is a start codon created or moved in the 5' UTR
1585        
1586       =item Returns :
1587        
1588       Boolean
1589        
1590       =item Params :
1591        
1592       A Sanger::CGP::Vagrent::Data::AbstractVariation implementing object
1593       A Sanger::CGP::Vagrent::Data::Transcript object
1594       Integer - Minimum mRNA coordinate of the variant
1595       Integer - Maximum mRNA coordinate of the variant
1596       String - Wildtype variant sequence
1597       String - Mutant variant sequence
1598        
1599       =back