File Coverage

lib/Sanger/CGP/Vagrent/Annotators/DeletionAnnotator.pm
Criterion Covered Total %
branch 258 294 87.7
subroutine 28 28 100.0
pod n/a
total 286 322 88.8


line bran sub pod code
1       package Sanger::CGP::Vagrent::Annotators::DeletionAnnotator;
2        
3       ##########LICENCE##########
4       # Copyright (c) 2014-2017 Genome Research Ltd.
5       #
6       # Author: Cancer Genome Project cgpit@sanger.ac.uk
7       #
8       # This file is part of VAGrENT.
9       #
10       # VAGrENT is free software: you can redistribute it and/or modify it under
11       # the terms of the GNU Affero General Public License as published by the Free
12       # Software Foundation; either version 3 of the License, or (at your option) any
13       # later version.
14       #
15       # This program is distributed in the hope that it will be useful, but WITHOUT
16       # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17       # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18       # details.
19       #
20       # You should have received a copy of the GNU Affero General Public License
21       # along with this program. If not, see <http://www.gnu.org/licenses/>.
22       ##########LICENCE##########
23        
24        
25   2   use strict;
26        
27   2   use Bio::Seq;
28        
29   2   use Carp qw(cluck);
30   2   use Log::Log4perl;
31   2   use POSIX qw(ceil);
32   2   use Data::Dumper;
33        
34   2   use Sanger::CGP::Vagrent::Data::Deletion;
35   2   use Sanger::CGP::Vagrent qw($VERSION);
36        
37   2   use base qw(Sanger::CGP::Vagrent::Annotators::AbstractAnnotator);
38        
39       my $log = Log::Log4perl->get_logger(__PACKAGE__);
40        
41       1;
42        
43       sub _getAnnotation {
44   282   my ($self,$var) = @_;
45       $self->_clearMessages();
46 50     unless(defined($var) && $var->isa('Sanger::CGP::Vagrent::Data::Deletion')){
47       my $msg = 'require a Sanger::CGP::Vagrent::Data::Deletion object not a '.ref($var);
48       $self->addMessage($msg);
49       $log->info($msg);
50       return undef;
51       }
52 50     unless($var->isValid){
53       my $msg = 'deletion not valid';
54       $self->addMessage($msg);
55       $log->error($msg);
56       return undef;
57       }
58       my @trans = $self->_getTranscriptSource->getTranscripts($var);
59 100     unless(defined($trans[0])){
60       my $msg = 'No transcripts returned from transcript source';
61       $self->addMessage($msg);
62       $log->info($msg);
63       return undef;
64       }
65       my @groups;
66       foreach my $t(@trans){
67       my $g = $self->_generateAnnotatonGroup($var,$t);
68 100     if(defined($g)){
69       push(@groups,$g);
70       }
71       }
72 100     unless(scalar(@groups) > 0 && defined($groups[0])){
73       my $msg = 'No annotation groups generated';
74       $self->addMessage($msg);
75       $log->info($msg);
76       return undef;
77       }
78       return @groups;
79       }
80        
81       sub _generateAnnotatonGroup {
82   278   my ($self,$var,$tran) = @_;
83       my ($rAnnot,@groupClasses) = $self->_buildRNAAnnotation($var,$tran);
84       @groupClasses = sort @groupClasses;
85 100     unless(defined($rAnnot)){
86       my $msg = 'No mRNA annotation created';
87       $self->addMessage($msg);
88       $log->info($msg);
89       return undef;
90       }
91       my $group = Sanger::CGP::Vagrent::Data::AnnotationGroup->new( accession => $tran->getAccession,
92       label => $tran->getGeneName,
93       ccds => $tran->getCCDS,
94       type => $tran->getGeneType,);
95        
96 100     if($tran->isProteinCoding){
97 100     if( ( $rAnnot->hasClassification($self->getIntronVariantClass) ||
98       $rAnnot->hasClassification($self->get5KBUpStreamVariantClass) ||
99       $rAnnot->hasClassification($self->get2KBUpStreamVariantClass) ||
100       $rAnnot->hasClassification($self->get5KBDownStreamVariantClass) ||
101       $rAnnot->hasClassification($self->get500BPDownStreamVariantClass)) &&
102       !$rAnnot->hasClassification($self->getEssentialSpliceSiteVariantClass) &&
103       !$rAnnot->hasClassification($self->getSpliceRegionVariantClass) &&
104       !$rAnnot->hasClassification($self->getFrameShiftVariantClass) &&
105       !$rAnnot->hasClassification($self->getInFrameVariantClass) &&
106       !$rAnnot->hasClassification($self->get5PrimeUtrVariantClass) &&
107       !$rAnnot->hasClassification($self->get3PrimeUtrVariantClass) &&
108       !$rAnnot->hasClassification($self->getComplexChangeVariantClass)){
109       # Inronic or up/down stream mutations don't need to get any further annotation
110       $group->addAnnotation($rAnnot);
111       } else {
112       my $cAnnot = $self->_buildCDSAnnotation($var,$tran,$rAnnot);
113 50     unless(defined($cAnnot)){
114       my $msg = 'No CDS annotation created';
115       $self->addMessage($msg);
116       $log->info($msg);
117       return undef;
118       }
119       my $pAnnot = $self->_buildProteinAnnotation($var,$tran,$cAnnot,$rAnnot);
120 50     unless(defined($pAnnot)){
121       my $msg = 'No Protein annotation created';
122       $self->addMessage($msg);
123       $log->info($msg);
124       return undef;
125       }
126       $group->addAnnotation($rAnnot);
127       $group->addAnnotation($cAnnot);
128       $group->addAnnotation($pAnnot);
129       }
130       } else {
131       $group->addAnnotation($rAnnot);
132       }
133       $group->addClassification(@groupClasses);
134       return $group;
135       }
136        
137       sub _buildRNAAnnotation {
138   278   my ($self,$var,$tran) = @_;
139       my ($mrnaMin,$mrnaMinOffset,$mrnaMax,$mrnaMaxOffset) = $self->_getmRNAPositions($var,$tran);
140 50     unless(defined($mrnaMin) && defined($mrnaMinOffset) && defined($mrnaMax) && defined($mrnaMaxOffset)){
141       my $msg = 'problem generating mrna coordinates';
142       $self->addMessage($msg);
143       $log->error($msg);
144       return undef;
145       }
146 50     unless($self->_safetyCheck($var,$tran,$mrnaMin,$mrnaMinOffset,$mrnaMax,$mrnaMaxOffset)){
147       # something has gone wrong,
148       my $msg = 'The affected transcript WT sequence in the deleted sequence doesnt match the affected transcript WT sequence from the transcript';
149       $self->addMessage($msg);
150       $log->error($msg);
151       return undef;
152       }
153       my @classes = ($self->getDeletionClass);
154       my @groupClasses = ($self->classifyTranscript($tran));
155        
156       my $upstream = $self->_upstreamVariantCheck($mrnaMin,$mrnaMinOffset,$mrnaMax,$mrnaMaxOffset,\@classes);
157 100     if(defined $upstream){
158 100     if($upstream){
159       return ($self->_buildUnknownMRNAAnnotation($var,$tran,@classes),@groupClasses);
160       }
161       } else {
162       return undef;
163       }
164        
165       my $downstream = $self->_downstreamVariantCheck($mrnaMin,$mrnaMinOffset,$mrnaMax,$mrnaMaxOffset,\@classes);
166 100     if(defined $downstream){
167 100     if($downstream){
168       return ($self->_buildUnknownMRNAAnnotation($var,$tran,@classes),@groupClasses);
169       }
170       } else {
171       return undef;
172       }
173        
174       # delt with up/down stream issues, can reset coordinates to the extremes of the transcript;
175 100     if($mrnaMin == 0 && $mrnaMinOffset <= 0){
176       $mrnaMin = 1;
177       $mrnaMinOffset = 0;
178       }
179 100     if($mrnaMax == 0 && $mrnaMaxOffset >= 0){
180       $mrnaMax = length($tran->getcDNASeq);
181       $mrnaMaxOffset = 0;
182       }
183        
184 100     if($tran->isProteinCoding){
185 100     if(($mrnaMax > $tran->getCdsMinPos || ($mrnaMax == $tran->getCdsMinPos && $mrnaMaxOffset >= 0)) &&
186       ($mrnaMin < $tran->getCdsMaxPos || ($mrnaMin == $tran->getCdsMaxPos && $mrnaMinOffset <= 0))){
187       #if($mrnaMax >= $tran->getCdsMinPos && $mrnaMin <= $tran->getCdsMaxPos){
188       # coding change
189       push(@groupClasses,$self->getCDSClass);
190       }
191 100     if($mrnaMin < $tran->getCdsMinPos || ($mrnaMin == $tran->getCdsMinPos && $mrnaMinOffset < 0)){
192       #if($mrnaMin < $tran->getCdsMinPos){
193       # 5prime UTR
194       push(@groupClasses,$self->get5PrimeUtrClass);
195 100     push(@classes,$self->get5PrimeUtrVariantClass) unless($self->_arrayHasString($self->getCDSClass,@groupClasses));
196       }
197 100     if($mrnaMax > $tran->getCdsMaxPos || ($mrnaMax == $tran->getCdsMaxPos && $mrnaMaxOffset > 0)){
198       #if($mrnaMax > $tran->getCdsMaxPos){
199       # 3prime UTR
200       push(@groupClasses,$self->get3PrimeUtrClass);
201 100     push(@classes,$self->get3PrimeUtrVariantClass) unless($self->_arrayHasString($self->getCDSClass,@groupClasses));
202       }
203       }
204        
205       my $tmpGroupClassesHash = $self->_classifyDeletion($tran,$mrnaMin,$mrnaMinOffset,$mrnaMax,$mrnaMaxOffset);
206       push(@groupClasses, keys %$tmpGroupClassesHash);
207 100     if(scalar(keys %$tmpGroupClassesHash) == 1 && defined($tmpGroupClassesHash->{$self->getIntronClass})){
208       # its intron only
209       return ($self->_buildUnknownMRNAAnnotation($var,$tran,$self->getDeletionClass,$self->getIntronVariantClass),@groupClasses);
210       }
211        
212       my $wt = $self->_getWildTypeStringForRNAAnno($var,$tran);
213       my $mt = '-';
214        
215       $wt =~ tr/Tt/Uu/;
216        
217       my $desc = undef;
218        
219 100     if($mrnaMin == 1 && $mrnaMax == length($tran->getcDNASeq)){
220       $desc = 'r.0';
221       } else {
222       $desc = $self->_generateDNADelDescriptionString('r.',$mrnaMin,$mrnaMax,$mrnaMinOffset,$mrnaMaxOffset,$wt);
223       }
224        
225       my $subtype = undef;
226        
227 50     if($mrnaMin == 0 && $mrnaMax == 0){
  100      
228       $subtype = Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype();
229       } elsif($mrnaMinOffset == 0 && $mrnaMaxOffset == 0){
230       $subtype = Sanger::CGP::Vagrent::Data::Annotation::getPositionKnownSubtype();
231       } else {
232       $subtype = Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype();
233       }
234        
235       my $anno = Sanger::CGP::Vagrent::Data::Annotation->new( wt => uc($wt),
236       mt => $mt,
237       minpos => $mrnaMin,
238       minOffset => $mrnaMinOffset,
239       maxpos => $mrnaMax,
240       maxOffset => $mrnaMaxOffset,
241       acc => $tran->getAccession,
242       accversion => $tran->getAccessionVersion,
243       db => $tran->getDatabase,
244       dbversion => $tran->getDatabaseVersion,
245       seqlength => length($tran->getcDNASeq),
246       description => $desc,
247       context => Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(),
248       type => Sanger::CGP::Vagrent::Data::Annotation::getDeletionAnnotationType(),
249       subtype => $subtype);
250        
251 100     if(defined($tmpGroupClassesHash->{$self->getExonClass}) && defined($tmpGroupClassesHash->{$self->getEssentialSpliceSiteClass})){
252       # special case, variation spans over the end of the exon. This is a complex change in the transcript
253       push(@classes,$self->getComplexChangeVariantClass);
254 100     unless($tran->isProteinCoding){
255       push(@classes,$self->getNonCodingTranscriptVariantClass);
256       }
257        
258       } else {
259 100     if(defined($tmpGroupClassesHash->{$self->getSpliceRegionClass})){
260       push(@classes,$self->getSpliceRegionVariantClass);
261       }
262 100     if(defined($tmpGroupClassesHash->{$self->getEssentialSpliceSiteClass})){
263       push(@classes,$self->getEssentialSpliceSiteVariantClass);
264       }
265 100     if(defined($tmpGroupClassesHash->{$self->getIntronClass})){
266       push(@classes,$self->getIntronVariantClass);
267       }
268 100     if(defined($tmpGroupClassesHash->{$self->getExonClass})){
269 100     if($tran->isProteinCoding){
270 100     if($self->_arrayHasString($self->getCDSClass,@groupClasses)){
271 100     if($self->_arrayHasString($self->get2KBUpStreamVariantClass,@classes) || $self->_arrayHasString($self->get5PrimeUtrClass,@groupClasses) || $self->_arrayHasString($self->get3PrimeUtrClass,@groupClasses) || $self->_arrayHasString($self->get500BPDownStreamVariantClass,@classes)){
272       # it overhangs the CDS-to-UTR/upstream/downstream boundry
273       push(@classes,$self->getComplexChangeVariantClass);
274       } else {
275 100     if(length($wt) % 3 == 0){
276       push(@classes,$self->getInFrameVariantClass);
277       } else {
278       push(@classes,$self->getFrameShiftVariantClass);
279       }
280       }
281       }
282       } else {
283       push(@classes,$self->getNonCodingTranscriptVariantClass);
284       }
285       }
286       }
287        
288       $anno->addClassification(@classes);
289        
290       return ($anno,@groupClasses);
291       }
292        
293       sub _upstreamVariantCheck {
294   505   my ($self,$mrnaMin,$mrnaMinOffset,$mrnaMax,$mrnaMaxOffset,$classes) = @_;
295        
296       my $upStartScore = 0;
297       my $upEndScore = 0;
298 100     if($mrnaMin > 0 || ($mrnaMin == 0 && $mrnaMinOffset >= 0)){
  50      
299       $upStartScore = 3;
300       } elsif($mrnaMin == 0 && $mrnaMinOffset < 0){
301 100     if($self->_isWithin2KBUpstreamOffsetDistance($mrnaMinOffset)){
  100      
302       $upStartScore = 2;
303       } elsif($self->_isWithin5KBOffsetDistance($mrnaMinOffset)){
304       $upStartScore = 1;
305       }
306       }
307 100     if($mrnaMax > 0 || ($mrnaMax == 0 && $mrnaMaxOffset >= 0)){
  50      
308       $upEndScore = 3;
309       } elsif ($mrnaMax == 0 && $mrnaMaxOffset < 0){
310 100     if($self->_isWithin2KBUpstreamOffsetDistance($mrnaMaxOffset)){
  100      
311       $upEndScore = 2;
312       } elsif($self->_isWithin5KBOffsetDistance($mrnaMaxOffset)){
313       $upEndScore = 1;
314       }
315       }
316 50     print "up scores: $upStartScore - $upEndScore\n" if($self->_debug);
317 100     if($upStartScore == 0 && $upEndScore == 0){
318       my $msg = "variant isnt close enough to this transcript, nothing to do";
319       $self->addMessage($msg);
320       $log->error($msg);
321       return undef;
322       }
323       for(my $i = $upStartScore; $i <= $upEndScore ; $i++){
324 100     if($i == 1){
  100      
325       push(@$classes,$self->get5KBUpStreamVariantClass);
326       } elsif($i == 2){
327       push(@$classes,$self->get2KBUpStreamVariantClass);
328       }
329       }
330 100     if($upEndScore < 3){
331       return 1
332       }
333       return 0;
334       }
335        
336       sub _downstreamVariantCheck {
337   455   my ($self,$mrnaMin,$mrnaMinOffset,$mrnaMax,$mrnaMaxOffset,$classes) = @_;
338        
339       my $downStartScore = 0;
340       my $downEndScore = 0;
341 100     if($mrnaMin > 0 || ($mrnaMin == 0 && $mrnaMinOffset <= 0)){
  50      
342       $downStartScore = 3;
343       } elsif($mrnaMin == 0 && $mrnaMinOffset > 0){
344 100     if($self->_isWithin500BPDownstreamOffsetDistance($mrnaMinOffset)){
  100      
345       $downStartScore = 2;
346       } elsif($self->_isWithin5KBOffsetDistance($mrnaMinOffset)){
347       $downStartScore = 1;
348       }
349       }
350 100     if($mrnaMax > 0 || ($mrnaMax == 0 && $mrnaMaxOffset <= 0)){
  50      
351       $downEndScore = 3;
352       } elsif ($mrnaMax == 0 && $mrnaMaxOffset > 0){
353 100     if($self->_isWithin500BPDownstreamOffsetDistance($mrnaMaxOffset)){
  100      
354       $downEndScore = 2;
355       } elsif($self->_isWithin5KBOffsetDistance($mrnaMaxOffset)){
356       $downEndScore = 1;
357       }
358       }
359 50     print "down scores: $downStartScore - $downEndScore\n" if($self->_debug);
360 100     if($downStartScore == 0 && $downEndScore == 0){
361       my $msg = "variant isnt close enough to this transcript, nothing to do";
362       $self->addMessage($msg);
363       $log->error($msg);
364       return undef;
365       }
366       for(my $i = $downEndScore ; $i <= $downStartScore; $i++){
367 100     if($i == 1){
  100      
368       push(@$classes,$self->get5KBDownStreamVariantClass);
369       } elsif($i == 2){
370       push(@$classes,$self->get500BPDownStreamVariantClass);
371       }
372       }
373 100     if($downStartScore < 3){
374       return 1;
375       }
376       return 0;
377       }
378        
379       sub _classifyDeletion {
380   413   my ($self,$tran,$mrnaMin,$mrnaMinOffset,$mrnaMax,$mrnaMaxOffset) = @_;
381       # big scary logic ladder time
382       my $grpCls;
383       my @exons = $tran->getExons;
384       @exons = sort {$a->getRnaMinPos <=> $b->getRnaMinPos} @exons;
385 50     print "$mrnaMin,$mrnaMinOffset,$mrnaMax,$mrnaMaxOffset\n" if($self->_debug);
386       for(my $i = 0 ; $i < scalar(@exons) ; $i++){
387       my $e = $exons[$i];
388 100     if($mrnaMin > $e->getRnaMaxPos){
389       # exon occurs before mutation in transcript, ignore
390       next;
391       }
392 100     if($mrnaMax < $e->getRnaMinPos){
393       # exon occurs after mutation in transcript, ignore
394       next;
395       }
396       # if we get here the variant is somehow linked to the current exon
397        
398       # check to see if the variant overlaps the exon
399 100     if(($mrnaMin < $e->getRnaMaxPos || ($mrnaMin == $e->getRnaMaxPos && $mrnaMinOffset == 0)) && ($mrnaMax > $e->getRnaMinPos || ($mrnaMax == $e->getRnaMinPos && $mrnaMaxOffset == 0))){
400       $grpCls->{$self->getExonClass}++;
401       }
402 50     print Dumper($e) if($self->_debug);
403       # check to see if the variant effects the area before the exon
404 100     if($mrnaMin < $e->getRnaMinPos || ($mrnaMin == $e->getRnaMinPos && $mrnaMinOffset < 0)){
405       # variant starts before the exon
406 100     if($mrnaMin < $e->getRnaMinPos || $self->_isIntronicOffsetDistance($mrnaMinOffset)){
  100      
407       # variant starts in the intron
408 100     if($mrnaMax > $e->getRnaMinPos || ($mrnaMax == $e->getRnaMinPos && $mrnaMaxOffset == 0)){
  100      
  50      
409       # variant ends after or on the start of the exon, so it gets all 3 terms
410       $grpCls->{$self->getIntronClass}++;
411       $grpCls->{$self->getSpliceRegionClass}++;
412       $grpCls->{$self->getEssentialSpliceSiteClass}++;
413       } elsif($self->_isIntronicOffsetDistance($mrnaMaxOffset)){
414       # variant also ends in the intron
415       $grpCls->{$self->getIntronClass}++;
416       } elsif($mrnaMaxOffset < 0 && $mrnaMaxOffset >= $self->_getConsesnsusSpliceBeforeBoundry){
417       # variant ends in the essential splice site
418       $grpCls->{$self->getIntronClass}++;
419       $grpCls->{$self->getSpliceRegionClass}++;
420       $grpCls->{$self->getEssentialSpliceSiteClass}++;
421       } else {
422       # variant ends in the splice region
423       $grpCls->{$self->getIntronClass}++;
424       $grpCls->{$self->getSpliceRegionClass}++;
425       }
426       } elsif ($mrnaMin == $e->getRnaMinPos && $mrnaMinOffset >= $self->_getConsesnsusSpliceBeforeBoundry){
427       # variant starts in the essential splice site
428       $grpCls->{$self->getEssentialSpliceSiteClass}++;
429       } else {
430       # variant starts in the splice region
431 100     if($mrnaMax > $e->getRnaMinPos || ($mrnaMax == $e->getRnaMinPos && $mrnaMaxOffset == 0)){
  100      
432       # variant ends after the start of the exon, so
433       $grpCls->{$self->getSpliceRegionClass}++;
434       $grpCls->{$self->getEssentialSpliceSiteClass}++;
435       } elsif($mrnaMaxOffset < 0 && $mrnaMaxOffset >= $self->_getConsesnsusSpliceBeforeBoundry){
436       # variant ends in the essential splice site
437       $grpCls->{$self->getSpliceRegionClass}++;
438       $grpCls->{$self->getEssentialSpliceSiteClass}++;
439       } else {
440       # variant also ends in the splice region
441       $grpCls->{$self->getSpliceRegionClass}++;
442       }
443       }
444       }
445        
446       # check to see if the variant effects the area after the exon
447 100     if($mrnaMax > $e->getRnaMaxPos || ($mrnaMax == $e->getRnaMaxPos && $mrnaMaxOffset > 0)){
448       # variant ends after the exon
449 100     if($mrnaMax > $e->getRnaMaxPos || $self->_isIntronicOffsetDistance($mrnaMaxOffset)){
  100      
450       # variant ends in the intron
451 100     if($mrnaMin < $e->getRnaMaxPos || ($mrnaMin == $e->getRnaMaxPos && $mrnaMinOffset == 0)){
  100      
  100      
452       # variant starts before or on the end of the exon, so it gets all 3 terms
453       $grpCls->{$self->getIntronClass}++;
454       $grpCls->{$self->getSpliceRegionClass}++;
455       $grpCls->{$self->getEssentialSpliceSiteClass}++;
456       } elsif($self->_isIntronicOffsetDistance($mrnaMinOffset)){
457       # variant also starts in the intron
458       $grpCls->{$self->getIntronClass}++;
459       } elsif ($mrnaMinOffset > 0 && $mrnaMinOffset <= $self->_getConsesnsusSpliceAfterBoundry){
460       # variant starts in the essential splice site
461       $grpCls->{$self->getIntronClass}++;
462       $grpCls->{$self->getSpliceRegionClass}++;
463       $grpCls->{$self->getEssentialSpliceSiteClass}++;
464       } else {
465       # variant must start in the splice region
466       $grpCls->{$self->getIntronClass}++;
467       $grpCls->{$self->getSpliceRegionClass}++;
468       }
469       } elsif($mrnaMax == $e->getRnaMaxPos && $mrnaMaxOffset <= $self->_getConsesnsusSpliceAfterBoundry){
470       # variant ends in the essential splice site
471       $grpCls->{$self->getEssentialSpliceSiteClass}++;
472       } else {
473       # variant ends in the splice region
474 100     if($mrnaMin < $e->getRnaMaxPos || ($mrnaMin == $e->getRnaMaxPos && $mrnaMinOffset == 0)){
  100      
475       # variant starts before or on the end of the exon, so it gets all 3 terms
476       $grpCls->{$self->getSpliceRegionClass}++;
477       $grpCls->{$self->getEssentialSpliceSiteClass}++;
478       } elsif ($mrnaMinOffset > 0 && $mrnaMinOffset <= $self->_getConsesnsusSpliceAfterBoundry){
479       # variant starts in the essential splice site
480       $grpCls->{$self->getSpliceRegionClass}++;
481       $grpCls->{$self->getEssentialSpliceSiteClass}++;
482       } else {
483       # variant must start in the splice region
484       $grpCls->{$self->getSpliceRegionClass}++;
485       }
486       }
487       }
488       }
489 50     print Dumper($grpCls) if($self->_debug);
490       return $grpCls;
491       }
492        
493       sub _getCDSDescriptionString {
494   86   my ($self,$tran,$mutStart,$mutEnd,$mutStartOffset,$mutEndOffset,$wt,$mt) = @_;
495       my $desc = undef;
496 100     if($mutStart == 1 && $mutEnd == length($tran->getCdsSeq)){
497       $desc = 'c.0';
498       } else {
499       $desc = $self->_generateDNADelDescriptionString('c.',$mutStart,$mutEnd,$mutStartOffset,$mutEndOffset,$wt);
500       }
501       return $desc;
502       }
503        
504       sub _generateDNADelDescriptionString {
505   260   my ($self,$pre,$mutStart,$mutEnd,$mutStartOffset,$mutEndOffset,$wt) = @_;
506       my $desc = 'WIBBLE';
507 100     if($mutStart == $mutEnd && $mutStartOffset == $mutEndOffset){
508       # single base change
509 100     if($mutStartOffset > 0){
  100      
  50      
510       $desc = $pre.$mutStart.'+'.$mutStartOffset.'del'.$wt;
511       } elsif($mutStartOffset < 0){
512       $desc = $pre.$mutStart.$mutStartOffset.'del'.$wt;
513       } elsif ($mutStartOffset == 0) {
514       $desc = $pre.$mutStart.'del'.$wt;
515       }
516       } else {
517       # multi base change
518 100     if($mutStartOffset > 0){
  100      
  50      
519       $desc = $pre.$mutStart.'+'.$mutStartOffset.'_';
520       } elsif($mutStartOffset < 0){
521       $desc = $pre.$mutStart.$mutStartOffset.'_';
522       } elsif($mutStartOffset == 0) {
523       $desc = $pre.$mutStart.'_';
524       }
525        
526 100     if($mutEndOffset > 0){
  100      
  50      
527       $desc .= $mutEnd.'+'.$mutEndOffset.'del';
528       } elsif($mutEndOffset < 0){
529       $desc .= $mutEnd.$mutEndOffset.'del';
530       } elsif($mutEndOffset == 0){
531       $desc .= $mutEnd.'del';
532       }
533        
534 100     if(length($wt) > 15){
535       $desc .= length($wt);
536       } else {
537       $desc .= $wt;
538       }
539       }
540       return $desc;
541       }
542        
543       sub _getWildTypeStringForRNAAnno {
544   444   my ($self,$var,$tran) = @_;
545       my $wt = '';
546       my @seq = split('',$var->getDeletedSequence);
547       my @exons = sort{$a->getMinPos <=> $b->getMinPos} $tran->getExons;
548       for(my $i = $var->getMinPos, my $c = 0; $c < scalar(@seq); $i++, $c++){
549       my $isExonic = 0;
550       my $ec = 0;
551       my $use = 0;
552       foreach my $e(@exons){
553       $ec++;
554 100     if($i > $e->getMaxPos){ # base after exon, continue
  100      
  50      
555 100     if($ec == scalar(@exons)){ # if the base is after the last exon we don't want to use it
556       $use = 0;
557       } else {
558       $use = 1;
559       }
560       next;
561       } elsif($i >= $e->getMinPos && $i <= $e->getMaxPos){ # base in exon, quit
562       $isExonic = 1;
563       $use = 1;
564       last;
565       } elsif($i < $e->getMinPos){ # base before exon, quit
566 100     $use = 1 unless($ec==1); # ignore the base if its before the first exon
567       last;
568       }
569       }
570 100     next unless($use == 1); # skip the base if its not flagged for use
571 100     if($isExonic){
572       $wt .= uc($seq[$c]);
573       } else {
574       $wt .= lc($seq[$c]);
575       }
576       }
577 100     if($tran->getStrand == -1){
578       $wt = $self->_revcompSeq($wt);
579       }
580       return $wt;
581       }
582        
583       sub _getWildTypeStringForCDSAnno {
584   165   my ($self,$var,$tran,$rAnnot) = @_;
585       my $wt = '';
586 100     if($rAnnot->hasClassification($self->getInFrameVariantClass) || $rAnnot->hasClassification($self->getFrameShiftVariantClass)){
  100      
587       my @seq = split('',$var->getDeletedSequence);
588       my @exons = sort{$a->getMinPos <=> $b->getMinPos} $tran->getExons;
589       for(my $i = $var->getMinPos, my $c = 0; $c < scalar(@seq); $i++, $c++){
590       my $isCod = 0;
591       foreach my $e(@exons){
592 100     next unless($i >= $e->getMinPos && $i <= $e->getMaxPos); # base is in this exon
593 100     if($tran->getStrand == 1){
594       my $pos = ($i - $e->getMinPos) + $e->getRnaMinPos;
595 50     if($pos >= $tran->getCdsMinPos && $pos <= $tran->getCdsMaxPos){
596       $isCod = 1;
597       last;
598       }
599       } else {
600       my $pos = $e->getRnaMaxPos - ($i - $e->getMinPos);
601 50     if($pos >= $tran->getCdsMinPos && $pos <= $tran->getCdsMaxPos){
602       $isCod = 1;
603       last;
604       }
605       }
606       }
607 50     if($isCod){
608       $wt .= uc($seq[$c]);
609       }
610       }
611 100     if($tran->getStrand == -1){
612       $wt = $self->_revcompSeq($wt);
613       }
614       } elsif ($rAnnot->hasClassification($self->getComplexChangeVariantClass)){
615 100     if($rAnnot->getMinPos <= $tran->getCdsMinPos || $rAnnot->getMaxPos >= $tran->getCdsMaxPos){
616       # variant exists outside the CDS, have to truncate it back
617       my $len = ($tran->getCdsMaxPos - $tran->getCdsMinPos) + 1;
618       my ($genomicCdsLow,$genomicCdsHigh) = $self->_calculateGenomicCdsPosition($tran);
619       my $newDelMin = undef;
620       my $newDelMax = undef;
621       my $newDelSeq = $var->getDeletedSequence;
622 100     if($var->getMinPos < $genomicCdsLow){
623       $newDelMin = $genomicCdsLow;
624       substr($newDelSeq,0,($newDelMin - $var->getMinPos),'');
625       } else {
626       $newDelMin = $var->getMinPos;
627       }
628 100     if($var->getMaxPos > $genomicCdsHigh){
629       $newDelMax = $genomicCdsHigh;
630       $newDelSeq = substr($newDelSeq,0,($newDelMax - $newDelMin) + 1);
631       } else {
632       $newDelMax = $var->getMaxPos;
633       }
634        
635       my $newDel = Sanger::CGP::Vagrent::Data::Deletion->new(
636       'species' => $var->getSpecies,
637       'genomeVersion' => $var->getGenomeVersion,
638       'chr' => $var->getChr,
639       'minpos' => $newDelMin,
640       'maxpos' => $newDelMax,
641       'delseq' => $newDelSeq);
642        
643       $wt = $self->_getWildTypeStringForRNAAnno($newDel,$tran);
644       } else {
645       $wt = $self->_getWildTypeStringForRNAAnno($var,$tran);
646       }
647       } else {
648       $wt = $self->_getWildTypeStringForRNAAnno($var,$tran);
649       }
650       $wt =~ s/u/t/ig;
651       return $wt;
652       }
653        
654       sub _getMutantStringForCDSAnno {
655   86   my ($self,$var,$tran,$rAnnot) = @_;
656       return '-';
657       }
658        
659       sub _getMutatedCdsSequence {
660   18   my ($self,$wtDna,$min,$max,$mt) = @_;
661       my $codingDelLength = ($max - $min) + 1;
662       my $mtDna = $wtDna;
663       substr($mtDna,($min - 1),$codingDelLength,'');
664       return $mtDna
665       }
666        
667       sub _getCdsMinPosForProteinCalculation {
668   36   my ($self,$cAnnot) = @_;
669       my $min = undef;
670 50     if($cAnnot->getMinOffset > 0){
671       $min = $cAnnot->getMinPos + 1;
672       } else {
673       $min = $cAnnot->getMinPos;
674       }
675       return $min;
676       }
677        
678       sub _getCdsMaxPosForProteinCalculation {
679   36   my ($self,$cAnnot) = @_;
680       my $max = undef;
681 50     if($cAnnot->getMaxOffset < 0){
682       $max = $cAnnot->getMaxPos - 1;
683       } else {
684       $max = $cAnnot->getMaxPos;
685       }
686       return $max;
687       }
688        
689       sub _safetyCheck {
690   505   my ($self,$var,$tran,$mrnaMin,$mrnaMinOffset,$mrnaMax,$mrnaMaxOffset) = @_;
691       my $tranString = $self->_getTranscriptSubstringForCheck($tran,$mrnaMin,$mrnaMinOffset,$mrnaMax,$mrnaMaxOffset);
692 50     unless(defined($tranString)){
693       return 0;
694       }
695       my $genoString = $self->_getExonicGenomicSubstringForCheck($var,$tran);
696 50     unless(defined($genoString)){
697       return 0;
698       }
699 50     if($self->_debug){
700       print 'DELSEQ = '.$var->getDeletedSequence."\n";
701       print 'TRANSEQ = '.$tranString."\n";
702       print 'GENOSEQ = '.$genoString."\n";
703       }
704 100     if(length($tranString) == 0 && length($genoString) == 0){
  50      
705       # not exonic sequence, no transcript sequence to check, just have to hope its OK
706       return 1;
707       } elsif(uc($tranString) eq uc($genoString)){
708       return 1;
709       }
710       return 0;
711       }
712        
713       sub _getTranscriptSubstringForCheck {
714   505   my ($self,$tran,$mrnaMin,$mrnaMinOffset,$mrnaMax,$mrnaMaxOffset) = @_;
715       my $substr = undef;
716       my $pos = undef;
717       my $len = undef;
718 50     print "TRAN SUB STR $mrnaMin,$mrnaMinOffset,$mrnaMax,$mrnaMaxOffset\n" if($self->_debug);
719 100     if($mrnaMin == 0 && $mrnaMinOffset > 0){
  100      
  100      
720       # variant starts after transcript, nothing to do
721       return '';
722       } elsif($mrnaMax == 0 && $mrnaMaxOffset < 0){
723       # variant ends before transcript, nothing to do
724       return '';
725       } elsif($mrnaMinOffset == 0 && $mrnaMaxOffset == 0){
726       $pos = $mrnaMin - 1;
727       $len = ($mrnaMax - $mrnaMin) + 1
728       } else {
729       my $leftAnchor;
730       my $rightAnchor;
731 100     if($mrnaMinOffset > 0){
732       $leftAnchor = $mrnaMin + 1;
733       } else {
734       $leftAnchor = $mrnaMin;
735       }
736 100     if($mrnaMaxOffset < 0){
737       $rightAnchor = $mrnaMax - 1;
738       } else {
739       $rightAnchor = $mrnaMax;
740       }
741 100     $leftAnchor = 1 if $leftAnchor < 1;
742 100     $rightAnchor = length($tran->getcDNASeq) if($rightAnchor == 0);
743 50     print "ANCHOR $leftAnchor $rightAnchor\n" if($self->_debug);
744 100     if($leftAnchor > $rightAnchor){
745       # its an intronic change that doesn't span any coding bases
746       return '';
747       }
748       $pos = $leftAnchor - 1;
749       $len = ($rightAnchor - $leftAnchor) + 1;
750        
751       }
752 50     print "FINAL $pos $len\n" if($self->_debug);
753       $substr = substr($tran->getcDNASeq,$pos,$len);
754       return $substr;
755       }
756        
757       sub _getExonicGenomicSubstringForCheck {
758   505   my ($self,$var,$tran) = @_;
759       my $substr = undef;
760       my $c = 0;
761       my $beforeC = 0;
762       my $duringC = 0;
763       foreach my $e ($tran->getExonsGenomicOrder){
764       $c++;
765 100     if($e->getMaxPos < $var->getMinPos){
766       # exon before deletion - skip
767 50     print "EXON $c BEFORE\n" if($self->_debug);
768       $beforeC++;
769       next;
770       }
771 100     if($var->getMinPos <= $e->getMaxPos && $e->getMinPos <= $var->getMaxPos){
772       # overlap
773       $duringC++;
774       my $start = ($e->getMinPos - $var->getMinPos) + 1;
775       my $end = ($e->getMaxPos - $var->getMinPos) + 1;
776 100     $start = 1 if($start < 1);
777 100     $end = length($var->getDeletedSequence) if($end > length($var->getDeletedSequence));
778        
779       my $pos = $start - 1;
780       my $len = ($end - $start) + 1;
781       my $tmp = substr($var->getDeletedSequence,$pos,$len);
782 100     if(defined($substr)){
783       $substr .= $tmp;
784       } else {
785       $substr = $tmp;
786       }
787 50     print "EXON $c DURING $start $end $tmp\n" if($self->_debug);
788       }
789 100     if($var->getMaxPos < $e->getMinPos){
790 50     print "EXON $c AFTER\n" if($self->_debug);
791       # exon after deletion
792 100     if(!defined($substr)){
793       # its not a coding change;
794       return '';
795       }
796       }
797       }
798 100     if($duringC == 0){
799       # variant doesn't overlap any exons
800       return '';
801       }
802 100     if($tran->getStrand == -1){
803       $substr = $self->_revcompSeq($substr);
804       }
805       return $substr;
806       }
807        
808       sub _getDefaultCDSAnnotationType {
809   86   return Sanger::CGP::Vagrent::Data::Annotation::getDeletionAnnotationType();
810       }
811        
812       sub _getDefaultProteinAnnotationType {
813   3   return Sanger::CGP::Vagrent::Data::Annotation::getDeletionAnnotationType();
814       }
815        
816        
817       =head1 NAME
818        
819       Sanger::CGP::Vagrent::Annotators::DeletionAnnotator - Annotator for deletion variants
820        
821       =head1 DESCRIPTION
822        
823       This annotates deletion variants, it provides L<AnnotatonGroups|Sanger::CGP::Vagrent::Data::AnnotationGroup> for each transcript returned from the L<TranscriptSource|Sanger::CGP::Vagrent::TranscriptSource::AbstractTranscriptSource>
824        
825       It will only process L<Sanger::CGP::Vagrent::Data::Deletion|Sanger::CGP::Vagrent::Data::Deletion> objects, if any other L<Variation|Sanger::CGP::Vagrent::Data::AbstractVariation> objects are sent in it will return an empty answer.
826        
827       It inherits from L<Sanger::CGP::Vagrent::Annotators::AbstractAnnotator|Sanger::CGP::Vagrent::Annotators::AbstractAnnotator>.
828        
829       =head1 METHODS
830        
831       See L<Sanger::CGP::Vagrent::Annotators::AbstractAnnotator|Sanger::CGP::Vagrent::Annotators::AbstractAnnotator>