File Coverage

lib/Sanger/CGP/Vagrent/Annotators/SimpleSubstitutionAnnotator.pm
Criterion Covered Total %
branch 71 84 84.5
subroutine 21 21 100.0
pod n/a
total 92 105 87.6


line bran sub pod code
1       package Sanger::CGP::Vagrent::Annotators::SimpleSubstitutionAnnotator;
2        
3       ##########LICENCE##########
4       # Copyright (c) 2014-2017 Genome Research Ltd.
5       #
6       # Author: Cancer Genome Project cgpit@sanger.ac.uk
7       #
8       # This file is part of VAGrENT.
9       #
10       # VAGrENT is free software: you can redistribute it and/or modify it under
11       # the terms of the GNU Affero General Public License as published by the Free
12       # Software Foundation; either version 3 of the License, or (at your option) any
13       # later version.
14       #
15       # This program is distributed in the hope that it will be useful, but WITHOUT
16       # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17       # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18       # details.
19       #
20       # You should have received a copy of the GNU Affero General Public License
21       # along with this program. If not, see <http://www.gnu.org/licenses/>.
22       ##########LICENCE##########
23        
24        
25   1   use strict;
26        
27   1   use Bio::Seq;
28        
29   1   use Log::Log4perl;
30   1   use Carp qw(cluck);
31   1   use Log::Log4perl qw(:easy);
32   1   use POSIX qw(ceil);
33   1   use Data::Dumper;
34        
35   1   use Sanger::CGP::Vagrent qw($VERSION);
36        
37   1   use base qw(Sanger::CGP::Vagrent::Annotators::AbstractAnnotator);
38        
39       my $log = Log::Log4perl->get_logger(__PACKAGE__);
40        
41       1;
42        
43       sub _getAnnotation {
44   156   my ($self,$var) = @_;
45       $self->_clearMessages();
46 50     unless(defined($var) && $var->isa('Sanger::CGP::Vagrent::Data::Substitution')){
47       my $msg = 'require a Sanger::CGP::Vagrent::Data::Substitution object, not a '.ref($var);
48       $self->addMessage($msg);
49       $log->info($msg);
50       return undef;
51       }
52 50     unless($var->isValid){
53       my $msg = 'substitution not valid';
54       $self->addMessage($msg);
55       $log->error($msg);
56       return undef;
57       }
58       my @trans = $self->_getTranscriptSource->getTranscripts($var);
59 100     unless(defined($trans[0])){
60       my $msg = 'No transcripts returned from transcript source';
61       $self->addMessage($msg);
62       $log->info($msg);
63       return undef;
64       }
65       my @groups;
66       foreach my $t(@trans){
67       my $g = $self->_generateAnnotatonGroup($var,$t);
68 100     if(defined($g)){
69       push(@groups,$g);
70       }
71       }
72 100     unless(scalar(@groups) > 0 && defined($groups[0])){
73       my $msg = 'No annotation groups generated';
74       $self->addMessage($msg);
75       $log->info($msg);
76       return undef;
77       }
78       return @groups;
79       }
80        
81       sub _generateAnnotatonGroup {
82   154   my ($self,$var,$tran) = @_;
83       my ($rAnnot,@groupClasses) = $self->_buildRNAAnnotation($var,$tran);
84 100     unless(defined($rAnnot)){
85       my $msg = 'No mRNA annotation created';
86       $self->addMessage($msg);
87       $log->info($msg);
88       return undef;
89       }
90       my $group = Sanger::CGP::Vagrent::Data::AnnotationGroup->new( accession => $tran->getAccession,
91       label => $tran->getGeneName,
92       ccds => $tran->getCCDS,
93       type => $tran->getGeneType,);
94        
95 100     if($tran->isProteinCoding){
96 100     if($rAnnot->hasClassification($self->getIntronVariantClass) ||
97       $rAnnot->hasClassification($self->get5KBUpStreamVariantClass) ||
98       $rAnnot->hasClassification($self->get2KBUpStreamVariantClass) ||
99       $rAnnot->hasClassification($self->get5KBDownStreamVariantClass) ||
100       $rAnnot->hasClassification($self->get500BPDownStreamVariantClass)){
101       # mutation is only indirectly related to the transcript, only need the mRNA annotation
102       $group->addAnnotation($rAnnot);
103       } else {
104       my $cAnnot = $self->_buildCDSAnnotation($var,$tran,$rAnnot);
105 50     unless(defined($cAnnot)){
106       my $msg = 'No CDS annotation created';
107       $self->addMessage($msg);
108       $log->info($msg);
109       return undef;
110       }
111       my $pAnnot = $self->_buildProteinAnnotation($var,$tran,$cAnnot,$rAnnot);
112 50     unless(defined($pAnnot)){
113       my $msg = 'No Protein annotation created';
114       $self->addMessage($msg);
115       $log->info($msg);
116       return undef;
117       }
118       $group->addAnnotation($rAnnot);
119       $group->addAnnotation($cAnnot);
120       $group->addAnnotation($pAnnot);
121       }
122       } else {
123       $group->addAnnotation($rAnnot);
124       }
125       $group->addClassification(sort @groupClasses);
126       return $group;
127       }
128        
129       sub _buildRNAAnnotation {
130   154   my ($self,$var,$tran) = @_;
131        
132       my ($mrnaMin,$mrnaMinOffset,$mrnaMax,$mrnaMaxOffset) = $self->_getmRNAPositions($var,$tran);
133 50     unless(defined($mrnaMin) && defined($mrnaMinOffset) && defined($mrnaMax) && defined($mrnaMaxOffset)){
134       my $msg = 'problem generating mrna coordinates';
135       $self->addMessage($msg);
136       $log->error($msg);
137       return undef;
138       }
139 50     unless($mrnaMin == $mrnaMax && $mrnaMinOffset == $mrnaMaxOffset){
140       my $msg = "This should be a single base mutation, coordinates returned were MIN: $mrnaMin OFFSET:$mrnaMinOffset and MAX: $mrnaMax OFFSET: $mrnaMaxOffset";
141       $self->addMessage($msg);
142       $log->error($msg);
143       return undef;
144       }
145        
146       # This should simplify things from here on in, only need one position and one offset
147       # we have just checked that they are the same
148       my $pos = $mrnaMin;
149       my $offset = $mrnaMinOffset;
150       my $wt = undef;
151       my $mt = undef;
152       my $desc = undef;
153       my $subtype = undef;
154       my @classes = ($self->getSubstitutionClass);
155       my @groupClasses = ($self->classifyTranscript($tran));
156        
157 100     if($pos == 0){
158       # the variant is off the transcript, have to do the up/down stream check
159 100     if($offset < 0){
  50      
160       # before start of transcript
161 100     if($self->_isWithin2KBUpstreamOffsetDistance($offset)){
  100      
162       return ($self->_buildUnknownMRNAAnnotation($var,$tran,$self->getSubstitutionClass,$self->get2KBUpStreamVariantClass),@groupClasses);
163       } elsif($self->_isWithin5KBOffsetDistance($offset)){
164       return ($self->_buildUnknownMRNAAnnotation($var,$tran,$self->getSubstitutionClass,$self->get5KBUpStreamVariantClass),@groupClasses);
165       } else {
166       my $msg = "variant isnt close enough to this transcript, nothing to do";
167       $self->addMessage($msg);
168       $log->error($msg);
169       return undef;
170       }
171        
172       } elsif($offset > 0){
173       # after end of transcript
174 100     if($self->_isWithin500BPDownstreamOffsetDistance($offset)){
  100      
175       return ($self->_buildUnknownMRNAAnnotation($var,$tran,$self->getSubstitutionClass,$self->get500BPDownStreamVariantClass),@groupClasses);
176       } elsif($self->_isWithin5KBOffsetDistance($offset)){
177       return ($self->_buildUnknownMRNAAnnotation($var,$tran,$self->getSubstitutionClass,$self->get5KBDownStreamVariantClass),@groupClasses);
178       } else {
179       my $msg = "variant isnt close enough to this transcript, nothing to do";
180       $self->addMessage($msg);
181       $log->error($msg);
182       return undef;
183       }
184       } else {
185       my $msg = "strange positional info, a variant with a position of 0 must have an offset";
186       $self->addMessage($msg);
187       $log->error($msg);
188       return undef;
189       }
190       }
191        
192 100     if($tran->isProteinCoding){
193 100     if(($pos > $tran->getCdsMinPos || ($pos == $tran->getCdsMinPos && $offset >= 0)) &&
  100      
  50      
194       ($pos < $tran->getCdsMaxPos || ($pos == $tran->getCdsMaxPos && $offset <= 0))){
195       # coding change
196       push(@groupClasses,$self->getCDSClass);
197       } elsif($pos < $tran->getCdsMinPos || ($pos == $tran->getCdsMinPos && $offset < 0)){
198       # 5prime UTR
199       push(@groupClasses,$self->get5PrimeUtrClass);
200       } elsif($pos > $tran->getCdsMaxPos || ($pos == $tran->getCdsMaxPos && $offset > 0)){
201       # 3prime UTR
202       push(@groupClasses,$self->get3PrimeUtrClass);
203       } else {
204       my $msg = "strange positional info, can't work out subtype : var $mrnaMin $mrnaMinOffset $mrnaMax $mrnaMaxOffset transcript ".$tran->getCdsMinPos." - ".$tran->getCdsMaxPos;
205       $self->addMessage($msg);
206       $log->error($msg);
207       return undef;
208       }
209       }
210        
211 100     if($self->_isIntronicOffsetDistance($offset)){
212       # its intronic
213       push(@groupClasses,$self->getIntronClass);
214       return ($self->_buildUnknownMRNAAnnotation($var,$tran,$self->getSubstitutionClass,$self->getIntronVariantClass),@groupClasses);
215       }
216 100     if($tran->getStrand == 1){
217       # transcript is on the same strand as the genome
218       $wt = $var->getWt();
219       $mt = $var->getMt();
220       } else {
221       # transcript reversed on the genome
222       $wt = $self->_revcompSeq($var->getWt());
223       $mt = $self->_revcompSeq($var->getMt());
224       }
225 100     if($offset == 0){
226       # its in an exon
227       push(@groupClasses,$self->getExonClass);
228       $subtype = Sanger::CGP::Vagrent::Data::Annotation::getPositionKnownSubtype();
229       # sanity check - does the substring on the cDNA sequence at the mut location equal wt
230       my $substr = substr($tran->getcDNASeq,($pos - 1),1);
231 50     if(lc($substr) ne lc($wt)){
232       my $msg = "calculated wt doesn't match substring - $wt vs $substr";
233       $self->addMessage($msg);
234       $log->error($msg);
235       return undef;
236       }
237 100     if($tran->isProteinCoding){
238 100     if($self->_arrayHasString($self->get5PrimeUtrClass,@groupClasses)) {
  100      
239 100     if($self->_isStartGained($var,$tran,$pos,$pos,$wt,$mt)){
240       push(@classes,$self->getPrematureStartGainedVariantClass);
241       } else {
242       push(@classes,$self->get5PrimeUtrVariantClass);
243       }
244       } elsif($self->_arrayHasString($self->get3PrimeUtrClass,@groupClasses)){
245       push(@classes,$self->get3PrimeUtrVariantClass);
246       } else {
247       push(@classes,$self->getCodonVariantClass);
248       }
249       } else {
250       push(@classes,$self->getNonCodingTranscriptVariantClass);
251       }
252       } else {
253       # its not in an exon, and its not intronic. Must be related to splice site.
254       $subtype = Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype();
255 100     if($self->_isOffsetAConsensusSpliceDistance($offset)){
256       push(@classes,$self->getEssentialSpliceSiteVariantClass);
257       push(@groupClasses,$self->getEssentialSpliceSiteClass);
258       } else {
259       push(@classes,$self->getSpliceRegionVariantClass);
260       push(@groupClasses,$self->getSpliceRegionClass);
261       }
262 100     if($self->_arrayHasString($self->get5PrimeUtrClass,@groupClasses)){
  100      
263       push(@classes,$self->get5PrimeUtrVariantClass);
264       } elsif($self->_arrayHasString($self->get3PrimeUtrClass,@groupClasses)){
265       push(@classes,$self->get3PrimeUtrVariantClass);
266       }
267       }
268        
269       $wt =~ s/t/u/ig;
270       $mt =~ s/t/u/ig;
271 100     if($offset == 0){
  100      
272       $desc = 'r.'.$pos . lc($wt) . '>' . lc($mt);
273       } elsif($offset > 0){
274       $desc = 'r.'.$pos.'+'.$offset. lc($wt).'>'.lc($mt);
275       } else {
276       $desc = 'r.'.$pos.$offset.lc($wt).'>'.lc($mt);
277       }
278        
279        
280        
281       my $anno = Sanger::CGP::Vagrent::Data::Annotation->new(wt => uc($wt),
282       mt => uc($mt),
283       minpos => $pos,
284       minOffset => $offset,
285       maxpos => $pos,
286       maxOffset => $offset,
287       acc => $tran->getAccession,
288       accversion => $tran->getAccessionVersion,
289       db => $tran->getDatabase,
290       dbversion => $tran->getDatabaseVersion,
291       seqlength => length($tran->getcDNASeq),
292       description => $desc,
293       context => Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(),
294       type => Sanger::CGP::Vagrent::Data::Annotation::getSubstitutionAnnotationType(),
295       subtype => $subtype);
296        
297       $anno->addClassification(@classes);
298        
299       return ($anno,@groupClasses);
300       }
301        
302       sub _getWildTypeStringForCDSAnno {
303   56   my ($self,$var,$tran,$rAnnot) = @_;
304       my $wt = $rAnnot->getWt();
305       $wt =~ s/u/t/ig;
306       return uc($wt);
307       }
308        
309       sub _getMutantStringForCDSAnno {
310   56   my ($self,$var,$tran,$rAnnot) = @_;
311       my $mt = $rAnnot->getMt();
312       $mt =~ s/u/t/ig;
313       return uc($mt);
314       }
315        
316       sub _getCDSDescriptionString {
317   56   my ($self,$tran,$mutStart,$mutEnd,$mutStartOffset,$mutEndOffset,$wt,$mt) = @_;
318       my $desc;
319 100     if($mutStartOffset == 0){
  100      
320       $desc = 'c.'.$mutStart . $wt . '>' . $mt;
321       } elsif($mutStartOffset > 0){
322       $desc = 'c.'.$mutStart.'+'.$mutStartOffset. $wt.'>'.$mt;
323       } else {
324       $desc = 'c.'.$mutStart.$mutStartOffset.$wt.'>'.$mt;
325       }
326       return $desc;
327       }
328        
329       sub _getMutatedCdsSequence {
330   10   my ($self,$wtDna,$min,$max,$mt) = @_;
331       my $mtDna = substr($wtDna,0,$min - 1) . $mt . substr($wtDna,$max);
332       return $mtDna
333       }
334        
335       sub _getCdsMinPosForProteinCalculation {
336   10   my ($self,$cAnnot) = @_;
337       return $cAnnot->getMinPos();
338       }
339        
340       sub _getCdsMaxPosForProteinCalculation {
341   10   my ($self,$cAnnot) = @_;
342       return $cAnnot->getMaxPos();
343       }
344        
345       sub _isStartGained {
346   4   my ($self,$var,$tran,$varTranStart,$varTranEnd,$wt,$mt) = @_;
347       my $varLength = ($varTranEnd - $varTranStart) + 1;
348       my $substrStart = $varTranStart - 3;
349       my $substrLength;
350        
351 50     if($substrStart < 0){
352       $substrLength = (($varTranEnd - $varTranStart) + 5) + $substrStart;
353       $substrStart = 0;
354       } else {
355       $substrLength = ($varTranEnd - $varTranStart) + 5;
356       }
357        
358       my $wtSubstr = substr($tran->getcDNASeq,$substrStart,$substrLength);
359       my $mtSubstr = $wtSubstr;
360       $mtSubstr =~ s/\w{$varLength}(\w{2})$/$mt$1/;
361       my @wtPos;
362       my @mtPos;
363       while($wtSubstr =~ m/atg/ig){
364       push(@wtPos,pos($wtSubstr));
365       }
366       while($mtSubstr =~ m/atg/ig){
367       push(@mtPos,pos($mtSubstr));
368       }
369 100     if(scalar(@mtPos) > scalar(@wtPos)){
  50      
370       # more in mutant, must have created a start
371       return 1;
372       } elsif(scalar(@mtPos) == scalar(@wtPos) && scalar(@wtPos) > 0){
373       # both have same number, must check positions, if any change must have created a start
374       for(my $i = 0 ;$i < scalar(@wtPos) ; $i++){
375 0     if($mtPos[$i] != $wtPos[$i]){
376       return 1;
377       }
378       }
379       }
380       return 0;
381       }
382        
383       sub _getDefaultCDSAnnotationType {
384   56   return Sanger::CGP::Vagrent::Data::Annotation::getSubstitutionAnnotationType();
385       }
386        
387       sub _getDefaultProteinAnnotationType {
388   2   return Sanger::CGP::Vagrent::Data::Annotation::getSubstitutionAnnotationType();
389       }
390        
391        
392       =head1 NAME
393        
394       Sanger::CGP::Vagrent::Annotators::SimpleSubstitutionAnnotator - Annotator for simple substitution variants
395        
396       =head1 DESCRIPTION
397        
398       This annotates substitution variants, it provides L<AnnotatonGroups|Sanger::CGP::Vagrent::Data::AnnotationGroup> for each transcript returned from the L<TranscriptSource|Sanger::CGP::Vagrent::TranscriptSource::AbstractTranscriptSource>
399        
400       It will only process L<Sanger::CGP::Vagrent::Data::Substitution|Sanger::CGP::Vagrent::Data::Substitution> objects, if any other L<Variation|Sanger::CGP::Vagrent::Data::AbstractVariation> objects are sent in it will return an empty answer.
401        
402       It inherits from L<Sanger::CGP::Vagrent::Annotators::AbstractAnnotator|Sanger::CGP::Vagrent::Annotators::AbstractAnnotator>.
403        
404       =head1 METHODS
405        
406       See L<Sanger::CGP::Vagrent::Annotators::AbstractAnnotator|Sanger::CGP::Vagrent::Annotators::AbstractAnnotator>