line |
bran |
sub |
pod |
code |
1
|
|
|
|
=head1 NAME |
2
|
|
|
|
|
3
|
|
|
|
Sanger::CGP::Vagrent::Ontology::SequenceOntologyClassifier - Contains a set of rules to classify |
4
|
|
|
|
Annotations using Sequence Ontology terms |
5
|
|
|
|
|
6
|
|
|
|
=head1 DESCRIPTION |
7
|
|
|
|
|
8
|
|
|
|
This class is designed to be extended, it contains a collection of test functions that classify |
9
|
|
|
|
Sanger::CGP::Vagrent::Data::Annotation objects and assign the appropriate Sequence Ontology terms. |
10
|
|
|
|
|
11
|
|
|
|
=cut |
12
|
|
|
|
|
13
|
|
|
|
package Sanger::CGP::Vagrent::Ontology::SequenceOntologyClassifier; |
14
|
|
|
|
|
15
|
|
|
|
##########LICENCE########## |
16
|
|
|
|
# Copyright (c) 2014 Genome Research Ltd. |
17
|
|
|
|
# |
18
|
|
|
|
# Author: Cancer Genome Project cgpit@sanger.ac.uk |
19
|
|
|
|
# |
20
|
|
|
|
# This file is part of VAGrENT. |
21
|
|
|
|
# |
22
|
|
|
|
# VAGrENT is free software: you can redistribute it and/or modify it under |
23
|
|
|
|
# the terms of the GNU Affero General Public License as published by the Free |
24
|
|
|
|
# Software Foundation; either version 3 of the License, or (at your option) any |
25
|
|
|
|
# later version. |
26
|
|
|
|
# |
27
|
|
|
|
# This program is distributed in the hope that it will be useful, but WITHOUT |
28
|
|
|
|
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
29
|
|
|
|
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
30
|
|
|
|
# details. |
31
|
|
|
|
# |
32
|
|
|
|
# You should have received a copy of the GNU Affero General Public License |
33
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>. |
34
|
|
|
|
##########LICENCE########## |
35
|
|
|
|
|
36
|
|
|
|
|
37
|
|
4
|
|
use strict; |
38
|
|
|
|
|
39
|
|
4
|
|
use Const::Fast qw(const); |
40
|
|
4
|
|
use Cwd qw(abs_path); |
41
|
|
4
|
|
use File::Basename; |
42
|
|
4
|
|
use File::Path; |
43
|
|
4
|
|
use List::Util qw(first); |
44
|
|
|
|
|
45
|
|
4
|
|
use Log::Log4perl qw(:easy); |
46
|
|
|
|
|
47
|
|
4
|
|
use File::ShareDir qw(module_dir); |
48
|
|
4
|
|
use Config::IniFiles; |
49
|
|
|
|
|
50
|
|
|
|
my $log = Log::Log4perl->get_logger(__PACKAGE__); |
51
|
|
4
|
|
use Sanger::CGP::Vagrent qw($VERSION); |
52
|
|
4
|
|
use base qw(Sanger::CGP::Vagrent); |
53
|
|
|
|
|
54
|
|
|
|
# constant values holding SO class identifiers |
55
|
|
|
|
const my $SO_SUBSTITUTION_CLASS => 'SO:1000002:substitution'; |
56
|
|
|
|
const my $SO_INSERTION_CLASS => 'SO:0000667:insertion'; |
57
|
|
|
|
const my $SO_DELETION_CLASS => 'SO:0000159:deletion'; |
58
|
|
|
|
const my $SO_COMPLEXINDEL_CLASS => 'SO:1000032:indel'; |
59
|
|
|
|
|
60
|
|
|
|
# sequence variant classes |
61
|
|
|
|
const my $SO_UNKNOWN_VARIANT_CLASS => 'SO:0001576:transcript_variant'; |
62
|
|
|
|
const my $SO_SYN_VARIANT_CLASS => 'SO:0001588:synonymous_codon'; |
63
|
|
|
|
const my $SO_NON_SYN_VARIANT_CLASS => 'SO:0001583:non_synonymous_codon'; |
64
|
|
|
|
const my $SO_STOP_GAINED_VARIANT_CLASS => 'SO:0001587:stop_gained'; |
65
|
|
|
|
const my $SO_STOP_LOST_VARIANT_CLASS => 'SO:0001578:stop_lost'; |
66
|
|
|
|
const my $SO_STOP_RETAINED_VARIANT_CLASS => 'SO:0001567:stop_retained_variant'; |
67
|
|
|
|
const my $SO_START_LOST_VARIANT_CLASS => 'SO:0001582:initiator_codon_change'; |
68
|
|
|
|
const my $SO_PREMATURE_START_GAINED_VARIANT_CLASS => 'SO:0001988:5_prime_UTR_premature_start_codon_gain_variant'; |
69
|
|
|
|
const my $SO_INTRON_VARIANT_CLASS => 'SO:0001627:intron_variant'; |
70
|
|
|
|
const my $SO_CODON_VARIANT_CLASS => 'SO:0001581:codon_variant'; |
71
|
|
|
|
const my $SO_5PRIME_UTR_VARIANT_CLASS => 'SO:0001623:5_prime_UTR_variant'; |
72
|
|
|
|
const my $SO_3PRIME_UTR_VARIANT_CLASS => 'SO:0001624:3_prime_UTR_variant'; |
73
|
|
|
|
const my $SO_NC_TRANS_VARIANT_CLASS => 'SO:0001619:nc_transcript_variant'; |
74
|
|
|
|
const my $SO_ESS_SPLICE_VARIANT_CLASS => 'SO:0001629:splice_site_variant'; |
75
|
|
|
|
const my $SO_SPLICE_REGION_VARIANT_CLASS => 'SO:0001995:extended_intronic_splice_region_variant'; |
76
|
|
|
|
const my $SO_FRAMESHIFT_VARIANT_CLASS => 'SO:0001589:frameshift_variant'; |
77
|
|
|
|
const my $SO_INFRAME_VARIANT_CLASS => 'SO:0001650:inframe_variant'; |
78
|
|
|
|
const my $SO_INFRAME_CODON_GAIN_VARIANT_CLASS => 'SO:0001651:inframe_codon_gain'; |
79
|
|
|
|
const my $SO_INFRAME_CODON_LOSS_VARIANT_CLASS => 'SO:0001652:inframe_codon_loss'; |
80
|
|
|
|
const my $SO_COMPLEX_CHANGE_VARIANT_CLASS => 'SO:0001577:complex_change_in_transcript'; |
81
|
|
|
|
|
82
|
|
|
|
const my $SO_5KB_UPSTREAM_VARIANT_CLASS => 'SO:0001635:5KB_upstream_variant'; |
83
|
|
|
|
const my $SO_2KB_UPSTREAM_VARIANT_CLASS => 'SO:0001636:2KB_upstream_variant'; |
84
|
|
|
|
const my $SO_5KB_DOWNSTREAM_VARIANT_CLASS => 'SO:0001633:5KB_downstream_variant'; |
85
|
|
|
|
const my $SO_500BP_DOWNSTREAM_VARIANT_CLASS => 'SO:0001634:500B_downstream_variant'; |
86
|
|
|
|
|
87
|
|
|
|
const my $SO_EXON_CLASS => 'SO:0000147:exon'; |
88
|
|
|
|
const my $SO_INTRON_CLASS => 'SO:0000188:intron'; |
89
|
|
|
|
const my $SO_SPLICE_REGION_CLASS => 'SO:0001996:extended_intronic_splice_region'; |
90
|
|
|
|
const my $SO_ESS_SPLICE_SITE_CLASS => 'SO:0001993:extended_cis_splice_site'; |
91
|
|
|
|
|
92
|
|
|
|
# mature transcript mRNA region classes |
93
|
|
|
|
const my $SO_CDS_CLASS => 'SO:0000316:CDS'; |
94
|
|
|
|
const my $SO_5PRIME_UTR_CLASS => 'SO:0000204:five_prime_UTR'; |
95
|
|
|
|
const my $SO_3PRIME_UTR_CLASS => 'SO:0000205:three_prime_UTR'; |
96
|
|
|
|
|
97
|
|
|
|
# gene attribute classes |
98
|
|
|
|
const my $SO_PROTEIN_CODING_CLASS => 'SO:0000010:protein_coding'; |
99
|
|
|
|
const my $SO_NON_PROTEIN_CODING_CLASS => 'SO:0000011:non_protein_coding'; |
100
|
|
|
|
|
101
|
|
|
|
const my $TERM_SUMMARY_INI => 'SequenceOntologySummary.ini'; |
102
|
|
|
|
|
103
|
|
|
|
# sub DESTROY { |
104
|
|
|
|
# ##### Handy DESTROY function that will print ontology combinations that don't exist in the summary lookup at program termination. |
105
|
|
|
|
# my $self = shift; |
106
|
|
|
|
# if(defined $self->{'_SOsum'}){ |
107
|
|
|
|
# foreach my $k( sort {$self->{'_notSummary'}->{$b} <=> $self->{'_notSummary'}->{$a}} keys %{$self->{'_notSummary'}}){ |
108
|
|
|
|
# print $self->{'_notSummary'}->{$k},' - ',$k,"\n" unless $self->{'_notSummary'}->{$k} == 0; |
109
|
|
|
|
# } |
110
|
|
|
|
# } |
111
|
|
|
|
# } |
112
|
|
|
|
|
113
|
|
|
|
sub _ontologyInit { |
114
|
|
862
|
|
my $self = shift; |
115
|
|
|
|
my %vars = @_; |
116
|
50
|
|
|
if(exists $vars{'ontologySymmary'} && defined $vars{'ontologySymmary'}){ |
117
|
|
|
|
$self->{'_SOsum'} = $vars{'ontologySymmary'}; |
118
|
|
|
|
} else { |
119
|
|
|
|
$self->_loadOntologySummaryIni(); |
120
|
|
|
|
} |
121
|
|
|
|
} |
122
|
|
|
|
|
123
|
|
|
|
sub _loadOntologySummaryIni { |
124
|
|
862
|
|
my $self = shift; |
125
|
|
|
|
my $share_path = dirname(abs_path($0)).'/../share'; |
126
|
50
|
|
|
$share_path = module_dir('Sanger::CGP::Vagrent') unless(-e File::Spec->catfile($share_path,$TERM_SUMMARY_INI)); |
127
|
|
|
|
$self->{'_SOsum'} = new Config::IniFiles( -file => File::Spec->catfile($share_path,$TERM_SUMMARY_INI)); |
128
|
|
|
|
} |
129
|
|
|
|
|
130
|
|
|
|
sub getOntologySummary { |
131
|
|
0
|
0
|
my ($self,$anno) = @_; |
132
|
|
|
|
my $mrna = $anno->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()); |
133
|
|
|
|
my $cds = $anno->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()); |
134
|
|
|
|
my $prot = $anno->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext()); |
135
|
|
|
|
my @class = $anno->getClassifications; |
136
|
|
|
|
my @terms; |
137
|
|
|
|
foreach my $a($anno,$mrna,$cds,$prot){ |
138
|
0
|
|
|
next unless defined $a; |
139
|
|
|
|
foreach my $c($a->getClassifications){ |
140
|
0
|
0
|
|
next if first {$c eq $_} @terms; |
141
|
|
|
|
push @terms, $c; |
142
|
|
|
|
} |
143
|
|
|
|
} |
144
|
|
|
|
my $thing = join ',', @terms; |
145
|
|
|
|
|
146
|
|
|
|
my $sum = $self->{'_SOsum'}->val('SUMMARY',$thing,$thing); |
147
|
0
|
|
|
if($sum eq $thing){ |
148
|
|
|
|
$self->{'_notSummary'}->{$thing}++; |
149
|
|
|
|
} |
150
|
|
|
|
|
151
|
|
|
|
return $sum; |
152
|
|
|
|
} |
153
|
|
|
|
|
154
|
|
|
|
sub getSummaryCache { |
155
|
|
0
|
0
|
return shift->{'_SOsum'}; |
156
|
|
|
|
} |
157
|
|
|
|
|
158
|
|
|
|
sub classifyTranscript { |
159
|
|
852
|
0
|
my ($self,$tran) = @_; |
160
|
100
|
|
|
if($tran->isProteinCoding){ |
161
|
|
|
|
return $self->getProteinCodingClass; |
162
|
|
|
|
} else { |
163
|
|
|
|
return $self->getNonProteinCodingClass; |
164
|
|
|
|
} |
165
|
|
|
|
} |
166
|
|
|
|
|
167
|
|
|
|
# variant type classes |
168
|
|
|
|
|
169
|
|
|
|
sub getSubstitutionClass { |
170
|
|
446
|
0
|
my $self = shift; |
171
|
|
|
|
return $SO_SUBSTITUTION_CLASS; |
172
|
|
|
|
} |
173
|
|
|
|
sub getInsertionClass { |
174
|
|
1663
|
0
|
my $self = shift; |
175
|
|
|
|
return $SO_INSERTION_CLASS; |
176
|
|
|
|
} |
177
|
|
|
|
sub getDeletionClass { |
178
|
|
692
|
0
|
my $self = shift; |
179
|
|
|
|
return $SO_DELETION_CLASS; |
180
|
|
|
|
} |
181
|
|
|
|
sub getComplexIndelClass { |
182
|
|
598
|
0
|
my $self = shift; |
183
|
|
|
|
return $SO_COMPLEXINDEL_CLASS; |
184
|
|
|
|
} |
185
|
|
|
|
# sequence variant classes |
186
|
|
|
|
|
187
|
|
|
|
sub getUnknownVariantClass { |
188
|
|
1503
|
0
|
my $self = shift; |
189
|
|
|
|
return $SO_UNKNOWN_VARIANT_CLASS; |
190
|
|
|
|
} |
191
|
|
|
|
sub getIntronVariantClass { |
192
|
|
1103
|
0
|
my $self = shift; |
193
|
|
|
|
return $SO_INTRON_VARIANT_CLASS; |
194
|
|
|
|
} |
195
|
|
|
|
sub getSynonymousVariantClass { |
196
|
|
4
|
0
|
my $self = shift; |
197
|
|
|
|
return $SO_SYN_VARIANT_CLASS; |
198
|
|
|
|
} |
199
|
|
|
|
sub getNonSynonymousVariantClass { |
200
|
|
12
|
0
|
my $self = shift; |
201
|
|
|
|
return $SO_NON_SYN_VARIANT_CLASS; |
202
|
|
|
|
} |
203
|
|
|
|
sub getStopGainedVariantClass { |
204
|
|
14
|
0
|
my $self = shift; |
205
|
|
|
|
return $SO_STOP_GAINED_VARIANT_CLASS; |
206
|
|
|
|
} |
207
|
|
|
|
sub getStopLostVariantClass { |
208
|
|
12
|
0
|
my $self = shift; |
209
|
|
|
|
return $SO_STOP_LOST_VARIANT_CLASS; |
210
|
|
|
|
} |
211
|
|
|
|
sub getStopRetainedVariantClass { |
212
|
|
0
|
0
|
my $self = shift; |
213
|
|
|
|
return $SO_STOP_RETAINED_VARIANT_CLASS; |
214
|
|
|
|
} |
215
|
|
|
|
sub getStartLostVariantClass { |
216
|
|
30
|
0
|
my $self = shift; |
217
|
|
|
|
return $SO_START_LOST_VARIANT_CLASS; |
218
|
|
|
|
} |
219
|
|
|
|
sub getPrematureStartGainedVariantClass{ |
220
|
|
4
|
0
|
my $self = shift; |
221
|
|
|
|
return $SO_PREMATURE_START_GAINED_VARIANT_CLASS; |
222
|
|
|
|
} |
223
|
|
|
|
sub getCodonVariantClass{ |
224
|
|
55
|
0
|
my $self = shift; |
225
|
|
|
|
return $SO_CODON_VARIANT_CLASS; |
226
|
|
|
|
} |
227
|
|
|
|
sub get5PrimeUtrVariantClass{ |
228
|
|
421
|
0
|
my $self = shift; |
229
|
|
|
|
return $SO_5PRIME_UTR_VARIANT_CLASS; |
230
|
|
|
|
} |
231
|
|
|
|
sub get3PrimeUtrVariantClass{ |
232
|
|
292
|
0
|
my $self = shift; |
233
|
|
|
|
return $SO_3PRIME_UTR_VARIANT_CLASS; |
234
|
|
|
|
} |
235
|
|
|
|
sub getNonCodingTranscriptVariantClass{ |
236
|
|
28
|
0
|
my $self = shift; |
237
|
|
|
|
return $SO_NC_TRANS_VARIANT_CLASS; |
238
|
|
|
|
} |
239
|
|
|
|
sub getEssentialSpliceSiteVariantClass{ |
240
|
|
944
|
0
|
my $self = shift; |
241
|
|
|
|
return $SO_ESS_SPLICE_VARIANT_CLASS; |
242
|
|
|
|
} |
243
|
|
|
|
sub getSpliceRegionVariantClass{ |
244
|
|
1232
|
0
|
my $self = shift; |
245
|
|
|
|
return $SO_SPLICE_REGION_VARIANT_CLASS; |
246
|
|
|
|
} |
247
|
|
|
|
sub getFrameShiftVariantClass{ |
248
|
|
569
|
0
|
my $self = shift; |
249
|
|
|
|
return $SO_FRAMESHIFT_VARIANT_CLASS; |
250
|
|
|
|
} |
251
|
|
|
|
sub getInFrameVariantClass{ |
252
|
|
562
|
0
|
my $self = shift; |
253
|
|
|
|
return $SO_INFRAME_VARIANT_CLASS; |
254
|
|
|
|
} |
255
|
|
|
|
sub getInFrameCodonGainVariantClass { |
256
|
|
16
|
0
|
my $self = shift; |
257
|
|
|
|
return $SO_INFRAME_CODON_GAIN_VARIANT_CLASS; |
258
|
|
|
|
} |
259
|
|
|
|
sub getInFrameCodonLossVariantClass { |
260
|
|
10
|
0
|
my $self = shift; |
261
|
|
|
|
return $SO_INFRAME_CODON_LOSS_VARIANT_CLASS; |
262
|
|
|
|
} |
263
|
|
|
|
sub getComplexChangeVariantClass { |
264
|
|
1024
|
0
|
my $self = shift; |
265
|
|
|
|
return $SO_COMPLEX_CHANGE_VARIANT_CLASS; |
266
|
|
|
|
} |
267
|
|
|
|
sub get5KBUpStreamVariantClass { |
268
|
|
1233
|
0
|
my $self = shift; |
269
|
|
|
|
return $SO_5KB_UPSTREAM_VARIANT_CLASS; |
270
|
|
|
|
} |
271
|
|
|
|
sub get2KBUpStreamVariantClass { |
272
|
|
1276
|
0
|
my $self = shift; |
273
|
|
|
|
return $SO_2KB_UPSTREAM_VARIANT_CLASS; |
274
|
|
|
|
} |
275
|
|
|
|
sub get5KBDownStreamVariantClass { |
276
|
|
1166
|
0
|
my $self = shift; |
277
|
|
|
|
return $SO_5KB_DOWNSTREAM_VARIANT_CLASS; |
278
|
|
|
|
} |
279
|
|
|
|
sub get500BPDownStreamVariantClass { |
280
|
|
1178
|
0
|
my $self = shift; |
281
|
|
|
|
return $SO_500BP_DOWNSTREAM_VARIANT_CLASS; |
282
|
|
|
|
} |
283
|
|
|
|
# variant transcript region classes |
284
|
|
|
|
|
285
|
|
|
|
sub getExonClass { |
286
|
|
1078
|
0
|
my $self = shift; |
287
|
|
|
|
return $SO_EXON_CLASS; |
288
|
|
|
|
} |
289
|
|
|
|
|
290
|
|
|
|
sub getIntronClass { |
291
|
|
1285
|
0
|
my $self = shift; |
292
|
|
|
|
return $SO_INTRON_CLASS; |
293
|
|
|
|
} |
294
|
|
|
|
|
295
|
|
|
|
sub getSpliceRegionClass { |
296
|
|
982
|
0
|
my $self = shift; |
297
|
|
|
|
return $SO_SPLICE_REGION_CLASS; |
298
|
|
|
|
} |
299
|
|
|
|
|
300
|
|
|
|
sub getEssentialSpliceSiteClass { |
301
|
|
966
|
0
|
my $self = shift; |
302
|
|
|
|
return $SO_ESS_SPLICE_SITE_CLASS; |
303
|
|
|
|
} |
304
|
|
|
|
|
305
|
|
|
|
# mature transcript mRNA region classes |
306
|
|
|
|
|
307
|
|
|
|
sub getCDSClass { |
308
|
|
976
|
0
|
my $self = shift; |
309
|
|
|
|
return $SO_CDS_CLASS; |
310
|
|
|
|
} |
311
|
|
|
|
|
312
|
|
|
|
sub get5PrimeUtrClass { |
313
|
|
453
|
0
|
my $self = shift; |
314
|
|
|
|
return $SO_5PRIME_UTR_CLASS; |
315
|
|
|
|
} |
316
|
|
|
|
|
317
|
|
|
|
sub get3PrimeUtrClass { |
318
|
|
293
|
0
|
my $self = shift; |
319
|
|
|
|
return $SO_3PRIME_UTR_CLASS; |
320
|
|
|
|
} |
321
|
|
|
|
|
322
|
|
|
|
# gene/transcript type classes |
323
|
|
|
|
|
324
|
|
|
|
sub getProteinCodingClass { |
325
|
|
1448
|
0
|
my $self = shift; |
326
|
|
|
|
return $SO_PROTEIN_CODING_CLASS; |
327
|
|
|
|
} |
328
|
|
|
|
|
329
|
|
|
|
sub getNonProteinCodingClass { |
330
|
|
222
|
0
|
my $self = shift; |
331
|
|
|
|
return $SO_NON_PROTEIN_CODING_CLASS; |
332
|
|
|
|
} |
333
|
|
|
|
|
334
|
|
|
|
1; |