line |
bran |
sub |
pod |
code |
1
|
|
|
|
package Sanger::CGP::Vcf::BamUtil; |
2
|
|
|
|
|
3
|
|
|
|
##########LICENCE########## |
4
|
|
|
|
# Copyright (c) 2014,2015 Genome Research Ltd. |
5
|
|
|
|
# |
6
|
|
|
|
# Author: David Jones <cgpit@sanger.ac.uk> |
7
|
|
|
|
# |
8
|
|
|
|
# This file is part of cgpVcf. |
9
|
|
|
|
# |
10
|
|
|
|
# cgpVcf is free software: you can redistribute it and/or modify it under |
11
|
|
|
|
# the terms of the GNU Affero General Public License as published by the Free |
12
|
|
|
|
# Software Foundation; either version 3 of the License, or (at your option) any |
13
|
|
|
|
# later version. |
14
|
|
|
|
# |
15
|
|
|
|
# This program is distributed in the hope that it will be useful, but WITHOUT |
16
|
|
|
|
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
17
|
|
|
|
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
18
|
|
|
|
# details. |
19
|
|
|
|
# |
20
|
|
|
|
# You should have received a copy of the GNU Affero General Public License |
21
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>. |
22
|
|
|
|
# |
23
|
|
|
|
# 1. The usage of a range of years within a copyright statement contained within |
24
|
|
|
|
# this distribution should be interpreted as being equivalent to a list of years |
25
|
|
|
|
# including the first and last year specified and all consecutive years between |
26
|
|
|
|
# them. For example, a copyright statement that reads âCopyright (c) 2005, 2007- |
27
|
|
|
|
# 2009, 2011-2012â should be interpreted as being identical to a statement that |
28
|
|
|
|
# reads âCopyright (c) 2005, 2007, 2008, 2009, 2011, 2012â and a copyright |
29
|
|
|
|
# statement that reads âCopyright (c) 2005-2012â should be interpreted as being |
30
|
|
|
|
# identical to a statement that reads âCopyright (c) 2005, 2006, 2007, 2008, |
31
|
|
|
|
# 2009, 2010, 2011, 2012â." |
32
|
|
|
|
########## LICENCE ########## |
33
|
|
|
|
|
34
|
|
1
|
|
use strict; |
35
|
|
1
|
|
use warnings FATAL => 'all'; |
36
|
|
1
|
|
use Sanger::CGP::Vcf; |
37
|
|
1
|
|
use Carp; |
38
|
|
|
|
|
39
|
|
|
|
our $VERSION = Sanger::CGP::Vcf->VERSION; |
40
|
|
|
|
|
41
|
|
|
|
sub parse_contigs{ |
42
|
|
0
|
0
|
my($self,$header_txt, $man_species, $man_assembly) = @_; |
43
|
|
|
|
|
44
|
|
|
|
my $contigs = {}; |
45
|
|
|
|
|
46
|
|
|
|
foreach my $line (split(/\n/,$header_txt)){ |
47
|
0
|
|
|
if($line =~ /^\@SQ/){ |
48
|
|
|
|
my ($name) = $line =~ /SN:([^\t]+)/; |
49
|
|
|
|
my ($length) = $line =~ /LN:([^\t]+)/; |
50
|
|
|
|
my ($assembly) = $line =~ /AS:([^\t]+)/; |
51
|
|
|
|
my ($species) = $line =~ /SP:([^\t]+)/; |
52
|
|
|
|
|
53
|
0
|
|
|
if(defined $man_species) { |
54
|
0
|
|
|
if(defined $species) { |
55
|
0
|
|
|
die "ERROR: Manually entered species ($man_species) doesn't match BAM file header ($species)\n" |
56
|
|
|
|
if($man_species ne $species); |
57
|
|
|
|
} |
58
|
|
|
|
else { $species = $man_species; } |
59
|
|
|
|
} |
60
|
0
|
|
|
die "ERROR: Species must be defined, check options/BAM header\n" unless(defined $species); |
61
|
|
|
|
|
62
|
0
|
|
|
if(defined $man_assembly) { |
63
|
0
|
|
|
if(defined $assembly) { |
64
|
0
|
|
|
die "ERROR: Manually entered assembly ($man_assembly) doesn't match BAM file header ($assembly)\n" |
65
|
|
|
|
if($man_assembly ne $assembly); |
66
|
|
|
|
} |
67
|
|
|
|
else { $assembly = $man_assembly; } |
68
|
|
|
|
} |
69
|
0
|
|
|
die "ERROR: Assembly must be defined, check options/BAM header\n" unless(defined $assembly); |
70
|
|
|
|
|
71
|
|
|
|
my $contig = new Sanger::CGP::Vcf::Contig( |
72
|
|
|
|
-name => $name, |
73
|
|
|
|
-length => $length, |
74
|
|
|
|
-assembly => $assembly, |
75
|
|
|
|
-species => $species |
76
|
|
|
|
); |
77
|
|
|
|
|
78
|
0
|
|
|
if(exists $contigs->{$name}){ |
79
|
0
|
|
|
croak "ERROR: Trying to merge contigs with conflicting data:\n".Dumper($contigs->{$name})."\n".Dumper($contig) |
80
|
|
|
|
unless $contig->compare($contigs->{$name}); |
81
|
|
|
|
} else { |
82
|
|
|
|
$contigs->{$name} = $contig; |
83
|
|
|
|
} |
84
|
|
|
|
} |
85
|
|
|
|
} |
86
|
|
|
|
return $contigs; |
87
|
|
|
|
} |
88
|
|
|
|
|
89
|
|
|
|
sub parse_samples{ |
90
|
|
0
|
0
|
my($self,$header_txt,$study,$protocol,$accession,$accession_source,$description,$platform_in) = @_; |
91
|
|
|
|
|
92
|
|
|
|
my $samples = {}; |
93
|
|
|
|
|
94
|
|
|
|
foreach my $line (split(/\n/,$header_txt)){ |
95
|
0
|
|
|
if($line =~ /^\@RG/){ |
96
|
|
|
|
my ($name) = $line =~ /SM:([^\t]+)/; |
97
|
|
|
|
my $platform; |
98
|
0
|
|
|
if($line =~ /PL:([^\t]+)/) { |
99
|
|
|
|
$platform = $1; |
100
|
0
|
|
|
if(defined $platform && $platform ne $platform_in){ |
101
|
|
|
|
$platform = $platform_in; |
102
|
|
|
|
warn "Manually entered platform ($platform_in) doesn't match BAM file header ($platform). Overriding with manual platform ($platform_in)\n" ; |
103
|
|
|
|
} |
104
|
|
|
|
} |
105
|
|
|
|
else { |
106
|
|
|
|
$platform = $platform_in; |
107
|
|
|
|
} |
108
|
|
|
|
|
109
|
0
|
|
|
$samples->{$name} = new Sanger::CGP::Vcf::Sample( |
110
|
|
|
|
-name => $name, |
111
|
|
|
|
-study => $study, |
112
|
|
|
|
-platform => $platform, |
113
|
|
|
|
-seq_protocol => $protocol, |
114
|
|
|
|
-accession => $accession, |
115
|
|
|
|
-accession_source => $accession_source, |
116
|
|
|
|
-description => $description |
117
|
|
|
|
) unless exists $samples->{$name}; |
118
|
|
|
|
} |
119
|
|
|
|
} |
120
|
|
|
|
return $samples; |
121
|
|
|
|
} |
122
|
|
|
|
|
123
|
|
|
|
return 1; |