#!/usr/bin/perl

=head1 Name

repeat_to_gff.pl  --  convert repeat raw formats to gff format

=head1 Description

This program can read a repeatmasker .out file, a repeatproteinmask .annot
file, a trf .dat file, or a dustmasker .dust file, and convert it to gff
format. The input format is chosen from the file name suffix and the result
is written next to the input as "<input>.gff".

=head1 Version

  Author: Fan Wei, fanw@genomics.org.cn
  Version: 1.0,  Date: 2006-12-6
  Note:

=head1 Usage

  --prefix <str>   set a prefix before repeat element ID
  --verbose        output running progress information to screen
  --help           output help information to screen

=head1 Example

  perl repeat_to_gff.pl ./rice.frag1M.fa.trf.dat
  perl repeat_to_gff.pl ./rice.frag1M.fa.RepeatMasker.out
  perl repeat_to_gff.pl ./rice.frag1M.fa.Proteinmask.annot
  perl repeat_to_gff.pl ./rice.frag1M.fa.Proteinmask.dust

=cut

use strict;
use warnings;
use Getopt::Long;
use FindBin qw($Bin $Script);
use File::Basename qw(basename dirname);
use Data::Dumper;
use File::Path;  ## function " mkpath" and "rmtree" deal with directory

##get options from command line into variables and set default values
my ($Verbose, $Help, $Prefix);
GetOptions(
    "prefix:s" => \$Prefix,
    "verbose"  => \$Verbose,
    "help"     => \$Help,
) or die `pod2text $0`;    ## unknown/invalid option: show the usage text
die `pod2text $0` if (@ARGV == 0 || $Help);

my $repeat_file = shift;

## Pick the converter from the file name suffix. An unknown suffix is
## reported instead of silently producing no output.
if ($repeat_file =~ /\.dat$/) {
    dat_to_gff3($repeat_file, $Prefix);
}
elsif ($repeat_file =~ /\.dust$/) {
    dust_to_gff3($repeat_file, $Prefix);
}
elsif ($repeat_file =~ /\.out$/) {
    out_to_gff3($repeat_file, $Prefix);
}
elsif ($repeat_file =~ /\.annot$/) {
    annot_to_gff3($repeat_file, $Prefix);
}
else {
    die "unrecognized input file type (expected .dat, .dust, .out or .annot): $repeat_file\n";
}

####################################################
################### Sub Routines ###################
####################################################

##facilitate to creat marks
## Build the first value of a zero-padded ID counter.
##   arg    : a count (e.g. number of lines); a false value falls back to 100000
##   return : the count with every digit replaced by 0, then incremented,
##            e.g. 1234 -> "0001", default -> "000001"
## The substitution leaves a pure digit string, so ++ uses Perl's string
## increment and keeps the leading zeros; callers ++ the result again for
## each following ID.
####################################################
sub create_marker {
    my $number = shift || 100000;
    $number =~ s/\d/0/g;
    $number++;
    return $number;
}

##Start End PeriodSize CopyNumber ConsensusSize PercentMatches PercentIndels Score A C G T Entropy(0-2) consensus repeatSequences
##19670039 19670073 4 8.8 4 93 0 61 22 0 28 48 1.51 TGTA TGTATGTATGTATGTATGTATGTAGGTATGTATGT
####################################################
## Convert a TRF .dat file to GFF3, written to "<file>.gff".
##   $file    : path of the .dat file to read
##   $pre_tag : optional ID prefix; when non-empty, IDs are "<prefix>_TR<n>"
## Only lines with exactly 15 whitespace-separated fields are treated as
## repeat records; the sequence name comes from the latest "Sequence:" line.
## Dies if the input cannot be read or the output cannot be written.
sub dat_to_gff3 {
    my $file    = shift;
    my $pre_tag = shift;
    $pre_tag = (defined $pre_tag && length $pre_tag) ? $pre_tag . "_" : "";

    open my $in, '<', $file or die "fail open $file: $!";

    ## First pass: count newline-terminated lines (what `wc -l` reports) to
    ## size the zero-padded ID counter, without passing the name to a shell.
    my $line_num = 0;
    while (my $line = <$in>) {
        $line_num++ if $line =~ /\n\z/;
    }
    seek($in, 0, 0) or die "fail rewind $file: $!";
    my $mark = create_marker($line_num);

    my $chr    = '';
    my $output = '';
    while (my $line = <$in>) {
        $chr = $1 if $line =~ /^Sequence:\s+(\S+)/;
        my @t = split /\s+/, $line;
        next if @t != 15;
        my $start  = $t[0];
        my $end    = $t[1];
        my $id     = $pre_tag . "TR" . $mark;
        my $score  = $t[7];
        my $strand = "+";
        $output .= "$chr\tTRF\ttandemRepeat\t$start\t$end\t$score\t$strand\t.\tID=$id;PeriodSize=$t[2];CopyNumber=$t[3];PercentMatches=$t[5];PercentIndels=$t[6];Consensus=$t[13];\n";
        $mark++;
    }
    close $in;

    open my $out, '>', "$file.gff" or die "fail create $file.gff: $!";
    print {$out} "##gff-version 3\n$output";
    close $out or die "fail close $file.gff: $!";
}

##>chrLG1
##386 - 501
####################################################
## Convert a dustmasker interval file (.dust) to GFF3, written to "<file>.gff".
##   $file    : path of the .dust file to read
##   $pre_tag : optional ID prefix; when non-empty, IDs are "<prefix>_DUST<n>"
## ">name" lines set the current sequence; "start - end" lines become records.
## Lines of any other shape (e.g. blank lines) are skipped.
## Dies if the input cannot be read or the output cannot be written.
sub dust_to_gff3 {
    my $file    = shift;
    my $pre_tag = shift;
    $pre_tag = (defined $pre_tag && length $pre_tag) ? $pre_tag . "_" : "";

    open my $in, '<', $file or die "fail open $file: $!";

    ## First pass: count the lines that contain no '>' (the former
    ## `grep -v '>' | wc -l`) to size the zero-padded ID counter.
    my $line_num = 0;
    while (my $line = <$in>) {
        $line_num++ if index($line, '>') < 0;
    }
    seek($in, 0, 0) or die "fail rewind $file: $!";
    my $mark = create_marker($line_num);

    my $chr    = '';
    my $output = '';
    while (my $line = <$in>) {
        if ($line =~ /^>(\S+)/) {
            $chr = $1;
            next;
        }
        next unless $line =~ /^\s*(\d+)\s+-\s+(\d+)/;
        my $start = $1 + 1;    # intervals are 0-based coordinates
        ## NOTE(review): only the start is shifted; confirm whether the
        ## interval end is exclusive or also needs +1.
        my $end    = $2;
        my $id     = $pre_tag . "DUST" . $mark;
        my $score  = '.';
        my $strand = "+";
        $output .= "$chr\tdustmasker\tdustRepeat\t$start\t$end\t$score\t$strand\t.\tID=$id;\n";
        $mark++;
    }
    close $in;

    open my $out, '>', "$file.gff" or die "fail create $file.gff: $!";
    print {$out} "##gff-version 3\n$output";
    close $out or die "fail close $file.gff: $!";
}

## SW perc perc perc query position in query matching repeat position in repeat
##score div. del. ins.
## sequence begin end (left) repeat class/family begin end (left) ID
#245 35.2 2.5 0.6 chr1 1001400 1001556 (18753082) + TS2 SINE 80 239 (416) 12
####################################################
## Convert a RepeatMasker .out file to GFF3, written to "<file>.gff".
##   $file    : path of the .out file to read
##   $pre_tag : optional ID prefix; when non-empty, IDs are "<prefix>_TE<n>"
## A line is a record when its first field is a non-zero integer (the SW
## score); header, blank and truncated lines are skipped.
## Dies if the input cannot be read or the output cannot be written.
sub out_to_gff3 {
    my $file    = shift;
    my $pre_tag = shift;
    $pre_tag = (defined $pre_tag && length $pre_tag) ? $pre_tag . "_" : "";

    open my $in, '<', $file or die "fail open $file: $!";

    ## First pass: count newline-terminated lines (what `wc -l` reports) to
    ## size the zero-padded ID counter, without passing the name to a shell.
    my $line_num = 0;
    while (my $line = <$in>) {
        $line_num++ if $line =~ /\n\z/;
    }
    seek($in, 0, 0) or die "fail rewind $file: $!";
    my $mark = create_marker($line_num);

    my $output = '';
    while (my $line = <$in>) {
        $line =~ s/^\s+//;
        my @t = split /\s+/, $line;
        next if !defined $t[0] || $t[0] =~ /\D/ || !$t[0];
        next if @t < 14;    # fields 11..13 are needed for the target span
        my $start  = $t[5];
        my $end    = $t[6];
        my $id     = $pre_tag . "TE" . $mark;
        my $chr    = $t[4];
        my $score  = $t[0];
        my $strand = ($t[8] eq '+') ? "+" : "-";
        my $target = $t[9];
        my $class  = $t[10];
        ## Of fields 11..13 one is the parenthesised "(left)" value; the
        ## other two are the target coordinates, emitted in ascending order.
        my @ary = sort { $a <=> $b } grep { !/[\(\)]/ } @t[11 .. 13];
        my ($target_start, $target_end) = ($ary[0], $ary[1]);
        $output .= "$chr\tRepeatMasker\tTransposon\t$start\t$end\t$score\t$strand\t.\tID=$id;Target=$target $target_start $target_end;Class=$class;PercDiv=$t[1];PercDel=$t[2];PercIns=$t[3];\n";
        $mark++;
    }
    close $in;

    open my $out, '>', "$file.gff" or die "fail create $file.gff: $!";
    print {$out} "##gff-version 3\n$output";
    close $out or die "fail close $file.gff: $!";
}

##6.00e-22 117 WUBlastX Chr07frag1M 26352 26816 - RETRO1_pol LTR/Gypsy 360 533
####################################################
## Convert a RepeatProteinMask .annot file to GFF3, written to "<file>.gff".
##   $file    : path of the .annot file to read
##   $pre_tag : optional ID prefix; when non-empty, IDs are "<prefix>_TP<n>"
## The header line (starting with "pValue") and lines with fewer than 11
## fields (e.g. blank lines) are skipped.
## Dies if the input cannot be read or the output cannot be written.
sub annot_to_gff3 {
    my $file    = shift;
    my $pre_tag = shift;
    $pre_tag = (defined $pre_tag && length $pre_tag) ? $pre_tag . "_" : "";

    open my $in, '<', $file or die "fail open $file: $!";

    ## First pass: count newline-terminated lines (what `wc -l` reports) to
    ## size the zero-padded ID counter, without passing the name to a shell.
    my $line_num = 0;
    while (my $line = <$in>) {
        $line_num++ if $line =~ /\n\z/;
    }
    seek($in, 0, 0) or die "fail rewind $file: $!";
    my $mark = create_marker($line_num);

    my $output = '';
    while (my $line = <$in>) {
        $line =~ s/^\s+//;
        my @t = split /\s+/, $line;
        next if @t < 11;
        next if $t[0] =~ /^pValue/;
        my $start  = $t[4];
        my $end    = $t[5];
        my $id     = $pre_tag . "TP" . $mark;
        my $chr    = $t[3];
        my $score  = $t[1];    ## field 1 goes in the score column; field 0 is written as the pValue attribute
        my $strand = $t[6];
        my $target = $t[7];
        my $class  = $t[8];
        my ($target_start, $target_end) = sort { $a <=> $b } ($t[9], $t[10]);
        $output .= "$chr\tRepeatProteinMask\tTEprotein\t$start\t$end\t$score\t$strand\t.\tID=$id;Target=$target $target_start $target_end;Class=$class;pValue=$t[0];\n";
        $mark++;
    }
    close $in;

    open my $out, '>', "$file.gff" or die "fail create $file.gff: $!";
    print {$out} "##gff-version 3\n$output";
    close $out or die "fail close $file.gff: $!";
}