#Script by KM Dlugosch, April 2020 #Associated with paper: "TagSeq for gene expression in non-model plants: a pilot study at the Santa Rita Experimental Range NEON core site" by Hannah E. Marx; Stephen Scheidt; Michael S. Barker; Katrina M. Dlugosch. 2020 bioRXiv. #This script takes a reference transcriptome fasta file (sequence can be multiple lines) and creates a GTF file of the form: #tab delim file with no header #col1: name of the sequence, i.e. the scaffold or contig number that is first in header (sep by space from other info) #col2: some source name, e.g. the software used (doesn't matter here) #col3: a label for the feature. seems like htseq will want 'exon' #col4: start position - will always be 1 for us #col5: end position - will be the sequence length #score: no info, so use '.' #strand: no info, so use '.' #frame: no info, so use '.' (Would be 0 if in frame) #attributes: these are semicolon delim and two are required: 'gene_id "giveName"; transcript_id "";' where the giveName is the name of the sequence # run: perl create_GTF.pl <inputFasta> ################################### use strict; use warnings; my $FAFile = "$ARGV[0]"; #read in file and put loci into GTF format at the same time open OUT, ">$FAFile.gtf"; open IN1, "<$FAFile"; my ($header,$seq) = (); foreach (<IN1>) { chomp $_; if ($_) { #if the line is not blank if ($_ =~ m/^>/) { #the line is a header if ($header) { #if there is already a previous header recorded, then we have passed the end of the sequence and are hitting the next one (since this is a header line again) #measure sequence and print to output my $length = length $seq; print OUT "$header\tsoaptrans\texon\t1\t$length\t.\t.\t.\tgene_id \"$header\"; transcript_id \"\";\n"; #delete the stored sequence $seq = (); #record new header my @parts = split / /, $_; #split header at space $header = "$parts[0]"; $header = substr($header, 1); #get rid of > } else { #record the first header my @parts = split / /, $_; #split header at space $header = "$parts[0]"; $header = substr($header, 1); #get rid of > } } else { #the line is not a header and not blank $seq = $seq.$_; #collect the sequence } } } close IN1; close OUT;