#Script by KM Dlugosch, April 2020
#Associated with paper: "TagSeq for gene expression in non-model plants: a pilot study at the Santa Rita Experimental Range NEON core site" by Hannah E. Marx; Stephen Scheidt; Michael S. Barker; Katrina M. Dlugosch. 2020 bioRXiv.

#This script takes a reference transcriptome fasta file (sequence can be multiple lines) and creates a GTF file of the form:
#tab delim file with no header
#col1: name of the sequence, i.e. the scaffold or contig number that is first in header (sep by space from other info)
#col2: some source name, e.g. the software used (doesn't matter here)
#col3: a label for the feature. seems like htseq will want 'exon'
#col4: start position - will always be 1 for us
#col5: end position - will be the sequence length
#score: no info, so use '.'
#strand: no info, so use '.'
#frame: no info, so use '.' (Would be 0 if in frame)
#attributes: these are semicolon delim and two are required: 'gene_id "giveName"; transcript_id "";' where the giveName is the name of the sequence


# run: perl create_GTF.pl <inputFasta>

###################################
use strict;
use warnings;

my $FAFile = "$ARGV[0]";

#read in file and put loci into GTF format at the same time
open OUT, ">$FAFile.gtf";
open IN1, "<$FAFile";
my ($header,$seq) = ();
foreach (<IN1>) {
	chomp $_;
	if ($_) { #if the line is not blank
		if ($_ =~ m/^>/) { #the line is a header
			if ($header) { #if there is already a previous header recorded, then we have passed the end of the sequence and are hitting the next one (since this is a header line again)

				#measure sequence and print to output
				my $length = length $seq;
				print OUT "$header\tsoaptrans\texon\t1\t$length\t.\t.\t.\tgene_id \"$header\"; transcript_id \"\";\n";

				#delete the stored sequence
				$seq = ();

				#record new header
				my @parts = split / /, $_; #split header at space
				$header = "$parts[0]"; 
				$header = substr($header, 1); #get rid of >
			}
			else { #record the first header
				my @parts = split / /, $_; #split header at space
				$header = "$parts[0]"; 
				$header = substr($header, 1); #get rid of >
			}
		} 
		else { #the line is not a header and not blank
			$seq = $seq.$_; #collect the sequence
		}
	}
}
close IN1;
close OUT;