#!/usr/bin/perl -w

## Written by Adam C. Payton
## edited for use in pulling U and V-linked genes by Sarah B. Carey

## current version doesn't work in a perl loop, but does work in a bash loop
## to run on all alns can use:
## for f in *.fa; do perl array_hash_extractor_fasta_unlock_ks.pl $f orthologs.txt; done
## (argument order matters: the fasta file name comes first, the list of names second)
## this assumes the alignments end in .fa. Will also need a txt document that lists
## in one column all U and V-linked one-to-one orthologs of interest (e.g., orthologs.txt)

## the output of this can be used in ceratodon_aln_to_axt.pl to make 
## the axt input for KaKs Calculator

use strict;
use warnings;

# Usage text appended to every fatal error message.
my $usage = "perl <script_name> <name_of_fasta_file_to_mine> <name_of_file_contain_list_of_names_to_mine>\n\n";

# This script reads a FASTA file (wrapped or unwrapped) into memory and compares
# a user-specified list of sequence names against the sequences in that file.
# For every input file a new FASTA file, "<input>.out.sex.fa", is written that
# contains only the sequences named in the list, in list order.
#
# Arguments:
#   $ARGV[0]  FASTA file name, or a file-name suffix; every file in the current
#             directory matching "*<ARGV[0]>" is processed.
#   $ARGV[1]  text file with one sequence name per line (no leading '>').
#
# Only records whose header starts with a V-linked (CepurR40.VG<digits>) or
# U-linked (CepurGG1.UG<digits>) Ceratodon purpureus gene name are kept; the
# captured gene name (without anything that follows it on the header line) is
# the key that the list entries are matched against.

die "\n\nERROR    two arguments are required\n\nusage: $usage\n\n"
    unless @ARGV >= 2;

my ($file_ext, $list) = @ARGV;
my @file_array = glob "*$file_ext";

warn "WARNING: no files matching *$file_ext were found\n" unless @file_array;

# The list of wanted names is the same for every FASTA file, so read it once.
open my $list_fh, '<', $list
    or die "\n\nERROR    $list not found in same directory program terminated ($!)\n\nusage: $usage\n\n";

my @minningarray;
while (my $line = <$list_fh>) {
    $line =~ s/\r?\n\z//;       # strip LF or CRLF line endings
    next if $line eq '';        # an empty line can never name a sequence
    push @minningarray, $line;
}
close $list_fh;

for my $file (@file_array) {
    open my $in_fh, '<', $file
        or die "\n\n$file not found program terminated ($!)\n\nusage: $usage\n\n";

    my %genomehash;
    my $seqname;    # undef while inside a record whose header is not U/V-linked

    while (my $line = <$in_fh>) {
        $line =~ s/\r?\n\z//;

        if ($line =~ /^>/) {
            # The dot before VG/UG is escaped so it only matches a literal '.'.
            if ($line =~ /^>(Ceratodon_purpureus_v1pt1_CepurR40\.VG\d+|Ceratodon_purpureus_v1pt1_CepurGG1\.UG\d+)/) {
                $seqname = $1;
                $genomehash{$seqname} = "";
            }
            else {
                undef $seqname;
            }
            next;
        }

        # Append every sequence line up to the next header, so wrapped
        # (multi-line) records are stored in full.
        $genomehash{$seqname} .= $line if defined $seqname;
    }
    close $in_fh;

    print "\n\nHash table created successfully\n\n";

    my $out_file = "$file.out.sex.fa";
    open my $out_fh, '>', $out_file
        or die "\n\nERROR    cannot write $out_file: $!\n\n";

    my $found_count = 0;

    for my $name (@minningarray) {
        if (exists $genomehash{$name}) {
            print {$out_fh} ">$name\n$genomehash{$name}\n";
            $found_count++;
        }
        else {
            print "WARNING: $name was not found in Hash\n";
        }
    }

    # Buffered write errors only surface at close, so check it.
    close $out_fh
        or die "\n\nERROR    failed while writing $out_file: $!\n\n";

    print "\n\nFinished\n\n$found_count\t sequences in $file\n\n";
}

