# Sarah Hill, June 2020
# This file takes an alignment and looks at pre-specified primer binding locations to determine mismatching sites.
# Because binding site locations are given relative to the Brazilian alignment, it should therefore NOT be used on
# other datasets without very careful adaptation.
# In the output files, '.' is a match and any other character specifies the mismatch

from Bio import SeqIO
from Bio.Data import IUPACData
from Bio.Alphabet import IUPAC
import os
#import re
#from Bio import SeqUtils
#from Bio.Seq import Seq
#from itertools import combinations
#from Bio import pairwise2

# Set this to the directory that holds the input files. Every file name below
# is relative, so it is resolved against this working directory.
os.chdir('/PATH_TO_FILES/')

# Brazilian SARS-CoV-2 fasta alignment. Binding-site coordinates are applied
# directly to these aligned sequences.
cov_file = 'short_alignemnt.2020-05-12.Brazil.fasta'
# Fasta file of primer sequences. Each record ID is used to look the primer up
# in the locations table.
primer_file = 'Primers_and_probes_Brazil.fas'
# Locations of primer binding sites: comma-separated, with the columns read in
# the order assay, primer ID, direction (F or R), start, stop. Start and stop
# are used as 1-based, inclusive positions.
locations = 'Primers_and_probes_locations_Brazil.csv'
# Output file: one space-separated row per (sequence, primer) comparison.
out_file = 'short_alignemnt.2020-05-12.Brazil.mismatches.txt'


# Per-primer lookup tables, all keyed by primer ID (second CSV column).
start = {}       # primer ID -> start position, kept as the text read from the CSV
stop = {}        # primer ID -> stop position, kept as the text read from the CSV
direction = {}   # primer ID -> direction column
assay = {}       # primer ID -> assay name (first CSV column)

# Create dictionaries of start and stop sites.
# Expected columns: assay, primer ID, direction, start, stop. Extra columns
# are ignored; a row with fewer than five columns raises IndexError.
# 'rU' is the Python 2 universal-newline mode, so files saved with any
# line-ending convention are split into rows correctly.
with open(locations, 'rU') as handle:
    for line in handle:
        # A blank line (typically a trailing newline at the end of the file)
        # holds no record and would otherwise fail on the column lookups.
        if not line.strip():
            continue
        # Strip each field so the line terminator on the last column and any
        # padding around the commas never end up in a key or value.
        fields = [field.strip() for field in line.split(',')]
        primer_id = fields[1]
        assay[primer_id] = fields[0]
        direction[primer_id] = fields[2]
        start[primer_id] = fields[3]
        stop[primer_id] = fields[4]

# Compare every primer with every sequence in the alignment and write one
# space-separated row per (sequence, primer) pair to the output file.
#
# For each pair, the binding site is sliced out of the aligned sequence
# (reverse-complemented for 'R' primers) and compared position by position
# with the primer. Two positions match when their IUPAC codes share at least
# one base. In the mismatch string '.' marks a match and any other character
# is the base found in the sequence at a mismatching position.

# Parse the primers once up front instead of re-opening and re-parsing the
# primer file for every sequence in the alignment.
with open(primer_file, "rU") as primer_handle:
    primers = list(SeqIO.parse(primer_handle, "fasta", alphabet=IUPAC.ambiguous_dna))

with open(cov_file, "rU") as cov_handle:
    cov_records = SeqIO.parse(cov_handle, 'fasta', alphabet=IUPAC.ambiguous_dna)
    with open(out_file, 'w') as out_handle:
        header = ['Coronavirus_ID', 'Assay', 'Primer_ID', 'Primer_length',
                  'Number_mismatches', 'Percentage_mismatches',
                  'Mismatching_sequence', 'Primer_sequence']
        # write() produces the same space-separated line as the print
        # statement did, and is valid syntax under both Python 2 and 3.
        out_handle.write(' '.join(header) + '\n')

        for cov in cov_records:
            full_sequence = cov.seq.upper()

            for primer in primers:
                primer_name = primer.id
                if primer_name not in start:
                    # Fail with the primer name instead of an opaque
                    # int(None) TypeError when the tables are out of sync.
                    raise KeyError('Primer %s has no entry in %s'
                                   % (primer_name, locations))

                primer_direction = direction[primer_name]
                assay_name = assay[primer_name]

                # Locations are 1-based and inclusive; convert to a Python
                # slice (0-based start, exclusive stop).
                start_site = int(start[primer_name]) - 1
                stop_site = int(stop[primer_name])

                # Slice the coronavirus sequence down to the binding site.
                cov_sequence = full_sequence[start_site:stop_site]

                # Reverse primers bind the opposite strand, so compare them
                # against the reverse complement of the site. Compare with ==:
                # 'is' tests object identity and only happened to work for
                # one-character strings.
                if primer_direction == 'R':
                    cov_sequence = cov_sequence.reverse_complement()

                cov_sequence = str(cov_sequence)
                primer_sequence = str(primer.seq.upper())

                # Sites containing an unresolved base are skipped entirely.
                if 'N' in cov_sequence:
                    continue

                score = 0
                mismatch_chars = []
                # Search one by one through all positions of the primer.
                for i, primer_base in enumerate(primer_sequence):
                    primer_bases = IUPACData.ambiguous_dna_values[primer_base]
                    cov_base = cov_sequence[i]
                    # Characters that are not IUPAC codes (e.g. an alignment
                    # gap '-') share no base with the primer and are reported
                    # as mismatches instead of raising KeyError.
                    cov_bases = IUPACData.ambiguous_dna_values.get(cov_base, '')
                    if set(cov_bases).intersection(primer_bases):
                        mismatch_chars.append('.')
                        score += 1
                    else:
                        mismatch_chars.append(cov_base)
                mismatching_sequence = ''.join(mismatch_chars)
                # Progress output on stdout, one line per comparison.
                print(mismatching_sequence)

                length_primer = len(primer.seq)
                mismatch_sites = length_primer - score
                mismatch_percentage = (float(mismatch_sites) / float(length_primer)) * 100

                row = [cov.id, assay_name, primer.id, length_primer,
                       mismatch_sites, mismatch_percentage,
                       mismatching_sequence, primer_sequence]
                out_handle.write(' '.join(str(value) for value in row) + '\n')