//
//  lmf_graph.hpp
//  sequence
//
//  Created by Sensei on 9/24/15.
//  Copyright © 2015 Franco "Sensei" Milicchio. All rights reserved.
//

#ifndef lmf_graph_hpp
#define lmf_graph_hpp

#include <fstream>
#include <atomic>
#include <tbb/concurrent_hash_map.h>
#include <tbb/concurrent_vector.h>

#include <vector>
#include <algorithm>

#include "../libseq/common.h"
#include "../libseq/timer.h"

//! \brief The static kmer type. Change the template argument here to obtain a kmer type of a different size.
using kmertype = seq::bit_container<15>;

//! \brief The per-kmer property word stored as the mapped value of the hashmap
using kmerprop = uint32_t;

//! \brief Threading Building Block
namespace tbb
{
    /*!
     * Specialize hashing for bit containers
     */
    template<>
    struct tbb_hash_compare<kmertype>
    {
        static size_t hash(const kmertype& a)
        {
            return tbb_hasher(a.get());
        }
        
        static bool equal(const kmertype& a, const kmertype& b)
        {
            return a.get() == b.get();
        }
    };
}

/*!
 * This class represents De Bruijn graph: the hashmap *IS* the graph.
 *
 * Key      is a bit container: the kmer
 * Value    is a std::size_t: the frequency
 *
 * The value will be utilized to build the graph with no additional memory:
 *
 *         MSB         |         LSB
 * Value : |-----...---|GCTA|GCTA|
 *                      HBIT LBIT
 *
 * A bit it set if there exists a preceding kmer (in LBIT) or a following one
 * (in HBIT).
 *
 * The rest of the std::size_t is right now unutilized.
 */
class lmf_graph
{
public:
    
    //! \brief Constructor, the parameter specifies if the kmers should be matched in revcomplement
    lmf_graph(bool matchrevs = true) : matchrev_{matchrevs}
    {
        // NOP
    }
    
    //! \brief This function returns the kmer size
    inline unsigned int kmersize() const
    {
        return kmertype().size();
    }

    //! \brief This function returns the allocated hash buckets
    inline std::size_t buckets()
    {
        return hash_.bucket_count();
    }
    
    //! \brief This function preallocates buckets
    inline void resize(std::size_t nbuckets)
    {
        seq::timer t;
        
        t.start("Preallocating hashmap");
        hash_.rehash(nbuckets);
        t.stop();
    }

    
    //! \brief This function returns the number of unique kmers
    inline std::size_t size() const
    {
        return hash_.size();
    }
    
    //! \brief Inserts an element in the hashmap
    inline void insert(std::string &&s)
    {
        kmertype k, j;
        seq::container_traits<kmertype>::fromString(k, s);
        
        // Swap kmer/rev keeping the lesser one (as int)
        if (matchrev_)
        {
            // Find the revese complement of the kmer, and keep the lower
            j = revfn(k);

            if (j.get() < k.get())
            {
                k = j;
            }
        }
        
        // Increase the count number
        typename hashtype::accessor accessor;
        hash_.insert(accessor, k);
        accessor->second++;
    }
    
    //! \brief Dumps the hashmap to a CSV file
    void dump_csv(const std::string &name);

    //! \brief Dumps the graph to a graphviz file
    void dump_graph(const std::string &name);

    //! \brief Cleanup the kmer list before creating the graph
    void cleanup(double error = 0.01);
    
    //! \brief Build the graph
    void build();
    
    //! \brief Just a shorthand function to create a kmer from a string (lazy to use the whole traits)
    inline std::string tostring(kmertype k)
    {
        return seq::container_traits<kmertype>::toString(k);
    }

    //! \brief Just a shorthand function to create a string from a kmer (lazy to use the whole traits)
    inline kmertype fromstring(std::string k)
    {
        kmertype t;
        
        seq::container_traits<kmertype>::fromString(t, k);
        
        return t;
    }
    
    //! \brief Returns the next kmer with traliling letter (e.g., ATAC -> TAC-). USE WITH CAUTION!
    inline kmertype getnext(const kmertype &k, char traliling)
    {
        kmertype t, q;
        
        t.get() = (k.get() >> 2);// | (traliling << ((k.size() - 1) << 1));
        q.get() = (traliling << ((k.size() - 1) << 1));
        t.get() = t.get() | q.get();
        
        // Clean up trailing bits
        if (k.size() < 16)
        {
            q.get() = (~0) << (k.size() << 1);
            
            t.get() = t.get() & (~q.get());
        }
        
        return t;
    }

    //! \brief Returns the next kmer with traliling letter (e.g., ATAC -> TAC-). USE WITH CAUTION!
    inline kmertype getprev(const kmertype &k, char traliling)
    {
        kmertype t, q;
        
        t.get() = ((k.get() & ~(3 << ((k.size() - 1) << 1))) << 2) | traliling;
        
        // Clean up trailing bits
        if (k.size() < 16)
        {
            q.get() = (~0) << (k.size() << 1);
            
            t.get() = t.get() & (~q.get());
        }

        return t;
    }

private:
    
    //! \brief Handy typedef for the concurrent hashmap
    typedef tbb::concurrent_hash_map<kmertype, kmerprop> hashtype;
    //typedef tbb::concurrent_vector<kmerprop> hashtype;
    
    //! \brief Hashmap of kmers
    hashtype hash_;
    
    //! \brief Reverse complement
    seq::details::reverse revfn;
    
    //! \brief True if the graph should match reverse complements
    bool matchrev_;
    
    //! \brief Pair of bools used to check if a kmer is found as forward or reverse
    typedef std::pair<bool, bool> fwdrev;
    
    // Kmers with a given aminoacid found in forward or reverse order
    static constexpr kmerprop fwdA = 1 << kmertype::A;
    static constexpr kmerprop fwdT = 1 << kmertype::T;
    static constexpr kmerprop fwdC = 1 << kmertype::C;
    static constexpr kmerprop fwdG = 1 << kmertype::G;
    static constexpr kmerprop revA = (1 << kmertype::A) << 4;
    static constexpr kmerprop revT = (1 << kmertype::T) << 4;
    static constexpr kmerprop revC = (1 << kmertype::C) << 4;
    static constexpr kmerprop revG = (1 << kmertype::G) << 4;
};

#endif /* lmf_graph_hpp */
