//
//  accelerator_sse.cpp
//
//  Copyright 2018 Franco Milicchio. All rights reserved.
//

#include "accelerator_sse.hpp"

// Make names from the libseq namespace visible without qualification in this
// file.
using namespace libseq;

// File-local shorthand for the accelerator's register type, so the
// out-of-class definitions below can spell it unqualified.
using storage_type   = accelerator_sse::storage_type;

// Definition of the static member that holds the accelerator's name string.
const char accelerator_sse::name_[] = "accelerator_sse";

// Table of 64 bit masks, one per length k = 1..64: entry k - 1 belongs to
// length k, as labelled at the end of each row. Every row enables exactly two
// more bits than the row before it, ending with all 128 bits set for k = 64.
//
// _mm_set_epi8 lists its arguments from byte 15 down to byte 0, so the
// rightmost value of each row is the lowest byte of the register. The bits are
// filled in the same order in which to_forward() packs its 2-bit codes:
//   k =  1..16  low nibble of the even bytes,
//   k = 17..32  high nibble of the even bytes,
//   k = 33..48  low nibble of the odd bytes,
//   k = 49..64  high nibble of the odd bytes.
//
// NOTE(review): presumably mask(k) returns mask_[k - 1]; the accessor is not
// defined in this file -- confirm against the header.
storage_type accelerator_sse::mask_[64] =
{
    _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03), // k = 01
    _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f), // k = 02
    _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x0f), // k = 03
    _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x0f), // k = 04
    _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x0f, 0x00, 0x0f), // k = 05
    _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f), // k = 06
    _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f), // k = 07
    _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f), // k = 08
    _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f), // k = 09
    _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f), // k = 10
    _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f), // k = 11
    _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f), // k = 12
    _mm_set_epi8(0x00, 0x00, 0x00, 0x03, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f), // k = 13
    _mm_set_epi8(0x00, 0x00, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f), // k = 14
    _mm_set_epi8(0x00, 0x03, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f), // k = 15
    _mm_set_epi8(0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f), // k = 16
    _mm_set_epi8(0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x3f), // k = 17
    _mm_set_epi8(0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0xff), // k = 18
    _mm_set_epi8(0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x3f, 0x00, 0xff), // k = 19
    _mm_set_epi8(0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0xff, 0x00, 0xff), // k = 20
    _mm_set_epi8(0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x3f, 0x00, 0xff, 0x00, 0xff), // k = 21
    _mm_set_epi8(0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff), // k = 22
    _mm_set_epi8(0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x3f, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff), // k = 23
    _mm_set_epi8(0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff), // k = 24
    _mm_set_epi8(0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0x3f, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff), // k = 25
    _mm_set_epi8(0x00, 0x0f, 0x00, 0x0f, 0x00, 0x0f, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff), // k = 26
    _mm_set_epi8(0x00, 0x0f, 0x00, 0x0f, 0x00, 0x3f, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff), // k = 27
    _mm_set_epi8(0x00, 0x0f, 0x00, 0x0f, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff), // k = 28
    _mm_set_epi8(0x00, 0x0f, 0x00, 0x3f, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff), // k = 29
    _mm_set_epi8(0x00, 0x0f, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff), // k = 30
    _mm_set_epi8(0x00, 0x3f, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff), // k = 31
    _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff), // k = 32
    _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x03, 0xff), // k = 33
    _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x0f, 0xff), // k = 34
    _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x03, 0xff, 0x0f, 0xff), // k = 35
    _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x0f, 0xff, 0x0f, 0xff), // k = 36
    _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x03, 0xff, 0x0f, 0xff, 0x0f, 0xff), // k = 37
    _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff), // k = 38
    _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x03, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff), // k = 39
    _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff), // k = 40
    _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x03, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff), // k = 41
    _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff), // k = 42
    _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x03, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff), // k = 43
    _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff), // k = 44
    _mm_set_epi8(0x00, 0xff, 0x03, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff), // k = 45
    _mm_set_epi8(0x00, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff), // k = 46
    _mm_set_epi8(0x03, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff), // k = 47
    _mm_set_epi8(0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff), // k = 48
    _mm_set_epi8(0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x3f, 0xff), // k = 49
    _mm_set_epi8(0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0xff, 0xff), // k = 50
    _mm_set_epi8(0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x3f, 0xff, 0xff, 0xff), // k = 51
    _mm_set_epi8(0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff), // k = 52
    _mm_set_epi8(0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x3f, 0xff, 0xff, 0xff, 0xff, 0xff), // k = 53
    _mm_set_epi8(0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff), // k = 54
    _mm_set_epi8(0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x3f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff), // k = 55
    _mm_set_epi8(0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff), // k = 56
    _mm_set_epi8(0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0x3f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff), // k = 57
    _mm_set_epi8(0x0f, 0xff, 0x0f, 0xff, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff), // k = 58
    _mm_set_epi8(0x0f, 0xff, 0x0f, 0xff, 0x3f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff), // k = 59
    _mm_set_epi8(0x0f, 0xff, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff), // k = 60
    _mm_set_epi8(0x0f, 0xff, 0x3f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff), // k = 61
    _mm_set_epi8(0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff), // k = 62
    _mm_set_epi8(0x3f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff), // k = 63
    _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff)  // k = 64
};

/// Append one character per byte of a SIMD register to a string.
///
/// The bytes of `r` are visited in memory order. A zero byte yields 'A'; a
/// byte holding exactly one 2-bit code in any of its four bit pairs yields
/// 'T' (code 3), 'C' (code 2) or 'G' (code 1).
///
/// @param substring  Destination string; characters are appended to it.
/// @param r          Register to decode, taken by value.
/// @throws std::domain_error if a byte matches none of the patterns above;
///         a '.' is appended to `substring` before the exception is raised.
void dump_simd(std::string &substring, storage_type r)
{
    const unsigned char *cursor = reinterpret_cast<const unsigned char *>(&r);
    const unsigned char *const stop = cursor + sizeof(storage_type);

    for (; cursor != stop; ++cursor)
    {
        const unsigned char code = *cursor;

        if (code == 0x00)
        {
            substring += 'A';
        }
        else if (code == 0x03 || code == 0x0c || code == 0x30 || code == 0xc0)
        {
            substring += 'T';
        }
        else if (code == 0x02 || code == 0x08 || code == 0x20 || code == 0x80)
        {
            substring += 'C';
        }
        else if (code == 0x01 || code == 0x04 || code == 0x10 || code == 0x40)
        {
            substring += 'G';
        }
        else
        {
            // Leave a visible marker in the output, then report the bad byte
            substring += '.';
            throw std::domain_error("Unkown character in SIMD register.");
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
/// Pack the characters of a string into one 128-bit register, two bits each.
///
/// Four 16-byte chunks are always loaded starting at `s.data()`, whatever
/// `s.length()` is, so the caller must guarantee that 64 bytes are readable
/// from that address. The packed result is finally ANDed with
/// `mask(s.length())` to clear the bits that came from the over-read.
///
/// Per character: 'A' is subtracted, the low nibble is kept, and the value
/// 0x06 ('G') is remapped to 0x01. A multiply-add then merges each pair of
/// adjacent codes into one 16-bit lane; chunks 0 and 1 fill the low byte of
/// every lane, chunks 2 and 3 the high byte.
///
/// @param s  Characters to encode.
/// @return   The packed and masked register.
storage_type accelerator_sse::to_forward(const std::string_view &s) const
{
    // Constants shared by all four chunks
    const storage_type ascii_a = _mm_set1_epi8('A');
    const storage_type low_nib = _mm_set1_epi8(0x0f);
    const storage_type code_g  = _mm_set1_epi8(0x06);

    // Byte multipliers for the multiply-add: (1, 4) puts a pair of codes in
    // bits 0-3 of a lane, (16, 64) puts it in bits 4-7
    const storage_type weights[2] =
    {
        _mm_set1_epi16(0x0401),
        _mm_set1_epi16(0x4010)
    };

    storage_type packed[4];

    for (int chunk = 0; chunk < 4; ++chunk)
    {
        // Unaligned load that may run past the end of the string
        storage_type codes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s.data() + 16 * chunk));

        // Offset ASCII 'A' to zero and keep the low 4 bits
        codes = _mm_sub_epi8(codes, ascii_a);
        codes = _mm_and_si128(codes, low_nib);

        // Turn every 0x06 ('G') into 0x01: clear it, then subtract -1
        const storage_type is_g = _mm_cmpeq_epi8(codes, code_g);
        codes = _mm_sub_epi8(_mm_andnot_si128(is_g, codes), is_g);

        // Even chunks use the (1, 4) weights, odd chunks the (16, 64) ones
        packed[chunk] = _mm_maddubs_epi16(codes, weights[chunk & 1]);
    }

    // Chunks 2 and 3 belong in the high byte of every 16-bit lane
    storage_type res = _mm_or_si128(packed[0], packed[1]);
    res = _mm_or_si128(res, _mm_slli_epi16(packed[2], 8));
    res = _mm_or_si128(res, _mm_slli_epi16(packed[3], 8));

    // Drop whatever was packed from beyond the end of the string
    return _mm_and_si128(res, this->mask(s.length()));
}

////////////////////////////////////////////////////////////////////////////////
/// Pack the reverse complement of a string into one 128-bit register.
///
/// The 64 bytes starting at `s.data()` are loaded and byte-reversed into a
/// scratch buffer, so the reversed string ends up in its last `s.size()`
/// bytes. That window is encoded with to_forward(), every bit of the result
/// is inverted (which complements each 2-bit code), and the bits beyond the
/// string are cleared with `mask(s.length())`.
///
/// Preconditions: 64 bytes must be readable from `s.data()`, and `s.size()`
/// must not exceed 64 (otherwise the window offset wraps around).
///
/// @param s  Characters to encode.
/// @return   The packed, complemented and masked register.
storage_type accelerator_sse::to_revcomp(const std::string_view &s) const
{
    // Shuffle control that reverses the 16 bytes of a register
    const storage_type revmask = _mm_set_epi8(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f);

    // Scratch buffer: registers 0-3 receive the reversed input, registers 4-7
    // are padding so that to_forward() can load 64 bytes from any offset in
    // the first half without leaving the array
    storage_type sse[4 * 2];

    // Load substring data without caring about past-end-of-buffer, storing the
    // chunks in reverse order and reversing the bytes inside each chunk
    sse[3] = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(s.data() +  0)), revmask);
    sse[2] = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(s.data() + 16)), revmask);
    sse[1] = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(s.data() + 32)), revmask);
    sse[0] = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(s.data() + 48)), revmask);

    // The padding is read by to_forward() whenever the string is shorter than
    // 64 characters; give it a defined value instead of leaving it
    // uninitialised (its contribution is masked away either way)
    sse[4] = _mm_setzero_si128();
    sse[5] = _mm_setzero_si128();
    sse[6] = _mm_setzero_si128();
    sse[7] = _mm_setzero_si128();

    // The reversed string occupies the last s.size() bytes of the first half
    const char *reversed = reinterpret_cast<const char *>(sse) + (64 - s.size());
    storage_type res = to_forward(std::string_view(reversed, s.size()));

    // Invert every bit to complement each 2-bit code
    res = _mm_andnot_si128(res, _mm_set1_epi8(static_cast<char>(0xff)));

    // Use bitmask to clear unwanted bits
    return _mm_and_si128(res, this->mask(s.length()));
}

////////////////////////////////////////////////////////////////////////////////
/// Encode a string in its canonical orientation.
///
/// Both the forward and the reverse-complement encodings are computed; the
/// reverse complement is returned only when compare() ranks the forward
/// encoding strictly after it, otherwise the forward encoding is returned.
///
/// @param s  Characters to encode (same preconditions as to_forward() and
///           to_revcomp()).
/// @return   One of the two packed registers, as selected by compare().
storage_type accelerator_sse::to_canonical(const std::string_view &s) const
{
    const storage_type forward = to_forward(s);
    const storage_type reverse = to_revcomp(s);

    if (compare(forward, reverse) > 0)
    {
        return reverse;
    }

    return forward;
}

////////////////////////////////////////////////////////////////////////////////
std::string accelerator_sse::to_string(const storage_type r, std::size_t k) const
{
    // SIMD 128-bit registers (computational sse0, sse1)
    storage_type sse0[4], sse1[4];
    
    // Result
    std::string res;
    
    res.reserve(k);
    
    sse0[0] = _mm_and_si128(r, _mm_set1_epi16(0x0003));    // extract 8 bases
    sse0[1] = _mm_and_si128(r, _mm_set1_epi16(0x000c));    // second 8
    sse0[2] = _mm_and_si128(r, _mm_set1_epi16(0x0030));    // ...
    sse0[3] = _mm_and_si128(r, _mm_set1_epi16(0x00c0));    // you get the idea
    sse1[0] = _mm_and_si128(r, _mm_set1_epi16(0x0300));    //
    sse1[1] = _mm_and_si128(r, _mm_set1_epi16(0x0c00));    // ...
    sse1[2] = _mm_and_si128(r, _mm_set1_epi16(0x3000));    //
    sse1[3] = _mm_and_si128(r, _mm_set1_epi16(0xc000));    // done.
    
    // Rearrange bytes
    sse0[1] = _mm_shuffle_epi8(sse0[1],          _mm_set_epi8(0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xff));
    sse0[0] = _mm_blendv_epi8 (sse0[0], sse0[1], _mm_set_epi8(0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00));
    sse0[3] = _mm_shuffle_epi8(sse0[3],          _mm_set_epi8(0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xff));
    sse0[2] = _mm_blendv_epi8 (sse0[2], sse0[3], _mm_set_epi8(0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00));
    sse1[0] = _mm_shuffle_epi8(sse1[0],          _mm_set_epi8(0xff, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01));
    sse1[1] = _mm_blendv_epi8 (sse1[0], sse1[1], _mm_set_epi8(0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00));
    sse1[2] = _mm_shuffle_epi8(sse1[2],          _mm_set_epi8(0xff, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01));
    sse1[3] = _mm_blendv_epi8 (sse1[2], sse1[3], _mm_set_epi8(0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00));
    
    // Now in order dump sse[0,2], adj[1,3]
    dump_simd(res, sse0[0]);
    dump_simd(res, sse0[2]);
    dump_simd(res, sse1[1]);
    dump_simd(res, sse1[3]);
    
    // Cut the substring length
    return res.substr(0, k);
}

////////////////////////////////////////////////////////////////////////////////
/// Hash a packed register.
///
/// Delegates to a default-constructed `hash_storage_type` functor.
///
/// @param r  Register to hash.
/// @return   The value produced by the functor for `r`.
std::size_t accelerator_sse::hash(storage_type r) const
{
    hash_storage_type hasher;
    const std::size_t digest = hasher(r);

    return digest;
}

////////////////////////////////////////////////////////////////////////////////
/// Three-way comparison of two packed registers.
///
/// Each register is viewed as two unsigned 64-bit words. The low word
/// (bytes 0-7) is the primary key and the high word (bytes 8-15) breaks ties,
/// which gives a consistent total order: compare(a, b) and compare(b, a)
/// always have opposite signs. to_canonical() relies on this to pick the same
/// representative for a string and for its reverse complement.
///
/// @param r1  Left-hand register.
/// @param r2  Right-hand register.
/// @return    -1 if `r1` orders before `r2`, 0 if they are identical,
///            1 if `r1` orders after `r2`.
int accelerator_sse::compare(storage_type r1, storage_type r2) const
{
    // Spill both registers to memory with an unaligned store; this reads all
    // 128 bits whatever the width of std::size_t and avoids pointer punning
    unsigned long long w1[2];
    unsigned long long w2[2];

    _mm_storeu_si128(reinterpret_cast<__m128i *>(w1), r1);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(w2), r2);

    // Low word decides first
    if (w1[0] != w2[0])
    {
        return (w1[0] < w2[0]) ? -1 : 1;
    }

    // Low words tie: the high word decides
    if (w1[1] != w2[1])
    {
        return (w1[1] < w2[1]) ? -1 : 1;
    }

    return 0;
}

////////////////////////////////////////////////////////////////////////////////
/// Test two packed registers for equality.
///
/// @param r1  Left-hand register.
/// @param r2  Right-hand register.
/// @return    true when compare() reports 0 for the pair, false otherwise.
bool accelerator_sse::equal(storage_type r1, storage_type r2) const
{
    const int order = compare(r1, r2);

    return order == 0;
}
