// Cytosim was created by Francois Nedelec.  Copyright 2020 Cambridge University.

#include <cmath>
#include "sparmatsymblk.h"
#include "assert_macro.h"
#include "vector2.h"
#include "vector3.h"
#include <sstream>

// Flags to enable SIMD implementation
#if defined(__AVX__)
#  include "simd.h"
#  include "simd_float.h"
#  define SMSB_USES_AVX 1
#  define SMSB_USES_SSE 1
#elif defined(__SSE3__)
#  include "simd.h"
#  include "simd_float.h"
#  define SMSB_USES_AVX 0
#  define SMSB_USES_SSE 1
#else
#  define SMSB_USES_AVX 0
#  define SMSB_USES_SSE 0
#endif


/// Build an empty matrix: no columns, and a two-slot `colidx_` whose first entry is zero
SparMatSymBlk::SparMatSymBlk()
{
    // the index array exists from the start, so colidx_[0] is always readable
    colidx_ = new size_t[2];
    colidx_[0] = 0;
    // no column storage yet
    column_ = nullptr;
    alloc_ = 0;
    size_ = 0;
}


void SparMatSymBlk::allocate(size_t alc)
{
    if ( alc > alloc_ )
    {
        /*
         'chunk' can be increased to gain performance:
          more memory will be used, but reallocation will be less frequent
        */
        constexpr size_t chunk = 32;
        alc = ( alc + chunk - 1 ) & ~( chunk -1 );

        //fprintf(stderr, "SMSB allocates %u\n", alc);
        Column * ptr = new Column[alc];
       
        if ( column_ )
        {
            for (size_t n = 0; n < alloc_; ++n )
                ptr[n] = column_[n];
            delete[] column_;
        }
        
        column_ = ptr;
        alloc_  = alc;
        
        delete[] colidx_;
        colidx_ = new size_t[alc+1];
        for ( size_t n = 0; n <= alc; ++n )
            colidx_[n] = n;
    }
}


/// Release the column array and the column index; `size_` is not modified here
void SparMatSymBlk::deallocate()
{
    delete[] colidx_;
    colidx_ = nullptr;

    delete[] column_;
    column_ = nullptr;

    alloc_ = 0;
}

//------------------------------------------------------------------------------
#pragma mark - Column

/// Create an empty column that owns no memory
SparMatSymBlk::Column::Column()
{
    // no storage
    blk_ = nullptr;
    inx_ = nullptr;
    // no capacity, no elements
    allo_ = 0;
    size_ = 0;
}


/**
 Ensure that the column can hold at least `alc` blocks.

 Nothing is done if the current capacity `allo_` is sufficient. Otherwise the
 capacity is rounded up to a multiple of 8, new storage is obtained for the
 blocks (via new_real) and for the indices (via posix_memalign, 32-byte aligned),
 the first `size_` entries are copied over, and the old storage is released.

 @throws std::bad_alloc if the index array cannot be allocated; the block
 storage obtained just before is released in that case, and the column is
 left unchanged.

 \todo Columns should use partitions of a single memory pool allocated by Matrix
 This may require some smart allocation scheme.
 */
void SparMatSymBlk::Column::allocate(size_t alc)
{
    if ( alc <= allo_ )
        return;

    /*
     'chunk' can be increased, to possibly gain performance:
     more memory will be used, but reallocation will be less frequent
     */
    constexpr size_t chunk = 8;
    alc = ( alc + chunk - 1 ) & ~( chunk - 1 );

    // use aligned memory:
    // NOTE(review): the extra 4 scalars are presumably slack for SIMD loads
    // reaching past the last block -- confirm
    void * ptr = new_real(alc*SB+4);
    Block * blk_new = new(ptr) Block[alc];

    if ( posix_memalign(&ptr, 32, alc*sizeof(size_t)) )
    {
        // do not leak the block storage that was just obtained
        free_real(blk_new);
        throw std::bad_alloc();
    }
    size_t * inx_new = (size_t*)ptr;

    // transfer the existing indices
    if ( inx_ )
    {
        for ( size_t n = 0; n < size_; ++n )
            inx_new[n] = inx_[n];
        free(inx_);
    }

    // transfer the existing blocks
    if ( blk_ )
    {
        for ( size_t n = 0; n < size_; ++n )
            blk_new[n] = blk_[n];
        free_real(blk_);
    }

    inx_  = inx_new;
    blk_  = blk_new;
    allo_ = alc;
}


/// Release the memory held by this column, and leave it empty
void SparMatSymBlk::Column::deallocate()
{
    // blocks were obtained with new_real(), indices with posix_memalign()
    free_real(blk_);
    blk_ = nullptr;

    free(inx_);
    inx_ = nullptr;

    size_ = 0;
    allo_ = 0;
}


/**
 Transfer the content of `col` into this column.

 This is a transfer of ownership, not a copy: the storage previously held by
 this column is released, the pointers and counters of `col` are taken over,
 and `col` is left empty (null pointers, zero size and capacity).
 Assigning a column to itself is a no-op.
 */
void SparMatSymBlk::Column::operator =(SparMatSymBlk::Column & col)
{
    // guard against self-assignment, which would otherwise free the data
    // and leave this column empty
    if ( this == &col )
        return;

    //if ( inx_ ) fprintf(stderr, "SMSB transfers column %u\n", inx_[0]);
    free(inx_);
    free_real(blk_);

    // take over the storage of 'col'
    size_ = col.size_;
    allo_ = col.allo_;
    inx_ = col.inx_;
    blk_ = col.blk_;

    // 'col' no longer owns anything
    col.size_ = 0;
    col.allo_ = 0;
    col.inx_ = nullptr;
    col.blk_ = nullptr;
}

/**
 Return the block stored at line index `ii` in this column, of index `jj`.
 If no such block exists, it is created and reset, allocating memory if
 necessary. The first slot of a column always holds the diagonal block
 (line index `jj`), which is created together with the first requested block.
 */
SparMatSymBlk::Block& SparMatSymBlk::Column::block(size_t ii, size_t jj)
{
    assert_true( ii >= jj );

    if ( size_ == 0 )
    {
        // empty column: create the diagonal term, which always comes first
        allocate(2);
        inx_[0] = jj;
        blk_[0].reset();
        if ( ii == jj )
        {
            size_ = 1;
            return blk_[0];
        }
        // then the requested off-diagonal term
        inx_[1] = ii;
        blk_[1].reset();
        size_ = 2;
        return blk_[1];
    }

    /* This is a silly linear search that could be optimized */
    for ( size_t k = 0; k < size_; ++k )
    {
        if ( inx_[k] == ii )
            return blk_[k];
    }

    // not found: append the requested term at the end
    const size_t last = size_;

    // make room for the new block if necessary:
    if ( last >= allo_ )
        allocate(last+1);

    assert_true( last < allo_ );
    inx_[last] = ii;
    blk_[last].reset();
    size_ = last + 1;

    return blk_[last];
}


void SparMatSymBlk::Column::reset()
{
    size_ = 0;
}

/**
 Return the diagonal block of column `ii`.
 If the column is empty, the diagonal block is created in the first slot
 and reset before being returned.
 */
SparMatSymBlk::Block& SparMatSymBlk::diag_block(size_t ii)
{
    assert_true( ii < size_ );
    Column & col = column_[ii];

    if ( col.size_ > 0 )
    {
        // the diagonal term is always stored first
        assert_true(col.inx_[0] == ii);
        return col.blk_[0];
    }

    // empty column: create the diagonal term in the first slot
    col.allocate(1);
    col.inx_[0] = ii;
    col.blk_[0].reset();
    col.size_ = 1;

    assert_true(col.inx_[0] == ii);
    return col.blk_[0];
}


/**
 Return a reference to the scalar element at (iii, jjj).
 Only the lower triangle is stored: the larger index is used as line and the
 smaller as column. The enclosing block is created if it does not exist yet.
 */
real& SparMatSymBlk::operator()(size_t iii, size_t jjj)
{
    // map the request onto the lower triangle
    const size_t row = std::max(iii, jjj);
    const size_t col = std::min(iii, jjj);
#if ( BLOCK_SIZE == 1 )
    return column_[col].block(row, col).value();
#else
    // position inside the block
    const size_t sub_i = row % BLOCK_SIZE;
    const size_t sub_j = col % BLOCK_SIZE;
    // indices of the first line and column of the block
    const size_t blk_i = row - sub_i;
    const size_t blk_j = col - sub_j;
    return column_[blk_j].block(blk_i, blk_j)(sub_i, sub_j);
#endif
}


/**
 Return the address of the scalar element at (iii, jjj).
 Only the lower triangle is stored: the larger index is used as line and the
 smaller as column. Although this method is const, Column::block() will
 create the enclosing block if it does not exist yet.
 */
real* SparMatSymBlk::addr(size_t iii, size_t jjj) const
{
    // map the request onto the lower triangle
    const size_t row = std::max(iii, jjj);
    const size_t col = std::min(iii, jjj);
#if ( BLOCK_SIZE == 1 )
    return &column_[col].block(row, col).value();
#else
    // position inside the block
    const size_t sub_i = row % BLOCK_SIZE;
    const size_t sub_j = col % BLOCK_SIZE;
    // indices of the first line and column of the block
    const size_t blk_i = row - sub_i;
    const size_t blk_j = col - sub_j;
    return column_[blk_j].block(blk_i, blk_j).addr(sub_i, sub_j);
#endif
}


//------------------------------------------------------------------------------
#pragma mark -

/// Empty every column of the matrix; Column::reset() keeps the allocated memory
void SparMatSymBlk::reset()
{
    size_t n = size_;
    while ( n > 0 )
    {
        --n;
        column_[n].reset();
    }
}


/// Return true if any stored block compares different from zero
bool SparMatSymBlk::isNotZero() const
{
    size_t jj = 0;
    while ( jj < size_ )
    {
        Column & col = column_[jj];
        // scan the blocks stored in this column
        size_t n = 0;
        while ( n < col.size_ )
        {
            if ( col[n] != 0.0 )
                return true;
            ++n;
        }
        ++jj;
    }
    // no non-zero block was found
    return false;
}


/// Multiply every stored block by `alpha`
void SparMatSymBlk::scale(const real alpha)
{
    size_t jj = 0;
    while ( jj < size_ )
    {
        Column & col = column_[jj];
        size_t n = 0;
        while ( n < col.size_ )
        {
            col[n].scale(alpha);
            ++n;
        }
        ++jj;
    }
}


/**
 Add the square sub-matrix covering indices [start, start+cnt) to `mat`.

 Element (i, j) of the sub-matrix is addressed as mat[(i-start) + ldd*(j-start)].
 Diagonal blocks are added with addto_symm(); each stored off-diagonal block
 whose line index is below `start+cnt` is added at its own position with
 addto(), and at the mirrored position with addto_trans().
 `start` and `cnt` must be multiples of BLOCK_SIZE.
 */
void SparMatSymBlk::addDiagonalBlock(real* mat, size_t ldd,
                                     const size_t start, const size_t cnt) const
{
    assert_false( start % BLOCK_SIZE );
    assert_false( cnt % BLOCK_SIZE );

    const size_t end = start + cnt;
    // offset of element (start, start), subtracted from all addresses
    const size_t off = start + ldd * start;
    assert_true( end <= size_ );

    for ( size_t jj = start; jj < end; jj += BLOCK_SIZE )
    {
        Column & col = column_[jj];
        if ( col.size_ == 0 )
            continue;

        // the diagonal block is stored first
        assert_true(col.inx_[0] == jj);
        col[0].addto_symm(mat+(1+ldd)*jj-off, ldd);

        for ( size_t n = 1; n < col.size_; ++n )
        {
            const size_t ii = col.inx_[n];
            assert_true(ii > jj);
            // skip blocks below the requested range
            if ( ii >= end )
                continue;
            col[n].addto(mat+(ii+ldd*jj)-off, ldd);
            col[n].addto_trans(mat+(jj+ldd*ii)-off, ldd);
        }
    }
}


/**
 Add `alpha` times the lower part of the sub-matrix covering indices
 [start, start+cnt) to `mat`, limited to a band below the diagonal.

 Element (i, j) is addressed as mat[(i-start) + ldd*(j-start)].
 Diagonal blocks are added with addto_lower(); an off-diagonal block at line
 `ii` in column `jj` is added only if `ii <= jj+rank` and `ii < start+cnt`.
 Nothing is written to the mirrored (upper) positions.
 `start` and `cnt` must be multiples of BLOCK_SIZE.
 */
void SparMatSymBlk::addLowerBand(real alpha, real* mat, size_t ldd,
                                 const size_t start, const size_t cnt, size_t rank) const
{
    assert_false( start % BLOCK_SIZE );
    assert_false( cnt % BLOCK_SIZE );

    const size_t end = start + cnt;
    // offset of element (start, start), subtracted from all addresses
    const size_t off = start + ldd * start;
    assert_true( end <= size_ );

    for ( size_t jj = start; jj < end; jj += BLOCK_SIZE )
    {
        Column & col = column_[jj];
        if ( col.size_ == 0 )
            continue;

        // the diagonal block is stored first
        assert_true(col.inx_[0] == jj);
        col[0].addto_lower(mat+(1+ldd)*jj-off, ldd, alpha);

        for ( size_t n = 1; n < col.size_; ++n )
        {
            const size_t ii = col.inx_[n];
            assert_true(ii > jj);
            // skip blocks outside the band, or below the requested range
            if (( ii > jj+rank ) | ( ii >= end ))
                continue;
            col[n].addto(mat+(ii+ldd*jj)-off, ldd, alpha);
        }
    }
}

/*
 Add `alpha` times the trace of each block within [start, start+cnt) to `mat`.

 `mat` is indexed by block: with i = (ii-start)/BLOCK_SIZE and
 j = (jj-start)/BLOCK_SIZE, the block at line `ii`, column `jj` contributes
 to mat[i+ldd*j], and also to mat[j+ldd*i] if `sym` is true.
 Off-diagonal blocks are included only if `ii < start+cnt` and `ii <= jj+rank`.

 NOTE(review): this comment used to describe lower banded storage, where
 mat(i, j) is stored in mat[i-j+ldd*j]; the code does not use that layout.
 Confirm which addressing is intended.
*/
void SparMatSymBlk::addDiagonalTrace(real alpha, real* mat, size_t ldd,
                                     const size_t start, const size_t cnt,
                                     const size_t rank, bool sym) const
{
    assert_false( start % BLOCK_SIZE );
    assert_false( cnt % BLOCK_SIZE );

    const size_t end = start + cnt;
    assert_true( end <= size_ );

    for ( size_t jj = start; jj < end; jj += BLOCK_SIZE )
    {
        Column & col = column_[jj];
        if ( col.size_ == 0 )
            continue;

        // block-column index within 'mat'
        const size_t j = ( jj - start ) / BLOCK_SIZE;
        assert_true(col.inx_[0] == jj);
        // diagonal term
        mat[j+ldd*j] += alpha * col[0].trace();

        for ( size_t n = 1; n < col.size_; ++n )
        {
            const size_t ii = col.inx_[n];
            // assuming lower triangle is stored:
            assert_false( ii % BLOCK_SIZE );
            assert_true( ii > jj );
            // skip blocks below the requested range, or outside the band
            if (( ii >= end ) | ( ii > jj+rank ))
                continue;
            // block-line index within 'mat'
            const size_t i = ( ii - start ) / BLOCK_SIZE;
            const real a = alpha * col[n].trace();
            mat[i+ldd*j] += a;
            if ( sym )
                mat[j+ldd*i] += a;
        }
    }
}



/**
 Check the consistency of the stored indices.

 @return 0 if no problem was found, or:
 - 1 if the matrix has size zero,
 - 2 if a block has a line index that is not below `size_`,
 - 3 if the first block of a column is not the diagonal block, or if another
     block of the column is not strictly below the diagonal.

 The first slot of a non-empty column holds the diagonal block, for which
 `inx_[0] == jj`; only the following slots must satisfy `inx_[n] > jj`.
 */
int SparMatSymBlk::bad() const
{
    if ( size_ == 0 ) return 1;
    for ( size_t jj = 0; jj < size_; jj += BLOCK_SIZE )
    {
        Column & col = column_[jj];
        if ( col.size_ == 0 )
            continue;

        // diagonal block, always stored first:
        if ( col.inx_[0] >= size_ ) return 2;
        if ( col.inx_[0] != jj )    return 3;

        // off-diagonal blocks belong to the strict lower triangle:
        for ( size_t n = 1 ; n < col.size_ ; ++n )
        {
            if ( col.inx_[n] >= size_ ) return 2;
            if ( col.inx_[n] <= jj )    return 3;
        }
    }
    return 0;
}


/** all allocated elements are counted, even if zero */
size_t SparMatSymBlk::nbElements(size_t start, size_t stop, size_t& alc) const
{
    assert_true( start <= stop );
    assert_true( stop <= size_ );
    alc = 0;
    size_t cnt = 0;
    for ( size_t i = start; i < stop; i += BLOCK_SIZE )
    {
        cnt += column_[i].size_;
        alc += column_[i].allo_;
    }
    return cnt;
}


//------------------------------------------------------------------------------
#pragma mark -


/**
 Return a short description of the matrix: a tag that reflects the SIMD
 configuration selected at compile time, followed by the block type, the
 number of blocks in use and the allocated count multiplied by SB.
 */
std::string SparMatSymBlk::what() const
{
    // tag selected by the compile-time SIMD flags
#if SMSB_USES_AVX && REAL_IS_DOUBLE
    const char tag[] = "SMSBx ";
#elif SMSB_USES_SSE && REAL_IS_DOUBLE
    const char tag[] = "SMSBe ";
#elif SMSB_USES_SSE
    const char tag[] = "SMSBf ";
#else
    const char tag[] = "SMSB ";
#endif
    size_t alc = 0;
    const size_t cnt = nbElements(0, size_, alc);

    std::ostringstream msg;
    msg << tag;
    msg << Block::what() << "*" << cnt;
    msg << " (" << alc*SB << ")";
    return msg.str();
}


/**
 Print the stored values of columns [start, min(stop, size_)) in text format,
 one "line column value" triplet per output line.

 Only values with magnitude at least `inf` are printed. For the first block
 of a column (the diagonal block) only entries with x >= y are visited.
 A "% column" header is printed for each column for which isNotZero() is true.

 Nothing is printed, and the stream is left untouched, if no column storage
 has been allocated. The stream precision is restored before returning.
 */
void SparMatSymBlk::printSparse(std::ostream& os, real inf, size_t start, size_t stop) const
{
    // check before touching the stream, so that its state is never left modified
    if ( ! column_ )
        return;
    stop = std::min(stop, size_);
    char str[256];
    const std::streamsize p = os.precision();
    os.precision(8);
    for ( size_t jj = start; jj < stop; ++jj )
    {
        Column & col = column_[jj];
        if ( col.isNotZero() )
            os << "% column " << jj << "\n";
        // d == 1 only for the first block, restricting it to x >= y
        size_t d = 1;
        for ( size_t n = 0 ; n < col.size_ ; ++n, d = 0 )
        {
            size_t ii = col.inx_[n];
            Block B = col.blk_[n];
            for ( size_t y = 0  ; y < BLOCK_SIZE; ++y )
            for ( size_t x = y*d; x < BLOCK_SIZE; ++x )
            {
                real v = B(y, x);
                if ( abs_real(v) >= inf )
                {
                    // %zu is the conversion matching size_t
                    snprintf(str, sizeof(str), "%6zu %6zu %16.6f\n", ii+y, jj+x, v);
                    os << str;
                }
            }
        }
    }
    os.precision(p);
}


void SparMatSymBlk::printColumns(std::ostream& os, size_t start, size_t stop)
{
    stop = std::min(stop, size_);
    os << "SMSB size " << size_ << ":";
    for ( size_t j = start; j < stop; ++j )
        if ( column_[j].isNotZero() )
        {
            os << "\n   " << j << "   " << column_[j].size_;
            os << " index " << colidx_[j];
        }
    std::endl(os);
}


/// Print every block of the column, each preceded by its line index
void SparMatSymBlk::Column::print(std::ostream& os) const
{
    size_t n = 0;
    while ( n < size_ )
    {
        os << "\n" << inx_[n] << " : ";
        os << blk_[n] << "\n";
        ++n;
    }
    // terminate the line and flush
    os << std::endl;
}


//------------------------------------------------------------------------------
#pragma mark - Vector Multiplication


/// A block element of the sparse matrix suitable for qsort()
/**
 Plain aggregate holding the scalar values of one block together with its
 index, so that both move together when an array of Element is sorted.
 Column::sortElements() copies blocks in and out of `blk` with
 Block::store() and Block::load(), and the comparison function orders
 elements by `inx`.
 */
class SparMatSymBlk::Element
{
public:
    /// block elements: storage for BLOCK_SIZE*BLOCK_SIZE scalars
    real blk[BLOCK_SIZE*BLOCK_SIZE];

    /// index of the block within its column (copied from Column::inx_), used as sort key
    size_t inx;
};


/// Comparison function for qsort(): orders Elements by increasing index `inx`
/** Returns -1, 0 or +1 if the index of A is below, equal to or above that of B */
static int compareSMSBElement(const void * A, const void * B)
{
    const size_t a = static_cast<SparMatSymBlk::Element const*>(A)->inx;
    const size_t b = static_cast<SparMatSymBlk::Element const*>(B)->inx;

    if ( a < b )
        return -1;
    return ( a > b ) ? 1 : 0;
}

/**
 Sort the off-diagonal blocks of the column by increasing line index.

 The first block (the diagonal term) stays in place. The other blocks are
 copied with their indices into the provided temporary array `tmp`, which
 must hold at least `size_` elements, sorted there with qsort(), and copied
 back. `tmp[0]` is not used.

 Nothing is done for a column holding less than two blocks, since there is
 then no off-diagonal block to sort.
 */
void SparMatSymBlk::Column::sortElements(Element tmp[], size_t tmp_size)
{
    // the early exit also avoids the unsigned wrap-around of `size_-1`
    // in the call to qsort() below, when size_ == 0
    if ( size_ < 2 )
        return;

    assert_true( size_ <= tmp_size );

    // copy the off-diagonal blocks and their indices out:
    for ( size_t i = 1; i < size_; ++i )
    {
        blk_[i].store(tmp[i].blk);
        tmp[i].inx = inx_[i];
    }

    //std::clog << "sizeof(SparMatSymBlk::Element) " << sizeof(Element) << "\n";
    qsort(tmp+1, size_-1, sizeof(Element), &compareSMSBElement);

    // copy the sorted blocks and indices back:
    for ( size_t i = 1; i < size_; ++i )
    {
        blk_[i].load(tmp[i].blk);
        inx_[i] = tmp[i].inx;
    }
}


/**
 Replace the array `ptr` by a new array of at least `cnt` Elements.

 The previous array is released with free(); its content is not preserved.
 The new array is 32-byte aligned and its size is rounded up to a multiple
 of 16. It must eventually be released with free().

 @param ptr  in: previous array, or nullptr; out: the new array
 @param cnt  minimum number of Elements requested
 @return the number of Elements in the new array
 @throws std::bad_alloc if the allocation fails, in which case `ptr` is null
 */
size_t SparMatSymBlk::newElements(Element*& ptr, size_t cnt)
{
    constexpr size_t chunk = 16;
    const size_t all = ( cnt + chunk - 1 ) & ~( chunk - 1 );
    free(ptr);  // Element has no destructor
    // do not leave the caller with a dangling pointer if the allocation fails
    ptr = nullptr;
    void* tmp = nullptr;
    if ( posix_memalign(&tmp, 32, all*sizeof(Element)) )
        throw std::bad_alloc();
    ptr = new(tmp) Element[all];
    return all;
}


void SparMatSymBlk::sortElements()
{
    //size_t cnt = 0;
    size_t tmp_size = 0;
    Element * tmp = nullptr;
    
    for ( size_t j = colidx_[0]; j < size_; j = colidx_[j+1] )
    {
        assert_true( j < size_ );
        Column & col = column_[j];
        assert_true( col.size_ > 0 );
        //std::clog << "SMSB column " << j << " has " << col.size_ << " elements\n";
        
        // order the elements within the column:
        if ( col.size_ > 2 )
        {
            if ( tmp_size < col.size_ )
                tmp_size = newElements(tmp, col.size_);
            col.sortElements(tmp, tmp_size);
        }
        
        //++cnt;
        
        // diagonal element should be first:
        assert_true( col.inx_[0] == j );
        col.blk_[0].copy_lower();
#ifndef NDEBUG
        for ( size_t n = 1 ; n < col.size_ ; ++n )
        {
            const size_t i = col.inx_[n];
            assert_true( i < size_ );
            assert_true( i != j );
        }
#endif
    }
    
    free(tmp);
    //std::clog << "SparMatSymBlk " << size_ << " with " << cnt << " non-empty columns\n";
}


bool SparMatSymBlk::prepareForMultiply(int)
{
    if ( size_ > 0 )
    {
        size_t inx = size_;
        size_t nxt = size_;
        while ( inx-- > 0 )
        {
            if ( column_[inx].isNotZero() )
                nxt = inx;
            else
                column_[inx].deallocate();
            colidx_[inx] = nxt;
        }
    }
    colidx_[size_] = size_;

    // check if matrix is empty:
    if ( colidx_[0] == size_ )
        return false;
    
    sortElements();
    //printColumns(std::cout);
    return true;
}


//------------------------------------------------------------------------------
#pragma mark - Column Vector Multiplication


#if ( BLOCK_SIZE == 1 )
void SparMatSymBlk::Column::vecMulAdd1D(const real* X, real* Y, size_t jj) const
{
    assert_true(size_ > 0);
    const real X0 = X[jj];
    real D = blk_[0].value();
    real Y0 = Y[jj] + D * X0;
    assert_true(inx_[0]==jj);
    for ( size_t n = 1; n < size_; ++n )
    {
        const size_t ii = inx_[n];
        const real M = blk_[n].value();
        Y[ii] += M * X0;
        Y0 += M * X[ii];
    }
    Y[jj] = Y0;
}
#endif


#if ( BLOCK_SIZE == 2 )
void SparMatSymBlk::Column::vecMulAdd2D(const real* X, real* Y, size_t jj) const
{
    assert_true(size_ > 0);
    const Vector2 xx(X+jj);
    assert_true(inx_[0]==jj);
    assert_small(blk_[0].asymmetry());
    Vector2 yy = blk_[0].vecmul(xx);
    for ( size_t n = 1; n < size_; ++n )
    {
        const size_t ii = inx_[n];
        Block const& M = blk_[n];
        M.vecmul(xx).add_to(Y+ii);
        yy += M.trans_vecmul(X+ii);
    }
    yy.add_to(Y+jj);
}
#endif

#if ( BLOCK_SIZE == 3 )
void SparMatSymBlk::Column::vecMulAdd3D(const real* X, real* Y, size_t jj) const
{
    assert_true(size_ > 0);
    const Vector3 xxx(X+jj);
    assert_true(inx_[0]==jj);
    assert_small(blk_[0].asymmetry());
    Vector3 yyy = blk_[0].vecmul(xxx);
    for ( size_t n = 1; n < size_; ++n )
    {
        const size_t ii = inx_[n];
        Block const& M = blk_[n];
        M.vecmul(xxx).add_to(Y+ii);
        yyy += M.trans_vecmul(X+ii);
    }
    yyy.add_to(Y+jj);
}
#endif


//------------------------------------------------------------------------------
#pragma mark - Manually Optimized Vector Multiplication

#if ( BLOCK_SIZE == 3 ) && SMSB_USES_SSE && !REAL_IS_DOUBLE
/**
 Single-precision SIMD version of the column multiplication for 3x3 blocks.

 Adds to Y the contribution of this column, of index `jj`, to the product
 with X: each off-diagonal block M at line `ii` adds M * X[jj..] to Y[ii..],
 while the diagonal product and the transposed products M^t * X[ii..] are
 accumulated in three vectors, summed horizontally and added to Y[jj..].

 NOTE(review): X and Y are accessed 4 floats at a time, hence one float past
 each 3-vector; presumably the arrays are padded accordingly -- confirm.
 */
void SparMatSymBlk::Column::vecMulAdd3D_SSE(const float* X, float* Y, size_t jj) const
{
    assert_true(size_ > 0);
    assert_true(inx_[0] == jj);
    // load 3x3 matrix diagonal element into 3 vectors:
    float const* D = blk_[0];

    //multiply with the symmetrized block, assuming it has been symmetrized:
    // Y0 = Y[jj  ] + M[0] * X0 + M[1] * X1 + M[2] * X2;
    // Y1 = Y[jj+1] + M[1] * X0 + M[4] * X1 + M[5] * X2;
    // Y2 = Y[jj+2] + M[2] * X0 + M[5] * X1 + M[8] * X2;
    /* vec4 s0, s1, s2 add lines of the transposed-matrix multiplied by 'xyz' */
    const vec4f tt = loadu4f(X+jj);
# if ( BLD == 4 )
    vec4f s0 = mul4f(streamload4f(D  ), tt);
    vec4f s1 = mul4f(streamload4f(D+4), tt);
    vec4f s2 = mul4f(streamload4f(D+8), tt);
# else
    vec4f s0 = mul4f(load3fZ(D      ), tt);
    vec4f s1 = mul4f(load3fZ(D+BLD  ), tt);
    vec4f s2 = mul4f(load3fZ(D+BLD*2), tt);
# endif
    // each component of X[jj..] replicated over a full vector:
    const vec4f x0 = broadcastXf(tt);
    const vec4f x1 = broadcastYf(tt);
    const vec4f x2 = broadcastZf(tt);

    // There is a dependency in the loop for 's0', 's1' and 's2'.
    #pragma nounroll
    for ( size_t n = 1; n < size_; ++n )
    {
        const size_t ii = inx_[n];
        float const* M = blk_[n];
# if ( BLD == 4 )
        const vec4f m012 = streamload4f(M  );
        const vec4f m345 = streamload4f(M+4);
        const vec4f m678 = streamload4f(M+8);
# else
        const vec4f m012 = load3fZ(M      );
        const vec4f m345 = load3fZ(M+BLD  );
        const vec4f m678 = load3fZ(M+BLD*2);
# endif
        // multiply with the full block:
        //Y[ii  ] +=  M[0] * X0 + M[3] * X1 + M[6] * X2;
        //Y[ii+1] +=  M[1] * X0 + M[4] * X1 + M[7] * X2;
        //Y[ii+2] +=  M[2] * X0 + M[5] * X1 + M[8] * X2;
        vec4f z = fmadd4f(m012, x0, loadu4f(Y+ii));
        z = fmadd4f(m345, x1, z);
        z = fmadd4f(m678, x2, z);
        storeu4f(Y+ii, z);

        // multiply with the transposed block:
        //Y0 += M[0] * X[ii] + M[1] * X[ii+1] + M[2] * X[ii+2];
        //Y1 += M[3] * X[ii] + M[4] * X[ii+1] + M[5] * X[ii+2];
        //Y2 += M[6] * X[ii] + M[7] * X[ii+1] + M[8] * X[ii+2];
        vec4f xyz = loadu4f(X+ii);  // xyz = { X0 X1 X2 - }
        s0 = fmadd4f(m012, xyz, s0);
        s1 = fmadd4f(m345, xyz, s1);
        s2 = fmadd4f(m678, xyz, s2);
    }
    /* finally sum horizontally:
     s0 = { Y0 Y0 Y0 0 }, s1 = { Y1 Y1 Y1 0 }, s2 = { Y2 Y2 Y2 0 }
     to { Y0+Y0+Y0, Y1+Y1+Y1, Y2+Y2+Y2, 0 }
     */
    vec4f s3 = setzero4f();
    s0 = add4f(unpacklo4f(s0, s1), unpackhi4f(s0, s1));
    s2 = add4f(unpacklo4f(s2, s3), unpackhi4f(s2, s3));
    s0 = add4f(blend22f(s2, s0), blend22f(s0, s2));
    storeu4f(Y+jj, add4f(loadu4f(Y+jj), s0));
}
#endif


#if ( BLOCK_SIZE == 3 ) && SMSB_USES_SSE && !REAL_IS_DOUBLE
/**
 Single-precision SIMD version of the column multiplication for 3x3 blocks,
 in which the off-diagonal blocks are processed two at a time.

 Adds to Y the contribution of this column, of index `jj`, to the product
 with X: each off-diagonal block M at line `ii` adds M * X[jj..] to Y[ii..],
 while the diagonal product and the transposed products M^t * X[ii..] are
 accumulated in s0, s1 and s2, summed horizontally and added to Y[jj..].
 The pairwise loop asserts that consecutive line indices are increasing.

 NOTE(review): X and Y are accessed 4 floats at a time, hence one float past
 each 3-vector; presumably the arrays are padded accordingly -- confirm.
 */
void SparMatSymBlk::Column::vecMulAdd3D_SSEU(const float* X, float* Y, size_t jj) const
{
    assert_true(size_ > 0);
    assert_true(inx_[0] == jj);
    //std::cout << blk_[0].to_string(7,1); printf(" MSSB %lu : %lu\n", jj, size_);
    // load 3x3 matrix diagonal element into 3 vectors:
    float const* D = blk_[0];
    
    //multiply with the diagonal block, assuming it has been symmetrized:
    // Y0 = Y[jj  ] + M[0] * X0 + M[1] * X1 + M[2] * X2;
    // Y1 = Y[jj+1] + M[1] * X0 + M[4] * X1 + M[5] * X2;
    // Y2 = Y[jj+2] + M[2] * X0 + M[5] * X1 + M[8] * X2;
    /* vec4 s0, s1, s2 add lines of the transposed-matrix multiplied by 'xyz' */
    const vec4f tt = loadu4f(X+jj);
# if ( BLD == 4 )
    vec4f s0 = mul4f(streamload4f(D  ), tt);
    vec4f s1 = mul4f(streamload4f(D+4), tt);
    vec4f s2 = mul4f(streamload4f(D+8), tt);
# else
    vec4f s0 = mul4f(load3fZ(D      ), tt);
    vec4f s1 = mul4f(load3fZ(D+BLD  ), tt);
    vec4f s2 = mul4f(load3fZ(D+BLD*2), tt);
# endif
    
    // off-diagonal blocks, if any:
    if ( size_ > 1 )
    {
        // each component of X[jj..] replicated over a full vector:
        const vec4f x0 = broadcastXf(tt);
        const vec4f x1 = broadcastYf(tt);
        const vec4f x2 = broadcastZf(tt);

        size_t n = 1;
        {
            // 'end' is 1 + the largest even number of off-diagonal blocks
            const size_t end = 1 + 2 * ((size_-1)/2);
            
            // process 2 by 2
#pragma nounroll
            for ( ; n < end; n += 2 )
            {
                const size_t ii = inx_[n  ];
                const size_t kk = inx_[n+1];
                assert_true( ii < kk );
                float const* M = blk_[n  ];
                float const* P = blk_[n+1];
                vec4f z = loadu4f(Y+ii);
                vec4f t = loadu4f(Y+kk);
# if ( BLD == 4 )
                const vec4f m012 = streamload4f(M);
                const vec4f p012 = streamload4f(P);
                const vec4f m345 = streamload4f(M+4);
                const vec4f p345 = streamload4f(P+4);
                const vec4f m678 = streamload4f(M+8);
                const vec4f p678 = streamload4f(P+8);
# else
                const vec4f m012 = load3fZ(M);
                const vec4f p012 = load3fZ(P);
                const vec4f m345 = load3fZ(M+BLD);
                const vec4f p345 = load3fZ(P+BLD);
                const vec4f m678 = load3fZ(M+BLD*2);
                const vec4f p678 = load3fZ(P+BLD*2);
# endif
                // multiply with the full block:
                // ( operations on the two blocks are interleaved )
                z = fmadd4f(m012, x0, z);
                t = fmadd4f(p012, x0, t);
                vec4f xyz = loadu4f(X+ii);  // xyz = { X0 X1 X2 - } at line ii
                vec4f tuv = loadu4f(X+kk);  // tuv = { X0 X1 X2 - } at line kk
                z = fmadd4f(m345, x1, z);
                t = fmadd4f(p345, x1, t);
                // multiply with the transposed block M:
                s0 = fmadd4f(m012, xyz, s0);
                s1 = fmadd4f(m345, xyz, s1);
                s2 = fmadd4f(m678, xyz, s2);
                z = fmadd4f(m678, x2, z);
                t = fmadd4f(p678, x2, t);
                // multiply with the transposed block P:
                s0 = fmadd4f(p012, tuv, s0);
                s1 = fmadd4f(p345, tuv, s1);
                s2 = fmadd4f(p678, tuv, s2);
                storeu4f(Y+ii, z);
                storeu4f(Y+kk, t);
            }
        }
        
        // process remaining blocks
#pragma nounroll
        for ( ; n < size_; ++n )
        {
            const size_t ii = inx_[n];
            float const* M = blk_[n];
# if ( BLD == 4 )
            const vec4f m012 = streamload4f(M  );
            const vec4f m345 = streamload4f(M+4);
            const vec4f m678 = streamload4f(M+8);
# else
            const vec4f m012 = load3fZ(M      );
            const vec4f m345 = load3fZ(M+BLD  );
            const vec4f m678 = load3fZ(M+BLD*2);
# endif
            // multiply with the full block:
            //Y[ii  ] +=  M[0] * X0 + M[3] * X1 + M[6] * X2;
            //Y[ii+1] +=  M[1] * X0 + M[4] * X1 + M[7] * X2;
            //Y[ii+2] +=  M[2] * X0 + M[5] * X1 + M[8] * X2;
            vec4f z = fmadd4f(m012, x0, loadu4f(Y+ii));
            z = fmadd4f(m345, x1, z);
            z = fmadd4f(m678, x2, z);
            storeu4f(Y+ii, z);
            
            // multiply with the transposed block:
            //Y0 += M[0] * X[ii] + M[1] * X[ii+1] + M[2] * X[ii+2];
            //Y1 += M[3] * X[ii] + M[4] * X[ii+1] + M[5] * X[ii+2];
            //Y2 += M[6] * X[ii] + M[7] * X[ii+1] + M[8] * X[ii+2];
            vec4f xyz = loadu4f(X+ii);  // xyz = { X0 X1 X2 - }
            s0 = fmadd4f(m012, xyz, s0);
            s1 = fmadd4f(m345, xyz, s1);
            s2 = fmadd4f(m678, xyz, s2);
        }
    }
    
    /* finally sum horizontally:
     s0 = { Y0 Y0 Y0 0 }, s1 = { Y1 Y1 Y1 0 }, s2 = { Y2 Y2 Y2 0 }
     to { Y0+Y0+Y0, Y1+Y1+Y1, Y2+Y2+Y2, 0 }
     */
    vec4f s3 = setzero4f();
    s0 = add4f(unpacklo4f(s0, s1), unpackhi4f(s0, s1));
    s2 = add4f(unpacklo4f(s2, s3), unpackhi4f(s2, s3));
    s0 = add4f(blend22f(s2, s0), blend22f(s0, s2));
    storeu4f(Y+jj, add4f(loadu4f(Y+jj), s0));
}
#endif


//------------------------------------------------------------------------------
#pragma mark - Manually Optimized Vector Multiplication

#if ( BLOCK_SIZE == 2 ) && defined(__SSE3__) && REAL_IS_DOUBLE
/**
 Double-precision SIMD version of the column multiplication for 2x2 blocks,
 using 2-wide vectors.

 Adds to Y the contribution of this column, of index `jj`, to the product
 with X: each off-diagonal block M at line `ii` adds M * X[jj..] to Y[ii..],
 while the diagonal product and the transposed products M^t * X[ii..] are
 accumulated in `yy`, which is written to Y[jj..] at the end.

 NOTE(review): load2() and store2() are applied to X+jj, Y+jj, X+ii and Y+ii;
 if these are aligned accesses, the arrays must be aligned accordingly -- confirm.
 */
void SparMatSymBlk::Column::vecMulAdd2D_SSE(const double* X, double* Y, size_t jj) const
{
    assert_true(size_ > 0);
    // x0 = { X0 X0 } and x1 = { X1 X1 }, constant over the loop
    vec2 x0, x1;
    // accumulator for Y[jj] and Y[jj+1]
    vec2 yy = load2(Y+jj);
    {
        //const real X0 = X[jj  ];
        //const real X1 = X[jj+1];
        vec2 xx = load2(X+jj);
        x0 = unpacklo2(xx, xx);
        x1 = unpackhi2(xx, xx);
        
        // load 2x2 matrix element into 2 vectors:
        double const* D = blk_[0];
        //assume the block is already symmetrized:
        // Y0 = Y[jj  ] + M[0] * X0 + M[1] * X1;
        // Y1 = Y[jj+1] + M[1] * X0 + M[3] * X1;
        xx = add2(mul2(load2(D  ), x0), yy);
        yy = add2(mul2(load2(D+2), x1), xx);
    }
    
    // while x0 and x1 are constant, there is a dependency in the loop for 'yy'.
    for ( size_t n = 1; n < size_; ++n )
    {
        const size_t ii = inx_[n];
        vec2 xx = load2(X+ii);
        
        // load 2x2 matrix element into 2 vectors:
        double const* M = blk_[n];
        vec2 m01 = load2(M);
        vec2 m23 = load2(M+2);
        
        // multiply with the full block:
        //Y[ii  ] += M[0] * X0 + M[2] * X1;
        //Y[ii+1] += M[1] * X0 + M[3] * X1;
        vec2 mx0 = add2(mul2(m01, x0), load2(Y+ii));
        mx0 = add2(mul2(m23, x1), mx0);
        store2(Y+ii, mx0);

        // multiply with the transposed block:
        //Y0 += M[0] * X[ii] + M[1] * X[ii+1];
        //Y1 += M[2] * X[ii] + M[3] * X[ii+1];
        vec2 mxx = mul2(m01, xx);
        vec2 myy = mul2(m23, xx);
        // horizontal sums of 'mxx' and 'myy' are packed in one vector:
        yy = add2(add2(unpacklo2(mxx, myy), unpackhi2(mxx, myy)), yy);
    }
    //Y[jj  ] = Y0;
    //Y[jj+1] = Y1;
    store2(Y+jj, yy);
}
#endif

#if ( BLOCK_SIZE == 2 ) && SMSB_USES_AVX && REAL_IS_DOUBLE
/**
 Double-precision SIMD version of the column multiplication for 2x2 blocks,
 where each block is loaded as a single 4-wide vector.

 Adds to Y the contribution of this column, of index `jj`, to the product
 with X: each off-diagonal block M at line `ii` adds M * X[jj..] to Y[ii..],
 while the diagonal product and the transposed products M^t * X[ii..] are
 accumulated in `ss`, which is summed horizontally and added to Y[jj..].
 */
void SparMatSymBlk::Column::vecMulAdd2D_AVX(const double* X, double* Y, size_t jj) const
{
    assert_true(size_ > 0);
    assert_true(inx_[0] == jj);
    // xy = { X0 X1 X0 X1 }
    vec4 xy = broadcast2(X+jj);
    //multiply with full block, assuming it is symmetric:
    // Y0 = M[0] * X0 + M[1] * X1;
    // Y1 = M[1] * X0 + M[3] * X1;
    
    // yyyy = { Y0 Y0 Y1 Y1 }
    // load 2x2 matrix element into 2 vectors:
    vec4 ss = mul4(streamload4(blk_[0]), xy);

    //const real X0 = X[jj  ];
    //const real X1 = X[jj+1];
    // xxyy = { X0 X0 X1 X1 }
    const vec4 xxyy = permute4(xy, 0b1100);

    // while x0 and x1 are constant, there is a dependency in the loop for 'yy'.
    for ( size_t n = 1; n < size_; ++n )
    {
        const size_t& ii = inx_[n];
        vec4 mat = streamload4(blk_[n]);      // load 2x2 matrix
        vec4 yy = load2Z(Y+ii);         // yy = { Y0 Y1 0 0 }
        vec4 xx = broadcast2(X+ii);     // xx = { X0 X1 X0 X1 }

        // multiply with the full block:
        //Y[ii  ] += M[0] * X0 + M[2] * X1;
        //Y[ii+1] += M[1] * X0 + M[3] * X1;
        vec4 u = fmadd4(mat, xxyy, yy);
        // the two halves of 'u' are added to obtain the result for Y[ii..]
        store2(Y+ii, add2(getlo(u), gethi(u)));
        
        // multiply with the transposed block:
        //Y0 += M[0] * X[ii] + M[1] * X[ii+1];
        //Y1 += M[2] * X[ii] + M[3] * X[ii+1];
        ss = fmadd4(mat, xx, ss);
    }
    // need to collapse yyyy = { S0 S0 S1 S1 }
    // Y[jj  ] += yyyy[0] + yyyy[1];
    // Y[jj+1] += yyyy[2] + yyyy[3];
    vec2 yy = load2(Y+jj);
    vec2 h = gethi(ss);
    store2(Y+jj, add2(yy, add2(unpacklo2(getlo(ss), h), unpackhi2(getlo(ss), h))));
}
#endif


#if ( BLOCK_SIZE == 2 ) && SMSB_USES_AVX && REAL_IS_DOUBLE
/**
 Process one off-diagonal 2x2 block `mat` located at line `ii`:
 - adds the product of the block with the column's X values to Y[ii], Y[ii+1];
   `xxxx` is used lane by lane against `mat`, and callers in this file pass
   { X0 X0 X1 X1 }
 - accumulates in `ss` the lane-wise product of the block with
   { X[ii] X[ii+1] X[ii] X[ii+1] }, to be summed horizontally by the caller.
 */
inline static void multiply2D(double const* X, double* Y, size_t ii, vec4 const& mat, vec4 const& xxxx, vec4& ss)
{
    // X is read before Y is written
    vec4 xx = broadcast2(X+ii);
    vec4 u = fmadd4(mat, xxxx, load2Z(Y+ii));
    // the two halves of 'u' are added to obtain the result for Y[ii..]
    store2(Y+ii, add2(getlo(u), gethi(u)));
    ss = fmadd4(mat, xx, ss);
}
#endif


#if ( BLOCK_SIZE == 2 ) && SMSB_USES_AVX && REAL_IS_DOUBLE
/**
 Double-precision SIMD version of the column multiplication for 2x2 blocks,
 in which the off-diagonal blocks are processed two at a time, using two
 independent accumulators `ss` and `s1`.

 Adds to Y the contribution of this column, of index `jj`, to the product
 with X: each off-diagonal block M at line `ii` adds M * X[jj..] to Y[ii..],
 while the diagonal product and the transposed products M^t * X[ii..] are
 accumulated, summed horizontally and added to Y[jj..].
 The pairwise loop asserts that consecutive line indices are increasing.
 */
void SparMatSymBlk::Column::vecMulAdd2D_AVXU(const double* X, double* Y, size_t jj) const
{
    assert_true(size_ > 0);
    assert_true(inx_[0] == jj);
    // xyxy = { X0 X1 X0 X1 }
    vec4 xyxy = broadcast2(X+jj);
    // start with the diagonal block
    vec4 ss = mul4(streamload4(blk_[0]), xyxy);
    // xxyy = { X0 X0 X1 X1 }
    const vec4 xxyy = permute4(xyxy, 0b1100);
    // second accumulator, used for the odd block of each pair
    vec4 s1 = setzero4();

    size_t n = 1;
    // 'end' is 1 + the largest even number of off-diagonal blocks
    const size_t end = 1 + 2 * ((size_-1)/2);
    // process 2 by 2:
    #pragma nounroll
    for ( ; n < end; n += 2 )
    {
#if ( 0 )
        /*
         Since all the indices are different, the blocks can be processed in
         parallel, and micro-operations can be interleaved to avoid latency.
         The compiler however cannot assume this, because the indices of the
         blocks are not known at compile time.
         */
        multiply2D(X, Y, inx_[n  ], streamload4(blk_[n  ]), xxyy, ss);
        multiply2D(X, Y, inx_[n+1], streamload4(blk_[n+1]), xxyy, s1);
#else
        /* we remove here the apparent dependency on the values of Y[],
         which are read and written, but at different indices.
         The compiler can reorder instructions to avoid latencies */
        const size_t i0 = inx_[n  ];
        const size_t i1 = inx_[n+1];
        assert_true( i0 < i1 );
        vec4 mat0 = streamload4(blk_[n  ]);
        vec4 mat1 = streamload4(blk_[n+1]);
        // both reads of Y are done before any write
        vec4 u0 = fmadd4(mat0, xxyy, load2Z(Y+i0));
        vec4 u1 = fmadd4(mat1, xxyy, load2Z(Y+i1));
        ss = fmadd4(mat0, broadcast2(X+i0), ss);
        s1 = fmadd4(mat1, broadcast2(X+i1), s1);
        store2(Y+i0, add2(getlo(u0), gethi(u0)));
        store2(Y+i1, add2(getlo(u1), gethi(u1)));
#endif
    }
    // collapse 'ss'
    ss = add4(ss, s1);
    // process remaining blocks:
    #pragma nounroll
    for ( ; n < size_; ++n )
        multiply2D(X, Y, inx_[n], streamload4(blk_[n]), xxyy, ss);
    /* finally horizontally sum ss = { SX SX SY SY } */
    vec2 h = gethi(ss);
    h = add2(unpacklo2(getlo(ss), h), unpackhi2(getlo(ss), h));
    store2(Y+jj, add2(load2(Y+jj), h));
}
#endif


#if ( BLOCK_SIZE == 2 ) && SMSB_USES_AVX && REAL_IS_DOUBLE
void SparMatSymBlk::Column::vecMulAdd2D_AVXUU(const double* X, double* Y, size_t jj) const
{
    assert_true(size_ > 0);
    assert_true(inx_[0] == jj);
    vec4 xyxy = broadcast2(X+jj);
    vec4 ss = mul4(streamload4(blk_[0]), xyxy);
    const vec4 xxyy = permute4(xyxy, 0b1100);
    vec4 s1 = setzero4();
    vec4 s2 = setzero4();
    vec4 s3 = setzero4();

    size_t n = 1;
    const size_t end = 1 + 4 * ((size_-1)/4);
    // process 4 by 4:
    #pragma nounroll
    for ( ; n < end; n += 4 )
    {
#if ( 0 )
        /*
         Since all the indices are different, the blocks can be processed in
         parallel, and micro-operations can be interleaved to avoid latency.
         The compiler however cannot assume this, because the indices of the
         blocks are not known at compile time.
         */
        multiply2D(X, Y, inx_[n  ], streamload4(blk_[n  ]), xxyy, ss);
        multiply2D(X, Y, inx_[n+1], streamload4(blk_[n+1]), xxyy, s1);
        multiply2D(X, Y, inx_[n+2], streamload4(blk_[n+2]), xxyy, s2);
        multiply2D(X, Y, inx_[n+3], streamload4(blk_[n+3]), xxyy, s3);
#else
        /* we remove here the apparent dependency on the values of Y[],
         which are read and written, but at different indices.
         The compiler can reorder instructions to avoid lattencies */
        assert_true( inx_[n  ] < inx_[n+1] );
        assert_true( inx_[n+1] < inx_[n+2] );
        assert_true( inx_[n+2] < inx_[n+3] );
        const size_t i0 = inx_[n  ];
        const size_t i1 = inx_[n+1];
        const size_t i2 = inx_[n+2];
        const size_t i3 = inx_[n+3];
        vec4 mat0 = streamload4(blk_[n  ]);
        vec4 mat1 = streamload4(blk_[n+1]);
        vec4 mat2 = streamload4(blk_[n+2]);
        vec4 mat3 = streamload4(blk_[n+3]);
        vec4 u0 = fmadd4(mat0, xxyy, load2Z(Y+i0));
        vec4 u1 = fmadd4(mat1, xxyy, load2Z(Y+i1));
        vec4 u2 = fmadd4(mat2, xxyy, load2Z(Y+i2));
        vec4 u3 = fmadd4(mat3, xxyy, load2Z(Y+i3));
        ss = fmadd4(mat0, broadcast2(X+i0), ss);
        s1 = fmadd4(mat1, broadcast2(X+i1), s1);
        s2 = fmadd4(mat2, broadcast2(X+i2), s2);
        s3 = fmadd4(mat3, broadcast2(X+i3), s3);
        store2(Y+i0, add2(getlo(u0), gethi(u0)));
        store2(Y+i1, add2(getlo(u1), gethi(u1)));
        store2(Y+i2, add2(getlo(u2), gethi(u2)));
        store2(Y+i3, add2(getlo(u3), gethi(u3)));
#endif
    }
    // collapse 'ss'
    ss = add4(add4(ss,s1), add4(s2,s3));
    // process remaining blocks:
    #pragma nounroll
    for ( ; n < size_; ++n )
        multiply2D(X, Y, inx_[n], streamload4(blk_[n]), xxyy, ss);
    /* finally sum ss = { S0 S0 S1 S1 } */
    vec2 h = gethi(ss);
    h = add2(unpacklo2(getlo(ss), h), unpackhi2(getlo(ss), h));
    store2(Y+jj, add2(load2(Y+jj), h));
}
#endif


#if ( BLOCK_SIZE == 3 ) && SMSB_USES_AVX && REAL_IS_DOUBLE
/**
 AVX kernel for 3x3 blocks, one block per iteration.

 The column must hold at least one block, and its first block must be at index
 `jj` (both are asserted). The first block only contributes to Y[jj..jj+2];
 it is assumed to have been symmetrized beforehand.
 Each other block `blk_[n]` contributes twice:
 - as a full block, to Y at index `inx_[n]`, using X[jj..jj+2];
 - as a transposed block, to Y[jj..jj+2], using X at index `inx_[n]`.

 NOTE(review): X and Y are accessed with 4-wide loads and stores (loadu4 /
 storeu4), so one element past each 3-vector is read, and written back for Y.
 This presumably relies on the vectors being padded and on the 4th lane of the
 loaded block data being zero -- confirm with the allocation code.
 */
void SparMatSymBlk::Column::vecMulAdd3D_AVX(const double* X, double* Y, size_t jj) const
{
    assert_true(size_ > 0);
    assert_true(inx_[0] == jj);
    // first block of the column, asserted above to be at index 'jj'
    double const* D = blk_[0];
    
    //multiply with the symmetrized block, assuming it has been symmetrized:
    // Y0 = Y[jj  ] + M[0] * X0 + M[1] * X1 + M[2] * X2;
    // Y1 = Y[jj+1] + M[1] * X0 + M[4] * X1 + M[5] * X2;
    // Y2 = Y[jj+2] + M[2] * X0 + M[5] * X1 + M[8] * X2;
    /* vec4 s0, s1, s2 add lines of the transposed-matrix multiplied by 'xyz' */
    vec4 s0, s1, s2;
    /* vec4 x0, x1, x2 each hold one component of X[jj..jj+2] in all lanes */
    vec4 x0, x1, x2;
    {
        vec4 tt = loadu4(X+jj);
# if ( BLD == 4 )
        s0 = mul4(streamload4(D  ), tt);
        s1 = mul4(streamload4(D+4), tt);
        s2 = mul4(streamload4(D+8), tt);
# else
        s0 = mul4(load3(D      ), tt);
        s1 = mul4(load3(D+BLD  ), tt);
        s2 = mul4(load3(D+BLD*2), tt);
# endif
        // prepare the broadcasted components of X[jj..jj+2]:
#if ( 0 )
        x0 = broadcast1(X+jj);
        x1 = broadcast1(X+jj+1);
        x2 = broadcast1(X+jj+2);
#else
        // same result obtained from 'tt', without reloading from memory
        vec4 p = swap2f128(tt);
        vec4 l = blend22(tt, p);
        vec4 u = blend22(p, tt);
        x0 = duplo4(l);
        x1 = duphi4(l);
        x2 = duplo4(u);
#endif
        // the closing brace must stay outside of the #if/#else above,
        // so that both variants compile
    }
    // There is a dependency in the loop for 's0', 's1' and 's2'.
    #pragma nounroll
    for ( size_t n = 1; n < size_; ++n )
    {
        const size_t ii = inx_[n];
        double const* M = blk_[n];
# if ( BLD == 4 )
        const vec4 m012 = streamload4(M  );
        const vec4 m345 = streamload4(M+4);
        const vec4 m678 = streamload4(M+8);
# else
        const vec4 m012 = load3(M      );
        const vec4 m345 = load3(M+BLD  );
        const vec4 m678 = load3(M+BLD*2);
# endif
        // multiply with the full block:
        //Y[ii  ] +=  M[0] * X0 + M[3] * X1 + M[6] * X2;
        //Y[ii+1] +=  M[1] * X0 + M[4] * X1 + M[7] * X2;
        //Y[ii+2] +=  M[2] * X0 + M[5] * X1 + M[8] * X2;
        vec4 z = fmadd4(m012, x0, loadu4(Y+ii));
        z = fmadd4(m345, x1, z);
        z = fmadd4(m678, x2, z);
        storeu4(Y+ii, z);
        
        // multiply with the transposed block:
        //Y0 += M[0] * X[ii] + M[1] * X[ii+1] + M[2] * X[ii+2];
        //Y1 += M[3] * X[ii] + M[4] * X[ii+1] + M[5] * X[ii+2];
        //Y2 += M[6] * X[ii] + M[7] * X[ii+1] + M[8] * X[ii+2];
        vec4 xyz = loadu4(X+ii);  // xyz = { X0 X1 X2 - }
        s0 = fmadd4(m012, xyz, s0);
        s1 = fmadd4(m345, xyz, s1);
        s2 = fmadd4(m678, xyz, s2);
    }
    // finally sum s0 = { Y0 Y0 Y0 - }, s1 = { Y1 Y1 Y1 - }, s2 = { Y2 Y2 Y2 - }
#if ( 0 )
    Y[jj  ] += s0[0] + s0[1] + s0[2];
    Y[jj+1] += s1[0] + s1[1] + s1[2];
    Y[jj+2] += s2[0] + s2[1] + s2[2];
#else
    // horizontal sums of s0, s1, s2 gathered into one vector, with a zero
    // vector standing in for the missing 4th accumulator
    vec4 s3 = setzero4();
    s0 = add4(unpacklo4(s0, s1), unpackhi4(s0, s1));
    s2 = add4(unpacklo4(s2, s3), unpackhi4(s2, s3));
    s1 = add4(catshift2(s0, s2), blend22(s0, s2));
    storeu4(Y+jj, add4(loadu4(Y+jj), s1));
#endif
}
#endif


#if ( BLOCK_SIZE == 3 ) && SMSB_USES_AVX && REAL_IS_DOUBLE
/**
 AVX kernel for 3x3 blocks, processing the blocks after the first one in pairs.

 The column must hold at least one block, and its first block must be at index
 `jj` (both are asserted). The first block only contributes to Y[jj..jj+2].
 Each other block contributes twice: to Y at its own index using X[jj..jj+2],
 and to Y[jj..jj+2] using X at its own index.

 Block data is always read with 4-wide loads at offsets 0, 4 and 8, unlike
 vecMulAdd3D_AVX which also supports another layout when BLD != 4.
 NOTE(review): confirm that BLD is always 4 when this kernel is selected.

 The blocks are walked through a raw pointer advanced by `SB`, which relies on
 the blocks of the column being contiguous in memory, `SB` reals apart.

 X and Y are accessed with 4-wide loads and stores, so one element past each
 3-vector is read, and written back for Y: see the note before the stores in
 the pairwise loop.
 */
void SparMatSymBlk::Column::vecMulAdd3D_AVXU(const double* X, double* Y, size_t jj) const
{
    assert_true(size_ > 0);
    assert_true(inx_[0] == jj);
    // M walks over the block data, starting with the first block
    const real* M = blk_[0];

    // s0, s1, s2 accumulate the terms going to Y[jj], Y[jj+1], Y[jj+2];
    // t0, t1, t2 are a second set used for the second block of each pair
    vec4 s0, s1, s2;
    // x0, x1, x2 each hold one component of X[jj..jj+2] in all lanes
    vec4 x0, x1, x2;
    vec4 t0 = setzero4();
    vec4 t1 = setzero4();
    vec4 t2 = setzero4();
    // load the first 3x3 block as 3 vectors, and multiply with X[jj..]:
    {
        vec4 tt = loadu4(X+jj);
        // multiply by diagonal elements:
        s0 = mul4(streamload4(M  ), tt);
        s1 = mul4(streamload4(M+4), tt);
        s2 = mul4(streamload4(M+8), tt);
        // prepare broadcasted vectors:
        vec4 p = swap2f128(tt);
        vec4 l = blend22(tt, p);
        vec4 u = blend22(p, tt);
        x0 = duplo4(l);
        x1 = duphi4(l);
        x2 = duplo4(u);
    }
    // move to the second block
    M += SB;
    // There is a dependency in the loop for 's0', 's1' and 's2'.
    // 'end' is the address of the first block not covered by the pairwise loop
    const real* end = blk_[1+2*((size_-1)/2)];
    // 'inx' follows 'M': it points to the index of the block at 'M'
    const size_t * inx = inx_+1;
    /*
     Unrolling will reduce the dependency chain, which may be limiting the
     throughput here. However the number of registers (16 for AVX CPU) limits
     the level of unrolling that can be done.
     */
    //process 2 by 2:
    #pragma nounroll
    for ( ; M < end; M += 2*SB )
    {
        const size_t i0 = inx[0];
        const size_t i1 = inx[1];
        // required for the two overlapping stores below to be correct
        assert_true( i0 < i1 );
        inx += 2;
        //printf("--- %4i %4i\n", i0, i1);
        // first vector of each block: start z0, z1 from the current Y values
        vec4 ma0 = streamload4(M);
        vec4 ma1 = streamload4(M+SB);
        vec4 z0 = fmadd4(ma0, x0, loadu4(Y+i0));
        vec4 z1 = fmadd4(ma1, x0, loadu4(Y+i1));
        vec4 xyz0 = loadu4(X+i0);
        vec4 xyz1 = loadu4(X+i1);
        s0 = fmadd4(ma0, xyz0, s0);
        t0 = fmadd4(ma1, xyz1, t0);
        // second vector of each block:
        vec4 mb0 = streamload4(M+4);
        vec4 mb1 = streamload4(M+(SB+4));
        z0 = fmadd4(mb0, x1, z0);
        z1 = fmadd4(mb1, x1, z1);
        s1 = fmadd4(mb0, xyz0, s1);
        t1 = fmadd4(mb1, xyz1, t1);
        // third vector of each block:
        vec4 mc0 = streamload4(M+8);
        vec4 mc1 = streamload4(M+(SB+8));
        z0 = fmadd4(mc0, x2, z0);
        z1 = fmadd4(mc1, x2, z1);
        s2 = fmadd4(mc0, xyz0, s2);
        t2 = fmadd4(mc1, xyz1, t2);
        /*
         Attention: each store below writes 4 values, hence also Y[i+3].
         The 4th lane of z0 and z1 holds the value that was loaded from Y,
         presumably unchanged because only zero is added to it (this assumes
         that the 4th lane of the block data is zero).
         Both loads from Y were done before any store, so this 4th lane can be
         outdated: if the blocks are consecutive and in reverse order
         (i1 + 3 == i0), storing z1 after z0 would overwrite the updated
         Y[i0] with its old value, giving a wrong result. With i0 < i1, the
         outdated value is written first, and then overwritten by the second
         store with the correct one.
         The solution is thus to either use 'store3(Y+i1, z1)', or to make
         sure that indices are non-consecutive, or ordered in the column in
         increasing order, as asserted above.
         This affects performance since 'store3' is slower than 'storeu4'.
         */
        storeu4(Y+i0, z0);
        storeu4(Y+i1, z1);
    }
    // merge the two sets of accumulators
    s0 = add4(s0, t0);
    s1 = add4(s1, t1);
    s2 = add4(s2, t2);
    
    // process remaining blocks:
    // 'end' is now the address just past the last block of the column
    end = blk_[size_];
    #pragma nounroll
    for ( ; M < end; M += SB )
    {
        const size_t ii = inx[0];
        ++inx;
        //printf("--- %4i\n", ii);
        // same operations as in the pairwise loop, for a single block
        vec4 ma = streamload4(M);
        vec4 z = fmadd4(ma, x0, loadu4(Y+ii));
        vec4 xyz = loadu4(X+ii);
        s0 = fmadd4(ma, xyz, s0);
        
        vec4 mb = streamload4(M+4);
        z = fmadd4(mb, x1, z);
        s1 = fmadd4(mb, xyz, s1);
        
        vec4 mc = streamload4(M+8);
        z = fmadd4(mc, x2, z);
        s2 = fmadd4(mc, xyz, s2);
        storeu4(Y+ii, z);
    }
    // finally sum s0 = { Y0 Y0 Y0 0 }, s1 = { Y1 Y1 Y1 0 }, s2 = { Y2 Y2 Y2 0 }
    // 'x0' is not needed anymore and is reused as a zero vector, standing in
    // for the missing 4th accumulator
    x0 = setzero4();
    s0 = add4(unpacklo4(s0, s1), unpackhi4(s0, s1));
    s1 = add4(unpacklo4(s2, x0), unpackhi4(s2, x0));
    s0 = add4(catshift2(s0, s1), blend22(s0, s1));
    // add the accumulated terms to Y[jj..jj+2], with a 4-wide load and store
    storeu4(Y+jj, add4(loadu4(Y+jj), s0));
}
#endif


#if ( BLOCK_SIZE == 4 ) && SMSB_USES_AVX && REAL_IS_DOUBLE
/**
 AVX kernel for 4x4 blocks, one block per iteration.

 The column must hold at least one block, and its first block must be at index
 `jj` (both are asserted). The first block only contributes to Y[jj..jj+3];
 it is assumed to have been symmetrized beforehand.
 Each other block `blk_[n]` contributes twice:
 - to Y at index `inx_[n]`, using X[jj..jj+3];
 - to Y[jj..jj+3], using X at index `inx_[n]`.
 Each block is read as four consecutive groups of 4 values.
 */
void SparMatSymBlk::Column::vecMulAdd4D_AVX(const double* X, double* Y, size_t jj) const
{
    assert_true(size_ > 0);
    assert_true(inx_[0] == jj);
    // first block of the column, asserted above to be at index 'jj'
    double const* first = blk_[0];
    const vec4 xjj = load4(X+jj);
    // accK collects the terms going to Y[jj+K]; it starts with the K-th group
    // of the first block multiplied element-wise with X[jj..jj+3]
    vec4 acc0 = mul4(streamload4(first   ), xjj);
    vec4 acc1 = mul4(streamload4(first+4 ), xjj);
    vec4 acc2 = mul4(streamload4(first+8 ), xjj);
    vec4 acc3 = mul4(streamload4(first+12), xjj);
    // bxK holds X[jj+K] in all four lanes
#if ( 0 )
    const vec4 bx0 = broadcast1(X+jj);
    const vec4 bx1 = broadcast1(X+jj+1);
    const vec4 bx2 = broadcast1(X+jj+2);
    const vec4 bx3 = broadcast1(X+jj+3);
#else
    // same result obtained from 'xjj', without reloading from memory
    const vec4 lowerPair = duplo2f128(xjj);
    const vec4 upperPair = duphi2f128(xjj);
    const vec4 bx0 = duplo4(lowerPair);
    const vec4 bx1 = duphi4(lowerPair);
    const vec4 bx2 = duplo4(upperPair);
    const vec4 bx3 = duphi4(upperPair);
#endif
    // Each iteration depends on the previous one through acc0 ... acc3.
    #pragma nounroll
    for ( size_t k = 1; k < size_; ++k )
    {
        const size_t row = inx_[k];
        double const* data = blk_[k];
        const vec4 yrow = load4(Y+row);
        const vec4 xrow = load4(X+row);  // xrow = { X0 X1 X2 X3 }
        const vec4 g0 = streamload4(data);
        const vec4 g1 = streamload4(data+4);
        const vec4 g2 = streamload4(data+8);
        const vec4 g3 = streamload4(data+12);
        // block times X[jj..jj+3], on top of the current value of Y[row..]
        vec4 out = fmadd4(g0, bx0, yrow);
        out = fmadd4(g1, bx1, out);
        out = fmadd4(g2, bx2, out);
        out = fmadd4(g3, bx3, out);
        // terms for Y[jj..jj+3]: each group of the block against X[row..]
        acc0 = fmadd4(g0, xrow, acc0);
        acc1 = fmadd4(g1, xrow, acc1);
        acc2 = fmadd4(g2, xrow, acc2);
        acc3 = fmadd4(g3, xrow, acc3);
        store4(Y+row, out);
    }
    // horizontal sums of acc0 = { Y0 Y0 Y0 Y0 }, acc1 = { Y1 Y1 Y1 Y1 },
    // acc2 = { Y2 Y2 Y2 Y2 } and acc3 = { Y3 Y3 Y3 Y3 }, gathered in one vector
    const vec4 part01 = add4(unpacklo4(acc0, acc1), unpackhi4(acc0, acc1));
    const vec4 part23 = add4(unpacklo4(acc2, acc3), unpackhi4(acc2, acc3));
    const vec4 total = add4(catshift2(part01, part23), blend22(part01, part23));
    store4(Y+jj, add4(load4(Y+jj), total));
}
#endif


//------------------------------------------------------------------------------
#pragma mark - Matrix-Vector Add-multiply

/*
 Selection of the Column member functions called by SparMatSymBlk::vecMulAdd()
 for block sizes 2, 3 and 4. The choice depends on the SIMD flags set at the top
 of this file and on the precision of 'real':
 - AVX, double precision: the AVX kernels are used in all three cases;
 - SSE, double precision: only the 2D case uses a SIMD kernel;
 - SSE, otherwise: only the 3D case uses a SIMD kernel;
 - no SIMD: vecMulAdd2D, vecMulAdd3D and vecMulAdd4D are used.
 The names without suffix are presumably the portable scalar implementations.
 */
#if SMSB_USES_AVX && REAL_IS_DOUBLE
#   define VECMULADD2D vecMulAdd2D_AVXU
#   define VECMULADD3D vecMulAdd3D_AVXU
#   define VECMULADD4D vecMulAdd4D_AVX
#elif SMSB_USES_SSE && REAL_IS_DOUBLE
#   define VECMULADD2D vecMulAdd2D_SSE
#   define VECMULADD3D vecMulAdd3D
#   define VECMULADD4D vecMulAdd4D
#elif SMSB_USES_SSE
#   define VECMULADD2D vecMulAdd2D
#   define VECMULADD3D vecMulAdd3D_SSEU
#   define VECMULADD4D vecMulAdd4D
#else
#   define VECMULADD2D vecMulAdd2D
#   define VECMULADD3D vecMulAdd3D
#   define VECMULADD4D vecMulAdd4D
#endif


/**
 Multiplication of a vector: Y = Y + M * X, for the columns of index in
 [start, stop), using the kernels selected by the VECMULADD macros.

 The traversal starts at `colidx_[start]` and jumps from a column `jj` to
 `colidx_[jj+1]`, which presumably skips the empty columns. The alternative
 is a plain scan: `for ( jj = start; jj < stop; jj += BLOCK_SIZE )` calling
 the kernel only `if ( !column_[jj].empty() )`.
 */
void SparMatSymBlk::vecMulAdd(const real* X, real* Y, size_t start, size_t stop) const
{
    assert_true( start <= stop );
    assert_true( stop <= size_ );
    size_t jj = colidx_[start];
    while ( jj < stop )
    {
        //std::clog << "SparMatSymBlk column " << jj << "  " << size_ << " \n";
#if ( BLOCK_SIZE == 1 )
        column_[jj].vecMulAdd1D(X, Y, jj);
#elif ( BLOCK_SIZE == 2 )
        column_[jj].VECMULADD2D(X, Y, jj);
#elif ( BLOCK_SIZE == 3 )
        column_[jj].VECMULADD3D(X, Y, jj);
#elif ( BLOCK_SIZE == 4 )
        column_[jj].VECMULADD4D(X, Y, jj);
#endif
        // next column to process
        jj = colidx_[jj+1];
    }
}


/**
 Multiplication of a vector: Y = Y + M * X, over the whole matrix.
 Unlike vecMulAdd(), this always calls the kernels without suffix
 (vecMulAdd1D ... vecMulAdd4D), whatever SIMD flags are set.
 */
void SparMatSymBlk::vecMulAdd_ALT(const real* X, real* Y) const
{
    size_t c = colidx_[0];
    while ( c < size_ )
    {
        //std::clog << "SparMatSymBlk column " << c << "  " << size_ << " \n";
#if ( BLOCK_SIZE == 1 )
        column_[c].vecMulAdd1D(X, Y, c);
#elif ( BLOCK_SIZE == 2 )
        column_[c].vecMulAdd2D(X, Y, c);
#elif ( BLOCK_SIZE == 3 )
        column_[c].vecMulAdd3D(X, Y, c);
#elif ( BLOCK_SIZE == 4 )
        column_[c].vecMulAdd4D(X, Y, c);
#endif
        // jump to the next column to process
        c = colidx_[c+1];
    }
}


/**
 Multiplication of a vector: Y = Y + M * X, over the whole matrix, with the
 kernels without suffix, while counting the columns and blocks processed.
 The counts are only used by the timing report, which is currently disabled.
 */
void SparMatSymBlk::vecMulAdd_TIME(const real* X, real* Y) const
{
    size_t numBlocks = 0;
    size_t numColumns = 0;
    //auto rdt = timer();
    size_t c = colidx_[0];
    while ( c < size_ )
    {
        ++numColumns;
        numBlocks += column_[c].size_;
        //std::clog << "SparMatSymBlk column " << c << "  " << size_ << " \n";
#if ( BLOCK_SIZE == 1 )
        column_[c].vecMulAdd1D(X, Y, c);
#elif ( BLOCK_SIZE == 2 )
        column_[c].vecMulAdd2D(X, Y, c);
#elif ( BLOCK_SIZE == 3 )
        column_[c].vecMulAdd3D(X, Y, c);
#elif ( BLOCK_SIZE == 4 )
        column_[c].vecMulAdd4D(X, Y, c);
#endif
        // jump to the next column to process
        c = colidx_[c+1];
    }
    /*
    if ( numBlocks > 0 )
        fprintf(stderr, "SMSB %6lu rows %6lu blocks  cycles/block: %5.2f\n",\
                numColumns, numBlocks, real(timer()-rdt)/numBlocks);
     */
}

//------------------------------------------------------------------------------
#pragma mark - Vector Multiplication

/**
 Multiplication of a vector: Y = M * X.
 The first `size_` values of Y are set to zero, and the product is then
 accumulated with vecMulAdd() over all the columns.
 */
void SparMatSymBlk::vecMul(const real* X, real* Y) const
{
    const auto dim = size_;
    // clear the destination, since vecMulAdd() adds to its current content
    zero_real(dim, Y);
    vecMulAdd(X, Y, 0, dim);
}
