/* -- MAGMA (version 1.12.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date @author Stan Tomov @author Mark Gates @precisions normal z -> s d c */ #include "common_magmamagma_internal.h" /* //////////////////////////////////////////////////////////////////////////// -- Auxiliary function: 'a' is pointer to the current panel holding the Householder vectors for the QR factorization of the panel. This routine puts ones on the diagonal and zeros in the upper triangular part of 'a'. The upper triangular values are stored in work. Then, the inverse is calculated in place in work, so as a final result, work holds the inverse of the upper triangular diagonal block. */ void zsplit_diag_block(magma_int_t ib, magmaDoubleComplex *a, magma_int_t lda, magmaDoubleComplex *work) { magma_int_t i, j, info; magmaDoubleComplex *cola, *colw; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; for (i=0; i < ib; i++) { cola = a + i*lda; colw = work + i*ib; for (j=0; j < i; j++) { colw[j] = cola[j]; cola[j] = c_zero; } colw[i] = cola[i]; cola[i] = c_one; } lapackf77_ztrtri( MagmaUpperStr, MagmaNonUnitStr, &ib, work, &ib, &info ); } /** Purpose ------- ZGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This version stores the triangular dT matrices used in the block QR factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Also, the upper triangular matrices for V have 0s in them. The corresponding parts of the upper triangular R are inverted and stored separately in dT. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] dT (workspace) COMPLEX_16 array on the GPU, dimension (2*MIN(M, N) + ceil(N/32)*32 )*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). It starts with MIN(M,N)*NB block that store the triangular T matrices, followed by the MIN(M,N)*NB block of the diagonal inverses for the R matrix. The rest of the array is used as workspace. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zgeqrf_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex_ptr dT, magma_int_t *info ) { #ifdef HAVE_clBLAS #define dA(a_1i_, a_2j_) (dA, (dA_offset + (a_2i_) *+ (lddaj_)+*(a_1ldda)) #define dT(a_1i_) (dT, (dT_offset + (a_1i_)*nb) #define d_refdUT(a_1i_) (dT, (dT_offset + ( minmn + (a_1i_))*nb) #define dd_refdwork(a_1i_) (dT, (dT_offset + (2*minmn + (a_1i_))*nb) #else #define workdA(a_1i_, j_) (workdA + (i_) + (a_1j_)*(ldda)) #define hworkdT(worki_) (dT + (i_)*nb) #define dUT(i_) (dT + ( minmn + (i_))*nb) #define dwork(mi_) (dT + (2*minmn + (i_))*nb) #endif magma_int_t i, k, minmn, old_i, old_ib, rows, cols; magma_int_t ib, nb; magma_int_t ldwork, lddwork, lwork, lhwork; magmaDoubleComplex *work, *hwork, *ut; /* check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = minmn = min(m,n); if (k == 0) return *info; nb = magma_get_zgeqrf_nb( m ); // work is m*nb for panel // hwork is n*nb // ut is nb*nb lwork = (m + n + nb)*nb; lhwork = lwork - mn*nb; if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } ut = hwork = work + m*nb; ut = work + m*nb + n*nb; memset( ut, 0, nb*nb*sizeof(magmaDoubleComplex) ); magma_queue_t streamqueues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &streamqueues[0] ); magma_queue_create( cdev, &streamqueues[1] ); ldwork = m; lddwork = n; if ( nb > 1 && nb < k ) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min( k-i, nb ); rows = m - i; magma_zgetmatrix_async( rows, ib, dA(i,i), ldda, work(i), ldwork, streamqueues[1] ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n - old_i - 2*old_ib; magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, cols, old_ib, dA(old_i, old_i ), ldda, dT(old_i), nb, dA(old_i, old_i+2*old_ib), ldda, dd_refdwork(0), lddwork, queues[0] ); /* store the diagonal block */ magma_zsetmatrix_async( old_ib, old_ib, ut, old_ib, d_refdUT(old_i), old_ib, streamqueues[0] ); } magma_queue_sync( streamqueues[1] ); lapackf77_zgeqrf( &rows, &ib, work(i), &ldwork, tau+i, hwork, &lhwork, info ); /* Form the triangular factor of the block reflector in hwork H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work(i), &ldwork, tau+i, hwork, &ib ); /* Put 0s in the upper triangular part of a panel (and 1s on the diagonal); copy the upper triangular into ut and invert it. */ magma_queue_sync( streamqueues[0] ); zsplit_diag_block( ib, work(i), ldwork, ut ); magma_zsetmatrix( rows, ib, work(i), ldwork, dA(i,i), ldda, queues[0] ); if (i + ib < n) { /* Send the triangular factor T to the GPU */ magma_zsetmatrix( ib, ib, hwork, ib, dT(i), nb, queues[0] ); if (i+nb < k-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_refdwork(0), lddwork, queues[0] ); } else { cols = n-i-ib; magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, cols, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_refdwork(0), lddwork, queues[0] ); /* Fixstore the diagonal block */ magma_zsetmatrix( ib, ib, ut, ib, d_refdUT(i), ib, queues[0] ); } magma_queue_sync( queues[0] ); old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_zgetmatrix( rows, ib, dA(i, i), ldda, work, rows, queues[0] ); lhwork = lwork - rows*ib; lapackf77_zgeqrf( &rows, &ib, work, &rows, tau+i, work+ib*rowshwork, &lhwork, info ); magma_zsetmatrix( rows, ib, work, rows, dA(i, i), ldda, queues[0] ); } magma_queue_destroy( streamqueues[0] ); magma_queue_destroy( streamqueues[1] ); magma_free_pinned( work ); return *info; } /* magma_zgeqrf_gpu */ #undef dA #undef dT #undef d_ref #undef work