! ***********************************************************************
! Copyright (C) 2012  Bill Paxton
! This file is part of MESA.
! MESA is free software; you can redistribute it and/or modify
! it under the terms of the GNU General Library Public License as published
! by the Free Software Foundation; either version 2 of the License, or
! (at your option) any later version.
! MESA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
! GNU Library General Public License for more details.
! You should have received a copy of the GNU Library General Public License
! along with this software; if not, write to the Free Software
! Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
! ***********************************************************************

! derived from BCYCLIC written hirshman et. al.
! S.P.Hirshman, K.S.Perumalla, V.E.Lynch, & R.Sanchez,
! BCYCLIC: A parallel block tridiagonal matrix cyclic solver,
! J. Computational Physics, 229 (2010) 6392-6404.


      module star_bcyclic

      use star_private_def
      use const_def, only: dp, qp, ln10
      use mtx_lib, only: num_klu_ipar_decsol, num_klu_rpar_decsol, &
         klu_dble_decsols_nrhs_0_based, dense_to_col_with_diag_0_based, &
         block_thomas_klu_decsolblk
      use utils_lib, only: fill_with_NaNs
      
      implicit none

      integer, parameter :: lid = num_klu_ipar_decsol 
      integer, parameter :: lrd = num_klu_rpar_decsol
      
      !logical, parameter :: use_block_thomas = .true.
      logical, parameter :: use_block_thomas = .false.

      logical, parameter :: dbg = .false.
      logical, parameter :: do_fill_with_NaNs = .false.
      
      !logical, parameter :: test_sparse = .true.
      logical, parameter :: test_sparse = .false.
      integer, parameter :: k_debug = -1 ! 68
      real(dp), parameter :: test_sparse_tol = 1d-2
      real(dp) :: test_mtx(184,184), test_rhs(184) ! for use with mesa_179.net
      
      
      contains


      subroutine bcyclic_factor ( &
            s, lblk1, dblk1, dblk1_qp, ublk1, &
            ipivot1, brhs1, nvar, nz, sparse, iter, &
            lrd, rpar_decsol, lid, ipar_decsol, ierr)
         type (star_info), pointer :: s
         real(dp), pointer :: lblk1(:) ! row section of lower block
         real(dp), pointer :: dblk1(:) ! row section of diagonal block
         real(qp), pointer :: dblk1_qp(:) ! row section of diagonal block
         real(dp), pointer :: ublk1(:) ! row section of upper block
         integer, pointer :: ipivot1(:) 
            ! row section of pivot array for block factorization
         real(dp), pointer :: brhs1(:) ! row section of rhs
         integer, intent(in) :: nvar ! linear size of each block
         integer, intent(in) :: nz ! number of block rows
         logical, intent(in) :: sparse
         integer, intent(in) :: lrd, lid, iter
         real(dp), pointer, intent(inout) :: rpar_decsol(:) ! (lrd)
         integer, pointer, intent(inout) :: ipar_decsol(:) ! (lid)
         integer, intent(out) :: ierr
      
         integer, pointer :: iptr(:,:), nslevel(:), ipivot(:)
         integer :: ncycle, nstemp, maxlevels, nlevel, i, j, k
         logical :: have_odd_storage, have_klu_storage, use_quad
         real(dp), pointer, dimension(:,:) :: dmat
         real(qp), pointer, dimension(:,:) :: dmat_qp
         real(dp), pointer, dimension(:) :: brhs

         include 'formats'
            
         ierr = 0      
         
         if (sparse .and. use_block_thomas) then
            nullify(brhs)
            write(*,*) 'call block_thomas_klu_decsolblk to factor'
            call block_thomas_klu_decsolblk( &
               0,0,nvar,nz,lblk1,dblk1,ublk1,brhs,ipivot1,&
               lrd,rpar_decsol,lid,ipar_decsol,ierr)
            return
         end if
         
         if (dbg) write(*,*) 'start bcyclic_factor'

         ! compute number of cyclic reduction levels
         ncycle = 1
         maxlevels = 0
         do while (ncycle < nz)
            ncycle = 2*ncycle
            maxlevels = maxlevels+1
         end do
         maxlevels = max(1, maxlevels)
      
         have_odd_storage = associated(s% bcyclic_odd_storage)
         if (have_odd_storage) then
            if (size(s% bcyclic_odd_storage) < maxlevels) then
               call clear_storage(s)
               have_odd_storage = .false.
            end if
         end if

         if (.not. have_odd_storage) then
            allocate (s% bcyclic_odd_storage(maxlevels+3), stat=ierr)
            if (ierr /= 0) then
               write(*,*) 'alloc failed for odd_storage in bcyclic'
               return
            end if
            do nlevel = 1, size(s% bcyclic_odd_storage)
               s% bcyclic_odd_storage(nlevel)% ul_size = 0
            end do
         end if

         allocate (nslevel(maxlevels), stat=ierr)
         if (ierr /= 0) return
      
         if (sparse) then
            have_klu_storage = associated(s% bcyclic_sprs_storage)
            if (have_klu_storage) then
               if (size(s% bcyclic_sprs_storage) < nz) then
                  write(*,*) 'call clear_klu_storage'
                  call clear_klu_storage(s)
                  have_klu_storage = .false.
               end if
            end if
            if (.not. have_klu_storage) then
               write(*,2) 'allocate bcyclic_sprs_storage', nz*2 + 1000
               allocate (s% bcyclic_sprs_storage(nz*2 + 1000), stat=ierr)
               if (ierr /= 0) then
                  write(*,*) 'alloc failed for klu_storage in bcyclic'
                  return
               end if
               do k = 1, size(s% bcyclic_sprs_storage)
                  s% bcyclic_sprs_storage(k)% sprs_nonzeros = -1
                  s% bcyclic_sprs_storage(k)% ia => null()
                  s% bcyclic_sprs_storage(k)% ja => null()
                  s% bcyclic_sprs_storage(k)% values => null()
               end do
            end if    
         end if  

         ncycle = 1
         nstemp = nz
         nlevel = 1

         if (dbg) write(*,*) 'start factor_cycle'

         factor_cycle: do ! perform cyclic-reduction factorization

            nslevel(nlevel) = nstemp
            
            if (dbg) write(*,2) 'call cycle_onestep', nstemp

            call cycle_onestep( &
               s, nvar, nz, nstemp, ncycle, nlevel, sparse, iter, &
               lblk1, dblk1, dblk1_qp, ublk1, ipivot1, ierr)
            if (ierr /= 0) then
               !write(*,*) 'cycle_onestep failed'
               call dealloc
               return
            end if

            if (nstemp == 1) exit
         
            nstemp = (nstemp+1)/2
            nlevel = nlevel+1
            ncycle = 2*ncycle

            if (nlevel > maxlevels) exit

         end do factor_cycle

         if (dbg) write(*,*) 'done factor_cycle'
      
         ! factor row 1
         dmat(1:nvar,1:nvar) => dblk1(1:nvar*nvar)
         use_quad = (s% lnT(1)/ln10 >= s% min_logT_for_quad)
         if (use_quad) then
            dmat_qp(1:nvar,1:nvar) => dblk1_qp(1:nvar*nvar)
         else
            nullify(dmat_qp)
         end if
         
         if (sparse) then
            call sparse_factor(s, 1, nvar, iter, dmat, ierr)
            if (ierr /= 0) then
               write(*,*) 'row 1 factor failed in sparse_factor'
               call dealloc
               return
            end if
         end if
         
         if ((.not. sparse) .or. (sparse .and. test_sparse)) then
            ipivot(1:nvar) => ipivot1(1:nvar)
            call dense_factor( &
               s, 1, nvar, dmat, dmat_qp, ipivot, use_quad, ierr)
            !call my_getf2(nvar, dmat, nvar, ipivot, ierr)         
            if (ierr /= 0) then
               write(*,*) 'row 1 factor failed in my_getf2'
               call dealloc
               return
            end if
         end if
      
         call dealloc
         
         if (dbg) write(*,*) 'done bcyclic_factor'
      
         contains 
      
         subroutine dealloc
            deallocate (nslevel)
         end subroutine dealloc
      
      
      end subroutine bcyclic_factor


      subroutine bcyclic_solve ( &
            s, lblk1, dblk1, dblk1_qp, ublk1, ipivot1, &
            brhs1, nvar, nz, sparse, &
            lrd, rpar_decsol, lid, ipar_decsol, ierr)
         type (star_info), pointer :: s
         real(dp), pointer :: lblk1(:) ! row section of lower block
         real(dp), pointer :: dblk1(:) ! row section of diagonal block
         real(qp), pointer :: dblk1_qp(:) ! row section of diagonal block
         real(dp), pointer :: ublk1(:) ! row section of upper block
         integer, pointer :: ipivot1(:) 
            ! row section of pivot array for block factorization
         real(dp), pointer :: brhs1(:)   ! row section of rhs
         integer, intent(in) :: nvar ! linear size of each block
         integer, intent(in) :: nz     ! number of block rows
         logical, intent(in) :: sparse
         integer, intent(in) :: lrd, lid
         real(dp), pointer, intent(inout) :: rpar_decsol(:) ! (lrd)
         integer, pointer, intent(inout) :: ipar_decsol(:) ! (lid)
         integer, intent(out) :: ierr
      
         integer, pointer :: iptr(:,:), nslevel(:), ipivot(:)
         integer :: ncycle, nstemp, maxlevels, nlevel, nvar2, i
         real(dp), pointer, dimension(:,:) :: dmat
         real(qp), pointer, dimension(:,:) :: dmat_qp
         real(dp), pointer, dimension(:) :: bptr1
         real(dp), target, dimension(nvar) :: b_array
         logical :: okay, use_quad

         include 'formats'
      
         
         if (dbg) write(*,*) 'start bcyclic_solve'

         ierr = 0      

         
         if (sparse .and. use_block_thomas) then
            write(*,*) 'call block_thomas_klu_decsolblk to solve'
            call block_thomas_klu_decsolblk( &
               1,0,nvar,nz,lblk1,dblk1,ublk1,brhs1,ipivot1,&
               lrd,rpar_decsol,lid,ipar_decsol,ierr)
            return
         end if


         nvar2 = nvar*nvar
         ncycle = 1
         maxlevels = 0
         do while (ncycle < nz)
            ncycle = 2*ncycle
            maxlevels = maxlevels+1
         end do
         maxlevels = max(1, maxlevels)

         allocate (nslevel(maxlevels), stat=ierr)
         if (ierr /= 0) return

         ncycle = 1
         nstemp = nz
         nlevel = 1
         
         if (dbg) write(*,*) 'start forward_cycle'

         forward_cycle: do

            nslevel(nlevel) = nstemp
            if (dbg) write(*,2) 'call cycle_rhs', nstemp
            call cycle_rhs( &
               s, nstemp, nvar, ncycle, nlevel, &
               sparse, dblk1, dblk1_qp, brhs1, ipivot1, ierr)
            if (ierr /= 0) then
               call dealloc
               return
            end if

            if (nstemp == 1) exit
         
            nstemp = (nstemp+1)/2
            nlevel = nlevel+1
            ncycle = 2*ncycle

            if (nlevel > maxlevels) exit

         end do forward_cycle
         
         if (dbg) write(*,*) 'done forward_cycle'
         
         dmat(1:nvar,1:nvar) => dblk1(1:nvar2)
         use_quad = (s% lnT(1)/ln10 >= s% min_logT_for_quad)
         if (use_quad) then
            dmat_qp(1:nvar,1:nvar) => dblk1_qp(1:nvar2)
         else
            nullify(dmat_qp)
         end if
         if ((.not. sparse) .or. (sparse .and. test_sparse)) then
            ipivot(1:nvar) => ipivot1(1:nvar)
            bptr1(1:nvar) => brhs1(1:nvar)
            if (sparse) then ! copy rhs to temp vector
               do i=1,nvar
                  b_array(i) = bptr1(i)
               end do
               bptr1(1:nvar) => b_array(1:nvar)
            end if
            call dense_solve1( &
               s, 1, nvar, bptr1, dmat, dmat_qp, ipivot, &
               use_quad, .false., ierr)
            if (ierr /= 0) then
               write(*,*) 'failed in my_getrs1'
               call dealloc
               return
            end if
         end if
         
         if (sparse) then
            call sparse_solve1(s,1,nvar,brhs1,dmat,ierr)
            if (ierr /= 0) then
               write(*,*) 'failed in sparse_solve'
               call dealloc
               return
            end if
            if (test_sparse) then ! compare to dense answer saved in b_array
               okay = .true.
               do i=1,nvar
                  if (abs(b_array(i) - brhs1(i)) > &
                        test_sparse_tol* &
                           max(test_sparse_tol, min(abs(b_array(i)), abs(brhs1(i))))) then
                     write(*,3) '(b_array(i) - brhs1(i)/max...', i, 1, &
                        (b_array(i) - brhs1(i))/ &
                           max(test_sparse_tol, min(abs(b_array(i)), abs(brhs1(i)))), &
                        b_array(i) - brhs1(i), b_array(i), brhs1(i)
                     okay = .false.
                  end if
               end do
               if (.not. okay) stop 'problem in sparse solve in bcyclic'
            end if
         end if
      
         ! back solve for even x's
         back_cycle: do while (ncycle > 1)      
            ncycle = ncycle/2
            nlevel = nlevel-1
            if (nlevel < 1) then
               ierr = -1
               exit
            end if
            nstemp = nslevel(nlevel)
            call cycle_solve( &
               s, nvar, nz, ncycle, nstemp, nlevel, &
               sparse, lblk1, ublk1, brhs1)
         end do back_cycle
      
         call dealloc
         
         if (dbg) write(*,*) 'done bcyclic_solve'
      
      
         contains 
      
      
         subroutine dealloc
            deallocate (nslevel)
         end subroutine dealloc


      end subroutine bcyclic_solve
      
      
      subroutine clear_storage(s)
         type (star_info), pointer :: s
         integer :: nlevel
         nlevel = size(s% bcyclic_odd_storage)
         do while (nlevel > 0)
            if (s% bcyclic_odd_storage(nlevel)% ul_size > 0) then
               deallocate(s% bcyclic_odd_storage(nlevel)% umat1)
               deallocate(s% bcyclic_odd_storage(nlevel)% lmat1)
            end if
            nlevel = nlevel-1
         end do
         deallocate(s% bcyclic_odd_storage)
         nullify(s% bcyclic_odd_storage)
      end subroutine clear_storage
      
      
      subroutine clear_klu_storage(s)
         type (star_info), pointer :: s
         integer :: k
         do k = 1, size(s% bcyclic_sprs_storage)
            if (associated(s% bcyclic_sprs_storage(k)% ia)) &
               deallocate(s% bcyclic_sprs_storage(k)% ia)
            if (associated(s% bcyclic_sprs_storage(k)% ja)) &
               deallocate(s% bcyclic_sprs_storage(k)% ja)
            if (associated(s% bcyclic_sprs_storage(k)% values)) &
               deallocate(s% bcyclic_sprs_storage(k)% values)
         end do
         deallocate(s% bcyclic_sprs_storage)
         nullify(s% bcyclic_sprs_storage)
      end subroutine clear_klu_storage


      subroutine cycle_onestep( &
            s, nvar, nz, nblk, ncycle, nlevel, sparse, iter, &
            lblk1, dblk1, dblk1_qp, ublk1, ipivot1,  ierr)
         type (star_info), pointer :: s
         integer, intent(in) :: nvar, nz, nblk, ncycle, nlevel, iter
         logical, intent(in) :: sparse
         real(dp), pointer, dimension(:), intent(inout) :: &
            lblk1, dblk1, ublk1
         real(qp), pointer, dimension(:) :: dblk1_qp
         integer, pointer, intent(out) :: ipivot1(:)
         integer, intent(out) :: ierr
     
         integer, pointer :: ipivot(:)
         real(dp), pointer, dimension(:,:) :: dmat, umat, lmat, umat0, lmat0
         real(qp), pointer, dimension(:,:) :: dmat_qp
         real(dp), pointer, dimension(:,:) :: lnext, unext, lprev, uprev
         real(dp), pointer, dimension(:) :: mat1
         integer :: i, j, shift, min_sz, new_sz, shift1, shift2, nvar2, &
            ns, ierr_loc, nmin, kcount, k, ii, jj, kk
         logical :: use_quad
      
         include 'formats'

         ierr = 0
         nvar2 = nvar*nvar     
         nmin = 1
         kcount = 1+(nblk-nmin)/2
         min_sz = nvar2*kcount
         if (s% bcyclic_odd_storage(nlevel)% ul_size < min_sz) then
            if (s% bcyclic_odd_storage(nlevel)% ul_size > 0) &
               deallocate( &
                  s% bcyclic_odd_storage(nlevel)% umat1, &
                  s% bcyclic_odd_storage(nlevel)% lmat1)         
            new_sz = min_sz*1.1 + 100
            s% bcyclic_odd_storage(nlevel)% ul_size = new_sz
            allocate (s% bcyclic_odd_storage(nlevel)% umat1(new_sz), &
                      s% bcyclic_odd_storage(nlevel)% lmat1(new_sz), stat=ierr)
            if (ierr /= 0) then
               write(*,*) 'allocation error in cycle_onestep'
               return
            end if
         end if

!$OMP PARALLEL DO private(ns,kcount,shift,shift2,i)
         do ns = nmin, nblk, 2  ! copy umat and lmat
            kcount = (ns-nmin)/2 + 1
            shift = nvar2*(kcount-1)
            shift2 = nvar2*ncycle*(ns-1)
            do i=1,nvar2
               s% bcyclic_odd_storage(nlevel)% umat1(shift+i) = ublk1(shift2+i)
               s% bcyclic_odd_storage(nlevel)% lmat1(shift+i) = lblk1(shift2+i)
            end do
         end do
!$OMP END PARALLEL DO

         if (nvar2*kcount > s% bcyclic_odd_storage(nlevel)% ul_size) then
            write(*,*) 'nvar2*kcount > ul_size in cycle_onestep'
            ierr = -1
            return
         end if
         
         if (dbg) write(*,*) 'start lu factorization'
         ! compute lu factorization of even diagonal blocks
         nmin = 2         
!$OMP PARALLEL DO SCHEDULE(static,3) &
!$OMP PRIVATE(ipivot,dmat,dmat_qp,use_quad,ns,ierr_loc,shift1,shift2,i,j,k)
         do ns = nmin, nblk, 2
         
            k = ncycle*(ns-1) + 1
            shift1 = nvar*(k-1)
            shift2 = nvar*shift1
            dmat(1:nvar,1:nvar) => dblk1(shift2+1:shift2+nvar2)
            use_quad = (s% lnT(k)/ln10 >= s% min_logT_for_quad)
            if (use_quad) then
               dmat_qp(1:nvar,1:nvar) => dblk1_qp(shift2+1:shift2+nvar2)
            else
               nullify(dmat_qp)
            end if

            ierr_loc = 0

            if (sparse) then
               call sparse_factor(s, k, nvar, iter, dmat, ierr_loc)
               if (ierr_loc /= 0) then
                  ierr = ierr_loc
               end if
            end if
            
            if ((.not. sparse) .or. (sparse .and. test_sparse)) then
               if (k == k_debug) then
                  do j=1,nvar
                     do i=1,nvar
                        test_mtx(i,j) = dmat(i,j)
                     end do
                  end do
               end if
               ipivot(1:nvar) => ipivot1(shift1+1:shift1+nvar)
               call dense_factor( &
                  s, k, nvar, dmat, dmat_qp, ipivot, use_quad, ierr_loc)
               !call my_getf2(nvar, dmat, nvar, ipivot, ierr_loc)         
               if (ierr_loc /= 0) then
                  ierr = ierr_loc
               end if
            end if
            
         end do 
!$OMP END PARALLEL DO
	      if (ierr /= 0) then
	         !write(*,*) 'factorization failed in bcyclic'
	         return
	      end if
	      
         if (dbg) write(*,*) 'done lu factorization; start solve'

!$OMP PARALLEL DO SCHEDULE(static,3) &
!$OMP PRIVATE(ns,k,shift1,shift2,ipivot,dmat,dmat_qp,use_quad,umat,lmat,mat1,i,j,ierr_loc)
         do ns = nmin, nblk, 2       
            ! compute new l=-d[-1]l, u=-d[-1]u for even blocks
            k = ncycle*(ns-1) + 1
            shift1 = nvar*(k-1)
            shift2 = nvar*shift1
            
            lmat(1:nvar,1:nvar) => lblk1(shift2+1:shift2+nvar2)

            if ((.not. sparse) .or. (sparse .and. test_sparse)) then
               ipivot(1:nvar) => ipivot1(shift1+1:shift1+nvar)
               dmat(1:nvar,1:nvar) => dblk1(shift2+1:shift2+nvar2)
               use_quad = (s% lnT(k)/ln10 >= s% min_logT_for_quad)
               if (use_quad) then
                  dmat_qp(1:nvar,1:nvar) => dblk1_qp(shift2+1:shift2+nvar2)
               else
                  nullify(dmat_qp)
               end if
               call dense_solve( &
                  s, k, nvar, dmat, dmat_qp, ipivot, lmat, &
                  use_quad, ierr_loc)
               !call my_getrs(nvar, nvar, dmat, nvar, ipivot, lmat, nvar, ierr_loc)
               if (ierr_loc /= 0) then
                  ierr = ierr_loc
                  cycle
               end if
            end if

            if (sparse) then
               mat1(1:nvar2) => lblk1(shift2+1:shift2+nvar2)
               call sparse_solve(s, k, nvar, nvar, mat1, ierr_loc)
               if (ierr_loc /= 0) then
                  ierr = ierr_loc
                  cycle
               end if
            end if

            do j=1,nvar
               do i=1,nvar
                  lmat(i,j) = -lmat(i,j)
               end do
            end do

            umat(1:nvar,1:nvar) => ublk1(shift2+1:shift2+nvar2)
            
            if ((.not. sparse) .or. (sparse .and. test_sparse)) then
               call dense_solve(s, k, nvar, dmat, dmat_qp, ipivot, umat, &
                  use_quad, ierr_loc)
               !call my_getrs(nvar, nvar, dmat, nvar, ipivot, umat, nvar, ierr_loc)
               if (ierr_loc /= 0) then
                  ierr = ierr_loc
                  cycle
               end if
            end if
            
            if (sparse) then
               mat1(1:nvar2) => ublk1(shift2+1:shift2+nvar2)
               call sparse_solve(s,k,nvar,nvar,mat1,ierr_loc)
               if (ierr_loc /= 0) then
                  ierr = ierr_loc
                  cycle
               end if
            end if
            
            do j=1,nvar
               do i=1,nvar
                  umat(i,j) = -umat(i,j)
               end do
            end do

         end do 
!$OMP END PARALLEL DO
         if (dbg) write(*,*) 'done solve'

         if (ierr /= 0) return

         ! compute new odd blocks in terms of even block factors
         ! compute odd hatted matrix elements except at boundaries
         nmin = 1
!$OMP PARALLEL DO SCHEDULE(static,3) &
!$OMP PRIVATE(i,ns,shift2,dmat,umat,lmat,lnext,unext,lprev,uprev,kcount,shift,umat0,lmat0,k)
         do i= 1, 3*(1+(nblk-nmin)/2)
         
            ns = 2*((i-1)/3) + nmin
            k = ncycle*(ns-1) + 1
            shift2 = nvar2*(k-1)
            dmat(1:nvar,1:nvar) => dblk1(shift2+1:shift2+nvar2)
            umat(1:nvar,1:nvar) => ublk1(shift2+1:shift2+nvar2)
            lmat(1:nvar,1:nvar) => lblk1(shift2+1:shift2+nvar2)

            if (ns < nblk) then
               shift2 = nvar2*ncycle*ns
               lnext(1:nvar,1:nvar) => lblk1(shift2+1:shift2+nvar2)
               unext(1:nvar,1:nvar) => ublk1(shift2+1:shift2+nvar2)
            end if

            if (ns > 1) then
               shift2 = nvar2*ncycle*(ns-2)
               lprev(1:nvar,1:nvar) => lblk1(shift2+1:shift2+nvar2)
               uprev(1:nvar,1:nvar) => ublk1(shift2+1:shift2+nvar2)
            end if

            kcount = 1+(ns-nmin)/2
            shift = nvar2*(kcount-1)
            lmat0(1:nvar,1:nvar) => &
               s% bcyclic_odd_storage(nlevel)% lmat1(shift+1:shift+nvar2)
            umat0(1:nvar,1:nvar) => &
               s% bcyclic_odd_storage(nlevel)% umat1(shift+1:shift+nvar2)

            select case(mod(i-1,3))
            case (0)
               if (ns > 1) then
                  ! lmat = matmul(lmat0, lprev)
                  call my_gemm0_p1(nvar,nvar,nvar,lmat0,nvar,lprev,nvar,lmat,nvar)
               end if  
            case (1)
               if (ns < nblk) then
                  ! umat = matmul(umat0, unext)
                  call my_gemm0_p1(nvar,nvar,nvar,umat0,nvar,unext,nvar,umat,nvar)
               end if
            case (2)
               if (ns < nblk) then
                  if (ns > 1) then
                     ! dmat = dmat + matmul(umat0, lnext) + matmul(lmat0,uprev)
                     call my_gemm_plus_mm(nvar,nvar,nvar,umat0,lnext,lmat0,uprev,dmat)
                  else
                     ! dmat = dmat + matmul(umat0, lnext)
                     call my_gemm_p1(nvar,nvar,nvar,umat0,nvar,lnext,nvar,dmat,nvar)
                  end if
               else if (ns > 1) then
                  ! dmat = dmat + matmul(lmat0,uprev)
                  call my_gemm_p1(nvar,nvar,nvar,lmat0,nvar,uprev,nvar,dmat,nvar)
               end if  
            end select

         end do
!$OMP END PARALLEL DO
         if (dbg) write(*,*) 'done cycle_onestep'

      end subroutine cycle_onestep


      subroutine cycle_rhs( &
            s, nblk, nvar, ncycle, nlevel, sparse, &
            dblk1, dblk1_qp, brhs1, ipivot1, ierr)
         use chem_def, only: chem_isos
         type (star_info), pointer :: s
         integer, intent(in) :: nblk, nvar, ncycle, nlevel
         logical, intent(in) :: sparse
         real(dp), pointer, intent(in) :: dblk1(:)
         real(qp), pointer, intent(in) :: dblk1_qp(:)
         real(dp), pointer, intent(inout) :: brhs1(:)
         integer, pointer, intent(in) :: ipivot1(:)
         integer, intent(out) :: ierr
      
         integer :: i, k, ns, ierr_loc, nmin, kcount, shift, shift1, shift2, nvar2
         integer, pointer :: ipivot(:)
         real(dp), pointer, dimension(:,:) :: dmat, umat, lmat
         real(qp), pointer, dimension(:,:) :: dmat_qp
         real(dp), pointer, dimension(:) :: bprev, bnext, bptr
         real(dp), target :: b_array(nvar)
         logical :: okay, use_quad
      
         include 'formats'
      
         ierr = 0
         nvar2 = nvar*nvar
         ! compute dblk[-1]*brhs for even indices and store in brhs(even)
         nmin = 2
   	   ierr_loc = 0
!$OMP PARALLEL DO SCHEDULE(static,3) &
!$OMP PRIVATE(ns,shift1,ipivot,shift2,k,dmat,dmat_qp,use_quad,bptr,i,b_array,okay,ierr_loc)
         do ns = nmin, nblk, 2
            k = ncycle*(ns-1) + 1
            shift1 = nvar*(k-1)
            shift2 = nvar*shift1
            dmat(1:nvar,1:nvar) => dblk1(shift2+1:shift2+nvar2)
            use_quad = (s% lnT(k)/ln10 >= s% min_logT_for_quad)
            if (use_quad) then
               dmat_qp(1:nvar,1:nvar) => dblk1_qp(shift2+1:shift2+nvar2)
            else
               nullify(dmat_qp)
            end if
            if ((.not. sparse) .or. (sparse .and. test_sparse)) then
               ipivot(1:nvar) => ipivot1(shift1+1:shift1+nvar)
               bptr(1:nvar) => brhs1(shift1+1:shift1+nvar)
               if (sparse) then ! copy rhs to temp vector
                  do i=1,nvar
                     b_array(i) = bptr(i)
                  end do
                  bptr(1:nvar) => b_array(1:nvar)
                  if (.false.) then
                     do i=1,nvar
                        test_rhs(i) = bptr(i)
                     end do
                  end if
               end if
               call dense_solve1( &
                  s, k, nvar, bptr, dmat, dmat_qp, ipivot, &
                  use_quad, .true., ierr_loc)
               if (ierr_loc /= 0) then
                  ierr = ierr_loc
                  cycle
               end if
            end if
            
            if (sparse) then
               bptr(1:nvar) => brhs1(shift1+1:shift1+nvar)
               call sparse_solve1(s,k,nvar,bptr,dmat,ierr_loc)
               if (ierr_loc /= 0) then
                  ierr = ierr_loc
                  cycle
               end if
               if (test_sparse) then ! compare to dense answer saved in b_array
                  if (.false.) write(*,3) 'sparse solve', k, nvar
                  okay = .true.
                  do i=s% nvar_hydro+1,nvar
                     if (abs(b_array(i) - bptr(i)) > &
                           test_sparse_tol* &
                              max(test_sparse_tol, min(abs(b_array(i)), abs(bptr(i))))) then
                        write(*,3) '(b_array(i) - bptr(i))/max... ' // &
                           trim(chem_isos% name(s% chem_id(i - s% nvar_hydro))), &
                           i - s% nvar_hydro, k, &
                           (b_array(i) - bptr(i))/ &
                              max(test_sparse_tol, min(abs(b_array(i)), abs(bptr(i)))), &
                           b_array(i) - bptr(i), b_array(i), bptr(i)
                        okay = .false.
                     end if
                  end do
                  if (.not. okay) then
                     do i=s% nvar_hydro+1,nvar
                        write(*,3) 'dense sparse ' // &
                           trim(chem_isos% name(s% chem_id(i - s% nvar_hydro))), &
                              i - s% nvar_hydro, k, &
                           b_array(i), bptr(i)
                     end do
                     write(*,1) 'dense sum(b_array(species))', &
                        sum(b_array(s% nvar_hydro+1:nvar))
                     write(*,1) 'sparse sum(bptr(species))', sum(bptr(s% nvar_hydro+1:nvar))
                     stop 'problem in sparse solve in bcyclic'
                  end if
               end if
            end if
                        
         end do
!$OMP END PARALLEL DO

   	  if (ierr /= 0) return

        ! compute odd (hatted) sources (b-hats) for interior rows
         nmin = 1
         kcount = 0
!$OMP PARALLEL DO SCHEDULE(static,3) &
!$OMP PRIVATE(ns,shift1,bptr,kcount,shift,umat,lmat,bnext,bprev)
         do ns = nmin, nblk, 2
            shift1 = nvar*ncycle*(ns-1)
            bptr(1:nvar) => brhs1(shift1+1:shift1+nvar)
            kcount = 1+(ns-nmin)/2
            shift = nvar2*(kcount-1)         
            umat(1:nvar,1:nvar) => &
               s% bcyclic_odd_storage(nlevel)% umat1(shift+1:shift+nvar2)
            lmat(1:nvar,1:nvar) => &
               s% bcyclic_odd_storage(nlevel)% lmat1(shift+1:shift+nvar2)
            if (ns > 1) then
               shift1 = nvar*ncycle*(ns-2)
               bprev => brhs1(shift1+1:shift1+nvar)
            end if
            if (ns < nblk) then
               shift1 = nvar*ncycle*ns
               bnext => brhs1(shift1+1:shift1+nvar)
               if (ns > 1) then
                  ! bptr = bptr - matmul(umat,bnext) - matmul(lmat,bprev)
                  call my_gemv_mv(nvar,nvar,umat,bnext,lmat,bprev,bptr)
               else
                  ! bptr = bptr - matmul(umat,bnext)
                  call my_gemv(nvar,nvar,umat,nvar,bnext,bptr)
               end if
            else if (ns > 1) then
               ! bptr = bptr - matmul(lmat,bprev)
               call my_gemv(nvar,nvar,lmat,nvar,bprev,bptr)
            end if
         end do 
!$OMP END PARALLEL DO

         if (nvar2*kcount > s% bcyclic_odd_storage(nlevel)% ul_size) then
            write(*,*) 'nvar2*kcount > ul_size in cycle_rhs'
            ierr = -1
            return
         end if

      end subroutine cycle_rhs


      ! computes even index solution from the computed (at previous,higher level)
      ! odd index solutions at this level.
      ! note at this point, the odd brhs values have been replaced (at the highest cycle)
      ! with the solution values (x), at subsequent (lower) cycles, the
      ! odd values are replaced by the even solutions at the next highest cycle. the even 
      ! brhs values were multiplied by d[-1] and stored in cycle_rhs
      ! solve for even index values in terms of (computed at this point) odd index values
      subroutine cycle_solve( &
            s, nvar, nz, ncycle, nblk, nlevel, sparse, lblk1, ublk1, brhs1)
         type (star_info), pointer :: s
         integer, intent(in) :: nvar, nz, ncycle, nblk, nlevel
         logical, intent(in) :: sparse
         real(dp), pointer, intent(in) :: lblk1(:), ublk1(:)
         real(dp), pointer, intent(inout) :: brhs1(:)

         real(dp), pointer :: umat(:,:), lmat(:,:), bprev(:), bnext(:), bptr(:)
         real(dp), pointer, dimension(:) :: bprevr, bnextr
         integer :: shift1, shift2, nvar2, ns, ierr, nmin

         nvar2 = nvar*nvar
         nmin = 2
!$OMP PARALLEL DO SCHEDULE(static,3) &
!$OMP PRIVATE(ns,shift1,bptr,shift2,lmat,bprev,umat,bnext)
         do ns = nmin, nblk, 2
            shift1 = ncycle*nvar*(ns-1)
            bptr(1:nvar) => brhs1(shift1+1:shift1+nvar)
            shift2 = nvar*shift1
            lmat(1:nvar,1:nvar) => lblk1(shift2+1:shift2+nvar2)
            if (ns > 1) then
               shift1 = ncycle*nvar*(ns-2)
               bprev(1:nvar) => brhs1(shift1+1:shift1+nvar)
            end if
            if (ns < nblk) then
               umat(1:nvar,1:nvar) => ublk1(shift2+1:shift2+nvar2)
               shift1 = ncycle*nvar*ns
               bnext(1:nvar) => brhs1(shift1+1:shift1+nvar)
               if (ns > 1) then
                  ! bptr = bptr + matmul(umat,bnext) + matmul(lmat,bprev)
                  call my_gemv_p_mv(nvar,nvar,umat,bnext,lmat,bprev,bptr)
               else
                  ! bptr = bptr + matmul(umat,bnext)
                  call my_gemv_p1(nvar,nvar,umat,nvar,bnext,bptr)
               end if
            else if (ns > 1) then
               ! bptr = bptr + matmul(lmat,bprev)
               call my_gemv_p1(nvar,nvar,lmat,nvar,bprev,bptr)
            end if
         end do
!$OMP END PARALLEL DO

      end subroutine cycle_solve
      

      subroutine sparse_factor(s, k, nvar, iter, mtx, ierr)   
         use star_sparse
         
         type (star_info), pointer :: s
         integer, intent(in) :: k, nvar, iter
         real(dp), pointer, intent(inout) :: mtx(:,:)
         integer, intent(out) :: ierr
         
         logical, parameter :: use_pivoting = .true.
         !logical, parameter :: use_pivoting = .false.
         
         logical :: did_refactor
         real(dp) :: rgrowth, condest
         type(sparse_info), pointer :: ks(:)
         include 'formats'

         if (.false.) write(*,2) 'sparse_factor', k

         ks => s% bcyclic_sprs_storage
         
         call star_sparse_setup_shared(s, k, 1, nvar, ierr)
         if (ierr /= 0) stop 'sparse_factor'
      
         if (.not. use_pivoting) then
            call star_sparse_no_pivot(s, k, nvar, ierr) 
            if (ierr /= 0) stop 'sparse_factor'
         end if

         call star_sparse_store_new_values(s, k, nvar, mtx, .false., ierr)                  
         if (ierr /= 0) then
            write(*,3) 'sparse_store_new_values failed', k, s% model_number
            stop 'sparse_factor'
         end if
      
         did_refactor = .false.
         
         if (s% sparse_min_refactor_rgrowth > 0d0 .and. &
               iter > 1 .and. ks(k)% have_Numeric) then ! try refactor
            rgrowth = 0
            call star_sparse_refactor(s, k, nvar, mtx, ierr) 
            if (ierr == 0) rgrowth = star_sparse_rgrowth(s, k, nvar, ierr)
            if (ierr /= 0 .or. rgrowth < s% sparse_min_refactor_rgrowth) then ! reject it
               ierr = 0
            else
               did_refactor = .true.
            end if
         end if
         
         if (.not. did_refactor) then
            call star_sparse_factor(s, k, nvar, mtx, ierr)  
            if (ierr /= 0) then
               if (dbg) then
                  write(*,3) 'sparse_factor failed', k, s% model_number
               end if
               return
               stop 'sparse_factor'
            end if
         
            if (.false.) then
               condest = star_sparse_condest(s, k, nvar, ierr)
               write(*,3) 'sparse_factor condest logT', &
                  k, s% model_number, condest, s% lnT(k)/ln10
            end if

         end if
         
      end subroutine sparse_factor





      subroutine dense_factor(s, k, nvar, mtx, mtx_qp, ipivot, &
            use_quad, ierr)
         type (star_info), pointer :: s
         integer, intent(in) :: k, nvar
         real(dp), pointer :: mtx(:,:)
         real(qp), pointer :: mtx_qp(:,:)
         integer, pointer :: ipivot(:)
         logical, intent(in) :: use_quad
         integer, intent(out) :: ierr
         logical :: singular
         integer :: i, j
         real(dp), pointer :: work(:)
         integer, pointer :: iwork(:)
         real(dp) :: anorm, rcond
         include 'formats'
         ierr = 0
         
         if (use_quad) then
         
            if (k == s% nz) then
               write(*,*) 'dense_factor use_quad', k
               !do j=1,nvar
               !   do i=1,nvar
               !      if (mtx(i,j) /= 0) write(*,'(2i5,5x,1pd26.16)') i, j, mtx(i,j)
               !   end do
               !end do
               !stop
            end if
            do j=1,nvar
               do i=1,nvar
                  mtx_qp(i,j) = mtx(i,j)
               end do
            end do
            call my_getf2_qp(nvar, mtx_qp, nvar, ipivot, ierr)
            
            return
         
         end if
            
         call my_getf2(nvar, mtx, nvar, ipivot, ierr)         
         
      end subroutine dense_factor


      subroutine dense_solve(s, k, nvar, mtx, mtx_qp, ipivot, B_mtx, &
            use_quad, ierr)
         type (star_info), pointer :: s
         integer, intent(in) :: k, nvar
         real(dp), pointer :: mtx(:,:), B_mtx(:,:)
         real(qp), pointer :: mtx_qp(:,:)
         integer, pointer :: ipivot(:)
         logical, intent(in) :: use_quad
         integer, intent(out) :: ierr
         integer :: i
         real(dp), pointer :: b(:)
         !call my_getrs(nvar, nvar, mtx, nvar, ipivot, B_mtx, nvar, ierr)
         ierr = 0
         do i=1,nvar
            !call my_getrs1(nvar, mtx, nvar, ipivot, B_mtx(1:nvar,i), nvar, ierr)
            b(1:nvar) => B_mtx(1:nvar,i)
            call dense_solve1(s, k, nvar, b, mtx, mtx_qp, ipivot, &
               use_quad, .false., ierr)
            if (ierr /= 0) return
         end do
      end subroutine dense_solve


      subroutine dense_solve1(s, k, nvar, b, mtx, mtx_qp, ipivot, &
            use_quad, dbg, ierr)
         type (star_info), pointer :: s
         integer, intent(in) :: k, nvar
         real(dp), pointer :: b(:), mtx(:,:)
         real(qp), pointer :: mtx_qp(:,:)
         integer, pointer :: ipivot(:)
         logical, intent(in) :: use_quad
         logical, intent(in) :: dbg
         integer, intent(out) :: ierr
         ierr = 0
         
         if (use_quad) then
            call get1_qp
            return
         end if
         
         call my_getrs1(nvar, mtx, nvar, ipivot, b, nvar, ierr)

         contains
         
         
         subroutine get1_qp
            real(qp) :: b_qp(nvar)
            integer :: i, j
            if (k == s% nz .and. dbg) then
               write(*,*) 'dense_solve1 use_quad', k
               write(*,*) 'rhs'
               do j=1,nvar
                  write(*,'(i5,5x,1pd26.16)') j, b(j)
               end do
               !stop
            end if
            !if (k == s% nz) write(*,*) 'dense_solve1 use_quad', k
            do i=1,nvar
               b_qp(i) = b(i)
            end do
            call my_getrs1_qp(nvar, mtx_qp, nvar, ipivot, b_qp, nvar, ierr)
            do i=1,nvar
               b(i) = b_qp(i)
            end do
            if (k == s% nz .and. dbg) then
               write(*,*) 'dense_solve1 use_quad', k
               write(*,*) 'soln'
               do j=1,nvar
                  write(*,'(i5,5x,1pd26.16)') j, b(j)
               end do
               stop
            end if
         end subroutine get1_qp
         
         
         subroutine do_dense_refine
            real(dp), target :: rhs(nvar), x0(nvar)
            integer :: i, j, n
            real(dp) :: maxerr_limit, maxerr, maxerr0

            include 'formats'
         
            do j=1,nvar ! save the rhs
               rhs(j) = b(j)
            end do
            
            call my_getrs1(nvar, mtx, nvar, ipivot, b, nvar, ierr)
            if (ierr /= 0) return
            ! solution is now in b
         
            maxerr_limit = s% op_split_burn_refine_maxerr_limit
         
            ! refine solution         
            do n=1,10
            
               if (n == 1) then ! save the current solution
                  do j=1,nvar
                     x0(j) = b(j)
                  end do
               end if
         
               maxerr = 0
               ! set rhs to b = rhs - A*x0, the residual
               do j=1,nvar
                  b(j) = rhs(j)
                  do i=1,nvar
                     b(j) = b(j) - test_mtx(j,i)*x0(i)
                  end do
                  if (abs(b(j)) > maxerr) maxerr = abs(b(j))
               end do
               
               write(*,3) 'maxerr', n, k, maxerr
         
               if (maxerr < maxerr_limit) then
                  do j=1,nvar
                     b(j) = x0(j)
                  end do
                  return
               end if
            
               maxerr_limit = 1d-16
   
               call my_getrs1(nvar, mtx, nvar, ipivot, b, nvar, ierr)
               if (ierr /= 0) return
               ! b now has the correction to the solution
         
               do j=1,nvar
                  b(j) = b(j) + x0(j)
               end do
               ! b now has the refined solution
         
               ! check revised maxerr
         
               maxerr0 = maxerr
               maxerr = 0
            
               ! save the solution
               do j=1,nvar
                  x0(j) = b(j)
               end do
            
               ! set b to residual rhs - A*x0
               do j=1,nvar
                  b(j) = rhs(j)
                  do i=1,nvar
                     b(j) = b(j) - test_mtx(j,i)*x0(i)
                  end do
                  if (abs(b(j)) > maxerr) maxerr = abs(b(j))
               end do

               !if (k == s% nz) &
                  write(*,3) 'solve_dp maxerr0, maxerr', k, n, maxerr0, maxerr
            
               ! put the solution back in b
               do j=1,nvar
                  b(j) = x0(j)
               end do
         
            end do
            
         end subroutine do_dense_refine
         
      end subroutine dense_solve1




      subroutine sparse_solve(s, k, nrhs, nvar, b, ierr)
         use star_sparse, only: star_sparse_solve
         type (star_info), pointer :: s
         integer, intent(in) :: k, nrhs, nvar
         real(dp), pointer :: b(:)
         integer, intent(out) :: ierr
         call star_sparse_solve(s, k, nrhs, nvar, b, ierr)         
      end subroutine sparse_solve


      subroutine sparse_solve1(s, k, nvar, b, mtx, ierr)
         use star_sparse, only: star_sparse_solve, star_sparse_condest
         type (star_info), pointer :: s
         integer, intent(in) :: k, nvar
         real(dp), pointer :: b(:), mtx(:,:)
         integer, intent(out) :: ierr
         
         real(dp) :: condest
         
         if (.false.) then
         condest = star_sparse_condest(s, k, nvar, ierr)
         if (condest < 1d9) then
!$OMP critical
            call testing
!$OMP end critical
         end if
         end if
         
         call star_sparse_solve(s, k, 1, nvar, b, ierr)
      
         
         contains
         
         
         subroutine testing
            real(dp), target :: rhs_a(nvar)
            real(dp), pointer :: rhs(:)
            integer, target :: ipiv_a(nvar)
            integer, pointer :: ipiv(:)
            real(qp), pointer :: mtx_qp(:,:)
            integer :: i, j, nnz
            include 'formats'
            
            rhs => rhs_a
            ipiv => ipiv_a
            
            nnz = 0
            do j=1,nvar
               do i=1,nvar
                  if (mtx(i,j) == 0d0) cycle
                  nnz = nnz + 1
               end do
            end do
            write(*,*) nvar, nnz
            write(*,'(2a5,5x,a16)') 'i', 'j', 'mtx(i,j)'
            do j=1,nvar
               do i=1,nvar
                  if (mtx(i,j) == 0d0) cycle
                  write(*,'(2i5,5x,1pd26.16)') i, j, mtx(i,j)
               end do
            end do
            write(*,'(1a5,5x,a16)') 'j', 'rhs(j)'
            do j=1,nvar
               write(*,'(i5,5x,1pd26.16)') j, b(j)
               rhs(j) = b(j)
            end do
            call dense_factor( &
               s, k, nvar, mtx, mtx_qp, ipiv, .false., ierr)
            !call my_getf2(nvar, mtx, nvar, ipiv, ierr)         
            if (ierr /= 0) then
               stop 'failed in my_getf2'
            end if
            call my_getrs1(nvar, mtx, nvar, ipiv, rhs, nvar, ierr)
            if (ierr /= 0) then
               stop 'failed in my_getrs1'
            end if
            write(*,'(1a5,5x,a16)') 'j', 'soln(j)'
            do j=1,nvar
               write(*,'(i5,5x,1pd26.16)') j, rhs(j)
            end do
            write(*,2) 'condest', k, condest
            stop
            
         
         end subroutine testing
                  
         
         subroutine do_sparse_refine
            real(dp), target :: rhs(nvar), x0(nvar)
            integer :: i, j, n
            real(dp) :: maxerr_limit, maxerr

            include 'formats'
         
            do j=1,nvar ! save the rhs
               rhs(j) = b(j)
            end do
            
            call star_sparse_solve(s, k, 1, nvar, b, ierr)
            if (ierr /= 0) return
            ! solution is now in b
         
            maxerr_limit = s% op_split_burn_refine_maxerr_limit
         
            do n=1,10 ! refine multiple times if needed
         
               maxerr = 0
               ! set b = rhs - A*x0, the residual error for x0
               do j=1,nvar
                  x0(j) = b(j) ! save the current solution
                  b(j) = rhs(j)
                  do i=1,nvar
                     b(j) = b(j) - mtx(j,i)*x0(i)
                  end do
                  if (abs(b(j)) > maxerr) maxerr = abs(b(j))
               end do
               
               if (k == k_debug) then
                  write(*,3) 'maxerr', n, k, maxerr, maxerr_limit
               end if
         
               if (maxerr < maxerr_limit) then
                  do j=1,nvar
                     b(j) = x0(j)
                  end do
                  !if (k == k_debug .and. n > 1) stop 'do_sparse_refine'
                  return
               end if
   
               call star_sparse_solve(s, k, 1, nvar, b, ierr)
               if (ierr /= 0) return
               ! b now has the correction to the solution
         
               do j=1,nvar
                  b(j) = b(j) + x0(j)
               end do
               ! b now has the refined solution
         
            end do
            
            !if (k == k_debug) stop 'do_sparse_refine'
            
         end subroutine do_sparse_refine
         
      end subroutine sparse_solve1


      subroutine bcyclic_deallocate ( &
            s, lblk1, dblk1, ublk1, ipivot1, brhs1, nvar, nz, sparse, &
            lrd, rpar_decsol, lid, ipar_decsol, ierr)
         type (star_info), pointer :: s
         real(dp), pointer :: lblk1(:) ! row section of lower block
         real(dp), pointer :: dblk1(:) ! row section of diagonal block
         real(dp), pointer :: ublk1(:) ! row section of upper block
         integer, pointer :: ipivot1(:) 
            ! row section of pivot array for block factorization
         real(dp), pointer :: brhs1(:) ! row section of rhs
         integer, intent(in) :: nvar ! linear size of each block
         integer, intent(in) :: nz ! number of block rows
         logical, intent(in) :: sparse
         integer, intent(in) :: lrd, lid
         real(dp), pointer, intent(inout) :: rpar_decsol(:) ! (lrd)
         integer, pointer, intent(inout) :: ipar_decsol(:) ! (lid)
         integer, intent(out) :: ierr
         
         integer :: k
         real(dp), pointer :: rpar(:), b(:)
         type(sparse_info), pointer :: ks(:)
         integer, pointer :: ipar(:)
         real(dp), target :: b_ary(1)
         
         ierr = 0
         if (.not. sparse) return
         
         write(*,*) 'bcyclic_deallocate'
         
         ks => s% bcyclic_sprs_storage
         b(1:1) => b_ary(1:1)
         do k = 1, size(s% bcyclic_sprs_storage)
            if (ks(k)% sprs_nonzeros > 0) then
               ipar => ks(k)% ipar8_decsol
               rpar => ks(k)% rpar_decsol
               call klu_dble_decsols_nrhs_0_based( & ! free
                  2, 0, nvar, ks(k)% sprs_nonzeros, &
                  ks(k)% ia, ks(k)% ja, ks(k)% values, b, &
                  lrd, rpar, lid, ipar, ierr)
               ks(k)% sprs_nonzeros = -1
            end if
         end do

      end subroutine bcyclic_deallocate
      
      
      include 'mtx_solve_routines.inc'
      include 'mtx_solve_routines_quad.inc'
      
      
      end module star_bcyclic
