! ***********************************************************************
!
!   Copyright (C) 2012  Bill Paxton
!
!   This file is part of MESA.
!
!   MESA is free software; you can redistribute it and/or modify
!   it under the terms of the GNU General Library Public License as published
!   by the Free Software Foundation; either version 2 of the License,or
!   (at your option) any later version.
!
!   MESA is distributed in the hope that it will be useful,
!   but WITHOUT ANY WARRANTY; without even the implied warranty of
!   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!   GNU Library General Public License for more details.
!
!   You should have received a copy of the GNU Library General Public License
!   along with this software; if not,write to the Free Software
!   Foundation,Inc.,59 Temple Place,Suite 330,Boston,MA 02111-1307 USA
!
! ***********************************************************************

! based on MPI version of trisolve
! written by P. Garaud and J.D. Garaud; V1. released October 1st 2007.

   module test_trisolve
   
   use mtx_lib
   use mtx_def
   
   implicit none

   ! User-input parameters related to the matrix

   integer, parameter :: maxi = 100    ! size of each square block (has to be the same for all)
   integer, parameter :: mlines = 40  ! number of blocklines

   ! User-input parameters related to the cluster

   integer, parameter :: nproc = 8   ! Number of processors 
   integer, parameter :: nbppmax = mlines/nproc +1    ! number of block-lines per proc. 

   real(dp), dimension(maxi,maxi,nbppmax,0:nproc-1) :: ABOVE,DIAG,BELOW

   integer, dimension(2,nbppmax,0:nproc-1) :: NBELOW 
   integer, dimension(nbppmax,0:nproc-1) :: NDIAG,NABOVE

   real(dp), dimension(maxi,0:nproc-1) :: work1d1
   real(dp), dimension(maxi,maxi,0:nproc-1) :: work1d2,work2d2

   
   contains
   

   subroutine trisolve

      !     Routine solves MX = B where 
      !     M is a block-tridiagonal matrix with mlines block-lines
      !     of maxi*maxi sized-blocks
      !     On exit,sol is the solution.

      real(dp), dimension(maxi,mlines) :: sol
      integer :: idp

      real(dp) :: piv

      integer, dimension(0:nproc-1) :: nbpp,nend,noffset
      integer :: nbpploc,nendloc,nstartloc,nbav,nbrest

      integer, dimension(2,0:nproc-1) :: NV,NV2

      integer :: ierr,lastproc,nblocks,op_err

      integer :: nstore,nsend,nrecv,idest,isource,nshift
      integer :: nc,nl,i,j,n

      character*2 :: mmtype
      character :: vmtype

      ! BELOW are the lower diagonal matrices.
      ! DIAG are the diagonal matrices  
      ! ABOVE are the upper diagonal matrices
      ! B is the rhs matrices
      ! NBELOW,NDIAG,NABOVE,NB are the dimensions of BELOW,DIAG and ABOVE and B

      do j=1,mlines
         do i=1,maxi
            sol(i,j) = 0.d0
         end do
      end do
      lastproc = nproc-1          !id number of the last proc

      ! *** Allocate evenly the number of blocks per processor
      ! *** The first few processors have one more block than the last ones
      ! *** Each processor knows all of this info
      nbav = mlines/nproc
      nbrest = mod(mlines,nproc)
      do i=0,nbrest-1
         nbpp(i) = nbav+1
      end do
      do i=nbrest,nproc-1
         nbpp(i) = nbav
      end do
      nend(0) = nbpp(0)
      do i=1,nproc-1
         nend(i) = nend(i-1) + nbpp(i)
      end do
      noffset(0) = 0
      do i=1,nproc-1
         noffset(i) = nend(i-1)
      end do

      ierr=0

      !write(*,*) 'Read matrices'
!$OMP PARALLEL DO PRIVATE(idp,nendloc,nbpploc,nstartloc,nstore,i,piv,j)
      do idp = 0,lastproc
         nendloc=nend(idp)
         nbpploc=nbpp(idp)
         nstartloc=noffset(idp)+1
         call readmat(BELOW(:,:,:,idp),DIAG(:,:,:,idp),ABOVE(:,:,:,idp),&
            sol(:,nstartloc:nendloc),NBELOW(:,:,idp),NDIAG(:,idp),NABOVE(:,idp),&
            nstartloc,nendloc,nbpploc,maxi,idp)
      end do
!$OMP END PARALLEL DO

      do idp = 0,lastproc
         nendloc=nend(idp)
         nbpploc=nbpp(idp)
         nshift=noffset(idp)
         nstartloc=nshift+1
         do nstore = 1,nbpploc
            do i=1,NDIAG(nstore,idp)
               !sol(i,nshift+nstore) = B(i,nstore,idp)
            end do
         end do
      end do

!$OMP PARALLEL DO PRIVATE(idp,nendloc,nbpploc,nshift,nstartloc,nstore,i,piv,j,nsend,op_err,vmtype,mmtype,nc,nl)
      do idp = 0,lastproc
         nendloc=nend(idp)
         nbpploc=nbpp(idp)
         nshift=noffset(idp)
         nstartloc=nshift+1
         do nstore=1,nbpploc
            do i=1,maxi
               piv = diag(i,i,nstore,idp)
               sol(i,nstore+nshift) = sol(i,nstore+nshift)/piv
               do j=1,maxi
                  diag(i,j,nstore,idp) = diag(i,j,nstore,idp)/piv
                  below(i,j,nstore,idp) = below(i,j,nstore,idp)/piv
                  above(i,j,nstore,idp) = above(i,j,nstore,idp)/piv
               end do
            end do
         end do
         ! ************************** First part : *****************************
         ! Structured reduction,first sweep. Pivot one line,update the next.
         ! *********************************************************************
         do nstore = 1,nbpploc-1
            op_err = 0
            call P_pivot( &
               BELOW(:,:,nstore,idp),DIAG(:,:,nstore,idp),ABOVE(:,:,nstore,idp),&
               sol(:,nstore+nshift),NBELOW(:,nstore,idp),&
               NDIAG(nstore,idp),NABOVE(nstore,idp),maxi,work1d1(:,idp),idp,op_err)
            if (op_err /= 0) ierr = op_err

            ! update sol
            vmtype = 'b'
            call vecmult(BELOW(:,:,nstore+1,idp),sol(:,nstore+nshift),work1d1(:,idp),maxi,&
               NBELOW(:,nstore+1,idp),NDIAG(nstore+1,idp),vmtype,idp)

            do nl=1,NDIAG(nstore+1,idp)
               sol(nl,nstore+nstartloc) = sol(nl,nstore+nstartloc) - work1d1(nl,idp)
            end do

            ! Update DIAG
            mmtype = 'ba'
            NV(1,idp) = NABOVE(nstore,idp)
            call matmult( &
               BELOW(:,:,nstore+1,idp),ABOVE(:,:,nstore,idp),&
               work1d2(:,:,idp),maxi,NBELOW(:,nstore+1,idp),&
               NV(:,idp),NDIAG(nstore+1,idp),mmtype,idp)

            do nc = 1,NABOVE(nstore,idp)
               do nl = 1,NDIAG(nstore+1,idp)
                  DIAG(nl,nc,nstore+1,idp) = DIAG(nl,nc,nstore+1,idp) - work1d2(nl,nc,idp)
               end do
            end do

            ! Update BELOW
            mmtype = 'bb'
            call matmult( &
               BELOW(:,:,nstore+1,idp),BELOW(:,:,nstore,idp),work1d2(:,:,idp),&
               maxi,NBELOW(:,nstore+1,idp),NBELOW(:,nstore,idp),NDIAG(nstore+1,idp),mmtype,idp)

            NBELOW(1,nstore+1,idp) = NBELOW(1,nstore,idp) 
            NBELOW(2,nstore+1,idp) = NBELOW(2,nstore,idp) 

            do nc = NBELOW(1,nstore,idp)-NBELOW(2,nstore,idp)+1,NBELOW(1,nstore,idp)
               do nl = 1,NDIAG(nstore+1,idp)
                  BELOW(nl,nc,nstore+1,idp) = -work1d2(nl,nc,idp)
               end do
            end do

         end do

      end do
!$OMP END PARALLEL DO

      if (ierr /= 0) then
         stop 1
      end if

      ! **********************************************************************
      ! Note that the last diagonal blocks have not been pivoted yet.
      ! Structured reduction,second sweep step. All processors pass their 
      ! last blocks to the next,which now sweep through the above matrices.
      ! **********************************************************************

      !write(*,*) 'Structured reduction,second sweep'

!$OMP PARALLEL DO PRIVATE(idp,nendloc,nbpploc,nshift,nstartloc,vmtype,mmtype,nl,nc,nstore,nrecv)
      do idp = 0,lastproc
         nendloc=nend(idp)
         nbpploc=nbpp(idp)
         nshift=noffset(idp)
         nstartloc=nshift+1

         if (idp.gt.0) then
            nrecv = nbpp(idp-1)
            do nstore = 1,nbpploc-1

               ! Update sol
               vmtype = 'a'
               NV(1,idp) = NABOVE(nrecv,idp-1)
               call vecmult(&
                  ABOVE(:,:,nrecv,idp-1),sol(:,nstore+nshift),&
                  work1d1(:,idp),maxi,NV(:,idp),NDIAG(nrecv,idp-1),vmtype,idp)
               do nl=1,NDIAG(nrecv,idp-1)
                  sol(nl,nshift) = sol(nl,nshift) - work1d1(nl,idp)
               end do

               ! Update DIAG         
               mmtype = 'ab'
               NV(1,idp) = NABOVE(nrecv,idp-1)
               call matmult(&
                  ABOVE(:,:,nrecv,idp-1),BELOW(:,:,nstore,idp),&
                  work1d2(:,:,idp),maxi,NV(:,idp),NBELOW(:,nstore,idp),NDIAG(nrecv,idp-1),mmtype,idp)

               do nc = NBELOW(1,nstore,idp)-NBELOW(2,nstore,idp)+1,NBELOW(1,nstore,idp)
                  do nl = 1,NDIAG(nrecv,idp-1)
                     DIAG(nl,nc,nrecv,idp-1) = DIAG(nl,nc,nrecv,idp-1) - work1d2(nl,nc,idp)
                  end do
               end do

               ! Update ABOVE
               mmtype = 'aa'
               NV(1,idp) = NABOVE(nrecv,idp-1)
               NV2(1,idp) = NABOVE(nstore,idp)
               call matmult(&
                  ABOVE(:,:,nrecv,idp-1),ABOVE(:,:,nstore,idp),work2d2(:,:,idp),&
                  maxi,NV(:,idp),NV2(:,idp),NDIAG(nrecv,idp-1),mmtype,idp)

               do nc = 1,NABOVE(nstore,idp)
                  do nl = 1,NDIAG(nrecv,idp-1)
                     ABOVE(:,:,nrecv,idp-1) = - work2d2(nl,nc,idp)
                  end do
               end do

            end do
         endif
         
      end do
!$OMP END PARALLEL DO

      ! *********************************************************************
      ! At this point,all the processors posses one unknown block left to be 
      ! inverted in one big tridiagonal system spanning the last blocks in each
      ! processor
      ! *********************************************************************

      ! *********************************************************************
      ! Last steps: sweeping reduction across processors. 
      ! *********************************************************************
      ! Wait to receive the data from previous processor,and
      ! when it comes,store it in the buffer (if you're not the first
      ! proc.)

      !write(*,*) 'sweeping reduction across processors'
      
      do idp = 0,lastproc ! NOT PARALLEL
         nendloc=nend(idp)
         nbpploc=nbpp(idp)
         nshift=noffset(idp)
         nstartloc=nshift+1

         if (idp.gt.0) then
            nrecv = nbpp(idp-1)

            ! Update DIAG

            mmtype = 'ba'
            NV(1,idp) = NABOVE(nrecv,idp-1)
            call matmult(&
               BELOW(:,:,nbpploc,idp),ABOVE(:,:,nrecv,idp-1),work1d2(:,:,idp),&
               maxi,NBELOW(:,nbpploc,idp),NV(:,idp),NDIAG(nbpploc,idp),mmtype,idp)

            do nc = 1,NABOVE(nrecv,idp-1)
               do nl = 1,NDIAG(nbpploc,idp)
                  DIAG(nl,nc,nbpploc,idp) = DIAG(nl,nc,nbpploc,idp) - work1d2(nl,nc,idp)
               end do
            end do

            ! update B

            vmtype = 'b'
            call vecmult(&
               BELOW(:,:,nbpploc,idp),sol(:,nend(idp-1)),work1d1(:,idp),maxi,&
               NBELOW(:,nbpploc,idp),NDIAG(nbpploc,idp),vmtype,idp)

            do nl=1,NDIAG(nbpploc,idp)
               sol(nl,nendloc) = sol(nl,nendloc) - work1d1(nl,idp)
            end do

            ! update BELOW (simply zero it)

            NBELOW(1,nbpploc,idp) = NBELOW(1,nrecv,idp-1) 
            NBELOW(2,nbpploc,idp) = NBELOW(2,nrecv,idp-1) 

         endif

         ! Pivot the new diagonal matrix and update the 
         ! the above matrix and b-vector.
         ! Note that if you are processor 0,then this is where you start.
         ! If you're the last proc,this is where you end.

         call P_pivot(&
            BELOW(:,:,nbpploc,idp),DIAG(:,:,nbpploc,idp),ABOVE(:,:,nbpploc,idp),&
            sol(:,nendloc),NBELOW(:,nbpploc,idp),&
            NDIAG(nbpploc,idp),NABOVE(nbpploc,idp),maxi,work1d1(:,idp),idp,ierr)
         if (ierr /= 0) exit

      end do

      if (ierr /= 0) then
         stop 1
      end if

      ! Now the very last proc has the real solution for its
      ! last block in B.

      ! ******************************************************************8
      ! First step of the backsub: send the solution back to each processors
      ! now in decreasing order,and solve for the "last-block" variables.
      ! ******************************************************************8

      !write(*,*) 'First step of the backsub'

      do idp = lastproc-1,0,-1 ! NOT PARALLEL
         nendloc=nend(idp)
         nbpploc=nbpp(idp)
         nshift=noffset(idp)
         nstartloc=nshift+1

         vmtype = 'a'
         NV(1,idp) = NABOVE(nbpploc,idp)

         call vecmult(&
            ABOVE(:,:,nbpploc,idp),sol(:,nend(idp+1)),work1d1(:,idp),&
            maxi,NV(:,idp),NDIAG(nbpploc,idp),vmtype,idp)

         do nl=1,NDIAG(nbpploc,idp)
            sol(nl,nshift+nbpploc) = sol(nl,nshift+nbpploc) - work1d1(nl,idp)
         end do

      end do

      ! So by now each processor has the solution to the "lastblock"
      ! variable stored in the B of its last block. 

      ! Now each processor can continue the backsubstitution entirely
      ! independently.

      !write(*,*) 'each processor continue the backsubstitution'
      
!$OMP PARALLEL DO PRIVATE(idp,nendloc,nbpploc,nstartloc,nshift,nstore,vmtype,nl)
      do idp = 0,lastproc
         nendloc=nend(idp)
         nbpploc=nbpp(idp)
         nshift=noffset(idp)
         nstartloc=nshift+1

         do nstore=nbpploc-1,1,-1
            vmtype = 'a'
            NV(1,idp) = NABOVE(nstore,idp)
            call vecmult(&
               ABOVE(:,:,nstore,idp),sol(:,nshift+nstore+1),work1d1(:,idp),&
               maxi,NV(:,idp),NDIAG(nstore,idp),vmtype,idp)
            do nl=1,NDIAG(nstore,idp)
               sol(nl,nshift+nstore) = sol(nl,nshift+nstore) - work1d1(nl,idp)
            end do

            if (idp.gt.0) then
               vmtype = 'b'
               call vecmult(&
                  BELOW(:,:,nstore,idp),sol(:,nshift),work1d1(:,idp),maxi,&
                  NBELOW(:,nstore,idp),NDIAG(nstore,idp),vmtype,idp)

               do nl=1,NDIAG(nstore,idp)
                  sol(nl,nshift+nstore) = sol(nl,nshift+nstore) - work1d1(nl,idp)
               end do
            endif
         end do

         ! *******************************************************************
         ! At this point,the Vector B in each processor
         ! contains its bit of the solution. We put this into sol.
         ! *******************************************************************
                  
      end do
!$OMP END PARALLEL DO


      ! CHECK TEST RESULTS

      open(12,file='solution.dat',status='unknown')
      do n=1,mlines
         do i=1,maxi
            write(12,*) sol(i,n)
         end do
      end do
      write(*,*)

      stop 'trisolve'

   end subroutine trisolve


   subroutine readmat( &
         BELOW,DIAG,ABOVE,B,NBELOW,NDIAG,NABOVE,&
         nstartloc,nendloc,nbpp,id,idp)
      integer :: id,idp
      double precision, dimension(:,:,:) :: BELOW,ABOVE,DIAG ! (id,id,nbppmax)
      double precision, dimension(:,:) :: B ! (id,nbppmax)
      integer, dimension(:,:) :: NBELOW ! (2,nbppmax)
      integer, dimension(:) :: NDIAG,NABOVE ! (nbppmax)
      integer :: nstartloc,nendloc,nbpp

      integer :: lastproc,nstart,nend
      integer :: nstore,i,j,n

      do nstore=1,nbpp
         NDIAG(nstore) = maxi
         NBELOW(1,nstore) = maxi
         NBELOW(2,nstore) = maxi
         NABOVE(nstore) = maxi
      end do

      if (idp == 0) then
         NBELOW(1,1) = 0
         NBELOW(2,1) = 0
         NDIAG(1) = maxi
         NBELOW(1,2) = maxi
      endif

      lastproc = nproc-1
      if (idp == lastproc) then
         NABOVE(nbpp) = 0
         NDIAG(nbpp) = maxi
      endif

      ! *** Reading the matrix and vector:

      do n = nstartloc,nendloc

         ! calculate storage index         
         nstore = n-nstartloc+1

         ! Enter the matrix coefficients.
         do j = 1,maxi
            do i=1,maxi
               if (n.gt.1) BELOW(i,j,nstore) = mfunc('l',i,j,n)
               DIAG(i,j,nstore) = mfunc('m',i,j,n)
               if (n.lt.mlines) ABOVE(i,j,nstore) = mfunc('r',i,j,n)
            end do
         end do
         do i=1,maxi
            B(i,nstore) = bfunc(i,n)
         end do

      end do

      if (nstore.ne.nbpp) then
         write(*,*) 'Problem in readmat for processor',idp
      endif

   end subroutine readmat


   subroutine P_pivot(BELOW,DIAG,ABOVE,B,NBELOW,NDIAG,NABOVE,maxi,amax,idp,ierr)

      ! Subroutine diagonalises matrix DIAG by implicit partial pivoting,
      ! and carries out the same operations on matrices BELOW,ABOVE and
      ! on the RHS vector B
      !
      ! Required parameters are the following
      ! Matrix BELOW(maxi,maxi)
      ! Matrix DIAG(maxi,maxi)
      ! Matrix ABOVE(maxi,maxi)
      ! Vector B(maxi)

      ! NBELOW number of columns in matrix BELOW
      ! NDIAG number of lines & columns in matrix DIAG,number of lines in B,
      ! ABOVE and BELOW

      ! maxi = first dimension of DIAG,BELOW,ABOVE in calling program
      ! ierr = error tag if matrix is singular
      ! amax = working array

      ! Routine written on 02/03/01 by P. Garaud and tested on a simple
      ! 4x4 matrix in test.f.
      ! Checked on 17/08/04,compared with num_recipes' gaussj on 3x3 matrices 

      integer :: NDIAG,NABOVE,maxi,ierr,idp
      real(dp), dimension(:,:) :: BELOW,DIAG,ABOVE ! (maxi,maxi)
      real(dp), dimension(:) :: B,amax ! (maxi)
      integer, dimension(:) :: NBELOW ! (2)

      integer :: i,j,ip,jp
      real(dp) :: piv,dum

      ierr = 0

      ! First step,find largest element/row in DIAG (for implicit pivoting)

      do i=1,NDIAG
         amax(i) = 0.d0
         do j=1,NDIAG
            if (dabs(DIAG(i,j)).gt.amax(i)) amax(i)=dabs(DIAG(i,j))
         end do

         !      if (amax(i).lt.1.d-16) then
         !         write(*,*) 'problem in pivot'
         !         pause
         !      endif
      end do

      ! Second step,loop over all columns jp the pivoting algorithm

      do jp=1,NDIAG

         !        write(*,*) 'Pivot number',jp

         ! 1. Find pivot in column jp (jp = index of column to be pivoted)

         piv=0.d0
         do i=jp,NDIAG
            dum= DIAG(i,jp)/amax(i)
            if (dabs(dum).gt.piv) then
               piv=dabs(dum)
               ip = i           ! ip = index of row which holds pivot
            endif
         end do
   
         if (piv == 0.d0) then ! singular matrix
            ierr = -1
            return
            !stop 'trisolve: singular matrix'
         end if


         !         if (piv.lt.1.d-12) write(*,*) 'pivot = ',piv,'in row = ',ip


         ! 2. Swap row ip with row jp if ip.ne.jp,swap amax,and swap rows in 
         ! matrices to be stored s and b

         if (ip.ne.jp) then
            !            write(*,*) 'swap lines'
            do j=1,NDIAG
               dum = DIAG(ip,j)
               DIAG(ip,j) = DIAG(jp,j)
               DIAG(jp,j) = dum
            end do
            dum = amax(ip)
            amax(ip) = amax(jp)
            amax(jp) = dum
            do j= 1,NABOVE
               dum = ABOVE(ip,j)
               ABOVE(ip,j) = ABOVE(jp,j)
               ABOVE(jp,j) = dum
            end do
            do j= NBELOW(1)-NBELOW(2)+1,NBELOW(1)
               dum = BELOW(ip,j)
               BELOW(ip,j) = BELOW(jp,j)
               BELOW(jp,j) = dum
            end do
            dum = B(ip)
            B(ip) = B(jp)
            B(jp) = dum
         endif

         ! pivot is now placed at position (ip,jp)

         ip=jp

         ! 3. Normalise row with pivot

         piv=DIAG(ip,jp)
   
         if (piv.ne.1.d0) then
            do j=1,NDIAG
               DIAG(ip,j) = DIAG(ip,j)/piv
            end do
            do j=1,NABOVE
               ABOVE(ip,j) = ABOVE(ip,j)/piv
            end do
            do j=NBELOW(1)-NBELOW(2)+1,NBELOW(1)
               BELOW(ip,j) = BELOW(ip,j)/piv
            end do
            B(ip)=B(ip)/piv

         endif

         ! 4. Eliminate all elements in the column but the pivot,
         ! and keep track of the operations on the other columns and stored vectors
         ! L_i -> L_i - p_(i,jp) L_ip 

         do i= 1,NDIAG
            if (i.ne.jp) then

               if (DIAG(i,jp).ne.0.d0) then
                  piv = DIAG(i,jp)
                  do j = jp,NDIAG
                     DIAG(i,j) = DIAG(i,j) - piv*DIAG(ip,j)
                  end do
                  do j=1,NABOVE
                     ABOVE(i,j) = ABOVE(i,j) - piv*ABOVE(ip,j)
                  end do
                  do j = NBELOW(1)-NBELOW(2)+1,NBELOW(1)
                     BELOW(i,j) = BELOW(i,j) - piv*BELOW(ip,j)
                  end do
                  B(i) = B(i) - piv*B(ip)
               endif
            endif
         end do

         ! 5. End loop when all columns have been treated that way

      end do

      ! the final matrix p is the unit matrix,and the result of the operation on 
      ! the matrix s and the column vector b have been saved.

   end subroutine P_pivot

   
   subroutine matmult(AMAT,BMAT,RESULT,maxi,NA,NB,ND,mmtype,idp)
      integer :: maxi,ND
      real(dp), dimension(:,:) :: AMAT,BMAT,RESULT ! (maxi,maxi)
      integer, dimension(:) :: NA,NB ! (2)
      character*2 :: mmtype
      integer :: idp

      integer :: imin,imax,jmin,jmax,kmin,kmax
      real(dp) :: res
      integer :: i,j,k

      ! This routine multiplies the matrices AMAT and BMAT 
      ! together and return the result in result. The 
      ! multiplication is optimized. The number of lines
      ! is ND. The number and positions of non-zero
      ! columns in the two matrices is taken into account.

      imin = 1
      imax = ND

      if (mmtype(1:1) == 'a') then
         kmin = 1
         kmax = NA(1)
      endif
      if (mmtype(1:1) == 'b') then
         kmin = NA(1)-NA(2)+1
         kmax = NA(1)
      endif

      if (mmtype(2:2) == 'a') then
         jmin = 1
         jmax = NB(1)
      endif
      if (mmtype(2:2) == 'b') then
         jmin = NB(1)-NB(2)+1
         jmax = NB(1)
      endif

      do j=1,maxi
         do i=1,maxi
            RESULT(i,j) = 0.d0
         end do
      end do

      do j=jmin,jmax
         do i=imin,imax
            res = 0.d0
            do k = kmin,kmax
               res = res + AMAT(i,k)*BMAT(k,j)
            end do
            RESULT(i,j) = res
         end do
      end do

   end subroutine matmult
   

   subroutine vecmult(AMAT,BVEC,RESULT,maxi,NA,ND,vmtype,idp)      
      integer :: maxi,ND
      integer, dimension(:) :: NA ! (2)
      real(dp), dimension(:,:) :: AMAT ! (maxi,maxi) 
      real(dp), dimension(:) :: BVEC,RESULT ! (maxi)
      character :: vmtype
      integer :: idp

      integer :: imin,imax,kmin,kmax,i,k
      real(dp) :: res

      ! Calculates the product between an AMAT matrix and a
      ! Bvec vector,and stores it in result. 

      ! ND is the number of lines in the multiplication. 
      ! Is either ii (all blocks
      ! except the first and last) or ka (first block) or kb 
      ! (last block)

      ! The multiplication is optimized. If the multiplying
      ! matrix is of "above" type then only the first NA
      ! columns are non-zero.
      ! If the multiplying matrix is of "below" type
      ! then NA has two indices,the first one is the number
      ! of lines/columns,the second one is the number
      ! of non-zero columns.

      imin = 1
      imax = ND

      if (vmtype == 'b') then
         kmin = NA(1)-NA(2)+1
         kmax = NA(1)
      endif

      if (vmtype == 'a') then
         kmin = 1
         kmax = NA(1)
      endif

      do i=1,maxi
         RESULT(i)= 0.d0
      end do

      do i=imin,imax
         res = 0.d0
         do k = kmin,kmax
            res = res + AMAT(i,k)*BVEC(k)
         end do
         RESULT(i) = res
      end do

   end subroutine vecmult
   

   double precision function bfunc(i,n)
      integer i,n
      bfunc = dble(1)
   end function bfunc


   double precision function mfunc(c,i,j,n)
      integer :: i,j,n
      character :: c
      if (c == 'l') then
         mfunc = 1.d0
      endif
      if (c == 'r') then
         mfunc = 1.d0
      endif
      if (c == 'm') then
         if (i == j) then 
            mfunc = dble(4)
         else
            mfunc = 1.d0
         endif
      endif
   end function mfunc


   end module test_trisolve
