!dirac_copyright_start
!      Copyright (c) by the authors of DIRAC.
!
!      This program is free software; you can redistribute it and/or
!      modify it under the terms of the GNU Lesser General Public
!      License version 2.1 as published by the Free Software Foundation.
!
!      This program is distributed in the hope that it will be useful,
!      but WITHOUT ANY WARRANTY; without even the implied warranty of
!      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
!      Lesser General Public License for more details.
!
!      If a copy of the GNU LGPL v2.1 was not distributed with this
!      code, you can obtain one at https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html.
!dirac_copyright_end

#if defined (VAR_MPI2)
***********************************************************************
*
*============================================================*
*  Common LUCITA and LUCIAREL routines for parallel purposes *
*                                                            *
*   collected by Stefan Knecht, Jan 18 - ???, 2007           *
*============================================================*
*
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE COPVCD_PAR_BDRIV_REL(LUIN,LUOUT,SEGMNT,NDISTND,NBLOCK,
     &                                LBLOCKL,JCOMM,LBLK,IROOT,
     &                                LUINLIST,MY_IOFF_LUIN,I_RUN_CPLX)
C
C     Written by  S. Knecht         - June 11 2007
C
C**********************************************************************
C
C     copy vector from MPI file LUIN to 'normal file' LUOUT blockwise
C
C     NOTE: IROOT = IROOT
C
C     active blocks on the MPI-files are flagged by a nonzero list entry
C
C     Last revision:     S. Knecht       - June  2007
C
************************************************************************
      use interface_to_mpi
#include "implicit.h"
#include "infpar.h"
      INTEGER ISTAT(df_mpi_status_size)
#include "parluci.h"
      DIMENSION SEGMNT(*), NDISTND( NBLOCK ), LBLOCKL(*)
      DIMENSION LUINLIST(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_SCRATCH
      INTEGER NUM_BLK
C
C     initialize scratch offsets
      NUM_BLK           = 0
      IOFFSET_SCRATCH   = 0
      IOFFSET_IN_LUIN   = 0
      IOFFSET_INT_IN    = 0
C
C     initialize slave tag 
C
      MYNSLV = 0
C
C
C     ================
C      COMPLEX VECTOR
C     ================
C
C     .................
C      REAL PART FIRST 
C     .................
C
C     set new offset
C
C     position in file is at the end of vector IROOT - 1
C
C     note: real part: --> MY_VEC2_IOFF and MY_ACT_BLK2
C
      IOFFSET_IN_LUIN  = MY_IOFF_LUIN + IOFFSET_SCRATCH +
     &                 ( IROOT - 1 )   * MY_VEC2_IOFF
      IOFFSET_INT_IN   = 1 + NUM_BLK  +
     &                 ( IROOT - 1 )   * MY_ACT_BLK2
C
C
C LOOP OVER BLOCKS
C
       DO 501 II = 1, NBLOCK 
C
C         slaves first ...
C
         IF( MYPROC .ne. MASTER )THEN
           IF( NDISTND(II) .ne. MYPROC )THEN 
C
             IOFFSET_SCRATCH = 0
             NUM_BLK = 0
C
             GOTO 500
           ELSE IF( NDISTND(II) .eq. MYPROC )THEN
             GOTO 100
           ENDIF
C
C        MASTER follows ...
C
         ELSE IF( MYPROC .eq. MASTER )THEN
           IF( NDISTND(II) .ne. MYPROC )THEN
             IF(NDISTND(II) .eq. -2 )THEN
C
C              block has zero length
C
               LBL = 0
               CALL ITODS(LBL,1,-1,LUOUT)
               IMZERO = 1
               IAMPACK = 0
               CALL ZERORC(LBL,LUOUT,IAMPACK)
C
               IOFFSET_SCRATCH = 0
               NUM_BLK = 0
C
               GOTO 500
             ELSE IF( NDISTND(II) .gt. 0 )THEN
C
C              recieve block from node MYNSLV
C
               MYNSLV = NDISTND(II) 
               GOTO 100 
             END IF
           ELSE IF( NDISTND(II) .eq. MYPROC )THEN
C
C            MASTER block
C
             LBL = LBLOCKL(II)
C  
             CALL ITODS(LBL,1,-1,LUOUT)
C
             IF( LUINLIST( IOFFSET_INT_IN ) .gt. 0 ) THEN
C
C             read block in
               call interface_mpi_FILE_READ_AT_r(LUIN,IOFFSET_IN_LUIN,
     &              SEGMNT,LBL,ISTAT)
               IMZERO = 0
             ELSE
               IMZERO = 1
             END IF
C
             IF(IMZERO.EQ.0) THEN
                CALL TODSC (SEGMNT,LBL,-1,LUOUT)
             ELSE
                CALL ZERORC(LBL,LUOUT,0)
             END IF
C
             NUM_BLK = 1
             IOFFSET_SCRATCH = LBL
C
             GOTO 500
           END IF
*         /\NDISTND eq 0 or ne 0
        END IF
C
C  ... continue
C
 100    CONTINUE
C
C       slave block
C
        LBL = LBLOCKL(II)
C
        IF( MYPROC .EQ. MASTER ) THEN
C
           CALL ITODS(LBL,1,-1,LUOUT)
C
        END IF
C
        IF(MYPROC.NE.MASTER) THEN
C
          ISENDTOM = 0
          ISENDTOM = LUINLIST( IOFFSET_INT_IN )
C
          call interface_mpi_SEND(ISENDTOM,1,MASTER,90,
     &                  JCOMM)
C
          IF( ISENDTOM .gt. 0 ) THEN
C
            call interface_mpi_FILE_READ_AT_r(LUIN,IOFFSET_IN_LUIN,
     &                            SEGMNT,lbl,ISTAT)
            call interface_mpi_send_r1_work_f77(SEGMNT,LBL,MASTER,91,
     &                    JCOMM)
C
          END IF
          IOFFSET_SCRATCH = LBL
          NUM_BLK = 1
        END IF
C
C
        IF(MYPROC.EQ.MASTER) THEN
C
          ISENDTOM = 0
C
          call interface_mpi_RECV(ISENDTOM,1,MYNSLV,90,
     &                  JCOMM)
C
          IF( ISENDTOM .gt. 0 ) THEN
C
            call interface_mpi_recv_r1_work_f77(SEGMNT,LBL,MYNSLV,91,
     &                     JCOMM)
            CALL TODSC(SEGMNT,LBL,-1,LUOUT)
          ELSE
            CALL ZERORC(LBL,LUOUT,0)
          END IF
          IOFFSET_SCRATCH = 0
          NUM_BLK = 0
        END IF
C
 500    CONTINUE
C       
C       keep track of correct offset
        IOFFSET_IN_LUIN = IOFFSET_IN_LUIN + IOFFSET_SCRATCH
        IOFFSET_INT_IN  = IOFFSET_INT_IN  + NUM_BLK
 501  CONTINUE
C
      IF( MYPROC .eq. MASTER )THEN
C       write an EOF-mark
        CALL ITODS(-1,1,-1,LUOUT)
      ENDIF
C
C     ..............
C        IMAG PART 
C     ..............
C
      IOFFSET_SCRATCH = 0
      NUM_BLK = 0
C
      IF( I_RUN_CPLX .eq. 2 )THEN
C
C       set new offset
C
C       position in file is at the end of vector IROOT - 1
C
C       note: complex part: --> MY_VEC1_IOFF and MY_ACT_BLK1
C
        IOFFSET_IN_LUIN  = MY_IOFF_LUIN + IOFFSET_SCRATCH +
     &                   ( IROOT - 1 )  * MY_VEC2_IOFF    +
     &                                    MY_VEC1_IOFF
        IOFFSET_INT_IN   = 1 + NUM_BLK  +
     &                   ( IROOT - 1 )  * MY_ACT_BLK2 + MY_ACT_BLK1
C
C LOOP OVER BLOCKS
C
       DO 601 II = 1, NBLOCK 
C
C         slaves first ...
C
         IF( MYPROC .ne. MASTER )THEN
           IF( NDISTND(II) .ne. MYPROC )THEN 
C
             IOFFSET_SCRATCH = 0
             NUM_BLK = 0
C
             GOTO 600
           ELSE IF( NDISTND(II) .eq. MYPROC )THEN
             GOTO 102
           ENDIF
C
C        MASTER follows ...
C
         ELSE IF( MYPROC .eq. MASTER )THEN
           IF( NDISTND(II) .ne. MYPROC )THEN
             IF(NDISTND(II) .eq. -2 )THEN
C
C              block has zero length
C
               LBL = 0
               CALL ITODS(LBL,1,-1,LUOUT)
               IMZERO = 1
               IAMPACK = 0
               CALL ZERORC(LBL,LUOUT,IAMPACK)
C
               IOFFSET_SCRATCH = 0
               NUM_BLK = 0
C
               GOTO 600
             ELSE IF( NDISTND(II) .gt. 0 )THEN
C
C              recieve block from node MYNSLV
C
               MYNSLV = NDISTND(II) 
               GOTO 102 
             END IF
           ELSE IF( NDISTND(II) .eq. MYPROC )THEN
C
C            MASTER block
C
             LBL = LBLOCKL(II)
C  
             CALL ITODS(LBL,1,-1,LUOUT)
C
             IF( LUINLIST( IOFFSET_INT_IN ) .gt. 0 ) THEN
C
C             read block in
               call interface_mpi_FILE_READ_AT_r(LUIN,IOFFSET_IN_LUIN,
     &              SEGMNT,LBL,ISTAT)
               IMZERO = 0
             ELSE
               IMZERO = 1
             END IF
C
             IF( IMZERO .eq. 0 ) THEN
                CALL TODSC (SEGMNT,LBL,-1,LUOUT)
             ELSE
                CALL ZERORC(LBL,LUOUT,0)
             END IF
C
             NUM_BLK = 1
             IOFFSET_SCRATCH = LBL
C
             GOTO 600
           END IF
*          ^NDISTND eq 0 or ne 0
          END IF
C
C     ... continue
C
 102      CONTINUE
C
C         slave block
C
          LBL = LBLOCKL(II)
C
          IF( MYPROC .eq. MASTER ) THEN
C
             CALL ITODS(LBL,1,-1,LUOUT)
C  
          END IF
C
          IF( MYPROC .ne. MASTER ) THEN
C
            ISENDTOM = 0
            ISENDTOM = LUINLIST( IOFFSET_INT_IN )
C
            call interface_mpi_SEND(ISENDTOM,1,MASTER,93,
     &                    JCOMM)
C
            IF( ISENDTOM .gt. 0 ) THEN
C
              call interface_mpi_FILE_READ_AT_r(LUIN,IOFFSET_IN_LUIN,
     &             SEGMNT,LBL,ISTAT)
              call interface_mpi_send_r1_work_f77(SEGMNT,LBL,MASTER,94,
     &                      JCOMM)
C
            END IF
            IOFFSET_SCRATCH = LBL
            NUM_BLK = 1
          END IF
C
C
          IF( MYPROC .eq. MASTER ) THEN
C
            ISENDTOM = 0
C
            call interface_mpi_RECV(ISENDTOM,1,MYNSLV,93,
     &                    JCOMM)
C
            IF( ISENDTOM .gt. 0 ) THEN
C
              call interface_mpi_recv_r1_work_f77(SEGMNT,LBL,MYNSLV,94,
     &                       JCOMM)
              CALL TODSC(SEGMNT,LBL,-1,LUOUT)
            ELSE
              CALL ZERORC(LBL,LUOUT,0)
            END IF
            IOFFSET_SCRATCH = 0
            NUM_BLK = 0
          END IF
C
 600      CONTINUE
C       
C         keep track of correct offset
          IOFFSET_IN_LUIN = IOFFSET_IN_LUIN + IOFFSET_SCRATCH
          IOFFSET_INT_IN  = IOFFSET_INT_IN  + NUM_BLK
 601    CONTINUE
C
        IF( MYPROC .eq. MASTER )THEN
C         write an EOF-mark
          CALL ITODS(-1,1,-1,LUOUT)
        ENDIF
C
      END IF
C     ^ COMPLEX PART NEEDED?
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE COPVCD_PAR_BDRIV_DIA_REL2(LUIN,LUOUT,SEGMNT,NDISTND,
     &                                     NBLOCK,LBLOCKL,JCOMM,LBLK,
     &                                     IROOT,MY_IOFF_LUIN)
C
C     Written by  S. Knecht         - June 22 2007
C
C**********************************************************************
C
C     copy H diagonal from MPI file LUIN to 'normal file' LUOUT blockwise
C
C     NOTE: IROOT = IROOT
C
C     Last revision:     S. Knecht       - June  2007
C
************************************************************************
      use interface_to_mpi
#include "implicit.h"
#include "infpar.h"
      INTEGER ISTAT(df_mpi_status_size)
#include "parluci.h"
      DIMENSION SEGMNT(*), NDISTND( NBLOCK ), LBLOCKL(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_SCRATCH
      INTEGER NUM_BLK
C
C     initialize scratch offsets
      IOFFSET_SCRATCH   = 0
      IOFFSET_IN_LUIN   = 0
C
C     initialize slave tag 
C
      MYNSLV = 0
C
C     set new offset
C
C     position in file is at the end of vector IROOT - 1
C
      IOFFSET_IN_LUIN  = MY_IOFF_LUIN + IOFFSET_SCRATCH +
     &                 ( IROOT - 1 )   * MY_VEC2_IOFF
C
C LOOP OVER BLOCKS
C
       DO 501 II = 1, NBLOCK 
C
C         slaves first ...
C
         IF( MYPROC .ne. MASTER )THEN
           IF( NDISTND(II) .ne. MYPROC )THEN 
C
             IOFFSET_SCRATCH = 0
             NUM_BLK = 0
C
             GOTO 500
           ELSE IF( NDISTND(II) .eq. MYPROC )THEN
             GOTO 100
           ENDIF
C
C        MASTER follows ...
C
         ELSE IF( MYPROC .eq. MASTER )THEN
           IF( NDISTND(II) .ne. MYPROC )THEN
             IF(NDISTND(II) .eq. -2 )THEN
C
C              block has zero length
C
               LBL = 0
               CALL ITODS(LBL,1,-1,LUOUT)
               IAMPACK = 0
               CALL ZERORC(LBL,LUOUT,IAMPACK)
C
               IOFFSET_SCRATCH = 0
C
               GOTO 500
             ELSE IF( NDISTND(II) .gt. 0 )THEN
C
C              recieve block from node MYNSLV
C
               MYNSLV = NDISTND(II) 
               GOTO 100 
             END IF
           ELSE IF( NDISTND(II) .eq. MYPROC )THEN
C
C            MASTER block
C
             LBL = LBLOCKL(II)
C  
             CALL ITODS(LBL,1,-1,LUOUT)
C
C            read block in
             call interface_mpi_FILE_READ_AT_r(LUIN,IOFFSET_IN_LUIN,
     &            SEGMNT,LBL,ISTAT)
C
             CALL TODSC(SEGMNT,LBL,-1,LUOUT)
C
             IOFFSET_SCRATCH = LBL
C
             GOTO 500
           END IF
*         /\NDISTND eq 0 or ne 0
        END IF
C
C  ... continue
C
 100    CONTINUE
C
C       slave block
C
        LBL = LBLOCKL(II)
C
        IF( MYPROC .EQ. MASTER ) THEN
C
           CALL ITODS(LBL,1,-1,LUOUT)
C
        END IF
C
        IF(MYPROC.NE.MASTER) THEN
C
          call interface_mpi_FILE_READ_AT_r(LUIN,IOFFSET_IN_LUIN,
     &         SEGMNT,LBL,ISTAT)
          call interface_mpi_send_r1_work_f77(SEGMNT,LBL,MASTER,91,
     &                  JCOMM)
C
          IOFFSET_SCRATCH = LBL
        END IF
C
C
        IF(MYPROC.EQ.MASTER) THEN
C
          call interface_mpi_recv_r1_work_f77(SEGMNT,LBL,MYNSLV,91,
     &                  JCOMM)
          CALL TODSC(SEGMNT,LBL,-1,LUOUT)
C
          IOFFSET_SCRATCH = 0
        END IF
C
 500    CONTINUE
C       
C       keep track of correct offset
        IOFFSET_IN_LUIN = IOFFSET_IN_LUIN + IOFFSET_SCRATCH
 501  CONTINUE
C
      IF( MYPROC .eq. MASTER )THEN
C       write an EOF-mark
        CALL ITODS(-1,1,-1,LUOUT)
      ENDIF
C
C
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE H0M1TD_REL_PAR(LUOUT,LUDIA,LUIN,SHIFT,VEC1,VEC2,
     &                          LUOUTLIST,LUINLIST,
     &                          NBATCH,LBATCH,LEBATCH,I1BATCH,
     &                          IBATCH,MY_IOFF_LUOUT,MY_IOFF_LUDIA,
     &                          MY_IOFF_LUIN,INV,THRES_E)
C
C     calculate inverted general preconditioner matrix times vector
C
C     original written by Jeppe Olsen - September 1993
C
C     adaption of sequential routine for parallel purposes  
C                      by S. Knecht   -  March 13 2008
C
C     MPI file I/O version
C
C     vecout=  (H0 + shift )-1 * vecin
C
C      LUOUT       LUDIA        LUIN
C
C**********************************************************************
      use interface_to_mpi
#include "implicit.h"
#include "infpar.h"
#include "parluci.h"
*
* =====
* Input
* =====
*
* LUOUT : File to contain output vector
* LUDIA : File containing diagonal of H0
* LUIN  : File containing input vector
* SHIFT : constant ADDED to diagonal
*
* ======
* Output
* ======
*
* LUOUT : contains output vector, not rewinded
*
* =======
* Scratch
* =======
*
* VEC1,VEC2 : Must each be able to hold largest segment of vector
C
      DIMENSION VEC1(*),VEC2(*)
      DIMENSION LBATCH(*), LEBATCH(*), I1BATCH(*), IBATCH(8,*)
      DIMENSION LUINLIST(*), LUOUTLIST(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUDIA
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN, IOFFSET_IN_LUDIA
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_SCRATCH
      INTEGER NUM_BLK
C
C     initialize scratch offsets
      NUM_BLK           = 0
      IOFFSET_SCRATCH   = 0
      IOFFSET_IN_LUIN   = 0
      IOFFSET_IN_LUDIA  = 0
      IOFFSET_LUOUT     = 0
      IOFFSET_INT_IN    = 0
      IOFFSET_INT_LUOUT = 0
C
      DO ISBATCH = 1, NBATCH
C
C       offset for batch ISBATCH w.r.t JOFF
C
        CALL DZERO(VEC2,LEBATCH(ISBATCH))
C
        CALL DZERO(VEC1,LEBATCH(ISBATCH))
C
C       set new offset
C
        IOFFSET_IN_LUIN = MY_IOFF_LUIN + IOFFSET_SCRATCH
        IOFFSET_INT_IN  = 1 + NUM_BLK 
C
CSK     WRITE(LUWRT,*) 'This is my OFFSET for LUIN',
CSK  &                  IOFFSET_IN_LUIN
C
        CALL RDVEC_BATCH_DRV4(LUIN,VEC2,LBATCH(ISBATCH),
     &                        IBATCH(1,I1BATCH(ISBATCH)),
     &                        IOFFSET_IN_LUIN,IOFFSET_INT_IN,
     &                        LUINLIST,NUM_ACTIVE_BATCH)
C
CSK     WRITE(LUWRT,*) 'initial VEC2 on LUIN'
CSK     CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                LUWRT)
C
        IOFFSET_IN_LUDIA = MY_IOFF_LUDIA + IOFFSET_SCRATCH
C
CSK     WRITE(LUWRT,*) 'This is my OFFSET for LUDIA',
CSK  &                  IOFFSET_IN_LUDIA
C
C       read in batch ISBATCH from LUDIA to VEC1
C
        CALL RDVEC_BATCH_DRV5(LUDIA,VEC1,LBATCH(ISBATCH),
     &                        IBATCH(1,I1BATCH(ISBATCH)),
     &                        IOFFSET_IN_LUDIA)
C
CSK     WRITE(LUWRT,*) 'initial VEC1 on LUDIA'
CSK     CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                LUWRT)
C
C       calculate inverse diagonal on VEC1
C
        ILEN_BATCH =  0
        ILEN_BATCH =  LEBATCH(ISBATCH)
C
        IF( ILEN_BATCH .gt. 0 )THEN
           IF( CSCREEN) THEN
C             set proper truncation factor
              THR_TRUNC  = TRUNC_FAC * RNORM_FAC
              THR_ETRUNC = 1.0D-7 * THRES_E
Chj           14-jun-07:   disable THR_ETRUNC
!             THR_ETRUNC = -1.0D0
!testprint    WRITE(LUWRT,*) 'TRUNCATION FACTORS:',THR_TRUNC,THR_ETRUNC
              CALL DIAVC2_TRUNC(VEC1,VEC2,VEC1,SHIFT,LEBATCH(ISBATCH),
     &                          THR_TRUNC,THR_ETRUNC)
           ELSE
             CALL DIAVC2(VEC1,VEC2,VEC1,SHIFT,LEBATCH(ISBATCH))
           END IF
        END IF
CSK          WRITE(LUWRT,*) 'final VEC1'
CSK          WRITE(LUWRT,*) '  (D-E)-1 *( HX - EX ) '
CSK          WRITE(LUWRT,*) 'final VEC2'
CSK          CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK     &                  LUWRT)
C
C       write VEC1 to LUOUT
C
        IOFFSET_LUOUT     = MY_IOFF_LUOUT  + IOFFSET_SCRATCH
        IOFFSET_INT_LUOUT = 1 + NUM_BLK
C
        CALL WTVEC_BATCH_DRV4(LUOUT,VEC1,LBATCH(ISBATCH),
     &                       IBATCH(1,I1BATCH(ISBATCH)),
     &                       IOFFSET_LUOUT,IOFFSET_INT_LUOUT,
     &                       LUOUTLIST,NUM_ACTIVE_BATCH)
C
C       keep track of correct offset
        IOFFSET_SCRATCH = IOFFSET_SCRATCH + LEBATCH(ISBATCH)
        NUM_BLK         = NUM_BLK + NUM_ACTIVE_BATCH
C
      END DO
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE COPVCD_PP_CC_B(LUIN,LUOUT,SCR,NBATCH,LBATCH,LEBATCH,
     &                         I1BATCH,IBATCH,MY_IOFF_LUIN,
     &                         MY_IOFF_LUOUT,
     &                         LUINLIST,LUOUTLIST,IBLOCKL,JOFF)
C
C     Written by  S. Knecht         - May 18 2007
C
C**********************************************************************
C
C     copy c-vector from file LUIN to LUOUT batchwise
C     update the file lists.
C
C     NOTE: JOFF = (IVEC resp. IROOT) - 1
C
C
C     copy vector from file LUIN to file LUOUT --> (hint LUIN, LUOUT)
C
C     active blocks on the MPI-files are flagged by a nonzero length
C
C     Last revision:     S. Knecht       - June  2007
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION SCR(*),LBATCH(*), LEBATCH(*), I1BATCH(*), IBATCH(8,*)
      DIMENSION LUINLIST(*), LUOUTLIST(*), IBLOCKL(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN, IOFFSET_OUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_SCRATCH
      INTEGER(KIND=df_mpi_offset_kind) ISCRATCH_SP
      INTEGER(KIND=df_mpi_offset_kind) ILEN_COMB
      INTEGER NUM_BLK
C
      I_RUN_CPLX = 1
C     some constants
      IONE = 1
      ITWO = 2
C     initialize scratch offsets
      NUM_BLK = 0
      MY_NUM_BLK = 0
      IOFFSET_SCRATCH = 0
      IOFFSET_IN  = 0
      IOFFSET_OUT  = 0
      IOFFSET_INT_IN  = 0
      IOFFSET_INT_OUT  = 0
      ILEN_COMB = 0
      ILEN_COMB = L_COMBI
      MY_IOFFSET_SCRATCH = 0
      ISCRATCH_SP = 0
      NUM_BLK_SP = 0
C
C
C     ================
C       REAL VECTOR
C     ================
C
C     position in file is at the end of vector JOFF
C
C     note: real part: --> MY_VEC2_IOFF, MY_ACT_BLK2
C
C
      DO ISBATCH = 1, NBATCH
C
C       offset for batch ISBATCH w.r.t JOFF
C
        CALL DZERO(SCR,LEBATCH(ISBATCH))
C
        IOFFSET_IN = MY_IOFF_LUIN + ( MY_VEC2_IOFF * JOFF ) +
     &               MY_IOFFSET_SCRATCH
        IOFFSET_INT_IN = ( MY_ACT_BLK2 * JOFF ) + MY_NUM_BLK + 1
C
C       write active block array for LUOUT 
C
        IOFFSET_INT_OUT = NUM_BLK + 1
C
        NUM_BLK_SP  = 0
        ISCRATCH_SP = 0
C
csk     WRITE(LUWRT,*) ' THIS IS MY IOFFSET_IN',IOFFSET_IN
csk     WRITE(LUWRT,*) ' THIS IS MY IOFFSET_INT_IN',IOFFSET_INT_IN
csk     WRITE(LUWRT,*) ' THIS IS MY IOFFSET_INT_OUT',IOFFSET_INT_OUT
C
        CALL RDVEC_BATCH_DRV3(LUIN,SCR,LBATCH(ISBATCH),
     &                       IBATCH(1,I1BATCH(ISBATCH)),
     &                       IOFFSET_IN,IOFFSET_INT_IN,
     &                       IOFFSET_INT_OUT,LUINLIST,
     &                       LUOUTLIST,NUM_BLK_SP)
C
        IOFFSET_OUT = MY_IOFF_LUOUT + IOFFSET_SCRATCH
     
csk     WRITE(LUWRT,*) ' SCR content is'
csk     CALL WRTMATMN(SCR,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),LUWRT)
csk     WRITE(LUWRT,*) ' THIS IS MY IOFFSET_OUT',IOFFSET_OUT
        
        IOFFSET_INT_OUT = NUM_BLK + 1
C
        CALL WTVEC_BATCH_DRV3(SCR,LBATCH(ISBATCH),
     &                        IBATCH(1,I1BATCH(ISBATCH)),
     &                        LUOUT,IOFFSET_OUT,IOFFSET_INT_OUT,
     &                        LUOUTLIST,1,ISCRATCH_SP,
     &                        IBLOCKL)
C
C
C       count the length of the last copy
C
C       LUOUT
        IOFFSET_SCRATCH = IOFFSET_SCRATCH + ISCRATCH_SP
        NUM_BLK         = NUM_BLK + LBATCH(ISBATCH)
C       LUIN
        MY_NUM_BLK         = MY_NUM_BLK + NUM_BLK_SP
        MY_IOFFSET_SCRATCH = MY_IOFFSET_SCRATCH + LEBATCH(ISBATCH)
C
      END DO
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE RDVEC_BATCH_DRV3(LUIN,SCR,NBATCH_BLK,NBATCH_INF,
     &                            IOFFSET,
     &                            IOFFSET_INT_IN,IOFFSET_INT_OUT,
     &                            IVCOFF_IN,IVCOFF_OUT,
     &                            ICOUNT_ACT)
C
C     Written by  S. Knecht         -      May 21 2007
C
C**********************************************************************
C
C     read in a batch from a MPI-file LUIN to SCR - C-vector routine
C
C     active blocks on the MPI-file are flagged by a nonzero length
C
C     Last revision:     S. Knecht       - May  2007
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
      INTEGER ISTAT(df_mpi_status_size)
#include "parluci.h"
      DIMENSION SCR(*), NBATCH_INF(8,*) 
      DIMENSION IVCOFF_IN(*), IVCOFF_OUT(*)
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET
      INTEGER JOFF
      JOFF = 0
C
        DO IBLK = 1, NBATCH_BLK
C
C         check for need
C
          ILEN2 = NBATCH_INF(8,IBLK) 
C
          IF( ILEN2 .gt. 0 ) THEN
C
C           count the active blocks for this batch
C
            ICOUNT_ACT = ICOUNT_ACT + 1
C
            IF( IVCOFF_IN( IOFFSET_INT_IN + ICOUNT_ACT - 1 ) .gt.0)THEN
C
C             write ILEN2 into file array for LUOUT
C
              IVCOFF_OUT( IOFFSET_INT_OUT + IBLK - 1 ) = ILEN2
C
C             memory offset
C
              JOFF = NBATCH_INF(6,IBLK)
C
CSK          WRITE(LUWRT,*) ' JOFF for read in IBLK at offset',JOFF, IBLK,
CSK     &                 IOFFSET
C
C           read vector
C
              call interface_mpi_FILE_READ_AT_r(LUIN,IOFFSET,SCR(JOFF),
     &                              ILEN2,ISTAT)
C
CSK              WRITE(LUWRT,'(2X,A,1X,I12,1X,A,1X,I6,1X,I6)') 
CSK     & 'Read-in from ILU1 at',IOFFSET,'for block',IOFFSET_INT_OUT +
CSK     &  IBLK - 1, JOFF
CSK              CALL WRTMATMN(SCR(JOFF),1,ILEN2,1,ILEN2,LUWRT)
C
            END IF 
C           ^ IVCOFF_IN .gt. 0 ?
          END IF
C         ^ ILEN2 .gt. 0 ?
C
          IOFFSET = IOFFSET + ILEN2
C
C
        ENDDO
C       ^ loop over blocks in batch
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE WTVEC_BATCH_DRV3(SCR,NBATCH_BLK,NBATCH_INF,
     &                            LUOUT,IOFFSET,IOFFSET_INT,
     &                            IVCOFF_OUT,NO_CHECK,ISCRATCH_SP,
     &                            IBLOCKL)
C
C     Written by  S. Knecht         -      May 21 2007
C
C**********************************************************************
C
C
C     write batch from SCR to MPI-file LUOUT - C-vector routine
C
C     active blocks on the MPI-file are flagged by a nonzero length
C
C     Last revision:     S. Knecht       - May  2007
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
      INTEGER ISTAT(df_mpi_status_size)
#include "parluci.h"
      DIMENSION SCR(*), NBATCH_INF(8,*), IVCOFF_OUT(*)
      DIMENSION IBLOCKL(*)
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET, ISCRATCH_SP
      INTEGER JOFF, LENGTH
C
      JOFF   = 0
      LENGTH = 0
CSK      WRITE(LUWRT,*) 'calculating length start block is', IOFFSET_INT
C
C     loop over all blocks in that batch, do a norm check
C
      DO IBLK = 1, NBATCH_BLK
C
        JOFF = NBATCH_INF(6,IBLK)
csk     WRITE(LUWRT,*) 'calculating length for block',
csk  &                  IOFFSET_INT + IBLK - 1
        LENGTH = IBLOCKL( IOFFSET_INT + IBLK - 1 )
C       check norm of vector
        IF( NO_CHECK .eq. 0 ) THEN
          XXX = 0.0D0
          XXX = DDOT(LENGTH,SCR(JOFF),1,SCR(JOFF),1)
C
          IF( XXX .eq. 0.0D0 ) THEN
            IVCOFF_OUT( IOFFSET_INT + IBLK - 1 ) = 0
            GOTO 100
          END IF
        ELSE
C
          IF( IVCOFF_OUT( IOFFSET_INT + IBLK - 1 ) .eq. 0 ) GOTO 100
C
        END IF
C
csk     WRITE(LUWRT,'(2X,A,1X,I6,1X,A,1X,I14,1X,I6)') 'THIS IS block',
csk  &       IOFFSET_INT + IBLK - 1,'to go on pos (JOFF)',IOFFSET,JOFF
csk       CALL WRTMATMN(SCR(JOFF),1,LENGTH,1,LENGTH,LUWRT)
C
          call interface_mpi_FILE_WRITE_AT_r(LUOUT,IOFFSET,SCR(JOFF),
     &                           LENGTH,ISTAT)
C
C
 100    CONTINUE  ! skip zero blocks on file
        IOFFSET = IOFFSET + LENGTH
        ISCRATCH_SP = ISCRATCH_SP + LENGTH
C
      END DO
C     ^ loop over blocks in a batch
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE COPVCD_PAR_BDRIV5_REL(LUIN,LUOUT,SEGMNT,IBLOCKD,
     &                                 ISCALFAC,ISCALFAC_GROUP,
     &                                 IBLOCKL,NBLOCK,JCOMM,
     &                                 IGROUPLIST,IPROCLIST,IRILP)
C
C     Read blocks of the vector from disc file LUIN to array segment
C     and broadcast to nodes that need the block, save on diskfile luout 
C     if necessary. 
C     communication via communicator jcomm.
C
C     OUTPUT: update of LUOUT with new TTSS-c-blocks, 
C             modified ISCALFAC_GROUP.
C
C     based on the corresponding spinfree-routine
C
C     extended for an imaginary part of the c-vector
C     if NZ == IRILP == 2           S. Knecht - June 27 2007
C
C     Written by  S. Knecht         - May 23 2007
C
C**********************************************************************
      use interface_to_mpi
      IMPLICIT REAL*8(A-H,O-Z)
#include "infpar.h"
      INTEGER ISTAT(df_mpi_status_size)
#include "parluci.h"
      DIMENSION SEGMNT(*), IBLOCKD(NBLOCK), IBLOCKL(NBLOCK)
      DIMENSION ISCALFAC(*), ISCALFAC_GROUP(*)
      DIMENSION IGROUPLIST(NMPROC),IPROCLIST(NMPROC)
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN, IOFFSET_OUT
      INTEGER(KIND=df_mpi_offset_kind) IOFF_SCRATCH_P
      INTEGER IGROUPBLK, ROOTPROC, IOFF_BLOCK
      CHARACTER*12 WALLTID, SECTID
C
      bcast_time = 0.0D0
      read_write = 0.0D0
C
      IBI_DIST_NBLOCKS = 0
      IBI_RECV_NBLOCKS = 0
C
      IOFF_BLOCK     = 0
      IOFF_SCRATCH_P = 0
csk   WRITE(LUWRT,*) ' MY GROUPLIST resp PROCLIST'
csk   CALL IWRTMAMN(IGROUPLIST,1,NMPROC,1,NMPROC,LUWRT)
csk   CALL IWRTMAMN(IPROCLIST,1,NMPROC,1,NMPROC,LUWRT)
C   
C     =========
C     REAL PART
C     =========
C
      ROOTPROC   = 0
C
C     file offset for each cpu is 0 - real part
C
      IOFFSET_IN  = 0
      IOFFSET_OUT = 0
C
C     LOOP OVER BLOCKS
C
      DO IBLK = 1, NBLOCK 
C
C       get length of block
C
        LBL = IBLOCKL(IBLK)
C
C       check if block is active
        IF( ISCALFAC(IBLK) .ne. 0 ) THEN
C
C         check if block belongs to own group
          IGROUPBLK = 0
          ISCRNODE = IBLOCKD(IBLK)
C
          DO IPROC = 1, NEWCOMM_PROC
            IF( ISCRNODE .eq. IGROUPLIST(IPROC) ) IGROUPBLK = 1
          END DO
          IF( IGROUPBLK .eq. 1 ) THEN
C
csk         WRITE(6,*) 'ACTIVE BLOCK ',IBLK
csk         WRITE(6,*) 'START READING AT IOFFSET_IN = ',IOFFSET_IN
            xrw_time = interface_MPI_WTIME()
            call interface_mpi_FILE_READ_AT_r(LUIN,IOFFSET_IN,SEGMNT,
     &                            lbl,ISTAT)
csk         WRITE(6,*) 'MY read-in result '
csk         CALL WRTMATMN(SEGMNT,1,LBL,1,LBL,6)
            read_write = read_write + interface_MPI_WTIME() - xrw_time
            IBI_DIST_NBLOCKS = IBI_DIST_NBLOCKS + 1
C
          END IF
C
C         find the sending root processor
csk       write(6,*) ' processor list:myproc,mynew_id,icomm_id',
csk  &    myproc,mynew_id,icomm_id
csk       CALL IWRTMAMN(IPROCLIST,1,NMPROC,1,NMPROC,6)
csk       call interface_mpi_BARRIER(ICOMM,IERR)
          ROOTPROC = IPROCLIST(ISCRNODE+1) - 1
csk       write(6,*)' ROOT proc should send: ICOMM_ID, ROOTPROC,MYPROC',
csk  &    ICOMM_ID,ROOTPROC,MYPROC
C
C         decide whether it is necessary to take part in communication
          NNCOLOR = 0
          NNKEY = 0
          NNKEY = MYPROC + 1
          IF( MYPROC .eq. ROOTPROC ) NNKEY = 0
csk       WRITE(6,*) ' MYPROC,ROOTPROC,NNKEY',MYPROC,ROOTPROC,NNKEY
          IF( ISCALFAC_GROUP(IBLK) .ne. 0 ) THEN
            NNCOLOR = 7
          ELSE
            NNCOLOR = 8
          END IF
          call interface_mpi_COMM_SPLIT(JCOMM,NNCOLOR,NNKEY,IBLOCKCOMM)
C
          IF( NNCOLOR .eq. 8 ) GOTO 200
C         broadcast the nonvanishing block
          xcast_time = interface_MPI_WTIME()
          call interface_mpi_bcast_r1_work_f77(SEGMNT,LBL,0,IBLOCKCOMM)
          bcast_time = bcast_time + interface_MPI_WTIME() - xcast_time
C         transfer the recieved block to disk
csk       WRITE(LUWRT,*)'THIS IS WHAT I GOT',ICOMM_ID
csk       CALL WRTMATMN(SEGMNT,1,LBL,1,LBL,LUWRT)
csk       WRITE(LUWRT,*) 'IOFFSET_OUT = ',IOFFSET_OUT
          xrw_time = interface_MPI_WTIME()
          call interface_mpi_FILE_WRITE_AT_r(LUOUT,IOFFSET_OUT,SEGMNT,
     &                           lbl,ISTAT)
          read_write = read_write + interface_MPI_WTIME() - xrw_time
          IBI_RECV_NBLOCKS = IBI_RECV_NBLOCKS + 1
C
 200      CONTINUE
          call interface_mpi_COMM_FREE(IBLOCKCOMM)
C
        ELSE 
C         set local iscalfac to 0
          ISCALFAC_GROUP(IBLK) = 0
        END IF
C
C       keep track of correct offset
        IOFFSET_OUT = IOFFSET_OUT + LBL
        IOFFSET_IN  = IOFFSET_IN  + LBL
C
      END DO
C     ^ loop over blocks - real part
C
C
      IF( IRILP .eq. 1 ) GOTO 1001
C
C     =========
C     IMAG PART
C     =========
C
      ROOTPROC = 0
C
C     pointer for file offset: skip real part of the c-vector
      IOFF_SCRATCH_P = L_COMBI
C
      IOFFSET_IN  = IOFF_SCRATCH_P
      IOFFSET_OUT = IOFF_SCRATCH_P
C     set correct block counter for imag part
      IOFF_BLOCK = NBLOCK
CSK      WRITE(LUWRT,*) 'IOFF_BLOCK is',IOFF_BLOCK
C
C     LOOP OVER BLOCKS
      DO IBLK = 1, NBLOCK 
C
C       get length of block
C
        LBL = IBLOCKL( IBLK )
C
C       check if block is active
C
        IF( ISCALFAC( IBLK + IOFF_BLOCK ) .ne. 0 ) THEN
C
CSK          WRITE(LUWRT,*) 'ACTIVE BLOCK imag part',IBLK, 
CSK     &                    IBLK + IOFF_BLOCK
C         check if block belongs to own group
C
          IGROUPBLK = 0
          ISCRNODE = IBLOCKD( IBLK )
C
          DO IPROC = 1, NEWCOMM_PROC
            IF( ISCRNODE .eq. IGROUPLIST( IPROC ) ) IGROUPBLK = 1
          END DO
          IF( IGROUPBLK .eq. 1 ) THEN
C
CSK            WRITE(LUWRT,*) 'START READING AT IOFFSET_IN imag part = ',
CSK     &                      IOFFSET_IN
            xrw_time = interface_MPI_WTIME()
            call interface_mpi_FILE_READ_AT_r(LUIN,IOFFSET_IN,SEGMNT,
     &                            lbl,ISTAT)
            read_write = read_write + interface_MPI_WTIME() - xrw_time
            IBI_DIST_NBLOCKS = IBI_DIST_NBLOCKS + 1
C
          END IF
C
C         find the sending root processor
CSK          write(LUWRT,*) ' processor list:myproc,mynew_id,icomm_id',
CSK     &    myproc,mynew_id,icomm_id
CSK          CALL IWRTMAMN(IPROCLIST,1,NMPROC,1,NMPROC,LUWRT)
CSK          call interface_mpi_BARRIER(ICOMM,IERR)
          ROOTPROC = IPROCLIST(ISCRNODE+1) - 1
CSK          write(LUWRT,*)' ROOT proc should send: ICOMM_ID, ROOTPROC',
CSK     &    ICOMM_ID,ROOTPROC
C
C         decide whether it is necessary to take part in communication
C
          NNCOLOR = 0
          NNKEY = 0
          NNKEY = MYPROC + 1
          IF( MYPROC .eq. ROOTPROC ) NNKEY = 0
          IF( ISCALFAC_GROUP( IBLK + IOFF_BLOCK ) .ne. 0 ) THEN
            NNCOLOR = 7
          ELSE
            NNCOLOR = 8
          END IF
          call interface_mpi_COMM_SPLIT(JCOMM,NNCOLOR,NNKEY,IBLOCKCOMM)
C
          IF( NNCOLOR .eq. 8 ) GOTO 300
C         broadcast the nonvanishing block
          xcast_time = interface_MPI_WTIME()
          call interface_mpi_bcast_r1_work_f77(SEGMNT,LBL,0,IBLOCKCOMM)
          bcast_time = bcast_time + interface_MPI_WTIME() - xcast_time
C         transfer the recieved block to disk if necessary
CSK            WRITE(LUWRT,*)'THIS IS WHAT I GOT from ROOTPROC',MYPROC,
CSK     &                     ROOTPROC
CSK            CALL WRTMATMN(SEGMNT,1,LBL,1,LBL,LUWRT)
CSK            WRITE(LUWRT,*) 'IOFFSET_OUT imag part = ',IOFFSET_OUT
CSK            WRITE(LUWRT,*) 'block to write imag part',IBLK,
CSK     &                      IBLK + IOFF_BLOCK
          xrw_time = interface_MPI_WTIME()
          call interface_mpi_FILE_WRITE_AT_r(LUOUT,IOFFSET_OUT,SEGMNT,
     &                           lbl,ISTAT)
          read_write = read_write + interface_MPI_WTIME() - xrw_time
          IBI_RECV_NBLOCKS = IBI_RECV_NBLOCKS + 1
C
 300      CONTINUE
          call interface_mpi_COMM_FREE(IBLOCKCOMM)
C
        ELSE 
C         set local iscalfac to 0
          ISCALFAC_GROUP( IBLK + IOFF_BLOCK ) = 0
        END IF
C
C       keep track of correct offset
        IOFFSET_OUT = IOFFSET_OUT + LBL
        IOFFSET_IN  = IOFFSET_IN  + LBL
C
      END DO
C     ^ loop over blocks - imag part
C
 1001 CONTINUE
C
      IF( TIMING ) THEN
C
C       print statistics
C
        WRITE(LUWRT,'(/A)') 
     &  '               coefficients exchange statistics  '
        WRITE(LUWRT,'(A/)')   
     &  '              __________________________________ '
        WRITE(LUWRT,'(2X,A,1X,I4)')   
     &  ' number of b_i blocks distributed         : ',IBI_DIST_NBLOCKS
        WRITE(LUWRT,'(2X,A,1X,I4)')   
     &  ' number of b_i blocks recieved            : ',IBI_RECV_NBLOCKS
        WALLTID = SECTID(bcast_time)
        WRITE(LUWRT,'(2X,A,1X,A)')   
     &  ' time spent in b_i communication (bcast)  : ', WALLTID
        WALLTID = SECTID(read_write)
        WRITE(LUWRT,'(2X,A,1X,A/)')   
     &  ' time spent in saving coefficients on disk: ', WALLTID
C
      END IF
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE COP_REST_VEC_REL(VEC1,LUIN,LUOUT,MY_IOFF_LUOUT,
     &                            LUOUTLIST,NPARBLOCK,IBLOCKL,NBLOCKS,
     &                            NVEC,I_RUN_CPLX)
C
C     Written by  S. Knecht         - June 19 2007
C
C**********************************************************************
C
C     copy vector from sequential file LUIN to MPI-file LUOUT.
C     Master sends blocks to Node according to NPARBLOCK.
C
C     NOTE: NVEC = NVEC
C
C     active blocks on the MPI-file are flagged by a nonzero list entry
C
C     Last revision:     S. Knecht       - June  2007
C
************************************************************************
      use interface_to_mpi
#include "implicit.h"
#include "infpar.h"
      INTEGER ISTAT(df_mpi_status_size)
#include "parluci.h"
      DIMENSION VEC1(*)
      DIMENSION LUOUTLIST(*), NPARBLOCK(*), IBLOCKL(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_SCRATCH
      INTEGER NUM_BLK
C
C     initialize scratch offsets
      NUM_BLK           = 0
      IOFFSET_SCRATCH   = 0
      IOFFSET_LUOUT     = 0
      IOFFSET_INT_LUOUT = 0
C     some constants
      NNULL = 0
      IONE  = 1
C
C     WRITE(LUWRT,*) ' NPARBLOCK in copy routine'
C     CALL IWRTMAMN(NPARBLOCK,1,NUM_BLOCKS,1,NUM_BLOCKS,LUWRT)
C     WRITE(LUWRT,*) ' IBLOCKL in copy routine'
C     CALL IWRTMAMN(IBLOCKL,1,NUM_BLOCKS,1,NUM_BLOCKS,LUWRT)
C    
      ILEN = 0
C
C     loop over vectors
C
      DO IVEC = 1, NVEC
C
        IOFFSET_SCRATCH = 0
        NUM_BLK = 0
C
C       ================
C        COMPLEX VECTOR
C       ================
C
C       .................
C        REAL PART FIRST
C       .................
C
C       set new offset
C
C       position in file is at the end of vector IVEC - 1
C
C       note: real part: --> MY_VEC2_IOFF and MY_ACT_BLK2
C
        IOFFSET_LUOUT      = MY_IOFF_LUOUT + IOFFSET_SCRATCH +
     &                       ( IVEC - 1 )  * MY_VEC2_IOFF
C
        IOFFSET_INT_LUOUT  = 1 + NUM_BLK  +
     &                       ( IVEC - 1 ) * MY_ACT_BLK2
C
C       loop over blocks
C
        DO IBLK = 1, NBLOCKS
C
          ILEN = IBLOCKL( IBLK )
csk          WRITE(LUWRT,*) ' ILEN of IBLK',ILEN,IBLK
          IF( ILEN .GT. 0 ) THEN
C
C           get number of CPU that will recieve the block
            I_NEED_BLK = NPARBLOCK( IBLK )
csk            WRITE(LUWRT,*) ' I_NEED_BLK of IBLK',I_NEED_BLK,IBLK
C
            IF( MYPROC .ne. MASTER .AND. MYPROC .NE. I_NEED_BLK ) 
     &         GOTO 500
C
C           slaves first...
C
            IF( MYPROC .ne. MASTER ) THEN
C
              NUM_BLK = NUM_BLK + 1
              IMZERO = 0
C
              call interface_mpi_RECV(IMZERO,1,MASTER,90,
     &                      global_communicator)
C
              IF( IMZERO .eq. 0 ) THEN
C
C               recieve block and transfer to file
                call interface_mpi_recv_r1_work_f77(VEC1,ILEN,MASTER,91,
     &                        global_communicator)
                LUOUTLIST( IOFFSET_INT_LUOUT + NUM_BLK - 1 ) = IONE
!               WRITE(6,*) ' S will write this to disk for block',
!    &          IOFFSET_LUOUT, IBLK
!               CALL WRTMATMN(VEC1,1,ILEN,1,ILEN,6)
                call interface_mpi_FILE_WRITE_AT_r(LUOUT,IOFFSET_LUOUT,
     &               VEC1,ILEN,ISTAT)
C
              ELSE
C
C               mark block as zero block
                LUOUTLIST( IOFFSET_INT_LUOUT + NUM_BLK - 1 ) = NNULL
C
              END IF
C
C             keep track of correct offset
              IOFFSET_SCRATCH = ILEN
              IOFFSET_LUOUT = IOFFSET_LUOUT + IOFFSET_SCRATCH
C
C           MASTER follows...
C
            ELSE
C
C             read block in core and send to CPU I_NEED_BLK if necessary
              IMZERO = 0
              NO_ZEROING = 1
              CALL IFRMDS(LBL,1,-1,LUIN)
              IF( LBL .ne. ILEN ) THEN
                WRITE(LUWRT,*) '  Error in restart routine detected, I
     & quit'
                WRITE(LUWRT,*) '  Different blocksizes!!! '
                CALL Abend2('Error in COP_REST_VEC_REL detected!')
              END IF
              CALL FRMDSC2(VEC1,ILEN,-1,LUIN,IMZERO,IAMPACK,
     &                     NO_ZEROING)
              IF( MYPROC .ne. I_NEED_BLK )THEN
                call interface_mpi_SEND(IMZERO,1,I_NEED_BLK,90,
     &                        global_communicator)
                IF( IMZERO .eq. 0 ) THEN
                  call interface_mpi_send_r1_work_f77(VEC1,ILEN,
     &                          I_NEED_BLK,91,global_communicator)
                END IF
              ELSE
C               count active blocks
                NUM_BLK = NUM_BLK + 1
C
                IF( IMZERO .eq. 0 ) THEN
C
!                 WRITE(LUWRT,*) ' M will write this to disk for block',
!    &            IOFFSET_LUOUT, IBLK
!                 CALL WRTMATMN(VEC1,1,ILEN,1,ILEN,LUWRT)
C
                 call interface_mpi_FILE_WRITE_AT_r(LUOUT,IOFFSET_LUOUT,
     &                VEC1,ILEN,ISTAT)
                 LUOUTLIST( IOFFSET_INT_LUOUT + NUM_BLK - 1 ) = IONE
                ELSE
                  LUOUTLIST( IOFFSET_INT_LUOUT + NUM_BLK - 1 ) = NNULL
                END IF 
C               keep track of correct offset
                IOFFSET_SCRATCH = ILEN
                IOFFSET_LUOUT = IOFFSET_LUOUT + IOFFSET_SCRATCH
C
              END IF
C             ^ needs send?
C
            END IF
C           ^ slave or master
         ELSE
C          skip zero block on disk
           IF( MYPROC .EQ. MASTER ) THEN
             CALL IFRMDS(LBL,1,-1,LUIN)
             NO_ZEROING = 1
             CALL FRMDSC2(VEC1,LBL,-1,LUIN,IMZERO,IAMPACK,
     &                    NO_ZEROING)
           END IF
         END IF
C        ^ ILEN > 0?
 500     CONTINUE
C
        END DO
C       ^ loop over blocks
C
C       ..............
C        COMPLEX PART
C       ..............
C
        IOFFSET_SCRATCH = 0
        NUM_BLK = 0
C
        IF( I_RUN_CPLX .eq. 2 ) THEN
C
          IF( MYPROC .eq. MASTER )THEN
C           skip marker on file 
            CALL IFRMDS(LBL,1,-1,LUIN)
C
          END IF
C
          IOFFSET_SCRATCH = 0
          NUM_BLK = 0
C
C         new offset for writing on LUOUT - complex part
C
          IOFFSET_LUOUT      =  MY_IOFF_LUOUT + IOFFSET_SCRATCH +
     &                          ( IVEC - 1 )  * MY_VEC2_IOFF +
     &                                          MY_VEC1_IOFF
C
          IOFFSET_INT_LUOUT  = 1 + NUM_BLK  +
     &                         ( IVEC - 1 ) * MY_ACT_BLK2 + MY_ACT_BLK1
C
C         loop over blocks
C
          DO IBLK = 1, NBLOCKS
C
            ILEN = IBLOCKL( IBLK )
CSK            WRITE(LUWRT,*) ' ILEN of IBLK',ILEN,IBLK
            IF( ILEN .GT. 0 ) THEN
C
C             get number of CPU that will recieve the block
              I_NEED_BLK = NPARBLOCK( IBLK )
CSK              WRITE(LUWRT,*) ' I_NEED_BLK of IBLK',I_NEED_BLK,IBLK
C
              IF( MYPROC .ne. MASTER .AND. MYPROC .NE. I_NEED_BLK ) 
     &           GOTO 600
C
C           slaves first...
C
              IF( MYPROC .ne. MASTER ) THEN
C
                NUM_BLK = NUM_BLK + 1
                IMZERO = 0
C
                call interface_mpi_RECV(IMZERO,1,MASTER,93,
     &                        global_communicator)
C
                IF( IMZERO .eq. 0 ) THEN
C
C                 recieve block and transfer to file
                  call interface_mpi_recv_r1_work_f77(VEC1,ILEN,
     &                          MASTER,94,global_communicator)
                  LUOUTLIST( IOFFSET_INT_LUOUT + NUM_BLK - 1 ) = IONE
                  call interface_mpi_FILE_WRITE_AT_r(LUOUT,
     &                                   IOFFSET_LUOUT,VEC1,ILEN,ISTAT)
C
                ELSE
C
C                 mark block as zero block
                  LUOUTLIST( IOFFSET_INT_LUOUT + NUM_BLK - 1 ) = NNULL
C
                END IF
C
C               keep track of correct offset
                IOFFSET_SCRATCH = ILEN
                IOFFSET_LUOUT = IOFFSET_LUOUT + IOFFSET_SCRATCH
C
C             MASTER follows...
C
              ELSE
C
C               read block in core and send to CPU I_NEED_BLK if necessary
                IMZERO = 0
                NO_ZEROING = 1
                CALL IFRMDS(LBL,1,-1,LUIN)
                IF( LBL .ne. ILEN ) THEN
                  WRITE(LUWRT,*) '  Error in restart routine detected, I
     & quit'
                  WRITE(LUWRT,*) '  Different blocksizes!!! '
                  CALL Abend2('Error in COP_REST_VEC_REL detected!')
                END IF
                CALL FRMDSC2(VEC1,ILEN,-1,LUIN,IMZERO,IAMPACK,
     &                       NO_ZEROING)
                IF( MYPROC .ne. I_NEED_BLK )THEN
                  call interface_mpi_SEND(IMZERO,1,I_NEED_BLK,93,
     &                          global_communicator)
                  IF( IMZERO .eq. 0 ) THEN
                    call interface_mpi_send_r1_work_f77(VEC1,ILEN,
     &                            I_NEED_BLK,94,global_communicator)
                  END IF
                ELSE
C                 count active blocks
                  NUM_BLK = NUM_BLK + 1
C
                  IF( IMZERO .eq. 0 ) THEN
C
                    call interface_mpi_FILE_WRITE_AT_r(LUOUT,
     &                   IOFFSET_LUOUT,VEC1,ILEN,ISTAT)
                    LUOUTLIST( IOFFSET_INT_LUOUT + NUM_BLK - 1 ) = IONE
                  ELSE
                    LUOUTLIST( IOFFSET_INT_LUOUT + NUM_BLK - 1 ) = NNULL
                  END IF 
C                 keep track of correct offset
                  IOFFSET_SCRATCH = ILEN
                  IOFFSET_LUOUT = IOFFSET_LUOUT + IOFFSET_SCRATCH
C
                END IF
C               ^ needs send?
C
              END IF
C             ^ slave or master
           ELSE
C            skip zero block on disk
             IF( MYPROC .EQ. MASTER ) THEN
               CALL IFRMDS(LBL,1,-1,LUIN)
               NO_ZEROING = 1
               CALL FRMDSC2(VEC1,LBL,-1,LUIN,IMZERO,IAMPACK,
     &                      NO_ZEROING)
             END IF
           END IF
C          ^ ILEN > 0?
 600       CONTINUE
C
          END DO
C         ^ loop over blocks
C
        END IF
C       ^ COMPLEX PART NEEDED? 
C
        IF( MYPROC .eq. MASTER ) THEN
C         skip end-of-vector marker on file LUIN
          CALL IFRMDS(LBL,1,-1,LUIN)
        END IF
C
      END DO
C     ^ loop over vectors
csk      WRITE(LUWRT,*) ' MY LUOUTLIST AFTER ALL',MYPROC
csk      CALL IWRTMAMN(LUOUTLIST,1,IALL_LU1,1,IALL_LU1,LUWRT)
C
      END
      subroutine blubb_blubb(communicator)
      implicit none
      integer, intent(inout) :: communicator
      integer                :: bla
      bla = communicator
      end 
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE GROUP_CONSTRUCTOR_REL(IGROUPLIST,IPROCLIST,
     &                                 IPROCLIST_SM)
C
      use interface_to_mpi
      IMPLICIT REAL*8 (A-H,O-Z)
C
#include "infpar.h"
#include "parluci.h"
!#define DEBUG_MPI_KRCI
#ifdef DEBUG_MPI_KRCI
#include "mpif.h"
#endif
C
      DIMENSION IPROCLIST(NMPROC),IGROUPLIST(NMPROC)
      DIMENSION IPROCLIST_SM(NMPROC)
      INTEGER IKEY, ICOLOR, JKEY, JCOLOR, KCOLOR, KKEY
      INTEGER LCOLOR, LKEY
      integer test_size_glb
      ITEST = 00
C     ITEST = 10
C
C     'intra-node' communicator MYNEW_COMM ( I/O communicator )
C
      IKEY       = MYPROC
      ICOLOR     = IPROCLIST(MYPROC+1)
      MYNEW_COMM = -1
C
#ifdef DEBUG_MPI_KRCI
      call mpi_comm_split(global_communicator,ICOLOR,IKEY,MYNEW_COMM,
     &                    ierr)
#else
      call interface_mpi_COMM_SPLIT(global_communicator,ICOLOR,IKEY,
     &                              MYNEW_COMM)
#endif
      call blubb_blubb(MYNEW_COMM)
      IF(ITEST .ge. 10)THEN
        test_size_glb = -1
        call interface_mpi_COMM_SIZE(global_communicator,test_size_glb)
        write(6,*) ' global_communicator: size == ',test_size_glb
        write(6,*) ' global_communicator, MYNEW_COMM, nmproc, myproc',
     &               global_communicator, MYNEW_COMM, nmproc, myproc
        do i = 1, nmproc
          write(6,*) 'IPROCLIST(i) for myproc ==> ',
     &                IPROCLIST(i), myproc
        end do
      END IF
C
C     collect useful information about each group,
C     store on common block
C
      NEWCOMM_PROC = -1
      MYNEW_ID     = -1
#ifdef DEBUG_MPI_KRCI
      call mpi_comm_size(MYNEW_COMM,NEWCOMM_PROC,ierr)
#else
      call interface_mpi_COMM_SIZE(MYNEW_COMM,NEWCOMM_PROC)
#endif
      
      call interface_mpi_COMM_RANK(MYNEW_COMM,MYNEW_ID)

      IF(ITEST .ge. 10)THEN
        WRITE(luwrt,*) ' processor-color-list (input for grouping) '
        call iwrtmamn(IPROCLIST,1,nmproc,1,nmproc,1,luwrt)
        WRITE(6,*) ' intra-node communicator (for co-worker)    :',
     &               MYNEW_COMM,' (',MYPROC,')' 
        WRITE(6,*) ' size of communicator (for co-worker group) :',
     &               NEWCOMM_PROC,' (',ICOLOR,')'
        WRITE(6,*) ' my ID in new communicator (for co-worker)  :',
     &               MYNEW_ID,' (',MYPROC,')'
        WRITE(6,*) ' myproc, color and key       :',MYPROC,ICOLOR,IKEY
      END IF

      if(NEWCOMM_PROC <= 0)then
        write(luwrt,*) '*** error in GROUP_CONSTRUCTOR_REL*** '// 
     &                 'impossible group size: ',NEWCOMM_PROC
        call quit('*** error in GROUP_CONSTRUCTOR_REL*** impossible'//
     &            ' group size <= 0.')
      end if
C
C     T-communicator MYNEW_COMM_SM ( 1st "shared memory" communicator)
C
      KKEY = MYPROC
      IF( IT_SHL .eq. - 1 .or. IT_SHL .eq. - 2 )THEN
        KCOLOR = IPROCLIST_SM( MYPROC + 1 )
      ELSE IF( IT_SHL .eq. 0 )THEN
        KCOLOR = IPROCLIST_SM( MYPROC + 1 )
      ELSE IF( IT_SHL .eq. 1 )THEN
        KCOLOR = 1
      END IF
C
      call interface_mpi_COMM_SPLIT(global_communicator,KCOLOR,KKEY,
     &                    MYNEW_COMM_SM)
C
C       collect useful information about each group,
C       store on common block
C
      NEWCOMM_PROC_SM = 0
      MYNEW_ID_SM     = 0
      call interface_mpi_COMM_SIZE(MYNEW_COMM_SM,NEWCOMM_PROC_SM)
      call interface_mpi_COMM_RANK(MYNEW_COMM_SM,MYNEW_ID_SM)
C
C     C-communicator MYNEW_COMM_SM_C ( 2nd "shared memory" communicator)
C
      LKEY = MYPROC
      IF( IC_SHL .eq. - 1 )THEN
        LCOLOR = IPROCLIST_SM( MYPROC + 1 )
      ELSE IF( IC_SHL .eq. 0 )THEN
        LCOLOR = IPROCLIST_SM( MYPROC + 1 )
      ELSE IF( IC_SHL .eq. 1 )THEN
        LCOLOR = 1
      END IF
C
      call interface_mpi_COMM_SPLIT(global_communicator,LCOLOR,LKEY,
     &                    MYNEW_COMM_SM_C)
C
C       collect useful information about each group,
C       store on common block
C
      NEWCOMM_PROC_SM_C = 0
      MYNEW_ID_SM_C     = 0
      call interface_mpi_COMM_SIZE(MYNEW_COMM_SM_C,NEWCOMM_PROC_SM_C)
      call interface_mpi_COMM_RANK(MYNEW_COMM_SM_C,MYNEW_ID_SM_C)
C
C     set node-master (might be MASTER of all CPU's)
      N_MASTER_SM_C = 0
C
C     inter-node communicator ICOMM
C
      IF( MYNEW_ID .eq. 0 ) THEN
        JKEY = IPROCLIST(MYPROC+1)
        JCOLOR = 2
      ELSE
        JKEY = IPROCLIST(MYPROC+1)
        JCOLOR = 3
      END IF
C
      call interface_mpi_COMM_SPLIT(global_communicator,JCOLOR,JKEY,
     /     ICOMM)
C
C     collect again ...     
C
      ICOMM_ID = 0
      ICOMM_SIZE = 0
      call interface_mpi_COMM_RANK(ICOMM,ICOMM_ID)
      call interface_mpi_COMM_SIZE(ICOMM,ICOMM_SIZE)
C
C     set-up and store group information 
C
      CALL SET_GROUP_TABLE_REL(IGROUPLIST,IPROCLIST,ICOLOR)
C
C     store personal group number
C
      MY_GROUPN = ICOLOR
      IF ( ITEST .ge. 10 )THEN
        WRITE(LUWRT,*) ' '
        WRITE(LUWRT,*) ' OUTPUT FROM GROUP_CONSTRUCTOR_REL'
        WRITE(LUWRT,*) ' '
        WRITE(LUWRT,*) ' size of MYNEW_COMM     :',NEWCOMM_PROC
        WRITE(LUWRT,*) ' size of MYNEW_COMM_SM  :',NEWCOMM_PROC_SM
        WRITE(LUWRT,*) ' size of MYNEW_COMM_SM_C:',NEWCOMM_PROC_SM_C
        WRITE(LUWRT,*) ' size of ICOMM          :',ICOMM_SIZE
      END IF
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE SET_GROUP_TABLE_REL(IGROUPLIST,IPROCLIST,ICOLOR)
C
      use interface_to_mpi
      IMPLICIT REAL*8 (A-H,O-Z)
C
#include "infpar.h"
#include "parluci.h"
C
C     INPUT
      DIMENSION IPROCLIST(NMPROC)
C     OUTPUT
      DIMENSION IGROUPLIST(NMPROC)
C
      INUMB = 1
      DO IPROC = 1, NMPROC
C
        IF( IPROCLIST(IPROC) .eq. ICOLOR ) THEN
C         put in the process tag ( master is 0 )
          IGROUPLIST(INUMB) = IPROC - 1
          INUMB = INUMB + 1
        END IF
C
      END DO
C
      IF( INUMB-1 .gt. NEWCOMM_PROC ) THEN
        WRITE(LUWRT,*) 'Error in SET_GROUP_TABLE_REL: more CPUs assigned
     & as included in this group!'
        WRITE(LUWRT,*) 'assigned CPUs, group size',INUMB-1,NEWCOMM_PROC
        CALL Abend2('Error detected in gp/gplupar.F: 
     &               SET_GROUP_TABLE_REL')
      END IF
C
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE DISTBLKND(NDIM,NBLOCKL,NPARBLOCK,NPARBLKWT)
*
*. Find distribution for (nonvanishing blocks) among the nodes
*. Non vanishing block has a nonzero blocklength
*
      IMPLICIT REAL*8(A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
#include "../lucita/clunit.inc"
*
      DIMENSION NBLOCKL(NDIM), NPARBLKWT(2,NMPROC)
*. Scratch
      INTEGER INONVAN,MAXLBL,LBL,IWEIGHT,LABEL,IBLOCKN,IMINNP
      INTEGER MXSZTMP,MXNUMB,ITEMPL,ITEMPD
      INTEGER*8 ITOTBLL 
*. ======
*. Output
*. ======
*. array NPARBLOCK that contains the final block distribution
      DIMENSION NPARBLOCK(NDIM)
*. 
      MXSIZE = 1000000000
      MXSZTMP = 0
      MXNUMB = 0
      IRUN = 0
      ITEMPW1 = 1000000000
      ITEMPW2 = 0
      ITEMPN = 0
      ITEMPL = 0
      IAMGPROC = 0
      IAMTPROC = 0
      INONVAN = 0
      ITOTBLL = 0
      IMINNP = 0
      MAXLBL = 0 
      LABEL = 0
      MIN2 = -2
      NZERO = 0
      NTEST = 0
      CALL ISETVC(NPARBLOCK,MIN2,NDIM)
      DO 100 II = 1, NDIM
        LBL = NBLOCKL(II)
        IF(LBL.GT.0) THEN
          INONVAN = INONVAN + 1
          NPARBLOCK(II) = -1
          ITOTBLL = ITOTBLL + LBL
        ENDIF
        IF(LBL.GT.MAXLBL) THEN
          MAXLBL = LBL
          LABEL = II
        END IF
 100  CONTINUE
*
*
CSK      NTEST = 10
      IF(NTEST.GT.0) THEN
       IF(MYPROC.EQ.MASTER) THEN
         WRITE(6,*) '  total number of processes to distribute on:',
     &                 NMPROC
         WRITE(6,*) '  total number of active blocks:', INONVAN
         WRITE(6,*) '  overall active block length:  ', ITOTBLL 
         WRITE(6,*) '  largest active block:         ', LABEL
         WRITE(6,*) '  size of largest active block: ', MAXLBL
       END IF
      END IF
*
      IF(INONVAN.LT.NMPROC) THEN
        IMINNP = MIN(INONVAN,NMPROC)
        write(6,'(/a,i6)') '*** error in distblknd:'//
     &    ' number of active blocks lower than the total number of'//
     &    ' processes. please decrease the number of processes to:',
     &    IMINNP
        write(6,'(/a)') ' alternatively change the GAS'//
     &    ' specification (more GA spaces) to increase the number of'//
     &    ' TTSS blocks.***'
        call quit('*** error in distblknd: number of processes >
     & number of active TTSS blocks. the distribution algorithm
     & will therefore fail (see output for more information).***')
        IF(MYPROC.GE.INONVAN) GOTO 1001
      ELSE
        IMINNP = NMPROC
      END IF
*
*. starting the treausure quest for the ?optimal? c-block distribution
*
 200  CONTINUE
*
      IRUN = IRUN + 1
      IF(IRUN.LE.INONVAN) THEN
*
        MXSZTMP = 0
*
        DO 300 II = 1, NDIM
*
          ITEMPL = NBLOCKL(II)
          ITEMPD = NPARBLOCK(II)
          ITEMPN = II
          IF(ITEMPL.GT.0)THEN
            IF(ITEMPD.EQ.-1) THEN
              IF(ITEMPL.LE.MXSIZE)THEN
                IF(ITEMPL.GE.MXSZTMP) THEN
                  MXSZTMP = ITEMPL
                  MXNUMB = II
                END IF
              END IF
            END IF
          END IF
*
300     CONTINUE
*
        DO 400 IPR = 1, IMINNP
*
          ITEMPW2 = NPARBLKWT(2,IPR)
* attention: IAMTPROC = 1 <--> MYPROC = 0 !
          IAMTPROC = IPR
*
          IF(IPR.EQ.1) ITEMPW1 = ITEMPW2
*
          IF(ITEMPW2.LE.ITEMPW1) THEN
            ITEMPW1 = ITEMPW2
            IAMGPROC = IAMTPROC
          END IF
*
 400    CONTINUE
*. now we should have found a proc and a block --> put both together !
*
*. calculation of block MXNUMB by proc IAMGPROC
        IF(NTEST.GT.0) THEN
          IF(MYPROC.EQ.MASTER) THEN
            WRITE(6,*)'calculation of block MXNUMB by proc',MXNUMB,
     &      IAMGPROC-1
          END IF
        ENDIF
*. raising NPARBLKWT(1,proc) by 1
*. adding on NPARBLKWT(2,proc) the weight of the new block
        MXSIZE = MXSZTMP
        NPARBLKWT(1,IAMGPROC) = NPARBLKWT(1,IAMGPROC) + 1
        NPARBLKWT(2,IAMGPROC) = NPARBLKWT(2,IAMGPROC) + MXSZTMP 
        NPARBLOCK(MXNUMB) = IAMGPROC - 1
      ELSE
        GOTO 500
      END IF
*     /\ IRUN !!!
      GOTO 200
*
500   CONTINUE
CSK      IF(NTEST.GT.0) THEN
        IF(MYPROC.EQ.MASTER) THEN
          WRITE(6,'(3X,A,I4)')'SUMMATION OF EVEN DISTRIBUTION OF 
     &    MYPROC:',MYPROC
          DO ISTI = 1, IMINNP
            WRITE(6,'(3X,A,I17,A,I17,A,1X,I17)')
     &      'process',ISTI -1,' calculates',NPARBLKWT(1,ISTI),
     &      ' blocks with a total length of',NPARBLKWT(2,ISTI)
          END DO
        END IF
CSK      END IF
1001  CONTINUE
*      
      END 
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE DISTBLKND_1(NDIM,ICWEIGHTF,NPARBLOCK,NBLKWT,NPARBLKWT,
     &                       NVAR,ICCTOS,IBLOCKL,IPROCLIST,ICWEIGHT,
     &                       IABSOLUTE_WEIGHT)
*
*. Find distribution for (nonvanishing blocks) among the nodes
*. Non vanishing block has a nonzero blocklength
*
      IMPLICIT REAL*8(A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
C     INPUT
      INTEGER ICOUNTABLK, NTEST,MWEIGHT
      INTEGER*8 ITOTBLCKL,MMPINFA
      DIMENSION ICWEIGHTF(NDIM),ICCTOS(NDIM,NDIM), IBLOCKL(NDIM)
      DIMENSION NPARBLOCK(NDIM),NBLKWT(NDIM),MMPINFA(NMPROC)
      DIMENSION MWEIGHT(NMPROC),IPROCLIST(NMPROC),ICWEIGHT(NDIM)
C     scratch
C     MXSIZE: actual maximum size for one of all blocks
C     MXNUMB: number of the current largest block
C     MMPINFA(*,1): total number of blocks for the n-th proc
C     MMPINFA(*,2): total length ('weight') of the blocks
      INTEGER*8 MXSIZE, MXSZTMP, ITEMPL, ITEMPW1, ITEMPW2,ILEN
      INTEGER*8 LBPROC,MXSZTMP2,MXSIZE2,ITEMPL2,IABSOLUTE_WEIGHT
      INTEGER ITEMPN, IAMGPROC, IAMTPROC, MXNUMB, IRUN
      LOGICAL SUCCESS
C     initializtion
      SUCCESS = .FALSE.
      NTEST = 0
      ICOUNTABLK = 0
      MXSIZE = 0
      MXSZTMP = 0
      MXNUMB = 0
      IRUN = 0
      ITEMPW1 = 0
      ITEMPW2 = 0
      ITEMPN = 0
      ITEMPL = 0
      IAMGPROC = 0
      IAMTPROC = 0
      ITOTBLCKL = 0
      DO I = 1, NMPROC
        MMPINFA(I) = 0
        MWEIGHT(I) = 0
      END DO
C
C     count total number of active blocks
C     set need for non-zero block to -1
C
      DO ICBL = 1, NDIM
C
        ILEN = IBLOCKL(ICBL)
        IF(ILEN.NE.0) THEN
           ICOUNTABLK = ICOUNTABLK + 1
           ITOTBLCKL = ITOTBLCKL + ILEN
           NPARBLOCK(ICBL) = -1
        END IF
        IF(ILEN.GE.MXSIZE) MXSIZE = ILEN
C
      END DO
C
      ITEMPW1 = MXSIZE
C
      IF(NTEST.GT.0) THEN
       IF(MYPROC.EQ.0) THEN
         WRITE(6,*) '  total number of processes to distribute on:',
     &                 NMPROC
         WRITE(6,*) '  total number of active blocks:', ICOUNTABLK
         WRITE(6,*) '  overall active block length:', ITOTBLCKL
       END IF
      END IF
*
      
      IAM_NOT_INV =   1
      IMINNP = NMPROC
      NTEMPP = IMINNP
      IF(ICOUNTABLK.LT.NMPROC) THEN
        IMINNP = MIN(ICOUNTABLK,IMINNP)
        write(6,'(/a,i6)') '*** error in distblknd_1:'//
     &    ' number of active blocks lower than the total number of'//
     &    ' processes. please decrease the number of processes to:',
     &    IMINNP
        write(6,'(/a)') ' alternatively change the GAS'//
     &    ' specification (more GA spaces) to increase the number of'//
     &    ' TTSS blocks.***'
        call quit('*** error in distblknd_1: number of processes >
     & number of active TTSS blocks. the distribution algorithm
     & will therefore fail (see output for more information).***')
        NTEMPP = IMINNP
        IF(MYPROC.GE.ICOUNTABLK) THEN
          IAM_NOT_INV =   0
          IMINUS2     = - 2
          CALL ISETVC(NPARBLOCK,IMINUS2,NDIM)
          GOTO 1001
        END IF
      END IF
      IAM_NOT_INV =   1
C
C     maximum number of determinants on a given cpu
C
      LBPROC = (( NVAR -1 ) / (NMPROC)) + 1
C?      LBPROC = (( IABSOLUTE_WEIGHT  - 1 ) / (NTEMPP)) + 1
C?      IF(MYPROC.eq.MASTER) WRITE(6,*)'LBPROC is',LBPROC
C?      IF(MYPROC.eq.MASTER) WRITE(6,*)'NTEMPP is',NTEMPP
C
Csk?      IF(MYPROC.EQ.MASTER) THEN
Csk?        CALL IWRTMA(NPARBLOCK,1,NDIM,1,NDIM)
Csk?      END IF
C
C     find optimal (?) c-block distribution 
C
C
C     start with the 'master group' and with the master
      IGROUP = 1
      IPROC  = 1
      ICPROC = 0
      IASSIGNED_BLK = 0
C
 100  CONTINUE
C
      MXSZTMP = 0
      ICPROC = ICPROC + 1
      IF( ICPROC .gt. NTEMPP ) GOTO 500
      IF( IASSIGNED_BLK .ge. ICOUNTABLK ) GOTO 1001
C     find largest unassigned block:
C     MXNUMB  = block number
C     MXSZTMP = block 'weight'
C
      DO II = 1, NDIM
C
C        ITEMPL = NBLKWT(II)
        ITEMPL = IBLOCKL(II)
        ITEMPD = NPARBLOCK(II)
        ITEMPN = II
C?        IF(MYPROC.eq.MASTER) WRITE(6,*)'ITEMPL for II',ITEMPL,II
C
        IF(ITEMPL.GT.0)THEN
          IF(ITEMPD.EQ.-1) THEN
            IF(ITEMPL.LE.MXSIZE)THEN
              IF(ITEMPL.GT.MXSZTMP) THEN
C?                 IF(MYPROC.eq.MASTER) WRITE(6,*)'MXSZTMP,ITEMPL',
C?     & MXSZTMP,ITEMPL,MXSIZE
                 MXSZTMP = ITEMPL
                 MXNUMB = ITEMPN
              END IF
            END IF
          END IF
        END IF
      END DO
C
 150  CONTINUE
C
C?        IF(MYPROC.eq.MASTER) WRITE(6,*)'assigned in 150',MXNUMB,IPROC
        MMPINFA(IPROC) = MMPINFA(IPROC) + MXSZTMP
        MWEIGHT(IPROC) = MWEIGHT(IPROC) + 1
        NPARBLOCK(MXNUMB) = IPROC - 1
        MXSIZE = MXSZTMP
        IASSIGNED_BLK = IASSIGNED_BLK + 1
C
C     assign all blocks that are connected to MXNUMB to a given CPU
C     as long as MMPINFA <= LBPROC !
C
      SUCCESS = .FALSE.
C
 200  CONTINUE 
C
C     find largest block connected to MXNUMB
C
C     1. find all connections
      JJJ = 0
      DO IJ = 1, NDIM
        IF(ICCTOS(IJ,MXNUMB).NE.0) THEN
           IF( NPARBLOCK(IJ) .eq. -1 ) THEN
             JJJ = JJJ + 1
             ICWEIGHT(JJJ) = IJ
           END IF
        ENDIF
      END DO
C?      IF(MYPROC.eq.MASTER) WRITE(6,*)'JJJ is',JJJ
C
      IRUN = 0
 250  CONTINUE
      MXSIZE2 = MXSIZE
      MXSZTMP = 0
      IF( JJJ .eq. 0 ) GOTO 290
C
C?      IF(MYPROC.eq.MASTER) WRITE(6,*)'current IPROC is',IPROC
C?      IF(MYPROC.eq.MASTER) THEN
C?        WRITE(6,*)'current ICWEIGHT'
C?        CALL IWRTMA(ICWEIGHT,1,JJJ,1,JJJ)
C?      END IF
C
C?      IRUN = 0
 275  CONTINUE
      MXSZTMP = 0
C     2. select the largest block not yet assigned
      DO IASB = 1, JJJ
C
        JBLK = ICWEIGHT(IASB)
        ITEMPL2 = IBLOCKL(JBLK)
C?       ITEMPL2 = NBLKWT(JBLK)
        ITEMPD2 = NPARBLOCK(JBLK)
        ITEMPN2 = JBLK
C
        IF(ITEMPD2.EQ.-1) THEN
          IF(ITEMPL2.LE.MXSIZE2)THEN
            IF(ITEMPL2.GT.MXSZTMP) THEN
               MXSZTMP  = ITEMPL2
               MXNUMB2  = ITEMPN2
               MXSZTMP2 = ITEMPL2
            END IF
          END IF
        END IF
      END DO
C
C     assign
C
      IF( ( MMPINFA(IPROC) + MXSZTMP2 ) .le. LBPROC ) THEN
C?        IF(MYPROC.eq.MASTER) WRITE(6,*)'assigned in 250',MXNUMB2,IPROC
        MMPINFA(IPROC) = MMPINFA(IPROC) + MXSZTMP2
        MWEIGHT(IPROC) = MWEIGHT(IPROC) + 1
        NPARBLOCK(MXNUMB2) = IPROC - 1
        MXSIZE2 = MXSZTMP2
        IASSIGNED_BLK = IASSIGNED_BLK + 1
      ELSE
C
C       CPU in same group?
 280    CONTINUE
C
        SUCCESS = .FALSE.
C
        DO ITCPU = 1, NTEMPP
C
          IF(.NOT.SUCCESS) THEN
            JGROUP = IPROCLIST(ITCPU)
            IF( JGROUP .eq. IGROUP ) THEN
              IF( ITCPU .gt. IPROC ) THEN
                IPROC = ITCPU
                SUCCESS = .TRUE.
              END IF
            END IF
          END IF
C
        END DO
C
        IF( .NOT. SUCCESS ) THEN
          IGROUP = IGROUP + 1
C?          IF(MYPROC.eq.MASTER) WRITE(6,*)'IGROUP,NFLGRPS',IGROUP,NFLGRPS
          IF( IGROUP .gt. NFLGRPS ) GOTO 500
C         find the lowest CPU number in the new group
          IPROC = 1
          GOTO 280
        ELSE
C         fresh cpu
          GOTO 100
        END IF
C
      END IF
      IRUN = IRUN + 1
C?      IF(MYPROC.eq.MASTER) WRITE(6,*)'IRUN IS ',IRUN
      IF( IRUN .lt. JJJ ) THEN
C?      IF(MYPROC.eq.MASTER) WRITE(6,*)'I WILL GO TO 250'
        GOTO 250
      ELSE 
        GOTO 290
      END IF
C       CPU in same group?
 290    CONTINUE
C
        SUCCESS = .FALSE.
C
        DO ITCPU = 1, NTEMPP
C
          IF(.NOT.SUCCESS) THEN
            JGROUP = IPROCLIST(ITCPU)
            IF( JGROUP .eq. IGROUP ) THEN
              IF( ITCPU .gt. IPROC ) THEN
                IPROC = ITCPU
                SUCCESS = .TRUE.
              END IF
            END IF
          END IF
C
        END DO
C
        IF( .NOT. SUCCESS ) THEN
          IGROUP = IGROUP + 1
C?          IF(MYPROC.eq.MASTER) WRITE(6,*)'2. IGROUP,NFLGRPS',
C?     &                         IGROUP,NFLGRPS
          IF( IGROUP .gt. NFLGRPS ) GOTO 500
C         find the lowest CPU number in the new group
          IPROC = 1
          GOTO 290
        END IF
C       fresh cpu
        GOTO 100
C
 500  CONTINUE
C
C     check for 'lonely' blocks
C
      LONBLK = 0
C
      DO IBLK = 1, NDIM
        IF( NPARBLOCK(IBLK) .eq. -1 ) THEN 
          LONBLK = LONBLK + 1
          ICWEIGHT(LONBLK) = IBLK
        END IF
      END DO
C?      IF (MYPROC .eq. MASTER) WRITE(6,*) 'LONBLK is ',LONBLK
C
      IF( LONBLK .eq. 0 ) GOTO 1001
C
C     good old normal block distribution
C
      IRUN = 0
      MXSIZE = ITEMPW1
C
 600  CONTINUE
*
      IRUN = IRUN + 1
      IF(IRUN.LE.LONBLK) THEN
*
        MXSZTMP = 0
*
        DO 3000 II = 1, LONBLK
*
          IBLK   = ICWEIGHT(II) 
          ITEMPL = IBLOCKL(IBLK)
          ITEMPD = NPARBLOCK(IBLK)
          ITEMPN = IBLK
C
C?          IF(ITEMPL.GT.0)THEN
            IF(ITEMPD.EQ.-1) THEN
              IF(ITEMPL.LE.MXSIZE)THEN
                IF(ITEMPL.GT.MXSZTMP) THEN
                   MXSZTMP = ITEMPL
                   MXNUMB = ITEMPN
                END IF
              END IF
            END IF
C?          END IF
*
3000    CONTINUE
*
        DO 4000 IPR = 1, NTEMPP
*
          ITEMPW2 = MMPINFA(IPR) 
CSK         IF(MYPROC.EQ.0) 
CSK     & WRITE(6,*) 'ITEMPW2 and ITEMPW1:',ITEMPW2,ITEMPW1
* attention: IAMTPROC = 1 --> MYNEW_ID = 0 !
          IAMTPROC = IPR
CSK         IF(MYPROC.EQ.0) WRITE(6,*) 'IAMTPROC',IAMTPROC
*
          IF(IPR.EQ.1) ITEMPW1 = ITEMPW2
*
          IF(ITEMPW2.LE.ITEMPW1) THEN
             ITEMPW1 = ITEMPW2
             IAMGPROC = IAMTPROC
CSK         IF(MYPROC.EQ.0) WRITE(6,*)'ITEMPW1,IAMGPROC',ITEMPW1,IAMGPROC
          END IF
*
4000    CONTINUE
C      now we should have found a proc and a block --> put both together !
C
C      calculation of block MXNUMB by proc IAMGPROC -1 (M excl.)
C?       IF(NTEST.GT.0) THEN
C?         IF(MYPROC.EQ.0) 
C?     & WRITE(6,*)'calculation of block MXNUMB by proc',MXNUMB,IAMGPROC-1
C?       ENDIF
C
C      raising MMPINFA(proc,1) by 1
C      adding on MMPINFA(proc,2) the weight of the new block
C
       IAMTPROC = IAMGPROC-1
       MXSIZE = MXSZTMP
       MWEIGHT(IAMGPROC) = MWEIGHT(IAMGPROC) + 1
       MMPINFA(IAMGPROC) = MMPINFA(IAMGPROC) + MXSZTMP
       NPARBLOCK(MXNUMB)     = IAMTPROC
C
      ELSE
        GOTO 1001
      END IF
C     /\ IRUN !!!
C
      GOTO 600
C
1001  CONTINUE
CSK      IF(NTEST.GT.0) THEN
        IF(MYPROC.EQ.0) THEN
      WRITE(6,'(3X,A,I4)')'SUMMATION OF EVEN DISTRIBUTION OF MYPROC:',
     &MYPROC
      DO ISTI = 1, NTEMPP
        WRITE(6,'(3X,A,I2,A,I5,A,1X,I17)')
     & 'process',ISTI -1,' calculates',MWEIGHT(ISTI),' blocks with a
     & total length of',MMPINFA(ISTI)
      END DO
        END IF
CSK      END IF

C?      write(6,*) 'here i am',myproc
C?      IF(MYPROC.EQ.MASTER) THEN
C?        CALL IWRTMA(NPARBLOCK,1,NDIM,1,NDIM)
C?      END IF
C
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE DISTBLKND_2(NDIM,ICWEIGHTF,NPARBLOCK,IBLOCKL)
*
*. Find distribution for (nonvanishing blocks) among the nodes
*. Non vanishing block has a nonzero blocklength
*
      use interface_to_mpi
      IMPLICIT REAL*8(A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
C     -----
C     INPUT
C     -----
C     NDIM: total number of c-blocks
C     IBLOCKL(NDIM): total length of each block
C     ICWEIGHTF(NDIM): total 'weight factors' for each block
C
C     ------
C     OUTPUT
C     ------
*     NPARBLOCK(NDIM): list of blocks with the corresponding assigned node  
*
      INTEGER ICOUNTABLK, NTEST
      DIMENSION ICWEIGHTF(NDIM)
      DIMENSION NPARBLOCK(NDIM),IBLOCKL(NDIM)
C     -------
C     SCRATCH
C     -------
C     MXSIZE: actual maximum size for one of all blocks
C     MXNUMB: number of the current largest block
C     MMPINFA(*,1): total number of blocks for the n-th proc
C     MMPINFA(*,2): total length ('weight') of the blocks
      INTEGER ITEMPN, IAMGPROC, IAMTPROC, MXNUMB, IRUN
      INTEGER(KIND=df_mpi_offset_kind) ITLTND,MXSIZE,MXSZTMP
      INTEGER(KIND=df_mpi_offset_kind) ITEMPL, ITEMPW1, ITEMPW2
      INTEGER(KIND=df_mpi_offset_kind) ILEN, ITOTBLCKL 
      INTEGER(KIND=df_mpi_offset_kind) IIWEIGHTBLK, IILENGTH 
*
      NTEST = 0
      IIWEIGHTBLK = 0
      IILENGTH = 0
      ITLTND = 0
      ICOUNTABLK = 0
      MXSIZE = 0
      MXSZTMP = 0
      MXNUMB = 0
      IRUN = 0
      ITEMPW1 = 0
      ITEMPW2 = 0
      ITEMPN = 0
      ITEMPL = 0
      IAMGPROC = 0
      IAMTPROC = 0
      ITOTBLCKL = 0
      ILEN = 0
*. end of initialization
      DO ICBL = 1, NDIM
C
        IIWEIGHTBLK = ICWEIGHTF(ICBL)
        IILENGTH = IBLOCKL(ICBL) 
C
        ILEN = IILENGTH * IIWEIGHTBLK 
C
        IF( ILEN .lt. 0 ) THEN 
          WRITE(6,*)'Attention, minus block detected',ICBL
          WRITE(6,*)'ILEN = ',ILEN
          WRITE(6,*)'IBLOCKL(ICBL) = ',IBLOCKL(ICBL)
          WRITE(6,*)'ICWEIGHTF(ICBL) = ',ICWEIGHTF(ICBL)
        END IF
        IF(ILEN.ne.0) THEN
           ICOUNTABLK = ICOUNTABLK + 1
           ITOTBLCKL = ITOTBLCKL + ILEN
           NPARBLOCK(ICBL) = -1
        END IF
        IF(ILEN.GE.MXSIZE) MXSIZE = ILEN
*
      END DO
*
CSK      WRITE(6,*)'MXSIZE of MYPROC',MXSIZE,MYPROC
      ITEMPW1 = MXSIZE
*
      IF(NTEST.GT.0) THEN
       IF(MYPROC.EQ.0) THEN
         WRITE(6,*) '  total number of processes to distribute on:',
     &                 NMPROC
         WRITE(6,*) '  total number of blocks:', NDIM
         CALL IWRTMA(NPARBLOCK,1,NDIM,1,NDIM)
         WRITE(6,*) '  total number of active blocks:', ICOUNTABLK
         WRITE(6,*) '  overall active block length  :', ITOTBLCKL
         WRITE(6,*) '  Maximum block size           :', MXSIZE
       END IF
      END IF
*
      IAM_NOT_INV = 1
      IMINNP = NMPROC
      NTEMPP = IMINNP
      IF( ICOUNTABLK .lt. NMPROC) THEN
        IMINNP = MIN( ICOUNTABLK, IMINNP )
        write(6,'(/a,i6)') '*** error in distblknd_2:'//
     &    ' number of active blocks lower than the total number of'//
     &    ' processes. please decrease the number of processes to:',
     &    IMINNP
        write(6,'(/a)') ' alternatively change the GAS'//
     &    ' specification (more GA spaces) to increase the number of'//
     &    ' TTSS blocks.***'
        call quit('*** error in distblknd_2: number of processes >
     & number of active TTSS blocks. the distribution algorithm
     & will therefore fail (see output for more information).***')
        NTEMPP = IMINNP
        IF( MYPROC .ge. ICOUNTABLK ) THEN
          IAM_NOT_INV =   0
          IMINUS2     = - 2
          CALL ISETVC(NPARBLOCK,IMINUS2,NDIM)
          GOTO 101
        END IF
      END IF
      IAM_NOT_INV = 1
*
*. starting the treausure quest for the ?optimal? c-block distribution
*
100   CONTINUE
*
      IRUN = IRUN + 1
      IF(IRUN.LE.ICOUNTABLK) THEN
*
        MXSZTMP = 0
*
        DO 3000 II = 1, NDIM
*
          
          IIWEIGHTBLK = ICWEIGHTF(II)
          IILENGTH = IBLOCKL(II)
C
          ITEMPL = IILENGTH * IIWEIGHTBLK
C
          ITEMPD = NPARBLOCK(II)
          ITEMPN = II

CSK          IF(MYPROC.EQ.0) 
CSK     & WRITE(6,*) 'ITEMPL,ITEMPD,ITEMPN:',ITEMPL,ITEMPD,ITEMPN
*
CSK          IF(MYPROC.EQ.0) WRITE(6,*) 'MXSIZE:',MXSIZE
CSK          IF(MYPROC.EQ.0) WRITE(6,*) 'MXSZTMP:',MXSZTMP
          IF(ITEMPL.GT.0)THEN
            IF(ITEMPD.EQ.-1) THEN
              IF(ITEMPL.LE.MXSIZE)THEN
                IF(ITEMPL.GT.MXSZTMP) THEN
                   MXSZTMP = ITEMPL
                   MXNUMB = ITEMPN
CSK                  WRITE(6,*) 'MXSZTMP (2):',MXSZTMP
                END IF
              END IF
            END IF
          END IF
*
3000    CONTINUE
*
CSK       IF(MYPROC.EQ.0) WRITE(6,*)'NTEMPP:',NTEMPP
        DO 4000 IPR = 1, NTEMPP
*
C?          ITEMPW2 = MMPINFA(IPR) 
           ITEMPW2 = 0
           DO IBLK = 1, NDIM
            IF( NPARBLOCK(IBLK) .eq. IPR-1 ) THEN
              IIWEIGHTBLK = ICWEIGHTF(IBLK)
              IILENGTH = IBLOCKL(IBLK)
              ITEMPW2 = ITEMPW2 + ( IILENGTH * IIWEIGHTBLK )
            END IF
           END DO
          
CSK         IF(MYPROC.EQ.0) 
CSK     & WRITE(6,*) 'ITEMPW2 and ITEMPW1:',ITEMPW2,ITEMPW1
* attention: IAMTPROC = 1 --> MYNEW_ID = 0 !
          IAMTPROC = IPR
CSK         IF(MYPROC.EQ.0) WRITE(6,*) 'IAMTPROC',IAMTPROC
*
          IF(IPR.EQ.1) ITEMPW1 = ITEMPW2
*
          IF(ITEMPW2.LE.ITEMPW1) THEN
             ITEMPW1 = ITEMPW2
             IAMGPROC = IAMTPROC
CSK         IF(MYPROC.EQ.0) WRITE(6,*)'ITEMPW1,IAMGPROC',ITEMPW1,IAMGPROC
          END IF
*
4000    CONTINUE
*. now we should have found a proc and a block --> put both together !
*
*.      calculation of block MXNUMB by proc IAMGPROC -1 (M excl.)
       IF(NTEST.GT.0) THEN
         IF(MYPROC.EQ.0) 
     & WRITE(6,*)'calculation of block MXNUMB by proc',MXNUMB,IAMGPROC-1
       ENDIF
*
*.     raising MMPINFA(proc,1) by 1
*.     adding on MMPINFA(proc,2) the weight of the new block
       IAMTPROC = IAMGPROC-1
       MXSIZE = MXSZTMP
C?       MWEIGHT(IAMGPROC) = MWEIGHT(IAMGPROC) + 1
C?       MMPINFA(IAMGPROC) = MMPINFA(IAMGPROC) + MXSZTMP
       NPARBLOCK(MXNUMB)     = IAMTPROC
*
      ELSE
        GOTO 101
      END IF
*     /\ IRUN !!!
*
      GOTO 100
*
101   CONTINUE
*
CSK      IF(NTEST.GT.0) THEN
        IF(MYPROC.EQ.0) THEN
        DO II = 1, NDIM
         IF( NPARBLOCK(II) .eq. -1 ) THEN
          WRITE(6,*) 'AAAHHHH, block',II,' is not distributed'
          WRITE(6,*) 'Since I do not know how to proceed, I will stop!'
            Call Abend2('NE DISTBLKND_2: Block not distributed')
         END IF
        END DO
      WRITE(6,*)' '
      WRITE(6,'(20X,A)')'================================'
      WRITE(6,'(20X,A)')' Summation of even distribution '
      WRITE(6,'(20X,A)')'================================'
      WRITE(6,*)' '
      DO ISTI = 1, NTEMPP
        ITLTND = 0
        ITLBND = 0
        DO IBLK = 1, NDIM
          IF( NPARBLOCK(IBLK) .eq. ISTI -1 ) THEN
C
             IIWEIGHTBLK = ICWEIGHTF(IBLK)
             IILENGTH = IBLOCKL(IBLK)
C
             ITLTND = ITLTND + ( IILENGTH * IIWEIGHTBLK )
             ITLBND = ITLBND + 1
          END IF
        END DO
        WRITE(6,'(3X,A,1X,I4,1X,A,1X,I6,1X,A,1X,I18)')
     & 'CPU',ISTI-1,' computes',ITLBND,'blocks with a total weight of',
     & ITLTND
      END DO
*
*      WRITE(6,'(3X,A)') 'even distribution finished!'
*
        END IF
CSK      END IF
*
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE DISTBLKND_3(NDIM,NBLOCKL,NPARBLOCK,NPARBLKWT,
     &                       NBATCH,LBATCH,LEBATCH,I1BATCH)
*
*. Find distribution for (nonvanishing blocks) among the nodes
*. Non vanishing block has a nonzero blocklength
*
      IMPLICIT REAL*8(A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
#include "../lucita/clunit.inc"
*
      DIMENSION NBLOCKL(NDIM), NPARBLKWT(2,NMPROC)
      DIMENSION LBATCH(*),LEBATCH(*),I1BATCH(*)
*. Scratch
      INTEGER INONVAN,MAXLBL,LBL,IWEIGHT,LABEL,IBLOCKN
      INTEGER MXSZTMP,MXNUMB,ITEMPL,ITEMPD
      INTEGER*8 ITOTBLL 
*. ======
*. Output
*. ======
*. array NPARBLOCK that contains the final block distribution
      DIMENSION NPARBLOCK(NDIM)
*. 
      MXSZTMP = 0
      MXNUMB = 0
      IRUN = 0
      ITEMPW1 = 1000000000
      ITEMPW2 = 0
      ITEMPN = 0
      ITEMPL = 0
      IAMGPROC = 0
      IAMTPROC = 0
      INONVAN = 0
      ITOTBLL = 0
      MAXLBL = 0 
      LABEL = 0
      MIN2 = -2
      NTEST = 0
      CALL ISETVC(NPARBLOCK,MIN2,NDIM)
      DO 100 II = 1, NDIM
        LBL = NBLOCKL(II)
        IF(LBL.GT.0) THEN
          NPARBLOCK(II) = -1
          ITOTBLL = ITOTBLL + LBL
        ENDIF
        IF(LBL.GT.MAXLBL) THEN
          MAXLBL = LBL
          LABEL = II
        END IF
 100  CONTINUE
*
*
CSK      NTEST = 10
      IF(NTEST.GT.0) THEN
       IF(MYPROC.EQ.MASTER) THEN
         WRITE(6,*) '  total number of processes to distribute on:',
     &                 NMPROC
         WRITE(6,*) '  overall active block length:  ', ITOTBLL 
         WRITE(6,*) '  largest active block:         ', LABEL
         WRITE(6,*) '  size of largest active block: ', MAXLBL
       END IF
      END IF
*
*. gather batch information for distribution of batches to cpu's (nodes)
*
         if(myproc.eq.master) then
         write(6,*)'NUMBER OF BATCHES:', NBATCH
         do II = 1, NBATCH
         write(6,*)'TOTAL LENGTH OF BATCH:', II,LEBATCH(II)
         write(6,*)'FIRST BLOCK OF THIS BATCH:', I1BATCH(II)
         write(6,*)'TOTAL NUMBER OF BLOCKS FOR THIS BATCH:',LBATCH(II)
         end do
         endif
CSK      LEBATCHTOT = DASUM(NBATCH,LEBATCH,1)
CSK      LEBATCHAVE = LEBATCHTOT / NMPROC
*
*. starting the treausure quest for the ?optimal? c-block distribution
*
 200  CONTINUE
*
      IRUN = IRUN + 1
      IF(IRUN.LE.NBATCH) THEN
*
*       Find biggest batch not assigned to a CPU yet:
*       
        MXSZTMP = 0
*
        DO II = 1, NBATCH
*
*         Has this batch already been assigned to a CPU ?
          IBLK1 = I1BATCH(II)
          DO IBLK = IBLK1, IBLK1-1+LBATCH(II)
             ITEMPD = NPARBLOCK(IBLK)
             IF (ITEMPD .GE. 0) GO TO 300
*            .. Yes! by CPU no. ITEMPD
          END DO
*
          ITEMPL = LEBATCH(II)
          IF(ITEMPL.GE.MXSZTMP) THEN
            MXSZTMP = ITEMPL
            JBATCH = II
          END IF
*
  300   CONTINUE
        END DO
*
*       Find least loaded CPU:
*
        ITEMPW1 = NPARBLKWT(2,1)
        IAMGPROC = 1
*       Is anyone less loaded than the master (IPR = 1) ?
        DO 400 IPR = 2, NMPROC
*
          ITEMPW2 = NPARBLKWT(2,IPR)
*
          IF(ITEMPW2.LE.ITEMPW1) THEN
            ITEMPW1 = ITEMPW2
            IAMGPROC = IPR
          END IF
*
 400    CONTINUE
*. now we should have found a proc and a batch --> put both together !
*
*. calculation of batch JBATCH by proc IAMGPROC
        IF(NTEST.GT.0) THEN
          IF(MYPROC.EQ.MASTER) THEN
            WRITE(6,*)'calculation of batch by proc',JBATCH,
     &      IAMGPROC-1
          END IF
        ENDIF
*. raising NPARBLKWT(1,proc) by 1
*. adding on NPARBLKWT(2,proc) the weight of the new block
        NPARBLKWT(1,IAMGPROC) = NPARBLKWT(1,IAMGPROC) + LBATCH(JBATCH)
        NPARBLKWT(2,IAMGPROC) = NPARBLKWT(2,IAMGPROC) + MXSZTMP 
        IBLK1 = I1BATCH(JBATCH)
        DO IBLK = IBLK1, IBLK1-1+LBATCH(JBATCH)
          IF (NBLOCKL(IBLK) .GT. 0) THEN
*           .. assign this block to the selected cpu
            NPARBLOCK(IBLK) = IAMGPROC - 1
          ELSE
*           .. insert code -2 for zero length block
            NPARBLOCK(IBLK) = -2
          END IF
        END DO
*
      ELSE
        GOTO 500
      END IF
*     /\ IRUN !!!
      GOTO 200
*
500   CONTINUE
CSK      IF(NTEST.GT.0) THEN
        IF(MYPROC.EQ.MASTER) THEN
          WRITE(6,'(3X,A,I4)')'SUMMATION OF EVEN DISTRIBUTION OF 
     &    MYPROC:',MYPROC
          DO ISTI = 1, NMPROC
            WRITE(6,'(3X,A,I17,A,I17,A,1X,I17)')
     &      'process',ISTI -1,' calculates',NPARBLKWT(1,ISTI),
     &      ' blocks with a total length of',NPARBLKWT(2,ISTI)
          END DO
        END IF
CSK      END IF
1001  CONTINUE
*      
      END 
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE DMTVDS_PAR(VEC1,VEC2,LU1,LU2,LU3,FAC,IREW,INV,
     &                     ISCAT,XSCAT,NSCAT,LBLK,XINOUT)
*
* Multiply/divide elements of two vectors residing on disc
* Elements corresponding to absolute adresses in ISCAT
* are set to the elements of XSCAT
*
* For elements not in ISCAT the operation is thus :
*
* For INV.NE. 0 :
*
*    V3(I) = (V1(I)+FAC)-1 * V2(I)
*    LU3      LU1            LU2
*
* For INV.EQ. 0 :
*
*    V3(I) = (V1(I)+FAC) * V2(I)
*    LU3         LU1        LU2
*
* LBLK defines structure of files
*
* Overlap between input and output vector is also calculated
*
      IMPLICIT REAL*8(A-H,O-Z)
#include "parluci.h"
      DIMENSION  VEC1(*),VEC2(*)
      DIMENSION  XSCAT(NSCAT),ISCAT(NSCAT)
*
      REAL*8 INPROD
*
      XINOUT = 0.0D0
*
      IF ( IREW .NE. 0 ) THEN
        IF( LBLK .GE. 0 ) THEN
          REWIND LU1
          REWIND LU2
          REWIND LU3
        ELSE
          CALL REWINE( LU1,LBLK)
          CALL REWINE( LU2,LBLK)
          CALL REWINE( LU3,LBLK)
         END IF
      END IF
*
* Loop over blocks
*
        IOFF = 1
 1000 CONTINUE
        IF (LBLK .GT. 0 ) THEN
          LBL1 = LBLK
          LBL2 = LBLK
        ELSE IF( LBLK .EQ. 0 ) THEN
          READ(LU1) LBL1
          READ(LU2) LBL2
          WRITE(LU3) LBL1
        ELSE IF (LBLK .LT. 0 ) THEN
          CALL IFRMDS(LBL1,1,-1,LU1)
          CALL IFRMDS(LBL2,1,-1,LU2)
          CALL ITODS (LBL1,1,-1,LU3)
        END IF
        IF(LBL1 .NE. LBL2 ) THEN
          WRITE(6,'(A,2I3)') ' DIFFERENT BLOCKSIZES IN DMTVSD_PAR : '
     &                     , LBL1,LBL2
          Call Abend2( ' DIFFERENT BLOCKSIZES IN DMTVSD_PAR ' )
        END IF
        IF(LBL1 .GE. 0 ) THEN
          IF(      LBLK .GE.0 ) THEN
            KBLK = LBL1
          ELSE
            KBLK = -1
          END IF
          LENGTH = LBL1
          CALL FRMDSC(VEC1,LENGTH,KBLK,LU1,IMZERO,IAMPACK)
          CALL FRMDSC(VEC2,LENGTH,KBLK,LU2,IMZERO,IAMPACK)
          IF( LBL1 .GT. 0 )THEN
            IF(INV .NE. 0 ) THEN
*             nothing needs to be done here
              CALL DIAVC2(VEC1,VEC2,VEC1,FAC,LENGTH)
            ELSE
*             nothing needs to be done here
              CALL VVTOV(VEC1,VEC2,VEC1,LBL1)
*             nothing needs to be done here
              CALL DAXPY(LENGTH, FAC, VEC2, 1, VEC1, 1 )
            END IF
          END IF
*
          IF(NSCAT.NE.0) THEN
            IFIRST = IOFF
            ILAST = IOFF + LENGTH - 1
            DO 100 I = 1, NSCAT
              IF(IFIRST .LE. ISCAT(I) .AND. ISCAT(I) .LE. ILAST )
     &        VEC1(ISCAT(I)-IOFF+1) = XSCAT(I)
  100       CONTINUE
          END IF
*
          XINOUT = XINOUT + DDOT(LENGTH,VEC1,1,VEC2,1)
          CALL TODSC(VEC1,LENGTH,KBLK,LU3)
          IOFF = IOFF + LENGTH
        END IF
*
      IF( LBL1.GE. 0 .AND. LBLK .LE. 0) GOTO 1000
*
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE DMTVDS_PAR2(VEC1,VEC2,LUDIA,LUIN,LUOUT,FAC,IREW,INV,
     &                       ISCAT,XSCAT,NSCAT,LBLK,XINOUT)
*
* Multiply/divide elements of two vectors residing on disc
* Elements corresponding to absolute adresses in ISCAT
* are set to the elements of XSCAT
*
* For elements not in ISCAT the operation is thus :
*
* For INV.NE. 0 :
*
*    V3(I) = (V1(I)+FAC)-1 * V2(I)
*    LUOUT      LUDIA            LUIN
*
* For INV.EQ. 0 :
*
*    V3(I) = (V1(I)+FAC) * V2(I)
*    LUOUT         LUDIA        LUIN
*
* LBLK defines structure of files
*
* Overlap between input and output vector is also calculated
*
#include "implicit.h"
#include "parluci.h"
#include "../lucita/mxpdim.inc"
#include "../lucita/crun.inc"

      DIMENSION  VEC1(*),VEC2(*)
      DIMENSION  XSCAT(NSCAT),ISCAT(NSCAT)
*
      REAL*8 INPROD
*
      XINOUT = 0.0D0
C     set proper truncation factor
      THR_TRUNC  = TRUNC_FAC * RNORM_FAC
Csk      WRITE(6,*) 'TRUNCATION FACTOR:',THR_TRUNC
Chj 14-jun-07:   disable THR_ETRUNC
Chj   THR_ETRUNC = 1.0D-6 * THRES_E
      THR_ETRUNC = -1.0D0
*
      IF ( IREW .NE. 0 ) THEN
        IF( LBLK .GE. 0 ) THEN
          REWIND LUDIA
          REWIND LUIN
          REWIND LUOUT
        ELSE
          CALL REWINE( LUDIA,LBLK)
          CALL REWINE( LUIN,LBLK)
          CALL REWINE( LUOUT,LBLK)
         END IF
      END IF
*
* Loop over blocks
*
        IOFF = 1
 1000 CONTINUE
        IF (LBLK .GT. 0 ) THEN
          LBL1 = LBLK
          LBL2 = LBLK
        ELSE IF( LBLK .EQ. 0 ) THEN
          READ(LUDIA)  LBL1
          READ(LUIN)   LBL2
          WRITE(LUOUT) LBL1
        ELSE IF (LBLK .LT. 0 ) THEN
          CALL IFRMDS(LBL1,1,-1,LUDIA)
          CALL IFRMDS(LBL2,1,-1,LUIN)
          CALL ITODS (LBL1,1,-1,LUOUT)
        END IF
        IF(LBL1 .NE. LBL2 ) THEN
          WRITE(6,'(A,2I3)') ' DIFFERENT BLOCKSIZES IN DMTVSD_PAR : '
     &                     , LBL1,LBL2
          Call Abend2( ' DIFFERENT BLOCKSIZES IN DMTVSD_PAR ' )
        END IF
        IF(LBL1 .GE. 0 ) THEN
          IF(      LBLK .GE.0 ) THEN
            KBLK = LBL1
          ELSE
            KBLK = -1
          END IF
          LENGTH = LBL1
          CALL FRMDSC(VEC1,LENGTH,KBLK,LUDIA,IMZERO,IAMPACK)
          CALL FRMDSC(VEC2,LENGTH,KBLK,LUIN,IMZERO,IAMPACK)
          IF( LBL1 .GT. 0 )THEN
CSK            IOFFMAX = idamax(LENGTH, VEC2, 1 )
CSK            IF( ABS(VEC2(IOFFMAX)) .LT. THR_TRUNC ) THEN
CSK              CALL DZERO(VEC2,LENGTH)
CSK            ELSE
CSK              DO II = 1, LENGTH
CSK                IF( ABS(VEC2(II)) .LT. THR_TRUNC ) 
CSK     &              VEC2(II) = 0.0D0
CSK              END DO
CSK            END IF
            IF(INV .NE. 0 ) THEN
*             nothing needs to be done here
              
              CALL DIAVC2_TRUNC(VEC1,VEC2,VEC1,FAC,LENGTH,THR_TRUNC,
     &                          THR_ETRUNC)
            ELSE
*             nothing needs to be done here
              CALL VVTOV(VEC1,VEC2,VEC1,LBL1)
*             nothing needs to be done here
              CALL DAXPY(LENGTH, FAC, VEC2, 1, VEC1, 1 )
            END IF
          END IF
*
          IF(NSCAT.NE.0) THEN
            IFIRST = IOFF
            ILAST = IOFF + LENGTH - 1
            DO 100 I = 1, NSCAT
              IF(IFIRST .LE. ISCAT(I) .AND. ISCAT(I) .LE. ILAST )
     &        VEC1(ISCAT(I)-IOFF+1) = XSCAT(I)
  100       CONTINUE
          END IF
*
          XINOUT = XINOUT + DDOT(LENGTH,VEC1,1,VEC2,1)
          CALL TODSC(VEC1,LENGTH,KBLK,LUOUT)
          IOFF = IOFF + LENGTH
        END IF
*
      IF( LBL1.GE. 0 .AND. LBLK .LE. 0) GOTO 1000
*
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE FIND_CCTOS(ISCALFAC_LOCAL,IBLOCKD,ICCTOS,NBLOCK)
*
*. Find all c-blocks connecting to a given sigma-block for each cpu
*  using the connection matrix ICCTOS.
*. Each cpu stores the information in ISCALFAC_LOCAL.
*. Connection is marked by 1.
*
      IMPLICIT REAL*8           ( A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
*     ==========
*       INPUT
*     ==========
      DIMENSION ICCTOS(NBLOCK,NBLOCK),IBLOCKD(NBLOCK)
*
*     ===========
*       OUTPUT
*     ===========
*
      DIMENSION ISCALFAC_LOCAL(NBLOCK)
*
      IONE = 1
*
      DO IBLK = 1, NBLOCK
*
        IF(IBLOCKD(IBLK).EQ.MYPROC) THEN
*
          DO JBLK = 1,NBLOCK
*
            IF(ICCTOS(JBLK,IBLK).NE.0) ISCALFAC_LOCAL(JBLK) = IONE
*
          END DO
*
        END IF
*
      END DO
*
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE FIND_GROUP_OF_PROCS(MYOWNNAME,MYNAMELENGTH,
     &                               IPROCLIST,NAMELENGTHLIST)
C
      use interface_to_mpi
      IMPLICIT REAL*8 (A-H,O-Z)
C
#include "infpar.h"
#include "parluci.h"
C
      INTEGER IPROCLIST(NMPROC),NAMELENGTHLIST(NMPROC)
      CHARACTER*255 PNAMELIST(NMPROC),MYOWNNAME,MYSCRNAME
      CHARACTER*255 OLD_NAME
      INTEGER INUMBER, IFINI, NTEST,ICOUNT_GROUPS
C
      NTEST =  0
C     NTEST = 10
      INUMBER = 0
      IFINI = 0
      ICOUNT_GROUPS = 0
C
C     start with an all_gather of all name length
C
      call interface_mpi_ALLGATHER(MYNAMELENGTH,1,
     &                   NAMELENGTHLIST,1,
     &                   global_communicator)
C
C     ... all name length are stored now!
C     ... save own name in scratch array!
C
      PNAMELIST(MYPROC+1) = MYOWNNAME
C
C     ... we start to gather the names ... 
      DO IPROC = 1, NMPROC
C
         MYSCRNAME = MYOWNNAME
         call interface_mpi_BCAST(MYSCRNAME,NAMELENGTHLIST(IPROC),
     &                 IPROC-1,global_communicator)
         IF( MYPROC .NE. IPROC -1 ) THEN
           PNAMELIST(IPROC) = MYSCRNAME(1:NAMELENGTHLIST(IPROC))
         END IF
C
      END DO
C
C     test writing if needed
C
      IF( MYPROC .EQ. MASTER .AND. NTEST .GE. 10 ) THEN
       DO IPROC = 1, NMPROC
        MYSCRNAME = PNAMELIST(IPROC)(1:NAMELENGTHLIST(IPROC))
        WRITE(LUWRT,'(1X,A,1X,I4,1X,A7,1X,A)')'Processor ',IPROC -1,
     & ' alias ',MYSCRNAME(1:NAMELENGTHLIST(IPROC))
C    & MYSCRNAME
       END DO
      END IF
C
C     find all processors on the same deck and reorder (if necessary) 
C     to get the processors as close as possible, starting with the
C     master 
C     reordering is currently NOT done
C
      INUMBER = MASTER + 1
 100  CONTINUE
      IFINI = 0
      ICOUNT_GROUPS = ICOUNT_GROUPS + 1
      OLD_NAME = PNAMELIST( INUMBER)(1:NAMELENGTHLIST( INUMBER))
C
      DO IPROC = 1, NMPROC
C
        MYSCRNAME(1:NAMELENGTHLIST(IPROC)) = 
     &  PNAMELIST(IPROC)(1:NAMELENGTHLIST(IPROC))
C
        IF(MYSCRNAME(1:NAMELENGTHLIST(IPROC)) .eq. 
     &     OLD_NAME(1:NAMELENGTHLIST(INUMBER)))
     &     IPROCLIST(IPROC) = INUMBER
C
      END DO
C
C     if finished: IFINI = 1, else 0
C     search for the next lowest cpu building the 'local group master'
C
      IFINI = 1
      DO IPRC = 1, NMPROC
        IF(IFINI .ne. 0 .and. IPROCLIST(IPRC) .eq. -1) THEN
          IFINI = 0
          INUMBER = IPRC
        END IF
      END DO
      IF(IFINI .eq. 0 ) GOTO 100
C
C     write the count of 'groups' formed by processes
C
      IF( MYPROC .eq. MASTER .and. NTEST .ge. 10) THEN
        WRITE(6,'(1X,A,I4,1X,A)') 'There are ',ICOUNT_GROUPS,
     & 'different groups of processes'
        CALL IWRTMA(IPROCLIST,1,NMPROC,1,NMPROC)
      END IF
C
C     Transfer information to common block
C
      NFLGRPS = ICOUNT_GROUPS
C
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
*
      SUBROUTINE GATVCD_PAR(LU,LBLK,NGAT,IGAT,XGAT,SEGMNT,IPRT)
*
* Gather elements from a file LU
*
* XGAT(I) = Vector(IGAT(I))
*
* Jeppe Olsen, September 1993
*
      IMPLICIT REAL*8 (A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
*. Input
      INTEGER IGAT(NGAT)
*. Output
      DIMENSION XGAT(NGAT)
*. Scratch
      DIMENSION SEGMNT(*)
*
      CALL REWINE(LU,-1)
*
      IBASE = 1
      IBLOCK = 0
*
*. Loop over blocks of file
*
 1000 CONTINUE
        IBLOCK = IBLOCK + 1
        CALL NEXREC(LU,LBLK,SEGMNT,IEND,LENGTH)
        IF(IPRT.GE.10)
     &  WRITE(6,*) LENGTH, ' elements in block ',IBLOCK
        IF(IEND.EQ.0) THEN
          IFIRST = IBASE
          ILAST = IBASE + LENGTH - 1
          DO 100 I = 1, NGAT
            IF(IFIRST .LE. IGAT(I) .AND. IGAT(I) .LE. ILAST )
     &      XGAT(I) = SEGMNT(IGAT(I)-IFIRST+1)
C?          IF(IFIRST .LE. IGAT(I) .AND. IGAT(I) .LE. ILAST )
C?   &      write(6,*) ' Catch I IGAT(I) XGAT(I) ',
C?   &                         I,IGAT(I),XGAT(I)
  100     CONTINUE
          IBASE = IBASE + LENGTH
      IF(LBLK.LT.0) GOTO 1000
        END IF
*
      NTEST = 0
      NTEST = MAX(IPRT,NTEST)
      IF(NTEST.GE.5) THEN
       WRITE(6,*) ' Gathered vector from GATVCD_PAR'
       CALL WRTMAT(XGAT,1,NGAT,1,NGAT)
      END IF
*
      END
***********************************************************************
*                                                                     *
* LUCIAREL, by Timo Fleig and Jeppe Olsen                             *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE I_DO_RCCTOS(IBLOCKL,ICCTOS,NBLOCKS)
C
      IMPLICIT REAL*8 (A-H,O-Z)
      DIMENSION IBLOCKL(*), ICCTOS(*)
C
      IONE = 1
      DO IBLK = 1, NBLOCKS
        IF( IBLOCKL(IBLK) .gt. 0 ) ICCTOS(IBLK) = IONE
      END DO
      END 
***********************************************************************
*                                                                     *
* LUCIAREL, by Timo Fleig and Jeppe Olsen                             *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE ISET_ARRAY_ACT_BLK(IAM_BLK_ACT,IBLOCKL,NDIM)
C
      IMPLICIT REAL*8 (A-H,O-Z)
      DIMENSION IBLOCKL(*), IAM_BLK_ACT(*)
C
        ICOUNT = 0
C
        DO IBLK = 1, NDIM
          IF( IBLOCKL(IBLK) .gt. 0 ) THEN
            ICOUNT = ICOUNT + 1
            IAM_BLK_ACT( IBLK ) = ICOUNT
          END IF
        END DO
C
      END 
***********************************************************************
*                                                                     *
* LUCIAREL, by Timo Fleig and Jeppe Olsen                             *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE INFO_PRINT_BATCH_REL(LBATCH, LEBATCH,
     &                                I1BATCH,IBATCH,JBATCH,LUPRINT)
      IMPLICIT REAL*8(A-H,O-Z)
C     Input
      DIMENSION LBATCH(*), LEBATCH(*), I1BATCH(*), IBATCH(8,*)
C
C
C     
      DO ISBATCH = 1, JBATCH
        WRITE(LUPRINT,*)
        WRITE(LUPRINT,*) ' Info on batch ', ISBATCH
        WRITE(LUPRINT,*) ' *********************** '
        WRITE(LUPRINT,*)
        WRITE(LUPRINT,*) '      Number of blocks included ',
     &                          LBATCH(ISBATCH)
        WRITE(LUPRINT,*) '      Length of batch           ',
     &                          LEBATCH(ISBATCH)
        WRITE(LUPRINT,*) '      TTSS and offsets and lengths of
     &                          each block '
        DO IBLOCK = I1BATCH(ISBATCH),I1BATCH(ISBATCH)+ LBATCH(ISBATCH)-1
          WRITE(LUPRINT,'(10X,4I3,4I8)') (IBATCH(II,IBLOCK),II=1,8)
        END DO
      END DO
      END
***********************************************************************
*                                                                     *
* LUCIAREL, by Timo Fleig and Jeppe Olsen                             *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      REAL*8 FUNCTION INPRODB_PAR(VEC1,VEC2,NBLK,LBLK,I0BLK)
C
C     inner products between blocked vectors with check of zero blocks.
C     Zero blocks are flagged by a 0 entry in I0BLK.
C
C     adaption for parallel version from original routine written by 
C                        J. Olsen        -   May  1997
C     Last revision:     S. Knecht       - March  2007
C
************************************************************************
      IMPLICIT REAL*8(A-H,O-Z)
C     Input
      DIMENSION VEC1(*),VEC2(*), LBLK(NBLK),I0BLK(NBLK)
*
      X = 0.0D0
      IOFF = 1
      DO IBLK = 1, NBLK
        LLBLK = LBLK(IBLK)
C?      WRITE(6,*) ' INPRODB IBLK LLBLK IOFF ',IBLK,LLBLK,IOFF
        IF(I0BLK(IBLK).EQ.0.AND.LLBLK.GT.0) THEN
          X = X + DDOT(VEC1(IOFF),VEC2(IOFF),LLBLK)
C?        WRITE(6,*) ' Vec1 and Vec2 blocks '
C?        CALL WRTMAT(VEC1(IOFF),1,LLBLK,1,LLBLK)
C?        CALL WRTMAT(VEC2(IOFF),1,LLBLK,1,LLBLK)
C?        write(6,*) ' Updated x', X
        END IF
        IF(LLBLK.GT.0) IOFF = IOFF + LLBLK
      END DO
C
      INPRODB_PAR = X
C
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE MVCSMD_PAR(LUIN,FAC,LUOUT,LUSCR,VEC1,VEC2,
     &                      NVEC,IREW,LBLK)
C
C ADD VECTORS ON FILE LUIN TIMES FACTOR AND STORE ON LUOUT
C
C LUOUT AND LUSCR ARE INITIALLY REWOUND
C NVEC = NROOT resp. NVEC - NROOT
C
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
      DIMENSION VEC1(1),VEC2(1)
      DIMENSION FAC(NVEC)
C
      IF( MOD(NVEC,2) .EQ. 0 ) THEN
        LLUOUT = LUSCR
        LLUSCR = LUOUT
      ELSE
        LLUOUT = LUOUT
        LLUSCR = LUSCR
      END IF
C
      IF(IREW .NE. 0 ) CALL REWINE(LUIN,LBLK)
C
      DO 100 IVEC = 1, NVEC
CSK        WRITE(6,*) 'FAC(IVEC) 1 = ',FAC(IVEC)
        CALL REWINE(LLUSCR,LBLK)
        CALL REWINE(LLUOUT,LBLK)
        IF( IVEC .EQ. 1 ) THEN
          CALL SCLVCD(LUIN,LLUOUT,FAC(IVEC),VEC1,0,LBLK)
        ELSE
          CALL VECSMD(VEC1,VEC2,FAC(IVEC),1.0D0,LUIN,LLUSCR,LLUOUT,
     &                0,LBLK)
        END IF
C
        LBUF = LLUOUT
        LLUOUT = LLUSCR
        LLUSCR = LBUF
  100 CONTINUE
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE COPVCD_PP_CC_B_C(LUIN,LUOUT,SCR,NBATCH,LBATCH,LEBATCH,
     &                            I1BATCH,IBATCH,MY_IOFF_LUIN,
     &                            MY_IOFF_LUOUT,
     &                            LUINLIST,LUOUTLIST,IBLOCKL,JOFF)
C
C     Written by  S. Knecht         - May 18 2007
C
C**********************************************************************
C
C     copy c-vector from file LUIN to LUOUT batchwise
C     update the file lists.
C
C     NOTE: JOFF = (IVEC resp. IROOT) - 1
C
C
C     copy vector from file LUIN to file LUOUT --> (hint LUIN, LUOUT)
C
C     active blocks on the MPI-files are flagged by a nonzero length
C
C     ready for complex double groups - S. Knecht, Odense, 22 June 2007
C
C     Last revision:     S. Knecht       - June  2007
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION SCR(*),LBATCH(*), LEBATCH(*), I1BATCH(*), IBATCH(8,*)
      DIMENSION LUINLIST(*), LUOUTLIST(*), IBLOCKL(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN, IOFFSET_OUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_SCRATCH
      INTEGER(KIND=df_mpi_offset_kind) ISCRATCH_SP
      INTEGER(KIND=df_mpi_offset_kind) ILEN_COMB
      INTEGER NUM_BLK
C     some constants
      IONE = 1
      ITWO = 2
C     initialize scratch offsets
      NUM_BLK = 0
      MY_NUM_BLK = 0
      IOFFSET_SCRATCH = 0
      IOFFSET_IN  = 0
      IOFFSET_OUT  = 0
      IOFFSET_INT_IN  = 0
      IOFFSET_INT_OUT  = 0
      ILEN_COMB = 0
      ILEN_COMB = L_COMBI
      MY_IOFFSET_SCRATCH = 0
      ISCRATCH_SP = 0
      NUM_BLK_SP = 0
C
C     WRITE(LUWRT,*) 'initial C vector ',JOFF+1
C
C     ================
C      COMPLEX VECTOR
C     ================
C
C     .................
C      REAL PART FIRST
C     .................
C
C     set new offset
C
C     position in file is at the end of vector JOFF
C
C     note: real part: --> MY_VEC2_IOFF, MY_ACT_BLK2
C
C
      DO ISBATCH = 1, NBATCH
C
C       offset for batch ISBATCH w.r.t JOFF
C
        CALL DZERO(SCR,LEBATCH(ISBATCH))
C
        IOFFSET_IN = MY_IOFF_LUIN + ( MY_VEC2_IOFF * JOFF ) +
     &               MY_IOFFSET_SCRATCH
        IOFFSET_INT_IN = ( MY_ACT_BLK2 * JOFF ) + MY_NUM_BLK + 1
C
C       write active block array for LUOUT 
C
        IOFFSET_INT_OUT = NUM_BLK + 1
C
        NUM_BLK_SP  = 0
        ISCRATCH_SP = 0
C
CSK        WRITE(LUWRT,*) ' THIS IS MY IOFFSET_IN',IOFFSET_IN
CSK        WRITE(LUWRT,*) ' THIS IS MY IOFFSET_INT_IN',IOFFSET_INT_IN
CSK        WRITE(LUWRT,*) ' THIS IS MY IOFFSET_INT_OUT',IOFFSET_INT_OUT
C
        CALL RDVEC_BATCH_DRV3(LUIN,SCR,LBATCH(ISBATCH),
     &                       IBATCH(1,I1BATCH(ISBATCH)),
     &                       IOFFSET_IN,IOFFSET_INT_IN,
     &                       IOFFSET_INT_OUT,LUINLIST,
     &                       LUOUTLIST,NUM_BLK_SP)
C
        IOFFSET_OUT = MY_IOFF_LUOUT + IOFFSET_SCRATCH
     
C       WRITE(LUWRT,*) ' THIS IS MY IOFFSET_OUT',IOFFSET_OUT
C       CALL WRTMATMN(SCR,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),LUWRT)
        
        IOFFSET_INT_OUT = NUM_BLK + 1
C
        CALL WTVEC_BATCH_DRV3(SCR,LBATCH(ISBATCH),
     &                        IBATCH(1,I1BATCH(ISBATCH)),
     &                        LUOUT,IOFFSET_OUT,IOFFSET_INT_OUT,
     &                        LUOUTLIST,1,ISCRATCH_SP,
     &                        IBLOCKL)
C
C
C       count the length of the last copy
C
C       LUOUT
        IOFFSET_SCRATCH = IOFFSET_SCRATCH + ISCRATCH_SP
        NUM_BLK         = NUM_BLK + LBATCH(ISBATCH)
C       LUIN
        MY_NUM_BLK         = MY_NUM_BLK + NUM_BLK_SP
        MY_IOFFSET_SCRATCH = MY_IOFFSET_SCRATCH + LEBATCH(ISBATCH)
C
      END DO
C
      IOFFSET_SCRATCH    = 0
      MY_IOFFSET_SCRATCH = 0
      NUM_BLK    = 0
      MY_NUM_BLK = 0
      NUM_BLK_SP = 0
      ISCRATCH_SP = 0
C
C     ..............
C      COMPLEX PART
C     ..............
C
C     set new offset
C
C     position in file is at the end of vector JOFF
C
C     note: complex part: --> MY_VEC1_IOFF and MY_ACT_BLK1
C
      DO JSBATCH = 1, NBATCH
C
C       offset for batch ISBATCH w.r.t JOFF
C
        CALL DZERO(SCR,LEBATCH(JSBATCH))
C
        IOFFSET_IN = MY_IOFF_LUIN + ( MY_VEC2_IOFF * JOFF ) +
     &               MY_IOFFSET_SCRATCH +  MY_VEC1_IOFF 
C
        IOFFSET_INT_IN = ( MY_ACT_BLK2 * JOFF ) + MY_NUM_BLK + 
     &                   1 + MY_ACT_BLK1
C
C       write active block array for LUOUT - complex part 
C       --> MY_ACT_BLK_ALL needed!
C
        IOFFSET_INT_OUT = MY_ACT_BLK_ALL + NUM_BLK + 1
C
        NUM_BLK_SP  = 0
        ISCRATCH_SP = 0
C
CSK      WRITE(LUWRT,*) ' THIS IS MY IOFFSET_IN imag',IOFFSET_IN
CSK      WRITE(LUWRT,*) ' THIS IS MY IOFFSET_INT_IN imag',IOFFSET_INT_IN
CSK      WRITE(LUWRT,*) ' THIS IS MY IOFFSET_INT_OUT imag',IOFFSET_INT_OUT
C
        CALL RDVEC_BATCH_DRV3(LUIN,SCR,LBATCH(JSBATCH),
     &                        IBATCH(1,I1BATCH(JSBATCH)),
     &                        IOFFSET_IN,IOFFSET_INT_IN,
     &                        IOFFSET_INT_OUT,LUINLIST,
     &                        LUOUTLIST,NUM_BLK_SP)
C
        IOFFSET_OUT = MY_IOFF_LUOUT + ILEN_COMB + IOFFSET_SCRATCH
C     
C       WRITE(LUWRT,*) ' THIS IS MY IOFFSET_OUT imag',IOFFSET_OUT
C       CALL WRTMATMN(SCR,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),LUWRT)
      
        IOFFSET_INT_OUT = MY_ACT_BLK_ALL + NUM_BLK + 1
C
        CALL WTVEC_BATCH_DRV3_C(SCR,LBATCH(JSBATCH),
     &                          IBATCH(1,I1BATCH(JSBATCH)),
     &                          LUOUT,IOFFSET_OUT,IOFFSET_INT_OUT,
     &                          LUOUTLIST,1,ISCRATCH_SP,
     &                          IBLOCKL)
C
C
C       count the length of the last copy
C
C       LUOUT
        IOFFSET_SCRATCH = IOFFSET_SCRATCH + ISCRATCH_SP
        NUM_BLK         = NUM_BLK + LBATCH(JSBATCH)
C       LUIN
        MY_NUM_BLK         = MY_NUM_BLK + NUM_BLK_SP
        MY_IOFFSET_SCRATCH = MY_IOFFSET_SCRATCH + LEBATCH(JSBATCH)
C
      END DO
C
C
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE COPVCD_PP_B_RL(VEC1,LUINLIST,LUOUTLIST,
     &                          NBATCH,LBATCH,LEBATCH,I1BATCH,IBATCH,
     &                          MY_IOFF_LUIN,MY_IOFF_LUOUT,
     &                          IROOT,LUIN,LUOUT)
C
C     Written by  S. Knecht         - June 11 2007
C
C**********************************************************************
C
C     copy vector from file LUIN to LUOUT batchwise
C
C     NOTE: IROOT = IROOT
C
C     active blocks on the MPI-files are flagged by a nonzero list entry
C
C     Last revision:     S. Knecht       - June  2007
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION VEC1(*)
      DIMENSION LBATCH(*), LEBATCH(*), I1BATCH(*), IBATCH(8,*)
      DIMENSION LUINLIST(*), LUOUTLIST(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_SCRATCH
      INTEGER NUM_BLK
C
C     initialize scratch offsets
      NUM_BLK           = 0
      IOFFSET_SCRATCH   = 0
      IOFFSET_IN_LUIN   = 0
      IOFFSET_LUOUT     = 0
      IOFFSET_INT_IN    = 0
      IOFFSET_INT_LUOUT = 0
C
C
      DO ISBATCH = 1, NBATCH
C
C       offset for batch ISBATCH w.r.t JOFF
C
        CALL DZERO(VEC1,LEBATCH(ISBATCH))
C
C       set new offset
C
C       position in file is at the end of vector IROOT - 1
C
        IOFFSET_IN_LUIN  = MY_IOFF_LUIN + IOFFSET_SCRATCH +
     &                   ( IROOT - 1 )   * MY_VEC1_IOFF
        IOFFSET_INT_IN   = 1 + NUM_BLK  +
     &                   ( IROOT - 1 )   * MY_ACT_BLK1
C
CSK         WRITE(LUWRT,*) 'This is my OFFSET for LUIN',
CSK     &                   IOFFSET_IN_LUIN
C
        CALL RDVEC_BATCH_DRV4(LUIN,VEC1,LBATCH(ISBATCH),
     &                        IBATCH(1,I1BATCH(ISBATCH)),
     &                        IOFFSET_IN_LUIN,IOFFSET_INT_IN,
     &                        LUINLIST,NUM_ACTIVE_BATCH)
C
CSK        WRITE(LUWRT,*) 'initial VEC1 on LUIN'
CSK        CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK     &                LUWRT)
C
C       new offset for writing on LUOUT
C
        IOFFSET_LUOUT      =  MY_IOFF_LUOUT + IOFFSET_SCRATCH + 
     &                        ( IROOT - 1 ) * MY_VEC1_IOFF
C
        IOFFSET_INT_LUOUT  = 1 + NUM_BLK    +
     &                        ( IROOT - 1 ) * MY_ACT_BLK1
C
        CALL WTVEC_BATCH_DRV4(LUOUT,VEC1,LBATCH(ISBATCH),
     &                       IBATCH(1,I1BATCH(ISBATCH)),
     &                       IOFFSET_LUOUT,IOFFSET_INT_LUOUT,
     &                       LUOUTLIST,NUM_ACTIVE_BATCH)
C
C       keep track of correct offset
        IOFFSET_SCRATCH = IOFFSET_SCRATCH + LEBATCH(ISBATCH)
        NUM_BLK         = NUM_BLK + NUM_ACTIVE_BATCH
C
      END DO
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE COPVCD_PP_B_CPX(VEC1,LUINLIST,LUOUTLIST,
     &                           NBATCH,LBATCH,LEBATCH,I1BATCH,IBATCH,
     &                           MY_IOFF_LUIN,MY_IOFF_LUOUT,
     &                           IROOT,LUIN,LUOUT)
C
C     Written by  S. Knecht         - June 26 2007
C
C**********************************************************************
C
C     copy vector from file LUIN to LUOUT batchwise
C
C     NOTE: IROOT = IROOT
C
C     active blocks on the MPI-files are flagged by a nonzero list entry
C
C     Last revision:     S. Knecht       - June  2007
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION VEC1(*)
      DIMENSION LBATCH(*), LEBATCH(*), I1BATCH(*), IBATCH(8,*)
      DIMENSION LUINLIST(*), LUOUTLIST(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_SCRATCH
      INTEGER NUM_BLK
C
C     initialize scratch offsets
      NUM_BLK           = 0
      IOFFSET_SCRATCH   = 0
      IOFFSET_IN_LUIN   = 0
      IOFFSET_LUOUT     = 0
      IOFFSET_INT_IN    = 0
      IOFFSET_INT_LUOUT = 0
C
C
C     ==============
C     COMPLEX VECTOR
C     ==============
C
C     ---------
C     REAL PART
C     ---------
C
      DO ISBATCH = 1, NBATCH
C
C       offset for batch ISBATCH w.r.t JOFF
C
        CALL DZERO(VEC1,LEBATCH(ISBATCH))
C
C       set new offset
C
C       position in file is at the end of vector IROOT - 1
C
        IOFFSET_IN_LUIN  = MY_IOFF_LUIN + IOFFSET_SCRATCH +
     &                   ( IROOT - 1 )   * MY_VEC2_IOFF
        IOFFSET_INT_IN   = 1 + NUM_BLK  +
     &                   ( IROOT - 1 )   * MY_ACT_BLK2
C
CSK         WRITE(LUWRT,*) 'This is my OFFSET for LUIN',
CSK     &                   IOFFSET_IN_LUIN
C
        CALL RDVEC_BATCH_DRV4(LUIN,VEC1,LBATCH(ISBATCH),
     &                        IBATCH(1,I1BATCH(ISBATCH)),
     &                        IOFFSET_IN_LUIN,IOFFSET_INT_IN,
     &                        LUINLIST,NUM_ACTIVE_BATCH)
C
CSK        WRITE(LUWRT,*) 'initial VEC1 on LUIN'
CSK        CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK     &                LUWRT)
C
C       new offset for writing on LUOUT
C
        IOFFSET_LUOUT      =  MY_IOFF_LUOUT + IOFFSET_SCRATCH + 
     &                        ( IROOT - 1 ) * MY_VEC2_IOFF
C
        IOFFSET_INT_LUOUT  = 1 + NUM_BLK    +
     &                        ( IROOT - 1 ) * MY_ACT_BLK2
C
        CALL WTVEC_BATCH_DRV4(LUOUT,VEC1,LBATCH(ISBATCH),
     &                       IBATCH(1,I1BATCH(ISBATCH)),
     &                       IOFFSET_LUOUT,IOFFSET_INT_LUOUT,
     &                       LUOUTLIST,NUM_ACTIVE_BATCH)
C
C       keep track of correct offset
        IOFFSET_SCRATCH = IOFFSET_SCRATCH + LEBATCH(ISBATCH)
        NUM_BLK         = NUM_BLK + NUM_ACTIVE_BATCH
C
      END DO
C
C
C     zero scratch offsets
      NUM_BLK          = 0
      IOFFSET_SCRATCH  = 0
      NUM_ACTIVE_BATCH = 0
C
C     ---------
C     IMAG PART
C     ---------
C
      DO ISBATCH = 1, NBATCH
C
C       offset for batch ISBATCH w.r.t JOFF
C
        CALL DZERO(VEC1,LEBATCH(ISBATCH))
C
C       set new offset
C
C       position in file is at the end of vector IROOT - 1
C
        IOFFSET_IN_LUIN  = MY_IOFF_LUIN + IOFFSET_SCRATCH +
     &                   ( IROOT - 1 )  * MY_VEC2_IOFF    +
     &                                    MY_VEC1_IOFF
C
        IOFFSET_INT_IN   = 1 + NUM_BLK + MY_ACT_BLK1 +
     &                   ( IROOT - 1 ) * MY_ACT_BLK2
C
CSK         WRITE(LUWRT,*) 'This is my OFFSET for LUIN',
CSK     &                   IOFFSET_IN_LUIN
C
        CALL RDVEC_BATCH_DRV4(LUIN,VEC1,LBATCH(ISBATCH),
     &                        IBATCH(1,I1BATCH(ISBATCH)),
     &                        IOFFSET_IN_LUIN,IOFFSET_INT_IN,
     &                        LUINLIST,NUM_ACTIVE_BATCH)
C
CSK        WRITE(LUWRT,*) 'initial VEC1 on LUIN'
CSK        CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK     &                LUWRT)
C
C       new offset for writing on LUOUT
C
        IOFFSET_LUOUT      =  MY_IOFF_LUOUT + IOFFSET_SCRATCH + 
     &                        ( IROOT - 1 ) * MY_VEC2_IOFF    +
     &                                        MY_VEC1_IOFF
C
        IOFFSET_INT_LUOUT  = 1 + NUM_BLK    + MY_ACT_BLK1 +
     &                        ( IROOT - 1 ) * MY_ACT_BLK2
C
        CALL WTVEC_BATCH_DRV4(LUOUT,VEC1,LBATCH(ISBATCH),
     &                       IBATCH(1,I1BATCH(ISBATCH)),
     &                       IOFFSET_LUOUT,IOFFSET_INT_LUOUT,
     &                       LUOUTLIST,NUM_ACTIVE_BATCH)
C
C       keep track of correct offset
        IOFFSET_SCRATCH = IOFFSET_SCRATCH + LEBATCH(ISBATCH)
        NUM_BLK         = NUM_BLK + NUM_ACTIVE_BATCH
C
      END DO
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE WTVEC_BATCH_DRV3_C(SCR,NBATCH_BLK,NBATCH_INF,
     &                              LUOUT,IOFFSET,IOFFSET_INT,
     &                              IVCOFF_OUT,NO_CHECK,ISCRATCH_SP,
     &                              IBLOCKL)
C
C     Written by  S. Knecht         -      June 22 2007
C
C**********************************************************************
C
C
C     write batch from SCR to MPI-file LUOUT - C-vector routine
C
C     COMPLEX vector part
C
C     active blocks on the MPI-file are flagged by a nonzero length
C
C     Last revision:     S. Knecht       - June  2007
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
      INTEGER ISTAT(df_mpi_status_size)
#include "parluci.h"
      DIMENSION SCR(*), NBATCH_INF(8,*), IVCOFF_OUT(*)
      DIMENSION IBLOCKL(*)
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET, ISCRATCH_SP
      INTEGER JOFF, LENGTH
C
      JOFF   = 0
      LENGTH = 0
C
C     loop over all blocks in that batch, do a norm check
C
      DO IBLK = 1, NBATCH_BLK
C
        JOFF = NBATCH_INF(6,IBLK)
        LENGTH = IBLOCKL( IOFFSET_INT + IBLK - 1 - MY_ACT_BLK_ALL )
C       check norm of vector
        IF( NO_CHECK .eq. 0 ) THEN
          XXX = 0.0D0
          XXX = DDOT(LENGTH,SCR(JOFF),1,SCR(JOFF),1)
C
          IF( XXX .eq. 0.0D0 ) THEN
            IVCOFF_OUT( IOFFSET_INT + IBLK - 1 ) = 0
            GOTO 100
          END IF
        ELSE
C
          IF( IVCOFF_OUT( IOFFSET_INT + IBLK - 1 ) .eq. 0 ) GOTO 100
C
        END IF
C
CSK          WRITE(LUWRT,'(2X,A,1X,I6,1X,A,1X,I14,1X,I6)') 'THIS IS block',
CSK     &         IOFFSET_INT + IBLK - 1,'to go on pos (JOFF)',IOFFSET,JOFF
CSK          CALL WRTMATMN(SCR(JOFF),1,LENGTH,1,LENGTH,LUWRT)
C
          call interface_mpi_FILE_WRITE_AT_r(LUOUT,IOFFSET,SCR(JOFF),
     &                           LENGTH,ISTAT)
C
C
 100    CONTINUE  ! skip zero blocks on file
        IOFFSET = IOFFSET + LENGTH
        ISCRATCH_SP = ISCRATCH_SP + LENGTH
C
      END DO
C     ^ loop over blocks in a batch
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE RDVEC_BATCH_DRV4(MY_LUIN,SCR,NBATCH_BLK,NBATCH_INF,
     &                            IOFFSET,IOFFSET_INT,IVCOFF_IN1,
     &                            NUM_A_B)
C
C     read in a batch from a MPI-file MY_LUIN to segment SCR
C
C     active blocks on the MPI-file are flagged by a nonzero length
C
C     Last revision:     S. Knecht       - May  2007
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
      INTEGER ISTAT(df_mpi_status_size)
#include "parluci.h"
      DIMENSION SCR(*), NBATCH_INF(8,*), IVCOFF_IN1(*)
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET
      INTEGER JOFF
      JOFF = 0
C
      NUM_A_B = 0
C
        DO IBLK = 1, NBATCH_BLK
C
          ILEN2 = NBATCH_INF(8,IBLK) 
C          WRITE(LUWRT,*) 'ILEN2 for block IBLK',
C     &                    ILEN2,IBLK
          IF( ILEN2 .gt. 0 ) THEN
C
            NUM_A_B = NUM_A_B + 1 
C
            ILEN  = IVCOFF_IN1( IOFFSET_INT + NUM_A_B -1 )
C
            IF( ILEN .gt. 0 ) THEN
C             memory offset
              JOFF = NBATCH_INF(6,IBLK)
C
CSK              WRITE(LUWRT,*) 'JOFF,ILEN2,ILEN again for block 
CSK     & IOFFSET_INT+ NUM_A_B -1',JOFF,ILEN2,ILEN,IOFFSET_INT+ NUM_A_B -1
              call interface_mpi_FILE_READ_AT_r(MY_LUIN,IOFFSET,
     &                              SCR(JOFF),ILEN2,ISTAT)
CSK              WRITE(LUWRT,*) ' my block'
CSK              CALL WRTMATMN(SCR(JOFF),1,ILEN2,1,ILEN2,LUWRT)
            ENDIF 
C
          ENDIF 
C
          IOFFSET = IOFFSET + ILEN2
C
        END DO
C       ^ loop over blocks in batch
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE WTVEC_BATCH_DRV4(MY_LUOUT,SCR,NBATCH_BLK,NBATCH_INF,
     &                            IOFFSET,IOFFSET_INT,IVCOFF_IN1,
     &                            NUM_A_B)
C
C     write batch from segment SCR to MPI-file MY_LUOUT
C
C     active blocks on the MPI-file are flagged by a nonzero entry in 
C     file list
C
C     Last revision:     S. Knecht       - May  2007
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
      INTEGER ISTAT(df_mpi_status_size)
#include "parluci.h"
      DIMENSION SCR(*), NBATCH_INF(8,*), IVCOFF_IN1(*)
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET
      INTEGER JOFF
      JOFF = 0
C
      NUM_A_B = 0
C
        DO IBLK = 1, NBATCH_BLK
C
          ILEN2 = NBATCH_INF(8,IBLK) 
CSK          WRITE(LUWRT,*) 'ILEN2 for block IBLK',
CSK     &                    ILEN2,IBLK
          IF( ILEN2 .gt. 0 ) THEN
C
C           memory offset
            JOFF  = NBATCH_INF(6,IBLK)
            NUM_A_B = NUM_A_B + 1
C           check norm of vector
            XXX = 0.0D0
            XXX = DDOT(ILEN2,SCR(JOFF),1,SCR(JOFF),1)
C
            IF( XXX .eq. 0.0D0 ) THEN
              IVCOFF_IN1( IOFFSET_INT + NUM_A_B - 1 ) = 0
              GOTO 100
            ELSE
              IVCOFF_IN1( IOFFSET_INT + NUM_A_B - 1 ) = 1
            END IF
C
#ifdef LUCI_DEBUG
              WRITE(LUWRT,'(A,1X,I6,1X,I6,1X,I6,1X,A,1X,I14)') 'JOFF, 
     & ILEN2 for block',JOFF,ILEN2,IOFFSET_INT+ NUM_A_B -1,'at off',
     & IOFFSET
              CALL WRTMATMN(SCR(JOFF),1,ILEN2,1,ILEN2,LUWRT)
#endif
              call interface_mpi_FILE_WRITE_AT_r(MY_LUOUT,IOFFSET,
     &             SCR(JOFF),ILEN2,ISTAT)
C
 100        CONTINUE
C
          ENDIF 
C
          IOFFSET = IOFFSET + ILEN2
C
        END DO
C       ^ loop over blocks in batch
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE RDVEC_BATCH_DRV5(MY_LUIN,SCR,NBATCH_BLK,NBATCH_INF,
     &                            IOFFSET)
C
C     read in a batch from a MPI-file MY_LUIN to segment SCR
C
C     Last revision:     S. Knecht       - June  2007
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
      INTEGER ISTAT(df_mpi_status_size)
#include "parluci.h"
      DIMENSION SCR(*), NBATCH_INF(8,*)
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET
      INTEGER JOFF
C
      JOFF = 0
C
        DO IBLK = 1, NBATCH_BLK
C
          ILEN2 = NBATCH_INF(8,IBLK) 
CSK          WRITE(LUWRT,*) 'ILEN2 for block IBLK',
CSK     &                    ILEN2,IBLK
          IF( ILEN2 .gt. 0 ) THEN
C
C           memory offset
C
            JOFF = NBATCH_INF(6,IBLK)
C
            call interface_mpi_FILE_READ_AT_r(MY_LUIN,IOFFSET,SCR(JOFF),
     &                            ILEN2,ISTAT)
C
          ENDIF 
C
          IOFFSET = IOFFSET + ILEN2
C
        END DO
C       ^ loop over blocks in batch
C
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE TRAVC_B_RL_DRV(VEC1,VEC2,XMAT,LUIN1LIST,
     &                          LUOUTLIST,NBATCH,LBATCH,LEBATCH,I1BATCH,
     &                          IBATCH,MY_IOFF_LUIN1,MY_IOFF_LUOUT,
     &                          NVEC,NVEC2,LUIN1,LUOUT,IALL_LUIN)
C
C     Written by  S. Knecht         - March 6 2008
C
C**********************************************************************
C
C     transforming vectors so that they become the actual approx. to the 
C     eigenvectors
C
C     NOTE: NVEC  = NVEC
C     NOTE: NVEC2 = NVEC2
C
C     active blocks on the MPI-files are flagged by a nonzero list entry
C
C     Last revision:     S. Knecht       - March  2008
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION VEC1(*), VEC2(*), XMAT(NVEC,NVEC2)
      DIMENSION LBATCH(*), LEBATCH(*), I1BATCH(*), IBATCH(8,*)
      DIMENSION LUIN1LIST(*), LUOUTLIST(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN1
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUOUT
      NZERO = 0
C
C     transform: LUIN1 --> LUOUT
C
      DO IVECOUT = 1, NVEC2
         CALL TRAVC_B_RL(VEC1,VEC2,XMAT(1,IVECOUT),LUIN1LIST,
     &                   LUOUTLIST,NBATCH,LBATCH,LEBATCH,I1BATCH,
     &                   IBATCH,MY_IOFF_LUIN1,MY_IOFF_LUOUT,NVEC,
     &                   IVECOUT,LUIN1,LUOUT)
      END DO
C
C     copy back: LUOUT --> LUIN1
C
      Call IZero(LuIn1List,IAll_LuIn)
C      CALL ISETVC(LUIN1LIST,NZERO,IALL_LUIN)
C
      DO IROOT = 1, NVEC2
         CALL COPVCD_PP_B_RL(VEC1,LUOUTLIST,LUIN1LIST,
     &                       NBATCH,LBATCH,LEBATCH,I1BATCH,IBATCH,
     &                       MY_IOFF_LUOUT,MY_IOFF_LUIN1,IROOT,LUOUT,
     &                       LUIN1)
      END DO
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE TRAVC_B_RL(VEC1,VEC2,FAC,LUIN1LIST,
     &                      LUOUTLIST,NBATCH,LBATCH,LEBATCH,I1BATCH,
     &                      IBATCH,MY_IOFF_LUIN1,MY_IOFF_LUOUT,
     &                      NVEC,ISVEC,LUIN1,LUOUT)
C
C     Written by  S. Knecht         - March 6 2008
C
C**********************************************************************
C
C     transforming vectors so that they become the actual approx. to the 
C     eigenvectors
C
C     NOTE: NVEC  = NVEC
C     NOTE: ISVEC = ISVEC
C
C     active blocks on the MPI-files are flagged by a nonzero list entry
C
C     Last revision:     S. Knecht       - March  2008
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION VEC1(*), VEC2(*), FAC(NVEC)
      DIMENSION LBATCH(*), LEBATCH(*), I1BATCH(*), IBATCH(8,*)
      DIMENSION LUIN1LIST(*), LUOUTLIST(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN1
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN1
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_SCRATCH
      INTEGER NUM_BLK
C
C     initialize scratch offsets
      NUM_BLK           = 0
      IOFFSET_SCRATCH   = 0
      IOFFSET_IN_LUIN1  = 0
      IOFFSET_LUOUT     = 0
      IOFFSET_INT_IN1   = 0
      IOFFSET_INT_LUOUT = 0
C
      FACTOR = 0.0D0
C
      DO ISBATCH = 1, NBATCH
C
C       offset for batch ISBATCH w.r.t JOFF
C
        CALL DZERO(VEC2,LEBATCH(ISBATCH))
C
        DO 100 IVEC = 1, NVEC
C
          FACTOR = FAC( IVEC )
C
C         set new offset
C
C         position in file is at the end of vector IVEC - 1
C
          IOFFSET_IN_LUIN1 = MY_IOFF_LUIN1 + IOFFSET_SCRATCH +
     &                     ( IVEC - 1 )  * MY_VEC1_IOFF
          IOFFSET_INT_IN1  = 1 + NUM_BLK   +
     &                     ( IVEC - 1 )  * MY_ACT_BLK1
C
CSK         WRITE(LUWRT,*) 'This is my OFFSET for LUIN1',
CSK     &                   IOFFSET_IN_LUIN1
C
          IF( IVEC .eq. 1 ) THEN
C
            CALL RDVEC_BATCH_DRV4(LUIN1,VEC2,LBATCH(ISBATCH),
     &                           IBATCH(1,I1BATCH(ISBATCH)),
     &                           IOFFSET_IN_LUIN1,IOFFSET_INT_IN1,
     &                           LUIN1LIST,NUM_ACTIVE_BATCH)
C
CSK            WRITE(LUWRT,*) 'initial VEC2 on LUIN1'
CSK            CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK     &                    LUWRT)
C
            CALL DSCAL(LEBATCH(ISBATCH),FACTOR,VEC2,1)
C
          ELSE
C
            CALL DZERO(VEC1,LEBATCH(ISBATCH))
C
            CALL RDVEC_BATCH_DRV4(LUIN1,VEC1,LBATCH(ISBATCH),
     &                           IBATCH(1,I1BATCH(ISBATCH)),
     &                           IOFFSET_IN_LUIN1,IOFFSET_INT_IN1,
     &                           LUIN1LIST,NUM_ACTIVE_BATCH)
C
CSK            WRITE(LUWRT,*) 'initial VEC1 on LUIN1'
CSK            CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK     &                    LUWRT)
C
C           VEC2 == VEC2 + VEC1 * FACTOR
C
            CALL DAXPY(LEBATCH(ISBATCH),FACTOR,VEC1,1,VEC2,1)
C
          END IF
C
  100   CONTINUE
C
C       write VEC2 to LUOUT wrt offset at ISVEC - 1
C
        IOFFSET_LUOUT     = MY_IOFF_LUOUT  + IOFFSET_SCRATCH + 
     &                      ( MY_VEC1_IOFF * ( ISVEC - 1 ) )
        IOFFSET_INT_LUOUT = 1 + NUM_BLK 
     &                        + ( MY_ACT_BLK1 * ( ISVEC - 1 ) )
csk        WRITE(LUWRT,*) 'This is my OFFSET for LUOUT',
csk     &                  IOFFSET_LUOUT
csk     WRITE(LUWRT,*) 'final VEC2 to write on LUOUT'
csk     CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
csk  &                LUWRT)
C
        CALL WTVEC_BATCH_DRV4(LUOUT,VEC2,LBATCH(ISBATCH),
     &                       IBATCH(1,I1BATCH(ISBATCH)),
     &                       IOFFSET_LUOUT,IOFFSET_INT_LUOUT,
     &                       LUOUTLIST,NUM_ACTIVE_BATCH)
C
C       keep track of correct offset
        IOFFSET_SCRATCH = IOFFSET_SCRATCH + LEBATCH(ISBATCH)
        NUM_BLK         = NUM_BLK + NUM_ACTIVE_BATCH
C
      END DO
C
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE BLK_DSCAL(FAC,SCR,NBATCH_BLK,NBATCH_INF,
     &                     IOFFSET,IVCOFF)
C
C     scaling of vector SCR with factor FAC using a blocked vector 
C
C     Last revision:     S. Knecht       - April  2007
C
************************************************************************
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
      DIMENSION NBATCH_INF(8,*), IVCOFF(*), SCR(*) 
      INTEGER JOFF,ILEN
C
      JOFF = 0
      ILEN = 0
      DO ISBLK = 1, NBATCH_BLK
C
        ILEN = IVCOFF( IOFFSET + ISBLK - 1 )
        IF( ILEN .gt. 0 ) THEN
          JOFF = NBATCH_INF(6,ISBLK)
          CALL DSCAL( ILEN, FAC, SCR(JOFF), 1 )
        END IF
C
      END DO
C
      END 
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE ORTHG_VEC_BATCH_LUCI1(VEC1,VEC2,SUBSPH,SCRRED,
     &                                 SCRRED2,NROOT,JOFF,JOFF2,
     &                                 NBATCH,LBATCH,LEBATCH,
     &                                 I1BATCH,IBATCH,
     &                                 MY_IOFF_LUIN,MY_IOFF_LUIN2,
     &                                 MY_IOFF_LUIN3,MY_IOFF_LUOUT,
     &                                 LUINLIST,LUIN2LIST,
     &                                 LUIN3LIST,LUOUTLIST,
     &                                 LUIN,LUIN2,
     &                                 LUIN3,LUOUT)
C
C     calculate dot products between vector on file LUIN1 and LUIN2 
C     resp. LUIN1 and LUIN3. Store vector VEC2 on LUOUT 
C                            ==> LUIN1(VEC2) --> LUOUT(VEC2)
C
C     dot products SUM_batches ( VEC1 * VEC2 ) are reduced on every 
C     process in global_communicator
C
C     active blocks on the MPI-file are flagged by a nonzero length
C
C     Note: JOFF  = JOFF
C           JOFF2 = JOFF2
C
C     Last revision:     S. Knecht       - March 2008
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION VEC1(*),VEC2(*)
      DIMENSION SUBSPH(*), SCRRED(*), SCRRED2(*)
      DIMENSION LBATCH(*), LEBATCH(*), I1BATCH(*), IBATCH(8,*)
      DIMENSION LUINLIST(*), LUIN2LIST(*), LUIN3LIST(*)
      DIMENSION LUOUTLIST(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN, MY_IOFF_LUIN2
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN3
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN, IOFFSET_IN_LUIN2
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN3
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_SCRATCH
      INTEGER NUM_BLK, ISTRED, ISTRED2
      CHARACTER*12 SECTID, WALLTID
C     initialize scratch vector offsets
      ISTRED  = 0
      ISTRED2 = 0
C     initialize scratch offsets
      NUM_BLK = 0
      IOFFSET_SCRATCH  = 0
      IOFFSET_IN_LUIN  = 0
      IOFFSET_IN_LUIN2 = 0
      IOFFSET_IN_LUIN3 = 0
      IOFFSET_LUOUT    = 0
      IOFFSET_INT_IN   = 0
      IOFFSET_INT_IN2  = 0
      IOFFSET_INT_IN3  = 0
      IOFFSET_INT_OUT  = 0
C     set scratch vectors to zero
      CALL DZERO(SCRRED,NROOT)
C
      IF( ( JOFF - 1 ) .gt. 0 ) CALL DZERO(SCRRED2, ( JOFF - 1) )
C 
      DO ISBATCH = 1, NBATCH
C
        CALL DZERO(VEC2,LEBATCH(ISBATCH))
C
C       set new offset
C
C       position in file is at the end of vector JOFF - 1
C       and JOFF2 - 1
C
        IOFFSET_IN_LUIN  = MY_IOFF_LUIN + IOFFSET_SCRATCH +
     &                   ( JOFF  - 1 )   * MY_VEC1_IOFF   +
     &                   ( JOFF2 - 1 )   * MY_VEC1_IOFF
        IOFFSET_INT_IN   = 1 + NUM_BLK                    +
     &                   ( JOFF  - 1 )   * MY_ACT_BLK1    +
     &                   ( JOFF2 - 1 )   * MY_ACT_BLK1
C
CSK     WRITE(LUWRT,*) 'This is my OFFSET for LUIN',
CSK  &                  IOFFSET_IN_LUIN
C
        CALL RDVEC_BATCH_DRV4(LUIN,VEC2,LBATCH(ISBATCH),
     &                        IBATCH(1,I1BATCH(ISBATCH)),
     &                        IOFFSET_IN_LUIN,IOFFSET_INT_IN,
     &                        LUINLIST,NUM_ACTIVE_BATCH)
C
CSK     WRITE(LUWRT,*) 'initial VEC2 on LUIN'
CSK     CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                LUWRT)
C
        DO 100 IROOT = 1, NROOT
C
C         set new offset and zero read-in vector
C
          CALL DZERO(VEC1,LEBATCH(ISBATCH))
C
          IOFFSET_IN_LUIN2 = MY_IOFF_LUIN2 + IOFFSET_SCRATCH +
     &                     ( IROOT - 1 )   * MY_VEC1_IOFF
          IOFFSET_INT_IN2  = 1 + NUM_BLK                     +
     &                     ( IROOT - 1 )   * MY_ACT_BLK1
C
C         read in batch ISBATCH from LUIN2 to VEC1
C
          CALL RDVEC_BATCH_DRV4(LUIN2,VEC1,LBATCH(ISBATCH),
     &                          IBATCH(1,I1BATCH(ISBATCH)),
     &                          IOFFSET_IN_LUIN2,IOFFSET_INT_IN2,
     &                          LUIN2LIST,NUM_ACTIVE_BATCH)
C
CSK     WRITE(LUWRT,*) '1st VEC1 read from position',IROOT -1
CSK     CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                LUWRT)
C
          IF( IROOT .eq. 1 ) ISTRED = IROOT + ( JOFF - 1 ) * 2 * NROOT
C
C         SCRRED(IROOT) == SCRRED(IROOT) - SUM ( VEC1 * VEC2 )
          SCRRED(IROOT) =  SCRRED(IROOT)
     &                  -  DDOT( LEBATCH(ISBATCH), VEC1, 1, VEC2, 1 )
C
  100   CONTINUE
C
C       continue with vectors on LUIN3
C
        DO 200 KVEC = 1, (JOFF - 1)
C
C         set new offset and zero read-in vector
C
          CALL DZERO(VEC1,LEBATCH(ISBATCH))
C
          IOFFSET_IN_LUIN3 = MY_IOFF_LUIN3 + IOFFSET_SCRATCH +
     &                     ( KVEC - 1 )    * MY_VEC1_IOFF
          IOFFSET_INT_IN3  = 1 + NUM_BLK                     +
     &                     ( KVEC - 1 )    * MY_ACT_BLK1
C
          CALL RDVEC_BATCH_DRV4(LUIN3,VEC1,LBATCH(ISBATCH),
     &                          IBATCH(1,I1BATCH(ISBATCH)),
     &                          IOFFSET_IN_LUIN3,IOFFSET_INT_IN3,
     &                          LUIN3LIST,NUM_ACTIVE_BATCH)
C
CSK     WRITE(LUWRT,*) '2nd VEC1 read from position',KVEC -1
CSK     CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                LUWRT)
C
          IF( KVEC .eq. 1 ) ISTRED2 = NROOT + KVEC + (JOFF - 1 )*2*NROOT
C         SCRRED2(KVEC) == SCRRED2(KVEC) - SUM ( VEC1 * VEC2 )
          SCRRED2(KVEC) =  SCRRED2(KVEC) 
     &                  -  DDOT( LEBATCH(ISBATCH), VEC1, 1, VEC2, 1 )
C
  200   CONTINUE
C
C       write vector VEC2 from batch IBATCH to file LUOUT
C
        IOFFSET_LUOUT      =  MY_IOFF_LUOUT + IOFFSET_SCRATCH +
     &                     ( JOFF - 1 )     * MY_VEC1_IOFF
        IOFFSET_INT_OUT    = 1 + NUM_BLK    +
     &                     ( JOFF - 1 )     * MY_ACT_BLK1
C
CSK     WRITE(LUWRT,*) 'final VEC2 to write on position',JOFF -1
CSK     CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                LUWRT)
C
        CALL WTVEC_BATCH_DRV4(LUOUT,VEC2,LBATCH(ISBATCH),
     &                       IBATCH(1,I1BATCH(ISBATCH)),
     &                       IOFFSET_LUOUT,IOFFSET_INT_OUT,
     &                       LUOUTLIST,NUM_ACTIVE_BATCH)
C
C       keep track of correct offset
        IOFFSET_SCRATCH = IOFFSET_SCRATCH + LEBATCH(ISBATCH)
        NUM_BLK         = NUM_BLK + NUM_ACTIVE_BATCH
C
      END DO
C     ^ loop over batches
C
C     individual dot products need to be communicated
C
      starttime = interface_MPI_WTIME()
C
      CALL DZERO(SUBSPH(ISTRED),NROOT)
      CALL REDVEC(SCRRED,SUBSPH(ISTRED),NROOT,2,
     &            op_MPI_SUM,global_communicator,-1)
C
      IF( (JOFF - 1) .gt. 0 ) THEN
        CALL DZERO(SUBSPH(ISTRED2),JOFF-1)
        CALL REDVEC(SCRRED2,SUBSPH(ISTRED2),JOFF-1,2,
     &              op_MPI_SUM,global_communicator,-1)
      END IF
C
      endtime = interface_MPI_WTIME()
      WALLTID = SECTID(endtime-starttime)
csk   WRITE(LUWRT,*) 'THIS IS WHAT I HAVE REDUCED TO'
csk   WRITE(LUWRT,*) ' ISTRED, ISTRED2, NROOT, JOFF',
csk  &                  ISTRED, ISTRED2, NROOT, JOFF-1
csk   CALL WRTMATMN(SUBSPH(ISTRED),1,NROOT,1,NROOT,LUWRT)
csk   CALL WRTMATMN(SUBSPH(ISTRED2),1,JOFF-1,1,JOFF-1,LUWRT)
      IF( TIMING )
     &WRITE(LUWRT,9450) WALLTID
 9450 FORMAT(' >>>  WALL TIME FOR REDUCING WORK                : ',A)
C
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE ORTHG_VEC_BATCH_LUCI2(VEC1,VEC2,FACIN1,FACIN2,NINVEC1,
     &                                 NINVEC2,NBATCH,LBATCH,LEBATCH,
     &                                 I1BATCH,IBATCH,MY_IOFF_LUIN,
     &                                 MY_IOFF_LUIN2,MY_IOFF_LUIN3,
     &                                 MY_IOFF_LUOUT,MY_IOFF_LUOUT2,
     &                                 SCAL_ARRAY,NSCALING,IADD,
     &                                 LUINLIST,LUIN2LIST,LUIN3LIST,
     &                                 LUOUTLIST,LUOUT2LIST,
     &                                 LUIN,LUIN2,LUIN3,LUOUT,LUOUT2)
C
C     add vectors on file LUIN1 and LUIN2 resp. LUIN2 and LUIN3. 
C     store resulting vector VEC2 on LUOUT with scaling factor FACOFSCAL.
C     LUOUT(VEC2) = LUIN2(VEC2) x FACOFSCAL
C
C     active blocks on the MPI-file are flagged by a nonzero length
C
C     Last revision:     S. Knecht       - March 2008
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION VEC1(*),VEC2(*)
      DIMENSION LBATCH(*), LEBATCH(*), I1BATCH(*), IBATCH(8,*)
      DIMENSION SCAL_ARRAY(*), FACIN1(*), FACIN2(1)
      DIMENSION LUINLIST(*),LUIN2LIST(*), LUIN3LIST(*)
      DIMENSION LUOUTLIST(*), LUOUT2LIST(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN, MY_IOFF_LUIN2
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN3
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUOUT, MY_IOFF_LUOUT2
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN, IOFFSET_IN_LUIN2
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN3
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_LUOUT, IOFFSET_LUOUT2
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_SCRATCH
      INTEGER NUM_BLK
      REAL*8 REDFAC
C     initialize scratch factor 
      REDFAC = 0.0D0
C     initialize scratch offsets
      NUM_BLK = 0
      IOFFSET_SCRATCH  = 0
      IOFFSET_IN_LUIN  = 0
      IOFFSET_IN_LUIN2 = 0
      IOFFSET_IN_LUIN3 = 0
      IOFFSET_LUOUT    = 0
      IOFFSET_LUOUT2   = 0
      IOFFSET_INT_IN   = 0
      IOFFSET_INT_IN2  = 0
      IOFFSET_INT_IN3  = 0
      IOFFSET_INT_OUT  = 0
      IOFFSET_INT_OUT2 = 0
C
csk   WRITE(LUWRT,*) 'THIS IS WORK_SP(1+(JVEC-1)*2*NROOT) inside',myproc
csk   CALL WRTMATMN(FACIN1,1,NINVEC1,1,NINVEC1,LUWRT)
C 
      DO ISBATCH = 1, NBATCH
C
        CALL DZERO(VEC2,LEBATCH(ISBATCH))
C
C       find offset 
        IOFFSET_IN_LUIN2 = MY_IOFF_LUIN2 + IOFFSET_SCRATCH +
     &                   ( NINVEC2 - 1 )    * MY_VEC1_IOFF
        IOFFSET_INT_IN2  = 1 + NUM_BLK                     +
     &                   ( NINVEC2 - 1 )    * MY_ACT_BLK1
C
        CALL RDVEC_BATCH_DRV4(LUIN2,VEC2,LBATCH(ISBATCH),
     &                        IBATCH(1,I1BATCH(ISBATCH)),
     &                        IOFFSET_IN_LUIN2,IOFFSET_INT_IN2,
     &                        LUIN2LIST,NUM_ACTIVE_BATCH)
C
CSK     WRITE(LUWRT,*) ' VEC2 read from position',NINVEC2 - 1
CSK     CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                LUWRT)
C
C       calculate scalar products with vectors on LUIN
C   
        DO 100 IVEC = 1, NINVEC1
C
C         set new offset and zero read-in vector
C
          CALL DZERO(VEC1,LEBATCH(ISBATCH))
C
          IOFFSET_IN_LUIN  = MY_IOFF_LUIN  + IOFFSET_SCRATCH +
     &                     ( IVEC - 1 )    * MY_VEC1_IOFF
          IOFFSET_INT_IN   = 1 + NUM_BLK                     +
     &                     ( IVEC - 1 )    * MY_ACT_BLK1
C
          CALL RDVEC_BATCH_DRV4(LUIN,VEC1,LBATCH(ISBATCH),
     &                          IBATCH(1,I1BATCH(ISBATCH)),
     &                          IOFFSET_IN_LUIN,IOFFSET_INT_IN,
     &                          LUINLIST,NUM_ACTIVE_BATCH)
C
CSK       WRITE(LUWRT,*) ' VEC1 read from position',IVEC - 1
CSK       CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                  LUWRT)
C
          IF( IVEC .eq. 1 ) THEN
            CALL DAXPY(LEBATCH(ISBATCH),FACIN1(1),VEC1,1,VEC2,1)
          ELSE
            CALL DAXPY(LEBATCH(ISBATCH),FACIN1(IVEC),VEC1,1,VEC2,1)
          END IF
C
  100   CONTINUE
C
CSK     WRITE(LUWRT,*) ' VEC2 after 1st DAXPY part '
CSK     CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                LUWRT)
C
C       continue with vectors on LUIN3
C
        DO 200 IVEC = 1, ( NINVEC2 - 1)
C
C         set new offset and zero read-in vector
C
          CALL DZERO(VEC1,LEBATCH(ISBATCH))
C
          IOFFSET_IN_LUIN3 = MY_IOFF_LUIN3 + IOFFSET_SCRATCH +
     &                     ( IVEC - 1 )    * MY_VEC1_IOFF
          IOFFSET_INT_IN3  = 1 + NUM_BLK                     +
     &                     ( IVEC - 1 )    * MY_ACT_BLK1
C
          CALL RDVEC_BATCH_DRV4(LUIN3,VEC1,LBATCH(ISBATCH),
     &                          IBATCH(1,I1BATCH(ISBATCH)),
     &                          IOFFSET_IN_LUIN3,IOFFSET_INT_IN3,
     &                          LUIN3LIST,NUM_ACTIVE_BATCH)
C
CSK       WRITE(LUWRT,*) 'VEC1 read from LUIN3 at IVEC - 1', IVEC - 1
CSK       CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
C    &                  LUWRT)
C
C         VEC2 == VEC2 + VEC1 * FACIN2(IVEC)
C
          IF( IVEC .eq. 1 ) THEN
            CALL DAXPY(LEBATCH(ISBATCH),FACIN2(1),VEC1,1,VEC2,1)
          ELSE
            CALL DAXPY(LEBATCH(ISBATCH),FACIN2(IVEC),VEC1,1,VEC2,1)
          END IF
C
  200   CONTINUE
C
CSK     WRITE(LUWRT,*) 'VEC2 after 2nd DAXPY == after sec ort'
CSK     CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
C    &                LUWRT)
C
C       update REDFAC == REDFAC + SUM ( VEC2 * VEC2 )
        REDFAC = REDFAC + DDOT( LEBATCH(ISBATCH), VEC2, 1, VEC2, 1 )
CSK     WRITE(LUWRT,*) 'REDFAC = REDFAC + DDOT ...',REDFAC
C
C       write vector VEC2 from batch IBATCH to file LUIN2
C
        IOFFSET_IN_LUIN2 =  MY_IOFF_LUIN2 + IOFFSET_SCRATCH +
     &                   ( NINVEC2 - 1 )  * MY_VEC1_IOFF
        IOFFSET_INT_IN2  = 1 + NUM_BLK    +
     &                   ( NINVEC2 - 1 )  * MY_ACT_BLK1
C
CSK     WRITE(LUWRT,*) 'final VEC2 to write on position',NINVEC2 -1
CSK     CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                LUWRT)
C
        CALL WTVEC_BATCH_DRV4(LUIN2,VEC2,LBATCH(ISBATCH),
     &                       IBATCH(1,I1BATCH(ISBATCH)),
     &                       IOFFSET_IN_LUIN2,IOFFSET_INT_IN2,
     &                       LUIN2LIST,NUM_ACTIVE_BATCH)
C
C       keep track of correct offset
        IOFFSET_SCRATCH = IOFFSET_SCRATCH + LEBATCH(ISBATCH)
        NUM_BLK         = NUM_BLK + NUM_ACTIVE_BATCH
C
      END DO
C     ^ loop over batches - part 1 -
C
C     communicate REDFAC to obtain correct scaling factor
      
      FACTOR = 0.0D0
      CALL REDVEC(REDFAC,FACTOR,1,2,op_MPI_SUM,global_communicator,-1)
csk   WRITE(LUWRT,*) 'REDUCED FACTOR IN PART 3.2',FACTOR
C
      SCALING = 0.0D0
      SCALING = 1.0D0/SQRT(FACTOR)
csk   WRITE(LUWRT,*) 'SCALING FACTOR IN PART 3.2',SCALING
C
C     scaling part 1
C
      CALL DSCAL(NSCALING, SCALING, SCAL_ARRAY, 1 )
C
csk   WRITE(LUWRT,*) 'THIS IS MY SCALED WORK PART'
csk   CALL WRTMATMN(SCAL_ARRAY,1,NSCALING,1,NSCALING,LUWRT)
C
C     initialize scratch offsets again
      NUM_BLK = 0
      IOFFSET_SCRATCH = 0
C
C     we loop again
C
      DO ISBATCH = 1, NBATCH
C
        CALL DZERO( VEC2, LEBATCH(ISBATCH) )
C
C       read vector from file LUIN2
C
        IOFFSET_IN_LUIN2 = MY_IOFF_LUIN2 + IOFFSET_SCRATCH +
     &                   ( NINVEC2 - 1 )    * MY_VEC1_IOFF
        IOFFSET_INT_IN2  = 1 + NUM_BLK                     +
     &                   ( NINVEC2 - 1 )    * MY_ACT_BLK1
C
        CALL RDVEC_BATCH_DRV4(LUIN2,VEC2,LBATCH(ISBATCH),
     &                        IBATCH(1,I1BATCH(ISBATCH)),
     &                        IOFFSET_IN_LUIN2,IOFFSET_INT_IN2,
     &                        LUIN2LIST,NUM_ACTIVE_BATCH)
C
CSK     CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                LUWRT)
C
C       scaling - part 2 -
C
        CALL DSCAL(LEBATCH(ISBATCH),SCALING,VEC2,1)
C
C       write scaled vector to files LUOUT and LUOUT2
C       writing to LUOUT1 only for ( NINVEC2 -1 + 1 .ne. IADD )
C
        IF( ( NINVEC2 - 1 + 1 ) .ne. IADD ) THEN
C
C         new offset for writing on LUOUT
C
          IOFFSET_LUOUT      =  MY_IOFF_LUOUT + IOFFSET_SCRATCH +
     &                       ( NINVEC2 - 1 )  * MY_VEC1_IOFF
C
          IOFFSET_INT_OUT    = 1 + NUM_BLK    +
     &                       ( NINVEC2 - 1 )  * MY_ACT_BLK1
C
          CALL WTVEC_BATCH_DRV4(LUOUT,VEC2,LBATCH(ISBATCH),
     &                          IBATCH(1,I1BATCH(ISBATCH)),
     &                          IOFFSET_LUOUT,IOFFSET_INT_OUT,
     &                          LUOUTLIST,NUM_ACTIVE_BATCH)
C
        END IF
C
C       new offset for writing on LUOUT2
C
        IOFFSET_LUOUT2     =  MY_IOFF_LUOUT2 + IOFFSET_SCRATCH +
     &                     ( NINVEC2 - 1 )   * MY_VEC1_IOFF
C
        IOFFSET_INT_OUT2   = 1 + NUM_BLK     +
     &                     ( NINVEC2 - 1 )   * MY_ACT_BLK1
C
        CALL WTVEC_BATCH_DRV4(LUOUT2,VEC2,LBATCH(ISBATCH),
     &                        IBATCH(1,I1BATCH(ISBATCH)),
     &                        IOFFSET_LUOUT2,IOFFSET_INT_OUT2,
     &                        LUOUT2LIST,NUM_ACTIVE_BATCH)
C
CSK     WRITE(LUWRT,*) 'This is what i have written'
CSK     CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),LUWRT)
C
C       keep track of correct offset
        IOFFSET_SCRATCH = IOFFSET_SCRATCH + LEBATCH(ISBATCH)
        NUM_BLK         = NUM_BLK + NUM_ACTIVE_BATCH
C
      END DO
C     ^ loop over batches 2
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE ORTHG_VEC_BATCH_LUCI3(VEC1,VEC2,FACIN1,FACIN3,FACIN2,
     &                                 NROOT,JOFF,NBATCH,LBATCH,LEBATCH,
     &                                 I1BATCH,IBATCH,MY_IOFF_LUIN,
     &                                 MY_IOFF_LUIN2,MY_IOFF_LUIN3,
     &                                 MY_IOFF_LUOUT,LUINLIST,LUIN2LIST,
     &                                 LUIN3LIST,LUOUTLIST,JOFF2,
     &                                 LUIN,LUIN2,LUIN3,LUOUT)
C
C     calculate vector products between vector on file LUIN1 and LUIN2 
C     resp. LUIN1 and LUIN3. Store vector VEC2 on LUOUT 
C                            ==> LUIN1(VEC2) --> LUOUT(VEC2)
C
C     active blocks on the MPI-file are flagged by a nonzero length
C
C     Note: JOFF  = JOFF
C           JOFF2 = JOFF2
C
C     Last revision:     S. Knecht       - March  2008
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION VEC1(*),VEC2(*)
      DIMENSION FACIN1(*), FACIN3(1)
      DIMENSION LBATCH(*), LEBATCH(*), I1BATCH(*), IBATCH(8,*)
      DIMENSION LUINLIST(*), LUIN2LIST(*), LUIN3LIST(*)
      DIMENSION LUOUTLIST(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN, MY_IOFF_LUIN2
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN3
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN, IOFFSET_IN_LUIN2
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN3
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_LUOUT
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_SCRATCH
      INTEGER NUM_BLK
C     initialize scratch offsets
      NUM_BLK = 0
      IOFFSET_SCRATCH = 0
      IOFFSET_IN_LUIN   = 0
      IOFFSET_IN_LUIN2  = 0
      IOFFSET_IN_LUIN3  = 0
      IOFFSET_LUOUT     = 0
      IOFFSET_INT_IN    = 0
      IOFFSET_INT_IN2   = 0
      IOFFSET_INT_IN3   = 0
      IOFFSET_INT_OUT   = 0
C 
      DO ISBATCH = 1, NBATCH
C
        CALL DZERO(VEC2,LEBATCH(ISBATCH))
C
C       position in file is at the end of vector JOFF - 1
C       and JOFF2 - 1
C
        IOFFSET_IN_LUIN  = MY_IOFF_LUIN + IOFFSET_SCRATCH +
     &                   ( JOFF  - 1 )   * MY_VEC1_IOFF   +
     &                   ( JOFF2 - 1 )   * MY_VEC1_IOFF
        IOFFSET_INT_IN   = 1 + NUM_BLK                    +
     &                   ( JOFF  - 1 )   * MY_ACT_BLK1    +
     &                   ( JOFF2 - 1 )   * MY_ACT_BLK1
C
CSK     WRITE(LUWRT,*) 'This is my OFFSET for LUIN',
CSK  &                  IOFFSET_IN_LUIN
C
        CALL RDVEC_BATCH_DRV4(LUIN,VEC2,LBATCH(ISBATCH),
     &                        IBATCH(1,I1BATCH(ISBATCH)),
     &                        IOFFSET_IN_LUIN,IOFFSET_INT_IN,
     &                        LUINLIST,NUM_ACTIVE_BATCH)
C
CSK     WRITE(LUWRT,*) 'initial VEC2 on LUIN'
CSK     CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                LUWRT)
C
        DO 100 IROOT = 1, NROOT
C
C         set new offset and zero read-in vector
C
          CALL DZERO(VEC1,LEBATCH(ISBATCH))
C
          IOFFSET_IN_LUIN2  = MY_IOFF_LUIN2 + IOFFSET_SCRATCH +
     &                      ( IROOT - 1 )   * MY_VEC1_IOFF
          IOFFSET_INT_IN2   = 1 + NUM_BLK                     +
     &                      ( IROOT - 1 )   * MY_ACT_BLK1
C
CSK       WRITE(LUWRT,*) 'This is my OFFSET for LUIN2',
CSK  &                    IOFFSET_IN_LUIN2
C
          CALL RDVEC_BATCH_DRV4(LUIN2,VEC1,LBATCH(ISBATCH),
     &                          IBATCH(1,I1BATCH(ISBATCH)),
     &                          IOFFSET_IN_LUIN2,IOFFSET_INT_IN2,
     &                          LUIN2LIST,NUM_ACTIVE_BATCH)
C
CSK       WRITE(LUWRT,*) 'initial VEC1 on LUIN2'
CSK       CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                  LUWRT)
C
C         scaling of VEC2
C
          IF( IROOT .eq. 1 ) THEN 
            CALL DSCAL(LEBATCH(ISBATCH),FACIN2,VEC2,1)
          END IF
C
C         VEC2 == VEC2 + VEC1 * FACIN1(IROOT)
C
          CALL DAXPY(LEBATCH(ISBATCH),FACIN1(IROOT),VEC1,1,VEC2,1)
CSK
CSK       WRITE(LUWRT,*) 'VEC1 and VEC2 after DAXPY == after first ort'
CSK       CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH))
CSK  &                  LUWRT)
CSK       CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                  LUWRT)
C
  100   CONTINUE
C
C       continue with vectors on LUIN3
C
        DO 200 KVEC = 1, (JOFF - 1)
C
C         set new offset and zero read-in vector
C
          CALL DZERO(VEC1,LEBATCH(ISBATCH))
C
          IOFFSET_IN_LUIN3  = MY_IOFF_LUIN3 + IOFFSET_SCRATCH +
     &                      ( KVEC - 1 )    * MY_VEC1_IOFF
          IOFFSET_INT_IN3   = 1 + NUM_BLK                     +
     &                      ( KVEC - 1 )    * MY_ACT_BLK1
C
CSK       WRITE(LUWRT,*) 'This is my OFFSET for LUIN3',
CSK  &                    IOFFSET_IN_LUIN3
C
          CALL RDVEC_BATCH_DRV4(LUIN3,VEC1,LBATCH(ISBATCH),
     &                          IBATCH(1,I1BATCH(ISBATCH)),
     &                          IOFFSET_IN_LUIN3,IOFFSET_INT_IN3,
     &                          LUIN3LIST,NUM_ACTIVE_BATCH)
C
CSK       WRITE(LUWRT,*) 'initial VEC1 on LUIN3'
CSK       CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                  LUWRT)
C
C         VEC2 == VEC2 + VEC1 * FACIN3(KVEC)
C
          IF( KVEC .eq. 1 ) THEN
            CALL DAXPY(LEBATCH(ISBATCH),FACIN3(1),VEC1,1,VEC2,1)
          ELSE
            CALL DAXPY(LEBATCH(ISBATCH),FACIN3(KVEC),VEC1,1,VEC2,1)
          END IF
C
  200   CONTINUE
C
C       new offset for writing on LUOUT
C
        IOFFSET_LUOUT     =  MY_IOFF_LUOUT + IOFFSET_SCRATCH +
     &                    ( JOFF - 1 )   * MY_VEC1_IOFF
C
        IOFFSET_INT_OUT   = 1 + NUM_BLK                      +
     &                    ( JOFF - 1 )   * MY_ACT_BLK1
C
        CALL WTVEC_BATCH_DRV4(LUOUT,VEC2,LBATCH(ISBATCH),
     &                        IBATCH(1,I1BATCH(ISBATCH)),
     &                        IOFFSET_LUOUT,IOFFSET_INT_OUT,
     &                        LUOUTLIST,NUM_ACTIVE_BATCH)
C
CSK     WRITE(LUWRT,*) 'This is what I have written - sigma part'
CSK     CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),LUWRT)
C
C       keep track of correct offset
        IOFFSET_SCRATCH = IOFFSET_SCRATCH + LEBATCH(ISBATCH)
        NUM_BLK         = NUM_BLK + NUM_ACTIVE_BATCH
      END DO
C     ^ loop over batches
C
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE CALC_SUBSPACE_H_LUCI(VEC1,VEC2,SCRRED,IMUSTRED,IVEC,
     &                                NROOT,ISTRED,NBATCH,LBATCH,
     &                                LEBATCH,I1BATCH,IBATCH,ILOOP,
     &                                MY_IOFF_LUIN,MY_IOFF_LUIN2,
     &                                MY_IOFF_LUIN3,MY_IOFF_LUIN4,
     &                                LUINLIST,LUIN2LIST,LUIN3LIST,
     &                                LUIN4LIST,LUIN,LUIN2,LUIN3,LUIN4)
C
C     calculate dot products between vectors on file LUIN1 / LUIN2 
C     and LUIN3 resp. LUIN4
C
C     active blocks on the MPI-file are flagged by a nonzero length
C
C     Note: IVEC  = IVEC
C           ILOOP = ILOOP
C
C     Last revision:     S. Knecht       - March  2008
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION VEC1(*),VEC2(*)
      DIMENSION SCRRED(*)
      DIMENSION LBATCH(*), LEBATCH(*), I1BATCH(*), IBATCH(8,*)
      DIMENSION LUINLIST(*), LUIN2LIST(*), LUIN3LIST(*)
      DIMENSION LUIN4LIST(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN, MY_IOFF_LUIN2
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN3
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN4
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN, IOFFSET_IN_LUIN2
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN3,IOFFSET_IN_LUIN4
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_SCRATCH
      INTEGER NUM_BLK
C     initialize scratch offsets
      NUM_BLK = 0
      IOFFSET_SCRATCH  = 0
      IOFFSET_IN_LUIN  = 0
      IOFFSET_IN_LUIN2 = 0
      IOFFSET_IN_LUIN3 = 0
      IOFFSET_IN_LUIN4 = 0
      IOFFSET_INT_IN   = 0
      IOFFSET_INT_IN2  = 0
      IOFFSET_INT_IN3  = 0
      IOFFSET_INT_IN4  = 0
C 
      DO ISBATCH = 1, NBATCH
C
        CALL DZERO(VEC2,LEBATCH(ISBATCH))
C
        IF( ILOOP .le. 0 )THEN
C
          IOFFSET_IN_LUIN   = MY_IOFF_LUIN  + IOFFSET_SCRATCH +
     &                      ( IVEC - 1 )    * MY_VEC1_IOFF
          IOFFSET_INT_IN    = 1 + NUM_BLK                     +
     &                      ( IVEC - 1 )    * MY_ACT_BLK1
C
CSK       WRITE(LUWRT,*) 'This is my OFFSET for LUIN ',
CSK  &                    IOFFSET_IN_LUIN 
C
          CALL RDVEC_BATCH_DRV4(LUIN,VEC2,LBATCH(ISBATCH),
     &                          IBATCH(1,I1BATCH(ISBATCH)),
     &                          IOFFSET_IN_LUIN,IOFFSET_INT_IN,
     &                          LUINLIST,NUM_ACTIVE_BATCH)
C
CSK       WRITE(LUWRT,*) 'initial VEC2 on LUIN '
CSK       CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                  LUWRT)
        ELSE 
C
          IOFFSET_IN_LUIN2  = MY_IOFF_LUIN2 + IOFFSET_SCRATCH +
     &                      ( ILOOP - 1 )   * MY_VEC1_IOFF
          IOFFSET_INT_IN2   = 1 + NUM_BLK                     +
     &                      ( ILOOP - 1 )   * MY_ACT_BLK1
C
CSK       WRITE(LUWRT,*) 'This is my OFFSET for LUIN2',
CSK  &                    IOFFSET_IN_LUIN2
C
          CALL RDVEC_BATCH_DRV4(LUIN2,VEC2,LBATCH(ISBATCH),
     &                          IBATCH(1,I1BATCH(ISBATCH)),
     &                          IOFFSET_IN_LUIN2,IOFFSET_INT_IN2,
     &                          LUIN2LIST,NUM_ACTIVE_BATCH)
C
CSK       WRITE(LUWRT,*) 'initial VEC2 on LUIN2'
CSK       CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                  LUWRT)
        END IF
C
        DO 100 JVEC = 1, MIN(IVEC,NROOT)
C
C         set new offset and zero read-in vector
C
          CALL DZERO(VEC1,LEBATCH(ISBATCH))
C
          IOFFSET_IN_LUIN3  = MY_IOFF_LUIN3 + IOFFSET_SCRATCH +
     &                      ( JVEC - 1 )    * MY_VEC1_IOFF
          IOFFSET_INT_IN3   = 1 + NUM_BLK                     +
     &                      ( JVEC - 1 )    * MY_ACT_BLK1
C
CSK       WRITE(LUWRT,*) 'This is my OFFSET for LUIN3',
CSK  &                    IOFFSET_IN_LUIN3
C
          CALL RDVEC_BATCH_DRV4(LUIN3,VEC1,LBATCH(ISBATCH),
     &                          IBATCH(1,I1BATCH(ISBATCH)),
     &                          IOFFSET_IN_LUIN3,IOFFSET_INT_IN3,
     &                          LUIN3LIST,NUM_ACTIVE_BATCH)
C
CSK       WRITE(LUWRT,*) 'initial VEC1 on LUIN3'
CSK       CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                  LUWRT)
C
          IJ = IVEC * ( IVEC - 1 ) / 2 + JVEC
C         SCRRED(IJ) = SCRRED(IJ) + SUM ( VEC1 * VEC2 )
          SCRRED(IJ) = SCRRED(IJ) + 
     &                 DDOT(LEBATCH(ISBATCH),VEC1,1,VEC2,1)
C
          IF (ISBATCH .eq. 1 ) THEN
            IMUSTRED = IMUSTRED + 1
            IF( IVEC .eq. 1 .and. JVEC .eq. 1 ) ISTRED = IJ
          END IF
C
  100   CONTINUE
C
C       continue with vectors on LUIN4
C
        ILOOP2 = 0
C
        DO 200 JVEC = ( NROOT + 1), IVEC
C
          ILOOP2 = ILOOP2 + 1
C
C         set new offset and zero read-in vector
C
          CALL DZERO(VEC1,LEBATCH(ISBATCH))
C
          IOFFSET_IN_LUIN4  = MY_IOFF_LUIN4 + IOFFSET_SCRATCH +
     &                      ( ILOOP2 - 1 )  * MY_VEC1_IOFF
          IOFFSET_INT_IN4   = 1 + NUM_BLK                     +
     &                      ( ILOOP2 - 1 )  * MY_ACT_BLK1
C
CSK       WRITE(LUWRT,*) 'This is my OFFSET for LUIN4',
CSK  &                    IOFFSET_IN_LUIN4
C
          CALL RDVEC_BATCH_DRV4(LUIN4,VEC1,LBATCH(ISBATCH),
     &                          IBATCH(1,I1BATCH(ISBATCH)),
     &                          IOFFSET_IN_LUIN4,IOFFSET_INT_IN4,
     &                          LUIN4LIST,NUM_ACTIVE_BATCH)
C
CSK       WRITE(LUWRT,*) 'initial VEC1 on LUIN4'
CSK       CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),
CSK  &                  LUWRT)
C
          IJ = IVEC * ( IVEC - 1 ) / 2 + JVEC
C         SCRRED(IJ) = SCRRED(IJ) + SUM ( VEC1 * VEC2 )
          SCRRED(IJ) = SCRRED(IJ) +
     &                 DDOT(LEBATCH(ISBATCH),VEC1,1,VEC2,1)
C
          IF (ISBATCH .eq. 1 ) IMUSTRED = IMUSTRED + 1
C
  200   CONTINUE
C
C       keep track of correct offset
        IOFFSET_SCRATCH = IOFFSET_SCRATCH + LEBATCH(ISBATCH)
        NUM_BLK         = NUM_BLK + NUM_ACTIVE_BATCH
C
      END DO
C     ^ loop over batches
C
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE PART_CIV_PAR1(IDC,IBLTP,NSSOA,NSSOB,NOCTPA,NOCTPB,
     &                         NSMST,MXLNG,IOCOC,ISMOST,
     &                         NBATCH,LBATCH,LEBATCH,I1BATCH,IBATCH,
     &                         ICOMP,ITTSS_ORD,IBLOCKD,NDIM)
C
C     Partition a CI vector into batches of blocks. The length of a
C     batch must be atmost MXLNG.
C     S-vector routine.
C
C     IF ICOMP .eq. 1: the complete CI vector is constructed in just one
C     batch.
C
C     OUTPUT
C     ======
C
C     NBATCH      : Number of batches
C     LBATCH(*)   : Number of blocks in a given batch
C     LEBATCH(*)  : Number of elements in a given batch ( packed ) !
C     I1BATCH(*)  : Number of first block in a given batch
C     IBATCH(8,*) : TTS blocks in Start of a given TTS block with respect to
C                   start
C     of batch --
C     IBATCH(1,*) : Alpha type
C     IBATCH(2,*) : Beta  type
C     IBATCH(3,*) : Sym of alpha
C     IBATCH(4,*) : Sym of beta
C     IBATCH(5,*) : Offset of block with respect to start of block in
C                   expanded form
C     IBATCH(6,*) : Offset of block with respect to start of block in
C                   packed form
C     IBATCH(7,*) : Length of block, expandend form
C     IBATCH(8,*) : Length of block, packed form
C    
C     original version : Jeppe Olsen     - August 1995
C     parallel adaption: S. Knecht       - March  2007 
C
C     Last revision:     S. Knecht       - March  2007
C
************************************************************************
      use interface_to_mpi
      IMPLICIT REAL*8(A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
C     Input
      INTEGER NSSOA(NSMST,*),NSSOB(NSMST,*)
      INTEGER IOCOC(NOCTPA,NOCTPB)
      INTEGER IBLTP(*)
      INTEGER ISMOST(*)
      DIMENSION IBLOCKD(NDIM)
C     Output
      INTEGER LBATCH(*)
      INTEGER LEBATCH(*)
      INTEGER I1BATCH(*)
      INTEGER IBATCH(8,*)
*
      NTEST = 0000
      IF(NTEST.GE.100) THEN
        WRITE(6,*)
        WRITE(6,*) ' ==================='
        WRITE(6,*) '    PART_CIV_PAR1   '
        WRITE(6,*) ' ==================='
        WRITE(6,*) ' IDC = ', IDC
        WRITE(6,*)
        WRITE(6,*) ' IOCOC Array '
        CALL IWRTMA(IOCOC,NOCTPA,NOCTPB,NOCTPA,NOCTPB)
        if (NTEST.ge.500) then
          WRITE(6,*) ' NSSOA array ( input ) '
          CALL IWRTMA(NSSOA,NSMST,NOCTPA,NSMST,NOCTPA)
          WRITE(6,*) ' NSSOB array ( input ) '
          CALL IWRTMA(NSSOB,NSMST,NOCTPB,NSMST,NOCTPB)
          write(6,*) ' IBLTP array: '
          call iwrtma(IBLTP,1,NSMST,1,NSMST)
        end if
      END IF
*
*. block zero
*
      IB = 1
      IA = 1
      ISM = 1
      IFRST = 1
      NBATCH = 0
      IBLOCK = 0
      IFINI = 0
*. Loop over batches of blocks
 2000 CONTINUE
      NBATCH = NBATCH + 1
      LBATCH(NBATCH) = 0
      I1BATCH(NBATCH) = IBLOCK  + 1
      LENGTH = 0
      LENGTHP= 0
      NBLOCK = 0
      IFRST = 1
*. Loop over blocks in batch
 1000 CONTINUE
      IF(IFRST.EQ.0) THEN
        call nxt_tts(ITTSS_ORD,IA,IB,ISM,IFINI,NOCTPA,NOCTPB,NSMST)
      END IF
      IFRST = 0
      IF (IFINI.EQ.1) GOTO 2002
*. Should this block be included
      IF(IBLTP(ISM).EQ.0) GOTO 1000
      IF(IBLTP(ISM).EQ.2.AND.IA.LT.IB) GOTO 1000
      IF(IOCOC(IA,IB).EQ.0) GOTO 1000
*. can this block be included
      IBSM = ISMOST(ISM)
      NSTA = NSSOA(ISM,IA)
      NSTB = NSSOB(IBSM,IB)
      LBLOCK= NSTA*NSTB
C     set unpacked length for unused block to zero
      IF( IBLOCKD(IBLOCK+1) .ne. MYPROC ) LBLOCK = 0
      IF(IBLTP(ISM).EQ.1.OR.(IBLTP(ISM).EQ.2.AND.IA.NE.IB)) THEN
        LBLOCKP = NSTA*NSTB
      ELSE IF (IBLTP(ISM) .EQ. 2.AND.IA.EQ.IB) THEN
        LBLOCKP = NSTA*(NSTA+1)/2
      END IF
C     set packed length for unused block to zero
      IF( IBLOCKD(IBLOCK+1) .ne. MYPROC ) LBLOCKP = 0
C?    write(6,*) ' IA IB ISM LBLOCK ', IA,IB,ISM,LBLOCK
      IF(LENGTH+LBLOCK.LE.MXLNG.OR.ICOMP.EQ.1) THEN
        NBLOCK = NBLOCK + 1
        IBLOCK = IBLOCK + 1
        LBATCH(NBATCH) = LBATCH(NBATCH)+1
        IBATCH(1,IBLOCK) = IA
C       only s-blocks corresponding to distribution
        IF( IBLOCKD(IBLOCK) .ne. MYPROC ) IBATCH(1,IBLOCK) = 0
        IBATCH(2,IBLOCK) = IB
        IBATCH(3,IBLOCK) = ISM
        IBATCH(4,IBLOCK) = IBSM
        IBATCH(5,IBLOCK) = LENGTH+1
        IBATCH(6,IBLOCK) = LENGTHP+1
        IBATCH(7,IBLOCK) = LBLOCK
        IBATCH(8,IBLOCK) = LBLOCKP
        LENGTH = LENGTH + LBLOCK
        LENGTHP= LENGTHP+ LBLOCKP
        LEBATCH(NBATCH) = LENGTHP
        GOTO 1000
      ELSE IF(ICOMP.EQ.0.AND.
     &  LENGTH+LBLOCK.GT. MXLNG .AND. NBLOCK.EQ.0) THEN
        WRITE(6,*) ' Not enough scratch space to include a single Block'
        WRITE(6,*) ' Since I cannot procede I will stop '
        WRITE(6,*) ' Insufficient buffer detected in PART_CIV_PAR1'
        write(6,*) '  LENGTH,LBLOCK ',LENGTH,LBLOCK
        WRITE(6,*) ' Alter GAS space of raise Buffer from ', MXLNG
        call quit( ' Insufficient buffer space in PART_CIV_PAR1. ' )
      ELSE
*. This batch is finished, goto next batch
        GOTO 2000
      END IF
 2002 CONTINUE
*
      IF(NTEST.NE.0) THEN
        WRITE(6,*) 'Output from PART_CIV_PAR1'
        WRITE(6,*) '========================='
        WRITE(6,*)
        WRITE(6,*) ' Number of batches ', NBATCH
        DO JBATCH = 1, NBATCH
          WRITE(6,*)
          WRITE(6,*) ' Info on batch ', JBATCH
          WRITE(6,*) ' *********************** '
          WRITE(6,*)
          WRITE(6,*) '      Number of blocks included ', LBATCH(JBATCH)
          WRITE(6,*) '      TTSS and offsets and lengths of each block '
          DO IBLOCK = I1BATCH(JBATCH),I1BATCH(JBATCH)+ LBATCH(JBATCH)-1
            WRITE(6,'(10X,4I3,4I8)') (IBATCH(II,IBLOCK),II=1,8)
          END DO
        END DO
      END IF
*
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE PART_CIV_PAR2(IDC,IBLTP,NSSOA,NSSOB,NOCTPA,NOCTPB,
     &                         NSMST,MXLNG,IOCOC,ISMOST,
     &                         NBATCH,LBATCH,LEBATCH,I1BATCH,IBATCH,
     &                         ICOMP,ITTSS_ORD,SCLFAC,NDIM,IEXCLBLK)
C
C     Partition a CI vector into batches of blocks. The length of a
C     batch must be atmost MXLNG.
C     C-vector routine.
C
C     IF ICOMP .eq. 1: the complete CI vector is constructed in just one
C     batch.
C
C     OUTPUT
C     ======
C
C     NBATCH      : Number of batches
C     LBATCH(*)   : Number of blocks in a given batch
C     LEBATCH(*)  : Number of elements in a given batch ( packed ) !
C     I1BATCH(*)  : Number of first block in a given batch
C     IBATCH(8,*) : TTS blocks in Start of a given TTS block with respect to
C                   start
C     of batch --
C     IBATCH(1,*) : Alpha type
C     IBATCH(2,*) : Beta  type
C     IBATCH(3,*) : Sym of alpha
C     IBATCH(4,*) : Sym of beta
C     IBATCH(5,*) : Offset of block with respect to start of block in
C                   expanded form
C     IBATCH(6,*) : Offset of block with respect to start of block in
C                   packed form
C     IBATCH(7,*) : Length of block, expandend form
C     IBATCH(8,*) : Length of block, packed form
C    
C     original version : Jeppe Olsen     - August 1995
C     parallel adaption: S. Knecht       - March  2007 
C
C     Last revision:     S. Knecht       - March  2007
C
************************************************************************
      use interface_to_mpi
      IMPLICIT REAL*8(A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
C     input
      INTEGER NSSOA(NSMST,*),NSSOB(NSMST,*)
      INTEGER IOCOC(NOCTPA,NOCTPB)
      INTEGER IBLTP(*)
      INTEGER ISMOST(*)
      DIMENSION SCLFAC(NDIM)
C     output
      INTEGER LBATCH(*)
      INTEGER LEBATCH(*)
      INTEGER I1BATCH(*)
      INTEGER IBATCH(8,*)
C     scratch
      INTEGER LBLOCK_SAVE, LBLOCKP_SAVE 
C
      LBLOCK_SAVE  = 0
      LBLOCKP_SAVE = 0 
C
      NTEST = 0000
      IF(NTEST.GE.100) THEN
        WRITE(6,*)
        WRITE(6,*) ' ==================='
        WRITE(6,*) '    PART_CIV_PAR2   '
        WRITE(6,*) ' ==================='
        WRITE(6,*) ' IDC = ', IDC
        WRITE(6,*)
        WRITE(6,*) ' IOCOC Array '
        CALL IWRTMA(IOCOC,NOCTPA,NOCTPB,NOCTPA,NOCTPB)
        if (NTEST.ge.500) then
          WRITE(6,*) ' NSSOA array ( input ) '
          CALL IWRTMA(NSSOA,NSMST,NOCTPA,NSMST,NOCTPA)
          WRITE(6,*) ' NSSOB array ( input ) '
          CALL IWRTMA(NSSOB,NSMST,NOCTPB,NSMST,NOCTPB)
          write(6,*) ' IBLTP array: '
          call iwrtma(IBLTP,1,NSMST,1,NSMST)
        end if
      END IF
C
C     block zero
C
      ILOOPBLK = 0
      IB = 1
      IA = 1
      ISM = 1
      IFRST = 1
      NBATCH = 0
      IBLOCK = 0
      IFINI = 0
C     loop over batches of blocks
 2000 CONTINUE
      NBATCH = NBATCH + 1
      LBATCH(NBATCH) = 0
      I1BATCH(NBATCH) = IBLOCK  + 1
      LENGTH = 0
      LENGTHP= 0
      NBLOCK = 0
      IFRST = 1
      IF( NBATCH .gt. 1 ) ILOOPBLK = ILOOPBLK - 1 
      
C     loop over blocks in batch
 1000 CONTINUE
      IF(IFRST.EQ.0) THEN
        call nxt_tts(ITTSS_ORD,IA,IB,ISM,IFINI,NOCTPA,NOCTPB,NSMST)
      END IF
      IFRST = 0
      IF (IFINI.EQ.1) GOTO 2002
C     should this block be included
      IF(IBLTP(ISM).EQ.0) GOTO 1000
      IF(IBLTP(ISM).EQ.2.AND.IA.LT.IB) GOTO 1000
      IF(IOCOC(IA,IB).EQ.0) GOTO 1000
      ILOOPBLK = ILOOPBLK + 1
C     can this block be included
      IBSM   = ISMOST(ISM)
      NSTA   = NSSOA(ISM,IA)
      NSTB   = NSSOB(IBSM,IB)
      LBLOCK = NSTA*NSTB
      LBLOCK_SAVE = LBLOCK
      IF(IBLTP(ISM).EQ.1.OR.(IBLTP(ISM).EQ.2.AND.IA.NE.IB)) THEN
        LBLOCKP = NSTA*NSTB
      ELSE IF (IBLTP(ISM) .EQ. 2.AND.IA.EQ.IB) THEN
        LBLOCKP = NSTA*(NSTA+1)/2
      END IF
      LBLOCKP_SAVE = LBLOCKP
C     check if we need this block, if not: 
C     LBLOCKP = 0
C     LBLOCK  = 0
CSK      IF(IEXCLBLK .eq. 1) THEN
CSK       WRITE(6,*) 'ILOOPBLK,SCLFAC(ILOOPBLK),IBLOCK',ILOOPBLK,
CSK     &             SCLFAC(ILOOPBLK),IBLOCK
CSK      END IF
      IF( IEXCLBLK .eq. 1 ) THEN
         IF( SCLFAC(ILOOPBLK) .eq. 0.0D0 ) THEN
           LBLOCKP = 0
           LBLOCK  = 0
           GOTO 1000
         END IF
      ELSE
        IF( SCLFAC(IBLOCK+1) .eq. 0.0D0 ) THEN
          LBLOCKP = 0
          LBLOCK  = 0
        END IF
      END IF
C?    write(6,*) ' IA IB ISM LBLOCK ', IA,IB,ISM,LBLOCK,myproc
      IF(LENGTH+LBLOCK.LE.MXLNG.OR.ICOMP.EQ.1) THEN
        NBLOCK = NBLOCK + 1
        IBLOCK = IBLOCK + 1
        LBATCH(NBATCH) = LBATCH(NBATCH)+1
        IBATCH(1,IBLOCK) = IA
        IBATCH(2,IBLOCK) = IB
        IBATCH(3,IBLOCK) = ISM
        IBATCH(4,IBLOCK) = IBSM
        IBATCH(5,IBLOCK) = LENGTH+1
        IBATCH(6,IBLOCK) = LENGTHP+1
C       keep length information, needed?
        IBATCH(7,IBLOCK) = LBLOCK_SAVE
        IBATCH(8,IBLOCK) = LBLOCKP_SAVE
C       all blocks are included, but only active blocks have a length
        LENGTH = LENGTH + LBLOCK
        LENGTHP= LENGTHP+ LBLOCKP
        LEBATCH(NBATCH) = LENGTHP
        GOTO 1000
      ELSE IF(ICOMP.EQ.0.AND.
     &  LENGTH+LBLOCK.GT. MXLNG .AND. NBLOCK.EQ.0) THEN
        WRITE(6,*) ' Not enough scratch space to include a single block'
        WRITE(6,*) ' Since I cannot procede I will stop '
        WRITE(6,*) ' Insufficient buffer detected in PART_CIV_PAR2'
        write(6,*) '  LENGTH,LBLOCK ',LENGTH,LBLOCK
        WRITE(6,*) ' Alter GAS space of raise buffer from ', MXLNG
        call quit( ' Insufficient buffer space in PART_CIV_PAR2. ' )
      ELSE
C       This batch is finished, goto next batch
        GOTO 2000
      END IF
 2002 CONTINUE
C
      IF(NTEST.NE.0) THEN
       IF(IEXCLBLK .eq. 1 ) THEN
        WRITE(6,*) 'Output from PART_CIV_PAR2'
        WRITE(6,*) '========================='
        WRITE(6,*)
        WRITE(6,*) ' Number of batches (MYPROC)', NBATCH, MYPROC
        DO JBATCH = 1, NBATCH
          WRITE(6,*)
          WRITE(6,*) ' Info on batch ', JBATCH
          WRITE(6,*) ' *********************** '
          WRITE(6,*)
          WRITE(6,*) '      Length of batch           ', LEBATCH(JBATCH)
          WRITE(6,*) '      Number of blocks included ', LBATCH(JBATCH)
          WRITE(6,*) '      TTSS and offsets and lengths of each block '
          DO IBLOCK = I1BATCH(JBATCH),I1BATCH(JBATCH)+ LBATCH(JBATCH)-1
            WRITE(6,'(10X,4I3,4I8)') (IBATCH(II,IBLOCK),II=1,8)
          END DO
        END DO
       END IF
      END IF
C
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE PRINT_BATCH_INFO(NCBATCH,LBATC,LEBATC,
     &           I1BATC,IBATC,ICCTOS,ICWEIGHT,NBLOCK,IBTOTW,ICWEIGHTF,
     &           IABSOLUTE_WEIGHT)
*
      IMPLICIT REAL*8(A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION LBATC(*),LEBATC(*),I1BATC(*),IBATC(8,*)
      DIMENSION ICCTOS(NBLOCK,NBLOCK),ICWEIGHT(NBLOCK)
      DIMENSION IBTOTW(NBLOCK),ICWEIGHTF(NBLOCK)
      INTEGER*8 IABSOLUTE_WEIGHT
*
      NTEST = 99
      IF(MYPROC.EQ.MASTER .and. NTEST .ge. 100 )THEN
        WRITE(6,*) ' OUTPUT from PRINT_BATCH_INFO'
        WRITE(6,*) ' ============================'
        WRITE(6,*) ' '
        WRITE(6,*) ' Number of batches ', NCBATCH
      END IF
      DO JBATCH = 1, NCBATCH
        IF(MYPROC.EQ.MASTER .and. NTEST .ge. 100 ) THEN
          WRITE(6,*)
          WRITE(6,*) ' Info on batch ', JBATCH
          WRITE(6,*) ' *********************** '
          WRITE(6,*) ' '
          WRITE(6,*) ' Number of blocks included ', LBATC(JBATCH)
        END IF
CSK        WRITE(6,*) '   TTSS and offsets and lengths of each block '
        DO IBLOCK =I1BATC(JBATCH),I1BATC(JBATCH)+ LBATC(JBATCH)-1
CSK          WRITE(6,'(10X,4I3,4I8)') (IBATC(II,IBLOCK),II=1,8)
CSK          IF(IBATC(8,IBLOCK).NE.0.AND.MYPROC.EQ.MASTER.AND.NTEST.GE.10) 
CSK     &       WRITE(6,'(2X,A,1X,I5,1X,A,1X,I8)') 'Block number',IBLOCK,
CSK     & 'has length',(IBATC(II,IBLOCK),II=8,8)
          JJJ = 0
          DO IJ = 1, NBLOCK
            IF(ICCTOS(IJ,IBLOCK).NE.0) THEN
              JJJ = JJJ + 1
              ICWEIGHT(JJJ) = IJ
            ENDIF
          END DO
          ICWEIGHTF(IBLOCK) = JJJ
          IF(JJJ.GT.0.AND.MYPROC.EQ.MASTER.AND.NTEST.GE.100) THEN
            WRITE(6,'(2X,A,I4)')'Number of connections for this block:',
     &                           JJJ
            CALL IWRTMA(ICWEIGHT,1,JJJ,1,JJJ)
            
            WRITE(6,*)'  '
          ENDIF
          IBTOTW(IBLOCK) = (JJJ) * IBATC(8,IBLOCK)
        END DO
      END DO
      IF(MYPROC.EQ.MASTER.AND.NTEST.GE.10) THEN
CSK        WRITE(6,'(2X,A)')'TOTAL WEIGHT FOR EACH BLOCK'
CSK        CALL IWRTMA8(IBTOTW,1,NBLOCK,1,NBLOCK)
CSK        WRITE(6,'(2X,A)')'WEIGHT MATRIX'
CSK        CALL IWRTMA(ICWEIGHTF,1,NBLOCK,1,NBLOCK)
      END IF
      DO IBLK = 1, NBLOCK
        IABSOLUTE_WEIGHT = IABSOLUTE_WEIGHT + 
     &                     ( ICWEIGHTF(IBLK) * IBATC(8,IBLK) )
      END DO
*
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE PRSYMMN(A,MATDIM,LUWRT)
C PRINT LOWER HALF OF A SYMMETRIC MATRIX OF DIMENSION MATDIM.
C THE LOWER HALF OF THE MATRIX IS SUPPOSED TO BE IN VECTOR A.
#include "implicit.h"
      DIMENSION A(*)
      JSTART=1
      JSTOP=0
      DO 100 I=1,MATDIM
        JSTART=JSTART+I-1
        JSTOP=JSTOP +I
        WRITE(LUWRT,1010) I,(A(J),J=JSTART,JSTOP)
  100 CONTINUE
      RETURN
 1010 FORMAT(/I6,5E14.7,/,(6X,5E14.7))
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE REDVEC(VECIN,VECOUT,NDIM,NTYPE,NOP,NCOMM,NTARGET)
*
*. Reduce an array VECIN of dimension NDIM that is distributed over 
*. NCOMM to VECOUT on either a single process (NTARGET >= 0) or on 
*. all processes (NTARGET < 0). NTYPE determines the MPI_DATATYPE 
*. used in the reduce-call. NOP is the reduce-operation.
*
      IMPLICIT REAL*8 (A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION VECIN(*),VECOUT(*)
      INTEGER DATATYPE
*
      IF(NTARGET.GE.0)THEN
        if(NTYPE .eq. 1)then
          call interface_mpi_reduce_i1_work_f77(VECIN,VECOUT,NDIM,NOP,
     &                                          NTARGET,NCOMM)
        else if(NTYPE .eq. 2)then
          call interface_mpi_reduce_r1_work_f77(VECIN,VECOUT,NDIM,NOP,
     &                                          NTARGET,NCOMM)
        end if
      ELSE IF(NTARGET.LT.0)THEN
        if(NTYPE .eq. 1)then
          call interface_mpi_allreduce_i1_work_f77(VECIN,VECOUT,NDIM,
     &                                             NOP,NCOMM)
        else if(NTYPE .eq. 2)then
          call interface_mpi_allreduce_r1_work_f77(VECIN,VECOUT,NDIM,
     &                                             NOP,NCOMM)
        end if
      ENDIF

      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE SCLVCSUM(FAC,VEC1,VEC2,NBATCH_BLK,NBATCH_INF,
     &                    IOFF_INT,IVCOFF)
 
C
C     OUTPUT
C     ======
C
C     VEC2 == VEC1 * FAC + VEC2
C
C     Last revision:     S. Knecht       - April  2007
C
************************************************************************
      IMPLICIT REAL*8 (A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION VEC1(*), VEC2(*),IVCOFF(*),NBATCH_INF(8,*)
      INTEGER JOFF,ILEN
C
      DO ISBLK = 1, NBATCH_BLK
C
        ILEN = IVCOFF( IOFF_INT + ISBLK - 1 )
C
        IF ( ILEN .gt. 0 ) THEN
          JOFF = NBATCH_INF(6,ISBLK)
          CALL DAXPY( ILEN, FAC, VEC1(JOFF), 1, VEC2(JOFF), 1 )
        END IF

      END DO
C     
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      REAL*8 FUNCTION BLK_DDOT(VEC1,VEC2,NBATCH_BLK,NBATCH_INF,
     &                         IOFF_INT,IVCOFF)
 
C
C     OUTPUT
C     ======
C
C     BLK_DDOT == SUM( VEC1 * VEC2)
C
C     Last revision:     S. Knecht       - April  2007
C
************************************************************************
      IMPLICIT REAL*8 (A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION VEC1(*), VEC2(*),IVCOFF(*),NBATCH_INF(8,*)
      INTEGER JOFF,ILEN
C
      XXX = 0.0D0
C
      DO ISBLK = 1, NBATCH_BLK
C
        ILEN = IVCOFF( IOFF_INT + ISBLK - 1 )
C
        IF ( ILEN .gt. 0 ) THEN
          JOFF = NBATCH_INF(6,ISBLK)
          XXX = XXX + DDOT( ILEN, VEC1(JOFF), 1, VEC2(JOFF), 1 )
        END IF

      END DO
C     
      BLK_DDOT = XXX
C
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE SET_ACTIVE(VEC,IAMACTIVE,FAC,NDIM)
C
C     OUTPUT
C     ======
C
C     active blocks in VEC flagged with a factor FAC
C
C     Last revision:     S. Knecht       - March  2007
C
************************************************************************
      IMPLICIT REAL*8 (A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION VEC(NDIM), IAMACTIVE(NDIM)
C
      DO IBLK = 1, NDIM
        IF( IAMACTIVE(IBLK) .eq. MYPROC ) VEC(IBLK) = FAC
      END DO
C
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE TRAVCD_PAR(VEC1,VEC2,X,NVECIN,NVECOUT,LUIN,LUOUT,
     &                      ICOPY,LBLK,LUSCR1,LUSCR2)
*
* NVECIN vectors reside on LU1, Transform these vectors,
*  using LUSCR1 and  LUSCR2  as
* scratch files,  with matrix X to produce output file LUOUT.
*
* Since LUIN is accessed several times it is always
* REWOUND. LUOUT is written to from current start.
*
* I ICOPY .ne. 0 the transformed vectors are written back to LUIN
*
* Jeppe Olsen, April 1997
*
      IMPLICIT REAL*8(A-H,O-Z)
*. Input
      DIMENSION X(NVECIN,NVECOUT)
*. Scratch
      DIMENSION VEC1(*),VEC2(*)
*
C             MVCSMD(LUIN,FAC,LUOUT,LUSCR,VEC1,VEC2,NVEC,IREW,LBLK)
      DO IVECOUT = 1, NVECOUT
         CALL MVCSMD_PAR(LUIN,X(1,IVECOUT),LUSCR1,LUSCR2,VEC1,VEC2,
     &               NVECIN,1,LBLK)
         CALL REWINE(LUSCR1,-1)
         CALL COPVCD(LUSCR1,LUOUT,VEC1,0,LBLK)
      END DO
*
      IF(ICOPY.EQ.1) THEN
        CALL REWINE(LUIN,-1)
        CALL REWINE(LUOUT,-1)
        DO IVECOUT = 1, NVECOUT
          CALL COPVCD(LUOUT,LUIN,VEC1,0,LBLK)
        END DO
      END IF
*
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE UPDATE_GEN_LIST(ISLIST1,ISLIST2,NDIM)
C
C     OUTPUT
C     ======
C
C     updated content of ISLIST1
C
C     Last revision:     S. Knecht       - May  2007
C
************************************************************************
#include "implicit.h"
      DIMENSION ISLIST1(*), ISLIST2(*)
      INTEGER NZERO
C
      NZERO = 0
C
      DO IBLK = 1, NDIM
C
        IF( ISLIST1( IBLK ) .ne. NZERO ) THEN
          IF( ISLIST2( IBLK ) .eq. NZERO ) THEN
            ISLIST1( IBLK ) = NZERO
          END IF
        END IF
C
      END DO
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE IFACTOSFAC(ISLIST1,SLIST2,NDIM)
C
C     OUTPUT
C     ======
C
C     updated content of SLIST2
C
C     Last revision:     S. Knecht       - March  2008
C
************************************************************************
#include "implicit.h"
      DIMENSION ISLIST1(*), SLIST2(*)
      INTEGER NZERO
C
      NZERO = 0
      ONE  = 1.0D0
C
      DO IBLK = 1, NDIM
C
        IF( ISLIST1( IBLK ) .ne. NZERO ) THEN
           SLIST2( IBLK ) = ONE
        END IF
C
      END DO
C
      END
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE WRSVCD_PAR(LU,LBLK,VEC1,IPLAC,VAL,NSCAT,
     &                      IBLOCKL,IBLOCKD,NBLOCK,JPACK,IXROOT,
     &                      LU1LIST)
*
* Write scattered vector to disc, every node writes only it's part.
* Information about the length of a blcok is stored in IBLOCKL.
* Information about which block belongs to which node is stored in
* IBLOCKD. 
* IXROOT is the current root to place a 1.0D0.
* LU1LIST is the MPI file list.
*
      use interface_to_mpi
      IMPLICIT REAL*8 (A-H,O-Z)
#include "infpar.h"
      INTEGER ISTAT(df_mpi_status_size)
#include "parluci.h"
*     input
      DIMENSION IPLAC(*),VAL(*),IBLOCKL(NBLOCK),IBLOCKD(NBLOCK)
*     scratch
      DIMENSION VEC1(*)
      LOGICAL FOUND_ONE
      INTEGER(KIND=df_mpi_offset_kind) IOFF_SCR1, IOFF_SCR2
      DIMENSION LU1LIST(*)
      INTEGER IBOTTO
*
      FOUND_ONE = .FALSE.
C
C     initialize offset
      IOFF_SCR1 = 0
      IOFF_SCR2 = 0
      INT_IOFF1  = 0
      INT_IOFF2  = 0
      IBOTTO = 0
C
      IOFF_SCR1 = MY_LU1_OFF + MY_VEC2_IOFF * ( IXROOT - 1 )
      INT_IOFF1 = 1 + MY_ACT_BLK2 * ( IXROOT    - 1 )
csk   WRITE(LUWRT,*) 'OFFSET at START',IOFF_SCR1
csk   WRITE(LUWRT,*) 'INT_IOFF1 at START ',INT_IOFF1
csk   WRITE(LUWRT,*) 'IBLOCKD '
csk   CALL IWRTMAMN(IBLOCKD,1,NBLOCK,1,NBLOCK,LUWRT)
csk   WRITE(LUWRT,*) 'IBLOCKL '
csk   CALL IWRTMAMN(IBLOCKL,1,NBLOCK,1,NBLOCK,LUWRT)
csk   WRITE(LUWRT,*) 'IBLOCKL '
csk   CALL IWRTMAMN(IBLOCKL,1,NBLOCK,1,NBLOCK,LUWRT)
csk   WRITE(LUWRT,*) ' VAL '
csk   CALL WRTMATMN(VAL,1,10,1,10,LUWRT)
csk   WRITE(LUWRT,*) ' IPLAC '
csk   CALL IWRTMAMN(IPLAC,1,10,1,10,LUWRT)

C
      IONE = 1
      NTEST = 0
      IBOTTO = 1
      LBL = 0
csk   WRITE(LUWRT,*) ' IBOTTO',IBOTTO
*     loop over blocks
      DO II = 1, NBLOCK
        IF( IBLOCKD(II) .ne. MYPROC ) GOTO 300
*       length
        LBL = IBLOCKL(II)
csk     WRITE(LUWRT,*) ' LBL and IBOTTO for block II',LBL,IBOTTO,II

        CALL DZERO(VEC1,LBL)
C       *******************************************************
C       *******************************************************
C              Find the places where a 1.0 should be set 
C       *******************************************************
C       *******************************************************
        DO 200 IEFF = 1, NSCAT
csk       WRITE(LUWRT,*) ' IPLAC(IEFF), IEFF',IPLAC(IEFF), IEFF
          IF( IPLAC(IEFF).GE.IBOTTO.AND.IPLAC(IEFF).LE.IBOTTO+LBL-1)THEN
             VEC1(IPLAC(IEFF)-IBOTTO+1) = VAL(IEFF)
             FOUND_ONE = .TRUE.
          END IF
C
          NTEST = 00
C
          IF(NTEST.GE.10) THEN
            IF( IPLAC(IEFF).GE.IBOTTO.AND.IPLAC(IEFF).LE.IBOTTO+LBL-1)
     &      write(6,*) 'IBOT,IBOTTO+LBL-1',IBOTTO,IBOTTO+LBL-1
            IF( IPLAC(IEFF).GE.IBOTTO.AND.IPLAC(IEFF).LE.IBOTTO+LBL-1)
     &      write(6,*) ' Catch : IPLAC(IEFF) VAL(IEFF) ',
     &      IPLAC(IEFF),VAL(IEFF)
          END IF
*
  200     CONTINUE
C
C         new offset
C
          IOFF_SCR1 = IOFF_SCR1 + IOFF_SCR2
          INT_IOFF1 = INT_IOFF1 + INT_IOFF2
C
          IF( FOUND_ONE ) THEN
csk         WRITE(LUWRT,*) 'THIS IS WHAT I WILL WRITE',MYPROC
csk         CALL WRTMATMN(VEC1,1,LBL,1,LBL,LUWRT)
csk         WRITE(LUWRT,*) 'OFFSET',IOFF_SCR1
csk         WRITE(LUWRT,*) 'INT_IOFF1 is ',INT_IOFF1
            LU1LIST( INT_IOFF1 ) = IONE
            call interface_mpi_FILE_WRITE_AT_r(ILU1,IOFF_SCR1,VEC1,LBL,
     &                             ISTAT)
          END IF
C
          FOUND_ONE = .FALSE.
C         keep track of correct offset
          IOFF_SCR2 = LBL
          INT_IOFF2 = IONE
  300     CONTINUE
C         keep IBOT up-to-date for all nodes
          LBL = IBLOCKL( II )
          IBOTTO = IBOTTO + LBL
      END DO  
*
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE INPROD_B_PAR_RL(LUIN1,LUIN2,VEC1,VEC2,SUBSPH,NBATCH,
     &                           LBATCH,LEBATCH,I1BATCH,IBATCH,
     &                           MY_IOFF_LUIN1,MY_IOFF_LUIN2,
     &                           LUIN1LIST,LUIN2LIST,JOFF)
C
C     Written by  S. Knecht         - May 29 2007
C
C**********************************************************************
C
C     calculating dot product between two vectors on file LUIN1 resp.
C     LUIN2
C
C     NOTE: JOFF = IVEC
C
C     active blocks on the MPI-files are flagged by a nonzero length
C
C     Last revision:     S. Knecht       - May  2007
C
************************************************************************
      use interface_to_mpi
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
#include "infpar.h"
#include "parluci.h"
      DIMENSION VEC1(*), VEC2(*), SUBSPH(*)
      DIMENSION LBATCH(*), LEBATCH(*), I1BATCH(*), IBATCH(8,*)
      DIMENSION LUIN1LIST(*), LUIN2LIST(*)
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN1
      INTEGER(KIND=df_mpi_offset_kind) MY_IOFF_LUIN2
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN1 
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_IN_LUIN2
      INTEGER(KIND=df_mpi_offset_kind) IOFFSET_SCRATCH
      INTEGER NUM_BLK
C
C     initialize scratch offsets
      NUM_BLK = 0
      IOFFSET_SCRATCH = 0
      IOFFSET_IN_LUIN1  = 0
      IOFFSET_IN_LUIN2  = 0
      IOFFSET_INT_IN1  = 0
      IOFFSET_INT_IN2  = 0
C
      DO ISBATCH = 1, NBATCH
C
C       offset for batch ISBATCH w.r.t JOFF
C
        CALL DZERO(VEC2,LEBATCH(ISBATCH))
C
C       set new offset
C       position in file is at the end of vector JOFF - 1
C
        IOFFSET_IN_LUIN1 = MY_IOFF_LUIN1 + IOFFSET_SCRATCH +
     &                     ( JOFF - 1 )  * MY_VEC1_IOFF
        IOFFSET_INT_IN1  = 1 + NUM_BLK  +
     &                     ( JOFF - 1 ) * MY_ACT_BLK1
C
csk     WRITE(LUWRT,*) 'This is my OFFSET for LUIN1',
csk  &                  IOFFSET_IN_LUIN1
C
        CALL RDVEC_BATCH_DRV4(LUIN1,VEC2,LBATCH(ISBATCH),
     &                       IBATCH(1,I1BATCH(ISBATCH)),
     &                       IOFFSET_IN_LUIN1,IOFFSET_INT_IN1,
     &                       LUIN1LIST,NUM_ACTIVE_BATCH)
C
csk     WRITE(LUWRT,*) 'initial VEC2 on LUIN1'
csk     CALL WRTMATMN(VEC2,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),LUWRT)
C
        DO 100 IVEC = 1, JOFF
C
C          set new offset and zero read-in vector
C
           CALL DZERO(VEC1,LEBATCH(ISBATCH))
C
           IOFFSET_IN_LUIN2 = MY_IOFF_LUIN2 + IOFFSET_SCRATCH +
     &                        ( IVEC - 1 )  * MY_VEC1_IOFF
           IOFFSET_INT_IN2  = 1 + NUM_BLK  + 
     &                        ( IVEC - 1 ) * MY_ACT_BLK1
C
csk     WRITE(LUWRT,*) 'This is my OFFSET for LUIN2',
csk  &                  IOFFSET_IN_LUIN2
C
C
C          read in batch ISBATCH from LUIN1 to VEC1
C
           CALL RDVEC_BATCH_DRV4(LUIN2,VEC1,LBATCH(ISBATCH),
     &                           IBATCH(1,I1BATCH(ISBATCH)),
     &                           IOFFSET_IN_LUIN2,IOFFSET_INT_IN2,
     &                           LUIN2LIST,NUM_ACTIVE_BATCH)
C
csk     WRITE(LUWRT,*) 'initial VEC1 on LUIN2'
csk     CALL WRTMATMN(VEC1,1,LEBATCH(ISBATCH),1,LEBATCH(ISBATCH),LUWRT)
C
C
C          IJ = JOFF*(JOFF-1)/2 + IVEC
C          SUBSPH(IJ) == VEC1 * VEC2
C
           IJ = JOFF*(JOFF-1)/2 + IVEC
C
           SUBSPH(IJ) = SUBSPH(IJ) + 
     &                  DDOT(LEBATCH(ISBATCH),VEC1,1,VEC2,1)
C
  100   CONTINUE
C
C       keep track of correct offset
        IOFFSET_SCRATCH = IOFFSET_SCRATCH + LEBATCH(ISBATCH)
        NUM_BLK         = NUM_BLK + NUM_ACTIVE_BATCH
C
      END DO
C
      END
***********************************************************************
*                                                                     *
* LUCIAREL, written by Timo Fleig and Jeppe Olsen                     *
*           parallelization by Stefan Knecht                          *
*                                                                     *
***********************************************************************
      SUBROUTINE ORDER_ARRAY(IMAT,NDIM)
C
C     Written by  S. Knecht         - May 29 2007
C
C**********************************************************************
C
C     order array with numbers .le. 0 first, .lt. 0 after            
C
C     Last revision:     S. Knecht       - May  2008
C
************************************************************************
      IMPLICIT DOUBLE PRECISION ( A-H,O-Z)
      DIMENSION IMAT(NDIM)
      INTEGER ICHECK_POS, ICHECK_PRO
      ICHECK_POS = 0
      ICHECK_PRO = 0
C
        DO JJ = 1, NDIM
           ICHECK_PRO = IMAT(JJ)
           IF( ICHECK_PRO .ne. - 1) THEN
             ICHECK_POS = ICHECK_POS + 1
             IMAT(JJ) = -1
             IMAT(ICHECK_POS) = ICHECK_PRO
           END IF
        END DO
C
      END
#else
***********************************************************************
*                                                                     *
* LUCITA, by Jeppe Olsen, DIRAC adaption by Timo Fleig                *
* parallelization by Stefan Knecht                                    *
*                                                                     *
***********************************************************************
      SUBROUTINE PAR_DUMMY2
      END 
***********************************************************************
#endif /* defined (VAR_MPI2) */
