! Copyright 2020
!
! For a comprehensive list of the developers that contributed to these codes
! see the UK-AMOR website.
!
! This file is part of UKRmol-out (UKRmol+ suite).
!
!     UKRmol-out is free software: you can redistribute it and/or modify
!     it under the terms of the GNU General Public License as published by
!     the Free Software Foundation, either version 3 of the License, or
!     (at your option) any later version.
!
!     UKRmol-out is distributed in the hope that it will be useful,
!     but WITHOUT ANY WARRANTY; without even the implied warranty of
!     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!     GNU General Public License for more details.
!
!     You should have received a copy of the GNU General Public License
!     along with  UKRmol-out (in source/COPYING). Alternatively, you can also visit
!     <https://www.gnu.org/licenses/>.
!
!> \brief   I/O routines used by MULTIDIP
!> \author  J Benda
!> \date    2020 - 2022
!>
!> This module contains routines that read the necessary input files (K-matrices, scattering coefficients, molecular_data files)
!> and return the needed subset of data.
!>
module multidip_io

    use, intrinsic :: iso_c_binding,   only: c_double, c_int, c_f_pointer
    use, intrinsic :: iso_fortran_env, only: input_unit, int32, int64, real64, output_unit, iostat_end

    ! GBTOlib
    use blas_lapack_gbl,  only: blasint
#ifdef WITH_MMAP
    use file_mapping_gbl, only: file_mapping
#endif
    use phys_const_gbl,   only: pi, imu, to_eV
    use precisn_gbl,      only: wp, wp_bytes

    implicit none

#ifdef WITH_SCALAPACK
    integer(blasint), external :: numroc
#endif

    integer :: myproc = 1
    integer :: nprocs = 1

    !> \brief   K-matrix file
    !> \author  J Benda
    !> \date    2020 - 2021
    !>
    !> This data structure contains data read from a K-matrix file produced by RSOLVE.
    !> K-matrices are used to obtain the final stationary photoionization wave function.
    !>
    type KMatrix
        integer  :: mgvn, stot, nescat, nchan, nopen, ndopen, nchsq, lukmt, nkset, maxne
        real(wp), allocatable :: escat(:)
    contains
        procedure :: get_kmatrix
        procedure :: read_kmatrix_file
        procedure :: reset_kmatrix_position
    end type KMatrix


    !> \brief   Photoionization wave function coefficients
    !> \author  J Benda
    !> \date    2020
    !>
    !> Photoionization Ak coefficients calculated and written by RSOLVE. This can be used
    !> as an alternative to calculating the Ak coefficients on the fly.
    !>
    type ScatAkCoeffs
        real(wp), allocatable :: ReA(:, :, :), ImA(:, :, :), evchl(:)
        integer,  allocatable :: ichl(:), lvchl(:), mvchl(:)
        integer               :: mgvn, nesc, nchan, nstat, lusct
    contains
        procedure :: get_wfncoeffs
        procedure :: read_wfncoeffs_file
        procedure :: reset_wfncoeffs_position
    end type ScatAkCoeffs


    !> \brief  Auxiliary data structure for matrix (potentially memory-mapped, or distributed)
    !> \author J Benda
    !> \date   2021 - 2022
    !>
    !> Matrix class that either contains allocated data, or pointer to a mapped memory.
    !> Used in MolecularData for inner dipole matrices and for boundary amplitude matrices.
    !> If the logical flag "distributed" is set to true before reading data, the matrix
    !> will be read using MPI-IO into a ScaLAPACK-compatible block-cyclic distributed matrix.
    !>
    type MappedMatrix
        real(real64), pointer :: mat(:, :) => null()  !< pointer to mapped disk data
#ifdef WITH_MMAP
        type(file_mapping)    :: mapping              !< helper structure for file memory mapping
#endif
        logical          :: distributed  = .false.    !< whether this is just a local portion of a distributed ScaLAPACK matrix
        integer(blasint) :: desc(9)      = 0          !< BLACS descriptors (only used when distributed = .true.)
        integer(blasint) :: row_context  = 0          !< auxiliary linear BLACS grid context
        integer(blasint) :: blk_context  = 0          !< main rectangular BLACS grid context
        integer(blasint) :: block_size   = 0          !< ScaLAPACK block size
    contains
        procedure :: load => load_mapped_matrix
        final     :: destruct_mapped_matrix
    end type MappedMatrix


    !> \brief   RMT molecular data file
    !> \author  J Benda
    !> \date    2020 - 2023
    !>
    !> This data structure contains data read from the molecular_data file produced by RMT_INTERFACE.
    !> Only a subset of values needed by this program is stored in memory. This in particular
    !> includes the inner and outer region transition dipole elements, boundary amplitudes and
    !> angular integrals of the real spherical harmonics.
    !>
    !> When the code is compiled with the WITH_MMAP=1 option, the dipole matrices will not be read
    !> into memory, but only mapped to the virtual memory.
    !>
    !> When the code is compiled with ScaLAPACK support, the large inner dipole matrices will be
    !> distributed in the standard block-cyclic fashion.
    !>
    type MolecularData

        ! inner transition dipoles storage
        type(MappedMatrix), allocatable :: dipx(:), dipy(:), dipz(:)
        integer(int32),     allocatable :: iidipx(:), iidipy(:), iidipz(:)
        integer(int32),     allocatable :: ifdipx(:), ifdipy(:), ifdipz(:)

        ! boundary amplitudes
        type(MappedMatrix), allocatable :: wamp(:)
        type(MappedMatrix), allocatable :: wmat2(:, :)

        ! number of electrons, number of protons
        integer(int32) :: nelc, nz

        ! other data
        real(real64)                :: rmatr
        real(real64),   allocatable :: crlv(:,:,:), eig(:,:), rg(:), etarg(:), gaunt(:, :, :), r_points(:)
        integer(int32), allocatable :: mgvns(:), stot(:), mnp1(:), nchan(:), l2p(:,:), m2p(:,:), ichl(:,:), lm_rg(:,:)
        integer(int32), allocatable :: ltarg(:), starg(:)

    end type MolecularData

contains

    !> \brief  Read or map a matrix from file
    !> \author J Benda
    !> \date   2021 - 2024
    !>
    !> Depending on the compilation settings, either map the given section of the file to memory,
    !> or simply allocate and read the chunk. Optionally, read and distribute the matrix in the
    !> block-cyclic way.
    !>
    subroutine load_mapped_matrix (this, filename, u, offset, rows, cols)

#ifdef WITH_SCALAPACK
        use mpi
        use mpi_gbl, only: mpi_xermsg

        integer, parameter :: mpiint = kind(MPI_COMM_WORLD)
        integer, parameter :: mpiofs = MPI_OFFSET_KIND

        character(len=200) :: msg

        integer(mpiint) :: ndims, gsizes(2), distrb(2), dargs(2), psizes(2), locsiz, fh, ierr, datype
        integer(mpiint) :: comm_size, comm_rank, stat(MPI_STATUS_SIZE), mtwo = 2, dims(2)
        integer(mpiofs) :: offs
#endif

        class(MappedMatrix), intent(inout) :: this
        character(len=*),    intent(in)    :: filename
        integer(int32),      intent(in)    :: u, rows, cols
        integer(int64),      intent(in)    :: offset

#ifdef WITH_MMAP
        integer(int64)   :: length
#endif
        integer(blasint) :: zero = 0, one = 1, nprow, npcol, lrows, lcols, min_block_size, max_block_size, ld, info, myprow, mypcol
        integer(blasint) :: brows, bcols

#ifdef WITH_SCALAPACK
        if (this % distributed) then
            call MPI_Comm_size(MPI_COMM_WORLD, comm_size, ierr)
            call assert_status('Failed to obtain world communicator size: ', int(ierr))
            call MPI_Comm_rank(MPI_COMM_WORLD, comm_rank, ierr)
            call assert_status('Failed to obtain world communicator rank: ', int(ierr))

            dims = 0
            call MPI_Dims_create(comm_size, mtwo, dims, ierr)
            call assert_status('Failed to reshape world communicator to rectangular grid: ', int(ierr))
            nprow = maxval(dims)
            npcol = minval(dims)

            if (npcol * nprow /= nprocs) then
                write (msg, '(a,3(i0,a))') 'BLACS grid ', nprow, 'x', npcol, ' does not use all ', nprocs, ' processes. Fix it.'
                call mpi_xermsg('multidip_io', 'load_mapped_matrix', trim(msg), 1, 1);
            end if

            call blacs_get(zero, zero, this % row_context)
            call blacs_get(zero, zero, this % blk_context)
            call blacs_gridinit(this % row_context, 'R', one, nprow * npcol)
            call blacs_gridinit(this % blk_context, 'R', nprow, npcol)
            call blacs_gridinfo(this % blk_context, nprow, npcol, myprow, mypcol)

            min_block_size = 1
            max_block_size = 512  ! rather arbitrary, but this particular choice coincides with a common page size (4 kiB)

            this % block_size = min(rows / nprow, cols / npcol)
            this % block_size = min(this % block_size, max_block_size)
            this % block_size = max(this % block_size, min_block_size)

            brows = rows
            bcols = cols
            lrows = numroc(brows, this % block_size, myprow, zero, nprow)
            lcols = numroc(bcols, this % block_size, mypcol, zero, npcol)
            ld = max(one, lrows)

            if (int(lrows, int64)*int(lcols, int64) > huge(1_int32)) then
                print '(a,i0,a,i0,a)', 'WARNING: Local portion of the dipole matrix has size ', lrows, ' x ', lcols, '.'
                print '(9x,a)', 'This means that it cannot be indexed with 4-byte integers.'
                print '(9x,a,/)', 'If reading fails, try distributing the calculation more.'
            end if

            call descinit(this % desc, brows, bcols, this % block_size, this % block_size, zero, zero, this % blk_context, ld, info)

            allocate (this % mat(ld, lcols))

            offs   = offset - 1
            ndims  = 2
            gsizes = [ rows, cols ]
            distrb = [ MPI_DISTRIBUTE_CYCLIC, MPI_DISTRIBUTE_CYCLIC ]
            dargs  = [ this % block_size, this % block_size ]
            psizes = [ nprow, npcol ]
            locsiz = lrows * lcols

            call MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, fh, ierr)
            call assert_status('Failed to open file for MPI-IO: ', int(ierr))
            call MPI_Type_create_darray(comm_size, comm_rank, ndims, gsizes, distrb, dargs, &
                                  psizes, MPI_ORDER_FORTRAN, MPI_REAL8, datype, ierr)
            call assert_status('Failed to define distributed array data type: ', int(ierr))
            call MPI_Type_commit(datype, ierr)
            call assert_status('Failed to commit distributed array data type: ', int(ierr))
            call MPI_File_set_view(fh, offs, MPI_REAL8, datype, 'native', MPI_INFO_NULL, ierr)
            call assert_status('Failed to define file view: ', int(ierr))
            call MPI_File_read_all(fh, this % mat, locsiz, MPI_REAL8, stat, ierr)
            call assert_status('Failed to read from file: ', int(ierr))
            call MPI_Type_free(datype, ierr)
            call assert_status('Failed to release distributed array data type: ', int(ierr))
            call MPI_File_close(fh, ierr)
            call assert_status('Failed to close file: ', int(ierr))
        else
#endif
#ifdef WITH_MMAP
            length = int(rows, int64) * int(cols, int64) * wp_bytes
            call this % mapping % init(filename, offset - 1, length)
            call c_f_pointer(this % mapping % ptr, this % mat, [rows, cols])
#else
            allocate (this % mat(rows, cols))
            read (u) this % mat
#endif
#ifdef WITH_SCALAPACK
        end if
#endif

    end subroutine load_mapped_matrix


    !> \brief   Finalize a (potentially mapped) matrix object
    !> \author  J Benda
    !> \date    2021 - 2022
    !>
    !> Safely deallocates the matrix. If it has been mapped, the pointer
    !> is only nullified; unmapping happens automatically in the desctructor
    !> of the file_mapping type.
    !>
    subroutine destruct_mapped_matrix (this)

        type(MappedMatrix), intent(inout) :: this

#ifdef WITH_SCALAPACK
        if (this % distributed) then
            deallocate (this % mat)
        else
#endif
#ifndef WITH_MMAP
            if (associated(this % mat)) then
                deallocate (this % mat)
            end if
#endif
#ifdef WITH_SCALAPACK
        end if
#endif
        nullify (this % mat)

    end subroutine destruct_mapped_matrix


    !> \brief   Read K-matrix files
    !> \author  J Benda
    !> \date    2020 - 2024
    !>
    !> Read all needed K-matrix files. These are needed to calculate photionization coefficients (Ak)
    !> during the calculation. Also perform consistency check between the energy samples in
    !> individual files.
    !>
    subroutine read_kmatrices (km, lukmt, nkset)

        use mpi_gbl, only: mpi_xermsg

        type(KMatrix), allocatable, intent(inout) :: km(:)
        integer,                    intent(in)    :: lukmt(:), nkset(:)

        character(len=200) :: msg
        integer :: i

        do i = 1, size(km)

            km(i) % lukmt = lukmt(i)
            km(i) % nkset = nkset(i)

            call km(i) % read_kmatrix_file

            if (km(i) % nescat /= km(1) % nescat) then
                write (msg, '(a,2(i0,a))') 'ERROR: K-matrix units ', lukmt(1), ' and ', lukmt(i), &
                                           ' hold different number of scattering energies!'
                call mpi_xermsg('multidip_io', 'read_kmatrices', trim(msg), 1, 1)
            end if

            if (any(km(i) % escat(1 : km(i) % nescat) /= km(1) % escat(1 : km(1) % nescat))) then
                write (msg, '(a,2(i0,a))') 'ERROR: K-matrix units ', lukmt(1), ' and ', lukmt(i), &
                                           ' hold different scattering energies!'
                call mpi_xermsg('multidip_io', 'read_kmatrices', trim(msg), 1, 1)
            end if

        end do

    end subroutine read_kmatrices


    !> \brief   Read K-matrix file
    !> \author  J Benda
    !> \date    2021 - 2024
    !>
    !> Read metadata from a K-matrix file, count K-matrices and store the unit and dataset for
    !> later use in retrieval of the K-matrices themselves. The K-matrices are *not* being read
    !> into memory here to avoid exhausting RAM (particularly in parallel mode) if the K-matrices
    !> are large.
    !>
    subroutine read_kmatrix_file (km)

        use mpi_gbl, only: mpi_xermsg

        class(KMatrix), intent(inout) :: km

        integer :: k, n, ifail, key, nset, nrec, ninfo, ndata, ntarg, nvib, ndis, ie, gutot, ion
        integer :: nopen, ndopen, nchsq, mye, nene, ierr
        logical :: iwarn
        real(wp) :: dE1, dE2, r, rmass, en
        real(wp), allocatable :: kmat(:)
        character(len=80) :: title, msg

        print '(/,A,I0,A,I0,/)', 'Reading K-matrices from unit ', km % lukmt, ', set ', km % nkset

        call getset(km % lukmt, km % nkset, 11, 'UNFORMATTED', ifail)

        read (km % lukmt) key, nset, nrec, ninfo, ndata
        read (km % lukmt) title
        read (km % lukmt) km % mgvn, km % stot, gutot, ion, r, rmass
        read (km % lukmt) ntarg, nvib, ndis, km % nchan, km % maxne

        print '(2x,A,I0)', 'mgvn  = ', km % mgvn
        print '(2x,A,I0)', 'stot  = ', km % stot
        print '(2x,A,I0)', 'ntarg = ', ntarg
        print '(2x,A,I0)', 'nvib  = ', nvib
        print '(2x,A,I0)', 'ndis  = ', ndis
        print '(2x,A,I0)', 'nchan = ', km % nchan
        print '(2x,A,I0)', 'maxne = ', km % maxne

        print '(/,2x,A,/)', 'Energy ranges:'

        km % nescat = 0
        do ie = 1, km % maxne
            read (km % lukmt) k, n, dE1, dE2
            print '(4x,I4,I8,F8.3,F8.3)', k, n, dE1, dE2
            km % nescat = km % nescat + n
        end do

        nene = (km % nescat + nprocs - 1) / nprocs
        allocate (km % escat(km % nescat), kmat(km % nchan * (km % nchan + 1)))

        iwarn = .true.
        mye = 0
        do ie = 1, km % nescat
            read (km % lukmt, iostat = ierr) nopen, ndopen, nchsq, en!, kmat(1:nchsq)
            if (ierr == iostat_end) then
                print '(/,2x,A,I0,A,I0,A)', 'WARNING: Unit ', km % lukmt, ' contains only ', ie - 1, ' K-matrices.'
                print '(11x,A)', 'This means that CFASYM failed during RSOLVE for some energies.'
                print '(11x,A)', 'The scattering energies MUST be consistent across symmetries!'
                km % nescat = ie - 1
                exit
            else if (ierr /= 0) then
                write (msg, '(2x,a,i0,a)') 'ERROR ', ierr, ' while reading K-matrix file.'
                call mpi_xermsg('multidip_io', 'read_kmatrix_file', trim(msg), 1, 1)
            end if
            km % escat(ie) = en/2  ! Ry -> a.u.
            if (iwarn .and. nopen < km % nchan) then
                print '(/,2x,A,I0,A)', 'WARNING: Unit ', km % lukmt, ' contains only open-open subset of the full K-matrix.'
                print '(11x,A)', 'Use IKTYPE = 1 in RSOLVE to save full K-matrices.'
                print '(11x,A)', 'You may get spurious below-threshold noise.'
                iwarn = .false.
            end if
        end do

        deallocate (kmat)

    end subroutine read_kmatrix_file


    !> \brief   Reset I/O pointer to start of K-matrices
    !> \author  J Benda
    !> \date    2021
    !>
    !> Rewind the unit to the start of the associated dataset and read through to the very beginning of the actual
    !> K-matrix data. Also skip the given number of leading K-matrices. This prepares the file for the subsequent
    !> calls to `get_kmatrix`.
    !>
    subroutine reset_kmatrix_position (km, skip)

        class(KMatrix),    intent(in) :: km
        integer, optional, intent(in) :: skip

        integer :: i

        ! rewind to the beginning of the K-matrix unit
        call getset(km % lukmt, km % nkset, 11, 'UNFORMATTED', i)

        read (km % lukmt)
        read (km % lukmt)
        read (km % lukmt)
        read (km % lukmt)

        do i = 1, km % maxne
            read (km % lukmt)
        end do

        if (present(skip)) then
            do i = 1, skip
                read (km % lukmt)
            end do
        end if

    end subroutine reset_kmatrix_position


    !> \brief   Read single K-matrix from the K-matrix file
    !> \author  J Benda
    !> \date    2021 - 2024
    !>
    !> Assuming that the K-matrix file associated with this object is correctly positioned, read the next K-matrix record.
    !> Symmetrize the K-matrix and store it into the allocatable two-dimensional array `kmat` (re/allocate as necessary).
    !>
    subroutine get_kmatrix (km, kmat, skip)

        use mpi_gbl, only: mpi_xermsg

        class(KMatrix),        intent(in)    :: km
        real(wp), allocatable, intent(inout) :: kmat(:, :)
        integer,  optional,    intent(in)    :: skip

        real(wp), allocatable :: buffer(:)
        character(len=200)    :: msg

        real(wp) :: en
        integer  :: a, b, c, i, ierr, nopen, ndopen, nchsq

        allocate (buffer(km % nchan * (km % nchan + 1)))

        read (km % lukmt, iostat = ierr) nopen, ndopen, nchsq, en, buffer(1:nchsq)

        if (ierr == iostat_end) then
            write (msg, '(a,i0,a)') 'ERROR: Unit ', km % lukmt, ' ended while reading K-matrix!'
            call mpi_xermsg('multidip_io', 'get_kmatrix', trim(msg), 1, 1)
        else if (ierr /= 0) then
            write (msg, '(2x,a,2(i0,a))') 'ERROR ', ierr, ' while reading K-matrix file unit ', km % lukmt, '!'
            call mpi_xermsg('multidip_io', 'get_kmatrix', trim(msg), 1, 1)
        end if

        if (allocated(kmat)) then
            if (size(kmat, 1) /= km % nchan .or. size(kmat, 2) /= km % nchan) then
                deallocate (kmat)
            end if
        end if

        if (.not. allocated(kmat)) then
            allocate (kmat(km % nchan, km % nchan))
        end if

        c = 0
        do a = 1, nopen
            do b = 1, a
                c = c + 1
                kmat(a, b) = buffer(c)
                kmat(b, a) = buffer(c)
            end do
        end do

        if (present(skip)) then
            do i = 1, skip
                read (km % lukmt, iostat = ierr)  ! ignore EOF
            end do
        end if

    end subroutine get_kmatrix


    !> \brief   Read wave function coefficients from files
    !> \author  J Benda
    !> \date    2020
    !>
    !> Read wave function (Ak) coefficients from an unformatted file written by RSOLVE. This is an optional
    !> functionality meant for dabugging. It is recommended that the Ak coefficients are calculated by the
    !> present program instead to avoid storing large datasets on disk or in memory.
    !>
    subroutine read_wfncoeffs (ak, lusct)

        type(ScatAkCoeffs), allocatable, intent(inout) :: ak(:)
        integer, intent(in) :: lusct(:)
        integer :: i

        do i = 1, size(ak)
            ak(i) % lusct = lusct(i)
            call ak(i) % read_wfncoeffs_file
        end do

    end subroutine read_wfncoeffs


    !> \brief   Read inner bound wave function coefficients
    !> \authors J Benda
    !> \date    2023
    !>
    !> Read the inner region expansion coefficients of the bound state as calculated by BOUND.
    !>
    !> \param[out] bnd    Vector of inner region expansion coefficients to fill.
    !> \param[out] Ei     Bound state energy as calculated by BOUND.
    !> \param[in]  lubnd  File unit number with the (unformatted) BOUND output.
    !> \param[in]  mgvn0  Expected MGVN (to check).
    !> \param[in]  stot0  Expected total spin (to check).
    !>
    subroutine read_bndcoeffs (bnd, Ei, lubnd, mgvn0, stot0)

        real(wp), intent(out) :: bnd(:), Ei
        integer,  intent(in)  :: lubnd, mgvn0, stot0

        real(wp), allocatable :: xvec(:), etot(:), vtemp(:)
        real(wp)              :: rr
        integer               :: nbset, nchan, mgvn, stot, gutot, nstat, nbound, iprint, iwrite, ifail
        character(len=11)     :: bform

        bform = 'unformatted'
        nbset = 1
        iprint = 0
        iwrite = output_unit

        open (lubnd, action = 'read', form = bform)

        call readbh(lubnd, nbset, nchan, mgvn, stot, gutot, nstat, nbound, rr, bform, iprint, iwrite, ifail)

        if (mgvn /= mgvn0 .or. stot /= stot0) then
            print '(*(a,i0))', 'ERROR: Bound coefficients at unit ', lubnd, ' are for MGVN = ', mgvn, ', STOT = ', stot, &
                               ', but I am looking for MGVN = ', mgvn0, ', STOT = ', stot0
            stop 1
        end if

        nbound = 1  ! read just the first bound state, ignore the rest
        allocate (xvec(nbound*nchan), vtemp(nbound), etot(nbound))

        call readbc(nstat, etot, vtemp, bnd, nbound, nchan, xvec)

        Ei = etot(1)

        close (lubnd)

    end subroutine read_bndcoeffs


    !> \brief   Read wave function coefficients for a single symmetry from a file
    !> \author  J Benda
    !> \date    2021
    !>
    subroutine read_wfncoeffs_file (ak)

        class(ScatAkCoeffs), intent(inout) :: ak

        real(wp)          :: rr
        integer           :: j, keysc, nset, nrec, ninfo, ndata, stot, gutot, nscat
        character(len=80) :: header

        print '(/,A,I0,/)', 'Reading wave function coefficients from unit ', ak % lusct

        open (ak % lusct, status = 'old', action = 'read', form = 'unformatted')

        read (ak % lusct) keysc, nset, nrec, ninfo, ndata
        read (ak % lusct) header
        read (ak % lusct) nscat, ak % mgvn, stot, gutot, ak % nstat, ak % nchan, ak % nesc

        print '(2x,A,I0)', 'mgvn  = ', ak % mgvn
        print '(2x,A,I0)', 'stot  = ', stot
        print '(2x,A,I0)', 'nstat = ', ak % nstat
        print '(2x,A,I0)', 'nchan = ', ak % nchan
        print '(2x,A,I0)', 'nesc  = ', ak % nesc

        read (ak % lusct) rr

        if (allocated(ak % ichl))  deallocate (ak % ichl)
        if (allocated(ak % lvchl)) deallocate (ak % lvchl)
        if (allocated(ak % mvchl)) deallocate (ak % mvchl)
        if (allocated(ak % evchl)) deallocate (ak % evchl)

        allocate (ak % ichl(ak % nchan))
        allocate (ak % lvchl(ak % nchan))
        allocate (ak % mvchl(ak % nchan))
        allocate (ak % evchl(ak % nchan))

        read (ak % lusct) ak % ichl, ak % lvchl, ak % mvchl, ak % evchl

        print '(/,2x,A)', 'channel table'
        do j = 1, ak % nchan
            print '(2x,4I6,F15.7)', j, ak % ichl(j), ak % lvchl(j), ak % mvchl(j), ak % evchl(j)
        end do

        close (ak % lusct)

    end subroutine read_wfncoeffs_file


    !> \brief   Reset I/O pointer to start of Ak-coeffs
    !> \author  J Benda
    !> \date    2021
    !>
    !> Rewind the unit to the start of the associated dataset and read through to the very beginning of the actual
    !> Ak-coeffs data. Also skip the given number of leading Ak-coeffs. This prepares the file for the subsequent
    !> calls to `get_wfncoeffs`.
    !>
    subroutine reset_wfncoeffs_position (ak, skip)

        class(ScatAkCoeffs), intent(in) :: ak
        integer, optional,   intent(in) :: skip

        integer :: i, ich

        ! rewind to the beginning of the K-matrix unit
        call getset(ak % lusct, 1, 88, 'UNFORMATTED', i)

        read (ak % lusct)
        read (ak % lusct)
        read (ak % lusct)
        read (ak % lusct)
        read (ak % lusct)

        if (present(skip)) then
            do i = 1, skip
                do ich = 1, ak % nchan
                    read (ak % lusct)
                    read (ak % lusct)
                end do
            end do
        end if

    end subroutine reset_wfncoeffs_position


    !> \brief   Read single Ak-matrix from the Ak-coeffs file
    !> \author  J Benda
    !> \date    2021 - 2024
    !>
    !> Assuming that the Ak-coeffs file associated with this object is correctly positioned, read the next Ak-matrix record.
    !> Also returns the scattering energy in atomic units.
    !>
    subroutine get_wfncoeffs (ak, E, Re_Ak, Im_Ak, skip)

        use mpi_gbl, only: mpi_xermsg

        class(ScatAkCoeffs),   intent(in)    :: ak
        real(wp), allocatable, intent(inout) :: Re_Ak(:, :), Im_Ak(:, :)
        integer,  optional,    intent(in)    :: skip
        real(wp),              intent(inout) :: E

        character(len=250) :: msg, iomsg

        integer  :: i, ich, ierr

        do ich = 1, ak % nchan
            read (ak % lusct, iostat = ierr, iomsg = iomsg) E, i, Re_Ak(:, ich)
            read (ak % lusct, iostat = ierr, iomsg = iomsg) E, i, Im_Ak(:, ich)
            if (ierr == iostat_end) then
                write (msg, '(2x,a,i0,a)') 'ERROR: Unit ', ak % lusct, ' ended while reading Ak-coeffs!'
                call mpi_xermsg('multidip_io', 'get_wfncoeffs', trim(msg), 1, 1)
            else if (ierr /= 0) then
                write (msg, '(2x,a,2(i0,a),a)') 'ERROR ', ierr, ' while reading Ak-coeffs file unit ', ak % lusct, ': ', iomsg
                call mpi_xermsg('multidip_io', 'get_wfncoeffs', trim(msg), 1, 1)
            end if
        end do

        if (present(skip)) then
            do i = 1, skip
                do ich = 1, ak % nchan
                    read (ak % lusct, iostat = ierr)  ! ignore EOF
                    read (ak % lusct, iostat = ierr)  ! ignore EOF
                end do
            end do
        end if

    end subroutine get_wfncoeffs


    !> \brief   Read RMT molecular_data file
    !> \author  J Benda
    !> \date    2020 - 2023
    !>
    !> Read data from the RMT molecular_data file. This includes in particular both the inner and outer region
    !> transition dipole elements, as well as for instance Gaunt angular integrals for all partial waves used.
    !>
    subroutine read_molecular_data (moldat, filename, mpiio, read_wmat2)

        use multidip_params, only: check_dipoles

        type(MolecularData), intent(inout) :: moldat
        character(*),        intent(in)    :: filename
        logical,             intent(in)    :: mpiio, read_wmat2

        integer(int32), allocatable :: nconat(:)
        real(real64),   allocatable :: cf(:, :, :)

        integer(int32) :: i, j, u, s, s1, s2, ntarg, nrg, lrang2, lamax, inast, nchmx, nstmx, lmaxp1, nfdm
        integer(int32) :: lrgl, nspn, npty
        integer(int64) :: offset, length
        real(real64)   :: bbloch, delta_r

        open (newunit = u, file = filename, access = 'stream', status = 'old', action = 'read')

        print '(/,3A,/)', 'Reading file "', filename, '"'

        read (u) s, s1, s2
        length = int(s1, int64) * int(s2, int64) * wp_bytes
        allocate (moldat % iidipy(s), moldat % ifdipy(s), moldat % dipy(s))
        read (u) moldat % iidipy, moldat % ifdipy
        inquire (u, pos = offset)
        do i = 1, s
            moldat % dipy(i) % distributed = mpiio
            call moldat % dipy(i) % load(filename, u, offset, s1, s2)
            offset = offset + length
        end do
        print '(2x,A,*(1x,I0))', '[m = -1] iidip =', moldat % iidipy
        print '(2x,A,*(1x,I0))', '         ifdip =', moldat % ifdipy

        if (check_dipoles) then
            do i = 1, s
                if (sum(abs(moldat % dipy(i) % mat)) == 0) then
                    print '(/,2x,A,/)', 'WARNING: Norm of some Y dipoles is zero. Something went wrong in CDENPROP?'
                end if
            end do
        end if

        read (u, pos = offset) s, s1, s2
        length = int(s1, int64) * int(s2, int64) * wp_bytes
        allocate (moldat % iidipz(s), moldat % ifdipz(s), moldat % dipz(s))
        read (u) moldat % iidipz, moldat % ifdipz
        inquire (u, pos = offset)
        do i = 1, s
            moldat % dipz(i) % distributed = mpiio
            call moldat % dipz(i) % load(filename, u, offset, s1, s2)
            offset = offset + length
        end do
        print '(2x,A,*(1x,I0))', '[m =  0] iidip =', moldat % iidipz
        print '(2x,A,*(1x,I0))', '         ifdip =', moldat % ifdipz

        if (check_dipoles) then
            do i = 1, s
                if (sum(abs(moldat % dipz(i) % mat)) == 0) then
                    print '(/,2x,A,/)', 'WARNING: Norm of some Z dipoles is zero. Something went wrong in CDENPROP?'
                end if
            end do
        end if

        read (u, pos = offset) s, s1, s2
        length = int(s1, int64) * int(s2, int64) * wp_bytes
        allocate (moldat % iidipx(s), moldat % ifdipx(s), moldat % dipx(s))
        read (u) moldat % iidipx, moldat % ifdipx
        inquire (u, pos = offset)
        do i = 1, s
            moldat % dipx(i) % distributed = mpiio
            call moldat % dipx(i) % load(filename, u, offset, s1, s2)
            offset = offset + length
        end do
        print '(2x,A,*(1x,I0))', '[m = +1] iidip =', moldat % iidipx
        print '(2x,A,*(1x,I0))', '         ifdip =', moldat % ifdipx

        if (check_dipoles) then
            do i = 1, s
                if (sum(abs(moldat % dipx(i) % mat)) == 0) then
                    print '(/,2x,A,/)', 'WARNING: Norm of some X dipoles is zero. Something went wrong in CDENPROP?'
                end if
            end do
        end if

        read (u, pos = offset) ntarg
        print '(/,2x,A,I0)', 'ntarg = ', ntarg
        allocate (moldat % crlv(ntarg, ntarg, 3))
        read (u) moldat % crlv

        read (u) nrg
        allocate (moldat % rg(nrg), moldat % lm_rg(6, nrg))
        read (u) moldat % rg, moldat % lm_rg

        read (u) moldat % nelc, moldat % nz, lrang2, lamax, ntarg, inast, nchmx, nstmx, lmaxp1
        read (u) moldat % rmatr, bbloch
        allocate (moldat % etarg(ntarg), moldat % ltarg(ntarg), moldat % starg(ntarg))
        read (u) moldat % etarg, moldat % ltarg, moldat % starg
        read (u) nfdm, delta_r
        allocate (moldat % r_points(nfdm + 1))
        read (u) moldat % r_points
        print '(2x,A,*(1x,F8.3))', 'sampling radii:', moldat % r_points

        print '(/,2x,A)', 'number of inner eigenstates per symmetry'

        allocate (nconat(ntarg), moldat % l2p(nchmx, inast), moldat % m2p(nchmx, inast), moldat % eig(nstmx, inast), &
                  moldat % wamp(inast), cf(nchmx, nchmx, lamax), moldat % ichl(nchmx, inast), &
                  moldat % mgvns(inast), moldat % stot(inast), moldat % nchan(inast), moldat % mnp1(inast), &
                  moldat % wmat2(nfdm, inast))

        inquire (u, pos = offset)

        length = int(nchmx, int64) * int(nstmx, int64) * wp_bytes

        do i = 1, inast

            read (u, pos = offset) lrgl, nspn, npty, moldat % nchan(i), moldat % mnp1(i)
            moldat % mgvns(i) = lrgl - 1
            moldat % stot(i) = nspn
            print '(4x,I3,I8)', moldat % mgvns(i), moldat % mnp1(i)
            read (u) nconat, moldat % l2p(:, i), moldat % m2p(:, i)
            read (u) moldat % eig(:, i)
            inquire (u, pos = offset)
            call moldat % wamp(i) % load(filename, u, offset, nchmx, nstmx)
            offset = offset + length
            read (u, pos = offset) cf, moldat % ichl(:, i)
            read (u) s1, s2
            inquire (u, pos = offset)

            ! read or map wmat2 only when explicitly required; it is large and used only for debugging
            if (read_wmat2) then
                do j = 1, nfdm
                    call moldat % wmat2(j, i) % load(filename, u, offset, s1, s2)
                    offset = offset + int(s1, int64) * int(s2, int64) * wp_bytes
                end do
            else
                offset = offset + int(s1, int64) * int(s2, int64) * nfdm * wp_bytes
            end if

        end do

        do i = 1, inast
            print '(/,2x,A,I0,A,I0,A)', 'channels in symmetry #', i, ' (IRR ', moldat % mgvns(i), '):'
            do j = 1, moldat % nchan(i)
                print '(2x,I4,A,I6,I3,SP,I6)', j, ':', moldat % ichl(j, i), moldat % l2p(j, i), moldat % m2p(j, i)
                if (moldat % ichl(j, i) > ntarg) then
                    print '(4(A,I0),A)', 'Warning: Channel ', j, ' in symmetry ', moldat % mgvns(i), ' is coupled to state ', &
                                 moldat % ichl(j, i), ', which is beyond the number of targets ', ntarg, ' (bug in mpi-scatci!)'
                    moldat % ichl(j, i) = ntarg
                end if
            end do
        end do

        close (u)

        call setup_angular_integrals(moldat)

    end subroutine read_molecular_data


    !> \brief  Read ion transition dipoles
    !> \author J Benda
    !> \date   2021
    !>
    !> Read ion target transition dipoles from a DENPROP file.
    !>
    subroutine read_target_properties (lutarg, prop, etarg)

        integer,               intent(in)    :: lutarg
        real(wp), allocatable, intent(inout) :: etarg(:), prop(:,:,:)

        integer  :: i, key, set, nrec, nnuc, ntarg, nmom, itarg, jtarg, imgvn, jmgvn, l, m, isw
        real(wp) :: x, rmoi(3), dum(7)
        logical  :: is_open

        print '(/,A)', 'Reading target transition dipoles'

        inquire(lutarg, opened = is_open)
        if (.not. is_open) then
            open (lutarg, action = 'read')
        end if
        call readmh(lutarg, key, set, nrec, nnuc, ntarg, nmom, isw, rmoi)
        if (key /= 6) then
            print '(/,A,I0,A)', 'ERROR: Key ', key, ' is not DENPROP property file, aborting.'
            stop 1
        end if
        print '(/,1x,A,I0)', 'Number of records: ', nrec
        print '(1x,A,I0)', 'Number of nuclei: ', nnuc
        print '(1x,A,I0)', 'Number of targets: ', ntarg
        print '(1x,A,I0)', 'Number of properties: ', nmom
        do i = 1, nnuc
            read (lutarg, *) key
            if (key /= 8) then
                print '(/,A,I0,A)', 'ERROR: Key ', key, ' is not nuclear definition, aborting.'
                stop 1
            end if
        end do
        allocate (etarg(ntarg))
        do i = 1, ntarg
            read (lutarg, *) key, dum, etarg(i)
            if (key /= 5) then
                print '(/,A,I0,A)', 'ERROR: Key ', key, ' is not target definition, aborting.'
                stop 1
            end if
        end do
        allocate (prop(ntarg, ntarg, 3))
        prop = 0
        do i = 1, nmom
            read (lutarg, *) key, itarg, imgvn, jtarg, jmgvn, nnuc, l, m, x
            if (key /= 1) then
                print '(/,A,I0,A)', 'ERROR: Key ', key, ' is not property definition, aborting.'
                stop 1
            end if
            if (l == 1) then
                prop(itarg, jtarg, m + 2) = x
                prop(jtarg, itarg, m + 2) = x
            end if
        end do
        if (.not. is_open) then
            close (lutarg)
        end if

    end subroutine read_target_properties


    !> \brief   Store the angular integrals in a more convenient shape
    !> \author  J Benda
    !> \date    2020 - 2024
    !>
    !> Convert the Gaunt integral storage from a list to an easily addressable matrix.
    !>
    subroutine setup_angular_integrals (moldat)

        use mpi_gbl, only: mpi_xermsg

        type(MolecularData), intent(inout) :: moldat

        integer :: maxl, i, l1, m1, l2, m2, l3, m3

        maxl = max(0_int32, maxval(moldat % lm_rg))

        allocate (moldat % gaunt((maxl + 1)**2, (maxl + 1)**2, -1:1))

        moldat % gaunt = 0

        do i = 1, size(moldat % rg)
            l1 = moldat % lm_rg(1, i); m1 = moldat % lm_rg(2, i)
            l2 = moldat % lm_rg(3, i); m2 = moldat % lm_rg(4, i)
            l3 = moldat % lm_rg(5, i); m3 = moldat % lm_rg(6, i)
            if (l3 /= 1 .or. abs(m3) > 1) then
                call mpi_xermsg('multidip_io', 'setup_angular_integrals', &
                                'Unexpected layout of the angular integral storage in molecular_data', 1, 1)
            end if
            moldat % gaunt(l1*(l1+1) + m1 + 1, l2*(l2+1) + m2 + 1, m3) = moldat % rg(i)
        end do

    end subroutine setup_angular_integrals


    !> \brief   Return dipole transition descriptors
    !> \author  J Benda
    !> \date    2020
    !>
    !> Based on the Cartesian component index I = 1, 2, 3, (re)allocate the arrays 'iidip' and 'ifdip'
    !> and fill them with initial/final irreducible representations pairs for dipole matrices present
    !> in the molecular_data file structure 'moldat'.
    !>
    subroutine get_diptrans (moldat, I, iidip, ifdip)

        type(MolecularData),  intent(in)    :: moldat
        integer,              intent(in)    :: I
        integer, allocatable, intent(inout) :: iidip(:), ifdip(:)

        if (allocated(iidip)) deallocate (iidip)
        if (allocated(ifdip)) deallocate (ifdip)

        select case (I)
            case (1)
                iidip = moldat % iidipx
                ifdip = moldat % ifdipx
            case (2)
                iidip = moldat % iidipy
                ifdip = moldat % ifdipy
            case (3)
                iidip = moldat % iidipz
                ifdip = moldat % ifdipz
        end select

    end subroutine get_diptrans


    !> \brief   Multiply by dipole matrix
    !> \author  J Benda
    !> \date    2020 - 2022
    !>
    !> Multiply matrix 'X' by the dipole matrix corresponding to dipole component 'I' and transition index 's'.
    !> Store the result in matrix 'Y'. Use 'nn' rows of 'X', and 'nf' rows of 'Y'.
    !>
    !> The matrix argument X and Y have the following one-dimensional distribution among processes:
    !> \verbatim
    !>            +---+---+---+---+
    !>            |   |   |   |   |
    !>            |   |   |   |   |
    !>            | 0 | 1 | 2 | 3 |
    !>            |   |   |   |   |
    !>            |   |   |   |   |
    !>            +---+---+---+---+
    !> \endverbatim
    !> However, when ScaLAPACK multiplication is desired, the arrays need to be redistributed into the standard
    !> two-dimensional block-cyclic form
    !> \verbatim
    !>            +-+-+-+-+-+-+-+-+
    !>            |0|1|2|3|0|1|2|3|
    !>            +-+-+-+-+-+-+-+-+
    !>            |1|2|3|0|1|2|3|0|
    !>            +-+-+-+-+-+-+-+-+
    !>            |2|3|0|1|2|3|0|1|
    !>            +-+-+-+-+-+-+-+-+
    !>            |3|0|1|2|3|0|1|2|
    !>            +-+-+-+-+-+-+-+-+
    !> \endverbatim
    !> and back. This is accomplished by the subroutine `pdgemr2d`. Note however, that this subroutine is prone
    !> to failure when the matrices exceed 4-byte addressable number of elements. Working with such matrices then
    !> requires patched version of ScaLAPACK that use long integers internally.
    !>
    subroutine apply_dipole_matrix (moldat, component, irrpair, transp, nf, nn, X, Y)

        use multidip_params,  only: rone, rzero
        use multidip_special, only: blas_dgemm => dgemm

        integer(blasint), parameter :: one = 1, zero = 0

        type(MolecularData), intent(in)    :: moldat
        integer,             intent(in)    :: component, irrpair, nf, nn
        character(len=1),    intent(in)    :: transp
        real(wp),            intent(inout) :: X(:, :, :), Y(:, :, :)

        integer(blasint) :: lda, ldb, ldc, m, n, k, info, Xdesc(9), Xdesc2D(9), Ydesc(9), Ydesc2D(9), desc(9), gn
        integer(blasint) :: Xlocr, Xlocc, Ylocr, Ylocc, ldX, ldY, rctx, bctx, rbksz, cbksz, myprow, mypcol, nprow, npcol
        logical          :: dist

        real(wp), allocatable :: Xloc(:, :), Yloc(:, :)
        real(wp), pointer     :: dips(:, :)

        ldb = size(X, 1)
        ldc = size(Y, 1)

        m = nf
        n = size(X, 2) * size(X, 3)
        k = nn

        ! pick the correct inner dipole block (or its local portion if MPI-IO was used)
        select case (component)
            case (1)
                dist = moldat % dipx(irrpair) % distributed
                desc = moldat % dipx(irrpair) % desc
                rctx = moldat % dipx(irrpair) % row_context
                bctx = moldat % dipx(irrpair) % blk_context
                lda  = size(moldat % dipx(irrpair) % mat, 1)
                dips => moldat % dipx(irrpair) % mat(:, :)
            case (2)
                dist = moldat % dipy(irrpair) % distributed
                desc = moldat % dipy(irrpair) % desc
                rctx = moldat % dipy(irrpair) % row_context
                bctx = moldat % dipy(irrpair) % blk_context
                lda  = size(moldat % dipy(irrpair) % mat, 1)
                dips => moldat % dipy(irrpair) % mat(:, :)
            case (3)
                dist = moldat % dipz(irrpair) % distributed
                desc = moldat % dipz(irrpair) % desc
                rctx = moldat % dipz(irrpair) % row_context
                bctx = moldat % dipz(irrpair) % blk_context
                lda  = size(moldat % dipz(irrpair) % mat, 1)
                dips => moldat % dipz(irrpair) % mat(:, :)
        end select

#ifdef WITH_SCALAPACK
        if (dist) then
            ! total number of columns in X, Y (summed over all parallel images)
            gn = n * nprocs

            ! get information about the BLACS process grid
            call blacs_gridinfo(bctx, nprow, npcol, myprow, mypcol)

            ! set up descriptors for the MULTIDIP distribution of vectors X and Y (round-robin in energies)
            call descinit(Xdesc, k, gn, ldb, n, zero, zero, rctx, ldb, info)
            call assert_status('Failed to set up X BLACS descriptor: ', int(info))
            call descinit(Ydesc, m, gn, ldc, n, zero, zero, rctx, ldc, info)
            call assert_status('Failed to set up Y BLACS descriptor: ', int(info))

            ! pick a reasonable block size for good load balancing (aim at one ScaLAPACK block per process at least)
            rbksz = clip(min(nf, nn) / int(nprow), 1, 64)
            cbksz = clip(int(gn / npcol), 1, 64)

            ! find out the number of rows and columns in the local portion of the redistributed X matrix
            Xlocr = numroc(k, rbksz, myprow, zero, nprow);  ldX = max(one, Xlocr)
            Xlocc = numroc(gn, cbksz, mypcol, zero, npcol)

            ! find out the number of rows and columns in the local portion of the redistributed Y matrix
            Ylocr = numroc(m, rbksz, myprow, zero, nprow);  ldY = max(one, Ylocr)
            Ylocc = numroc(gn, cbksz, mypcol, zero, npcol)

            ! allocate the local portions of the redistributed X and Y matrices
            allocate (Xloc(ldX, Xlocc), Yloc(ldY, Ylocc))

            ! set up descriptors for the ScaLAPACK distribution of vectors X and Y (proper two-dimensional block-cyclic)
            call descinit(Xdesc2D, k, gn, rbksz, cbksz, zero, zero, bctx, ldX, info)
            call assert_status('Failed to set up X2D BLACS descriptor: ', int(info))
            call descinit(Ydesc2D, m, gn, rbksz, cbksz, zero, zero, bctx, ldY, info)
            call assert_status('Failed to set up Y2D BLACS descriptor: ', int(info))

            ! redistribute X -> Xloc; multiply; redistribute Yloc -> Y
            call pdgemr2d(k, gn, X, one, one, Xdesc, Xloc, one, one, Xdesc2D, bctx)
            call pdgemm(transp, 'N', m, gn, k, rone, dips, one, one, desc, Xloc, one, one, Xdesc2D, rzero, Yloc, one, one, Ydesc2D)
            call pdgemr2d(m, gn, Yloc, one, one, Ydesc2D, Y, one, one, Ydesc, bctx)
        else
#endif
            call blas_dgemm(transp, 'N', m, n, k, rone, dips, lda, X, ldb, rzero, Y, ldc)
#ifdef WITH_SCALAPACK
        end if
#endif

    end subroutine apply_dipole_matrix


    !> \brief   Check for error code
    !> \author  J Benda
    !> \date    2022 - 2024
    !>
    !> If the provided error code is non-zero, print the error message and abort.
    !>
    subroutine assert_status (message, errcode)

        use mpi_gbl, only: mpi_xermsg

        character(len=*), intent(in) :: message
        integer,          intent(in) :: errcode

        if (errcode /= 0) then
            call mpi_xermsg('multidip_io', 'assert_status', trim(message), errcode, 1)
        end if

    end subroutine assert_status


    !> \brief   Clamp value in range
    !> \author  J Benda
    !> \date    2022
    !>
    integer function clip (x, a, b) result (y)

        integer :: x, a, b

        y = max(a, min(x, b))

    end function clip


    !> \brief   Multiply by boundary amplitudes
    !> \author  J Benda
    !> \date    2020 - 2023
    !>
    !> Multiply real matrix X by the matrix of boundary amplitudes and store the result into the real
    !> matrix Y. When transp is 'T', then the elements of the matrix X are expected to correspond to outer
    !> region channels, while the elements of the matrix Y to the inner region states. Otherwise,
    !> the opposite is assumed.
    !>
    subroutine apply_boundary_amplitudes (moldat, irr, transp, X, Y)

        use multidip_special, only: blas_dgemm => dgemm

        type(MolecularData),   intent(in)    :: moldat
        integer,               intent(in)    :: irr
        character(len=1),      intent(in)    :: transp
        real(wp),              intent(inout) :: X(:,:), Y(:,:)

        integer(blasint)  :: m, n, k, lda, ldb, ldc
        real(wp)          :: alpha = 1, beta = 0
        real(wp), pointer :: wamp(:, :)

        if (transp == 'T') then
            m = moldat % mnp1(irr)
            n = int(size(X, 2), blasint)
            k = moldat % nchan(irr)
        else
            m = moldat % nchan(irr)
            n = int(size(X, 2), blasint)
            k = moldat % mnp1(irr)
        end if

        lda = size(moldat % wamp(irr) % mat, 1)
        ldb = size(X, 1)
        ldc = size(Y, 1)

        if (lda < merge(m, k, transp == 'N') .or. ldb < k .or. ldc < m) then
            print '(a)', 'ERROR: Incompatible matrix dimensions in apply_boundary_amplitudes'
            print '(7x,a,6(", ",a,1x,i0))', transp, 'm =', m, 'n =', n, 'k =', k, 'lda =', lda, 'ldb =', ldb, 'ldc =', ldc
            stop 1
        end if

        wamp => moldat % wamp(irr) % mat(:, :)

        call blas_dgemm(transp, 'N', m, n, k, alpha, wamp, lda, X, ldb, beta, Y, ldc)

    end subroutine apply_boundary_amplitudes


    !> \brief   Scale boundary amplitudes matrix by a diagonal matrix
    !> \authors J Benda
    !> \date    2020 - 2024
    !>
    !> Multiply boundary amplitudes for each inner region state by an element of the provided vector 'v'.
    !> Writes the result into 'vw'.
    !>
    subroutine scale_boundary_amplitudes (moldat, irr, v, vw)

        type(MolecularData),   intent(in)    :: moldat
        integer,               intent(in)    :: irr
        real(wp),              intent(in)    :: v(:)
        real(wp),              intent(inout) :: vw(:, :)

        integer :: k, nstat, p, nchan

        nstat = moldat % mnp1(irr)
        nchan = moldat % nchan(irr)

        !$omp parallel default(none) private(p, k) firstprivate(nchan, nstat, irr) shared(vw, v, moldat)
        !$omp do collapse(2)
        do p = 1, nchan
            do k = 1, nstat
                vw(k, p) = v(k) * moldat % wamp(irr) % mat(p, k)
            end do
        end do
        !$omp end do
        !$omp end parallel

    end subroutine scale_boundary_amplitudes


    !> \brief   Write photoelectron energies to file
    !> \author  J Benda
    !> \date    2023
    !>
    !> Write the column list of photoelectron scattering energies to file. Atomic units are used.
    !>
    subroutine write_energy_grid (escat)

        real(wp), intent(in) :: escat(:)

        integer :: u

        open (newunit = u, file = 'pe_energies.txt', action = 'write', form = 'formatted')
        write (u, '(E25.15)') escat
        close (u)

    end subroutine write_energy_grid


    !> \brief   Write partial wave moments
    !> \author  J Benda
    !> \date    2020 - 2024
    !>
    !> Produce tables with the multi-photon ionisation matrix elements per partial wave.
    !>
    subroutine write_partial_wave_moments (moldat, M, nesc, suffix)

        use mpi_gbl, only: mpi_reduce_inplace_sum_wp

        type(MolecularData),      intent(in) :: moldat
        character(len=*),         intent(in) :: suffix
        integer,                  intent(in) :: nesc
        complex(wp), allocatable, intent(in) :: M(:, :, :)

        complex(wp), allocatable :: val(:)
        real(wp),    allocatable :: reval(:), imval(:)

        integer            :: ie, irr, mgvn, u, ichan, ichlf, lf, mf, mye
        character(len=100) :: filename

        allocate (val(nesc))

        ! there is one file per irreducible representation and channel
        do irr = 1, size(moldat % mgvns)
            mgvn = moldat % mgvns(irr)
            do ichan = 1, size(M, 1)
                val = 0
                mye = 0
                do ie = myproc, nesc, nprocs
                    mye = mye + 1
                    val(ie) = M(ichan, mye, mgvn)
                end do

                ! combine data from all images to master
                reval = real(val)
                imval = aimag(val)
                call mpi_reduce_inplace_sum_wp(reval, nesc)
                call mpi_reduce_inplace_sum_wp(imval, nesc)
                val = cmplx(reval, imval, wp)

                ! the first image performs the writing
                if (myproc == 1 .and. any(val /= 0)) then
                    ichlf = moldat % ichl(ichan, irr)
                    lf = moldat % l2p(ichan, irr)
                    mf = moldat % m2p(ichan, irr)
                    write (filename, '(3(A,I0),A,SP,I0,3A)') 'pwdip-', mgvn, '-(', ichlf, ',', lf, ',', mf, ')', suffix, '.txt'
                    open (newunit = u, file = filename, action = 'write', form = 'formatted')
                    write (u, '(2E25.15)') val
                    close (u)
                end if
            end do
        end do

    end subroutine write_partial_wave_moments


    !> \brief   Write raw transition dipole moments
    !> \author  J Benda
    !> \date    2021 - 2024
    !>
    !> Produce tables with the multi-photon ionisation matrix elements per partial wave and per ionisation chain.
    !>
    subroutine write_raw_dipoles (M, chains, nesc, stem)

        use mpi_gbl, only: mpi_reduce_inplace_sum_wp

        character(len=*),         intent(in) :: stem
        integer,     allocatable, intent(in) :: chains(:, :)
        complex(wp), allocatable, intent(in) :: M(:, :, :, :)
        integer,                  intent(in) :: nesc

        character(len=100)          :: filename, history
        character(len=1), parameter :: xyz(-1:1) = [ 'y', 'z', 'x' ]
        character(len=1), parameter :: sph(-1:1) = [ 'm', '0', 'p' ]

        complex(wp), allocatable :: val(:)
        real(wp),    allocatable :: reval(:), imval(:)

        integer :: u, itarg, ntarg, ichain, nchain, ipw, npw, icomp, lf, mf, ie, mye

        npw = size(M, 2)
        nchain = size(M, 3)
        ntarg = size(M, 4)

        allocate (val(nesc))

        ! there is one output file per target, absorption chain and partial wave
        do itarg = 1, ntarg
            do ichain = 1, nchain
                do ipw = 1, npw

                    val = 0
                    mye = 0
                    do ie = myproc, nesc, nprocs
                        mye = mye + 1
                        val(ie) = M(mye, ipw, ichain, itarg)
                    end do

                    ! combine data from all images to master
                    reval = real(val)
                    imval = aimag(val)
                    call mpi_reduce_inplace_sum_wp(reval, nesc)
                    call mpi_reduce_inplace_sum_wp(imval, nesc)
                    val = cmplx(reval, imval, wp)

                    ! the first image performs the writing
                    if (myproc == 1 .and. any(val /= 0)) then
                        lf = int(sqrt(real(ipw - 1, wp)))
                        mf = ipw - lf*lf - lf - 1
                        history = ''
                        do icomp = 1, size(chains, 1)
                            select case (trim(stem))
                                case ('xyz'); history = xyz(chains(icomp, ichain)) // trim(history)
                                case ('sph'); history = sph(chains(icomp, ichain)) // trim(history)
                            end select
                        end do
                        write (filename, '(3A,I0,A,I0,A,SP,I0,A)') 'rawdip-', trim(history), '-(', itarg, ',', lf, ',', mf, ').txt'
                        open (newunit = u, file = filename, action = 'write', form = 'formatted')
                        write (u, '(2E25.15)') val
                        close (u)
                    end if

                end do
            end do
        end do

    end subroutine write_raw_dipoles


    !> \brief   Write transition dipole moments in RSOLVE format
    !> \author  J Benda, Z Masin
    !> \date    2021 - 2024
    !>
    !> Produce tables with 1-photon ionisation matrix elements in the format of RSOLVE.
    !>
    subroutine write_rsolve_dipoles (moldat, M, chains, escat, lu_pw_dipoles)

        use photo_outerio, only: write_pw_dipoles
        use mpi_gbl, only: mpi_reduce_inplace_sum_wp
        use multidip_params, only: carti

        type(MolecularData),      intent(in) :: moldat
        integer,     allocatable, intent(in) :: chains(:, :)
        complex(wp), allocatable, intent(in) :: M(:, :, :, :)
        real(wp),                 intent(in) :: escat(:)
        integer,                  intent(in) :: lu_pw_dipoles

        complex(wp), allocatable :: val(:)
        real(wp),    allocatable :: reval(:), imval(:), re_pw_dipoles(:,:,:,:), im_pw_dipoles(:,:,:,:)

        integer :: itarg, ntarg, ichain, nchain, ipw, npw, icomp, lf, mf, ie, mye, irr, mgvn, stot, gutot, nch, ch

        character(len=11) ::  form_pw_dipoles
        character(len=80) ::  title

        integer               :: lmax_property, dip_comp_present(3), lu_pw_dipoles_m, nset_pw_dipoles, nesc, iprnt, iwrite, ifail
        integer, allocatable  :: ichl(:), lvchl(:), mvchl(:), starg(:), mtarg(:), gtarg(:)
        real(wp), allocatable :: evchl(:)
        real(wp)              :: bound_state_energies(1), target_energy

        npw = size(M, 2)
        nchain = size(M, 3)
        ntarg = size(M, 4)
        nesc = size(escat)
        ntarg = size(moldat % etarg)

        allocate (val(nesc), starg(ntarg), mtarg(ntarg), gtarg(ntarg))

        ! data for the header of the RSOLVE dipoles file
        title                = 'MULTIDIP 1-PHOTON DIPOLES'
        starg                = moldat % starg
        mtarg(:)             = moldat % ltarg(:) - 1    ! IRR of the states in the CONGEN format (starting from 0)
        gtarg                = 0                        ! HARD-CODED SINCE MOLDAT DOESN'T STORE THIS VALUE
        target_energy        = minval(moldat % etarg)
        bound_state_energies = moldat % eig(1,1)        ! Neutral state in agreement with `setup_initial_state`
        nset_pw_dipoles      = 1
        form_pw_dipoles      = 'FORMATTED'
        iwrite               = output_unit
        iprnt                = 0
        lmax_property        = 1
        ifail                = 0

        do irr = 1, size(moldat % mgvns)

           mgvn  = moldat % mgvns(irr)
           stot  = moldat % stot(irr)
           gutot = 0
           nch   = moldat % nchan(irr)

           allocate(re_pw_dipoles(1,nch,3,nesc))
           allocate(im_pw_dipoles(1,nch,3,nesc))

           re_pw_dipoles = 0.0_wp
           im_pw_dipoles = 0.0_wp

           dip_comp_present = 0

           ! transfer the dipoles from M to re_pw_dipoles, im_pw_dipoles and conjugate
           do ichain = 1, nchain
               do ch = 1, nch
   
                   itarg = moldat % ichl(ch,irr)
                   lf    = moldat % l2p(ch,irr)
                   mf    = moldat % m2p(ch,irr)

                   ipw   = lf * lf + lf + mf + 1

                   val = 0
                   mye = 0
                   do ie = myproc, nesc, nprocs
                       mye = mye + 1
                       val(ie) = M(mye, ipw, ichain, itarg)
                   end do
   
                   ! combine data from all images to master
                   reval = real(val)
                   imval = aimag(val)
                   call mpi_reduce_inplace_sum_wp(reval, nesc)
                   call mpi_reduce_inplace_sum_wp(imval, nesc)
                   val = cmplx(reval, imval, wp)
   
                   ! the first image saves the dipoles
                   if (myproc == 1) then

                       if (any(abs(val(:)) > 0.0_wp)) dip_comp_present(carti(ichain)) = 1
  
                       ! Save the dipoles and conjugate them so the scattering wf. is ket. 
                       do ie = 1, nesc
                          re_pw_dipoles(1,ch,carti(ichain),ie) =   real(val(ie),wp)
                          im_pw_dipoles(1,ch,carti(ichain),ie) = -aimag(val(ie))
                       enddo

                   end if
   
               end do
           end do

           lu_pw_dipoles_m = lu_pw_dipoles + mgvn

           allocate(ichl(nch), lvchl(nch), mvchl(nch), evchl(nch))

           ichl(1:nch)  = moldat % ichl(1:nch,irr)
           lvchl(1:nch) = moldat % l2p (1:nch,irr)
           mvchl(1:nch) = moldat % m2p (1:nch,irr)

           ! Channel energies in Rydbergs
           do ch = 1, nch
              evchl(ch) = 2 * ( moldat % etarg(ichl(ch)) - moldat % etarg(1) )
           enddo

           call write_pw_dipoles( lu_pw_dipoles_m, nset_pw_dipoles, form_pw_dipoles, title, mgvn,                     &
                                  stot, gutot, starg, mtarg, gtarg, ichl, lvchl, mvchl, evchl, escat, lmax_property,&
                                  dip_comp_present,  bound_state_energies, target_energy,                           &
                                  re_pw_dipoles, im_pw_dipoles, iprnt, iwrite, ifail )

           deallocate(re_pw_dipoles, im_pw_dipoles, ichl, lvchl, mvchl, evchl)

        enddo

    end subroutine write_rsolve_dipoles


    !> \brief   Write cross sections to a file
    !> \author  J Benda
    !> \date    2020 - 2024
    !>
    !> Write cross sections to a file. The first column of the array is expected to be the photon energy.
    !>
    subroutine write_cross_section (cs, nesc, erange, filename)

        use mpi_gbl, only: mpi_reduce_inplace_sum_wp

        character(len=*), intent(in) :: filename
        real(wp),         intent(in) :: cs(:, :)
        integer,          intent(in) :: nesc, erange(2)

        real(wp), allocatable :: csg(:, :), buffer(:)

        integer :: u, ie, mye, nene, ntgt

        ntgt = size(cs, 1)
        nene = size(cs, 2)

        ! a local copy of 'cs' for gathering data from parallel images
        allocate (csg(ntgt, nesc))
        csg = 0
        mye = 0
        do ie = myproc, nesc, nprocs
            mye = mye + 1
            csg(:, ie) = cs(:, mye)
        end do

        ! combine data from all images to master
        buffer = reshape(csg, [size(csg)])
        call mpi_reduce_inplace_sum_wp(buffer, size(buffer))
        csg = reshape(buffer, shape(csg))

        ! the first image now writes (non-zero) cross sections to file
        if (myproc == 1 .and. any(csg(2:, :) /= 0)) then
            open (newunit = u, file = filename, action = 'write', form = 'formatted')
            do ie = max(1, erange(1)), min(nesc, erange(2))
                write (u, '(*(E25.15))') csg(:, ie)
            end do
            close (u)
        end if

    end subroutine write_cross_section

end module multidip_io
