! Copyright 2020
!
! For a comprehensive list of the developers that contributed to these codes
! see the UK-AMOR website.
!
! This file is part of UKRmol-out (UKRmol+ suite).
!
!     UKRmol-out is free software: you can redistribute it and/or modify
!     it under the terms of the GNU General Public License as published by
!     the Free Software Foundation, either version 3 of the License, or
!     (at your option) any later version.
!
!     UKRmol-out is distributed in the hope that it will be useful,
!     but WITHOUT ANY WARRANTY; without even the implied warranty of
!     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!     GNU General Public License for more details.
!
!     You should have received a copy of the GNU General Public License
!     along with  UKRmol-out (in source/COPYING). Alternatively, you can also visit
!     <https://www.gnu.org/licenses/>.
!
!> \brief   Main MULTIDIP routines
!> \author  J Benda
!> \date    2020 - 2021
!>
module multidip_routines

    use, intrinsic :: iso_c_binding,   only: c_double, c_int, c_f_pointer
    use, intrinsic :: iso_fortran_env, only: input_unit, int32, int64, real64, output_unit

#ifdef HAVE_NO_FINDLOC
    ! supply missing intrinsic functions
    use algorithms,  only: findloc
#endif

    ! GBTOlib
    use blas_lapack_gbl,  only: blasint
    use phys_const_gbl,   only: pi, imu, to_eV
    use precisn_gbl,      only: wp

    implicit none

    !> \brief   Intermediate state
    !> \author  J Benda
    !> \date    2020
    !>
    !> Type holding information about a specific irreducible representation of the intermediate state
    !> of a given perturbative order. The states are connected into a linked list to avoid the necessity
    !> of reallocating any large expansion during the calculation. Also, each state is linked to its
    !> "parent" state, which is another instance of this class, which stands on the right-hand side
    !> of the equation for the intermediate state.
    !>
    type IntermediateState

        integer :: order    !< Number of absorbed photons
        integer :: mgvn     !< Irreducible representation of this intermediate state
        integer :: dcomp    !< Dipole operator component responsible for populating this state

        !> Inner region expansion coefficients
        !> - First index: inner region eigenstate index
        !> - Second index: 1 = real part, 2 = imag part
        !> - Third index: scattering energy index
        real(wp), allocatable :: ck(:, :, :)

        !> Outer region expansion coefficients
        !> - First index:  outer region channel index
        !> - Second index: 1 = real part, 2 = imag part
        !> - Third index: scattering energy index
        real(wp), allocatable :: ap(:, :, :)

        !> Photoionisation transition matrix elements. Only used when the "intermediate" state
        !> is actually the final state. In such a case, the expansion coefficients 'ck' and 'ap'
        !> are not allocated.
        !> - First index: outer region channel index
        !> - Second index: 1 = real part, 2 = imag part
        !> - Third index: scattering energy index
        real(wp), allocatable :: dip(:, :, :)

        !> Pointer to the parent intermediate state (if any)
        type(IntermediateState), pointer :: parent => null()

        !> Linked list connections
        type(IntermediateState), pointer :: prev => null()
        type(IntermediateState), pointer :: next => null()

    end type IntermediateState


    !> \brief   Multi-photon integral cache
    !> \authors J Benda
    !> \date    2024
    !>
    !> Recursive data structure holding radial multi-photon integrals. Typically, the first object does not contain any data
    !> in `vals_xxx` and only defines the final reduced channels. [A reduced channel is a multi-index (i, l) given by the target
    !> index and partial wave angular momentum.] The nested objects then store integral values, e.g.
    !> \f[
    !>    [i_3 l_3 | m_{32} | i_2 l_2]
    !> \f]
    !> and also contain references to further nested blocks of higher orders,
    !> \f[
    !>    [i_3 l_3 | m_{32} | m_i_2 l_2 | m_{21} | i_1 l_1]
    !> \f]
    !> etc.
    !>
    type :: integral_cache_t

        integer,                allocatable :: rchs_pws(:)  !< Reduced channels dipole-coupled in the photoelectron (m = 1).
        integer,                allocatable :: rchs_ion(:)  !< Reduced channels dipole-coupled in the residual ion (m = 0).

        complex(wp),            allocatable :: vals_pws(:, :)  !< Integral values for pws-coupled channels (H+/H-).
        complex(wp),            allocatable :: vals_ion(:, :)  !< Integral values for ion-coupled channels (H+/H-).

        type(integral_cache_t), allocatable :: next_pws(:)  !< Nested caches for pws-copled channels.
        type(integral_cache_t), allocatable :: next_ion(:)  !< Nested caches for ion-copled channels.

    end type integral_cache_t


    ! for debugging purposes
    logical, parameter :: test_expansion = .false.

contains

    !> \brief   MULTIDIP main subroutine
    !> \authors J Benda
    !> \date    2020 - 2024
    !>
    !> Read the input namelist &mdip from the standard input, call routines for reading the input file, and then call
    !> the main computational routine. Alternatively, when the "--test" switch is present on the command line, it will
    !> run a few unit tests.
    !>
    subroutine multidip_main

        use, intrinsic :: iso_c_binding, only: c_ptr

        use linalg_cl,        only: initialize_cl, finalize_cl
        use mpi_gbl,          only: comm_rank => myrank, comm_size => nprocs, mpi_mod_finalize, mpi_mod_start, mpi_xermsg
        use multidip_io,      only: MolecularData, KMatrix, ScatAkCoeffs, read_wfncoeffs, read_kmatrices, read_molecular_data, &
                                    myproc, nprocs
        use multidip_params,  only: nMaxPhotons, read_input_namelist, num_integ_algo
#ifdef WITH_GSL
        use multidip_special, only: gsl_set_error_handler_off
#endif
        use multidip_tests,   only: run_tests

        type(MolecularData)             :: moldat
        type(KMatrix),      allocatable :: km(:)
        type(ScatAkCoeffs), allocatable :: ak(:)

        character(len=256) :: arg, rmt_data, raw, msg
        type(c_ptr)        :: dummy

        integer     :: order, nirr, lusct(8), lukmt(8), lubnd, nkset(8), iarg, narg, input, i, erange(2), p(nMaxPhotons)
        integer     :: lu_pw_dipoles
        logical     :: verbose, mpiio, gpu
        real(wp)    :: omega(nMaxPhotons), first_IP, rasym
        complex(wp) :: polar(3, nMaxPhotons)

        call print_ukrmol_header(output_unit)

        print '(/,A)', 'Program MULTIDIP: Calculation of multi-photon ionization'

        iarg = 1
        narg = command_argument_count()
        input = input_unit

#ifdef WITH_GSL
        dummy = gsl_set_error_handler_off()
#endif

        do while (iarg <= narg)
            call get_command_argument(iarg, arg)
            if (arg == '--test') then
                call run_tests
                return
            else if (arg == '--input') then
                iarg = iarg + 1
                call get_command_argument(iarg, arg)
                open (input, file = trim(arg), action = 'read', form = 'formatted')
            else if (arg == '--help') then
                print '(/,A,/)', 'Available options:'
                print '(A)', '  --help          display this help and exit'
                print '(A)', '  --input FILE    use FILE as the input file (instead of standard input)'
                print '(A)', '  --test          run internal sanity checks and exit'
                print *
                return
            else
                write (msg, '(3a)') 'Unknown command line argument "', trim(arg), '"'
                call mpi_xermsg('multidip_routines', 'multidip_main', trim(msg), 1, 1)
            end if
            iarg = iarg + 1
        end do

        call read_input_namelist(input, order, lusct, lukmt, lubnd, nkset, polar, omega, verbose, rmt_data, first_IP, rasym, raw, &
                                 erange, mpiio, gpu, p, lu_pw_dipoles)

        if (input /= input_unit) close (input)

        call mpi_mod_start

        myproc = comm_rank + 1
        nprocs = comm_size
        print '(/,A,I0,A,I0)', 'Parallel mode; image ', myproc, ' of ', nprocs

        print '(/,A,/)', 'Absorbed photons'
        do i = 1, order
            print '(2x,A,I0,3x,A,SP,2F6.3,A,2F6.3,A,2F6.3,A,F0.3,A)', &
                '#', i, 'eX: ', polar(1, i), 'i, eY: ', polar(2, i), 'i, eZ: ', polar(3, i), 'i, energy: ', omega(i) * to_eV, ' eV'
        end do

        if (first_IP > 0) then
            print '(/,A,F0.5,A)', 'Override first IP to ', first_IP * to_eV, ' eV'
        else
            print '(/,A)', 'Use calculated IP'
        end if

        nirr = count(lukmt > 0)

        allocate (km(nirr))

        if (any(lusct > 0)) then
            allocate (ak(nirr))
            call read_wfncoeffs(ak, lusct)
        end if

        call read_kmatrices(km, lukmt, nkset)
        call read_molecular_data(moldat, trim(rmt_data), mpiio, test_expansion)

        if (any(erange /= 0)) then
            if (erange(1) < 1 .or. erange(1) > erange(2) .or. erange(2) > minval(km % nescat)) then
                write (msg, '(a,i0)') 'Error: Energy range must satisfy 1 <= erange(1) <= erange(2) <= ', minval(km % nescat)
                call mpi_xermsg('multidip_routines', 'multidip_main', trim(msg), 1, 1)
            end if
        else
            erange(1) = 1
            erange(2) = minval(km % nescat)
        end if

        if (moldat % nz < moldat % nelc) then
            write (msg, '(a,i0,a,i0,a)') 'Error: Number of protons (nz = ', moldat % nz, ') is smaller than the number of &
                &electrons in residual ion (nelc = ', moldat % nelc, ').'
            call mpi_xermsg('multidip_routines', 'multidip_main', trim(msg), 1, 1)
        end if

        if (rasym > moldat % rmatr) then
            if (order <= 2) then
                write (*, '(/,A,F0.1,A,F0.1,A)', advance = 'no') 'Interval ', moldat % rmatr, ' to ', rasym, &
                    ' will be integrated numerically using '
                select case (num_integ_algo)
                    case (1);  print '(a)', 'Romberg quadrature'
                    case (2);  print '(a)', 'Levin quadrature'
                end select
            else
                print '(/,a)', 'Warning: Numerical integration (rasym) is currently only implemented for the second order.'
                rasym = 0
            end if
        end if

        if (gpu) call initialize_cl(-1_c_int, int(merge(-1, myproc - 1, nprocs == 1), c_int))
        call multidip_driver(order, moldat, km, ak, lubnd, omega(1:order), polar(:, 1:order), verbose, first_IP, rasym, raw, &
                             erange, p, lu_pw_dipoles)
        if (gpu) call finalize_cl

        call mpi_mod_finalize

    end subroutine multidip_main


    !> \brief   Central computation routine
    !> \author  J Benda
    !> \date    2020 - 2023
    !>
    !> This subroutine drives the calculation. First it obtains all intermediate states, then it calculates
    !> the final photoionization state, and evaluates the dipole elements and generalized cross sections.
    !> The core of the work is the evaluation of all matrix elements of the type
    !> \f[
    !>     M_{fi,k_n,\dots,k_i} = \langle \Psi_{f}^{(-)} | x_{k_n} \dots \hat{G}^{(+)} x_{k_2} \hat{G}^{(+)} x_{k_1}
    !>             | \Psi_i \rangle
    !> \f]
    !> where \f$ x_j \f$ is the j-th component of the dipole operator, followed by reduction of this tensor with all
    !> provided photon field polarisations \f$ \epsilon_j \f$:
    !> \f[
    !>     M_{fi} = \sum_{k_n,\dots,k_1} \epsilon_{k_n} \dots \epsilon_{k_1} M_{fi,k_n,\dots,k_1}
    !> \f]
    !> The sequence \f$ k_1, k_2, \dots \f$ is referred to as component "history" or "chain" in the code.
    !>
    !> \param[in]  order         Order of the process (= total number of absorbed photons).
    !> \param[in]  moldat        MolecularData object with data read from the file *molecular_data*.
    !> \param[in]  km            KMatrix objects for relevant irreducible representations with data read from RSOLVE K-matrix files.
    !> \param[in]  ak            Wave function coeffs (from RSOLVE) for the same set of irrs as km.
    !> \param[in]  lubnd         File with bound state wave function coefficients.
    !> \param[in]  omega         Fixed photon energies in a.u. or zeros for flexible photons.
    !> \param[in]  polar         Photon polarisation vectors or zeros for polarisation averaging.
    !> \param[in]  verbose       Debugging output intensity.
    !> \param[in]  first_IP      First ionization potential in a.u. as requested by the input namelist.
    !> \param[in]  r0            Radius from which to apply asymptotic integrals.
    !> \param[in]  raw           Write raw transition dipoles in spherical (= 'sph') or Cartesian (= 'xyz') basis, or both (= 'both').
    !> \param[in]  erange        Range (subset) of energies in K-matrix files to use for calculations.
    !> \param[in]  p             Lab-frame polarisation for calculations of orientationally averaged beta parameters.
    !> \param[in]  lu_pw_dipoles Base number for the logical unit for saving the pw dipoles in RSOLVE format (typically 410).
    !>
    subroutine multidip_driver (order, moldat, km, ak, lubnd, omega, polar, verbose, first_IP, r0, raw, erange, p, lu_pw_dipoles)

        use multidip_io, only: MolecularData, KMatrix, ScatAkCoeffs, nprocs, get_diptrans, write_energy_grid

        integer,                         intent(in) :: order, erange(2), lubnd, p(:), lu_pw_dipoles
        logical,                         intent(in) :: verbose
        type(MolecularData),             intent(in) :: moldat
        type(KMatrix),      allocatable, intent(in) :: km(:)
        type(ScatAkCoeffs), allocatable, intent(in) :: ak(:)
        real(wp),                        intent(in) :: omega(:), first_IP, r0
        complex(wp),                     intent(in) :: polar(:, :)
        character(len=*),                intent(in) :: raw

        type(integral_cache_t), allocatable :: integral_cache(:)

        type(IntermediateState), pointer :: states, state

        integer,     allocatable :: iidip(:), ifdip(:)
        real(wp),    allocatable :: escat(:)

        integer  :: j, s, mgvni, mgvnn, mgvn1, mgvn2, nesc, irri, iki, icomp, nstati, nchani
        real(wp) :: Ei

        iki    = 1                                       ! which K-matrix file (and Ak file) to use for the initial state
        mgvni  = km(iki) % mgvn                          ! IRR of the initial state
        nesc   = erange(2) - erange(1) + 1               ! number of scattering energies
        irri   = findloc(moldat % mgvns, mgvni, 1)       ! internal index of initial IRR in molecular_data
        nstati = moldat % mnp1(irri)                     ! number of inner region states in initial IRR
        nchani = moldat % nchan(irri)                    ! number of outer region channels in initial IRR

        allocate (states)
        allocate (states % ck(nstati, 2, (nesc + nprocs - 1) / nprocs))
        allocate (states % ap(nchani, 2, (nesc + nprocs - 1) / nprocs))

        ! prepate the initial state
        call setup_initial_state(states, moldat, irri, lubnd, Ei)

        ! for all absorbed photons
        do j = 1, order

            print '(/,2(A,I0))', 'Photon ', j, ' of ', order

            ! precompute asymptotic integrals
            call precompute_integral_cache(integral_cache, moldat, km(iki) % escat, j, r0, erange, Ei, first_IP, omega, verbose)

            ! for all components of the dipole operator
            do icomp = 1, 3

                call get_diptrans(moldat, icomp, iidip, ifdip)

                ! for all transitions mediated by this component that are stored in molecular_data
                do s = 1, size(iidip)

                    mgvn1 = iidip(s) - 1
                    mgvn2 = ifdip(s) - 1

                    ! apply this transition to all relevant previous intermediate states
                    state => states
                    do while (associated(state))

                        ! process states of previous order
                        if (state % order + 1 == j) then

                            ! IRR of the previous intermediate state (i.e. last element in MGVN history chain)
                            mgvnn = state % mgvn

                            ! skip this transition if it is not applicable to the current IRR or there are no K-matrices for it
                            if ((mgvnn == mgvn1 .or. mgvnn == mgvn2) .and. &
                                any(km(:) % mgvn == mgvn1) .and. &
                                any(km(:) % mgvn == mgvn2)) then

                                ! either calculate the next intermediate state or evaluate the dipole elements
                                if (j < order) then
                                    call solve_intermediate_state(moldat, order, omega, icomp, s, integral_cache, mgvnn, &
                                                                  mgvn1, mgvn2, km, state, verbose, Ei, first_IP, r0, erange)
                                else
                                    call extract_dipole_elements(moldat, order, omega, icomp, s, integral_cache, mgvnn, &
                                                                 mgvn1, mgvn2, km, ak, state, verbose, Ei, first_IP, r0, erange)
                                end if

                            end if

                        end if

                        ! move on to the next state
                        state => state % next

                    end do

                end do

            end do

            if (allocated(integral_cache)) then
                deallocate (integral_cache)
            end if

        end do

        ! calculate observables
        escat = km(iki) % escat(erange(1):erange(2))
        call write_energy_grid(escat)
        call calculate_pw_transition_elements(moldat, order, states, escat, Ei, first_IP, omega, polar, erange)
        call calculate_asymmetry_parameters(moldat, order, states, escat, Ei, first_IP, omega, raw, erange, p, lu_pw_dipoles)

        ! clean up the linked list
        state => states
        do while (associated(state % next))
            state => state % next
            deallocate (state % prev)
        end do
        deallocate (state)

    end subroutine multidip_driver


    !> \brief   Construct initial state
    !> \authors J Benda
    !> \date    2023
    !>
    !> Construct inner and outer expansion coefficients of the initial state. When no input from BOUND is provided, this is
    !> trivial: The initial state coincides with the first inner region eigenstate in the irreducible representation given
    !> by the first K-matrix file and there is nothing in the outer region.
    !>
    !> However, when bound state coefficients are given, they are adopted here and the wave function is correctly continued
    !> into the outer region using a single-free-electron expansion. The energy shift is not considered, though.
    !>
    !> \todo Take into account modified bound state energy calculated by BOUND.
    !>
    subroutine setup_initial_state (states, moldat, irr, lubnd, Ei)

        use multidip_io,     only: MolecularData, apply_boundary_amplitudes, read_bndcoeffs
        use multidip_outer,  only: test_outer_expansion, evaluate_fundamental_solutions
        use multidip_params, only: extend_istate

        type(IntermediateState), pointer, intent(inout) :: states
        type(MolecularData),              intent(in)    :: moldat
        integer,                          intent(in)    :: irr, lubnd
        real(wp),                         intent(out)   :: Ei

        complex(wp), allocatable :: ap(:)
        real(wp),    allocatable :: Ek(:), bnd(:, :), wbnd(:, :), fc(:), gc(:), fcp(:), gcp(:)
        real(wp)                 :: r
        integer                  :: mye, nmye, nchan, nstat, ichan, stot, mgvn, nbnd

        nmye = size(states % ck, 3)
        mgvn = moldat % mgvns(irr)
        stot = moldat % stot(irr)
        nchan = moldat % nchan(irr)
        nstat = moldat % mnp1(irr)
        nbnd = 1
        r = moldat % rmatr
        Ei = moldat % eig(1, irr)

        ! construct representation of the initial state in the inner region
        states % order = 0
        states % mgvn = mgvn
        states % dcomp = 0

        allocate (bnd(nstat, nbnd))

        if (lubnd <= 0) then
            bnd(:, 1) = 0    ! all inner region eigenstates are unpopulated...
            bnd(1, 1) = 1    ! ... except for the first one, which has the coefficient 1.0
        else
            call read_bndcoeffs(bnd(:, 1), Ei, lubnd, mgvn, stot)   ! get calculated coefs from BOUND
        end if

        do mye = 1, nmye
            states % ck(:, 1, mye) = bnd(:, 1)
            states % ck(:, 2, mye) = 0
        end do

        print '(/,a,/)', 'Bound state information'
        print '(4x,a,e25.15)', 'First R-matrix pole       = ', moldat % eig(1, irr)
        print '(4x,a,e25.15)', 'Calculated initial energy = ', Ei

        ! construct representation of the initial state in the outer region
        if (extend_istate) then

            allocate (wbnd(nchan, nbnd), fc(nchan), gc(nchan), fcp(nchan), gcp(nchan), ap(nchan))

            call apply_boundary_amplitudes(moldat, irr, 'N', bnd, wbnd)

            ! "photoelectron" "kinetic" energies (actually negative closed channel thresholds)
            Ek = Ei - moldat % etarg(moldat % ichl(: , irr))

            call evaluate_fundamental_solutions(moldat, r, irr, nchan, Ek, fc, gc, fcp, gcp, sqrtknorm = .false.)

            ! obtain outer coeffs by projecting the inner state on channels at boundary and dividing by decaying Whittaker function
            ap = wbnd(:, 1) / gc

            do mye = 1, nmye
                states % ap(:, 1, mye) = real(ap)
                states % ap(:, 2, mye) = aimag(ap)
            end do

            print '(4x,a,e25.15)', 'Total inner norm          = ', norm2(bnd(:, 1))
            print '(/,a,/)', 'Bound state outer region channel coefficients and amplitudes'
            print '(4x,a,4x,a,23x,a,20x,a,20x,a)', 'chan', 'Ek', 'Re ap', 'Im ap', 'f_p'
            do ichan = 1, nchan
                print '(i6,a,4e25.15)', ichan, ': ', Ek(ichan), ap(ichan), wbnd(ichan, 1)
            end do

        else

            states % ap(:, :, :) = 0    ! outer region is empty

        end if

        if (test_expansion) then
            call test_outer_expansion('initial-state.txt', moldat, irr, states % ck(:, :, 1), states % ap(:, :, 1), Ei)
        end if

    end subroutine setup_initial_state


    !> \brief   Calculate intermediate photoionisation state
    !> \authors J Benda
    !> \date    2020 - 2024
    !>
    !> Solve the intermediate state equation
    !> \f[
    !>     (E_i + j \omega - \hat{H}) \Psi_j = \hat{D} \Psi_{j-1}
    !> \f]
    !> with the right-hand side based on the state provided as argument 'state'.
    !>
    !> \param[in]  moldat   MolecularData object with data read from the file *molecular_data*.
    !> \param[in]  order    Perturbation order of the intermediate state to calculate.
    !> \param[in]  Ephoton  Fixed photon energies in a.u. or zeros for flexible photons.
    !> \param[in]  icomp    Which Cartesian component of the dipole operator will give rise to the intermediate state.
    !> \param[in]  s        Which "transition" in *molecular_data* corresponds to the action of this dipole component on parent.
    !> \param[in]  cache    Precomputed integrals.
    !> \param[in]  mgvnn    MGVN of the previous intermediate state.
    !> \param[in]  mgvn1    Ket MGVN for the transition "s" as stored in *molecular_data*.
    !> \param[in]  mgvn2    Bra MGVN for the transition "s" as stored in *molecular_data*.
    !> \param[in]  km       KMatrix objects for relevant irreducible representations with data read from RSOLVE K-matrix files.
    !> \param[inout] state  Tree of intermediate states, pointing at the previous intermediate state to develop into next state.
    !> \param[in]  verbose  Debugging output intensity.
    !> \param[in]  calc_Ei  Initial state energy calculated by (MPI-)SCATCI or BOUND.
    !> \param[in]  first_IP Ionization potential according to the input namelist.
    !> \param[in]  r0       Radius from which to apply asymptotic integration.
    !> \param[in]  erange   Range (subset) of energies in K-matrix files to use for calculations.
    !>
    subroutine solve_intermediate_state (moldat, order, Ephoton, icomp, s, cache, mgvnn, mgvn1, mgvn2, &
                                         km, state, verbose, calc_Ei, first_IP, r0, erange)

        use multidip_io,      only: MolecularData, KMatrix, ScatAkCoeffs, myproc, nprocs, apply_boundary_amplitudes, &
                                    apply_dipole_matrix, scale_boundary_amplitudes
        use multidip_outer,   only: evaluate_fundamental_solutions, test_outer_expansion
        use multidip_params,  only: closed_interm, closed_range, compt
        use multidip_special, only: coul, solve_complex_system

        type(MolecularData),        intent(in) :: moldat
        type(KMatrix), allocatable, intent(in) :: km(:)
        type(integral_cache_t),     intent(in) :: cache(:)

        type(IntermediateState), pointer, intent(inout) :: state
        type(IntermediateState), pointer :: last

        integer,  intent(in) :: order, icomp, s, mgvnn, mgvn1, mgvn2, erange(2)
        logical,  intent(in) :: verbose
        real(wp), intent(in) :: Ephoton(:), calc_Ei, first_IP, r0

        integer  :: nstatn, nstatf, nchann, nchanf, nopen, mgvnf, irrn, irrf, ikn, ikf, ipw, nesc, ie, mye, t, dt, i
        real(wp) :: Ei, Ek, Etotf
        character(len=1) :: transp
        character(len=1024) :: filename

        complex(wp), allocatable :: beta(:), app(:), H(:, :), lambda(:), hc(:), hcp(:)
        real(wp),    allocatable :: fc(:), fcp(:), gc(:), gcp(:)

        integer,  allocatable :: lf(:)
        real(wp), allocatable :: Ef(:), P(:), rhs(:, :), omega(:), echl(:)
        real(wp), allocatable :: Pw(:, :), wPw(:, :), wPD(:, :), IwPD(:, :), wckp(:, :), Dpsi(:, :, :), corr(:, :), ckp(:, :)

        if (mgvnn == mgvn1) then
            mgvnf = mgvn2
            transp = 'T'
        else
            mgvnf = mgvn1
            transp = 'N'
        end if

        irrn = findloc(moldat % mgvns, mgvnn, 1)
        irrf = findloc(moldat % mgvns, mgvnf, 1)

        ikn = findloc(km(:) % mgvn, mgvnn, 1)
        ikf = findloc(km(:) % mgvn, mgvnf, 1)

        nstatn = moldat % mnp1(irrn)
        nstatf = moldat % mnp1(irrf)

        nchann = moldat % nchan(irrn)
        nchanf = moldat % nchan(irrf)

        nesc = erange(2) - erange(1) + 1
        mye = 0
        t = 0

        allocate (Pw(nstatf, nchanf), wPD(nchanf, 2), IwPD(nchanf, 2), wPw(nchanf, nchanf), wckp(nchanf, 2), omega(order), &
                  Dpsi(nstatf, 2, (nesc + nprocs - 1) / nprocs), fc(nchanf), gc(nchanf), fcp(nchanf), gcp(nchanf), &
                  hc(nchanf), hcp(nchanf), corr(nchanf, 2), echl(nchanf), &
                  Ef(nchanf), lf(nchanf), ckp(nstatf, 2), H(nchanf, nchanf), lambda(nchanf), beta(nchanf), &
                  app(nchanf))

        echl(1:nchanf) = moldat % etarg(moldat % ichl(1:nchanf, irrf)) - moldat % etarg(1)

        ! set up a new intermediate state at the end of the storage chain
        last => state
        do while (associated(last % next))
            last => last % next
        end do
        allocate (last % next)
        last % next % prev => last
        last => last % next
        last % order = state % order + 1
        last % mgvn = mgvnf
        last % dcomp = icomp
        last % parent => state
        allocate (last % ck(nstatf, 2, (nesc + nprocs - 1) / nprocs))
        allocate (last % ap(nchanf, 2, (nesc + nprocs - 1) / nprocs))

        call print_transition_header(last)

        ! apply the inner dipoles matrix on the previous inner region solution expansions
        call reset_timer(t, dt)
        call apply_dipole_matrix(moldat, icomp, s, transp, nstatf, nstatn, state % ck, Dpsi)
        call reset_timer(t, dt)
        print '(4x,A,I0,A)', 'inner dipoles applied in ', dt, ' s'

        ! initialize computation of the R-matrix
        call calculate_R_matrix(0, moldat, irrf, 0._wp, [ 0._wp ], Pw, wPw);

        ! for all scattering energies
        do ie = erange(1) + myproc - 1, erange(2), nprocs

            mye = mye + 1

            ! calculate photon energies and the initial state energy (take into account user-specified first ionization potential)
            Ei = calc_Ei
            call calculate_photon_energies(first_IP, km(ikf) % escat(ie), moldat % etarg(1), Ei, Ephoton, omega)

            ! quantum numbers of the final state
            Etotf = Ei + sum(omega(1 : last % order))
            Ef = km(ikf) % escat(ie) - echl - sum(omega(last % order + 1 :))
            lf = moldat % l2p(1:nchanf, irrf)
            nopen = merge(count(Ef + closed_range > 0), count(Ef > 0), closed_interm)

            print '(4x,A,*(I0,A))', 'energy ', ie, ' of ', km(ikf) % nescat, &
                                    ' (', count(Ef > 0), ' open / ', nopen, ' open + weakly closed channels of ', nchanf, ')'

            ! evaluate Coulomb functions at this energy for all partial waves in the final IRR
            fc = 0; fcp = 0; gc = 0; gcp = 0; hc = 0; hcp = 0; lambda = 0
            call evaluate_fundamental_solutions(moldat, moldat % rmatr, irrf, nopen, Ef, fc, gc, fcp, gcp, sqrtknorm = .false.)
            do ipw = 1, nopen
                hc(ipw)  = gc(ipw) + imu*fc(ipw)
                hcp(ipw) = gcp(ipw) + imu*fcp(ipw)
                lambda(ipw) = hcp(ipw) / hc(ipw)
            end do

            ! inner region part of the right-hand side of the master equation
            rhs = Dpsi(:, :, mye)

            ! outer region correction to the right-hand side
            Ek = km(ikf) % escat(ie) - sum(omega(last % order + 1 :))
            call multiint(moldat, r0, Ei, Ek, omega, mye, last, +1, beta, cache)
            beta = -2 * beta / sqrt(2*abs(Ef))
            where (Ef < 0) beta = beta / imu
            corr(:, 1) = real(beta * (fcp - lambda * fc), wp)
            corr(:, 2) = aimag(beta * (fcp - lambda * fc))
            call apply_boundary_amplitudes(moldat, irrf, 'T', corr, Dpsi(:, :, mye))
            rhs = rhs - 0.5 * Dpsi(:, :, mye)

            ! solve the master equation using the Sherman-Morrison-Woodbury formula
            P = 1 / (Etotf - moldat % eig(1:nstatf, irrf))
            ckp(:, 1) = P * rhs(:, 1)
            ckp(:, 2) = P * rhs(:, 2)

            if (nopen > 0) then

                ! compose the reduced inverse Hamiltonian
                H = 0
                call calculate_R_matrix(1, moldat, irrf, Etotf, P, Pw, wPw)
                H(1:nopen, 1:nopen) = wPw(1:nopen, 1:nopen)
                do i = 1, nopen
                    H(i, i) = H(i, i) + 2/lambda(i)
                end do
                do i = nopen + 1, nchanf
                    H(i, i) = 1
                end do
                call apply_boundary_amplitudes(moldat, irrf, 'N', ckp, wPD)

                ! solve the reduced system
                IwPD = 0
                call solve_complex_system(nopen, H, wPD, IwPD)

                ! expand the reduced solution to the full solution
                call apply_boundary_amplitudes(moldat, irrf, 'T', IwPD, ckp)
                ckp(:, 1) = P * (rhs(:, 1) - ckp(:, 1))
                ckp(:, 2) = P * (rhs(:, 2) - ckp(:, 2))

            end if

            ! extract the outer region channel amplitudes
            call apply_boundary_amplitudes(moldat, irrf, 'N', ckp, wckp)
            app = cmplx(wckp(:, 1), wckp(:, 2), wp)
            where (hc /= 0)
                app = (app - beta*fc) / hc
            elsewhere
                app = 0
            end where

            ! for debugging purposes
            if (test_expansion) then
                wckp(:, 1) = real(app)
                wckp(:, 2) = aimag(app)
                write (filename, '(a,i0,a,i0,a)') 'intermediate-state-', irrf, '-', ie, '.txt'
                call test_outer_expansion(trim(filename), moldat, irrf, ckp, wckp, Etotf)
            end if

            if (verbose) print '(4x,A,*(E25.15))', ' - beta: ', beta(1:nopen)
            if (verbose) print '(4x,A,*(E25.15))', ' - ap:   ', app(1:nopen)

            last % ck(:, 1, mye) = ckp(:, 1)
            last % ck(:, 2, mye) = ckp(:, 2)

            last % ap(:, 1, mye) = real(app, wp)
            last % ap(:, 2, mye) = aimag(app)

        end do

        ! finalize computation of the R-matrix
        call calculate_R_matrix(2, moldat, irrf, 0._wp, [ 0._wp ], Pw, wPw);

        call reset_timer(t, dt)
        print '(4x,A,I0,A)', 'intermediate states solved in ', dt, ' s'

    end subroutine solve_intermediate_state


    !> \brief   Calculate dipole elements from intermediate and final states
    !> \author  J Benda
    !> \date    2020 - 2024
    !>
    !> Calculates the transition dipole matrix element between the last intermediate state and the final
    !> stationary photoionization state.
    !>
    !> \param[in]  moldat   MolecularData object with data read from the file *molecular_data*.
    !> \param[in]  order    Perturbation order of the intermediate state to calculate.
    !> \param[in]  Ephoton  Fixed photon energies in a.u. or zeros for flexible photons.
    !> \param[in]  icomp    Which Cartesian component of the dipole operator will give rise to the intermediate state.
    !> \param[in]  s        Which "transition" in *molecular_data* corresponds to the action of this dipole component on parent.
    !> \param[in]  cache    Precomputed integrals.
    !> \param[in]  mgvnn    MGVN of the previous intermediate state.
    !> \param[in]  mgvn1    Ket MGVN for the transition "s" as stored in *molecular_data*.
    !> \param[in]  mgvn2    Bra MGVN for the transition "s" as stored in *molecular_data*.
    !> \param[in]  km       KMatrix objects for relevant irreducible representations with data read from RSOLVE K-matrix files.
    !> \param[in]  ak       Wave function coeffs (from RSOLVE) for the same set of irrs as km.
    !> \param[inout] state  Tree of intermediate states, pointing at the previous intermediate state to develop into next state.
    !> \param[in]  verbose  Debugging output intensity.
    !> \param[in]  calc_Ei  Initial state energy calculated by (MPI-)SCATCI or BOUND.
    !> \param[in]  first_IP Ionization potential according to the input namelist.
    !> \param[in]  r0       Radius from which to apply asymptotic integrals.
    !> \param[in]  erange   Range (subset) of energies in K-matrix files to use for calculations.
    !>
    subroutine extract_dipole_elements (moldat, order, Ephoton, icomp, s, cache, mgvnn, mgvn1, mgvn2, &
                                        km, ak, state, verbose, calc_Ei, first_IP, r0, erange)

        use multidip_io,      only: MolecularData, KMatrix, ScatAkCoeffs, myproc, nprocs, apply_dipole_matrix
        use multidip_outer,   only: evaluate_fundamental_solutions, calculate_K_matrix
        use multidip_params,  only: cone, czero, compt, maxtarg, custom_kmatrices
        use multidip_special, only: calculate_S_matrix, calculate_T_matrix, blas_zgemv => zgemv

        type(MolecularData),             intent(in) :: moldat
        type(KMatrix),      allocatable, intent(in) :: km(:)
        type(ScatAkCoeffs), allocatable, intent(in) :: ak(:)
        type(integral_cache_t),          intent(in) :: cache(:)

        type(IntermediateState), pointer, intent(inout) :: state
        type(IntermediateState), pointer                :: last

        integer,  intent(in) :: order, icomp, s, mgvnn, mgvn1, mgvn2, erange(2)
        logical,  intent(in) :: verbose
        real(wp), intent(in) :: Ephoton(:), calc_Ei, first_IP, r0

        character(len=1) :: transp
        character(len=1024) :: filename

        complex(wp), allocatable :: Af(:, :), dm(:), dp(:), Sm(:, :), tmat(:, :)
        real(wp), allocatable :: Dpsi(:, :, :), Ef(:), echlf(:), d_inner(:, :), d_outer(:, :), Re_Af(:, :), Im_Af(:, :)
        real(wp), allocatable :: omega(:), d_total(:, :), kmat(:, :), Sf(:), Cf(:), Sfp(:), Cfp(:)
        integer,  allocatable :: lf(:)

        real(wp) :: Etotf, Ei, Ek
        integer  :: mgvnf, irrn, irrf, ikn, ikf, nstatn, nstatf, nchann, nchanf, nesc, ie, nopen, nochf, mye, nene, maxchan, i
        integer  :: t, dt

        integer(blasint) :: m, n, lds, inc = 1

        if (mgvnn == mgvn1) then
            mgvnf = mgvn2
            transp = 'T'
        else
            mgvnf = mgvn1
            transp = 'N'
        end if

        irrn = findloc(moldat % mgvns, mgvnn, 1)
        irrf = findloc(moldat % mgvns, mgvnf, 1)

        ikn = findloc(km(:) % mgvn, mgvnn, 1)
        ikf = findloc(km(:) % mgvn, mgvnf, 1)

        nstatn = moldat % mnp1(irrn)
        nstatf = moldat % mnp1(irrf)

        nchann = km(ikn) % nchan
        nchanf = km(ikf) % nchan

        nesc = erange(2) - erange(1) + 1
        nene = (nesc + nprocs - 1) / nprocs
        mye = 0
        t = 0

        maxchan = nchanf
        if (maxtarg > 0) then
            maxchan = count(moldat % ichl(1:nchanf, irrf) <= maxtarg)
        end if

        allocate (Dpsi(nstatf, 2, nene), omega(order), Ef(nchanf), lf(nchanf), echlf(nchanf), &
                  Re_Af(nstatf, maxchan), Im_Af(nstatf, maxchan), Af(nstatf, maxchan), Sm(nchanf, nchanf), &
                  d_inner(maxchan, 2), d_outer(maxchan, 2), d_total(maxchan, 2), dm(maxchan), dp(nchanf), &
                  Sf(nchanf), Cf(nchanf), Sfp(nchanf), Cfp(nchanf), kmat(nchanf, nchanf), tmat(nchanf, nchanf))

        echlf(1:nchanf) = moldat % etarg(moldat % ichl(1:nchanf, irrf)) - moldat % etarg(1)

        ! set up a final state at the end of the storage chain
        last => state
        do while (associated(last % next))
            last => last % next
        end do
        allocate (last % next)
        last % next % prev => last
        last => last % next
        last % order = order
        last % mgvn = mgvnf
        last % dcomp = icomp
        last % parent => state
        allocate (last % dip(maxchan, 2, nene))

        call print_transition_header(last)

        ! apply the inner dipoles matrix on the previous inner region solution expansions
        call reset_timer(t, dt)
        call apply_dipole_matrix(moldat, icomp, s, transp, nstatf, nstatn, state % ck, Dpsi)
        call reset_timer(t, dt)
        print '(4x,A,I0,A)', 'inner dipoles applied in ', dt, ' s'

        ! position K-matrix file pointer at the beginning of (this process') K-matrix data
        call km(ikf) % reset_kmatrix_position(skip = myproc - 1)
        if (allocated(ak)) call ak(ikf) % reset_wfncoeffs_position(skip = myproc - 1)

        ! for all scattering energies
        do ie = myproc, erange(2), nprocs

            ! read the K-matrix and Ak coeffs from the file and then skip the next 'nprocs - 1' records used by other images
            call km(ikf) % get_kmatrix(kmat, skip = nprocs - 1)
            if (allocated(ak)) call ak(ikf) % get_wfncoeffs(Ek, Re_Af, Im_Af, skip = nprocs - 1)
            if (ie < erange(1)) cycle

            mye = mye + 1
            last % dip(:, :, mye) = 0

            ! calculate photon energies and the initial state energy (take into account user-specified first ionization potential)
            Ei = calc_Ei
            call calculate_photon_energies(first_IP, km(ikf) % escat(ie), moldat % etarg(1), Ei, Ephoton, omega)

            ! quantum numbers of the final state
            Etotf = Ei + sum(omega)
            Ef = km(ikf) % escat(ie) - echlf        ! photoelectron energies in all (open and closed) channels
            lf = moldat % l2p(1:nchanf, irrf)       ! photoelectron angular momentum in all (open and closed) channels
            nopen = count(Ef > 0)                   ! number of all open channels
            nochf = min(nopen, maxchan)             ! number of open channels for which we want to obtain dipoles

            print '(4x,A,*(I0,A))', 'energy ', ie, ' of ', km(ikf) % nescat, ' (', nopen, ' open channels of ', nchanf, ')'

            ! get values and derivatives of the fundamental solutions of uncoupled (or dipole-coupled) outer region equations
            call evaluate_fundamental_solutions(moldat, moldat % rmatr, irrf, nchanf, Ef, Sf, Cf, Sfp, Cfp, sqrtknorm = .true.)

            ! if desired, recalculate the K-matrix here
            if (custom_kmatrices) then
                call calculate_K_matrix(moldat, nopen, irrf, Etotf, Sf, Cf, Sfp, Cfp, kmat)
            end if
            call calculate_T_matrix(kmat, tmat, nopen)

            ! contract D*psi with complex-conjugated wave function coefficients Ak to get inner region contribution
            if (.not. allocated(ak)) then
                call apply_Ak_coefficients_multidip(Dpsi(:, :, mye), d_inner, moldat, nopen, irrf, Etotf, &
                                                    Sfp, Cfp, kmat, tmat, .true.)
            else
                call apply_Ak_coefficiens_compak(Dpsi(:, :, mye), d_inner, Re_Af(:, 1:nochf), Im_Af(:, 1:nochf))
            end if

            ! outer region correction
            dm = 0
            dp = 0
            call multiint(moldat, r0, Ei, km(ikf) % escat(ie), omega, mye, last, -1, dm(1:nochf), cache)
            call multiint(moldat, r0, Ei, km(ikf) % escat(ie), omega, mye, last, +1, dp(1:nopen), cache)
            dm(1:nochf) = dm(1:nochf) * imu / sqrt(2*pi*sqrt(2*Ef(1:nochf)))
            dp(1:nopen) = dp(1:nopen) * imu / sqrt(2*pi*sqrt(2*Ef(1:nopen)))
            if (verbose) print '(5x,A,1x,*(1x,E25.15))', 'dm = ', dm(1:nochf)
            if (verbose) print '(5x,A,1x,*(1x,E25.15))', 'dp = ', dp(1:nopen)
            if (any(dp /= 0) .or. test_expansion) then
                m   = int(nopen, blasint)
                n   = int(nochf, blasint)
                lds = int(nchanf, blasint)
                call calculate_S_matrix(tmat, Sm, nopen)
                call blas_zgemv('T', m, n, -cone, Sm, lds, dp, inc, cone, dm, inc)  ! dm = dm - S^T dp
            end if
            d_outer(:, 1) = real(dm)
            d_outer(:, 2) = aimag(dm)

            ! for debugging purposes
            if (test_expansion) then
                write (filename, '(a,i0,a,i0,a)') 'final-state-', irrf, '-', ie, '.txt'
                call test_final_expansion(trim(filename), moldat, irrf, nopen, Etotf, Ef, Sfp, Cfp, kmat, tmat)
            end if

            ! combine and store the partial wave dipoles
            d_total = d_inner + d_outer
            last % dip(:, :, mye) = d_total

            if (verbose) print '(5x,A,1x,*(1x,E25.15))', 'd_inner = ', (d_inner(i, 1), d_inner(i, 2), i = 1, nochf)
            if (verbose) print '(5x,A,1x,*(1x,E25.15))', 'd_outer = ', (d_outer(i, 1), d_outer(i, 2), i = 1, nochf)
            if (verbose) print '(5x,A,1x,*(1x,E25.15))', 'd_total = ', (d_total(i, 1), d_total(i, 2), i = 1, nochf)

        end do

        call reset_timer(t, dt)
        print '(4x,A,I0,A)', 'matrix elements calculated in ', dt, ' s'

    end subroutine extract_dipole_elements


    !> \brief   Prints a one-line summary of the transition
    !> \authors J Benda
    !> \date    2023
    !>
    !> A sample output can look like this:
    !> \verbatim
    !>    Transtion 0 -[x]-> 1 -[y]-> 3
    !> \endverbatim
    !>
    subroutine print_transition_header (state)

        use multidip_params, only: compt

        type(IntermediateState), pointer, intent(in) :: state
        type(IntermediateState), pointer :: ptr

        integer, allocatable :: irreds(:), compts(:)
        integer :: order, i

        order = state % order

        allocate (irreds(order + 1), compts(order))

        ptr => state
        irreds(order + 1) = ptr % mgvn
        do while (associated(ptr % parent))
            compts(order) = ptr % dcomp
            ptr => ptr % parent
            irreds(order) = ptr % mgvn
            order = order - 1
        end do

        write (*, '(2x,a)', advance = 'no') 'Transition'
        do i = 1, state % order
            write (*, '(1x,i0,1x,3a)', advance = 'no') irreds(i), '-[', compt(compts(i)), ']->'
        end do
        write (*, '(1x,i0)') irreds(state % order + 1)

    end subroutine print_transition_header


    !> \brief   Multiply vector by the (complex-conjugated) wave function coefficients
    !> \authors J Benda
    !> \date    2023
    !>
    !> Contracts the provided wave function coefficients with the given inner region expansion.
    !>
    !> \param[in]  psi      Inner region wave function expansion coefficients as two columns: real and imaginary part.
    !> \param[in]  Apsi     The result, in the same form as psi.
    !> \param[in]  ReAk     Real part of the wave function coefficients of shape (nstat, nopen).
    !> \param[in]  ImAk     Imag part of the wave function coefficients of shape (nstat, nopen).
    !>
    subroutine apply_Ak_coefficiens_compak (psi, Apsi, ReAk, ImAk)

        use multidip_params,  only: rone, rzero
        use multidip_special, only: blas_dgemv => dgemv

        real(wp), intent(in)    :: psi(:, :), ReAk(:, :), ImAk(:, :)
        real(wp), intent(inout) :: Apsi(:, :)

        integer(blasint) :: m, n, inc = 1

        m = int(size(ReAk, 1), blasint)
        n = int(size(ReAk, 2), blasint)

        call blas_dgemv('T', m, n,  rone, ReAk, m, psi(:, 1), inc, rzero, Apsi(:, 1), inc)
        call blas_dgemv('T', m, n, +rone, ImAk, m, psi(:, 2), inc, +rone, Apsi(:, 1), inc)
        call blas_dgemv('T', m, n,  rone, ReAk, m, psi(:, 2), inc, rzero, Apsi(:, 2), inc)
        call blas_dgemv('T', m, n, -rone, ImAk, m, psi(:, 1), inc, +rone, Apsi(:, 2), inc)

    end subroutine apply_Ak_coefficiens_compak


    !> \brief   Multiply vector by the (complex-conjugated) wave function coefficients
    !> \authors J Benda
    !> \date    2023 - 2024
    !>
    !> Contracts the provided wave function coefficients with the inner region expansion, which will be computed on the fly.
    !> The following formula is used:
    !> \f[
    !>      A_{rj}^{(-)}(E) = \frac{1}{2} \frac{1}{E_j - E} \sum_{pq} w_{pj} F_{pq}^{(-)\prime} \,,
    !> \f]
    !> where
    !> \f[
    !>      F_{pq}^{(-)\prime} = \sum_{s} \sqrt{\frac{2}{\pi k_p}} (S_p' \delta_{ps} + C_p' K_{ps})
    !>         [(1 + i K_{oo})^{-1}]_{sq}\,.
    !> \f]
    !> The last factor uses only the open-open subset of the K-matrix. In this subroutine it is evaluated by means
    !> of the T-matrix (defined as S - 1) as
    !> \f[
    !>      (1 + iK)^{-1} = 1 + \frac{1}{2}T^* \,.
    !> \f]
    !>
    !> Altogether, application of this subroutine does the following:
    !> \f[
    !>      \chi = \psi \mathsf{A}^{(-)*} = \psi \frac{1}{\mathsf{E} - E} \mathsf{w}^\top \sqrt{ \frac{2}{\pi \mathsf{k}} }
    !>              (\mathsf{T}\mathsf{S}' + \mathsf{T} \mathsf{C}' \mathsf{K})^* (1 + i\mathsf{K})^{-1*} \,.
    !> \f]
    !>
    !> \param[in]  psi      Inner region wave function expansion coefficients as two columns: real and imaginary part.
    !> \param[in]  Apsi     The result, in the same form as psi.
    !> \param[in]  moldat   Molecular data class.
    !> \param[in]  nopen    Number of open channels.
    !> \param[in]  irr      Irreducible representation index.
    !> \param[in]  Etot     Total energy of the system.
    !> \param[in]  Sp       Derivative of the regular asymptotic solution at boundary per partial wave channel.
    !> \param[in]  Cp       Derivative of the irregular asymptotic solution at boundary per partial wave channel.
    !> \param[in]  kmat     K-matrix
    !> \param[in]  tmat     T-matrix defined as T = S - 1
    !> \param[in]  conj     Apply a conjugated wave-function coefficient (for photodipoles)
    !>
    subroutine apply_Ak_coefficients_multidip (psi, Apsi, moldat, nopen, irr, Etot, Sp, Cp, kmat, tmat, conj)

        use multidip_io,      only: MolecularData, apply_boundary_amplitudes
        use multidip_params,  only: czero, rzero

        type(MolecularData), intent(in)    :: moldat
        real(wp),            intent(in)    :: psi(:, :), Etot
        real(wp),            intent(inout) :: Apsi(:, :)
        integer,             intent(in)    :: nopen, irr
        real(wp),            intent(in)    :: Sp(:), Cp(:), kmat(:, :)
        complex(wp),         intent(in)    :: tmat(:, :)
        logical,             intent(in)    :: conj

        real(wp),    allocatable :: tmps(:, :), tmpc(:, :)
        complex(wp), allocatable :: tmpo(:)

        integer     :: nchan, nstat, i, j
        real(wp)    :: Fij
        complex(wp) :: z, T

        nstat = moldat % mnp1(irr)
        nchan = moldat % nchan(irr)

        allocate (tmps(nstat, 2), tmpc(nchan, 2), tmpo(nopen))

        ! multiply solution by R-matrix poles
        tmps(:, 1) = psi(:, 1) / (moldat % eig(1:nstat, irr) - Etot)
        tmps(:, 2) = psi(:, 2) / (moldat % eig(1:nstat, irr) - Etot)

        ! contract with the boundary amplitudes
        call apply_boundary_amplitudes(moldat, irr, 'N', tmps, tmpc)

        ! multiply by complex-conjugated matrix F' = (TS' + TC'K) / √2π
        do j = 1, nopen
            tmpo(j) = 0
            do i = 1, nchan
                Fij = (merge(Sp(i), rzero, i == j) + Cp(i)*kmat(i, j)) / sqrt(2*pi)
                tmpo(j) = tmpo(j) + cmplx(tmpc(i, 1), tmpc(i, 2), wp)*Fij
            end do
        end do

        ! multiply by complex-conjugated T (nopen-by-nopen)
        do j = 1, min(nopen, size(Apsi, 1))
            Apsi(j, 1) = 0
            Apsi(j, 2) = 0
            do i = 1, nopen
                T = merge(1, 0, i == j) + conjg(tmat(i, j))/2  ! = (1 + iK)⁻¹
                if (conj) T = conjg(T)
                z = tmpo(i) * T
                Apsi(j, 1) = Apsi(j, 1) + real(z)
                Apsi(j, 2) = Apsi(j, 2) + aimag(z)
            end do
        end do

        ! erase the closed-channel amplitudes
        do j = nopen + 1, min(nchan, size(Apsi, 1))
            Apsi(j, 1) = 0
            Apsi(j, 2) = 0
        end do

    end subroutine apply_Ak_coefficients_multidip


    !> \brief   Write radially sampled final wave-function to file
    !> \authors J Benda
    !> \date    2023 - 2024
    !>
    !> Sample the final wave function in all channels at several radii and print results to the provided file. The sampling
    !> inside the inner region is fully given by the sampling points defined in the molecular_data file. In the outer region,
    !> the values are obtained at equidistant points stretching to twice the R-matrix sphere radius.
    !>
    subroutine test_final_expansion (filename, moldat, irr, nopen, Etot, Ek, Sp, Cp, kmat, tmat)

        use multidip_io,    only: MolecularData
        use multidip_outer, only: evaluate_fundamental_solutions

        character(len=*),       intent(in) :: filename
        type(MolecularData),    intent(in) :: moldat
        integer,                intent(in) :: irr, nopen
        real(wp),               intent(in) :: Etot, Ek(:), Sp(:), Cp(:), kmat(:, :)
        complex(wp),            intent(in) :: tmat(:, :)

        real(wp),    allocatable :: psi(:, :), Apsi(:, :), S(:), C(:), dSdr(:), dCdr(:)
        complex(wp), allocatable :: Fqp(:, :), Hp(:), Hm(:)

        real(wp)    :: dr, r
        integer     :: i, nfdm, u, nchan, p, q, nstat

        nfdm = size(moldat % r_points) - 1
        nchan = moldat % nchan(irr)
        nstat = moldat % mnp1(irr)
        dr = 0.1

        allocate (Fqp(nchan, nopen), psi(nstat, 2), Apsi(nchan, 2), S(nchan), C(nchan), Hp(nchan), Hm(nchan), dSdr(nchan), &
                  dCdr(nchan))

        open (newunit = u, file = filename, action = 'write', form = 'formatted')

        ! inner region
        do i = 1, nfdm
            write (u, '(e25.15)', advance = 'no') moldat % r_points(i)
            do q = 1, nchan
                if (.not. associated(moldat % wmat2(i, irr) % mat)) then
                    print '(a)', 'Error: wmat2 has not been read from molecular_data'
                    stop 1
                else if (moldat % wmat2(i, irr) % distributed) then
                    print '(a)', 'Error: test_outer_expansion not implemented in MPI-IO mode'
                    stop 1
                end if
                ! the "wave-function" is a row of a boundary amplitudes matrix to reduce the Ak coefficient with
                psi(:, 1) = moldat % wmat2(i, irr) % mat(q, :)
                psi(:, 2) = 0
                ! perform the reduction w*A
                call apply_Ak_coefficients_multidip(psi, Apsi, moldat, nopen, irr, Etot, Sp, Cp, kmat, tmat, .false.)
                ! combine components to a single complex number
                Fqp(q, 1:nopen) = cmplx(Apsi(1:nopen, 1), Apsi(1:nopen, 2), wp)
            end do
            write (u, '(*(e25.15))') Fqp(1:nchan, 1:nopen)
        end do

        ! outer region
        do i = 0, nint(moldat % rmatr/dr)
            r = moldat % rmatr + i*dr
            write (u, '(e25.15)', advance = 'no') r
            call evaluate_fundamental_solutions(moldat, r, irr, nchan, Ek, S, C, dSdr, dCdr)
            Hp = (C + imu*S) * (-imu) / sqrt(2*pi)
            Hm = (C - imu*S) * (-imu) / sqrt(2*pi)
            do p = 1, nopen
                do q = 1, nchan
                    if (q == p) Fqp(q, p) = Hp(q)
                    if (q /= p) Fqp(q, p) = 0
                    Fqp(q, p) = Fqp(q, p) - Hm(q) * conjg(1 + tmat(q, p))
                    write (u, '(*(e25.15))', advance = 'no') Fqp(q, p)
                end do
            end do
            write (u, '()')
        end do

        close (u)

    end subroutine test_final_expansion


    !> \brief   Get current time stamp
    !> \author  J Benda
    !> \date    2022
    !>
    !> Update the parameter "t" with the current system time stamp. Store into "dt" the elapsed time in seconds.
    !> If the elapsed time is longer than an integer multiple of the system clock period, only the remainder will
    !> be reported.
    !>
    !> In parallel mode individual processes work in sync, so we have to wait for all before reading out the time.
    !>
    subroutine reset_timer (t, dt)

        use mpi_gbl, only: mpi_mod_barrier

        integer, intent(inout) :: t
        integer, intent(out)   :: dt
        integer                :: clk_now, clk_rate, clk_max, ierr

        call mpi_mod_barrier(ierr)

        call system_clock(clk_now, clk_rate, clk_max)

        if (t < clk_now) then
            dt = (clk_now - t) / clk_rate
        else
            ! Cater for possible system clock wrap-around. Also pay attention to integer overflow; clk_max
            ! is typically the upper limit of the given integer type.
            dt = ((clk_max - t) + clk_now) / clk_rate
        end if

        t = clk_now

    end subroutine reset_timer


    !> \brief   Adjust ionization potential and calculate energy of each photon
    !> \author  J Benda
    !> \date    2021 - 2024
    !>
    !> Calculate energies of all photons. The positive elements of the input array `Ephoton` specify fixed
    !> energies of selected photons. The remaining energy needed to reach the ionisation potential is divided
    !> among the remaining photons.
    !>
    !> The user-specified parameter `first_IP` will be used to redefine the energy of the initial state before ionization.
    !>
    !> \param[in]    first_IP Total energy needed to ionize the target to the first ionic state.
    !> \param[in]    escat    Photoelectron kinetic energy after absorption of all photons (only needed for "floating IP").
    !> \param[in]    etarg    Energy of the first ionic state.
    !> \param[inout] Ei       Energy of the initial state before ionization; will be shifted if `first_IP` is non-zero.
    !> \param[in]    Ephoton  User-specified fixed energies of photons or zeros for flexible photons.
    !> \param[inout] omega    Calculated energies of all photons.
    !>
    subroutine calculate_photon_energies (first_IP, escat, etarg, Ei, Ephoton, omega)

        real(wp), intent(in)    :: first_IP, escat, etarg, Ephoton(:)
        real(wp), intent(inout) :: Ei, omega(:)
        real(wp)                :: IP, Ef

        ! calculate total final energy
        Ef = etarg + escat

        ! obtain the calculated ionization potential (including non-zero photoelectron energy)
        IP = Ef - Ei

        ! set the requested ionization potential by shifting the initial state energy
        if (first_IP > 0) then
            IP = first_IP + escat
            Ei = Ef - IP
        end if

        ! calculate photon energies
        where (Ephoton /= 0)
            omega = Ephoton
        elsewhere
            omega = (IP - sum(Ephoton)) / (size(omega) - count(Ephoton /= 0))
        end where

    end subroutine calculate_photon_energies


    !> \brief   Precompute outer radial integrals (driver)
    !> \authors J Benda
    !> \date    2024
    !>
    !> If `cache_integrals` is enabled, precompute all needed radial outer region multiphoton integrals.
    !>
    !> \param[inout] integral_cache  Precomputed integrals per this task's energy.
    !> \param[in]    moldat          Molecular data.
    !> \param[in]    esc             Scattering energies.
    !> \param[in]    nphot           Number of absorbed photons.
    !> \param[in]    r0              Numerical integration distance.
    !> \param[in]    erange          Index of the first and last scatternig energy to consider.
    !> \param[in]    calc_Ei         The calculated energy of the initial state.
    !> \param[in]    first_IP        First ionization potential needed to obtain photon energies from known kinetic energies.
    !> \param[in]    Ephoton         Absorbed photon energies.
    !> \param[in]    verbose         Whether to print the cache to standard output.
    !>
    subroutine precompute_integral_cache (integral_cache, moldat, esc, nphot, r0, erange, calc_Ei, first_IP, Ephoton, verbose)

        use multidip_integ,  only: nested_cgreen_integ
        use multidip_io,     only: MolecularData, myproc, nprocs
        use multidip_params, only: cache_integrals, extend_istate

        type(integral_cache_t), allocatable, intent(inout) :: integral_cache(:)
        type(MolecularData),                 intent(in)    :: moldat

        real(wp), intent(in) :: esc(:), Ephoton(:), r0, calc_Ei, first_IP
        integer,  intent(in) :: nphot, erange(2)
        logical,  intent(in) :: verbose

        complex(wp), allocatable :: ks(:)
        integer,     allocatable :: rchs(:), ms(:), ls(:), pws_coupled(:, :), ion_coupled(:, :)

        real(wp) :: ebase, etarg, escat, Ei, Ek, omega(size(Ephoton))
        integer  :: nesc, nene, mye, rchi, rchj, nredch, ntarg, itarg, jtarg, li, lj, order, ie, lmax, nrchs, t, dt

        order = merge(nphot, nphot - 1, extend_istate)  ! outer integral dimension

        if (order <= 0 .or. .not. cache_integrals) return

        print '(2x,a)', 'Precomputing outer integrals'

        nesc = size(esc)                        ! total number of scattering energies
        nene = (nesc + nprocs - 1) / nprocs     ! number of scattering energies assigned to this process
        ntarg = size(moldat % etarg)            ! total number of target states in outer expansion
        lmax = maxval(moldat % l2p)             ! highest partial wave angular momentum
        nredch = ntarg * (lmax + 1)             ! total number of reduced channels
        ebase = moldat % etarg(1)               ! energy of the first residual ion state

        call reset_timer(t, dt)

        allocate (integral_cache(nene), rchs(order + 1), ms(order), ks(order + 1), ls(order + 1), &
                  pws_coupled(nredch + 1, nredch), ion_coupled(nredch + 1, nredch))

        ! find out which reduced channels are dipole-coupled in partial waves
        pws_coupled = 0
        do li = 0, lmax
            do lj = max(0, li - 1), min(lmax, li + 1)
                do itarg = 1, ntarg
                    rchi = li*ntarg + itarg
                    rchj = lj*ntarg + itarg
                    pws_coupled(rchi + 1, rchj) = rchi
                    pws_coupled(rchj + 1, rchi) = rchj
                end do
            end do
        end do

        ! find out which reduced channels are dipole-coupled in residual ion
        ion_coupled = 0
        do itarg = 1, ntarg
            do jtarg = 1, ntarg
                if (any(moldat % crlv(itarg, jtarg, :) /= 0)) then
                    do li = 0, lmax
                        rchi = li*ntarg + itarg
                        rchj = li*ntarg + jtarg
                        ion_coupled(rchi + 1, rchj) = rchi
                        ion_coupled(rchj + 1, rchi) = rchj
                    end do
                end if
            end do
        end do

        ! for each reduced channel assemble a list of dipole-coupled reduced channels
        do rchi = 1, nredch
            ! coupled partial waves
            nrchs = count(pws_coupled(2:, rchi) /= 0, 1)
            pws_coupled(1, rchi) = nrchs
            pws_coupled(2:nrchs + 1, rchi) = pack(pws_coupled(2:, rchi), pws_coupled(2:, rchi) /= 0)
            ! coupled residual ions
            nrchs = count(ion_coupled(2:, rchi) /= 0, 1)
            ion_coupled(1, rchi) = nrchs
            ion_coupled(2:nrchs + 1, rchi) = pack(ion_coupled(2:, rchi), ion_coupled(2:, rchi) /= 0)
        end do

        ! allocate integral storage
        do mye = 1, nene
            allocate (integral_cache(mye) % next_pws(nredch))
        end do

        ! precompute the integrals in parallel
        !$omp parallel do schedule(dynamic, 1) default(none) &
        !$omp& private(mye, rchi, itarg, ie, Ei, Ek, etarg, escat) &
        !$omp& shared(nene, nredch, integral_cache, moldat, pws_coupled, ion_coupled, nprocs, myproc, esc) &
        !$omp& firstprivate(rchs, ms, ks, ls, order, ntarg, nphot, ebase, r0, omega, erange, Ephoton, calc_Ei, first_IP)
        do mye = 1, nene
            ! calculate global energy index
            ie = (mye - 1)*nprocs + myproc
            if (ie < erange(1) .or. ie > erange(2)) cycle
            ! calculate unspecified photon energies
            Ei = calc_Ei
            call calculate_photon_energies(first_IP, esc(ie), ebase, Ei, Ephoton, omega)
            ! process all final reduced channels
            do rchi = 1, nredch
                rchs(order + 1) = rchi
                itarg = mod(rchi - 1, ntarg) + 1
                etarg = moldat % etarg(itarg)
                escat = esc(ie) - sum(omega(nphot + 1:))
                Ek = escat + ebase - etarg
                if (Ek >= 0) then
                    ks(order + 1) = sqrt(2*Ek)
                else
                    ks(order + 1) = cmplx(0._wp, sqrt(-2*Ek), wp)
                    cycle  ! FIXME: closed channels seem to be used in 3+ photon ionization?
                end if
                ls(order + 1) = (rchi - 1) / ntarg
                call precompute_integral_cache_block(integral_cache(mye) % next_pws(rchi), moldat, pws_coupled, ion_coupled, &
                                                     order, omega(1:nphot), escat, r0, rchs, ms, ks, ls)
            end do
        end do

        if (verbose) then
            call print_integral_cache(integral_cache, erange, ntarg)
        end if

        call reset_timer(t, dt)

        print '(4x,a,i0,a)', 'radial integrals precomputed in ', dt, ' s'

    end subroutine precompute_integral_cache


    !> \brief   Precompute outer radial integrals (implementation)
    !> \authors J Benda
    !> \date    2024
    !>
    !> Follow on the provided chain of dipole-coupled reduced channels, precomputing all outer-region multi-photon integrals along
    !> the way.
    !>
    !> \param[inout] integral_cache  A block of precomputed integrals.
    !> \param[in]    moldat          Molecular data.
    !> \param[in]    pws_coupled     One column for each reduced chan.: number of pws-coupled reduced chans. followed by their list.
    !> \param[in]    ion_coupled     One column for each reduced chan.: number of ion-coupled reduced chans. followed by their list.
    !> \param[in]    order           Maximal dimension of integrals to recompute.
    !> \param[in]    omega           Absorbed photon energies.
    !> \param[in]    escat           Energy of the photoelectron in the first channel after absorption of all photons.
    !> \param[in]    r0              Numerical integration distance.
    !> \param[inout] rchs            Reduced channels chain: (i,l), (i',l'), (i'',l''), ...
    !> \param[inout] ms              Coordinate powers per photon absorption.
    !> \param[inout] ks              Final and intermediate momenta (possibly complex) of the photoelectron in this chain.
    !> \param[inout] ls              Angular momenta of the photoelectron in this chain.
    !>
    recursive subroutine precompute_integral_cache_block (integral_cache, moldat, pws_coupled, ion_coupled, order, omega, &
                                                          escat, r0, rchs, ms, ks, ls)

        use multidip_integ,  only: nested_cgreen_integ
        use multidip_io,     only: MolecularData
        use multidip_params, only: closed_interm, closed_range

        type(integral_cache_t), intent(inout) :: integral_cache
        type(MolecularData),    intent(in)    :: moldat

        real(wp),    intent(in)    :: omega(:), escat, r0
        integer,     intent(in)    :: order, pws_coupled(:, :), ion_coupled(:, :)
        integer,     intent(inout) :: rchs(:), ms(:), ls(:)
        complex(wp), intent(inout) :: ks(:)

        integer  :: rchi, rchj, N, ntarg, ncoup, icoup, jtarg
        real(wp) :: ebase, etarg, a, Z, c, Ek, Ekj

        rchi = rchs(order + 1)              ! current reduced channel
        N = size(rchs) - order              ! dimension of outer integrals that end at this absorption level
        ebase = moldat % etarg(1)           ! energy of the first residual ion
        ntarg = size(moldat % etarg)        ! number of residual ion states
        a = moldat % rmatr                  ! R-matrix radius
        Z = moldat % nz - moldat % nelc     ! residual ion charge
        c = 0                               ! dipole damping factor

        ! kinetic energy of photoelectron in the first channel before absorbing the current photon
        Ek = escat - sum(omega(size(omega) - order + 1:))

        ! process all reduced channels that are pws-coupled to the current reduced channel chain
        ncoup = pws_coupled(1, rchi)
        allocate (integral_cache % rchs_pws(ncoup), integral_cache % vals_pws(2, ncoup), integral_cache % next_pws(ncoup))
        do icoup = 1, ncoup
            rchj = pws_coupled(1 + icoup, rchi)                     ! index of reduced channel pws-coupled to rchi
            jtarg = mod(rchj - 1, ntarg) + 1                        ! corresponding target state
            etarg = moldat % etarg(jtarg)                           ! energy of the target state
            Ekj = Ek + ebase - etarg                                ! photoelectron kinetic energy in this channel
            if (Ekj + closed_range <= 0) cycle                      ! ignore strongly closed channels
            if (Ekj < 0 .and. .not. closed_interm) cycle            ! optionally ignore also weakly closed channels
            rchs(order) = rchj                                      ! store the reduced channel before absorption
            if (Ekj >= 0) then
                ks(order) = sqrt(2*Ekj)                             ! photoelectron momentum before absorption
            else
                ks(order) = cmplx(0._wp, sqrt(-2*Ekj), wp)          ! use imaginary momentum for closed channels
            end if
            ls(order) = (rchj - 1)/ntarg                            ! photoelectron angular momentum before absorption
            ms(order) = 1                                           ! coordinate power (dipole operator)
            integral_cache % rchs_pws(icoup) = rchs(order)
            integral_cache % vals_pws(1, icoup) = nested_cgreen_integ(Z, a, r0, c, N, +1, +1, ms(order:), ls(order:), ks(order:))
            integral_cache % vals_pws(2, icoup) = nested_cgreen_integ(Z, a, r0, c, N, +1, -1, ms(order:), ls(order:), ks(order:))
            if (order > 1) then
                call precompute_integral_cache_block(integral_cache % next_pws(icoup), moldat, pws_coupled, ion_coupled, &
                                                     order - 1, omega, escat, r0, rchs, ms, ks, ls)
            end if
        end do

        ! process all reduced channels that are ion-coupled to the current reduced channel chain
        ncoup = ion_coupled(1, rchi)
        allocate (integral_cache % rchs_ion(ncoup), integral_cache % vals_ion(2, ncoup), integral_cache % next_ion(ncoup))
        do icoup = 1, ncoup
            rchj = ion_coupled(1 + icoup, rchi)                     ! index of reduced channel pws-coupled to rchi
            jtarg = mod(rchj - 1, ntarg) + 1                        ! corresponding target state
            etarg = moldat % etarg(jtarg)                           ! energy of the target state
            Ekj = Ek + ebase - etarg                                ! photoelectron kinetic energy in this channel
            if (Ekj + closed_range <= 0) cycle                      ! ignore strongly closed channels
            if (Ekj < 0 .and. .not. closed_interm) cycle            ! optionally ignore also weakly closed channels
            rchs(order) = rchj                                      ! store the reduced channel before absorption
            if (Ekj >= 0) then
                ks(order) = sqrt(2*Ekj)                             ! photoelectron momentum before absorption
            else
                ks(order) = cmplx(0._wp, sqrt(-2*Ekj), wp)          ! use imaginary momentum for closed channels
            end if
            ls(order) = (rchj - 1)/ntarg                            ! photoelectron angular momentum before absorption
            ms(order) = 0                                           ! coordinate power (overlap operator)
            integral_cache % rchs_ion(icoup) = rchs(order)
            integral_cache % vals_ion(1, icoup) = nested_cgreen_integ(Z, a, r0, c, N, +1, +1, ms(order:), ls(order:), ks(order:))
            integral_cache % vals_ion(2, icoup) = nested_cgreen_integ(Z, a, r0, c, N, +1, -1, ms(order:), ls(order:), ks(order:))
            if (order > 1) then
                call precompute_integral_cache_block(integral_cache % next_ion(icoup), moldat, pws_coupled, ion_coupled, &
                                                     order - 1, omega, escat, r0, rchs, ms, ks, ls)
            end if
        end do

    end subroutine precompute_integral_cache_block


    !> \brief   Print precomputed integrals
    !> \authors J Benda
    !> \date    2024
    !>
    subroutine print_integral_cache (cache, erange, ntarg)

        use multidip_io, only: myproc, nprocs

        type(integral_cache_t), intent(in) :: cache(:)
        integer,                intent(in) :: erange(2), ntarg

        integer :: i, j

        do i = myproc, erange(2), nprocs
            if (i < erange(1)) cycle
            print '(4x,a,i0)', 'cache for energy #', i
            do j = 1, size(cache(i) % next_pws)
                call print_integral_cache_block(cache(i) % next_pws(j), ntarg, 1, [j])
            end do
        end do

    end subroutine print_integral_cache


    !> \brief   Print precomputed integrals
    !> \authors J Benda
    !> \date    2024
    !>
    recursive subroutine print_integral_cache_block (cache, ntarg, level, chain)

        type(integral_cache_t), intent(in) :: cache
        integer,                intent(in) :: ntarg, level, chain(level)

        integer     :: i, j, l, rch, ichl
        complex(wp) :: val(2)

        if (allocated(cache % rchs_pws)) then
            do i = 1, size(cache % rchs_pws)
                rch = chain(1)
                ichl = mod(rch - 1, ntarg) + 1
                l = (rch - 1)/ntarg
                write (*, '(6x,a,i0,a,i0,1x,i0)', advance = 'no') '( [', rch, '] ', ichl, l
                do j = 2, level
                    rch = chain(j)
                    ichl = mod(rch - 1, ntarg) + 1
                    l = (rch - 1)/ntarg
                    write (*, '(a,i0,a,i0,1x,i0)', advance = 'no') ' | [', rch, '] ', ichl, l
                end do
                rch = cache % rchs_pws(i)
                val = cache % vals_pws(:, i)
                ichl = mod(rch - 1, ntarg) + 1
                l = (rch - 1)/ntarg
                write (*, '(a,i0,a,i0,1x,i0,a,4e25.15)') ' | [', rch, '] ', ichl, l, ' ):', val
                call print_integral_cache_block(cache % next_pws(i), level + 1, ntarg, [chain, rch])
            end do
        end if

        if (allocated(cache % rchs_ion)) then
            do i = 1, size(cache % rchs_ion)
                rch = chain(1)
                ichl = mod(rch - 1, ntarg) + 1
                l = (rch - 1)/ntarg
                write (*, '(6x,a,i0,a,i0,1x,i0)', advance = 'no') '( [', rch, '] ', ichl, l
                do j = 2, level
                    rch = chain(j)
                    ichl = mod(rch - 1, ntarg) + 1
                    l = (rch - 1)/ntarg
                    write (*, '(a,i0,a,i0,1x,i0)', advance = 'no') ' | [', rch, '] ', ichl, l
                end do
                rch = cache % rchs_ion(i)
                val = cache % vals_ion(:, i)
                ichl = mod(rch - 1, ntarg) + 1
                l = (rch - 1)/ntarg
                write (*, '(a,i0,a,i0,1x,i0,a,4e25.15)') ' | [', rch, '] ', ichl, l, ' ):', val
                call print_integral_cache_block(cache % next_ion(i), level + 1, ntarg, [chain, rch])
            end do
        end if

    end subroutine print_integral_cache_block


    !> \brief   Calculate R-matrix from boundary amplitudes
    !> \authors J Benda
    !> \date    2024
    !>
    !> Evaluate the R-matrix for given irreducible representation from the formula
    !> \f[
    !>    R_{pq}(E) = \sum_j w_{jp} [E - e_j]^{-1} w_{jq}
    !> \f]
    !>
    !> \param[in] stage   Stage of the calculation (0 = initialize OpenCL buffers, 1 = calculate, 2 = release OpenCL buffers).
    !> \param[in] moldat  MolecularData object.
    !> \param[in] irr     Irreducible representation index.
    !> \param[in] etot    Total energy of the system.
    !> \param[in] P       Poles (1/(E - etot)).
    !> \param[inout] Pw   Ampae workspace (poles times boundary amplitudes).
    !> \param[inout] wPw  The resulting R-matrix.
    !>
    subroutine calculate_R_matrix (stage, moldat, irr, etot, P, Pw, wPw)

        use linalg_cl,        only: is_initialized_cl, residr_cl
        use multidip_io,      only: MolecularData, scale_boundary_amplitudes, apply_boundary_amplitudes

        type(MolecularData), intent(in) :: moldat

        integer,  intent(in)    :: stage, irr
        real(wp), intent(in)    :: etot, P(:)
        real(wp), intent(inout) :: Pw(:, :), wPw(:, :)

        integer(c_int)          :: cstge, nchan, nstat, ldwam
        real(wp), pointer       :: wamp(:, :)

        ! gpu code path
        if (is_initialized_cl() /= 0) then

            ! convert parameters to C integers
            cstge = int(stage, c_int)
            nchan = int(moldat % nchan(irr), c_int)
            nstat = int(moldat % mnp1(irr), c_int)
            ldwam = int(size(moldat % wamp(irr) % mat, 1), c_int)

            ! set up a pointer to the (possibly dummy) boundary amplitudes matrix
            if (stage == 0) then
                wamp => moldat % wamp(irr) % mat
            else
                allocate (wamp(0, 0))
            end if

            ! perform the calculation and go
            call residr_cl(cstge, nchan, nstat, 0_c_int, -1._wp, moldat % eig(:, irr), etot, wamp, ldwam, wPw)

            ! clean up the allocation
            if (stage /= 0) then
                deallocate (wamp)
            end if

            return

        end if

        ! non-gpu code path
        if (stage == 1) then
            call scale_boundary_amplitudes(moldat, irr, P, Pw)
            call apply_boundary_amplitudes(moldat, irr, 'N', Pw, wPw)
        end if

    end subroutine calculate_R_matrix


    !> \brief   Evaluate the correction dipole integral for all orders
    !> \author  J Benda
    !> \date    2020 - 2024
    !>
    !> If the parent intermediate state has a non-vanishing outer region part, integrate the final wave function with it
    !> and multiply by the associated expansion coefficients 'ap'. Then, iteratively, for all parent states of the parent
    !> state, add their contribution by means of the Coulomb-Green's function (multiplied by their expansion coefficients).
    !> The deeper in the absorption chain we get, the more dimensions the resulting Coulomb-Green's integral has.
    !>
    !> \param[in]  moldat  multidip_io::MolecularData object with data read from *molecular_data*.
    !> \param[in]  r0      Radius from which to apply asymptotic integrals.
    !> \param[in]  Ei      Total energy of the initial state (no photons absorbed).
    !> \param[in]  esc     Photoelectron energy in the first channel after absorption of photons at this order.
    !> \param[in]  omega   Energy of all photons.
    !> \param[in]  ie      Local energy index.
    !> \param[in]  state   Tree of parent intermediate states pointed to the last intermediate state.
    !> \param[in]  sb      Kind (sign) of the outer-most Coulomb-Hankel function.
    !> \param[out] dip     Evaluated multi-photon matrix element.
    !> \param[in]  cache   Precomputed integrals.
    !>
    subroutine multiint (moldat, r0, Ei, esc, omega, ie, state, sb, dip, cache)

        use mpi_gbl,         only: mpi_xermsg
        use multidip_io,     only: MolecularData
        use multidip_params, only: cache_integrals, closed_interm, closed_range, extend_istate

        type(MolecularData),              intent(in) :: moldat
        type(IntermediateState), pointer, intent(in) :: state
        type(integral_cache_t),           intent(in) :: cache(:)

        complex(wp),              intent(inout) :: dip(:)
        real(wp),    allocatable, intent(inout) :: omega(:)
        real(wp),                 intent(in)    :: Ei, esc, r0
        integer,                  intent(in)    :: ie, sb

        complex(wp), allocatable :: k(:)
        integer,     allocatable :: l(:), m(:)

        real(wp)    :: c, a, Ekf, ebase, echl, etarg
        integer     :: nphot, mgvn, ichan, irr, ichl, irch, ntarg

        dip = 0

        if (.not. associated(state % parent)) then      ! state with no parent should not be a final state in the first place
            call mpi_xermsg('multidip_routines', 'multiint', 'Runtime error: unexpected parent state', 1, 1)
        end if

        a = moldat % rmatr                              ! R-matrix radius
        c = 0                                           ! dipole damping factor
        nphot = state % order                           ! number of photons absorbed by the final state
        mgvn = state % mgvn                             ! MGVN of the final state
        irr = findloc(moldat % mgvns, mgvn, 1)          ! index of this spin-symmetry in molecular_data
        ebase = moldat % etarg(1)                       ! energy of the first ion state
        ntarg = size(moldat % etarg)                    ! number of residual ion states

        if (.not. extend_istate .and. nphot <= 1) return

        !$omp parallel default(none) &
        !$omp& shared(moldat, state, omega, dip, closed_range, closed_interm, cache_integrals, cache) &
        !$omp& private(k, l, m, ichan, ichl, irch, etarg, echl, Ekf) &
        !$omp& firstprivate(r0, c, esc, sb, ie, Ei, ebase, irr, nphot, ntarg)

        allocate (k(nphot + 1), l(nphot + 1), m(nphot))

        !$omp do schedule(dynamic)

        do ichan = 1, size(dip)

            ! set up quantum numbers for this channel
            ichl = moldat % ichl(ichan, irr)                        ! index of ion state this partial wave is coupled to
            etarg = moldat % etarg(ichl)                            ! energy of that ion state
            echl = etarg - ebase                                    ! excitation threshold of this ion state w.r.t. ion ground
            Ekf = esc - echl                                        ! always positive for final states (but not for intermediate)
            if (Ekf + closed_range <= 0) cycle                      ! ignore strongly closed channels
            if (Ekf < 0 .and. .not. closed_interm) cycle            ! optionally ignore also weakly closed channels
            k(1) = sqrt(2*abs(Ekf)); if (Ekf < 0) k(1) = k(1) * imu
            l(1) = moldat % l2p(ichan, irr)

            ! evaluate the correction dipole integral for this final channel
            if (cache_integrals) then
                irch = l(1)*ntarg + ichl
                dip(ichan) = multiint_chain(moldat, r0, Ei, esc - omega(nphot), omega, ie, c, 1, state, ichan, sb, k, l, m, &
                                            cache(ie) % next_pws(irch))
            else
                dip(ichan) = multiint_chain(moldat, r0, Ei, esc - omega(nphot), omega, ie, c, 1, state, ichan, sb, k, l, m)
            end if

        end do

        !$omp end do
        !$omp end parallel

    end subroutine multiint


    !> \brief   Calculate dipole correction integrals at given absorption depth
    !> \author  J Benda
    !> \date    2020 - 2024
    !>
    !> Recursively evaluates the outer region contributions multi-photon transition element contributions from all combinations
    !> of channels at all absorption levels that share the initial sequence given by k, l and m.
    !>
    !> \param[in]  moldat  multidip_io::MolecularData object with data read from *molecular_data*.
    !> \param[in]  r0      Radius from which to apply asymptotic integrals.
    !> \param[in]  Ei      Total energy of the initial state.
    !> \param[in]  esc     Photoelectron energy in the first channel after absorption of photons at this order.
    !> \param[in]  omega   Energy of all photons.
    !> \param[in]  ie      Local energy index.
    !> \param[in]  c       Dipole exponential damping coefficient.
    !> \param[in]  N       Index of the dipole operator to consider on this recursion level.
    !> \param[in]  state   Tree of parent intermediate states pointed to the last intermediate state.
    !> \param[in]  ichanf  Final channel after action of the dipole operator at this recursion level.
    !> \param[in]  sb      Kind (sign) of the outer-most Coulomb-Hankel function.
    !> \param[inout]  k    Linear momenta of the final and initial dipole transition channels for at previous recursion levels.
    !> \param[inout]  l    Angular momenta of the final and initial dipole transition channels for at previous recursion levels.
    !> \param[inout]  m    Angular projections of the final and initial dipole transition channels for at previous recursion levels.
    !> \param[in]  cache   Precomputed integrals.
    !>
    !> \return integ Evaluated multi-photon matrix element.
    !>
    recursive complex(wp) function multiint_chain (moldat, r0, Ei, esc, omega, ie, c, N, &
                                                   state, ichanf, sb, k, l, m, cache) result (integ)

        use multidip_integ,  only: nested_cgreen_integ
        use multidip_io,     only: MolecularData
        use multidip_params, only: cache_integrals, closed_interm, closed_range

        type(MolecularData),               intent(in) :: moldat
        type(IntermediateState), pointer,  intent(in) :: state
        type(integral_cache_t),  optional, intent(in) :: cache
        type(IntermediateState), pointer              :: parent

        complex(wp), intent(inout) :: k(:)
        real(wp),    intent(inout) :: omega(:)
        integer,     intent(inout) :: l(:), m(:)
        real(wp),    intent(in)    :: r0, Ei, esc
        integer,     intent(in)    :: ie, sb, N, ichanf

        complex(wp) :: ap, kr(N + 1)
        real(wp)    :: Z, c, a, Ek, Qion, Qpws, ebase, etarg, echl
        integer     :: mgvni, mgvnf, nchani, irri, irrf, ichani, ichl, irch, nphot, dcomp, lr(N + 1), mr(N), ntarg

        integ = 0

        ! no need to recurse further if the current state is the initial state
        if (.not. associated(state % parent)) return

        parent => state % parent                        ! previous intermediate state
        nphot = parent % order                          ! number of photons absorbed by the previous intermediate state
        ntarg = size(moldat % etarg)                    ! number of residual ion states
        ebase = moldat % etarg(1)                       ! energy of the ground ion state
        a = moldat % rmatr                              ! R-matrix radius
        Z = moldat % nz - moldat % nelc                 ! Residual ion charge
        dcomp = state % dcomp                           ! last polarisation component absorbed by the final state
        mgvnf = state % mgvn                            ! irreducible representation of the current state
        mgvni = parent % mgvn                           ! irreducible representation of the previos state
        irrf = findloc(moldat % mgvns, mgvnf, 1)        ! index of the current spin-symmetry in molecular_data
        irri = findloc(moldat % mgvns, mgvni, 1)        ! index of the previous spin-symmetry in molecular_data
        nchani = moldat % nchan(irri)                   ! number of channels in the previous state irreducible representation

        ! loop over all channels of the intermediate state that are dipole-coupled to ichanf
        do ichani = 1, nchani

            ! set up quantum numbers for this channel
            ichl = moldat % ichl(ichani, irri)                          ! index of ion state this pw is coupled to
            etarg = moldat % etarg(ichl)                                ! energy of that state
            echl = etarg - ebase                                        ! excitation threshold of this ion state w.r.t. ground ion
            Ek = esc - echl                                             ! can become negative starting from some channel
            if (Ek + closed_range <= 0) exit                            ! ignore strongly closed channels
            if (Ek < 0 .and. .not. closed_interm) exit                  ! optionally ignore also weakly closed channels
            l(N + 1) = moldat % l2p(ichani, irri)

            if (.not. cache_integrals) then
                k(N + 1) = sqrt(2*abs(Ek))
                if (Ek < 0) k(N + 1) = k(N + 1) * imu
                kr(1 : N + 1) = k(N + 1 : 1 : -1)
                lr(1 : N + 1) = l(N + 1 : 1 : -1)
            end if

            ! dipole angular integrals
            Qion = channel_coupling_ion(moldat, dcomp, irrf, irri, ichanf, ichani)
            Qpws = channel_coupling_pws(moldat, dcomp, irrf, irri, ichanf, ichani)

            if (Qion == 0 .and. Qpws == 0) cycle

            ! outer region expansion coefficient for this partial wave and energy
            ap = cmplx(parent % ap(ichani, 1, ie), parent % ap(ichani, 2, ie), wp)

            ! evaluate Coulomb-Green's integrals for dipole-coupled ions
            if (Qion /= 0) then
                if (cache_integrals) then
                    irch = 0
                    if (allocated(cache % rchs_ion)) then
                        irch = findloc(cache % rchs_ion, l(N + 1)*ntarg + ichl, 1)
                    end if
                    if (ap /= 0) then
                        if (irch == 0) then
                            print '(/,a,4(i0,a))', 'Error: Missing integral in cache for channel pair ', ichanf, ' (irr ', irrf, &
                            ') - ', ichani, ' (irr ', irri, ')!'
                            print '(7x,a,*(1x,i0))', 'Available reduced channels in cache:', cache % rchs_ion
                            print '(7x,3(a,i0),a)', 'Needed reduced channel: ', l(N + 1)*ntarg + ichl, &
                                                    ' (ichl = ', ichl, ', l = ', l(N + 1), ')'
                            error stop
                        end if
                        integ = integ + Qion * ap * cache % vals_ion((3 - sb)/2, irch)
                    end if
                    if (nphot > 0) then
                        if (irch == 0) then
                            print '(/,a,4(i0,a))', 'Error: Missing integral chain for channel pair ', ichanf, ' (irr ', irrf, &
                            ') - ', ichani, ' (irr ', irri, ')!'
                            print '(7x,a,*(1x,i0))', 'Available reduced channels in cache:', cache % rchs_ion
                            print '(7x,3(a,i0),a)', 'Needed reduced channel: ', l(N + 1)*ntarg + ichl, &
                                                    ' (ichl = ', ichl, ', l = ', l(N + 1), ')'
                            error stop
                        end if
                        integ = integ + Qion * multiint_chain(moldat, r0, Ei, esc - omega(nphot), omega, &
                                                              ie, c, N + 1, parent, ichani, sb, k, l, m, &
                                                              cache % next_ion(irch))
                    end if
                else
                    m(N) = 0
                    mr(1:N) = m(N:1:-1)
                    if (ap /= 0) integ = integ + Qion * ap * nested_cgreen_integ(Z, a, r0, c, N, +1, sb, mr, lr, kr)
                    if (nphot > 0) integ = integ + Qion * multiint_chain(moldat, r0, Ei, esc - omega(nphot), omega, &
                                                                         ie, c, N + 1, parent, ichani, sb, k, l, m)
                end if
            end if

            ! evaluate Coulomb-Green's integrals for dipole-coupled partial waves
            if (Qpws /= 0) then
                if (cache_integrals) then
                    irch = 0
                    if (allocated(cache % rchs_pws)) then
                        irch = findloc(cache % rchs_pws, l(N + 1)*ntarg + ichl, 1)
                    end if
                    if (ap /= 0) then
                        if (irch == 0) then
                            print '(/,a,4(i0,a))', 'Error: Missing integral in cache for channel pair ', ichanf, ' (irr ', irrf, &
                            ') - ', ichani, ' (irr ', irri, ')!'
                            print '(7x,a,*(1x,i0))', 'Available reduced channels in cache:', cache % rchs_pws
                            print '(7x,3(a,i0),a)', 'Needed reduced channel: ', l(N + 1)*ntarg + ichl, &
                                                    ' (ichl = ', ichl, ', l = ', l(N + 1), ')'
                            error stop
                        end if
                        integ = integ + Qpws * ap * cache % vals_pws((3 - sb)/2, irch)
                    end if
                    if (nphot > 0) then
                        if (irch == 0) then
                            print '(/,a,4(i0,a))', 'Error: Missing integral chain for channel pair ', ichanf, ' (irr ', irrf, &
                            ') - ', ichani, ' (irr ', irri, ')!'
                            print '(7x,a,*(1x,i0))', 'Available reduced channels in cache:', cache % rchs_pws
                            print '(7x,3(a,i0),a)', 'Needed reduced channel: ', l(N + 1)*ntarg + ichl, &
                                                    ' (ichl = ', ichl, ', l = ', l(N + 1), ')'
                            error stop
                        end if
                        integ = integ + Qpws * multiint_chain(moldat, r0, Ei, esc - omega(nphot), omega, &
                                                              ie, c, N + 1, parent, ichani, sb, k, l, m, &
                                                              cache % next_pws(irch))
                    end if
                else
                    m(N) = 1
                    mr(1:N) = m(N:1:-1)
                    if (ap /= 0) integ = integ + Qpws * ap * nested_cgreen_integ(Z, a, r0, c, N, +1, sb, mr, lr, kr)
                    if (nphot > 0) integ = integ + Qpws * multiint_chain(moldat, r0, Ei, esc - omega(nphot), omega, &
                                                                         ie, c, N + 1, parent, ichani, sb, k, l, m)
                end if
            end if

        end do

    end function multiint_chain


    !> \brief   Ion channel dipole coupling
    !> \author  J Benda
    !> \date    2020 - 2024
    !>
    !> Returns the ion transition dipole element between the channels. This is diagonal in
    !> quantum numbers of the partial waves and simply equal to the corresponding N-electron
    !> propery integral.
    !>
    !> \param[in]  moldat  multidip_io::MolecularData object with data read from *molecular_data*.
    !> \param[in]  dcomp   Index of the Cartesian component of the dipole operator.
    !> \param[in]  irrf    Irreducible representation of the final state channel.
    !> \param[in]  irri    Irreducible representation of the initial state channel.
    !> \param[in]  ichanf  Index of the final state channel.
    !> \param[in]  ichani  Index of the initial state channel.
    !>
    real(wp) function channel_coupling_ion (moldat, dcomp, irrf, irri, ichanf, ichani) result (Qion)

        use multidip_io,     only: MolecularData
        use multidip_params, only: carti

        type(MolecularData), intent(in) :: moldat
        integer,             intent(in) :: dcomp, irrf, irri, ichanf, ichani

        integer :: itargi, itargf, li, lf, mi, mf

        lf = moldat % l2p(ichanf, irrf); mf = moldat % m2p(ichanf, irrf); itargf = moldat % ichl(ichanf, irrf)
        li = moldat % l2p(ichani, irri); mi = moldat % m2p(ichani, irri); itargi = moldat % ichl(ichani, irri)

        if (lf /= li .or. mf /= mi) then
            Qion = 0
        else
            Qion = moldat % crlv(itargf, itargi, carti(dcomp))
        end if

    end function channel_coupling_ion


    !> \brief   Partial wave channel dipole coupling
    !> \author  J Benda
    !> \date    2020 - 2024
    !>
    !> Returns the partial wave transition dipole element between the channels. This is
    !> diagonal in the ion states and proportional to the Gaunt coefficient.
    !>
    !> \param[in]  moldat  multidip_io::MolecularData object with data read from *molecular_data*.
    !> \param[in]  dcomp   Index of the Cartesian component of the dipole operator.
    !> \param[in]  irrf    Irreducible representation of the final state channel.
    !> \param[in]  irri    Irreducible representation of the initial state channel.
    !> \param[in]  ichanf  Index of the final state channel.
    !> \param[in]  ichani  Index of the initial state channel.
    !>
    real(wp) function channel_coupling_pws (moldat, dcomp, irrf, irri, ichanf, ichani) result (Qpws)

        use multidip_io,     only: MolecularData
        use multidip_params, only: cartm

        type(MolecularData), intent(in) :: moldat
        integer,             intent(in) :: dcomp, irrf, irri, ichanf, ichani

        integer :: itargi, itargf, li, lf, mi, mf, ipwi, ipwf

        itargf = moldat % ichl(ichanf, irrf)
        itargi = moldat % ichl(ichani, irri)

        if (itargf /= itargi) then
            Qpws = 0
        else
            lf = moldat % l2p(ichanf, irrf); mf = moldat % m2p(ichanf, irrf); ipwf = lf*(lf + 1) + mf + 1
            li = moldat % l2p(ichani, irri); mi = moldat % m2p(ichani, irri); ipwi = li*(li + 1) + mi + 1
            Qpws = sqrt(4*pi/3) * moldat % gaunt(ipwf, ipwi, cartm(dcomp))
        end if

    end function channel_coupling_pws


    !> \brief   Calculate partial wave dipoles, oriented dipoles and cross sections
    !> \author  J Benda
    !> \date    2020 - 2023
    !>
    !> Given the uncontracted partial wave dipoles
    !> \f[
    !>     M_{i_f, l_f, m_f, j_1, \dots, j_n}^{(n)}
    !> \f]
    !> calculated in \ref extract_dipole_elements, where \f$ i_f \f$ is the index of the final ion state, \f$ l_f \f$ and
    !> \f$ m_f \f$ denote the emission partial wave and \f$ j_1, \dots, j_n \f$ are the indices of components of the polarisation
    !> vectors, evaluate the partial wave transition matrix elements
    !> \f[
    !>     M_{i_f, l_f, m_f} = \mathrm{i}^{-l_f} \mathrm{e}^{\mathrm{i} \sigma_f} \sum_{j_1, \dots, j_n}
    !>                         \epsilon_{j_1} \dots \epsilon_{j_n} M_{i_f, l_f, mf_, j_1, \dots, j_n}^{(n)}
    !> \f]
    !> contracted with the polarisation vectors themselves, and the fixed-orientation generalized cross section
    !> \f[
    !>    \sigma_f^{(n)} = 2\pi (2\pi\alpha\omega)^n \sum_{l_f m_f} \left| M_{i_f, l_f, m_f} \right|^2
    !> \f]
    !>
    !> \param[in]  moldat   multidip_io::MolecularData object with data read from the file *molecular_data*.
    !> \param[in]  order    Perturbation order matrix elements (= number of absorbed photons).
    !> \param[in]  state    Tree of intermediate and final states.
    !> \param[in]  escat    Scattering energies in a.u., as stored in the K-matrix files.
    !> \param[in]  calc_Ei  Initial state energy calculated by (MPI-)SCATCI or BOUND.
    !> \param[in]  first_IP Ionization potential according to the input namelist.
    !> \param[in]  Ephoton  Fixed photon energies in a.u. or zeros for flexible photons.
    !> \param[in]  polar    Photon polarisations or zeros for polarisation averaging.
    !> \param[in]  erange   Range (subset) of energies in K-matrix files to use for calculations.
    !>
    subroutine calculate_pw_transition_elements (moldat, order, state, escat, calc_Ei, first_IP, Ephoton, polar, erange)

        use multidip_io,      only: MolecularData, myproc, nprocs, write_partial_wave_moments, write_cross_section
        use multidip_params,  only: alpha, maxtarg
        use multidip_special, only: cphase

        type(MolecularData),              intent(in) :: moldat
        type(IntermediateState), pointer, intent(in) :: state
        type(IntermediateState), pointer             :: ptr, parent

        integer,     intent(in) :: order, erange(2)
        real(wp),    intent(in) :: escat(:), calc_Ei, first_IP, Ephoton(:)
        complex(wp), intent(in) :: polar(:, :)
        integer                 :: nesc, ntarg, nchan, mxchan, lf, ichan, itarg, ie, irr, mgvn, mye, nirr, nene
        real(wp)                :: Z, Ef, Ei, kf, sigma, Q
        complex(wp)             :: d, ej
        complex(wp), allocatable :: M(:, :, :)
        real(wp),    allocatable :: cs(:, :), omega(:)

        Z = moldat % nz - moldat % nelc         ! residual charge
        nirr = size(moldat % mgvns)             ! number of irreducible representations in molecular_data
        nesc = size(escat)                      ! total number of scattering energies
        mxchan = maxval(moldat % nchan)         ! maximal number of channels per irreducible representation
        ntarg = size(moldat % etarg)            ! number of targes (residual ions)
        nene = (nesc + nprocs - 1) / nprocs     ! number of scattering energies managed by this image

        ! if not all final targets are required, reduce some of the above limits
        if (maxtarg > 0) then
            mxchan = 0
            do irr = 1, nirr
                nchan = moldat % nchan(irr)
                mxchan = max(mxchan, count(moldat % ichl(1:nchan, irr) <= maxtarg))
            end do
            ntarg = min(ntarg, maxtarg)
        end if

        ! set up final storage
        allocate (omega(order), M(mxchan, nene, 0:7), cs(2 + ntarg, nene))
        M = 0
        cs = 0

        ! the dipole elements stored in molecular_data do not contain the charge, so for each transition we need a factor of -1
        Q = (-1)**order

        ! calculate fixed-in-space transition matrix elements
        ptr => state
        do while (associated(ptr))
            if (ptr % order == order) then

                ! get polarisation factor for this absorption chain
                ej = polar(ptr % dcomp, ptr % order)
                parent => ptr % parent
                do while (associated(parent % parent))
                    ej = ej * polar(parent % dcomp, parent % order)
                    parent => parent % parent
                end do

                ! add transition element contribution from this absorption chain
                mye = 0
                do ie = myproc, nesc, nprocs
                    mye = mye + 1
                    do ichan = 1, size(ptr % dip, 1)
                        d = ej * cmplx(ptr % dip(ichan, 1, mye), ptr % dip(ichan, 2, mye), wp)
                        M(ichan, mye, ptr % mgvn) = M(ichan, mye, ptr % mgvn) + Q * d
                    end do
                end do

            end if
            ptr => ptr % next
        end do

        ! write partial wave transition moments without Coulomb phase factor
        call write_partial_wave_moments(moldat, M, nesc, '')

        ! process transition matrix elements
        mye = 0
        do ie = myproc, nesc, nprocs

            mye = mye + 1
            Ei = calc_Ei
            call calculate_photon_energies(first_IP, escat(ie), moldat % etarg(1), Ei, Ephoton, omega)

            ! add phase factors to transition matrix elements
            do irr = 1, nirr
                mgvn = moldat % mgvns(irr)
                nchan = min(int(moldat % nchan(irr)), mxchan)
                do ichan = 1, nchan
                    lf = moldat % l2p(ichan, irr)
                    Ef = escat(ie) - moldat % etarg(moldat % ichl(ichan, irr)) + moldat % etarg(1)
                    if (Ef > 0) then
                        kf = sqrt(2*Ef)
                        sigma = cphase(Z, lf, kf)
                        M(ichan, mye, mgvn) = M(ichan, mye, mgvn) * imu**(-lf) * (cos(sigma) + imu*sin(sigma))
                    end if
                end do
            end do

            ! calculate oriented cross section
            cs(1, mye) = omega(1) * to_eV
            do itarg = 1, ntarg
                cs(2 + itarg, mye) = 0
                do irr = 1, nirr
                    mgvn = moldat % mgvns(irr)
                    nchan = min(int(moldat % nchan(irr)), mxchan)
                    do ichan = 1, nchan
                        if (itarg == moldat % ichl(ichan, irr)) then
                            d = M(ichan, mye, mgvn)
                            cs(2 + itarg, mye) = cs(2 + itarg, mye) + real(d * conjg(d))
                        end if
                    end do
                end do
            end do
            cs(2, mye) = sum(cs(3:, mye))
            cs(2:, mye) = cs(2:, mye) * 2*pi * (2*pi*alpha)**order * product(omega)

        end do

        ! write partial wave transition moments with Coulomb phase factor
        call write_partial_wave_moments(moldat, M, nesc, '+cphase')

        ! write fixed-in-space cross section
        call write_cross_section(cs, nesc, erange, 'gen_photo_xsec.txt')

    end subroutine calculate_pw_transition_elements


    !> \brief   Calculate cross sections and asymmetry parameters
    !> \author  J Benda
    !> \date    2020 - 2023
    !>
    !> Evaluate coefficients \f$ \beta_L \f$ in the expansion of differential cross section averaged over
    !> molecular orientation:
    !> \f[
    !>     \frac{\mathrm{d}\sigma}{\mathrm{d}\Omega} = \frac{\beta_0}{4\pi} \left(1+\sum_{L=1}^{2n} \beta_L P_L(\cos\theta) \right).
    !> \f]
    !> The quantity \f$ \beta_0 \f$ is equal to the integral cross section.
    !>
    !> \param[in]  moldat        multidip_io::MolecularData object with data read from the file *molecular_data*.
    !> \param[in]  order         Perturbation order matrix elements (= number of absorbed photons).
    !> \param[in]  state         Tree of intermediate and final states.
    !> \param[in]  escat         Scattering energies in a.u., as stored in the K-matrix files.
    !> \param[in]  calc_Ei       Initial state energy calculated by (MPI-)SCATCI or BOUND.
    !> \param[in]  first_IP      Ionization potential according to the input namelist.
    !> \param[in]  Ephoton       Fixed photon energies in a.u. or zeros for flexible photons.
    !> \param[in]  raw           Write raw transition dipoles (in spherical or cartesian basis).
    !> \param[in]  erange        Range (subset) of energies in K-matrix files to use for calculations.
    !> \param[in]  p             Laboratory-frame polarisation (i.e. one of: -1,0,1) for each photon absorption.
    !> \param[in]  lu_pw_dipoles Base number for the logical unit for saving the pw dipoles in RSOLVE format (typically 410).
    !>
    subroutine calculate_asymmetry_parameters (moldat, order, state, escat, calc_Ei, first_IP, Ephoton, raw, erange, p, &
                                               lu_pw_dipoles)

        use multidip_io,       only: MolecularData, myproc, nprocs, write_raw_dipoles, write_cross_section, write_rsolve_dipoles
        use multidip_params,   only: alpha, maxtarg
        use multidip_special,  only: cphase

        character(len=*),                 intent(in) :: raw
        type(MolecularData),              intent(in) :: moldat
        type(IntermediateState), pointer, intent(in) :: state
        type(IntermediateState), pointer             :: ptr, parent

        integer,     intent(in) :: order, erange(2), p(:), lu_pw_dipoles
        real(wp),    intent(in) :: escat(:), calc_Ei, first_IP, Ephoton(:)

        integer                  :: nesc, ii, li, mi, nchain, ichain
        integer                  :: ntarg, ichan, ie, mye, irr, L, nene, maxl, pw
        integer,     allocatable :: chains(:, :)
        real(wp),    allocatable :: omega(:), cs(:, :)
        complex(wp), allocatable :: M_xyz(:, :, :, :), M_xyz_no_phase(:, :, :, :), M_sph(:, :, :, :), beta(:, :)
        complex(wp)              :: d
        real(wp)                 :: Z, Ek, k, sigma, Ei, Q
        character(len=50)        :: filename

        Z = moldat % nz - moldat % nelc
        nesc = size(escat)                      ! total number of scattering energies
        nene = (nesc + nprocs - 1) / nprocs     ! number of scattering energies managed by this image
        maxl = maxval(moldat % l2p)             ! highest partial wave angular momentum
        ntarg = size(moldat % etarg)            ! number of targes (residual ions)

        ! if not all final targets are required, reduce some of the above limits
        if (maxtarg > 0) then
            ntarg = min(ntarg, maxtarg)
            maxl = maxval(moldat % l2p, moldat % ichl <= maxtarg)
        end if

        ! find out how many absorption chains we have
        nchain = 0
        ptr => state
        do while (associated(ptr))
            if (ptr % order == order) then
                nchain = nchain + 1
            end if
            ptr => ptr % next
        end do

        allocate (omega(order), chains(order, nchain), cs(ntarg, nene), beta(2 + ntarg, nesc), &
                  M_xyz(nene, (maxl + 1)**2, nchain, ntarg), &
                  M_sph(nene, (maxl + 1)**2, nchain, ntarg))


        M_xyz = 0            ! multiphoton transition matrix elements in Cartesian basis
        M_sph = 0            ! multiphoton transition matrix elements in spherical basis

        if (order == 1) then
           allocate(M_xyz_no_phase(nene, (maxl + 1)**2, nchain, ntarg))
           M_xyz_no_phase = 0   ! multiphoton transition matrix elements in Cartesian basis without the additional phase factors
        endif

        ! the dipole elements stored in molecular_data do not contain the charge, so for each transition we need a factor of -1
        Q = (-1)**order

        ! assemble multi-photon molecular transition elements in Cartesian basis
        ichain = 0
        ptr => state
        do while (associated(ptr))
            if (ptr % order == order) then

                ! find this irreducible representation in molecular_data
                irr = findloc(moldat % mgvns, ptr % mgvn, 1)

                ! assemble polarisation components for this absorption chain
                ichain = ichain + 1
                parent => ptr
                do while (associated(parent % parent))
                    chains(parent % order, ichain) = mod(1 + parent % dcomp, 3) - 1  ! y ~ -1, z ~ 0, x ~ +1
                    parent => parent % parent
                end do

                ! copy all Cartesian matrix elements for this irreducible representation and absorption history to M_xyz
                do ichan = 1, size(ptr % dip, 1)
                    ii = moldat % ichl(ichan, irr)
                    li = moldat % l2p(ichan, irr)
                    mi = moldat % m2p(ichan, irr)
                    pw = li*li + li + mi + 1
                    mye = 0
                    do ie = myproc, nesc, nprocs
                        mye = mye + 1
                        Ek = escat(ie) - moldat % etarg(ii) + moldat % etarg(1)
                        if (Ek > 0) then
                            k = sqrt(2*Ek)
                            sigma = cphase(Z, li, k)
                            d = cmplx(ptr % dip(ichan, 1, mye), ptr % dip(ichan, 2, mye), wp)
                            if (.not. d == d) d = 0  ! clear NaNs
                            M_xyz(mye, pw, ichain, ii) = imu**(-li) * (cos(sigma) + imu*sin(sigma)) * Q * d
                            if (order == 1) M_xyz_no_phase(mye, pw, ichain, ii) = Q * d
                        end if
                    end do
                end do

            end if
            ptr => ptr % next
        end do

        ! for 1-photon case write the dipoles always in the RSOLVE format
        if (order == 1) then
           call write_rsolve_dipoles(moldat, M_xyz_no_phase, chains, escat, lu_pw_dipoles)
        endif

        if (raw == 'xyz' .or. raw == 'both') then
            call write_raw_dipoles(M_xyz, chains, nesc, 'xyz')
        end if

        ! transform the multiphoton transition matrix elements from Cartesian basis (M_xyz) to spherical basis (M_sph)
        call convert_xyz_to_sph (M_xyz, M_sph, maxl, chains)

        if (raw == 'sph' .or. raw == 'both') then
            call write_raw_dipoles(M_sph, chains, nesc, 'sph')
        end if

        ! evaluate and write the asymmetry parameters for all possible orders
        do L = 0, 2*order

            print '(/,A,I0,A,*(1x,I0))', 'Evaluating asymmetry parameter for L = ', L, ', p =', p(1:order)

            ! calculate the absolute asymmetry parameter
            call calculate_quadratic_dipole_sph(beta, L, maxl, chains, chains, ntarg, nesc, M_sph, M_sph, p)

            ! sum partial cross section, add prefactors
            mye = 0
            do ie = myproc, nesc, nprocs
                mye = mye + 1
                Ei = calc_Ei
                call calculate_photon_energies(first_IP, escat(ie), moldat % etarg(1), Ei, Ephoton, omega)
                beta(1, mye) = omega(1) * to_eV
                beta(2:, mye) = beta(2:, mye) * 2*pi * (2*pi*alpha)**order * product(abs(omega))
                if (L == 0) then
                    cs(:, mye) = real(beta(3:, mye)) ! partial cross sections per target
                    beta(2, mye) = sum(cs(:, mye))   ! total cross section
                else
                    where (cs(:, mye) > 0)
                        beta(3:, mye) = beta(3:, mye) / cs(:, mye)  ! scale partial asymmetry parameter by the partial cross section
                    end where
                    beta(2, :) = 0                                  ! there is no "total beta" for L > 0
                end if
            end do

            ! write the asymmetry parameter
            write (filename, '(A,I0,A,I0,A)') 'gen_', order, 'photo_beta_', L, '.txt'
            call write_cross_section(real(beta, wp), nesc, erange, filename)

        end do

    end subroutine calculate_asymmetry_parameters


    !> \brief  Change coordiantes
    !> \author J Benda
    !> \date   2021
    !>
    !> Transform the multi-photon matrix elements from the Cartesian basis to the spherical basis.
    !> It is expected that both `M_xyz` and `M_sph` arrays are allocated and have the same size.
    !>
    subroutine convert_xyz_to_sph (M_xyz, M_sph, maxl, chains)

        use dipelm_special_functions, only: sph_basis_transform_elm

        complex(wp), allocatable, intent(in)    :: M_xyz(:, :, :, :)
        complex(wp), allocatable, intent(inout) :: M_sph(:, :, :, :)
        integer,                  intent(in)    :: maxl, chains(:, :)

        integer     :: order, ntarg, nchain, i, ii, l, mi, mj, pwi, pwj, ichain, jchain
        complex(wp) :: cpl

        order = size(chains, 1)
        nchain = size(chains, 2)
        ntarg = size(M_xyz, 4)

        M_sph = 0

        do ii = 1, ntarg
            do ichain = 1, nchain
                do jchain = 1, nchain
                    do l = 0, maxl
                        do mi = -l, l
                            do mj = -l, l
                                cpl = conjg(sph_basis_transform_elm(l, mj, mi, 'Slm'))
                                do i = 1, order
                                    cpl = cpl * sph_basis_transform_elm(1, chains(i, jchain), chains(i, ichain), 'Slm')
                                end do
                                pwi = l*l + l + mi + 1
                                pwj = l*l + l + mj + 1
                                M_sph(:, pwj, jchain, ii) = M_sph(:, pwj, jchain, ii) + cpl * M_xyz(:, pwi, ichain, ii)
                            end do
                        end do
                    end do
                end do
            end do
        end do

    end subroutine convert_xyz_to_sph


    !> \brief   Evaluate asymmetry parameter for given total L in the spherical basis
    !> \author  J Benda
    !> \date    2021 - 2024
    !>
    !> Evaluate quadratic form in partial wave dipoles (e.g. asymmetry parameter beta_L) for all final states.
    !> For cross sections, the (spherical) matrix elements M1 and M2 correspond to the same energies. However, for RABITT, these
    !> are evaluated at different energies.
    !>
    subroutine calculate_quadratic_dipole_sph (beta, L, maxl, chains1, chains2, ntarg, nesc, M1, M2, p)

        use mpi_gbl,          only: mpi_reduceall_inplace_sum_wp
        use multidip_io,      only: myproc, nprocs
        use multidip_special, only: beta_contraction_tensor

        complex(wp), allocatable, intent(inout) :: beta(:, :)
        complex(wp), allocatable, intent(in)    :: M1(:, :, :, :), M2(:, :, :, :)
        integer,                  intent(in)    :: L, maxl, chains1(:, :), chains2(:, :), ntarg, nesc, p(:)

        complex(wp)           :: MM
        real(wp), allocatable :: T(:, :, :, :), buffer(:)
        integer               :: ie, mye, itarg, idx, li, mi, lj, mj, pwi, pwj, ichain, jchain, nchain1, nchain2, order1, order2
        integer               :: qi(size(chains1, 1)), qj(size(chains2, 1))

        if (any(abs(p) > 1)) then
           print '(/,A,I0)','calculate_quadratic_dipole_sph: lab-frame polarisation p out of range', p
           stop 1
        endif

        beta = 0
        order1 = size(chains1, 1); nchain1 = size(chains1, 2)
        order2 = size(chains2, 1); nchain2 = size(chains2, 2)

        if (order1 /= order2) then
            print '(A)', 'WARNING: calculate_quadratic_dipole_sph is implemented only equal absorption orders'
            return
        end if

        allocate (T((maxl + 1)**2, (maxl + 1)**2, nchain1, nchain2))

        ! erase the angular integrals storage
        !$omp parallel do default(none) private(pwi, pwj, ichain, jchain) shared(maxl, nchain1, nchain2, T) collapse(4)
        do jchain = 1, nchain2
            do ichain = 1, nchain1
                do pwj = 1, (maxl + 1)**2
                    do pwi = 1, (maxl + 1)**2
                        T(pwi, pwj, ichain, jchain) = 0
                    end do
                end do
            end do
        end do

        ! calculate angular integrals (distribute over images and threads)
        !$omp parallel do default(none) schedule(dynamic) private(idx, pwi, pwj, li, lj, mi, mj, ichain, jchain, qi, qj) &
        !$omp& shared(nchain1, nchain2, maxl, chains1, chains2, nprocs, myproc, L, p, T, order1)
        do idx = myproc, (maxl + 1)**4 * nchain1 * nchain2, nprocs

            ! unpack idx = ((pwi*(maxl + 1)**2 + pwj)*nchain1 + ichain - 1)*nchain2 + jchain
            pwi    = 1 +     (idx - 1) / (nchain2 * nchain1 * (maxl + 1)**2)    ! = 1, ..., (maxl + 1)^2
            pwj    = 1 + mod((idx - 1) / (nchain2 * nchain1), (maxl + 1)**2)    ! = 1, ..., (maxl + 1)^2
            ichain = 1 + mod((idx - 1) /  nchain2,  nchain1)                    ! = 1, ..., nchain1
            jchain = 1 + mod( idx - 1,    nchain2)                              ! = 1, ..., nchain2

            ! unpack pwi = li*li + li + mi + 1
            li = floor(sqrt(pwi - 1._wp))
            mi = pwi - li*li - li - 1
            qi = chains1(:, ichain)

            ! unpack pwj = lj*lj + lj + mj + 1
            lj = floor(sqrt(pwj - 1._wp))
            mj = pwj - lj*lj - lj - 1
            qj = chains2(:, jchain)

            T(pwi, pwj, ichain, jchain) = beta_contraction_tensor(L, order1, p, li, mi, qi, lj, mj, qj)

        end do

        ! allreduce the contraction tensor
        buffer = reshape(T, [size(T)])
        call mpi_reduceall_inplace_sum_wp(buffer, size(buffer))
        T = reshape(buffer, shape(T))

        ! calculate the asymmetry parameter for this image's energies (distribute over threads)
        !$omp parallel do default(none) private(idx, itarg, pwi, pwj, ichain, jchain, li, mi, lj, mj, mye, MM) reduction(+:beta) &
        !$omp& shared(ntarg, maxl, nchain1, nchain2, myproc, nesc, nprocs, M1, M2, T)
        do idx = 1, ntarg * (maxl + 1)**4 * nchain1 * nchain2

            ! unpack idx = (((itarg - 1)*(maxl + 1)**2 + pwi)*(maxl + 1)**2 + pwj)*nchain1 + ichain - 1)*nchain2 + jchain
            itarg  = 1 +     (idx - 1) / (nchain2 * nchain1 * (maxl + 1)**4)                    ! = 1, ..., ntarg
            pwi    = 1 + mod((idx - 1) / (nchain2 * nchain1 * (maxl + 1)**2), (maxl + 1)**2)    ! = 1, ..., (maxl + 1)^2
            pwj    = 1 + mod((idx - 1) / (nchain2 * nchain1), (maxl + 1)**2)                    ! = 1, ..., (maxl + 1)^2
            ichain = 1 + mod((idx - 1) /  nchain2,  nchain1)                                    ! = 1, ..., nchain1
            jchain = 1 + mod( idx - 1,    nchain2)                                              ! = 1, ..., nchain2

            ! unpack pwi = li*li + li + mi
            li = floor(sqrt(pwi - 1._wp))
            mi = pwi - li*li - li - 1

            ! unpack pwj = lj*lj + lj + mj
            lj = floor(sqrt(pwj - 1._wp))
            mj = pwj - lj*lj - lj - 1

            mye = 0
            do ie = myproc, nesc, nprocs
                mye = mye + 1
                MM = conjg(M1(mye, pwi, ichain, itarg)) * M2(mye, pwj, jchain, itarg)
                beta(2 + itarg, mye) = beta(2 + itarg, mye) + T(pwi, pwj, ichain, jchain) * MM
            end do

        end do

    end subroutine calculate_quadratic_dipole_sph


    !> \brief   Evaluate asymmetry parameter for given total L in the Cartesian basis
    !> \author  J Benda
    !> \date    2021 - 2023
    !>
    !> Evaluate quadratic form in partial wave dipoles (e.g. asymmetry parameter beta_L) for all final states.
    !> For cross sections, the (Cartesian) matrix elements M1 and M2 correspond to the same energies. However, for RABITT, these
    !> are evaluated at different energies.
    !>
    !> \note This subroutine is implemented only for linear polarisation and only for L = 0.
    !>
    subroutine calculate_quadratic_dipole_xyz (beta, L, maxl, chains1, chains2, ntarg, nesc, M1, M2)

        use multidip_io,      only: myproc, nprocs
        use multidip_special, only: cartesian_vector_component_product_average

        complex(wp), allocatable, intent(inout) :: beta(:, :)
        complex(wp), allocatable, intent(in)    :: M1(:, :, :, :), M2(:, :, :, :)
        integer,                  intent(in)    :: L, maxl, chains1(:, :), chains2(:, :), ntarg, nesc

        complex(wp) :: MM
        real(wp)    :: T
        integer     :: ie, mye, itarg, pwi, qi(size(chains1, 1)), qj(size(chains2, 1)), ichain, jchain

        beta = 0

        if (L /= 0) then
            print '(A)', 'WARNING: calculate_quadratic_dipole_xyz is implemented only for L = 0'
            return
        end if

        ! calculate the asymmetry parameter
        !$omp parallel do collapse(2) default(none) private(ichain, jchain, qi, qj, T, pwi, mye, ie, MM) reduction(+:beta) &
        !$omp& shared(chains1, chains2, maxl, myproc, nesc, nprocs, ntarg, M1, M2)
        do ichain = 1, size(chains1, 2)
            do jchain = 1, size(chains2, 2)
                qi = chains1(:, ichain)
                qj = chains2(:, jchain)
                T = cartesian_vector_component_product_average([qi, qj])
                do pwi = 1, (maxl + 1)**2
                    mye = 0
                    do ie = myproc, nesc, nprocs
                        mye = mye + 1
                        do itarg = 1, ntarg
                            MM = conjg(M1(mye, pwi, ichain, itarg)) * M2(mye, pwi, jchain, itarg)
                            beta(2 + itarg, mye) = beta(2 + itarg, mye) + T * MM
                        end do
                    end do
                end do
            end do
        end do

    end subroutine calculate_quadratic_dipole_xyz

end module multidip_routines
