Generated from subdiag_mpi.c with ROBODoc v3.2.2 on Fri Sep 14 14:23:24 2001

TABLE OF CONTENTS

  1. QMD-MGDFT/subdiag_mpi.c

QMD-MGDFT/subdiag_mpi.c

NAME
   Ab initio real space code with multigrid acceleration
   Quantum molecular dynamics package.
   Version: 2.1.5
COPYRIGHT
   Copyright (C) 1995  Emil Briggs
   Copyright (C) 1998  Emil Briggs, Charles Brabec, Mark Wensell, 
                       Dan Sullivan, Chris Rapcewicz, Jerzy Bernholc
   Copyright (C) 2001  Emil Briggs, Wenchang Lu,
                       Marco Buongiorno Nardelli,Charles Brabec, 
                       Mark Wensell,Dan Sullivan, Chris Rapcewicz,
                       Jerzy Bernholc
FUNCTION
   void subdiag(STATE *states, REAL *vh, REAL *vnuc, REAL *vxc)
   This is the version of subdiag for message passing machines(MPI).
   Subspace diagonalizer for the Mehrstellen discretization of the
   Kohn-Sham Hamiltonian.

    1. Computes <i |A + BV| j> and <i |B| j>
    2. Generates the inverse of <i |B| j> and then multiplies <i |A + BV| j>
       by this inverse.
    3. Diagonalizes the resulting matrix and then remixes the eigenvectors
       accordingly.
INPUTS
   states: points to orbital structure (see md.h)
   vh:     Hartree potential
   vnuc:   pseudopotential
   vxc:    exchange correlation potential
OUTPUT
   wave functions and eigenvalues in states are updated
PARENTS
   cdfastrlx.c fastrlx.c moldyn.c init.c quench.c
CHILDREN
   global_sums.c app_cir.c app_nl.c genvpsi.c pack_ptos.c app_cilr.c 
   gather_psi.c trade_images.c app_grad.c
SOURCE
    #include "md.h"
    #include <float.h>
    #include <math.h>
    #include <stdlib.h>
    #include <stdio.h>
    
    #if (MACHINE_TYPE == PARALLEL_MESSAGE)
    
    #if (CRAY_T3E || CRAY_C90)
    #include <fortran.h>
    
    
    void spotrf(_fcd uplo, int *n, REAL *a, int *lda, int *info);
    void spotri(_fcd uplo, int *n, REAL *a, int *lda, int *info);
    void ssyev(_fcd jobz, _fcd uplo, int *n, REAL *a, int *lda,
                  REAL *w, REAL *work, int *lwork, int *info);
    void cpotrf(_fcd uplo, int *n, REAL *a, int *lda, int *info); 
    
    #endif
    
    void subdiag1_mpi(int istate, STATE *states, REAL *Aij, REAL *Bij); 
    void subdiag2_mpi(REAL *Aij, REAL *base_mem);
    
    
    void subdiag(STATE *states, REAL *vh, REAL *vnuc, REAL *vxc)
    {
        int idx, ip, st1, st2;
        int stop;
        int t1, kidx;
        REAL *work1, *work2, *work3;
        int ione = 1; /* blas constants */
        int lwork;
        char *uplo="l", *jobz="v";
    
        int info = 0;
        REAL time1, time2;
        REAL *Aij, *Bij, *Cij;
        STATE *sp;
        ION *iptr;
        REAL *work1R, *work1I;
    
    #if (CRAY_T3E || CRAY_C90)
        _fcd uplo_fcd,jobz_fcd;
    #endif
    barrier();
    
        time1 = my_crtc();
    
        if(pct.thispe == 0) printf("\n SUBSPACE DIAGONALIZATION");  
    
    
        /* Grab some temporary storage */
        stop = ct.num_states * ct.num_states;
    #if !GAMMA_PT
        stop = 2 * ct.num_states * ct.num_states;
    #endif
        Aij = (REAL *)get_mem(stop + 256);
        Bij = (REAL *)get_mem(stop + 256);
        Cij = (REAL *)get_mem(stop + 256);
        work1 = (REAL *)get_mem(26 * ct.num_states);
        work2 = work1 + 8*ct.num_states;
        work3 = work2 + 8*ct.num_states;
        work1R = work3 + 8*ct.num_states;
        work1I = work1R + ct.num_states;
    
        for(idx = 0;idx < stop;idx++) {
          Aij[idx] = Bij[idx] = Cij[idx] = 0.0;
        }
    
        kidx = states[0].kidx;
    
        for(st1 = 0;st1 < ct.num_states;st1++) {
    
          subdiag1_mpi(st1, states, Aij, Bij);
    
        } /* end for */
    
        /* Sum A and B overlap matrices over all processors */
        global_sums(Bij, &stop); 
        global_sums(Aij, &stop); 
    
        /* Generate the inverse of the B matrix and */
        /* compute the cholesky factor of the matrix w/LAPACK */
    #if GAMMA_PT
        cholesky(Bij, ct.num_states);
    #if CRAY_T3E
        uplo_fcd = _cptofcd(uplo, 1);  
        /* now get the inverse */
        spotri(uplo_fcd, &ct.num_states, Bij, &ct.num_states, &info); 
    #else
        spotri(uplo, &ct.num_states, Bij, &ct.num_states, &info); 
    #endif
    
    #else
    #if CRAY_T3E
        uplo_fcd = _cptofcd(uplo, 1);  
        cpotrf(uplo_fcd, &ct.num_states, Bij, &ct.num_states, &info);
    
        if (info != 0)
           error_handler("subdiag","cholesky factor of Bij failed");
    
        cpotri(uplo_fcd, &ct.num_states, Bij, &ct.num_states, &info);
    #else
        cpotrf(uplo, &ct.num_states, Bij, &ct.num_states, &info);
    
        if (info != 0)
           error_handler("subdiag","cholesky factor of Bij failed");
    
        cpotri(uplo, &ct.num_states, Bij, &ct.num_states, &info);
    
    #endif
    #endif
    
        if (info != 0)
           error_handler("subdiag","inverse of Bij failed");
    
    
        for(st1=0;st1<ct.num_states;st1++){
    
          for(st2=st1;st2<ct.num_states;st2++){
    #if GAMMA_PT
              Bij[st2*ct.num_states + st1] = Bij[st1*ct.num_states+st2];
    #else
              Bij[2*(st2*ct.num_states + st1)] = Bij[2*(st1*ct.num_states+st2)];
              Bij[2*(st2*ct.num_states + st1)+1] = Bij[2*(st1*ct.num_states+st2)+1];
    #endif
    
          } /* end for */
    
        } /* end for */
    
    
        for(st1=0;st1<ct.num_states;st1++){
    
          for(st2=0;st2<ct.num_states;st2++){
    
    #if GAMMA_PT
            Cij[st1*ct.num_states+st2] = ZERO;
    #else
            Cij[2*(st1*ct.num_states+st2)] = ZERO;
            Cij[2*(st1*ct.num_states+st2)+1] = ZERO;
    #endif
    
            for(idx=0;idx<ct.num_states;idx++){
    
    #if GAMMA_PT
              Cij[st1*ct.num_states+st2] += Bij[idx*ct.num_states+st1] * 
                                         Aij[st2*ct.num_states+idx];
    #else
              Cij[2*(st1*ct.num_states+st2)] += Bij[2*(idx*ct.num_states+st1)] * 
                                             Aij[2*(st2*ct.num_states+idx)];
              Cij[2*(st1*ct.num_states+st2)] -= Bij[2*(idx*ct.num_states+st1)+1] * 
                                             Aij[2*(st2*ct.num_states+idx)+1];
    
              Cij[2*(st1*ct.num_states+st2)+1] += Bij[2*(idx*ct.num_states+st1)] * 
                                               Aij[2*(st2*ct.num_states+idx)+1];
              Cij[2*(st1*ct.num_states+st2)+1] += Bij[2*(idx*ct.num_states+st1)+1] * 
                                               Aij[2*(st2*ct.num_states+idx)];
    
    #endif
    
            } /* end for */
    
          } /* end for */
    
        } /* end for */
    
    
        for(st1=0;st1<ct.num_states;st1++){
    
          for(st2=st1;st2<ct.num_states;st2++){
    
    #if GAMMA_PT
    
            Aij[st1*ct.num_states+st2] = Cij[st1*ct.num_states+st2];
            Aij[st2*ct.num_states+st1] = Cij[st1*ct.num_states+st2];
    
    #else
            Aij[2*(st1*ct.num_states+st2)] = Cij[2*(st1*ct.num_states+st2)];
            Aij[2*(st1*ct.num_states+st2)+1] = Cij[2*(st1*ct.num_states+st2)+1];
    
            Aij[2*(st2*ct.num_states+st1)] = Cij[2*(st1*ct.num_states+st2)];
            Aij[2*(st2*ct.num_states+st1)+1] = -Cij[2*(st1*ct.num_states+st2)+1];
    #endif
    
          } /* end for */
    
          Aij[2*(st1*ct.num_states+st1)+1] = 0.0;
    
        } /* end for */
    
    
    #if GAMMA_PT
      #if CRAY_T3E
        /* Next we need to diagonalize A */
        uplo_fcd = _cptofcd(uplo, 1);
        jobz_fcd = _cptofcd(jobz, 1);                                              
        lwork = 3*ct.num_states;
        ssyev(jobz_fcd, uplo_fcd, &ct.num_states, Aij, &ct.num_states, 
                        work1, work2, &lwork, &info);
      #else
        lwork = 3*ct.num_states;
        ssyev(jobz, uplo, &ct.num_states, Aij, &ct.num_states, 
                        work1, work2, &lwork, &info);
      #endif
    #else
      #if CRAY_T3E
        uplo_fcd = _cptofcd(uplo, 1);
        jobz_fcd = _cptofcd(jobz, 1);                                              
        lwork = 3*ct.num_states;
        cheev(jobz_fcd, uplo_fcd, &ct.num_states, Aij, &ct.num_states, 
                        work1, work2, &lwork, work3, &info);
      #else
        lwork = 3*ct.num_states;
        cheev(jobz, uplo, &ct.num_states, Aij, &ct.num_states, 
                        work1, work2, &lwork, work3, &info);
      #endif
    #endif
    
    
        if (info != 0) 
           error_handler("subdiag","diag of Aij failed");
    
    
        /* Do the orbital update in here */
        subdiag2_mpi(Aij, states->psiR);
    
    
        /* Renormalize the orbitals */
        for (st1 = 0; st1 < ct.num_states; st1++) {
    
            sp = &states[st1];
            norm_psi(sp, ONE);
    
        } /* st */
    
    
        /* rotate stored projection-wf integrals */
        for(idx = 0; idx < ct.num_ions; idx++) {
            iptr = &ct.ions[idx];
    
            for(ip = 0; ip < ct.max_nl; ip++) {
    
                for(st1 = 0; st1 < ct.num_states; st1++) {
    #if GAMMA_PT
                    work1R[st1] = 0.0;
    #else
                    work1R[st1] = 0.0;
                    work1I[st1] = 0.0;
    
    #endif
                    for(st2 = 0; st2 < ct.num_states; st2++) {
    #if GAMMA_PT
                        work1R[st1] += Aij[st1*ct.num_states + st2] *
                            iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip];                            
    #else
                        work1R[st1] += Aij[2*(st1*ct.num_states + st2)] * 
                                       iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip]; 
                        work1R[st1] -= Aij[2*(st1*ct.num_states + st2)+1] * 
                                       iptr->oldsintI[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip]; 
                        work1I[st1] += Aij[2*(st1*ct.num_states + st2)] * 
                                       iptr->oldsintI[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip]; 
                        work1I[st1] += Aij[2*(st1*ct.num_states + st2)+1] * 
                                       iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip]; 
                                     
    #endif
    
                    } /* st2 */
                } /* st1 */
    
    #if GAMMA_PT
                for(st1 = 0; st1 < ct.num_states; st1++) {
    
                    iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st1*ct.max_nl + ip] = work1R[st1];
    
                } /* st1 */
    #else
    
                for(st1 = 0; st1 < ct.num_states; st1++) {
    
                    iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st1*ct.max_nl + ip] = work1R[st1];
                    iptr->oldsintI[kidx*ct.num_states*ct.max_nl + st1*ct.max_nl + ip] = work1I[st1];
    
                } /* st1 */
    
    #endif
    
            } /* ip */
    
        } /* idx */
    
        /* release our temporary storage */
        release_mem(work1);
        release_mem(Cij);
        release_mem(Bij);
        release_mem(Aij);
    
        time2 = my_crtc();
        md_timings(DIAG_TIME, (time2 - time1), 0);
    
    
    } /* end subdiag */
    
    
    #if GAMMA_PT
    void subdiag1_mpi(int istate, STATE *states, REAL *Aij, REAL *Bij)
    {
      int kidx, idx, st2, ione=1;
      int dimx, dimy, dimz;
      REAL *sg_psi, *sg_twovpsi, *tmp_psi, *work1, *work2, *work3;
      REAL *vtot;
      STATE *sp, *sp1;
    
      sp = &states[istate];
      kidx = 0;
      tmp_psi = get_mem(7*sp->sbasis);
      sg_psi = tmp_psi + sp->sbasis;
      work1 = sg_psi + sp->sbasis;
      work2 = work1 + sp->sbasis;
      work3 = work2 + sp->sbasis;
      sg_twovpsi = work3 + sp->sbasis;
      vtot = sg_twovpsi + sp->sbasis;
    
      for(idx = 0;idx < sp->pbasis;idx++) 
              vtot[idx] = sp->vh[idx] + sp->vxc[idx] + sp->vnuc[idx];
    
    
      dimx = sp->dimx;
      dimy = sp->dimy;
      dimz = sp->dimz;  
      gather_psi(tmp_psi, NULL, sp, 0);
    
      /* Apply non-local operator to psi and store in work2 */
      app_nl(tmp_psi, NULL, work2, NULL, sp->istate, FALSE, kidx, 0);
    
    
      /* Generate 2*V*psi and store it in a smoothing grid */  
      genvpsi(tmp_psi, sg_twovpsi, vtot, work2, work3, ct.kp[sp->kidx].kmag, dimx, dimy, dimz);                                                       
                 
      /* B operating on 2*V*psi stored in work1 */
      app_cir(sg_twovpsi, work1, dimx, dimy, dimz);
    
    
      /* Pack psi into smoothing array */
      pack_ptos(sg_psi, tmp_psi, dimx, dimy, dimz);
    
    
      /* A operating on psi stored in work2 and B operating on psi stored in work3 */
      app_cilr(sg_psi, work2, work3, dimx, dimy, dimz, sp->hxgrid, sp->hygrid, sp->hzgrid);
                 
    
      for(idx=0;idx< sp->pbasis;idx++){
    
        work1[idx] = 0.5*ct.vel*(work1[idx] - work2[idx]);
    
      } /* end for */
    
    
      for(st2 = 0; st2 < ct.num_states;st2++) {
    
        sp1 = &states[st2];
        gather_psi(tmp_psi, NULL, sp1, 0);
        Aij[istate*ct.num_states + st2] = 
           QMD_sdot(sp->pbasis, tmp_psi, ione, work1, ione);
    
        /* <psi|B|psi> is symmetric */
        if(st2 >= istate) {
            Bij[istate*ct.num_states + st2] =
                  QMD_sdot(sp->pbasis, work3, ione, tmp_psi, ione);
            Bij[st2*ct.num_states + istate] = Bij[istate*ct.num_states + st2];
        } /* end if */
      } /* st2 */
    
    
      release_mem(tmp_psi);
    
    
    } /* end subdiag1_mpi */
    
    
    
    
    /* This routine is used to do the subspace rotation of the orbitals. Each
     * thread handles a specific portion of the real space domain.
     */
    void subdiag2_mpi(REAL *Aij, REAL *base_mem)
    {
      int idx, st1, st2;
      REAL *rptr;
      REAL *work1, *work2;
    
      work1 = get_mem(2 * ct.num_states);
      work2 = work1 + ct.num_states;
    
      for(idx = 0; idx < P0_BASIS; idx++) {
    
        rptr = base_mem;
        /* We make a temporary copy and store it in work2 otherwise the
         * cache thrashing is horrible.
         */
        for(st2 = 0;st2 < ct.num_states;st2++)
            work2[st2] = rptr[st2*P0_BASIS + idx];
    
        for(st1 = 0;st1 < ct.num_states;st1++) {
                 
          work1[st1] = 0.0;
          for(st2 = 0; st2 < ct.num_states; st2++) {
                 
            work1[st1] += Aij[st1*ct.num_states + st2] * work2[st2];
                 
          } /* st2 */
                 
        } /* end for */
                 
        /* update all wavefunctions for this *idx* */
        for(st1 = 0;st1 < ct.num_states;st1++) 
            rptr[st1*P0_BASIS + idx] = work1[st1];
                 
    
      } /* idx */
    
      release_mem(work1);
    
    } /* end subdiag2_mpi */
    
    #else
    
    
    void subdiag1_mpi(int istate, STATE *states, REAL *Aij, REAL *Bij)
    {
      int idx, st1, st2, ione=1;
      int dimx, dimy, dimz, kidx;
      REAL *sg_psiR, *sg_psiI, *sg_twovpsiR, *sg_twovpsiI, *tmp_psiR, *tmp_psiI;
      REAL *work1R, *work1I;
      REAL *work2R, *work2I, *work3R, *work3I;
      REAL *rptr, *vtot;
      REAL *gx, *gy, *gz, *kdr;
      REAL s1, s2, s3, s4;
      STATE *sp, *sp1;
    
      sp = &states[istate];
      kidx = sp->kidx;
      tmp_psiR = get_mem(18*sp->sbasis);
      tmp_psiI = tmp_psiR + sp->sbasis;
      sg_psiR = tmp_psiI + sp->sbasis;
      sg_psiI = sg_psiR + sp->sbasis;
      work1R = sg_psiI + sp->sbasis;
      work1I = work1R + sp->sbasis;
      work2R = work1I + sp->sbasis;
      work2I = work2R + sp->sbasis;
      work3R = work2I + sp->sbasis;
      work3I = work3R + sp->sbasis;
      sg_twovpsiR = work3I + sp->sbasis;
      sg_twovpsiI = sg_twovpsiR + sp->sbasis;
      vtot = sg_twovpsiI + sp->sbasis;
      gx = vtot + sp->sbasis;
      gy = gx + sp->sbasis;
      gz = gy + sp->sbasis;
      kdr = gz + sp->sbasis;
    
      dimx = sp->dimx;
      dimy = sp->dimy;
      dimz = sp->dimz;  
    
    
      for(idx = 0;idx < sp->pbasis;idx++) 
              vtot[idx] = sp->vh[idx] + sp->vxc[idx] + sp->vnuc[idx];
    
      gather_psi(tmp_psiR, tmp_psiI, sp, 0);
    
    
      /* Apply non-local operator to psi and store in work2 */
      app_nl(tmp_psiR, tmp_psiI, work2R, work2I, sp->istate, FALSE, sp->kidx, 0);
    
    
      /* Pack psi into smoothing array and get image data */
      pack_ptos(sg_psiR, tmp_psiR, dimx, dimy, dimz);
      trade_images(sg_psiR, dimx, dimy, dimz, pct.neighbors);
      pack_ptos(sg_psiI, tmp_psiI, dimx, dimy, dimz);
      trade_images(sg_psiI, dimx, dimy, dimz, pct.neighbors);
    
    
      /* Apply the gradient operator to psi */
      app_grad((S0_GRID *)sg_psiI, (P0_GRID *)gx, (P0_GRID *)gy, (P0_GRID *)gz);
      for(idx = 0;idx < P0_BASIS;idx++) {
    
          kdr[idx] = (ct.kp[sp->kidx].kvec[0] * gx[idx] +
                      ct.kp[sp->kidx].kvec[1] * gy[idx] +
                      ct.kp[sp->kidx].kvec[2] * gz[idx]);
    
      } /* end for */
    
      /* Generate 2 * V * psi */
      genvpsi(tmp_psiR, sg_twovpsiR, vtot, work2R,
              kdr, ct.kp[sp->kidx].kmag, dimx, dimy, dimz);                                                       
    
      /* Apply the gradient operator to psi */
      app_grad((S0_GRID *)sg_psiR, (P0_GRID *)gx, (P0_GRID *)gy, (P0_GRID *)gz);
      for(idx = 0;idx < P0_BASIS;idx++) {
    
        kdr[idx] = -(ct.kp[sp->kidx].kvec[0] * gx[idx] +
                     ct.kp[sp->kidx].kvec[1] * gy[idx] +
                     ct.kp[sp->kidx].kvec[2] * gz[idx]);      
    
      } /* end for */     
    
      /* Generate 2 * V * psi */
      genvpsi(tmp_psiI, sg_twovpsiI, vtot, work2I,
              kdr, ct.kp[sp->kidx].kmag, dimx, dimy, dimz);                                                       
    
    
      /* A operating on psi stored in work2 and B operating on psi stored in work3 */
      app_cilr(sg_psiR, work2R, work3R, dimx, dimy, dimz,
                    sp->hxgrid, sp->hygrid, sp->hzgrid);
      app_cilr(sg_psiI, work2I, work3I, dimx, dimy, dimz,
                    sp->hxgrid, sp->hygrid, sp->hzgrid);
    
    
    
      /* B operating on 2*V*psi stored in work1 */
      app_cir(sg_twovpsiR, work1R, dimx, dimy, dimz);
      app_cir(sg_twovpsiI, work1I, dimx, dimy, dimz);
    
    
      for(idx=0;idx< sp->pbasis;idx++)
        work1R[idx] = 0.5*ct.vel*(work1R[idx] - work2R[idx]);
    
      for(idx=0;idx< sp->pbasis;idx++)
        work1I[idx] = 0.5*ct.vel*(work1I[idx] - work2I[idx]);
    
    
    /* Compute the complex overlap matrices here */
    
      for(st2 = 0; st2 < ct.num_states;st2++) {
    
        sp1 = &states[st2];
        gather_psi(tmp_psiR, tmp_psiI, sp1, 0);
        s1 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work1R, ione);
        s2 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work1I, ione);
        s3 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work1R, ione);
        s4 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work1I, ione);
    
        Aij[2*(sp->istate*ct.num_states + st2)] = (s1 + s4);
        Aij[2*(sp->istate*ct.num_states + st2) + 1] = (s3 - s2);
        /* <psi|B|psi> is symmetric */
        if(st2 >= istate) {
    
          s1 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work3R, ione);
          s2 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work3I, ione);
          s3 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work3R, ione);
          s4 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work3I, ione);
    
          Bij[2*(sp->istate*ct.num_states + st2)] = ct.vel * (s1 + s4);
          Bij[2*(sp->istate*ct.num_states + st2)+1] = -ct.vel * (s3 - s2);
          Bij[2*(st2*ct.num_states + sp->istate)] = ct.vel * (s1 + s4);
          Bij[2*(st2*ct.num_states + sp->istate)+1] = ct.vel * (s3 - s2);
    
       } /* end if */
    
      } /* st2 */
    
    
      release_mem(tmp_psiR);
    
    
    } /* end subdiag1_mpi */
    
    
    
    /* This routine is used to do the subspace rotation of the orbitals. Each
     * thread handles a specific portion of the real space domain.
     */
    void subdiag2_mpi(REAL *Aij, REAL *base_mem)
    {
      int idx, st1, st2;
      REAL *rptr;
      REAL *work1R, *work2R;
      REAL *work1I, *work2I;
    
      work1R = get_mem(4 * ct.num_states);
      work2R = work1R + ct.num_states;
      work1I = work2R + ct.num_states;
      work2I = work1I + ct.num_states;
    
      rptr = base_mem;
    
      for(idx = 0; idx < P0_BASIS; idx++) {
    
        /* We make a temporary copy and store it in work2 otherwise the
         * cache thrashing is horrible on the O2K.
         */
        for(st2 = 0;st2 < ct.num_states;st2++) {
            work2R[st2] = rptr[2*st2*P0_BASIS + idx];
            work2I[st2] = rptr[2*st2*P0_BASIS + P0_BASIS + idx];
        }
    
        for(st1 = 0;st1 < ct.num_states;st1++) {
    
          work1R[st1] = 0.0;
          work1I[st1] = 0.0;
          for(st2 = 0; st2 < ct.num_states; st2++) {
                 
            work1R[st1] += Aij[2*(st1*ct.num_states + st2)] * work2R[st2];
            work1R[st1] -= Aij[2*(st1*ct.num_states + st2)+1] * work2I[st2];
            work1I[st1] += Aij[2*(st1*ct.num_states + st2)] * work2I[st2];
            work1I[st1] += Aij[2*(st1*ct.num_states + st2)+1] * work2R[st2];
                 
          } /* st2 */
                 
        } /* end for */
    
        /* update all wavefunctions for this *idx* */
        for(st1 = 0;st1 < ct.num_states;st1++) {
            rptr[2*st1*P0_BASIS + idx] = work1R[st1];
            rptr[2*st1*P0_BASIS + P0_BASIS + idx] = work1I[st1];
        }             
    
      } /* idx */
    
      release_mem(work1R);
    
    } /* end subdiag2_mpi */
    
    
    #endif
    
    #endif