Generated from subdiag_smp.c with ROBODoc v3.2.2 on Fri Sep 14 14:23:24 2001

TABLE OF CONTENTS

  1. QMD-MGDFT/subdiag_smp.c

QMD-MGDFT/subdiag_smp.c

NAME
   Ab initio real space code with multigrid acceleration
   Quantum molecular dynamics package.
   Version: 2.1.5
COPYRIGHT
   Copyright (C) 1995  Emil Briggs
   Copyright (C) 1998  Emil Briggs, Charles Brabec, Mark Wensell, 
                       Dan Sullivan, Chris Rapcewicz, Jerzy Bernholc
   Copyright (C) 2001  Emil Briggs, Wenchang Lu,
                       Marco Buongiorno Nardelli,Charles Brabec, 
                       Mark Wensell,Dan Sullivan, Chris Rapcewicz,
                       Jerzy Bernholc
FUNCTION
   void subdiag(STATE *states, REAL *vh, REAL *vnuc, REAL *vxc)
   This is the version of subdiag for SMP and NUMA machines
   Subspace diagonalizer for the Mehrstellen discretization of the
   Kohn-Sham Hamiltonian.

    1. Computes <i |A + BV| j> and <i |B| j>
    2. Generates the inverse of <i |B| j> and then multiplies <i |A + BV| j>
       by this inverse.
    3. Diagonalizes the resulting matrix and then remixes the eigenvectors
       accordingly.
INPUTS
   states: points to orbital structure (see md.h)
   vh:     Hartree potential
   vnuc:   pseudopotential
   vxc:    exchange correlation potential
OUTPUT
   wave functions and eigenvalues in states are updated
PARENTS
   cdfastrlx.c fastrlx.c moldyn.c init.c quench.c
CHILDREN
   global_sums.c app_cir.c app_nl.c genvpsi.c pack_ptos.c app_cilr.c 
   gather_psi.c trade_images.c app_grad.c
SOURCE
    #include "md.h"
    #include <float.h>
    #include <math.h>
    #include <stdlib.h>
    #include <stdio.h>
    
    #if (MACHINE_TYPE == PARALLEL_SMP)
    void spotri(char *uplo, int *n, REAL *a, int *lda, int *info);
    void ssyev(char *jobz, char *uplo, int *n, REAL *a, int *lda,
               REAL *w, REAL *work, int *lwork, int *info);                                                        
    
    
    
    void subdiag(STATE *states, REAL *vh, REAL *vnuc, REAL *vxc)
    {
        int idx, ip, st1, st2;
        int stop;
        int t1, thread, kidx;
        REAL *work1, *work2, *work3, *work1R, *work1I;
        int lwork;
        char *uplo="l", *jobz="v";
        int info = 0;
        REAL time1, time2;
        REAL *Aij, *Bij, *Cij;
        STATE *sp;
        ION *iptr;
    
        time1 = my_crtc();
    
        if(pct.thispe == 0) printf("\n SUBSPACE DIAGONALIZATION");  
    
    
        /* Grab some temporary storage */
        stop = ct.num_states * ct.num_states;
    #if !GAMMA_PT
        stop = 2 * ct.num_states * ct.num_states;
    #endif
        Aij = (REAL *)get_mem(stop + 256);
        Bij = (REAL *)get_mem(stop + 256);
        Cij = (REAL *)get_mem(stop + 256);
    
        work1 = get_mem(26 * ct.num_states);
        work2 = work1 + 8 * ct.num_states;
        work3 = work2 + 8 * ct.num_states;
        work1R = work3 + 8*ct.num_states;
        work1I = work1R + ct.num_states;                                                         
    
    
        for(idx = 0;idx < stop;idx++) {
          Aij[idx] = Bij[idx] = Cij[idx] = 0.0;
        }                                                                           
    
        kidx = states[0].kidx;
    
        /* Make sure the threads all know which k-point they are working with
         * and what temporary array to use. */
        for(thread = 0;thread < ct.num_threads;thread++) {
           thread_control[thread].kidx = kidx;
           thread_control[thread].barr = Bij;
        } /* end for */
    
    
        start_threads(SMP_DIAG1);
        wait_for_threads();
    
        /* Sum A and B overlap matrices over all threads */
        for(idx = 0;idx < stop;idx++) {
    
          Aij[idx] = 0.0;
          for(thread = 0;thread < ct.num_threads;thread++) {
    
            Aij[idx] += thread_control[thread].darr[idx];
    
          } /* end for */
    
        } /* end for */
    
    
        /* Generate the inverse of the B matrix and */
        /* compute the cholesky factor of the matrix w/LAPACK */
    #if GAMMA_PT
    
        cholesky(Bij, ct.num_states);
    
        /* now get the inverse */
        spotri(uplo, &ct.num_states, Bij, &ct.num_states, &info);
    
    #else
        cpotrf(uplo, &ct.num_states, Bij, &ct.num_states, &info);
    
        if (info != 0)
           error_handler("subdiag","cholesky factor of Bij failed");
    
        cpotri(uplo, &ct.num_states, Bij, &ct.num_states, &info);
    
    #endif
    
        if (info != 0)
           error_handler("subdiag","inverse of Bij failed");
    
    
        for(st1=0;st1<ct.num_states;st1++){
    
          for(st2=st1;st2<ct.num_states;st2++){
    #if GAMMA_PT
              Bij[st2*ct.num_states + st1] = Bij[st1*ct.num_states+st2];
    #else
              Bij[2*(st2*ct.num_states + st1)] = Bij[2*(st1*ct.num_states+st2)];
              Bij[2*(st2*ct.num_states + st1)+1] = Bij[2*(st1*ct.num_states+st2)+1];
    #endif
    
          } /* end for */
    
        } /* end for */
    
    
        for(st1=0;st1<ct.num_states;st1++){
    
          for(st2=0;st2<ct.num_states;st2++){
    
    #if GAMMA_PT
            Cij[st1*ct.num_states+st2] = ZERO;
    #else
            Cij[2*(st1*ct.num_states+st2)] = ZERO;
            Cij[2*(st1*ct.num_states+st2)+1] = ZERO;
    #endif
    
            for(idx=0;idx<ct.num_states;idx++){
    
    #if GAMMA_PT
              Cij[st1*ct.num_states+st2] += Bij[idx*ct.num_states+st1] * 
                                         Aij[st2*ct.num_states+idx];
    #else
              Cij[2*(st1*ct.num_states+st2)] += Bij[2*(idx*ct.num_states+st1)] * 
                                             Aij[2*(st2*ct.num_states+idx)];
              Cij[2*(st1*ct.num_states+st2)] -= Bij[2*(idx*ct.num_states+st1)+1] * 
                                             Aij[2*(st2*ct.num_states+idx)+1];
    
              Cij[2*(st1*ct.num_states+st2)+1] += Bij[2*(idx*ct.num_states+st1)] * 
                                               Aij[2*(st2*ct.num_states+idx)+1];
              Cij[2*(st1*ct.num_states+st2)+1] += Bij[2*(idx*ct.num_states+st1)+1] * 
                                               Aij[2*(st2*ct.num_states+idx)];
    
    #endif
    
            } /* end for */
    
          } /* end for */
    
        } /* end for */
    
    
        for(st1=0;st1<ct.num_states;st1++){
    
          for(st2=st1;st2<ct.num_states;st2++){
    
    #if GAMMA_PT
    
            Aij[st1*ct.num_states+st2] = Cij[st1*ct.num_states+st2];
            Aij[st2*ct.num_states+st1] = Cij[st1*ct.num_states+st2];
    
    #else
            Aij[2*(st1*ct.num_states+st2)] = Cij[2*(st1*ct.num_states+st2)];
            Aij[2*(st1*ct.num_states+st2)+1] = Cij[2*(st1*ct.num_states+st2)+1];
    
            Aij[2*(st2*ct.num_states+st1)] = Cij[2*(st1*ct.num_states+st2)];
            Aij[2*(st2*ct.num_states+st1)+1] = -Cij[2*(st1*ct.num_states+st2)+1];
    #endif
    
          } /* end for */
    
          Aij[2*(st1*ct.num_states+st1)+1] = 0.0;
    
        } /* end for */
    
    
        t1 = ct.num_states * ct.num_states;
    #if !GAMMA_PT
        t1 *= 2;
    #endif
    
    
    #if GAMMA_PT
        /* Next we need to diagonalize A */
        lwork = 4*ct.num_states;
        ssyev(jobz, uplo, &ct.num_states, Aij, &ct.num_states, 
                        work1, work2, &lwork, &info);
    #else
        lwork = 4*ct.num_states;
        cheev(jobz, uplo, &ct.num_states, Aij, &ct.num_states, 
                        work1, work2, &lwork, work3, &info);
    #endif
    
    
        if (info != 0) 
           error_handler("subdiag","diag of Aij failed");
    
    
        /* The orbital update is actually done by multiple threads in
         * subdiag3_smp so we start them up here.
         */
        for(thread = 0;thread <  ct.num_threads;thread++)
            thread_control[thread].farr = Aij;
    
    
        start_threads(SMP_DIAG2);
        wait_for_threads();
    
    
        /* Renormalize the orbitals */
        for (st1 = 0; st1 < ct.num_states; st1++) {
    
            sp = &states[st1];
            norm_psi(sp, ONE);
    
        } /* st */
    
    
        /* rotate stored projection-wf integrals */
        for(idx = 0; idx < ct.num_ions; idx++) {
            iptr = &ct.ions[idx];
    
            for(ip = 0; ip < ct.max_nl; ip++) {
    
                for(st1 = 0; st1 < ct.num_states; st1++) {
    #if GAMMA_PT
                    work1R[st1] = 0.0;
    #else
                    work1R[st1] = 0.0;
                    work1I[st1] = 0.0;
    
    #endif
                    for(st2 = 0; st2 < ct.num_states; st2++) {
    #if GAMMA_PT
                        work1R[st1] += Aij[st1*ct.num_states + st2] *
                            iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip];                            
    #else
                        work1R[st1] += Aij[2*(st1*ct.num_states + st2)] * 
                                       iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip]; 
                        work1R[st1] -= Aij[2*(st1*ct.num_states + st2)+1] * 
                                       iptr->oldsintI[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip]; 
                        work1I[st1] += Aij[2*(st1*ct.num_states + st2)] * 
                                       iptr->oldsintI[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip]; 
                        work1I[st1] += Aij[2*(st1*ct.num_states + st2)+1] * 
                                       iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip]; 
                                     
    #endif
    
                    } /* st2 */
                } /* st1 */
    
    #if GAMMA_PT
                for(st1 = 0; st1 < ct.num_states; st1++) {
    
                    iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st1*ct.max_nl + ip] = work1R[st1];
    
                } /* st1 */
    #else
    
                for(st1 = 0; st1 < ct.num_states; st1++) {
    
                    iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st1*ct.max_nl + ip] = work1R[st1];
                    iptr->oldsintI[kidx*ct.num_states*ct.max_nl + st1*ct.max_nl + ip] = work1I[st1];
    
                } /* st1 */
    
    #endif
    
            } /* ip */
    
        } /* idx */
    
    
        /* release our temporary storage */
        release_mem(work1);
        release_mem(Cij);
        release_mem(Bij);
        release_mem(Aij);
    
        time2 = my_crtc();
        md_timings(DIAG_TIME, (time2 - time1), 0);
    
    
    } /* end subdiag */
    
    
    #if GAMMA_PT
    void subdiag1_smp(SCF_THREAD_CONTROL *s) 
    {
      int kidx, idx, st1, st2, thread, ione=1;
      int dimx, dimy, dimz;
      REAL *sg_psi, *sg_twovpsi, *tmp_psi, *work1, *work2, *work3;
      REAL *rptr, *vtot;
      STATE *sp, *sp1;
    
      sp = &s->my_states[s->tid];
      kidx = sp->kidx;
      tmp_psi = get_mem(7*sp->sbasis);
      sg_psi = tmp_psi + sp->sbasis;
      work1 = sg_psi + sp->sbasis;
      work2 = work1 + sp->sbasis;
      work3 = work2 + sp->sbasis;
      sg_twovpsi = work3 + sp->sbasis;
      vtot = sg_twovpsi + sp->sbasis;
    
      /* Zero s->darr */
      for(idx = 0;idx < ct.num_states*ct.num_states;idx++) s->darr[idx] = 0.0;
      for(idx = 0;idx < sp->pbasis;idx++) 
              vtot[idx] = sp->vh[idx] + sp->vxc[idx] + sp->vnuc[idx];
    
    
      sp = &s->my_states[s->tid];
      dimx = sp->dimx;
      dimy = sp->dimy;
      dimz = sp->dimz;  
      gather_psi(tmp_psi, NULL, sp, s->tid);
    
    
      /* Apply non-local operator to psi and store in work2 */
      app_nl(tmp_psi, NULL, work2, NULL, sp->istate, FALSE, sp->kidx, s->tid);
    
    
      /* Generate 2*V*psi and store it in a smoothing grid */  
      genvpsi(tmp_psi, sg_twovpsi, vtot, work2, work3, ct.kp[sp->kidx].kmag, dimx, dimy, dimz);                                                       
                 
      /* B operating on 2*V*psi stored in work1 */
      app_cir(sg_twovpsi, work1, dimx, dimy, dimz);
    
    
      /* Pack psi into smoothing array */
      pack_ptos(sg_psi, tmp_psi, dimx, dimy, dimz);
    
    
      /* A operating on psi stored in work2 and B operating on psi stored in work3 */
      app_cilr(sg_psi, work2, work3, dimx, dimy, dimz, sp->hxgrid, sp->hygrid, sp->hzgrid);
                 
    
      for(idx=0;idx< sp->pbasis;idx++){
    
        work1[idx] = 0.5*ct.vel*(work1[idx] - work2[idx]);
    
      } /* end for */
    
    
      for(st2 = 0; st2 < ct.num_states;st2++) {
    
        sp1 = &s->my_states[st2];
        gather_psi(tmp_psi, NULL, sp1, s->tid);
        s->darr[s->tid*ct.num_states + st2] = 
           QMD_sdot(sp->pbasis, tmp_psi, ione, work1, ione);
    
        /* <psi|B|psi> is symmetric */
        if(st2 >= s->tid) {
            s->barr[s->tid*ct.num_states + st2] =
                  QMD_sdot(sp->pbasis, work3, ione, tmp_psi, ione);
            s->barr[st2*ct.num_states + s->tid] =
                  s->barr[s->tid*ct.num_states + st2];
        }
      } /* st2 */
    
    
      release_mem(tmp_psi);
    
    
    } /* end subdiag1_smp */
    
    
    
    
    /* This routine is used to do the subspace rotation of the orbitals. Each
     * thread handles a specific portion of the real space domain.
     */
    void subdiag2_smp(SCF_THREAD_CONTROL *s)
    {
      int idx, st1, st2;
      REAL *rptr;
      REAL *work1, *work2;
    
      work1 = get_mem(2 * ct.num_states);
      work2 = work1 + ct.num_states;
    
      for(idx = 0; idx < s->numpt; idx++) {
    
        rptr = s->base_mem;
        /* We make a temporary copy and store it in work2 otherwise the
         * cache thrashing is horrible on the O2K.
         */
        for(st2 = 0;st2 < ct.num_states;st2++)
            work2[st2] = rptr[st2*s->lda + idx];
    
        for(st1 = 0;st1 < ct.num_states;st1++) {
                 
          work1[st1] = 0.0;
          for(st2 = 0; st2 < ct.num_states; st2++) {
                 
            work1[st1] += s->farr[st1*ct.num_states + st2] * work2[st2];
                 
          } /* st2 */
                 
        } /* end for */
                 
        /* update all wavefunctions for this *idx* */
        for(st1 = 0;st1 < ct.num_states;st1++) 
            rptr[st1*s->lda + idx] = work1[st1];
                 
    
      } /* idx */
    
      release_mem(work1);
    
    } /* end subdiag2_smp */
    
    #else
    
    
    void subdiag1_smp(SCF_THREAD_CONTROL *s) 
    {
      int kidx, idx, st2, thread, ione=1;
      int dimx, dimy, dimz;
      REAL *sg_psiR, *sg_psiI, *sg_twovpsiR, *sg_twovpsiI, *tmp_psiR, *tmp_psiI;
      REAL *work1R, *work1I;
      REAL *work2R, *work2I, *work3R, *work3I;
      REAL *rptr, *vtot;
      REAL *gx, *gy, *gz, *kdr;
      REAL s1, s2, s3, s4;
      STATE *sp, *sp1;
    
      sp = &s->my_states[s->kidx * ct.num_states + s->tid];
      kidx = s->kidx;
      tmp_psiR = get_mem(18*sp->sbasis);
      tmp_psiI = tmp_psiR + sp->sbasis;
      sg_psiR = tmp_psiI + sp->sbasis;
      sg_psiI = sg_psiR + sp->sbasis;
      work1R = sg_psiI + sp->sbasis;
      work1I = work1R + sp->sbasis;
      work2R = work1I + sp->sbasis;
      work2I = work2R + sp->sbasis;
      work3R = work2I + sp->sbasis;
      work3I = work3R + sp->sbasis;
      sg_twovpsiR = work3I + sp->sbasis;
      sg_twovpsiI = sg_twovpsiR + sp->sbasis;
      vtot = sg_twovpsiI + sp->sbasis;
      gx = vtot + sp->sbasis;
      gy = gx + sp->sbasis;
      gz = gy + sp->sbasis;
      kdr = gz + sp->sbasis;
    
      dimx = sp->dimx;
      dimy = sp->dimy;
      dimz = sp->dimz;  
    
      /* Zero s->darr */
      for(idx = 0;idx < 2*ct.num_states*ct.num_states;idx++) s->darr[idx] = 0.0;
    
      for(idx = 0;idx < sp->pbasis;idx++) 
              vtot[idx] = sp->vh[idx] + sp->vxc[idx] + sp->vnuc[idx];
    
    
      gather_psi(tmp_psiR, tmp_psiI, sp, s->tid);
    
    
      /* Apply non-local operator to psi and store in work2 */
      app_nl(tmp_psiR, tmp_psiI, work2R, work2I, sp->istate, FALSE, sp->kidx, s->tid);
    
    
      /* Pack psi into smoothing array and get image data */
      pack_ptos(sg_psiR, tmp_psiR, dimx, dimy, dimz);
      trade_images(sg_psiR, dimx, dimy, dimz, pct.neighbors);
      pack_ptos(sg_psiI, tmp_psiI, dimx, dimy, dimz);
      trade_images(sg_psiI, dimx, dimy, dimz, pct.neighbors);
    
    
      /* Apply the gradient operator to psi */
      app_grad((S0_GRID *)sg_psiI, (P0_GRID *)gx, (P0_GRID *)gy, (P0_GRID *)gz);
      for(idx = 0;idx < P0_BASIS;idx++) {
    
          kdr[idx] = (ct.kp[sp->kidx].kvec[0] * gx[idx] +
                      ct.kp[sp->kidx].kvec[1] * gy[idx] +
                      ct.kp[sp->kidx].kvec[2] * gz[idx]);
    
      } /* end for */
    
      /* Generate 2 * V * psi */
      genvpsi(tmp_psiR, sg_twovpsiR, vtot, work2R,
              kdr, ct.kp[sp->kidx].kmag, dimx, dimy, dimz);                                                       
    
      /* Apply the gradient operator to psi */
      app_grad((S0_GRID *)sg_psiR, (P0_GRID *)gx, (P0_GRID *)gy, (P0_GRID *)gz);
      for(idx = 0;idx < P0_BASIS;idx++) {
    
        kdr[idx] = -(ct.kp[sp->kidx].kvec[0] * gx[idx] +
                     ct.kp[sp->kidx].kvec[1] * gy[idx] +
                     ct.kp[sp->kidx].kvec[2] * gz[idx]);      
    
      } /* end for */     
    
      /* Generate 2 * V * psi */
      genvpsi(tmp_psiI, sg_twovpsiI, vtot, work2I,
              kdr, ct.kp[sp->kidx].kmag, dimx, dimy, dimz);                                                       
    
      /* A operating on psi stored in work2 and B operating on psi stored in work3 */
      app_cilr(sg_psiR, work2R, work3R, dimx, dimy, dimz,
                    sp->hxgrid, sp->hygrid, sp->hzgrid);
      app_cilr(sg_psiI, work2I, work3I, dimx, dimy, dimz,
                    sp->hxgrid, sp->hygrid, sp->hzgrid);
    
    
    
      /* B operating on 2*V*psi stored in work1 */
      app_cir(sg_twovpsiR, work1R, dimx, dimy, dimz);
      app_cir(sg_twovpsiI, work1I, dimx, dimy, dimz);
    
    
      for(idx=0;idx< sp->pbasis;idx++)
        work1R[idx] = 0.5*ct.vel*(work1R[idx] - work2R[idx]);
    
      for(idx=0;idx< sp->pbasis;idx++)
        work1I[idx] = 0.5*ct.vel*(work1I[idx] - work2I[idx]);
    
    
    /* Compute the complex overlap matrices here */
    
      for(st2 = 0; st2 < ct.num_states;st2++) {
    
        sp1 = &s->my_states[kidx*ct.num_states+st2];
        gather_psi(tmp_psiR, tmp_psiI, sp1, s->tid);
        s1 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work1R, ione);
        s2 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work1I, ione);
        s3 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work1R, ione);
        s4 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work1I, ione);
    
        s->darr[2*(sp->istate*ct.num_states + st2)] = (s1 + s4);
        s->darr[2*(sp->istate*ct.num_states + st2) + 1] = (s3 - s2);
        /* <psi|B|psi> is symmetric */
        if(st2 >= s->tid) { 
    
          s1 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work3R, ione);
          s2 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work3I, ione);
          s3 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work3R, ione);
          s4 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work3I, ione);
    
          s->barr[2*(sp->istate*ct.num_states + st2)] = ct.vel * (s1 + s4);
          s->barr[2*(sp->istate*ct.num_states + st2)+1] = -ct.vel * (s3 - s2);
          s->barr[2*(st2*ct.num_states + sp->istate)] = ct.vel * (s1 + s4);
          s->barr[2*(st2*ct.num_states + sp->istate)+1] = ct.vel * (s3 - s2);
    
        } /* end if */
    
      } /* st2 */
    
    
      release_mem(tmp_psiR);
    
    
    } /* end subdiag1_smp */
    
    
    
    
    /* This routine is used to do the subspace rotation of the orbitals. Each
     * thread handles a specific portion of the real space domain.
     */
    void subdiag2_smp(SCF_THREAD_CONTROL *s)
    {
      int idx, st1, st2;
      REAL *rptr;
      REAL *work1R, *work2R, *work1I, *work2I;
    
      work1R = get_mem(4 * ct.num_states);
      work2R = work1R + ct.num_states;
      work1I = work2R + ct.num_states;
      work2I = work1I + ct.num_states;
    
      for(idx = 0; idx < s->numpt; idx++) {
    
        rptr = &s->base_mem[2 * s->kidx * ct.num_states * s->lda];
        /* We make a temporary copy and store it in work2 otherwise the
         * cache thrashing is horrible on the O2K.
         */
        for(st2 = 0;st2 < ct.num_states;st2++) {
            work2R[st2] = rptr[2*st2*s->lda + idx];
            work2I[st2] = rptr[2*st2*s->lda + s->lda + idx];
        }
    
        for(st1 = 0;st1 < ct.num_states;st1++) {
                 
          work1R[st1] = 0.0;
          work1I[st1] = 0.0;
          for(st2 = 0; st2 < ct.num_states; st2++) {
                 
            work1R[st1] += s->farr[2*(st1*ct.num_states + st2)] * work2R[st2];
            work1R[st1] -= s->farr[2*(st1*ct.num_states + st2)+1] * work2I[st2];
            work1I[st1] += s->farr[2*(st1*ct.num_states + st2)] * work2I[st2];
            work1I[st1] += s->farr[2*(st1*ct.num_states + st2)+1] * work2R[st2];
                 
          } /* st2 */
                 
        } /* end for */
                 
        /* update all wavefunctions for this *idx* */
        for(st1 = 0;st1 < ct.num_states;st1++) {
            rptr[2*st1*s->lda + idx] = work1R[st1];
            rptr[2*st1*s->lda + s->lda + idx] = work1I[st1];
        }             
    
      } /* idx */
    
      release_mem(work1R);
    
    } /* end subdiag2_smp */
    
    
    #endif
    
    #endif