Generated from subdiag_smp.c with ROBODoc v3.2.2 on Fri Sep 14 14:23:24 2001
TABLE OF CONTENTS
- QMD-MGDFT/subdiag_smp.c
NAME
Ab initio real space code with multigrid acceleration
Quantum molecular dynamics package.
Version: 2.1.5
COPYRIGHT
Copyright (C) 1995 Emil Briggs
Copyright (C) 1998 Emil Briggs, Charles Brabec, Mark Wensell,
Dan Sullivan, Chris Rapcewicz, Jerzy Bernholc
Copyright (C) 2001 Emil Briggs, Wenchang Lu,
Marco Buongiorno Nardelli,Charles Brabec,
Mark Wensell,Dan Sullivan, Chris Rapcewicz,
Jerzy Bernholc
FUNCTION
void subdiag(STATE *states, REAL *vh, REAL *vnuc, REAL *vxc)
This is the version of subdiag for SMP and NUMA machines
Subspace diagonalizer for the Mehrstellen discretization of the
Kohn-Sham Hamiltonian.
1. Computes <i |A + BV| j> and <i |B| j>
2. Generates the inverse of <i |B| j> and then multiplies <i |A + BV| j>
by this inverse.
3. Diagonalizes the resulting matrix and then remixes the eigenvectors
accordingly.
INPUTS
states: points to orbital structure (see md.h)
vh: Hartree potential
vnuc: pseudopotential
vxc: exchange correlation potential
OUTPUT
wave functions and eigenvalues in states are updated
PARENTS
cdfastrlx.c fastrlx.c moldyn.c init.c quench.c
CHILDREN
global_sums.c app_cir.c app_nl.c genvpsi.c pack_ptos.c app_cilr.c
gather_psi.c trade_images.c app_grad.c
SOURCE
#include "md.h"
#include <float.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#if (MACHINE_TYPE == PARALLEL_SMP)
void spotri(char *uplo, int *n, REAL *a, int *lda, int *info);
void ssyev(char *jobz, char *uplo, int *n, REAL *a, int *lda,
REAL *w, REAL *work, int *lwork, int *info);
void subdiag(STATE *states, REAL *vh, REAL *vnuc, REAL *vxc)
{
int idx, ip, st1, st2;
int stop;
int t1, thread, kidx;
REAL *work1, *work2, *work3, *work1R, *work1I;
int lwork;
char *uplo="l", *jobz="v";
int info = 0;
REAL time1, time2;
REAL *Aij, *Bij, *Cij;
STATE *sp;
ION *iptr;
time1 = my_crtc();
if(pct.thispe == 0) printf("\n SUBSPACE DIAGONALIZATION");
/* Grab some temporary storage */
stop = ct.num_states * ct.num_states;
#if !GAMMA_PT
stop = 2 * ct.num_states * ct.num_states;
#endif
Aij = (REAL *)get_mem(stop + 256);
Bij = (REAL *)get_mem(stop + 256);
Cij = (REAL *)get_mem(stop + 256);
work1 = get_mem(26 * ct.num_states);
work2 = work1 + 8 * ct.num_states;
work3 = work2 + 8 * ct.num_states;
work1R = work3 + 8*ct.num_states;
work1I = work1R + ct.num_states;
for(idx = 0;idx < stop;idx++) {
Aij[idx] = Bij[idx] = Cij[idx] = 0.0;
}
kidx = states[0].kidx;
/* Make sure the threads all know which k-point they are working with
* and what temporary array to use. */
for(thread = 0;thread < ct.num_threads;thread++) {
thread_control[thread].kidx = kidx;
thread_control[thread].barr = Bij;
} /* end for */
start_threads(SMP_DIAG1);
wait_for_threads();
/* Sum A and B overlap matrices over all threads */
for(idx = 0;idx < stop;idx++) {
Aij[idx] = 0.0;
for(thread = 0;thread < ct.num_threads;thread++) {
Aij[idx] += thread_control[thread].darr[idx];
} /* end for */
} /* end for */
/* Generate the inverse of the B matrix and */
/* compute the cholesky factor of the matrix w/LAPACK */
#if GAMMA_PT
cholesky(Bij, ct.num_states);
/* now get the inverse */
spotri(uplo, &ct.num_states, Bij, &ct.num_states, &info);
#else
cpotrf(uplo, &ct.num_states, Bij, &ct.num_states, &info);
if (info != 0)
error_handler("subdiag","cholesky factor of Bij failed");
cpotri(uplo, &ct.num_states, Bij, &ct.num_states, &info);
#endif
if (info != 0)
error_handler("subdiag","inverse of Bij failed");
for(st1=0;st1<ct.num_states;st1++){
for(st2=st1;st2<ct.num_states;st2++){
#if GAMMA_PT
Bij[st2*ct.num_states + st1] = Bij[st1*ct.num_states+st2];
#else
Bij[2*(st2*ct.num_states + st1)] = Bij[2*(st1*ct.num_states+st2)];
Bij[2*(st2*ct.num_states + st1)+1] = Bij[2*(st1*ct.num_states+st2)+1];
#endif
} /* end for */
} /* end for */
for(st1=0;st1<ct.num_states;st1++){
for(st2=0;st2<ct.num_states;st2++){
#if GAMMA_PT
Cij[st1*ct.num_states+st2] = ZERO;
#else
Cij[2*(st1*ct.num_states+st2)] = ZERO;
Cij[2*(st1*ct.num_states+st2)+1] = ZERO;
#endif
for(idx=0;idx<ct.num_states;idx++){
#if GAMMA_PT
Cij[st1*ct.num_states+st2] += Bij[idx*ct.num_states+st1] *
Aij[st2*ct.num_states+idx];
#else
Cij[2*(st1*ct.num_states+st2)] += Bij[2*(idx*ct.num_states+st1)] *
Aij[2*(st2*ct.num_states+idx)];
Cij[2*(st1*ct.num_states+st2)] -= Bij[2*(idx*ct.num_states+st1)+1] *
Aij[2*(st2*ct.num_states+idx)+1];
Cij[2*(st1*ct.num_states+st2)+1] += Bij[2*(idx*ct.num_states+st1)] *
Aij[2*(st2*ct.num_states+idx)+1];
Cij[2*(st1*ct.num_states+st2)+1] += Bij[2*(idx*ct.num_states+st1)+1] *
Aij[2*(st2*ct.num_states+idx)];
#endif
} /* end for */
} /* end for */
} /* end for */
for(st1=0;st1<ct.num_states;st1++){
for(st2=st1;st2<ct.num_states;st2++){
#if GAMMA_PT
Aij[st1*ct.num_states+st2] = Cij[st1*ct.num_states+st2];
Aij[st2*ct.num_states+st1] = Cij[st1*ct.num_states+st2];
#else
Aij[2*(st1*ct.num_states+st2)] = Cij[2*(st1*ct.num_states+st2)];
Aij[2*(st1*ct.num_states+st2)+1] = Cij[2*(st1*ct.num_states+st2)+1];
Aij[2*(st2*ct.num_states+st1)] = Cij[2*(st1*ct.num_states+st2)];
Aij[2*(st2*ct.num_states+st1)+1] = -Cij[2*(st1*ct.num_states+st2)+1];
#endif
} /* end for */
Aij[2*(st1*ct.num_states+st1)+1] = 0.0;
} /* end for */
t1 = ct.num_states * ct.num_states;
#if !GAMMA_PT
t1 *= 2;
#endif
#if GAMMA_PT
/* Next we need to diagonalize A */
lwork = 4*ct.num_states;
ssyev(jobz, uplo, &ct.num_states, Aij, &ct.num_states,
work1, work2, &lwork, &info);
#else
lwork = 4*ct.num_states;
cheev(jobz, uplo, &ct.num_states, Aij, &ct.num_states,
work1, work2, &lwork, work3, &info);
#endif
if (info != 0)
error_handler("subdiag","diag of Aij failed");
/* The orbital update is actually done by multiple threads in
* subdiag3_smp so we start them up here.
*/
for(thread = 0;thread < ct.num_threads;thread++)
thread_control[thread].farr = Aij;
start_threads(SMP_DIAG2);
wait_for_threads();
/* Renormalize the orbitals */
for (st1 = 0; st1 < ct.num_states; st1++) {
sp = &states[st1];
norm_psi(sp, ONE);
} /* st */
/* rotate stored projection-wf integrals */
for(idx = 0; idx < ct.num_ions; idx++) {
iptr = &ct.ions[idx];
for(ip = 0; ip < ct.max_nl; ip++) {
for(st1 = 0; st1 < ct.num_states; st1++) {
#if GAMMA_PT
work1R[st1] = 0.0;
#else
work1R[st1] = 0.0;
work1I[st1] = 0.0;
#endif
for(st2 = 0; st2 < ct.num_states; st2++) {
#if GAMMA_PT
work1R[st1] += Aij[st1*ct.num_states + st2] *
iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip];
#else
work1R[st1] += Aij[2*(st1*ct.num_states + st2)] *
iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip];
work1R[st1] -= Aij[2*(st1*ct.num_states + st2)+1] *
iptr->oldsintI[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip];
work1I[st1] += Aij[2*(st1*ct.num_states + st2)] *
iptr->oldsintI[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip];
work1I[st1] += Aij[2*(st1*ct.num_states + st2)+1] *
iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip];
#endif
} /* st2 */
} /* st1 */
#if GAMMA_PT
for(st1 = 0; st1 < ct.num_states; st1++) {
iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st1*ct.max_nl + ip] = work1R[st1];
} /* st1 */
#else
for(st1 = 0; st1 < ct.num_states; st1++) {
iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st1*ct.max_nl + ip] = work1R[st1];
iptr->oldsintI[kidx*ct.num_states*ct.max_nl + st1*ct.max_nl + ip] = work1I[st1];
} /* st1 */
#endif
} /* ip */
} /* idx */
/* release our temporary storage */
release_mem(work1);
release_mem(Cij);
release_mem(Bij);
release_mem(Aij);
time2 = my_crtc();
md_timings(DIAG_TIME, (time2 - time1), 0);
} /* end subdiag */
#if GAMMA_PT
void subdiag1_smp(SCF_THREAD_CONTROL *s)
{
int kidx, idx, st1, st2, thread, ione=1;
int dimx, dimy, dimz;
REAL *sg_psi, *sg_twovpsi, *tmp_psi, *work1, *work2, *work3;
REAL *rptr, *vtot;
STATE *sp, *sp1;
sp = &s->my_states[s->tid];
kidx = sp->kidx;
tmp_psi = get_mem(7*sp->sbasis);
sg_psi = tmp_psi + sp->sbasis;
work1 = sg_psi + sp->sbasis;
work2 = work1 + sp->sbasis;
work3 = work2 + sp->sbasis;
sg_twovpsi = work3 + sp->sbasis;
vtot = sg_twovpsi + sp->sbasis;
/* Zero s->darr */
for(idx = 0;idx < ct.num_states*ct.num_states;idx++) s->darr[idx] = 0.0;
for(idx = 0;idx < sp->pbasis;idx++)
vtot[idx] = sp->vh[idx] + sp->vxc[idx] + sp->vnuc[idx];
sp = &s->my_states[s->tid];
dimx = sp->dimx;
dimy = sp->dimy;
dimz = sp->dimz;
gather_psi(tmp_psi, NULL, sp, s->tid);
/* Apply non-local operator to psi and store in work2 */
app_nl(tmp_psi, NULL, work2, NULL, sp->istate, FALSE, sp->kidx, s->tid);
/* Generate 2*V*psi and store it in a smoothing grid */
genvpsi(tmp_psi, sg_twovpsi, vtot, work2, work3, ct.kp[sp->kidx].kmag, dimx, dimy, dimz);
/* B operating on 2*V*psi stored in work1 */
app_cir(sg_twovpsi, work1, dimx, dimy, dimz);
/* Pack psi into smoothing array */
pack_ptos(sg_psi, tmp_psi, dimx, dimy, dimz);
/* A operating on psi stored in work2 and B operating on psi stored in work3 */
app_cilr(sg_psi, work2, work3, dimx, dimy, dimz, sp->hxgrid, sp->hygrid, sp->hzgrid);
for(idx=0;idx< sp->pbasis;idx++){
work1[idx] = 0.5*ct.vel*(work1[idx] - work2[idx]);
} /* end for */
for(st2 = 0; st2 < ct.num_states;st2++) {
sp1 = &s->my_states[st2];
gather_psi(tmp_psi, NULL, sp1, s->tid);
s->darr[s->tid*ct.num_states + st2] =
QMD_sdot(sp->pbasis, tmp_psi, ione, work1, ione);
/* <psi|B|psi> is symmetric */
if(st2 >= s->tid) {
s->barr[s->tid*ct.num_states + st2] =
QMD_sdot(sp->pbasis, work3, ione, tmp_psi, ione);
s->barr[st2*ct.num_states + s->tid] =
s->barr[s->tid*ct.num_states + st2];
}
} /* st2 */
release_mem(tmp_psi);
} /* end subdiag1_smp */
/* This routine is used to do the subspace rotation of the orbitals. Each
* thread handles a specific portion of the real space domain.
*/
void subdiag2_smp(SCF_THREAD_CONTROL *s)
{
int idx, st1, st2;
REAL *rptr;
REAL *work1, *work2;
work1 = get_mem(2 * ct.num_states);
work2 = work1 + ct.num_states;
for(idx = 0; idx < s->numpt; idx++) {
rptr = s->base_mem;
/* We make a temporary copy and store it in work2 otherwise the
* cache thrashing is horrible on the O2K.
*/
for(st2 = 0;st2 < ct.num_states;st2++)
work2[st2] = rptr[st2*s->lda + idx];
for(st1 = 0;st1 < ct.num_states;st1++) {
work1[st1] = 0.0;
for(st2 = 0; st2 < ct.num_states; st2++) {
work1[st1] += s->farr[st1*ct.num_states + st2] * work2[st2];
} /* st2 */
} /* end for */
/* update all wavefunctions for this *idx* */
for(st1 = 0;st1 < ct.num_states;st1++)
rptr[st1*s->lda + idx] = work1[st1];
} /* idx */
release_mem(work1);
} /* end subdiag2_smp */
#else
void subdiag1_smp(SCF_THREAD_CONTROL *s)
{
int kidx, idx, st2, thread, ione=1;
int dimx, dimy, dimz;
REAL *sg_psiR, *sg_psiI, *sg_twovpsiR, *sg_twovpsiI, *tmp_psiR, *tmp_psiI;
REAL *work1R, *work1I;
REAL *work2R, *work2I, *work3R, *work3I;
REAL *rptr, *vtot;
REAL *gx, *gy, *gz, *kdr;
REAL s1, s2, s3, s4;
STATE *sp, *sp1;
sp = &s->my_states[s->kidx * ct.num_states + s->tid];
kidx = s->kidx;
tmp_psiR = get_mem(18*sp->sbasis);
tmp_psiI = tmp_psiR + sp->sbasis;
sg_psiR = tmp_psiI + sp->sbasis;
sg_psiI = sg_psiR + sp->sbasis;
work1R = sg_psiI + sp->sbasis;
work1I = work1R + sp->sbasis;
work2R = work1I + sp->sbasis;
work2I = work2R + sp->sbasis;
work3R = work2I + sp->sbasis;
work3I = work3R + sp->sbasis;
sg_twovpsiR = work3I + sp->sbasis;
sg_twovpsiI = sg_twovpsiR + sp->sbasis;
vtot = sg_twovpsiI + sp->sbasis;
gx = vtot + sp->sbasis;
gy = gx + sp->sbasis;
gz = gy + sp->sbasis;
kdr = gz + sp->sbasis;
dimx = sp->dimx;
dimy = sp->dimy;
dimz = sp->dimz;
/* Zero s->darr */
for(idx = 0;idx < 2*ct.num_states*ct.num_states;idx++) s->darr[idx] = 0.0;
for(idx = 0;idx < sp->pbasis;idx++)
vtot[idx] = sp->vh[idx] + sp->vxc[idx] + sp->vnuc[idx];
gather_psi(tmp_psiR, tmp_psiI, sp, s->tid);
/* Apply non-local operator to psi and store in work2 */
app_nl(tmp_psiR, tmp_psiI, work2R, work2I, sp->istate, FALSE, sp->kidx, s->tid);
/* Pack psi into smoothing array and get image data */
pack_ptos(sg_psiR, tmp_psiR, dimx, dimy, dimz);
trade_images(sg_psiR, dimx, dimy, dimz, pct.neighbors);
pack_ptos(sg_psiI, tmp_psiI, dimx, dimy, dimz);
trade_images(sg_psiI, dimx, dimy, dimz, pct.neighbors);
/* Apply the gradient operator to psi */
app_grad((S0_GRID *)sg_psiI, (P0_GRID *)gx, (P0_GRID *)gy, (P0_GRID *)gz);
for(idx = 0;idx < P0_BASIS;idx++) {
kdr[idx] = (ct.kp[sp->kidx].kvec[0] * gx[idx] +
ct.kp[sp->kidx].kvec[1] * gy[idx] +
ct.kp[sp->kidx].kvec[2] * gz[idx]);
} /* end for */
/* Generate 2 * V * psi */
genvpsi(tmp_psiR, sg_twovpsiR, vtot, work2R,
kdr, ct.kp[sp->kidx].kmag, dimx, dimy, dimz);
/* Apply the gradient operator to psi */
app_grad((S0_GRID *)sg_psiR, (P0_GRID *)gx, (P0_GRID *)gy, (P0_GRID *)gz);
for(idx = 0;idx < P0_BASIS;idx++) {
kdr[idx] = -(ct.kp[sp->kidx].kvec[0] * gx[idx] +
ct.kp[sp->kidx].kvec[1] * gy[idx] +
ct.kp[sp->kidx].kvec[2] * gz[idx]);
} /* end for */
/* Generate 2 * V * psi */
genvpsi(tmp_psiI, sg_twovpsiI, vtot, work2I,
kdr, ct.kp[sp->kidx].kmag, dimx, dimy, dimz);
/* A operating on psi stored in work2 and B operating on psi stored in work3 */
app_cilr(sg_psiR, work2R, work3R, dimx, dimy, dimz,
sp->hxgrid, sp->hygrid, sp->hzgrid);
app_cilr(sg_psiI, work2I, work3I, dimx, dimy, dimz,
sp->hxgrid, sp->hygrid, sp->hzgrid);
/* B operating on 2*V*psi stored in work1 */
app_cir(sg_twovpsiR, work1R, dimx, dimy, dimz);
app_cir(sg_twovpsiI, work1I, dimx, dimy, dimz);
for(idx=0;idx< sp->pbasis;idx++)
work1R[idx] = 0.5*ct.vel*(work1R[idx] - work2R[idx]);
for(idx=0;idx< sp->pbasis;idx++)
work1I[idx] = 0.5*ct.vel*(work1I[idx] - work2I[idx]);
/* Compute the complex overlap matrices here */
for(st2 = 0; st2 < ct.num_states;st2++) {
sp1 = &s->my_states[kidx*ct.num_states+st2];
gather_psi(tmp_psiR, tmp_psiI, sp1, s->tid);
s1 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work1R, ione);
s2 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work1I, ione);
s3 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work1R, ione);
s4 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work1I, ione);
s->darr[2*(sp->istate*ct.num_states + st2)] = (s1 + s4);
s->darr[2*(sp->istate*ct.num_states + st2) + 1] = (s3 - s2);
/* <psi|B|psi> is symmetric */
if(st2 >= s->tid) {
s1 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work3R, ione);
s2 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work3I, ione);
s3 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work3R, ione);
s4 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work3I, ione);
s->barr[2*(sp->istate*ct.num_states + st2)] = ct.vel * (s1 + s4);
s->barr[2*(sp->istate*ct.num_states + st2)+1] = -ct.vel * (s3 - s2);
s->barr[2*(st2*ct.num_states + sp->istate)] = ct.vel * (s1 + s4);
s->barr[2*(st2*ct.num_states + sp->istate)+1] = ct.vel * (s3 - s2);
} /* end if */
} /* st2 */
release_mem(tmp_psiR);
} /* end subdiag1_smp */
/* This routine is used to do the subspace rotation of the orbitals. Each
* thread handles a specific portion of the real space domain.
*/
void subdiag2_smp(SCF_THREAD_CONTROL *s)
{
int idx, st1, st2;
REAL *rptr;
REAL *work1R, *work2R, *work1I, *work2I;
work1R = get_mem(4 * ct.num_states);
work2R = work1R + ct.num_states;
work1I = work2R + ct.num_states;
work2I = work1I + ct.num_states;
for(idx = 0; idx < s->numpt; idx++) {
rptr = &s->base_mem[2 * s->kidx * ct.num_states * s->lda];
/* We make a temporary copy and store it in work2 otherwise the
* cache thrashing is horrible on the O2K.
*/
for(st2 = 0;st2 < ct.num_states;st2++) {
work2R[st2] = rptr[2*st2*s->lda + idx];
work2I[st2] = rptr[2*st2*s->lda + s->lda + idx];
}
for(st1 = 0;st1 < ct.num_states;st1++) {
work1R[st1] = 0.0;
work1I[st1] = 0.0;
for(st2 = 0; st2 < ct.num_states; st2++) {
work1R[st1] += s->farr[2*(st1*ct.num_states + st2)] * work2R[st2];
work1R[st1] -= s->farr[2*(st1*ct.num_states + st2)+1] * work2I[st2];
work1I[st1] += s->farr[2*(st1*ct.num_states + st2)] * work2I[st2];
work1I[st1] += s->farr[2*(st1*ct.num_states + st2)+1] * work2R[st2];
} /* st2 */
} /* end for */
/* update all wavefunctions for this *idx* */
for(st1 = 0;st1 < ct.num_states;st1++) {
rptr[2*st1*s->lda + idx] = work1R[st1];
rptr[2*st1*s->lda + s->lda + idx] = work1I[st1];
}
} /* idx */
release_mem(work1R);
} /* end subdiag2_smp */
#endif
#endif