Generated from subdiag_mpi.c with ROBODoc v3.2.2 on Fri Sep 14 14:23:24 2001
TABLE OF CONTENTS
- QMD-MGDFT/subdiag_mpi.c
NAME
Ab initio real space code with multigrid acceleration
Quantum molecular dynamics package.
Version: 2.1.5
COPYRIGHT
Copyright (C) 1995 Emil Briggs
Copyright (C) 1998 Emil Briggs, Charles Brabec, Mark Wensell,
Dan Sullivan, Chris Rapcewicz, Jerzy Bernholc
Copyright (C) 2001 Emil Briggs, Wenchang Lu,
Marco Buongiorno Nardelli,Charles Brabec,
Mark Wensell,Dan Sullivan, Chris Rapcewicz,
Jerzy Bernholc
FUNCTION
void subdiag(STATE *states, REAL *vh, REAL *vnuc, REAL *vxc)
This is the version of subdiag for message passing machines(MPI).
Subspace diagonalizer for the Mehrstellen discretization of the
Kohn-Sham Hamiltonian.
1. Computes <i |A + BV| j> and <i |B| j>
2. Generates the inverse of <i |B| j> and then multiplies <i |A + BV| j>
by this inverse.
3. Diagonalizes the resulting matrix and then remixes the eigenvectors
accordingly.
INPUTS
states: points to orbital structure (see md.h)
vh: Hartree potential
vnuc: pseudopotential
vxc: exchange correlation potential
OUTPUT
wave functions and eigenvalues in states are updated
PARENTS
cdfastrlx.c fastrlx.c moldyn.c init.c quench.c
CHILDREN
global_sums.c app_cir.c app_nl.c genvpsi.c pack_ptos.c app_cilr.c
gather_psi.c trade_images.c app_grad.c
SOURCE
#include "md.h"
#include <float.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#if (MACHINE_TYPE == PARALLEL_MESSAGE)
#if (CRAY_T3E || CRAY_C90)
#include <fortran.h>
void spotrf(_fcd uplo, int *n, REAL *a, int *lda, int *info);
void spotri(_fcd uplo, int *n, REAL *a, int *lda, int *info);
void ssyev(_fcd jobz, _fcd uplo, int *n, REAL *a, int *lda,
REAL *w, REAL *work, int *lwork, int *info);
void cpotrf(_fcd uplo, int *n, REAL *a, int *lda, int *info);
#endif
void subdiag1_mpi(int istate, STATE *states, REAL *Aij, REAL *Bij);
void subdiag2_mpi(REAL *Aij, REAL *base_mem);
void subdiag(STATE *states, REAL *vh, REAL *vnuc, REAL *vxc)
{
int idx, ip, st1, st2;
int stop;
int t1, kidx;
REAL *work1, *work2, *work3;
int ione = 1; /* blas constants */
int lwork;
char *uplo="l", *jobz="v";
int info = 0;
REAL time1, time2;
REAL *Aij, *Bij, *Cij;
STATE *sp;
ION *iptr;
REAL *work1R, *work1I;
#if (CRAY_T3E || CRAY_C90)
_fcd uplo_fcd,jobz_fcd;
#endif
barrier();
time1 = my_crtc();
if(pct.thispe == 0) printf("\n SUBSPACE DIAGONALIZATION");
/* Grab some temporary storage */
stop = ct.num_states * ct.num_states;
#if !GAMMA_PT
stop = 2 * ct.num_states * ct.num_states;
#endif
Aij = (REAL *)get_mem(stop + 256);
Bij = (REAL *)get_mem(stop + 256);
Cij = (REAL *)get_mem(stop + 256);
work1 = (REAL *)get_mem(26 * ct.num_states);
work2 = work1 + 8*ct.num_states;
work3 = work2 + 8*ct.num_states;
work1R = work3 + 8*ct.num_states;
work1I = work1R + ct.num_states;
for(idx = 0;idx < stop;idx++) {
Aij[idx] = Bij[idx] = Cij[idx] = 0.0;
}
kidx = states[0].kidx;
for(st1 = 0;st1 < ct.num_states;st1++) {
subdiag1_mpi(st1, states, Aij, Bij);
} /* end for */
/* Sum A and B overlap matrices over all processors */
global_sums(Bij, &stop);
global_sums(Aij, &stop);
/* Generate the inverse of the B matrix and */
/* compute the cholesky factor of the matrix w/LAPACK */
#if GAMMA_PT
cholesky(Bij, ct.num_states);
#if CRAY_T3E
uplo_fcd = _cptofcd(uplo, 1);
/* now get the inverse */
spotri(uplo_fcd, &ct.num_states, Bij, &ct.num_states, &info);
#else
spotri(uplo, &ct.num_states, Bij, &ct.num_states, &info);
#endif
#else
#if CRAY_T3E
uplo_fcd = _cptofcd(uplo, 1);
cpotrf(uplo_fcd, &ct.num_states, Bij, &ct.num_states, &info);
if (info != 0)
error_handler("subdiag","cholesky factor of Bij failed");
cpotri(uplo_fcd, &ct.num_states, Bij, &ct.num_states, &info);
#else
cpotrf(uplo, &ct.num_states, Bij, &ct.num_states, &info);
if (info != 0)
error_handler("subdiag","cholesky factor of Bij failed");
cpotri(uplo, &ct.num_states, Bij, &ct.num_states, &info);
#endif
#endif
if (info != 0)
error_handler("subdiag","inverse of Bij failed");
for(st1=0;st1<ct.num_states;st1++){
for(st2=st1;st2<ct.num_states;st2++){
#if GAMMA_PT
Bij[st2*ct.num_states + st1] = Bij[st1*ct.num_states+st2];
#else
Bij[2*(st2*ct.num_states + st1)] = Bij[2*(st1*ct.num_states+st2)];
Bij[2*(st2*ct.num_states + st1)+1] = Bij[2*(st1*ct.num_states+st2)+1];
#endif
} /* end for */
} /* end for */
for(st1=0;st1<ct.num_states;st1++){
for(st2=0;st2<ct.num_states;st2++){
#if GAMMA_PT
Cij[st1*ct.num_states+st2] = ZERO;
#else
Cij[2*(st1*ct.num_states+st2)] = ZERO;
Cij[2*(st1*ct.num_states+st2)+1] = ZERO;
#endif
for(idx=0;idx<ct.num_states;idx++){
#if GAMMA_PT
Cij[st1*ct.num_states+st2] += Bij[idx*ct.num_states+st1] *
Aij[st2*ct.num_states+idx];
#else
Cij[2*(st1*ct.num_states+st2)] += Bij[2*(idx*ct.num_states+st1)] *
Aij[2*(st2*ct.num_states+idx)];
Cij[2*(st1*ct.num_states+st2)] -= Bij[2*(idx*ct.num_states+st1)+1] *
Aij[2*(st2*ct.num_states+idx)+1];
Cij[2*(st1*ct.num_states+st2)+1] += Bij[2*(idx*ct.num_states+st1)] *
Aij[2*(st2*ct.num_states+idx)+1];
Cij[2*(st1*ct.num_states+st2)+1] += Bij[2*(idx*ct.num_states+st1)+1] *
Aij[2*(st2*ct.num_states+idx)];
#endif
} /* end for */
} /* end for */
} /* end for */
for(st1=0;st1<ct.num_states;st1++){
for(st2=st1;st2<ct.num_states;st2++){
#if GAMMA_PT
Aij[st1*ct.num_states+st2] = Cij[st1*ct.num_states+st2];
Aij[st2*ct.num_states+st1] = Cij[st1*ct.num_states+st2];
#else
Aij[2*(st1*ct.num_states+st2)] = Cij[2*(st1*ct.num_states+st2)];
Aij[2*(st1*ct.num_states+st2)+1] = Cij[2*(st1*ct.num_states+st2)+1];
Aij[2*(st2*ct.num_states+st1)] = Cij[2*(st1*ct.num_states+st2)];
Aij[2*(st2*ct.num_states+st1)+1] = -Cij[2*(st1*ct.num_states+st2)+1];
#endif
} /* end for */
Aij[2*(st1*ct.num_states+st1)+1] = 0.0;
} /* end for */
#if GAMMA_PT
#if CRAY_T3E
/* Next we need to diagonalize A */
uplo_fcd = _cptofcd(uplo, 1);
jobz_fcd = _cptofcd(jobz, 1);
lwork = 3*ct.num_states;
ssyev(jobz_fcd, uplo_fcd, &ct.num_states, Aij, &ct.num_states,
work1, work2, &lwork, &info);
#else
lwork = 3*ct.num_states;
ssyev(jobz, uplo, &ct.num_states, Aij, &ct.num_states,
work1, work2, &lwork, &info);
#endif
#else
#if CRAY_T3E
uplo_fcd = _cptofcd(uplo, 1);
jobz_fcd = _cptofcd(jobz, 1);
lwork = 3*ct.num_states;
cheev(jobz_fcd, uplo_fcd, &ct.num_states, Aij, &ct.num_states,
work1, work2, &lwork, work3, &info);
#else
lwork = 3*ct.num_states;
cheev(jobz, uplo, &ct.num_states, Aij, &ct.num_states,
work1, work2, &lwork, work3, &info);
#endif
#endif
if (info != 0)
error_handler("subdiag","diag of Aij failed");
/* Do the orbital update in here */
subdiag2_mpi(Aij, states->psiR);
/* Renormalize the orbitals */
for (st1 = 0; st1 < ct.num_states; st1++) {
sp = &states[st1];
norm_psi(sp, ONE);
} /* st */
/* rotate stored projection-wf integrals */
for(idx = 0; idx < ct.num_ions; idx++) {
iptr = &ct.ions[idx];
for(ip = 0; ip < ct.max_nl; ip++) {
for(st1 = 0; st1 < ct.num_states; st1++) {
#if GAMMA_PT
work1R[st1] = 0.0;
#else
work1R[st1] = 0.0;
work1I[st1] = 0.0;
#endif
for(st2 = 0; st2 < ct.num_states; st2++) {
#if GAMMA_PT
work1R[st1] += Aij[st1*ct.num_states + st2] *
iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip];
#else
work1R[st1] += Aij[2*(st1*ct.num_states + st2)] *
iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip];
work1R[st1] -= Aij[2*(st1*ct.num_states + st2)+1] *
iptr->oldsintI[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip];
work1I[st1] += Aij[2*(st1*ct.num_states + st2)] *
iptr->oldsintI[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip];
work1I[st1] += Aij[2*(st1*ct.num_states + st2)+1] *
iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st2*ct.max_nl + ip];
#endif
} /* st2 */
} /* st1 */
#if GAMMA_PT
for(st1 = 0; st1 < ct.num_states; st1++) {
iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st1*ct.max_nl + ip] = work1R[st1];
} /* st1 */
#else
for(st1 = 0; st1 < ct.num_states; st1++) {
iptr->oldsintR[kidx*ct.num_states*ct.max_nl + st1*ct.max_nl + ip] = work1R[st1];
iptr->oldsintI[kidx*ct.num_states*ct.max_nl + st1*ct.max_nl + ip] = work1I[st1];
} /* st1 */
#endif
} /* ip */
} /* idx */
/* release our temporary storage */
release_mem(work1);
release_mem(Cij);
release_mem(Bij);
release_mem(Aij);
time2 = my_crtc();
md_timings(DIAG_TIME, (time2 - time1), 0);
} /* end subdiag */
#if GAMMA_PT
void subdiag1_mpi(int istate, STATE *states, REAL *Aij, REAL *Bij)
{
int kidx, idx, st2, ione=1;
int dimx, dimy, dimz;
REAL *sg_psi, *sg_twovpsi, *tmp_psi, *work1, *work2, *work3;
REAL *vtot;
STATE *sp, *sp1;
sp = &states[istate];
kidx = 0;
tmp_psi = get_mem(7*sp->sbasis);
sg_psi = tmp_psi + sp->sbasis;
work1 = sg_psi + sp->sbasis;
work2 = work1 + sp->sbasis;
work3 = work2 + sp->sbasis;
sg_twovpsi = work3 + sp->sbasis;
vtot = sg_twovpsi + sp->sbasis;
for(idx = 0;idx < sp->pbasis;idx++)
vtot[idx] = sp->vh[idx] + sp->vxc[idx] + sp->vnuc[idx];
dimx = sp->dimx;
dimy = sp->dimy;
dimz = sp->dimz;
gather_psi(tmp_psi, NULL, sp, 0);
/* Apply non-local operator to psi and store in work2 */
app_nl(tmp_psi, NULL, work2, NULL, sp->istate, FALSE, kidx, 0);
/* Generate 2*V*psi and store it in a smoothing grid */
genvpsi(tmp_psi, sg_twovpsi, vtot, work2, work3, ct.kp[sp->kidx].kmag, dimx, dimy, dimz);
/* B operating on 2*V*psi stored in work1 */
app_cir(sg_twovpsi, work1, dimx, dimy, dimz);
/* Pack psi into smoothing array */
pack_ptos(sg_psi, tmp_psi, dimx, dimy, dimz);
/* A operating on psi stored in work2 and B operating on psi stored in work3 */
app_cilr(sg_psi, work2, work3, dimx, dimy, dimz, sp->hxgrid, sp->hygrid, sp->hzgrid);
for(idx=0;idx< sp->pbasis;idx++){
work1[idx] = 0.5*ct.vel*(work1[idx] - work2[idx]);
} /* end for */
for(st2 = 0; st2 < ct.num_states;st2++) {
sp1 = &states[st2];
gather_psi(tmp_psi, NULL, sp1, 0);
Aij[istate*ct.num_states + st2] =
QMD_sdot(sp->pbasis, tmp_psi, ione, work1, ione);
/* <psi|B|psi> is symmetric */
if(st2 >= istate) {
Bij[istate*ct.num_states + st2] =
QMD_sdot(sp->pbasis, work3, ione, tmp_psi, ione);
Bij[st2*ct.num_states + istate] = Bij[istate*ct.num_states + st2];
} /* end if */
} /* st2 */
release_mem(tmp_psi);
} /* end subdiag1_mpi */
/* This routine is used to do the subspace rotation of the orbitals. Each
* thread handles a specific portion of the real space domain.
*/
void subdiag2_mpi(REAL *Aij, REAL *base_mem)
{
int idx, st1, st2;
REAL *rptr;
REAL *work1, *work2;
work1 = get_mem(2 * ct.num_states);
work2 = work1 + ct.num_states;
for(idx = 0; idx < P0_BASIS; idx++) {
rptr = base_mem;
/* We make a temporary copy and store it in work2 otherwise the
* cache thrashing is horrible.
*/
for(st2 = 0;st2 < ct.num_states;st2++)
work2[st2] = rptr[st2*P0_BASIS + idx];
for(st1 = 0;st1 < ct.num_states;st1++) {
work1[st1] = 0.0;
for(st2 = 0; st2 < ct.num_states; st2++) {
work1[st1] += Aij[st1*ct.num_states + st2] * work2[st2];
} /* st2 */
} /* end for */
/* update all wavefunctions for this *idx* */
for(st1 = 0;st1 < ct.num_states;st1++)
rptr[st1*P0_BASIS + idx] = work1[st1];
} /* idx */
release_mem(work1);
} /* end subdiag2_mpi */
#else
void subdiag1_mpi(int istate, STATE *states, REAL *Aij, REAL *Bij)
{
int idx, st1, st2, ione=1;
int dimx, dimy, dimz, kidx;
REAL *sg_psiR, *sg_psiI, *sg_twovpsiR, *sg_twovpsiI, *tmp_psiR, *tmp_psiI;
REAL *work1R, *work1I;
REAL *work2R, *work2I, *work3R, *work3I;
REAL *rptr, *vtot;
REAL *gx, *gy, *gz, *kdr;
REAL s1, s2, s3, s4;
STATE *sp, *sp1;
sp = &states[istate];
kidx = sp->kidx;
tmp_psiR = get_mem(18*sp->sbasis);
tmp_psiI = tmp_psiR + sp->sbasis;
sg_psiR = tmp_psiI + sp->sbasis;
sg_psiI = sg_psiR + sp->sbasis;
work1R = sg_psiI + sp->sbasis;
work1I = work1R + sp->sbasis;
work2R = work1I + sp->sbasis;
work2I = work2R + sp->sbasis;
work3R = work2I + sp->sbasis;
work3I = work3R + sp->sbasis;
sg_twovpsiR = work3I + sp->sbasis;
sg_twovpsiI = sg_twovpsiR + sp->sbasis;
vtot = sg_twovpsiI + sp->sbasis;
gx = vtot + sp->sbasis;
gy = gx + sp->sbasis;
gz = gy + sp->sbasis;
kdr = gz + sp->sbasis;
dimx = sp->dimx;
dimy = sp->dimy;
dimz = sp->dimz;
for(idx = 0;idx < sp->pbasis;idx++)
vtot[idx] = sp->vh[idx] + sp->vxc[idx] + sp->vnuc[idx];
gather_psi(tmp_psiR, tmp_psiI, sp, 0);
/* Apply non-local operator to psi and store in work2 */
app_nl(tmp_psiR, tmp_psiI, work2R, work2I, sp->istate, FALSE, sp->kidx, 0);
/* Pack psi into smoothing array and get image data */
pack_ptos(sg_psiR, tmp_psiR, dimx, dimy, dimz);
trade_images(sg_psiR, dimx, dimy, dimz, pct.neighbors);
pack_ptos(sg_psiI, tmp_psiI, dimx, dimy, dimz);
trade_images(sg_psiI, dimx, dimy, dimz, pct.neighbors);
/* Apply the gradient operator to psi */
app_grad((S0_GRID *)sg_psiI, (P0_GRID *)gx, (P0_GRID *)gy, (P0_GRID *)gz);
for(idx = 0;idx < P0_BASIS;idx++) {
kdr[idx] = (ct.kp[sp->kidx].kvec[0] * gx[idx] +
ct.kp[sp->kidx].kvec[1] * gy[idx] +
ct.kp[sp->kidx].kvec[2] * gz[idx]);
} /* end for */
/* Generate 2 * V * psi */
genvpsi(tmp_psiR, sg_twovpsiR, vtot, work2R,
kdr, ct.kp[sp->kidx].kmag, dimx, dimy, dimz);
/* Apply the gradient operator to psi */
app_grad((S0_GRID *)sg_psiR, (P0_GRID *)gx, (P0_GRID *)gy, (P0_GRID *)gz);
for(idx = 0;idx < P0_BASIS;idx++) {
kdr[idx] = -(ct.kp[sp->kidx].kvec[0] * gx[idx] +
ct.kp[sp->kidx].kvec[1] * gy[idx] +
ct.kp[sp->kidx].kvec[2] * gz[idx]);
} /* end for */
/* Generate 2 * V * psi */
genvpsi(tmp_psiI, sg_twovpsiI, vtot, work2I,
kdr, ct.kp[sp->kidx].kmag, dimx, dimy, dimz);
/* A operating on psi stored in work2 and B operating on psi stored in work3 */
app_cilr(sg_psiR, work2R, work3R, dimx, dimy, dimz,
sp->hxgrid, sp->hygrid, sp->hzgrid);
app_cilr(sg_psiI, work2I, work3I, dimx, dimy, dimz,
sp->hxgrid, sp->hygrid, sp->hzgrid);
/* B operating on 2*V*psi stored in work1 */
app_cir(sg_twovpsiR, work1R, dimx, dimy, dimz);
app_cir(sg_twovpsiI, work1I, dimx, dimy, dimz);
for(idx=0;idx< sp->pbasis;idx++)
work1R[idx] = 0.5*ct.vel*(work1R[idx] - work2R[idx]);
for(idx=0;idx< sp->pbasis;idx++)
work1I[idx] = 0.5*ct.vel*(work1I[idx] - work2I[idx]);
/* Compute the complex overlap matrices here */
for(st2 = 0; st2 < ct.num_states;st2++) {
sp1 = &states[st2];
gather_psi(tmp_psiR, tmp_psiI, sp1, 0);
s1 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work1R, ione);
s2 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work1I, ione);
s3 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work1R, ione);
s4 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work1I, ione);
Aij[2*(sp->istate*ct.num_states + st2)] = (s1 + s4);
Aij[2*(sp->istate*ct.num_states + st2) + 1] = (s3 - s2);
/* <psi|B|psi> is symmetric */
if(st2 >= istate) {
s1 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work3R, ione);
s2 = QMD_sdot(sp->pbasis, tmp_psiR, ione, work3I, ione);
s3 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work3R, ione);
s4 = QMD_sdot(sp->pbasis, tmp_psiI, ione, work3I, ione);
Bij[2*(sp->istate*ct.num_states + st2)] = ct.vel * (s1 + s4);
Bij[2*(sp->istate*ct.num_states + st2)+1] = -ct.vel * (s3 - s2);
Bij[2*(st2*ct.num_states + sp->istate)] = ct.vel * (s1 + s4);
Bij[2*(st2*ct.num_states + sp->istate)+1] = ct.vel * (s3 - s2);
} /* end if */
} /* st2 */
release_mem(tmp_psiR);
} /* end subdiag1_mpi */
/* This routine is used to do the subspace rotation of the orbitals. Each
* thread handles a specific portion of the real space domain.
*/
void subdiag2_mpi(REAL *Aij, REAL *base_mem)
{
int idx, st1, st2;
REAL *rptr;
REAL *work1R, *work2R;
REAL *work1I, *work2I;
work1R = get_mem(4 * ct.num_states);
work2R = work1R + ct.num_states;
work1I = work2R + ct.num_states;
work2I = work1I + ct.num_states;
rptr = base_mem;
for(idx = 0; idx < P0_BASIS; idx++) {
/* We make a temporary copy and store it in work2 otherwise the
* cache thrashing is horrible on the O2K.
*/
for(st2 = 0;st2 < ct.num_states;st2++) {
work2R[st2] = rptr[2*st2*P0_BASIS + idx];
work2I[st2] = rptr[2*st2*P0_BASIS + P0_BASIS + idx];
}
for(st1 = 0;st1 < ct.num_states;st1++) {
work1R[st1] = 0.0;
work1I[st1] = 0.0;
for(st2 = 0; st2 < ct.num_states; st2++) {
work1R[st1] += Aij[2*(st1*ct.num_states + st2)] * work2R[st2];
work1R[st1] -= Aij[2*(st1*ct.num_states + st2)+1] * work2I[st2];
work1I[st1] += Aij[2*(st1*ct.num_states + st2)] * work2I[st2];
work1I[st1] += Aij[2*(st1*ct.num_states + st2)+1] * work2R[st2];
} /* st2 */
} /* end for */
/* update all wavefunctions for this *idx* */
for(st1 = 0;st1 < ct.num_states;st1++) {
rptr[2*st1*P0_BASIS + idx] = work1R[st1];
rptr[2*st1*P0_BASIS + P0_BASIS + idx] = work1I[st1];
}
} /* idx */
release_mem(work1R);
} /* end subdiag2_mpi */
#endif
#endif