Generated from gramsch.c with ROBODoc v3.2.2 on Fri Sep 14 14:23:17 2001
TABLE OF CONTENTS
- QMD-MGDFT/gramsch.c
NAME
Ab initio real space code with multigrid acceleration
Quantum molecular dynamics package.
Version: 2.1.5
COPYRIGHT
Copyright (C) 1995 Emil Briggs
Copyright (C) 1998 Emil Briggs, Charles Brabec, Mark Wensell,
Dan Sullivan, Chris Rapcewicz, Jerzy Bernholc
Copyright (C) 2001 Emil Briggs, Wenchang Lu,
Marco Buongiorno Nardelli,Charles Brabec,
Mark Wensell,Dan Sullivan, Chris Rapcewicz,
Jerzy Bernholc
FUNCTION
void gram(KPOINT *kpt, REAL vel, int numst, int maxst, int numpt, int maxpt)
Performs gram-schmidt orthogonalization of the wavefunctions
INPUTS
kpt: structure KPOINT (see md.h),
including wave function for different kpoint
vel: ct.vel, volume for each grid
numst: number of states
maxst: leading dimention for numst, must be >= numst
numpt: number of grid
maxpt: leading dimension for numpt, must be >= numpt
OUTPUT
wave functions are updated
PARENTS
ortho_full.c
CHILDREN
fgram.f cholesky.c global_sums.c
SOURCE
#include "md.h"
#include <float.h>
#include <math.h>
#include <stdio.h>
#if GAMMA_PT
#if (MACHINE_TYPE == PARALLEL_SMP)
#include <pthread.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <unistd.h>
#include <stdlib.h>
char *transt = "t";
char *uplo = "l";
QMD_thread_barrier_struct bstruct1 = {0,
PTHREAD_MUTEX_INITIALIZER,
PTHREAD_COND_INITIALIZER};
QMD_thread_barrier_struct bstruct2 = {0,
PTHREAD_MUTEX_INITIALIZER,
PTHREAD_COND_INITIALIZER};
void gram(KPOINT *kpt, REAL vel, int numst, int maxst, int numpt, int maxpt)
{
/* On an SMP or NUMA system we don't actully use the arguments, the
* threads already know what they need to do so we just start them up.
*/
bstruct1.count = 0;
bstruct2.count = 0;
start_threads(SMP_ORTHO1);
wait_for_threads();
start_threads(SMP_ORTHO2);
wait_for_threads();
} /* end of gram */
/* This computes the overlap matrix, since it's a symmetric matrix we only
* compute the lower triangle.
*/
void ortho1_smp(SCF_THREAD_CONTROL *s)
{
int st1, st2, idx, ione=1, numst;
REAL zero=0.0, rone=1.0;
REAL *rptr, *rptr1;
int offset, size, step, count, numpt, lda;
int thread, reals_per_cline, reals_per_cache, rlda;
numst = ct.num_states;
numpt = s->numpt;
lda = s->lda;
reals_per_cline = CACHE_LINE_SIZE / sizeof(REAL);
reals_per_cache = L1_CACHE_LINES * reals_per_cline;
rlda = reals_per_cache / ct.num_states;
rptr = get_mem((ct.num_states+1)*rlda);
rptr1 = s->base_mem;
#if 0
/* If your BLAS libraries don't have a reentrant version of ssyrk then
* this kludge is needed.
*/
for(st1 = 0;st1 < ct.num_states;st1++){
for(st2 = 0;st2 < ct.num_states;st2++)
s->darr[st1*ct.num_states + st2] = 0.0;
count = 0;
step = rlda;
while(count < numpt) {
if((step + count) > numpt) step = numpt - count;
for(st2=0;st2<ct.num_states;st2++)
QMD_scopy( step, &rptr1[st2 * lda + count], ione, &rptr[st2*rlda], ione );
for(st2 = st1;st2 < ct.num_states;st2++){
for(idx = 0;idx < step;idx++) {
s->darr[st1*ct.num_states + st2] += rptr[idx + st1*rlda]*rptr[idx + st2*rlda];
} /* end for */
} /* end for */
count += step;
} /* end while */
} /* end for */
#else
/* A platform optimized BLAS call is usually faster -- if it's reentrant */
ssyrk( uplo, transt, &numst, &s->numpt, &rone, rptr1, &s->lda,
&zero, s->darr, &numst);
#endif
QMD_thread_barrier(&bstruct1);
/* Everyone has finished their local overlaps so now we need to sum them */
size = ct.num_states * ct.num_states / ct.num_threads;
offset = s->tid * size;
if(s->tid == (ct.num_threads - 1))
size += (ct.num_states * ct.num_states % ct.num_threads);
/* Sum up the stuff that this thread is responsible for */
for(thread = 0;thread < ct.num_threads;thread++){
if(thread != s->tid) {
QMD_saxpy(size, rone, &thread_control[thread].darr[offset], ione,
&thread_control[s->tid].darr[offset], ione );
} /* end if */
} /* end for */
/* Now make sure everyone else has a copy of my stuff */
for(thread = 0;thread < ct.num_threads;thread++){
if(thread != s->tid) {
QMD_scopy(size, &thread_control[s->tid].darr[offset], ione,
&thread_control[thread].darr[offset], ione);
} /* end if */
} /* end for */
QMD_thread_barrier(&bstruct2);
/* Compute the Choleski factorization */
cholesky(s->darr, numst);
release_mem(rptr);
} /* end ortho1_smp */
/* Performs the wavefunction update using the inverse of the cholesky factor.
* If we have a reentrant version of the BLAS routine sger we could probably
* make this much faster. (See the T3E subroutine for the routine)
*/
void ortho2_smp(SCF_THREAD_CONTROL *s)
{
int st1, st2, idx, ione=1, numpt, lda;
REAL *c, tmp, alpha=-1.0;
REAL *rptr;
int reals_per_cline, reals_per_cache;
int rlda, step, count;
c = s->base_mem;
numpt = s->numpt;
lda = s->lda;
reals_per_cline = CACHE_LINE_SIZE / sizeof(REAL);
reals_per_cache = L1_CACHE_LINES * reals_per_cline;
rlda = reals_per_cache / ct.num_states;
rptr = get_mem((ct.num_states+1)*rlda);
/* We block this to improve performance on cache based architectures.
* and copy to a temporary array to reduce TLB thrashing.
*/
count = 0;
step = rlda;
while(count < numpt) {
if((step + count) > numpt) step = numpt - count;
for(st1=0;st1<ct.num_states;st1++)
QMD_scopy( step, &c[st1 * lda + count], ione, &rptr[st1*rlda], ione );
for (st1 = 0; st1 < ct.num_states; st1++) {
/* normalize c[st] */
tmp = 1.0 / s->darr[st1 + ct.num_states * st1];
QMD_sscal( step, tmp, &rptr[st1 * rlda], ione);
for(st2 = st1+1;st2 < ct.num_states;st2++){
alpha = -s->darr[st2 + ct.num_states*st1];
QMD_saxpy(step, alpha, &rptr[st1 * rlda], ione, &rptr[st2*rlda], ione);
} /* end for */
} /* end for */
/* Copy the updated section of the orbital back to it's regular storage */
for(st1=0;st1<ct.num_states;st1++)
QMD_scopy( step, &rptr[st1*rlda], ione, &c[st1 * lda + count], ione );
count += step;
} /* end while */
tmp = 1.0 / sqrt(ct.vel);
idx = ct.num_states * s->lda;
QMD_sscal(idx, tmp, &c[0], ione);
release_mem(rptr);
/* For efficiency reasons we always generate the thread local copies of
* the density immediately after the orthogonalization step.
*/
get_rho_smp(s);
} /* end ortho2_smp */
#else
#if (CRAY_T3D || CRAY_T3E || CRAY_C90)
#include <fortran.h>
#endif
#if (CRAY_T3D || CRAY_T3E || CRAY_C90)
#else
void sgemv(char *trans, int *m, int *n, REAL *alpha, REAL *a, int *lda,
REAL *x, int *incx, REAL *beta, REAL *y, int *incy);
#endif
char *transn = "n";
char *transt = "t";
char *uplo = "l";
char *uphi = "u";
void gram(KPOINT *kpt, REAL vel, int numst, int maxst, int numpt, int maxpt)
{
int st, info, length, idx, idj;
int ione = 1;
REAL alpha = -1.0;
REAL zero = 0.0;
REAL one = 1.0;
REAL tmp, *darr, *c;
STATE *sp;
char trans;
#if (CRAY_T3D || CRAY_T3E || CRAY_C90)
_fcd trans_fcd, uplo_fcd, transa, transb, cuplo;
#endif
sp = kpt->kstate;
c = sp->psiR;
darr = get_mem(ct.num_states * ct.num_states );
trans = 't';
#if (CRAY_T3D || CRAY_T3E || CRAY_C90)
trans_fcd = _cptofcd(&trans, 1);
transa = _cptofcd(transt, 1);
transb = _cptofcd(transn, 1);
cuplo = _cptofcd(uplo, 1);
SSYRK( cuplo, transa, &numst, &numpt, &one, c, &numpt,
&zero, darr, &numst);
#else
ssyrk( uplo, transt, &numst, &numpt, &one, c, &numpt,
&zero, darr, &numst);
#endif
/* get the global part */
length = maxst * maxst;
global_sums(darr, &length);
/* compute the cholesky factor of the overlap matrix */
cholesky(darr, maxst);
/* apply inverse of cholesky factor to states */
for (st = 0; st < numst; st++) {
/* normalize c[st] */
tmp = 1.0 / darr[st + maxst * st];
QMD_sscal( numpt, tmp, &c[st * maxpt], ione);
/* subtract the projection along c[st] from the remaining vectors */
idx = numst - st - 1;
if(idx)
sger(&numpt, &idx, &alpha, &c[st * maxpt], &ione,
&darr[(st+1) + maxst*st], &ione, &c[(st+1) * maxpt], &maxpt);
} /* end of for */
tmp = 1.0 / sqrt(vel);
idx = numst * maxpt;
QMD_sscal(idx, tmp, &c[0], ione);
release_mem(darr);
} /* end of gram */
#endif
#else
#if (MACHINE_TYPE == PARALLEL_SMP)
/* This section is for complex orbitals */
#include <pthread.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sched.h>
#include <unistd.h>
#include <stdlib.h>
char *transt = "t";
char *uplo = "l";
QMD_thread_barrier_struct bstruct1 = {0,
PTHREAD_MUTEX_INITIALIZER,
PTHREAD_COND_INITIALIZER};
QMD_thread_barrier_struct bstruct2 = {0,
PTHREAD_MUTEX_INITIALIZER,
PTHREAD_COND_INITIALIZER};
void cpotrf(char *uplo, int *n, REAL *a, int *lda, int *info);
void gram(KPOINT *kpt, REAL vel, int numst, int maxst, int numpt, int maxpt)
{
int thread;
bstruct1.count = 0;
bstruct2.count = 0;
for(thread = 0;thread < ct.num_threads;thread++)
thread_control[thread].kidx = kpt->kidx;
start_threads(SMP_ORTHO1);
wait_for_threads();
start_threads(SMP_ORTHO2);
wait_for_threads();
/* If this is the last k-point then compute the density */
if(kpt->kidx == (ct.num_kpts-1)) {
start_threads(SMP_GET_RHO);
wait_for_threads();
} /* end if */
} /* end of gram */
void ortho1_smp(SCF_THREAD_CONTROL *s)
{
int stop, st1, st2, idx, ione=1, koffset;
int lda, info;
int thread;
REAL s1, s2, s3, s4;
REAL *psi1R, *psi1I, *psi2R, *psi2I;
koffset = 2 * s->kidx * s->lda * ct.num_states;
lda = s->lda;
stop = s->numpt;
for(idx = 0;idx < 2*ct.num_states*ct.num_states;idx++)s->darr[idx] = 0.0;
for(st1 = 0;st1 < ct.num_states;st1++) {
psi1R = &s->base_mem[koffset + 2*st1*lda];
psi1I = psi1R + s->lda;
for(st2 = st1;st2 < ct.num_states;st2++) {
s->darr[2*st1*ct.num_states + 2*st2] = ZERO;
s->darr[2*st1*ct.num_states + 2*st2 + 1] = ZERO;
psi2R = &s->base_mem[koffset + 2*st2*lda];
psi2I = psi2R + s->lda;
s1 = QMD_sdot(stop, psi1R, ione, psi2R, ione);
s2 = QMD_sdot(stop, psi1R, ione, psi2I, ione);
s3 = QMD_sdot(stop, psi1I, ione, psi2R, ione);
s4 = QMD_sdot(stop, psi1I, ione, psi2I, ione);
s->darr[2*st1*ct.num_states + 2*st2] += s1;
s->darr[2*st1*ct.num_states + 2*st2] += s4;
s->darr[2*st1*ct.num_states + 2*st2 + 1] += s3;
s->darr[2*st1*ct.num_states + 2*st2 + 1] -= s2;
s->darr[2*st2*ct.num_states + 2*st1] = s->darr[2*st1*ct.num_states + 2*st2];
s->darr[2*st2*ct.num_states + 2*st1 + 1] = s->darr[2*st1*ct.num_states + 2*st2 + 1];
} /* end for */
} /* end for */
QMD_thread_barrier(&bstruct1);
/* Everyone has finished their local overlaps so now we need to sum them.
* We only have one thread do the sums.
*/
if(!s->tid) {
for(thread = 1;thread < ct.num_threads;thread++){
for(idx = 0;idx < 2*ct.num_states * ct.num_states;idx++)
s->darr[idx] += thread_control[thread].darr[idx];
} /* end for */
/* Next compute the cholesy factorization */
cpotrf(uplo, &ct.num_states, s->darr, &ct.num_states, &info);
if (info != 0)
error_handler("gram","cholesky factor of Darr failed");
/* And make sure that everyone has a copy of it */
for(thread = 1;thread < ct.num_threads;thread++){
for(idx = 0;idx < 2*ct.num_states * ct.num_states;idx++)
thread_control[thread].darr[idx] = s->darr[idx];
} /* end for */
} /* end if */
} /* end ortho1_smp */
void ortho2_smp(SCF_THREAD_CONTROL *s)
{
int st1, st2, idx, lda, koffset, stop, ione=1;
REAL *psi1R, *psi1I, *psi2R, *psi2I;
REAL tmp, alphaR, alphaI;
int step, count, rlda, reals_per_cline, reals_per_cache;
reals_per_cline = CACHE_LINE_SIZE / sizeof(REAL);
reals_per_cache = L1_CACHE_LINES * reals_per_cline;
rlda = reals_per_cache / ct.num_states;
rlda /= 2;
koffset = 2 * s->kidx * s->lda * ct.num_states;
lda = s->lda;
stop = s->numpt;
/* Update the orbitals */
for(st1 = 0;st1 < ct.num_states;st1++) {
/* Normalize first orbital */
tmp = 1.0 / s->darr[2*st1 + 2*ct.num_states*st1];
psi1R = &s->base_mem[koffset + 2*st1*lda];
psi1I = psi1R + s->lda;
QMD_sscal( stop, tmp, psi1R, ione);
QMD_sscal( stop, tmp, psi1I, ione);
count = 0;
step = rlda;
while(count < stop) {
if((step + count) > stop) step = stop - count;
/* Subtract projections along remaining orbitals */
for(st2 = st1 + 1;st2 < ct.num_states;st2++) {
psi2R = &s->base_mem[koffset + 2*st2*lda] + count;
psi2I = psi2R + s->lda;
psi1R = &s->base_mem[koffset + 2*st1*lda] + count;
psi1I = psi1R + s->lda;
alphaR = -s->darr[2*(st2 + ct.num_states*st1)];
alphaI = -s->darr[2*(st2 + ct.num_states*st1)+1];
QMD_saxpy(step, alphaR, psi1R, ione, psi2R, ione);
QMD_saxpy(step, alphaI, psi1I, ione, psi2R, ione);
alphaI = - alphaI;
QMD_saxpy(step, alphaI, psi1R, ione, psi2I, ione);
QMD_saxpy(step, alphaR, psi1I, ione, psi2I, ione);
} /* end for */
count += step;
} /* end while */
} /* end for */
tmp = 1.0 / sqrt(ct.vel);
idx = 2 * ct.num_states * s->lda;
psi1R = &s->base_mem[koffset];
QMD_sscal(idx, tmp, psi1R, ione);
}
#else
/* Complex orbitals on Message-passing machines */
#if CRAY_T3E
#include <complex.h>
#endif
typedef struct {
REAL r;
REAL i;
} QMD_complex;
void fgram (REAL *c, REAL *vel, int *numst, int *maxst, int *numpt,
int *maxpt, REAL *dr);
void gram(KPOINT *kpt, REAL vel, int numst, int maxst, int numpt, int maxpt)
{
int idx, st1, st2, ione=1, ns;
STATE *sp;
REAL *c;
QMD_complex *dr, *z, *tarr;
sp = kpt->kstate;
c = sp->psiR;
tarr = get_mem(2 * maxpt );
dr = (QMD_complex *)get_mem(2 * maxst );
for(st1 = 0;st1 < ct.num_states;st1++) {
for(idx = 0;idx < numpt;idx++) {
tarr[idx].r = c[2*st1*maxpt+idx];
tarr[idx].i = c[2*st1*maxpt+maxpt+idx];
}
z = (QMD_complex *)&c[2*st1*maxpt];
for(idx = 0;idx < maxpt;idx++) z[idx] = tarr[idx];
}
fgram(c, &vel, &numst, &numst, &numpt, &numpt, (REAL *)dr);
for(st1 = 0;st1 < ct.num_states;st1++) {
z = (QMD_complex *)&c[2*st1*maxpt];
for(idx = 0;idx < numpt;idx++) {
tarr[idx] = z[idx];
}
for(idx = 0;idx < maxpt;idx++) {
c[2*st1*maxpt+idx] = tarr[idx].r;
c[2*st1*maxpt+maxpt+idx] = tarr[idx].i;
}
}
release_mem(dr);
release_mem(tarr);
} /* end gram */
#endif
#endif