/*
DFT++ is a density functional package developed by the research group
of Professor Tomas Arias
Copyright 1996-2003 Sohrab Ismail-Beigi
This file is part of DFT++.
DFT++ is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
DFT++ is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with DFT++; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Please see the file CREDITS for a list of authors.
For academic users, we request that publications using results obtained with
this software reference
"New algebraic formulation of density functional calculation," by Sohrab Ismail-Beigi
and T.A. Arias, Computer Physics Communications 128:1-2, 1-45 (June 2000).
and, if using the wavelet basis, further reference
"Multiresolution analysis of electronic structure: semicardinal and wavelet bases,"
T.A. Arias, Reviews of Modern Physics 71:1, 267-311 (January 1999).
and
"Robust ab initio calculation of condensed matter: transparent convergence through
semicardinal multiresolution analysis,'' I.P. Daykov, T.A. Arias, and
Torkel D. Engeness, Physical Review Letters, 90:21, 216402 (May 2003).
For your convenience, preprints of the above articles may be obtained from
http://arXiv.org/abs/cond-mat/9909130, 9805262, and 0204411, respectively.
*/
#include "header.h"
/*********************
* copy constructor *
*********************/
ColumnBundle::ColumnBundle(const ColumnBundle &C)
{
// copy the structure
C.copy_structure_to(*this);
// copy the data!
data = C.data;
}
/***********************
* regular constructor *
***********************/
ColumnBundle::ColumnBundle(int length, int ncol, Basis *basis)
{
tot_ncols = ncol;
my_ncols = ncol;
start_ncol = 0;
end_ncol = ncol-1;
col_length = length;
init(tot_ncols, basis, "distributed");
}
ColumnBundle::ColumnBundle(int ncols, Basis *basis, const char *distr)
{
init(ncols, basis, distr);
}
/////////////////////////////////////////////
// Destructor: frees the memory used //
/////////////////////////////////////////////
ColumnBundle::~ColumnBundle()
{
destroy_bands(my_ncols,col);
myfree(col);
data.free();
}
/*****************************************************************************
* Here, we take an existing ColumnBundle, and make a new one using *
* it as a template. This entails creating our own Column structure, then *
* calling init() to actually allocate the memory chunk (and point the Column*
* datapointers to the right place). *
*****************************************************************************/
void
ColumnBundle::copy_structure_to(ColumnBundle &C) const
{
C.tot_ncols = tot_ncols;
C.my_ncols = my_ncols;
C.start_ncol = start_ncol;
C.end_ncol = end_ncol;
C.col_length = col_length;
C.distributed = distributed;
// dft_log("&(col[0]->basis->nbasis) = %d, col[0]->basis->nbasis= %d\n",
// &(col[0]->basis->nbasis), col[0]->basis->nbasis);
C.init(tot_ncols, col[0]->basis, C.distributed?"distributed":"local");
}
/*****************************************************************************
* This creates a ColumnBundle from scratch, using the number of
* columns, * the quantum number, the basis spec. "str" tells us if we
* are "distributed"* or not. *
*****************************************************************************/
void
ColumnBundle::init(int ncols, Basis *basis, const char *str)
{
tot_ncols = ncols;
// let's work out how to distribute ourselves
distributed = FALSE;
#ifdef DFT_MPI
if(!strcmp(str, "distributed"))
distributed = TRUE;
#endif
if(distributed)
distribute_cols();
else{
my_ncols = tot_ncols;
start_ncol = 0;
end_ncol = tot_ncols-1;
}
// allocate our array of columns
col = (CoeffSpaceWavefunctionColumn **)mymalloc(my_ncols*sizeof(CoeffSpaceWavefunctionColumn*), "ColumnBundle::init(int ncols, Basis &basis, const char *str)",
"col");
// actually create our column structure
create_bands(my_ncols, col, basis);
// currently, lets assume each columns is the same length.
// if not, then major changes will have to be made to ^ and a bunch
// of other things as well
// the length of the column is a basis dependent thing,
// so we can only set it after the columns have been created.
// here we set the bundle col_length equal to these
if(basis != NULL)
// ipd: 08/13/02, sometimes with MPI we have mpi_processes
// that hold bundles with no columns. so we can't dereference
// col. Let's take the length from the basis, so that it is
// properly set, even if there are no columns
// (we need this so that many consistency checks keep working)
// col_length = col[0]->length;
col_length = basis->nbasis;
// initialize us (get memory, set datapointers inside the columns
allocate_memory();
}
/*
* Allocate/free an array of column bundles according to the ones in
* bloch states: used in the minimization algorithm */
ColumnBundle **alloc_ColumnBundle_array(int nbundles,BlochState *states)
{
ColumnBundle **CB = (ColumnBundle **)mymalloc(nbundles*sizeof(ColumnBundle*),
"alloc_ColumnBundle_aray()",
"CB");
int i;
for(i=0; i<nbundles; i++){
CB[i]= new ColumnBundle(states[i].C);
//CB[i]=new Column
}
return CB;
}
//VERIFY how to call the destructors
void free_ColumnBundle_array(int nbundles,ColumnBundle **Y)
{
int i;
for(i=0; i<nbundles; i++)
delete Y[i]; // cal ~{} on each bundle
delete []Y; //destroy the pointers
}
void
ColumnBundle::negate()
{
data.negate();
}
void
create_bands(int ncols, CoeffSpaceWavefunctionColumn **cols,
Basis *b)
{
// may be there is nothing to do
if (ncols==0) return;
int i;
for(i = 0; i < ncols; i++){
cols[i] = new CoeffSpaceWavefunctionColumn(b, "slim" );
if(b != NULL)
cols[i]->init_wavefunction(*b);
}
}
void
destroy_bands(int ncols, CoeffSpaceWavefunctionColumn **cols)
{
int i;
for(i = 0; i < ncols; i++)
delete cols[i];
}
/*************************************************************************
* allocate_memory() assumes that there is a valid Column structure in *
* place and it allocates memory for our data. then it points the *
* datapointers inside our columns to the appropriate place in the *
* bigass chunk *
*************************************************************************/
void
ColumnBundle::allocate_memory()
{
if (my_ncols==0) return;
data.init(my_ncols*col_length);
for(int i=0; i < my_ncols; i++){
col[i]->length = col_length;
col[i]->data.ndata = col_length;
col[i]->data.d = data.d+i*col_length;
}
}
void
ColumnBundle::get_scratch_memory()
{
data.get_temp(my_ncols*col_length);
for(int i=0; i < my_ncols; i++){
col[i]->data.ndata = col_length;
col[i]->data.d = data.d+i*col_length;
}
}
void
ColumnBundle::free_scratch_memory()
{
data.release_temp();
}
/////////////////////////////////////////////
// Generate my_ncols, start_ncol, end_ncol //
// from tot_ncols for a distributed //
// ColumnBundle case //
// This must also work for the single node //
// case where it just distributed all //
// columns to a single node (i.e. serial //
// case). //
// The distribution is as follows: //
// d = tot_ncols/N_Procs //
// r = tot_ncols%N_Procs //
// the first r processes get d+1 columns //
// and the rest get d columns //
// and the order of distribution is just //
// linear in the columns, i.e. columns //
// 0..d go to process 0, columns d+1,2*d+1 //
// go to process 1, etc. //
/////////////////////////////////////////////
void
ColumnBundle::distribute_cols()
{
int d,r;
d = tot_ncols / System::Get_N_Procs();
r = tot_ncols % System::Get_N_Procs();
// If the process is one of the first r, then it gets d+1 columns
if (System::Get_procID() < r) {
my_ncols = d+1;
start_ncol = (d+1)*System::Get_procID();
}
// Otherwise, it gets d columns
else {
my_ncols = d;
start_ncol = (d+1)*r + d*(System::Get_procID()-r);
}
// the last column owned by the process (inclusive)
end_ncol = start_ncol + my_ncols - 1;
}
/////////////////////////////////////////////
// Get Processor ID from column number //
// Namely, given a column_number, find the //
// processor number of its owner based on //
// above distribution //
/////////////////////////////////////////////
int
ColumnBundle::ncol_to_ID(int col_index) const
{
int d,r,id;
d = tot_ncols / System::Get_N_Procs();
r = tot_ncols % System::Get_N_Procs();
// If it's one of the columns belonging to the first r processes
// which have d+1 columns each...
if (col_index < r*(d+1))
id = col_index / (d+1);
// otherwise, it belongs to the set of processes with d columns
else
id = (col_index - r) / d;
// In the unlikely but bizzare case...
if (id >= System::Get_N_Procs() || id < 0)
die("In ColumnBundle::ncol_to_ID, id >= N_Procs or id < 0\n");
return id;
}
//////////////////////////////////////
// //
// Operator functions //
// //
//////////////////////////////////////
/* Assignment: nonstandard in that it returns void. To make it standard,
* replace void -> ColumnBundle and uncomment the return *this; */
void
ColumnBundle::operator=(const ColumnBundle &Y)
{
/* The sizes must agree */
if (tot_ncols != Y.tot_ncols)
die("In ColumnBundle::operator=, tot_ncols != Y.tot_ncols\n");
if (my_ncols != Y.my_ncols)
die("In ColumnBundle::operator=, my_ncols != Y.my_ncols\n");
if (col_length != Y.col_length)
die("In ColumnBundle::operator=, col_length != Y.col_length\n");
if (distributed != Y.distributed)
die("In ColumnBundle::operator=, distributed != Y.distributed\n");
data = Y.data;
}
void
ColumnBundle::zero_out(void)
{
data.zero_out();
}
/************************************************
* IPD: Linear Algebra - call the array algebra *
***********************************************/
void
ColumnBundle::operator+=(const ColumnBundle &in)
{
// check the quantum numbers
//...............
//...............
data += in.data;
}
void
ColumnBundle::operator-=(const ColumnBundle &in)
{
data -= in.data;
}
void
ColumnBundle::operator*=(const double &s)
{
data *= s;
}
void
ColumnBundle::operator*=(const complex &s)
{
data *= s;
}
void
ColumnBundle::operator/=(const double &s)
{
data*=(1./s);
}
void
ColumnBundle::operator/=(const complex &s)
{
data*= (1./s);
}
/**** Binary operators - call the unary operators ********/
ColumnBundle operator+(const ColumnBundle &a, const ColumnBundle &b)
{
ColumnBundle out(a);
out+=b;
return out;
}
ColumnBundle operator-(const ColumnBundle &a, const ColumnBundle &b)
{
ColumnBundle out(a);
out -= b;
return out;
}
ColumnBundle operator*(const double &d, const ColumnBundle &b)
{
ColumnBundle out(b);
out *= d;
return out;
}
ColumnBundle operator*(const ColumnBundle &b, const double &d)
{
ColumnBundle out(b);
out *= d;
return out;
}
ColumnBundle operator*(const complex &d, const ColumnBundle &b)
{
ColumnBundle out(b);
out *= d;
return out;
}
ColumnBundle operator*(const ColumnBundle &b, const complex &d)
{
ColumnBundle out(b);
out *= d;
return out;
}
ColumnBundle operator/(const ColumnBundle &b, const complex &c)
{
ColumnBundle out(b);
out /= c;
return out;
}
/* Does Yout += s*Yin */
void scale_accumulate(real r,const ColumnBundle &Yin,ColumnBundle &Yout)
{
scale_accumulate(r, Yin.data, Yout.data);
}
/* Does Yout[k] += s * Yin[k] */
void scale_accumulate(int nbundles,real r,ColumnBundle **Yin,
ColumnBundle **Yout)
{
for (int k=0; k < nbundles; k++)
scale_accumulate(r,*Yin[k],*Yout[k]);
}
/* Does Yout = s1*Y1 + s2*Y2 */
void
scaled_sum(real r1,ColumnBundle &Y1,
real r2,ColumnBundle &Y2,
ColumnBundle &Yout)
{
if (Y1.my_ncols != Y2.my_ncols || Y1.my_ncols != Yout.my_ncols ||
Y1.col_length != Y2.col_length || Y1.col_length != Yout.col_length)
die("Incomaptible sizes in scaled_sum(s1,Y1,s2,Y2,Yout)");
scaled_sum(r1, Y1.data, r2, Y2.data, Yout.data);
}
/* Does Yout[i] = s1*Y1[i] + s2*Y2[i] */
void
scaled_sum(int nbundles,
real r1,ColumnBundle **Y1,
real r2,ColumnBundle **Y2,
ColumnBundle **Yout)
{
for (int i=0; i < nbundles; i++)
scaled_sum(r1,*Y1[i],r2,*Y2[i],*Yout[i]);
}
/* Sum of absolute squares of all elements in Y */
real
abs2(const ColumnBundle &Y)
{
real Y2;
int i,j;
Y2 = 0.0;
for (i=0; i < Y.my_ncols; i++)
for (j=0; j < Y.col_length; j++){
#if defined WFS_COMPLEX
Y2 += Y.col[i]->data.d[j].x * Y.col[i]->data.d[j].x +
Y.col[i]->data.d[j].y * Y.col[i]->data.d[j].y;
#elif defined WFS_REAL
Y2 += Y.col[i]->data.d[j] * Y.col[i]->data.d[j];
#else
#error wavefnctions are neither real nor complex!
#endif
}
// If the object is distributed, do a global
// summation across the processors.
if (Y.distributed)
{
#ifdef DFT_MPI
#ifdef DFT_PROFILING
timerOn(13); // Turn on other MPI_Allreduce timer
counterIncr(21); // other reduces counter
counterIncr(22,sizeof(double)); // Bytes reduced
#endif // DFT_PROFILING
real temp = 0.0;
MPI_Allreduce ( &Y2, &temp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
Y2 = temp;
#ifdef DFT_PROFILING
timerOff(13); // Turn off other MPI_Allreduce timer
#endif // DFT_PROFILING
#endif // DFT_MPI
}
return Y2;
}
/* Sum of absolute squares of all elements in array of ColumnBundles Y:
* i.e. it is just the sum of abs2() on all nbundle elements of Y */
real
abs2(int nbundles,ColumnBundle **Y)
{
int i;
real Y2;
Y2 = 0.0;
for (i=0; i < nbundles; i++)
Y2 += abs2(*Y[i]);
return Y2;
}
/* Take "dot-product" of two ColumnBundles: REsum the diagonals of Y1^Y2 */
real dot(const ColumnBundle &Y1,const ColumnBundle &Y2)
{
if (Y1.tot_ncols != Y2.tot_ncols)
die("Y1.tot_ncols != Y2.tot_ncols in dot_ColumnBundles\n");
if (Y1.my_ncols != Y2.my_ncols) {
dft_log(DFT_SILENCE,
"Different distribution of the same size arrays!!\n");
die("Y.my_ncols != Y2.my_ncols in dot_ColumnBundles\n");
}
if (Y1.col_length != Y2.col_length)
die("Y1.col_length != Y2.col_length in dot_ColumnBundles\n");
if (Y1.distributed != Y2.distributed)
die("Y1, Y2 are not the same distributed type\n");
int j;
int i;
real d;
d = 0.0;
for (i=0; i < Y1.my_ncols; i++)
for (j=0; j < Y1.col_length; j++){
/* do d += RE(conj(Y1.col[i].c[j])*Y2.col[i].c[j];) */
#ifdef WFS_COMPLEX
d += Y1.col[i]->data.d[j].x*Y2.col[i]->data.d[j].x +
Y1.col[i]->data.d[j].y*Y2.col[i]->data.d[j].y ;
#elif defined WFS_REAL
d += Y1.col[i].data.d[j]*Y2.col[i].data.d[j];
#else
#error wavefunctions are neither real nor complex!
#endif
}
// If the object is distributed, then do a global summation
if ( Y1.distributed ){
#ifdef DFT_MPI
real temp = 0.0;
#ifdef DFT_PROFILING
timerOn(13); // Turn on other MPI_Allreduce timer
counterIncr(21,1); // reduce count
counterIncr(22,sizeof(real)); // byte count
#endif // DFT_PROFILING
MPI_Allreduce ( &d, &temp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
d = temp;
#ifdef DFT_PROFILING
timerOff(13); // Turn off other MPI_Allreduce timer
#endif // DFT_PROFILING
#endif // DFT_MPI
}
return d;
}
/* Take "dot-product" of nbundles pairs of ColumnBundles: i.e.
* do dot_ColumnBundles does, but just loop over the pairs. */
real dot(int nbundles,ColumnBundle **Y1,ColumnBundle **Y2)
{
int i;
real d;
d = 0.0;
for (i=0; i < nbundles; i++)
d += dot(*Y1[i],*Y2[i]);
return d;
}
Matrix operator^(const ColumnBundle &Y1,const ColumnBundle &Y2)
{
// Y1 is Nxn1, Y2 is Nxn2, and Y1dY2 is n1xn2.
int n1, n2, N;
n1 = Y1.my_ncols;
n2 = Y2.my_ncols;
N = Y1.col_length;
if (Y1.col_length != Y2.col_length)
die("In operator ^ on ColumnBundles, Y1.col_length != Y2.col_length\n");
#ifdef DFT_PROFILING
timerOn(3); // Y1^Y2 timer
counterIncr(1); // Y1^Y2 count
// Y1^Y2 flop count
counterIncr(0, 8.0*(double)n1*(double)Y2.tot_ncols*(double)N);
#endif // DFT_PROFILING
Matrix Y1dY2(Y1.tot_ncols,Y2.tot_ncols);
// zero out Y1dY2
Y1dY2.zero_out();
#ifdef DFT_MPI // with MPI, Y1 or Y2 might be distributed
// both Y1 and Y2 are distribyted
if ( (Y1.distributed == 1) && (Y2.distributed == 1) ) {
// use the code in dist_multiply.c by Ken Esler.
do_Y1dag_Y2_distributed(Y1, Y2, Y1dY2);
#ifdef DFT_PROFILING
timerOff(3); // turn off Y1^Y2 timer
counterIncr(2); // Distributed Y1^Y2 counter
#endif // DFT_PROFILING
// done
return Y1dY2;
}
// Y1 is local, Y2 is distributed
else if ( Y1.distributed == 0 ) {
// At the end, this needs a global reduction (see below).
// do the multiply with proper offset
Y1dagY2_block_matrix_mult(Y1,Y2,Y1dY2,n1,n2,N,0,Y2.start_ncol,0,0);
}
// Y1 is distributed, Y2 is local
else if ( Y2.distributed == 0 ) {
// At the end, this needs a global reduction (see below).
// do the multiply with proper offset
Y1dagY2_block_matrix_mult(Y1,Y2,Y1dY2,n1,n2,N,Y1.start_ncol,0,0,0);
}
#endif //DFT_MPI
// Y1 and Y2 are both local.
if ( (Y1.distributed == 0) && (Y2.distributed == 0) ) {
// do the multiply
Y1dagY2_block_matrix_mult(Y1,Y2,Y1dY2,n1,n2,N,0,0,0,0);
#ifdef DFT_PROFILING
timerOff(3); // turn off Y1dag Y2 timer
#endif // DFT_PROFILING
// both are local, so we've got the final answer! We're done.
return Y1dY2;
}
#ifdef DFT_MPI
// If we are running MPI and have distributed ColumnBundles,
// then we must do a global reduction of all the processors's
// Y1dY2 matrices into result to get the final answer.
Matrix result(Y1.tot_ncols, Y2.tot_ncols);
// zero out where the final answer is going
result.zero_out();
// do the global sum over all processors
#ifdef DFT_PROFILING
timerOn(12); // Turn on Y1^Y2 MPI_Allreduce timer
// Number of bytes reduced
counterIncr(3,sizeof(scalar)*(double)Y1.tot_ncols*(double)Y2.tot_ncols);
#endif
MPI_Allreduce ( &(Y1dY2.c[0]), &(result.c[0]),
Y1.tot_ncols * Y2.tot_ncols * SCALAR_SIZE,
MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
#ifdef DFT_PROFILING
timerOff(12); // Turn off Y1^Y2 MPI_Allreduce timer
#endif // DFT_PROFILING
#else // DFT_MPI
// We aren't running MPI, then we don't do any reduction, and
// the final answer is just Y1dY2 and we'll just make result
// point to it.
Matrix &result = Y1dY2;
#endif // DFT_MPI
return result;
}
/*
* Multiply ColumnBundle by a matrix on the right:
* YM = Y*M
*/
ColumnBundle operator*(const ColumnBundle &Y, const Matrix &M)
{
if (Y.tot_ncols != M.nr)
die("In ColumnBundle::operator*, Y.tot_ncols != M.nr\n");
ColumnBundle YM(Y.col_length, M.nc);
/* Call the router function below!! */
do_ColumnBundle_matrix_mult(Y,M,YM,0);
return YM;
}
void
mult(const ColumnBundle Y, const Matrix &M, ColumnBundle &YM)
{
do_ColumnBundle_matrix_mult(Y,M,YM,0);
}
/*
* Does YM = Y*M or YM += Y*M.
*/
void
do_ColumnBundle_matrix_mult(const ColumnBundle &Y,
const Matrix &M,
ColumnBundle &YM,
int accum)
{
#ifdef DFT_PROFILING
timerOn(4); // turn on Y * M timer
counterIncr(5); // Y*M count
#endif // DFT_PROFILING
if (Y.tot_ncols != M.nr)
die("In do_ColumnBundle_matrix_mult(), Y.tot_ncols != M.nr\n");
if (Y.col_length != YM.col_length)
die("In do_ColumnBundle_matrix_mult(), Y.col_length != YM.col_length\n");
if (M.nc != YM.tot_ncols)
die("In do_ColumnBundle_matrix_mult(), M.nc != YM.tot_ncols\n");
// We assume that YM and Y both have valid and congruent column structures
int N = Y.col_length;
int ncols = YM.my_ncols;
int nrows = Y.my_ncols;
#ifdef DFT_PROFILING
// Flop count of Y*M
counterIncr(4,8.0*(double)N*(double)ncols*Y.tot_ncols);
#endif
/* Zero out final result if not in accumulation mode */
if (!accum)
YM.zero_out();
// both YM and Y are distributed
if ( (Y.distributed == 1) && (YM.distributed == 1) ) {
// Use code in dist_multiply.c
do_Y_M_mult_distributed(Y, M, YM, accum);
#ifdef DFT_PROFILING
counterIncr(6); // distributed Y*M count
#endif // DFT_PROFILING
}
// Y and YM are both local.
else if ( (Y.distributed == 0) && (YM.distributed == 0) ) {
Y_M_block_matrix_mult(Y,M,YM,N,nrows,ncols,0,0,0,accum);
}
// Y is local, YM is distributed.
else if ( Y.distributed == 0 ) {
Y_M_block_matrix_mult(Y,M,YM,N,nrows,ncols,0,YM.start_ncol,0,accum);
}
// Y is distributed, YM is local.
else if ( YM.distributed == 0 ) {
#ifdef DFT_MPI
die("YM = Y*M where Y is distributed and YM is local is not supported\n");
/* // If in MPI mode, then we have to create a new ColumnBundle */
/* // to hold the local result before we sum up accros processors. */
/* ColumnBundle Ytemp(YM.tot_ncols, YM.col_length, YM.basis, "local"); */
/* copy_innards_ColumnBundle(&YM, &Ytemp); */
/* // If in accum mode, then the zeroth processor starts with YM */
/* // in its local result and accumulates into it. All other proc. */
/* // and for all other cases start with zero in their local result. */
/* if (accum && System::Get_procID()==0) */
/* Ytemp = YM; */
/* else */
/* Ytemp.zero_out(); */
/* #else */
/* // Non MPI mode... just make Ytemp point to YM and zero it out */
/* // if not in accum mode. */
/* ColumnBundle &Ytemp = YM; */
/* if (!accum) */
/* Ytemp.zero_out(); */
#endif
/* // do the multiply into Ytemp */
/* Y_M_block_matrix_mult(Y,M,Ytemp,N,nrows,ncols,Y.start_ncol,0,0,accum); */
/* #ifdef DFT_MPI */
/* #ifdef DFT_PROFILING */
/* timerOn(13); // Turn on other MPI_Allreduce timer */
/* counterIncr(21); // Number of other reduces */
/* // Bytes reduced */
/* counterIncr(22,sizeof(scalar)*(double)N*(double)ncols); */
/* #endif */
/* // do a global reduction on Ytemp, store in YM; */
/* MPI_Allreduce ( &(Ytemp.col[0].c[0]), &(YM.col[0].c[0]), */
/* N * ncols * SCALAR_SIZE, */
/* MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); */
/* #ifdef DFT_PROFILING */
/* timerOff(13); // Turn off other MPI_Allreduce timer */
/* #endif */
/* #endif // DFT_MPI */
}
#ifdef DFT_PROFILING
timerOff(4); // turn off Y * M timer
#endif // DFT_PROFILING
}
/* randomize: this is basis dependent, so call the
* relevant function on the columns
*/
void
ColumnBundle::randomize()
{
for(int i = 0; i < my_ncols; i++)
col[i]->randomize();
}
// Write ColumnBundle in binary to file fname
//////////////////////////////////////////////////
// Processor identified by MPI_IO has regular //
// IO functionalities. See MPI_Get_Attr for //
// furthur details. For now, assume processor 0 //
// is this processor. //
//////////////////////////////////////////////////
void
ColumnBundle::write(char *fname)
{
FILE *fp;
fp = dft_fopen(fname, "w");
write_stream(fp);
dft_fclose(fp);
}
void
write_ColumnBundle_array(char *fname,int nbundles,ColumnBundle **Y)
{
FILE *fp = NULL;
int i;
fp = dft_fopen(fname,"w");
for (i=0; i < nbundles; i++)
Y[i]->write_stream(fp);
dft_fclose(fp);
}
/* Write ColumnBundle in binary to the already open IO stream...don't
* close it! */
///////////////////////////////
// See comments for write //
// above. //
///////////////////////////////
void
ColumnBundle::write_stream(FILE *fp)
{
#ifdef DFT_MPI
if (System::Get_procID() == System::Get_IOprocID() ) {
// Temporary holding places for MPI.
int nproc;
int remote_ndata;
// here we assume that the IO node has as many columns as the
// node with the most columns
CoeffSpaceWavefunctionData tmp_data(data.ndata);
MPI_Status status;
// CRITICAL: data.col[i].c[j] must be continuous, with j being
// the inner index.
// loop over all MPI processes
for ( nproc = 0; nproc < System::Get_N_Procs(); nproc++ ) {
// if you are at the IO MPI process, just write to file
if ( nproc == System::Get_procID() )
fwrite(data.d, data.datasize, data.ndata, fp);
else {
// Receive the size of other proces' data into remote_ndata.
MPI_Recv(&remote_ndata, 1, MPI_INT, nproc, 0,
MPI_COMM_WORLD, &status);
// Receive the actual data.
if (remote_ndata > 0){
MPI_Recv(tmp_data.d,
remote_ndata*(tmp_data.datasize/sizeof(double)),
MPI_DOUBLE, nproc, 1,
MPI_COMM_WORLD, &status);
fwrite(tmp_data.d,
tmp_data.datasize,
remote_ndata,
fp);
}
}
}
} else {
// Send data to processor io_node for writing.
MPI_Send(&data.ndata, 1, MPI_INT, System::Get_IOprocID(),
0, MPI_COMM_WORLD);
if (data.ndata > 0)
MPI_Send(data.d, data.ndata*(data.datasize/sizeof(double)),
MPI_DOUBLE,
System::Get_IOprocID(), 1, MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD); // just to be safe, synchronize.
#else // DFT_MPI
fwrite(data.d, data.datasize, data.ndata, fp);
#endif // DFT_MPI
}
void
ColumnBundle::read(char *fname)
{
FILE *fp;
fp = dft_fopen(fname, "r");
read_stream(fp);
dft_fclose(fp);
}
/* Read ColumnBundle in binary form from an already open IO stream...
* don't close it! */
void
ColumnBundle::read_stream(FILE *fp)
{
#ifdef DFT_MPI
if (System::Get_procID() == System::Get_IOprocID() ) {
// Temporary holding places for MPI.
int nproc;
int remote_ndata;
CoeffSpaceWavefunctionData tmp_data(data.ndata); // here we assume that the IO node
// has as many columns as the
// node with the most columns
MPI_Status status;
// CRITICAL: data.col[i].c[j] must be continuous, with j being
// the inner index.
// And my_ncols must NOT differ by more than 1 across processes.
for ( nproc = 0; nproc < System::Get_N_Procs(); nproc++ ) {
if ( nproc == System::Get_procID() ) {
fread(data.d,
data.datasize,
data.ndata,
fp);
} else {
MPI_Recv(&remote_ndata, 1, MPI_INT, nproc, 0, MPI_COMM_WORLD, &status);
if (remote_ndata > 0){
fread(tmp_data.d,
tmp_data.datasize,
remote_ndata,
fp);
MPI_Send(tmp_data.d, remote_ndata*(tmp_data.datasize/sizeof(double)),
MPI_DOUBLE, nproc, 1, MPI_COMM_WORLD);
}
}
}
} else {
MPI_Status status;
MPI_Send(&data.ndata, 1, MPI_INT, System::Get_IOprocID(),
0, MPI_COMM_WORLD);
// Receive data from processor io_node.
if (my_ncols > 0)
MPI_Recv(data.d, data.ndata*(data.datasize/sizeof(double)), MPI_DOUBLE,
System::Get_IOprocID(), 1, MPI_COMM_WORLD, &status);
}
MPI_Barrier(MPI_COMM_WORLD); // just to be safe, synchronize.
#else // DFT_MPI
fread(data.d,
data.datasize,
data.ndata,
fp);
#endif // DFT_MPI
}
/* Copies all the internal data of Y1 into Y2: Y1 -> Y2 */
/* void */
/* copy_innards_ColumnBundle(const ColumnBundle *Y1,ColumnBundle *Y2) */
/* { */
/* Y2->tot_ncols=Y1->tot_ncols; */
/* Y2->my_ncols=Y1->my_ncols; */
/* Y2->start_ncol=Y1->start_ncol; */
/* Y2->end_ncol=Y1->end_ncol; */
/* Y2->col_length=Y1->col_length; */
/* Y2->distributed=Y1->distributed; */
/* } */
syntax highlighted by Code2HTML, v. 0.9.1