/*
    DFT++ is a density functional package developed by the research group
    of Professor Tomas Arias

    Copyright 1996-2003 Sohrab Ismail-Beigi

    This file is part of DFT++.

    DFT++ is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    DFT++ is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with DFT++; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Please see the file CREDITS for a list of authors.

    For academic users, we request that publications using results obtained with
    this software reference

    "New algebraic formulation of density functional calculation," by Sohrab Ismail-Beigi
    and T.A. Arias, Computer Physics Communications 128:1-2, 1-45 (June 2000).

    and, if using the wavelet basis, further reference

    "Multiresolution analysis of electronic structure: semicardinal and wavelet bases,"
    T.A. Arias, Reviews of Modern Physics 71:1, 267-311 (January 1999).

    and 

    "Robust ab initio calculation of condensed matter: transparent convergence through
    semicardinal multiresolution analysis,'' I.P. Daykov, T.A. Arias, and
    Torkel D. Engeness, Physical Review Letters, 90:21, 216402 (May 2003).

    For your convenience, preprints of the above articles may be obtained from
    http://arXiv.org/abs/cond-mat/9909130, 9805262, and 0204411, respectively.
*/

#include "header.h"

/*********************
 * copy constructor  *
 *********************/
ColumnBundle::ColumnBundle(const ColumnBundle &C)
{
  // copy the structure
  C.copy_structure_to(*this);

  // copy the data!
  data = C.data;
}



/***********************
 * regular constructor *
 ***********************/
ColumnBundle::ColumnBundle(int length, int ncol, Basis *basis)
{
  tot_ncols = ncol;
  my_ncols = ncol;
  start_ncol = 0;
  end_ncol = ncol-1;
  col_length = length;

  init(tot_ncols, basis, "distributed");
}


ColumnBundle::ColumnBundle(int ncols, Basis *basis, const char *distr)
{
  init(ncols, basis, distr);
}



/////////////////////////////////////////////
// Destructor: frees the memory used  //
/////////////////////////////////////////////
ColumnBundle::~ColumnBundle()
{
  
  destroy_bands(my_ncols,col);
  
  myfree(col);

  data.free();
}


/*****************************************************************************
 * Here, we take an existing  ColumnBundle, and make a new one using         * 
 * it as a template. This entails creating our own Column structure, then    *
 * calling init() to actually allocate the memory chunk (and point the Column*
 * datapointers to the right place).                                         *
 *****************************************************************************/
void
ColumnBundle::copy_structure_to(ColumnBundle &C) const
{
  C.tot_ncols = tot_ncols;
  C.my_ncols = my_ncols;
  C.start_ncol = start_ncol;
  C.end_ncol = end_ncol;
  C.col_length = col_length;
  C.distributed = distributed;

//  dft_log("&(col[0]->basis->nbasis) = %d, col[0]->basis->nbasis= %d\n",
//	  &(col[0]->basis->nbasis), col[0]->basis->nbasis);

  C.init(tot_ncols, col[0]->basis, C.distributed?"distributed":"local");


}

/*****************************************************************************
 * This creates a ColumnBundle from scratch, using the number of
 * columns, * the quantum number, the basis spec. "str" tells us if we
 * are "distributed"* or not.  *
 *****************************************************************************/


void
ColumnBundle::init(int ncols, Basis *basis, const char *str)
{
  tot_ncols = ncols;
  
  // let's work out how to distribute ourselves

  distributed = FALSE;  
#ifdef DFT_MPI
  if(!strcmp(str, "distributed"))
    distributed = TRUE;
#endif

  if(distributed)
    distribute_cols();
  else{
    my_ncols = tot_ncols;
    start_ncol = 0;
    end_ncol = tot_ncols-1;
  }
  
  // allocate our array of columns
  col  = (CoeffSpaceWavefunctionColumn **)mymalloc(my_ncols*sizeof(CoeffSpaceWavefunctionColumn*), "ColumnBundle::init(int ncols, Basis &basis, const char *str)",
						   "col");


  // actually create our column structure
  create_bands(my_ncols, col, basis);

  // currently, lets assume each columns is the same length.
  // if not, then major changes will have to be made to ^ and a bunch
  // of other things as well
  // the length of the column is a basis dependent thing,
  // so we can only set it after the columns have been created.
  // here we set the bundle col_length equal to these
  if(basis != NULL)
    // ipd: 08/13/02, sometimes with MPI we have mpi_processes
    // that hold bundles with no columns. so we can't dereference
    // col. Let's take the length from the basis, so that it is
    // properly set, even if there are no columns
    // (we need this so that many consistency checks keep working)
    // col_length = col[0]->length;
    col_length = basis->nbasis;

  // initialize us (get memory, set datapointers inside the columns 

  allocate_memory();
}

/*
 * Allocate/free an array of column bundles according to the ones in
 * bloch states:  used in the minimization algorithm */
ColumnBundle **alloc_ColumnBundle_array(int nbundles,BlochState *states)
{
  ColumnBundle **CB = (ColumnBundle **)mymalloc(nbundles*sizeof(ColumnBundle*),
                                                "alloc_ColumnBundle_aray()",
                                                "CB");
  int i;
  for(i=0; i<nbundles; i++){
      CB[i]= new ColumnBundle(states[i].C);
      //CB[i]=new Column
  }
  
  return CB;
}

//VERIFY how to call the destructors
void free_ColumnBundle_array(int nbundles,ColumnBundle **Y)
{
  int i;
  for(i=0; i<nbundles; i++)
    delete Y[i]; // cal ~{} on each bundle
  delete []Y; //destroy the pointers
}

void
ColumnBundle::negate()
{
  data.negate();
}

void
create_bands(int ncols, CoeffSpaceWavefunctionColumn **cols,
             Basis *b)
{
  // may be there is nothing to do
  if (ncols==0) return;
  
  int i;
  for(i = 0; i < ncols; i++){
    cols[i] = new CoeffSpaceWavefunctionColumn(b, "slim" );
    if(b != NULL)
      cols[i]->init_wavefunction(*b);
  }
    
}


void
destroy_bands(int ncols, CoeffSpaceWavefunctionColumn **cols)
{
  int i;
  for(i = 0; i < ncols; i++)
    delete cols[i];
}


/*************************************************************************
 * allocate_memory() assumes that there is a valid Column structure in   *
 * place and it allocates memory for our data. then it points the        *
 * datapointers inside our columns to the appropriate place in the       *
 * bigass chunk                                                          *
 *************************************************************************/
void
ColumnBundle::allocate_memory()
{
  if (my_ncols==0) return;
  
  data.init(my_ncols*col_length);

  for(int i=0; i < my_ncols; i++){
    col[i]->length = col_length;
    col[i]->data.ndata = col_length;
    col[i]->data.d = data.d+i*col_length;
  }
}



void
ColumnBundle::get_scratch_memory()
{
  data.get_temp(my_ncols*col_length);

  for(int i=0; i < my_ncols; i++){
    col[i]->data.ndata = col_length;
    col[i]->data.d = data.d+i*col_length;
  }
}

void
ColumnBundle::free_scratch_memory()
{
  data.release_temp();  
}



/////////////////////////////////////////////
// Generate my_ncols, start_ncol, end_ncol //
// from tot_ncols for a distributed        //
// ColumnBundle case                       //
// This must also work for the single node //
// case where it just distributed all      //
// columns to a single node (i.e. serial   //
// case).                                  //
// The distribution is as follows:         //
//    d = tot_ncols/N_Procs                //
//    r = tot_ncols%N_Procs                //
// the first r processes get d+1 columns   //
// and the rest get d columns              //
// and the order of distribution is just   //
// linear in the columns, i.e. columns     //
// 0..d go to process 0, columns d+1,2*d+1 //
// go to process 1, etc.                   //
/////////////////////////////////////////////

void
ColumnBundle::distribute_cols()
{
  int d,r;

  d = tot_ncols / System::Get_N_Procs();
  r = tot_ncols % System::Get_N_Procs();

  // If the process is one of the first r, then it gets d+1 columns
  if (System::Get_procID() < r) {
    my_ncols = d+1;
    start_ncol = (d+1)*System::Get_procID();
  }
  // Otherwise, it gets d columns
  else  {
    my_ncols = d;
    start_ncol = (d+1)*r + d*(System::Get_procID()-r);
  }

  // the last column owned by the process (inclusive)
  end_ncol = start_ncol + my_ncols - 1;
}

/////////////////////////////////////////////
// Get Processor ID from column number     //
// Namely, given a column_number, find the //
// processor number of its owner based on  //
// above distribution                      //
/////////////////////////////////////////////
int
ColumnBundle::ncol_to_ID(int col_index) const
{
  int d,r,id;

  d = tot_ncols / System::Get_N_Procs();
  r = tot_ncols % System::Get_N_Procs();

  // If it's one of the columns belonging to the first r processes
  // which have d+1 columns each...
  if (col_index < r*(d+1))
    id = col_index / (d+1);
  // otherwise, it belongs to the set of processes with d columns
  else
    id = (col_index - r) / d;

  // In the unlikely but bizzare case...
  if (id >= System::Get_N_Procs() || id < 0)
    die("In ColumnBundle::ncol_to_ID, id >= N_Procs or id < 0\n");

  return id;
}







//////////////////////////////////////
//                                  //
// Operator functions               //
//                                  //
//////////////////////////////////////

/* Assignment:  nonstandard in that it returns void.  To make it standard,
 * replace void -> ColumnBundle and uncomment the return *this; */
void
ColumnBundle::operator=(const ColumnBundle &Y)
{

  /* The sizes must agree */
  if (tot_ncols != Y.tot_ncols)
    die("In ColumnBundle::operator=, tot_ncols != Y.tot_ncols\n");
  if (my_ncols != Y.my_ncols)
    die("In ColumnBundle::operator=, my_ncols != Y.my_ncols\n");
  if (col_length != Y.col_length)
    die("In ColumnBundle::operator=, col_length != Y.col_length\n");
  if (distributed != Y.distributed)
    die("In ColumnBundle::operator=, distributed != Y.distributed\n");

  data = Y.data;

}


void
ColumnBundle::zero_out(void)
{
  data.zero_out();
}


/************************************************
 *  IPD: Linear Algebra - call the array algebra *
 ***********************************************/



void
ColumnBundle::operator+=(const ColumnBundle &in)
{
    // check the quantum numbers
    //...............
    //...............
    
    data += in.data;
}


void
ColumnBundle::operator-=(const ColumnBundle &in)
{
    data -= in.data;
}




void
ColumnBundle::operator*=(const double &s)
{
    data *= s;
}

void
ColumnBundle::operator*=(const complex &s)
{
    data *= s;
}



void
ColumnBundle::operator/=(const double &s)
{
    data*=(1./s);
}



void
ColumnBundle::operator/=(const complex &s)
{
        data*= (1./s);
}

/**** Binary operators - call the unary operators ********/
ColumnBundle operator+(const ColumnBundle &a, const ColumnBundle &b)
{
    ColumnBundle out(a);
    out+=b;
    return out;
}


ColumnBundle operator-(const ColumnBundle &a, const ColumnBundle &b)
{
    ColumnBundle out(a);
    out -= b;

    return out;
}



ColumnBundle operator*(const double &d, const ColumnBundle &b)
{
    ColumnBundle out(b);

    out *= d;
    return out;
}



ColumnBundle operator*(const ColumnBundle &b, const double &d)
{
    ColumnBundle out(b);
 
    out *= d;

    return out;
}



ColumnBundle operator*(const complex &d, const ColumnBundle &b)
{
    ColumnBundle out(b);
    out *= d;
    return out;
}



ColumnBundle operator*(const ColumnBundle &b, const complex &d)
{
    ColumnBundle out(b);

    out *= d;
    return out;
}


ColumnBundle operator/(const ColumnBundle &b, const complex &c)
{
    ColumnBundle out(b);
    
    out /= c;
    
    return out;
}



/* Does Yout += s*Yin */
void scale_accumulate(real r,const ColumnBundle &Yin,ColumnBundle &Yout)
{
  scale_accumulate(r, Yin.data, Yout.data);
}

/* Does Yout[k] += s * Yin[k] */
void scale_accumulate(int nbundles,real r,ColumnBundle **Yin,
                      ColumnBundle **Yout)
{
  for (int k=0; k < nbundles; k++)
    scale_accumulate(r,*Yin[k],*Yout[k]);
}

/* Does Yout = s1*Y1 + s2*Y2 */
void
scaled_sum(real r1,ColumnBundle &Y1,
           real r2,ColumnBundle &Y2,
           ColumnBundle &Yout)
{
  if (Y1.my_ncols != Y2.my_ncols || Y1.my_ncols != Yout.my_ncols ||
      Y1.col_length != Y2.col_length || Y1.col_length != Yout.col_length)
    die("Incomaptible sizes in scaled_sum(s1,Y1,s2,Y2,Yout)");

  scaled_sum(r1, Y1.data, r2, Y2.data, Yout.data);
}

/* Does Yout[i] = s1*Y1[i] + s2*Y2[i] */
void
scaled_sum(int nbundles,
           real r1,ColumnBundle **Y1,
           real r2,ColumnBundle **Y2,
           ColumnBundle **Yout)
{
  for (int i=0; i < nbundles; i++)
    scaled_sum(r1,*Y1[i],r2,*Y2[i],*Yout[i]);
}


/* Sum of absolute squares of all elements in Y */
real
abs2(const ColumnBundle &Y)
{
  real Y2;
  int i,j;

  Y2 = 0.0;
  for (i=0; i < Y.my_ncols; i++)
    for (j=0; j < Y.col_length; j++){
#if defined WFS_COMPLEX
      Y2 += Y.col[i]->data.d[j].x * Y.col[i]->data.d[j].x +
        Y.col[i]->data.d[j].y * Y.col[i]->data.d[j].y;
#elif defined WFS_REAL
      Y2 += Y.col[i]->data.d[j] * Y.col[i]->data.d[j];
#else
#error wavefnctions are neither real nor complex!
#endif
    }

  // If the object is distributed, do a global
  // summation across the processors.
  if (Y.distributed)
    {
#ifdef DFT_MPI

#ifdef DFT_PROFILING
      timerOn(13);  // Turn on other MPI_Allreduce timer
      counterIncr(21); // other reduces counter
      counterIncr(22,sizeof(double)); // Bytes reduced
#endif // DFT_PROFILING

      real temp = 0.0;
      MPI_Allreduce ( &Y2, &temp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
      Y2 = temp;

#ifdef DFT_PROFILING
      timerOff(13);  // Turn off other MPI_Allreduce timer
#endif // DFT_PROFILING

#endif // DFT_MPI
    }

  return Y2;
}

/* Sum of absolute squares of all elements in array of ColumnBundles Y:
 * i.e. it is just the sum of abs2() on all nbundle elements of Y */
real
abs2(int nbundles,ColumnBundle **Y)
{
  int i;
  real Y2;

  Y2 = 0.0;
  for (i=0; i < nbundles; i++)
    Y2 += abs2(*Y[i]);
  return Y2;
}


/* Take "dot-product" of two ColumnBundles:  REsum the diagonals of Y1^Y2 */
real dot(const ColumnBundle &Y1,const ColumnBundle &Y2)
{
  if (Y1.tot_ncols != Y2.tot_ncols)
    die("Y1.tot_ncols != Y2.tot_ncols in dot_ColumnBundles\n");
  if (Y1.my_ncols != Y2.my_ncols) {
    dft_log(DFT_SILENCE,
            "Different distribution of the same size arrays!!\n");
    die("Y.my_ncols != Y2.my_ncols in dot_ColumnBundles\n");
  }
  if (Y1.col_length != Y2.col_length)
    die("Y1.col_length != Y2.col_length in dot_ColumnBundles\n");
  if (Y1.distributed != Y2.distributed)
    die("Y1, Y2 are not the same distributed type\n");
  
  int j;
  int i;
  real d;


  d = 0.0;
  for (i=0; i < Y1.my_ncols; i++)
    for (j=0; j < Y1.col_length; j++){
	/* do d += RE(conj(Y1.col[i].c[j])*Y2.col[i].c[j];) */
#ifdef WFS_COMPLEX
      d += Y1.col[i]->data.d[j].x*Y2.col[i]->data.d[j].x + 
        Y1.col[i]->data.d[j].y*Y2.col[i]->data.d[j].y ;
#elif defined WFS_REAL
      d += Y1.col[i].data.d[j]*Y2.col[i].data.d[j];
#else
#error wavefunctions are neither real nor complex!
#endif
    }

  // If the object is distributed, then do a global summation
  if ( Y1.distributed ){
#ifdef DFT_MPI
      real temp = 0.0;

#ifdef DFT_PROFILING
      timerOn(13);  // Turn on other MPI_Allreduce timer
      counterIncr(21,1); // reduce count
      counterIncr(22,sizeof(real)); // byte count
#endif // DFT_PROFILING

      MPI_Allreduce ( &d, &temp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );

      d = temp;

#ifdef DFT_PROFILING
      timerOff(13);  // Turn off other MPI_Allreduce timer
#endif // DFT_PROFILING

#endif // DFT_MPI
  }
  
  return d;

}

/* Take "dot-product" of nbundles pairs of ColumnBundles: i.e.
 * do dot_ColumnBundles does, but just loop over the pairs. */
real dot(int nbundles,ColumnBundle **Y1,ColumnBundle **Y2)
{
  int i;
  real d;

  d = 0.0;
  for (i=0; i < nbundles; i++)
    d += dot(*Y1[i],*Y2[i]);

  return d;

}


Matrix operator^(const ColumnBundle &Y1,const ColumnBundle &Y2)
{
  // Y1 is Nxn1, Y2 is Nxn2, and Y1dY2 is n1xn2.
  int n1, n2, N;
  n1 = Y1.my_ncols;
  n2 = Y2.my_ncols;
  N  = Y1.col_length;

  if (Y1.col_length != Y2.col_length)
      die("In operator ^ on ColumnBundles, Y1.col_length != Y2.col_length\n");

#ifdef DFT_PROFILING
  timerOn(3); // Y1^Y2  timer
  counterIncr(1);  // Y1^Y2 count
  // Y1^Y2 flop count
  counterIncr(0, 8.0*(double)n1*(double)Y2.tot_ncols*(double)N);
#endif // DFT_PROFILING

  Matrix Y1dY2(Y1.tot_ncols,Y2.tot_ncols);
  
  // zero out Y1dY2
  Y1dY2.zero_out();

#ifdef DFT_MPI // with MPI, Y1 or Y2  might be distributed 

  // both Y1 and Y2 are distribyted
  if ( (Y1.distributed == 1) && (Y2.distributed == 1) ) {
    // use the code in dist_multiply.c by Ken Esler.
    do_Y1dag_Y2_distributed(Y1, Y2, Y1dY2);
#ifdef DFT_PROFILING
    timerOff(3); // turn off  Y1^Y2  timer
    counterIncr(2); // Distributed Y1^Y2 counter
#endif // DFT_PROFILING
    
    // done
    return Y1dY2;
  }
  // Y1 is local, Y2 is distributed
  else if ( Y1.distributed == 0 ) {
    // At the end, this needs a global reduction (see below).
    // do the multiply with proper offset
    Y1dagY2_block_matrix_mult(Y1,Y2,Y1dY2,n1,n2,N,0,Y2.start_ncol,0,0);
  }
  
  // Y1 is distributed, Y2 is local
  else if ( Y2.distributed == 0 ) {
    // At the end, this needs a global reduction (see below).
    // do the multiply with proper offset
    Y1dagY2_block_matrix_mult(Y1,Y2,Y1dY2,n1,n2,N,Y1.start_ncol,0,0,0);
  }
#endif //DFT_MPI

  // Y1 and Y2 are both local.
  if ( (Y1.distributed == 0) && (Y2.distributed == 0) ) {
    // do the multiply
    Y1dagY2_block_matrix_mult(Y1,Y2,Y1dY2,n1,n2,N,0,0,0,0);
    
#ifdef DFT_PROFILING
    timerOff(3); // turn off  Y1dag Y2  timer
#endif // DFT_PROFILING
    
    // both are local, so we've got the final answer!  We're done.
    return Y1dY2;
  }



#ifdef DFT_MPI
  // If we are running MPI and have distributed ColumnBundles,
  // then we must do a global reduction of all the processors's
  // Y1dY2 matrices into result to get the final answer.
  Matrix result(Y1.tot_ncols, Y2.tot_ncols);

  // zero out where the final answer is going
  result.zero_out();

  // do the global sum over all processors
#ifdef DFT_PROFILING
  timerOn(12);  // Turn on Y1^Y2 MPI_Allreduce timer
  // Number of bytes reduced
  counterIncr(3,sizeof(scalar)*(double)Y1.tot_ncols*(double)Y2.tot_ncols);
#endif

  MPI_Allreduce ( &(Y1dY2.c[0]), &(result.c[0]), 
                 Y1.tot_ncols * Y2.tot_ncols * SCALAR_SIZE, 
                 MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );

#ifdef DFT_PROFILING
  timerOff(12);  // Turn off Y1^Y2 MPI_Allreduce timer
#endif // DFT_PROFILING

#else  // DFT_MPI
  // We aren't running MPI, then we don't do any reduction, and
  //  the final answer is just Y1dY2 and we'll just make result 
  // point to it.
  Matrix &result = Y1dY2;
#endif // DFT_MPI

  return result;
}



/*
 * Multiply ColumnBundle by a matrix on the right:
 * YM = Y*M
 */

ColumnBundle operator*(const ColumnBundle &Y, const Matrix &M)
{
  if (Y.tot_ncols != M.nr)
    die("In ColumnBundle::operator*, Y.tot_ncols != M.nr\n");

  ColumnBundle YM(Y.col_length, M.nc);
  
  /* Call the router function below!! */
  do_ColumnBundle_matrix_mult(Y,M,YM,0);

  return YM;
}

void
mult(const ColumnBundle Y, const Matrix &M, ColumnBundle &YM)
{
  do_ColumnBundle_matrix_mult(Y,M,YM,0);
}


/*
 * Does YM = Y*M or YM += Y*M.
 */
void
do_ColumnBundle_matrix_mult(const ColumnBundle &Y,
                            const Matrix &M,
                            ColumnBundle &YM,
                            int accum)
{

#ifdef DFT_PROFILING
  timerOn(4); // turn on  Y * M  timer
  counterIncr(5); // Y*M count
#endif // DFT_PROFILING

  if (Y.tot_ncols != M.nr)
    die("In do_ColumnBundle_matrix_mult(), Y.tot_ncols != M.nr\n");
  if (Y.col_length != YM.col_length)
    die("In do_ColumnBundle_matrix_mult(), Y.col_length != YM.col_length\n");
  if (M.nc != YM.tot_ncols)
    die("In do_ColumnBundle_matrix_mult(), M.nc != YM.tot_ncols\n");

  // We assume that YM and Y both have valid and congruent column structures

  int N     = Y.col_length;
  int ncols = YM.my_ncols;
  int nrows = Y.my_ncols;

#ifdef DFT_PROFILING
  // Flop count of Y*M
  counterIncr(4,8.0*(double)N*(double)ncols*Y.tot_ncols);
#endif

  /* Zero out final result if not in accumulation mode */
  if (!accum)
    YM.zero_out();

  // both YM and Y are distributed
  if ( (Y.distributed == 1) && (YM.distributed == 1) ) {
    // Use code in dist_multiply.c
    do_Y_M_mult_distributed(Y, M, YM, accum);
    
#ifdef DFT_PROFILING
    counterIncr(6); // distributed Y*M count
#endif // DFT_PROFILING
  }
  
  // Y and YM are both local.
  else if ( (Y.distributed == 0) && (YM.distributed == 0) ) {   
    Y_M_block_matrix_mult(Y,M,YM,N,nrows,ncols,0,0,0,accum);
  }
  
  // Y is local, YM is distributed.
  else if ( Y.distributed == 0 ) {
    Y_M_block_matrix_mult(Y,M,YM,N,nrows,ncols,0,YM.start_ncol,0,accum);
  }
  
  // Y is distributed, YM is local.
  else if ( YM.distributed == 0 ) {
    
#ifdef DFT_MPI
    die("YM = Y*M where Y is distributed and YM is local is not supported\n");
/*        // If in MPI mode, then we have to create a new ColumnBundle */
/*        // to hold the local result before we sum up accros processors. */
/*        ColumnBundle Ytemp(YM.tot_ncols, YM.col_length, YM.basis, "local"); */
/*        copy_innards_ColumnBundle(&YM, &Ytemp); */
/*        // If in accum mode, then the zeroth processor starts with YM */
/*        // in its local result and accumulates into it.  All other proc. */
/*        // and for all other cases start with zero in their local result. */
/*        if (accum && System::Get_procID()==0) */
/*  	Ytemp = YM; */
/*        else */
/*  	Ytemp.zero_out(); */
/*  #else */
/*        // Non MPI mode... just make Ytemp point to YM and zero it out */
/*        // if not in accum mode. */
/*        ColumnBundle &Ytemp = YM; */
/*        if (!accum) */
/*  	Ytemp.zero_out(); */
#endif

/*        // do the multiply into Ytemp */
/*        Y_M_block_matrix_mult(Y,M,Ytemp,N,nrows,ncols,Y.start_ncol,0,0,accum); */

/*  #ifdef DFT_MPI */

/*  #ifdef DFT_PROFILING */
/*        timerOn(13);  // Turn on other MPI_Allreduce timer */
/*        counterIncr(21); // Number of other reduces */
/*        // Bytes reduced */
/*        counterIncr(22,sizeof(scalar)*(double)N*(double)ncols); */
/*  #endif */

/*        // do a global reduction on Ytemp, store in YM; */
/*        MPI_Allreduce ( &(Ytemp.col[0].c[0]), &(YM.col[0].c[0]),  */
/*                       N * ncols * SCALAR_SIZE,  */
/*                       MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); */
/*  #ifdef DFT_PROFILING */
/*        timerOff(13);  // Turn off other MPI_Allreduce timer */
/*  #endif */

/*  #endif // DFT_MPI */

    }

#ifdef DFT_PROFILING
  timerOff(4); // turn off  Y * M  timer
#endif // DFT_PROFILING
}



/* randomize: this is basis dependent, so call the 
 * relevant function on the columns
 */
void
ColumnBundle::randomize()
{
  for(int i = 0; i < my_ncols; i++)
    col[i]->randomize(); 
}



// Write ColumnBundle in binary to file fname
//////////////////////////////////////////////////
// Processor identified by MPI_IO has regular   //
// IO functionalities. See MPI_Get_Attr for     //
// furthur details. For now, assume processor 0 //
// is this processor.                           //
//////////////////////////////////////////////////
void
ColumnBundle::write(char *fname)
{
  FILE *fp;

  fp = dft_fopen(fname, "w");
  write_stream(fp);
  dft_fclose(fp);
}


void
write_ColumnBundle_array(char *fname,int nbundles,ColumnBundle **Y)
{
  FILE *fp = NULL;
  int i;

  fp = dft_fopen(fname,"w");
  for (i=0; i < nbundles; i++)
    Y[i]->write_stream(fp);
  dft_fclose(fp);
}

/* Write ColumnBundle in binary to the already open IO stream...don't
 * close it! */
///////////////////////////////
// See comments for write    //
// above.                    //
///////////////////////////////
void
ColumnBundle::write_stream(FILE *fp)
{

#ifdef DFT_MPI

  if (System::Get_procID() == System::Get_IOprocID() ) {
    // Temporary holding places for MPI.
    int nproc;
    int remote_ndata;
    // here we assume that the IO node  has as many columns as the
    // node with the most columns
    CoeffSpaceWavefunctionData tmp_data(data.ndata);
    
    MPI_Status status;
    // CRITICAL: data.col[i].c[j] must be continuous, with j being
    // the inner index.

    // loop over all MPI processes
    for ( nproc = 0; nproc < System::Get_N_Procs(); nproc++ ) {
      // if you are at the IO MPI process, just write to file
      if ( nproc == System::Get_procID() ) 
        fwrite(data.d, data.datasize, data.ndata, fp);
      else {  
        // Receive the size of other proces' data  into remote_ndata.
        MPI_Recv(&remote_ndata, 1, MPI_INT, nproc, 0,
                 MPI_COMM_WORLD, &status);
        
        // Receive the actual data.
        if (remote_ndata > 0){
          MPI_Recv(tmp_data.d,
                   remote_ndata*(tmp_data.datasize/sizeof(double)),
                   MPI_DOUBLE, nproc, 1,
                   MPI_COMM_WORLD, &status);
          
          fwrite(tmp_data.d,
                 tmp_data.datasize,
                 remote_ndata,
                 fp);
        }
      }
    }
  } else {   
    // Send data to processor io_node for writing.
    MPI_Send(&data.ndata, 1, MPI_INT, System::Get_IOprocID(),
             0, MPI_COMM_WORLD);
    
    if (data.ndata > 0)
      MPI_Send(data.d, data.ndata*(data.datasize/sizeof(double)),
               MPI_DOUBLE, 
               System::Get_IOprocID(), 1, MPI_COMM_WORLD);
  }
  
  MPI_Barrier(MPI_COMM_WORLD);  // just to be safe, synchronize.
  
  
#else // DFT_MPI
  
  fwrite(data.d, data.datasize, data.ndata, fp);
  
#endif // DFT_MPI
}

void
ColumnBundle::read(char *fname)
{
  FILE *fp;

  fp = dft_fopen(fname, "r");
  read_stream(fp);
  dft_fclose(fp);

}

/* Read ColumnBundle in binary form from an already open IO stream...
 * don't close it! */
void
ColumnBundle::read_stream(FILE *fp)
{
#ifdef DFT_MPI

  if (System::Get_procID() == System::Get_IOprocID() ) {

    // Temporary holding places for MPI.
    int nproc;
    int remote_ndata;
    CoeffSpaceWavefunctionData tmp_data(data.ndata); // here we assume that the IO node
                                 // has as many columns as the
                                 // node with the most columns

    MPI_Status status;
    // CRITICAL: data.col[i].c[j] must be continuous, with j being
    // the inner index.
    // And my_ncols must NOT differ by more than 1 across processes.
    for ( nproc = 0; nproc < System::Get_N_Procs(); nproc++ ) {
      if ( nproc == System::Get_procID() ) {
	fread(data.d,
	      data.datasize,
	      data.ndata,
	      fp);
	
      } else {
	MPI_Recv(&remote_ndata, 1, MPI_INT, nproc, 0, MPI_COMM_WORLD, &status);
	
	if (remote_ndata > 0){	
	  fread(tmp_data.d,
		tmp_data.datasize,
		remote_ndata,
		fp);
	  MPI_Send(tmp_data.d, remote_ndata*(tmp_data.datasize/sizeof(double)),
		   MPI_DOUBLE, nproc, 1, MPI_COMM_WORLD);
	}
      }
    }
  } else {
    
    MPI_Status status;
    
    MPI_Send(&data.ndata, 1, MPI_INT, System::Get_IOprocID(),
	     0, MPI_COMM_WORLD);
    
    // Receive data from processor io_node.
    if (my_ncols > 0) 
      MPI_Recv(data.d, data.ndata*(data.datasize/sizeof(double)), MPI_DOUBLE,
	       System::Get_IOprocID(), 1, MPI_COMM_WORLD, &status);
  }
  
  MPI_Barrier(MPI_COMM_WORLD);  // just to be safe, synchronize.
  
#else // DFT_MPI
  
  fread(data.d,
	data.datasize,
	data.ndata,
	fp);

#endif // DFT_MPI
}

/* Copies all the internal data of Y1 into Y2:  Y1 -> Y2 */
/*  void  */
/*  copy_innards_ColumnBundle(const ColumnBundle *Y1,ColumnBundle *Y2) */
/*  { */
/*    Y2->tot_ncols=Y1->tot_ncols; */
/*    Y2->my_ncols=Y1->my_ncols; */
/*    Y2->start_ncol=Y1->start_ncol;  */
/*    Y2->end_ncol=Y1->end_ncol; */
/*    Y2->col_length=Y1->col_length; */
/*    Y2->distributed=Y1->distributed; */
/*  } */