/*
    DFT++ is a density functional package developed by the research group
    of Professor Tomas Arias

    Copyright 1996-2003 Sohrab Ismail-Beigi

    This file is part of DFT++.

    DFT++ is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    DFT++ is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with DFT++; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Please see the file CREDITS for a list of authors.

    For academic users, we request that publications using results obtained with
    this software reference

    "New algebraic formulation of density functional calculation," by Sohrab Ismail-Beigi
    and T.A. Arias, Computer Physics Communications 128:1-2, 1-45 (June 2000).

    and, if using the wavelet basis, further reference

    "Multiresolution analysis of electronic structure: semicardinal and wavelet bases,"
    T.A. Arias, Reviews of Modern Physics 71:1, 267-311 (January 1999).

    and 

    "Robust ab initio calculation of condensed matter: transparent convergence through
    semicardinal multiresolution analysis,'' I.P. Daykov, T.A. Arias, and
    Torkel D. Engeness, Physical Review Letters, 90:21, 216402 (May 2003).

    For your convenience, preprints of the above articles may be obtained from
    http://arXiv.org/abs/cond-mat/9909130, 9805262, and 0204411, respectively.
*/

/* $Id: conv3d.cpp,v 1.1.2.8 2003/05/29 18:54:15 ivan Exp $ */
/* 
 * $Log: conv3d.cpp,v $
 * Revision 1.1.2.8  2003/05/29 18:54:15  ivan
 * Update the PRL reference
 *
 * Revision 1.1.2.7  2003/05/23 01:49:00  sahak
 * added the new header
 *
 * Revision 1.1.2.6  2003/04/03 01:50:48  ivan
 * minor cosmetic fixes to compile on balboa (Alpha) and fiber (SGI)
 * with the native compilers
 *
 * Revision 1.1.2.5  2003/04/02 23:35:13  ivan
 * Removing C-dependencies from the code. Now it needs only C++ compiler
 *
 * Revision 1.1.2.4  2002/10/28 22:07:29  matthew
 * Non-orthogonal Laplaciancvs add coeff_d.c!  Works for orthorhombic cells, still in need of debugging for other unit cells.
 *
 * Revision 1.2  2001/08/21 09:05:35  daykov
 * Overlapping Grids: Idag working
 *    (still can do speed optimization and currently the deepest certical segment
 *     in the grid tree must be the first in order to visit everybody in the
 *     transforms - actually Idag)
 *
 * Revision 1.1.1.1  2001/04/18 17:15:13  daykov
 * starting project
 *
 * Revision 1.1  2000/07/11 01:05:04  daykov
 * Initial revision
 *
 * Revision 1.1  1999/09/14 19:39:52  torkel
 * Initial revision
 *
 * Revision 1.14  1999/08/06 19:25:03  torkel
 * *** empty log message ***
 *
 * Revision 1.13  1999/07/04 20:48:31  torkel
 * Full O implemented.
 *
 * Revision 1.12  1999/07/02 07:03:54  torkel
 * the convolution in Osame seems to work for this version.
 *
 * Revision 1.11  1999/07/02 03:12:07  torkel
 * In the middle of making Osame work
 *
 * Revision 1.10  1999/06/23 22:19:26  torkel
 * revised version where Obelow is dagger of Oabove for grid without
 * siblings.
 *
 * Revision 1.9  1999/06/15 23:24:24  torkel
 * debugging, trying to make Obelow work
 *
 * Revision 1.8  1999/06/11 12:52:48  torkel
 * including convolutions for obelow
 *
 * Revision 1.5  1999/04/25 22:32:56  muchomas
 * *** empty log message ***
 *
 * Revision 1.4  1999/04/25 18:35:39  muchomas
 * Contains new routines for same-level operator convolutions
 * Working toward more arbitrary length convolution routines
 *
 * Revision 1.3  1999/04/20 15:59:37  muchomas
 * Fully documented code
 *
 * Revision 1.2  1999/04/19 18:58:39  muchomas
 * Consistent set of calling conventions for aconv3d routines
 *
 * Revision 1.1  1999/04/19 18:12:11  muchomas
 * Initial revision
 *
 * Revision 1.13  1999/04/17 22:00:09  muchomas
 * Working dagger operators, still slow...
 *
 * Revision 1.12  1999/04/14 19:48:37  muchomas
 * Working transpose transform, but doesn't take input/output arrays -- yet!
 *
 * Revision 1.11  1999/04/13 15:11:36  muchomas
 * Does ghosts, making compact version, but still in larger array
 *
 * Revision 1.10  1999/04/13 14:32:53  muchomas
 * Idag that computes only only ghosts, leaving sparse data distribution
 *
 * Revision 1.9  1999/04/11 20:21:47  muchomas
 * Repackaged calling sequence for upper/lower forward/inverse transform package
 *
 * Revision 1.8  1999/04/11 20:03:03  muchomas
 * Workspace dimensions now determined by aconvP3d
 *
 * Revision 1.7  1999/04/11 19:53:46  muchomas
 * Forward/Inverse package working
 *
 * Revision 1.6  1999/04/11 03:06:31  muchomas
 * Forward transform with proper indexing into "upper" array
 *
 * Revision 1.5  1999/04/10 23:01:44  muchomas
 * Thinks of points and sets of points, rather than lines and planes
 *
 * Revision 1.4  1999/04/10 22:38:57  muchomas
 * Now using log functionality
 *
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "convsets.h"


/* High speed, 3d convolution routines */

/*
  Back and forth, linear convolution for +/- 3 filters using padding (IN PLACE)
  out: input/output array for in place computation
  n: size of actual data set being convolved
  npad: size of padded region needed for "leakage" of foward filters
        over the edge
  reversefilter: reverse-time filter coefficients	
  forwardfilter: forward-time filter coefficients
*/
void 
convpts3(double inout[],int n,int npad,double reversefilter[],
	 double forwardfilter[])
{
  register int i;

  /* First, make sure pad is zero */
  for (i=n; i<npad; i++) inout[i]=0.;

  /* Reverse part */
  {
      register double t0=reversefilter[0],t1=reversefilter[1],
                      t2=reversefilter[2],t3=reversefilter[3]; 
      for (i=npad-1; i>=3; i--)
          inout[i]=t0*inout[i]+t1*inout[i-1]+t2*inout[i-2]+t3*inout[i-3];
      /* Careful with back end of data! */
      inout[2]=t0*inout[2]+t1*inout[1]+t2*inout[0];
      inout[1]=t0*inout[1]+t1*inout[0];
      inout[0]=t0*inout[0];
  }

  /* Forward part (padding now ensures room!) */
  {
      register double t0=forwardfilter[0],t1=forwardfilter[1],
                      t2=forwardfilter[2],t3=forwardfilter[3]; 
      for (i=0; i<n; i++)
          inout[i]=t0*inout[i]+t1*inout[i+1]+t2*inout[i+2]+t3*inout[i+3];
  }
}

/*
  Back and forth, linear convolution for +/- 5 filters using padding (IN PLACE)
  out: input/output array for in place computation
  n: size of actual data set being convolved
  npad: size of padded region needed for "leakage" of foward filters
        over the edge
  reversefilter: reverse-time filter coefficients	
  forwardfilter: forward-time filter coefficients
*/
void 
convpts5(double inout[],int n,int npad,
	 double reversefilter[],double forwardfilter[])
{
  register int i;

  /* First, make sure pad is zero */
  for (i=n; i<npad; i++) inout[i]=0.;

  /* Reverse part */
  {
    register double t0=reversefilter[0], t1=reversefilter[1],
                    t2=reversefilter[2], t3=reversefilter[3],
                    t4=reversefilter[4], t5=reversefilter[5];

    for (i=npad-1; i>=5; i--)
        inout[i]=t0*inout[i]+t1*inout[i-1]+t2*inout[i-2]+t3*inout[i-3]+
                 t4*inout[i-4]+t5*inout[i-5];

    /* Careful with back end of data! */
    inout[4]=t0*inout[4]+t1*inout[3]+t2*inout[2]+t3*inout[1]+t4*inout[0];
    inout[3]=t0*inout[3]+t1*inout[2]+t2*inout[1]+t3*inout[0];
    inout[2]=t0*inout[2]+t1*inout[1]+t2*inout[0];
    inout[1]=t0*inout[1]+t1*inout[0];
    inout[0]=t0*inout[0];
  }

  /* Forward part (padding now ensures room!) */
  {
    register double t0=forwardfilter[0], t1=forwardfilter[1],
                    t2=forwardfilter[2], t3=forwardfilter[3],
                    t4=forwardfilter[4], t5=forwardfilter[5]; 
    for (i=0; i<n; i++)
        inout[i]=t0*inout[i]+t1*inout[i+1]+t2*inout[i+2]+t3*inout[i+3]+
                 t4*inout[i+4]+t5*inout[i+5];
  }
}


/*
  Back and forth, linear convolution for +/- 5 filters using padding (IN PLACE)
  out: input/output array for in place computation
  n: size of actual data set being convolved
  npad: size of padded region needed for "leakage" of foward filters
        over the edge
  reversefilter: reverse-time filter coefficients	
  forwardfilter: forward-time filter coefficients
*/
void 
convptsioper5(double out[],double in[],int n,int npad,
	      double reversefilter[],double forwardfilter[])
{
  register int i;
  
  /* Reverse part */
  {
  register double t0=reversefilter[0], t1=reversefilter[1],
                  t2=reversefilter[2], t3=reversefilter[3],
                  t4=reversefilter[4], t5=reversefilter[5];
  
  out[npad-1]=t0*in[4]+t1*in[3]+t2*in[2]+t3*in[1]+t4*in[0]+t5*in[n-1];
  out[npad-2]=t0*in[3]+t1*in[2]+t2*in[1]+t3*in[0]
             +t4*in[n-1]+t5*in[n-2];
  out[npad-3]=t0*in[2]+t1*in[1]+t2*in[0]+t3*in[n-1]
             +t4*in[n-2]+t5*in[n-3];
  out[npad-4]=t0*in[1]+t1*in[0]
             +t2*in[n-1]+t3*in[n-2]+t4*in[n-3]+t5*in[n-4];
  out[npad-5]=t0*in[0]+t1*in[n-1]+t2*in[n-2]
             +t3*in[n-3]+t4*in[n-4]+t5*in[n-5];
  
  for (i=npad-6; i>=5; i--)
      out[i]=t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+
             t4*in[i-4]+t5*in[i-5];
  
  out[4]=out[npad-1];
  out[3]=out[npad-2];
  out[2]=out[npad-3];
  out[1]=out[npad-4];
  out[0]=out[npad-5];
  }
  
  
  /* Forward part (padding now ensures room!) */
  {
    register double t0=forwardfilter[0], t1=forwardfilter[1],
                    t2=forwardfilter[2], t3=forwardfilter[3],
                    t4=forwardfilter[4], t5=forwardfilter[5];
    for (i=0; i<n; i++)
      out[i]=t0*out[i]+t1*out[i+1]+t2*out[i+2]+t3*out[i+3]+
	t4*out[i+4]+t5*out[i+5];
  }
}

/*
  Back and forth, linear convolution for +/- 5 filters using padding (IN PLACE)
  out: input/output array for in place computation
  n: size of actual data set being convolved
  npad: size of padded region needed for "leakage" of foward filters
        over the edge
  reversefilter: reverse-time filter coefficients	
  forwardfilter: forward-time filter coefficients
*/
void 
convptsio5(double out[],double in[],int n,int npad,
	   double reversefilter[],double forwardfilter[])
{
  register int i;

  /* First, make sure pad is zero */
  for (i=n; i<npad; i++) out[i]=0.;

  /* Reverse part */
  {
    register double t0=reversefilter[0], t1=reversefilter[1],
                    t2=reversefilter[2], t3=reversefilter[3],
                    t4=reversefilter[4], t5=reversefilter[5];

    out[npad-1]=t5*in[npad-6];
    out[npad-2]=t4*in[npad-6]+t5*in[npad-7];
    out[npad-3]=t3*in[npad-6]+t4*in[npad-7]+t5*in[npad-8];
    out[npad-4]=t2*in[npad-6]+t3*in[npad-7]+t4*in[npad-8]+t5*in[npad-9];
    out[npad-5]=t1*in[npad-6]+t2*in[npad-7]
               +t3*in[npad-8]+t4*in[npad-9]+t5*in[npad-10];

    for (i=npad-6; i>=5; i--)
        out[i]=t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+
	       t4*in[i-4]+t5*in[i-5];

    out[4]=t0*in[4]+t1*in[3]+t2*in[2]+t3*in[1]+t4*in[0];
    out[3]=t0*in[3]+t1*in[2]+t2*in[1]+t3*in[0];
    out[2]=t0*in[2]+t1*in[1]+t2*in[0];
    out[1]=t0*in[1]+t1*in[0];
    out[0]=t0*in[0];

  }


  /* Forward part (padding now ensures room!) */
  {
    register double t0=forwardfilter[0], t1=forwardfilter[1],
                    t2=forwardfilter[2], t3=forwardfilter[3],
                    t4=forwardfilter[4], t5=forwardfilter[5]; 
    for (i=0; i<n; i++)
      out[i]=t0*out[i]+t1*out[i+1]+t2*out[i+2]+t3*out[i+3]+
	t4*out[i+4]+t5*out[i+5];
  }
}


/*
  Back and forth, linear convolution for +/- 5 filters using padding (IN PLACE)
  out: input/output array for in place computation
  n: size of actual data set being convolved
  npad: size of padded region needed for "leakage" of foward filters
        over the edge
  reversefilter: reverse-time filter coefficients	
  forwardfilter: forward-time filter coefficients
*/
void 
convptsio5full(double out[],double in[],int n,int npad,
	       double fullfilter[])
{
  register int i;

  register double t0=fullfilter[0], t1=fullfilter[1],
    t2=fullfilter[2], t3=fullfilter[3],
    t4=fullfilter[4], t5=fullfilter[5];
  
  if(t0 != 0.0) 
    { /* symmetric filter */
/*
    i=0;
    out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
    t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4]+t5*in[i-5];
  ---------------------------------------------------
  i=0 case is doubled?? - probably left from cut and paste edditing
  to correct the edges (avoiding pad) 
  seems that we are not using this function anymore, though
  07/19/00 i.d.
*/
      i=0;
      out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
	t0*in[i];

      i=1;
      out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
	t0*in[i]+t1*in[i-1];

      i=2;
      out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
	t0*in[i]+t1*in[i-1]+t2*in[i-2];

      i=3;
      out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
	t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3];

      i=4;
      out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
	t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4];

      for (i=5; i<n-5; i++)
	out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
	  t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4]+t5*in[i-5];
    
      i=n-5;
      out[i]=t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
	t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4]+t5*in[i-5];

      i=n-4;
      out[i]=t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
	t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4]+t5*in[i-5];

      i=n-3;
      out[i]=t2*in[i+2]+t1*in[i+1]+
	t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4]+t5*in[i-5];

      i=n-2;
      out[i]=t1*in[i+1]+
	t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4]+t5*in[i-5];

      i=n-1;
      out[i]=t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4]+t5*in[i-5];
    }
  else
    { /* antisymmetric filter */
    i=0;
    out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
      t0*in[i];

    i=1;
    out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
      t0*in[i]-t1*in[i-1];

    i=2;
    out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
      t0*in[i]-t1*in[i-1]-t2*in[i-2];

    i=3;
    out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
      t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3];

    i=4;
    out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
      t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3]-t4*in[i-4];

    for (i=5; i<n-5; i++)
      out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
	t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3]-t4*in[i-4]-t5*in[i-5];
    
    i=n-5;
    out[i]=t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
      t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3]-t4*in[i-4]-t5*in[i-5];

    i=n-4;
    out[i]=t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
      t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3]-t4*in[i-4]-t5*in[i-5];

    i=n-3;
    out[i]=t2*in[i+2]+t1*in[i+1]+
      t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3]-t4*in[i-4]-t5*in[i-5];

    i=n-2;
    out[i]=t1*in[i+1]+
      t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3]-t4*in[i-4]-t5*in[i-5];

    i=n-1;
    out[i]=t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3]-t4*in[i-4]-t5*in[i-5];
    }

}

/* Full 3d convolution using data transposition (IN PLACE)
   (Verified and tested against conv3dtest for full 3D data sets.  Note
   that to pass that test, we need a padding region of size pd=n+3 for
   third-order interpolets.)
   datout: input/output data
   nx, ny, nz: size of region actually being computed
   npx, npy, npz: size of padded region needed for "leakage" of
                  foward filters  
                  *** NOTE ***: must also equal DIMENSIONS of datout!
   reversefilter: reverse-time filter coefficients (2d array: [x,y,z][filter])
   forwardfilter: forward-time filter coefficients (2d array: [x,y,z][filter])
   hfl: +/- reach of filter, filter "half" length
*/
void 
conv3d(double datout[], double *reversefilter, double *forwardfilter,
       double *fullfilter,
       int nx, int ny, int nz, 
       int npx, int npy, int npz,
       int hfl)
{
  int i,j;
  int npyz=npy*npz;
  double *datoutp=datout;
  
  /* Convolution of each yz plane */
  for (i=0; i<nx; i++) {
      datoutp=datout+i*npyz;
      /* Along z first */
      /* (Note: When inlined in C, this routine is faster than fortran!) */
      switch (hfl) {
	case 3:
            for (j=0; j<ny; j++)
             convpts3(datout+(i*npy+j)*npz,nz,npz,reversefilter,forwardfilter);
            break;
	case 5:
            for (j=0; j<ny; j++)   
	     convpts5(datout+(i*npy+j)*npz,nz,npz,reversefilter,forwardfilter);
	  break;
	default:
            fprintf(stderr,"\n\nhfl=%d not supported in conv3d (1)\n\n",hfl);
            exit(1);
      }

      /* Now do plane along y */
      /* (This seems too big to inline, and fortran runs a lot faster!) */
      switch (hfl) {
	case 3:
            convsets3_(datoutp,
                       datoutp-3*npz,datoutp-2*npz,datoutp-npz,
                       datoutp+npz,datoutp+2*npz,datoutp+3*npz,
                       &nz,&npz,&ny,&npy,reversefilter+1*(hfl+1),
                       forwardfilter+1*(hfl+1));
            break;
	case 5:
            convsets5_(datoutp,
                       datoutp-5*npz,datoutp-4*npz,
                       datoutp-3*npz,datoutp-2*npz,datoutp-npz,
                       datoutp+npz,datoutp+2*npz,datoutp+3*npz,
                       datoutp+4*npz,datoutp+5*npz,
                       &nz,&npz,&ny,&npy,
                       reversefilter+1*(hfl+1),forwardfilter+1*(hfl+1));
            break;
	default:
            fprintf(stderr,"\n\nhfl=%d not supported in conv3d (2)\n\n",hfl);
            exit(1);
      }
  }

  /* Convolutions along z in small chunks */
  for (j=0; j<ny; j++) {
      datoutp=datout+j*npz;
      switch (hfl) {
	case 3: convsets3_(datoutp,
                           datoutp-3*npyz,datoutp-2*npyz,datoutp-npyz,
                           datoutp+npyz,datoutp+2*npyz,datoutp+3*npyz,
                           &nz,&npyz,&nx,&npx,
                           reversefilter+2*(hfl+1),forwardfilter+2*(hfl+1));
        break;
	case 5: convsets5_(datoutp,
                           datoutp-5*npyz,datoutp-4*npyz,
                           datoutp-3*npyz,datoutp-2*npyz,datoutp-npyz,
                           datoutp+npyz,datoutp+2*npyz,datoutp+3*npyz,
                           datoutp+4*npyz,datoutp+5*npyz,
                           &nz,&npyz,&nx,&npx,
                           reversefilter+2*(hfl+1),forwardfilter+2*(hfl+1));
        break;
	default:
            fprintf(stderr,"\n\nhfl=%d not supported in conv3d (3)\n\n",hfl);
            exit(1);
      }
  }
  
}


/* Full 3d convolution where results are placed on a different
   grid than the input grid.
   datin: input data
   datout: output data, assumed to have same dimensions as datin
   ifcum: specifies whether we accumulate results onto datout or not
   reversefilter: reverse-time filter coefficients (2d array: [x,y,z][filter])
   forwardfilter: forward-time filter coefficients (2d array: [x,y,z][filter])
   nlx, nly, nlz: linear dimensions of data sets
   hfl: +/- reach of filter, filter "half" length
   work: worspace for convulution
*/
void 
conv3dnew(double datin[],double datout[],int ifcum,int periodic,
	  double *reversefilter, double *forwardfilter, double *fullfilter,
	  int nlx, int nly, int nlz, 
	  int hfl, double work[], double work_extra[])
{
  int i,j;

  /*Constants giving region of workspace which must be computed */
  int nnx=nlx+hfl;
  int nny=nly+hfl;
  int nnz=nlz+hfl;
 
  int nnyz=nny*nnz;


  /* Constants describing filling of result into lower region*/
  int nlyz=nly*nlz;

  double *workp,*datoutp;

  /* Convolution of each yz plane */
  for (i=0; i<nlx; i++){
      workp=work+i*nnyz;
      /* Along z first */
      /* (Note: When inlined in C, this routine is faster than fortran!) */
      switch (hfl) {
	case 5:
            for (j=0; j<nly; j++)   
                if (periodic)
                    convptsioper5(work+(i*nny+j)*nnz,
                                  datin+(i*nly+j)*nlz,
                                  nlz,nnz,
                                  reversefilter+2*(hfl+1),
                                  forwardfilter+2*(hfl+1));
                else
/* 	      convptsio5full(work+(i*nny+j)*nnz, */
/* 			     datin+(i*nly+j)*nlz, */
/* 			     nlz,nnz,fullfilter); */
                    convptsio5(work+(i*nny+j)*nnz,
                               datin+(i*nly+j)*nlz,
                               nlz,nnz,
                               reversefilter+2*(hfl+1),
                               forwardfilter+2*(hfl+1));
            break;
	default:
           fprintf(stderr,"\n\nhfl=%d not supported in conv3dnew (1)\n\n",hfl);
           exit(1);
      }
      
      /* Now do plane along y */
      /* (This seems too big to inline, and fortran runs a lot faster!) */
      switch (hfl) {
	case 5:
            if (periodic)
                convsetsper5_(workp,
                              workp-5*nnz,workp-4*nnz,
                              workp-3*nnz,workp-2*nnz,workp-nnz,
                              workp+nnz,workp+2*nnz,workp+3*nnz,
                              workp+4*nnz,workp+5*nnz,
                              &nlz,&nnz,&nly,&nny,
                              reversefilter+1*(hfl+1),forwardfilter+1*(hfl+1));
            else
                convsets5_(workp,
                           workp-5*nnz,workp-4*nnz,
                           workp-3*nnz,workp-2*nnz,workp-nnz,
                           workp+nnz,workp+2*nnz,workp+3*nnz,
                           workp+4*nnz,workp+5*nnz,
                           &nlz,&nnz,&nly,&nny,
                           reversefilter+1*(hfl+1),forwardfilter+1*(hfl+1));
            break;
	default:
           fprintf(stderr,"\n\nhfl=%d not supported in conv3dnew (2)\n\n",hfl);
           exit(1);
      }
  }
  
  
  /* Convolutions along z in small chunks */
  if (ifcum==0) /*No cumulative addition, just replacing data on out.*/
      for (j=0; j<nly; j++) {
          workp=work+j*nnz;
          datoutp=datout+j*nlz;  
          switch (hfl) {
            case 5:
                if (periodic)
                    convsetsioppercum5_(datoutp,
                                        workp,
                                        workp-5*nnyz,workp-4*nnyz,
                                        workp-3*nnyz,workp-2*nnyz,workp-1*nnyz,
                                        workp+1*nnyz,workp+2*nnyz,workp+3*nnyz,
                                        workp+4*nnyz,workp+5*nnyz,
                                        &nlz,&nnyz,&nlx,&nnx,&nlyz,
                                        reversefilter+0*(hfl+1),
                                        forwardfilter+0*(hfl+1));
                else
                    convsetsio5full_(datoutp,
                                     workp,
                                     workp-5*nnyz,workp-4*nnyz,
                                     workp-3*nnyz,workp-2*nnyz,workp-1*nnyz,
                                     workp+1*nnyz,workp+2*nnyz,workp+3*nnyz,
                                     workp+4*nnyz,workp+5*nnyz,
                                     &nlz,&nnyz,&nlx,&nnx,&nlyz,
                                     fullfilter+0*(hfl+1) );
                break;
            default:
	   fprintf(stderr,"\n\nhfl=%d not supported in conv3dnew (3)\n\n",hfl);
           exit(1);
	  }
      }
  if (ifcum == 1)
      /*Add the values of the convolution to the values already on out*/
      for (j=0; j<nly; j++) {
          workp=work+j*nnz;
          datoutp=datout+j*nlz;
          
          switch (hfl) {
            case 5:
                if (periodic)
                    convsetsiopper5_(datoutp,
                                     workp,
                                     workp-5*nnyz,workp-4*nnyz,
                                     workp-3*nnyz,workp-2*nnyz,workp-1*nnyz,
                                     workp+1*nnyz,workp+2*nnyz,workp+3*nnyz,
                                     workp+4*nnyz,workp+5*nnyz,
                                     &nlz,&nnyz,&nlx,&nnx,&nlyz,
                                     reversefilter+0*(hfl+1),
                                     forwardfilter+0*(hfl+1));
                else
                    convsetsiop5full_(datoutp,
                                      workp,
                                      workp-5*nnyz,workp-4*nnyz,
                                      workp-3*nnyz,workp-2*nnyz,workp-1*nnyz,
                                      workp+1*nnyz,workp+2*nnyz,workp+3*nnyz,
                                      workp+4*nnyz,workp+5*nnyz,
                                      &nlz,&nnyz,&nlx,&nnx,&nlyz,
                                      fullfilter+0*(hfl+1));
/* 	      convsetsiop5_(datoutp, */
/* 			    workp, */
/* 			    workp-5*nnyz,workp-4*nnyz, */
/* 			    workp-3*nnyz,workp-2*nnyz,workp-1*nnyz, */
/* 			    workp+1*nnyz,workp+2*nnyz,workp+3*nnyz, */
/* 			    workp+4*nnyz,workp+5*nnyz, */
/* 			    &nlz,&nnyz,&nlx,&nnx,&nlyz, */
/* 			    reversefilter+0*(hfl+1), */
/* 			    forwardfilter+0*(hfl+1)); */
                break;
            default:
	   fprintf(stderr,"\n\nhfl=%d not supported in conv3dnew (3)\n\n",hfl);
           exit(1);
	  }
      }
  
}

/* 3d conv computing onto ghost points only (IN PLACE)
   (Key Lesson:
   For this convolution, where our output data was "spotty", the trick
   was to do things in the opposite from the forward transform, so
   that our processing never involved skipping over "holes" in cache
   lines.
   This also teaches that once you have a good cache sequence for an
   operation, a good sequence for the transpose operation involves
   reversing the sequence, just as in (A*B*C)'=C'*B'*A'.  Hmmm....)

   inout: input/output data
   nx, ny, nz: dimension of region actually being computed
   npx, npy, npz: dimension of padded region needed for "leakage" of
                  foward filters  
   reversefilter: reverse-time filter coefficients (2d array: [x,y,z][filter])
   forwardfilter: forward-time filter coefficients (2d array: [x,y,z][filter])
*/
void 
Pconv3d(double inout[], int nx, int ny, int nz, int npx, int npy, int npz, 
	double reversefilter[3][4], double forwardfilter[3][4])
{
  int i,j;
  int npyz=npy*npz;
  double *inoutp=inout;

  /* Convolutions, in linear y chuncks, of yz planes along x */
  for (j=0; j<ny; j++) {
      inoutp=inout+j*npz;
      pconvsets3_(inoutp,
                  inoutp-3*npyz,inoutp-2*npyz,inoutp-npyz,
                  inoutp+npyz,inoutp+2*npyz,inoutp+3*npyz,
                  &nz,&npyz,&nx,&npx,reversefilter[2],forwardfilter[2]);
  }
  
  /* Convolution of each yz plane */
  for (i=0; i<nx; i+=2){
      /* Plane i along y */
      /* (This seems too big to inline, and fortran runs a lot faster!) */
      inoutp=inout+i*npyz;
      pconvsets3_(inoutp,
                  inoutp-3*npz,inoutp-2*npz,inoutp-npz,
                  inoutp+npz,inoutp+2*npz,inoutp+3*npz,
                  &nz,&npz,&ny,&npy,reversefilter[1],forwardfilter[1]);
      
      /* Finally, along z */
      /* (Note: When inlined in C, this routine is faster than fortran!) */
      for (j=0; j<ny; j+=2)   
        convpts3(inout+(i*npy+j)*npz,nz,npz,reversefilter[0],forwardfilter[0]);
  }


}

/*
  Back and forth, linear convolution using padding
     (IN / +=OUT on ghosts)
  out: output array of just ghosts (upper level)
  in: input array (lower level)
  n: size of actual data set being convolved
  npad: size of padded region needed for "leakage" of foward filters
        over the edge
  reversefilter: reverse-time filter coefficients	
  forwardfilter: forward-time filter coefficients
  ------------------
  not used currently  07/19/00 i.d.
*/
void 
aPconvptsp3(double out[],double in[],int n,
	    int npad,double fullfilter[])
{
    {
        
        register int i;
        register double t1=fullfilter[1], t3=fullfilter[3]; 
        
        i=0;
        out[i]+=t3*in[2*i+3];

        i=1;
        out[i]+=in[2*i]+t1*in[2*i+1]+t3*in[2*i+3];

        i=2;
        out[i]+=t1*in[2*i-1]+in[2*i]+t1*in[2*i+1]+t3*in[2*i+3];

        for (i=3; i<n/2-3; i++)
            out[i]+=t3*in[2*i-3]+t1*in[2*i-1]
                +in[2*i]+t1*in[2*i+1]+t3*in[2*i+3];

        i=n/2-3;
        out[i]+=t3*in[2*i-3]+t1*in[2*i-1]
            +in[2*i]+t1*in[2*i+1];

        i=n/2-2;
        out[i]+=t3*in[2*i-3]+t1*in[2*i-1];

        i=n/2-1;
        out[i]+=t3*in[2*i-3];
    }
}

/*
  Back and forth, linear convolution using padding
     (IN / +=OUT on ghosts)
  out: output array of just ghosts (upper level)
  in: input array (lower level)
  n: size of actual data set being convolved
  npad: size of padded region needed for "leakage" of foward filters
        over the edge
  reversefilter: reverse-time filter coefficients	
  forwardfilter: forward-time filter coefficients
*/
void 
aPconvptsp8full(double out[],double in[],int n,
		int npad,double fullfilter[])
{
  register int i;
  register double t0=fullfilter[0], t1=fullfilter[1], t2=fullfilter[2],
    t3=fullfilter[3], t4=fullfilter[4], t5=fullfilter[5], 
    t6=fullfilter[6], t7=fullfilter[7], t8=fullfilter[8]; 

  if(t0 != 0.0)
    { /* symmetric filter */
      i=0;
      out[i]+=t8*in[2*i+8];
      i=1;
      out[i]+=t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
      i=2;
      out[i]+=t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
      i=3;
      out[i]+=t2*in[2*i+2]+t3*in[2*i+3]+t4*in[2*i+4]
	+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
      i=4;
      out[i]+=t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
	+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
      i=5;
      out[i]+=t2*in[2*i-2]+t1*in[2*i-1]+t0*in[2*i]+t1*in[2*i+1]
	+t2*in[2*i+2]+t3*in[2*i+3]+t4*in[2*i+4]+t5*in[2*i+5]
	+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
      i=6;
      out[i]+=t4*in[2*i-4]+t3*in[2*i-3]+t2*in[2*i-2]+t1*in[2*i-1]
	+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]+t4*in[2*i+4]
	+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
      i=7;
      out[i]+=t6*in[2*i-6]+t5*in[2*i-5]+t4*in[2*i-4]+t3*in[2*i-3]
	+t2*in[2*i-2]+t1*in[2*i-1]+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]
	+t3*in[2*i+3]+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]
	+t7*in[2*i+7]+t8*in[2*i+8];

      for (i=8; i<n/2-8; i++)
	out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5]
	  +t4*in[2*i-4]+t3*in[2*i-3]+t2*in[2*i-2]+t1*in[2*i-1]
	  +t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
	  +t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]
	  +t8*in[2*i+8];

      i=n/2-8;
      out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5]
	+t4*in[2*i-4]+t3*in[2*i-3]+t2*in[2*i-2]+t1*in[2*i-1]
	+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
	+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7];
      i=n/2-7;
      out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5]
	+t4*in[2*i-4]+t3*in[2*i-3]+t2*in[2*i-2]+t1*in[2*i-1]
	+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
	+t4*in[2*i+4]+t5*in[2*i+5];
      i=n/2-6;
      out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5]
	+t4*in[2*i-4]+t3*in[2*i-3]+t2*in[2*i-2]+t1*in[2*i-1]
	+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3];
      i=n/2-5;
      out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5]
	+t4*in[2*i-4]+t3*in[2*i-3]+t2*in[2*i-2]+t1*in[2*i-1]
	+t0*in[2*i]+t1*in[2*i+1];
      i=n/2-4;
      out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5]
	+t4*in[2*i-4]+t3*in[2*i-3]+t2*in[2*i-2]+t1*in[2*i-1];
      i=n/2-3;
      out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5]
	+t4*in[2*i-4]+t3*in[2*i-3];
      i=n/2-2;
      out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5];
      i=n/2-1;
      out[i]+=t8*in[2*i-8]+t7*in[2*i-7];
      /*     out[i]+=t3*in[2*i-3]+t1*in[2*i-1]+in[2*i]; */
    }
  else
    { /* antisymmetric filter */
      i=0;
      out[i]+=t8*in[2*i+8];
      i=1;
      out[i]+=t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
      i=2;
      out[i]+=t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
      i=3;
      out[i]+=t2*in[2*i+2]+t3*in[2*i+3]+t4*in[2*i+4]
	+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
      i=4;
      out[i]+=t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
	+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
      i=5;
      out[i]+=-1.*t2*in[2*i-2]-t1*in[2*i-1]+t0*in[2*i]+t1*in[2*i+1]
	+t2*in[2*i+2]+t3*in[2*i+3]+t4*in[2*i+4]+t5*in[2*i+5]
	+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
      i=6;
      out[i]+=-1.*t4*in[2*i-4]-t3*in[2*i-3]-t2*in[2*i-2]-t1*in[2*i-1]
	+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]+t4*in[2*i+4]
	+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
      i=7;
      out[i]+=-1.*t6*in[2*i-6]-t5*in[2*i-5]-t4*in[2*i-4]-t3*in[2*i-3]
	-t2*in[2*i-2]-t1*in[2*i-1]+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]
	+t3*in[2*i+3]+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]
	+t7*in[2*i+7]+t8*in[2*i+8];

      for (i=8; i<n/2-8; i++)
	out[i]+= -1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5]
	  -t4*in[2*i-4]-t3*in[2*i-3]-t2*in[2*i-2]-t1*in[2*i-1]
	  +t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
	  +t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]
	  +t8*in[2*i+8];

      i=n/2-8;
      out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5]
	-t4*in[2*i-4]-t3*in[2*i-3]-t2*in[2*i-2]-t1*in[2*i-1]
	+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
	+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7];
      i=n/2-7;
      out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5]
	-t4*in[2*i-4]-t3*in[2*i-3]-t2*in[2*i-2]-t1*in[2*i-1]
	+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
	+t4*in[2*i+4]+t5*in[2*i+5];
      i=n/2-6;
      out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5]
	-t4*in[2*i-4]-t3*in[2*i-3]-t2*in[2*i-2]-t1*in[2*i-1]
	+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3];
      i=n/2-5;
      out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5]
	-t4*in[2*i-4]-t3*in[2*i-3]-t2*in[2*i-2]-t1*in[2*i-1]
	+t0*in[2*i]+t1*in[2*i+1];
      i=n/2-4;
      out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5]
	-t4*in[2*i-4]-t3*in[2*i-3]-t2*in[2*i-2]-t1*in[2*i-1];
      i=n/2-3;
      out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5]
	-t4*in[2*i-4]-t3*in[2*i-3];
      i=n/2-2;
      out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5];
      i=n/2-1;
      out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7];
      /*     out[i]+=t3*in[2*i-3]+t1*in[2*i-1]+in[2*i]; */
    }
}

/*
  Back and forth, linear convolution using padding
     (IN / +=OUT on ghosts)
  out: output array of just ghosts (upper level)
  in: input array (lower level)
  n: size of actual data set being convolved
  npad: size of padded region needed for "leakage" of foward filters
        over the edge
  reversefilter: reverse-time filter coefficients	
  forwardfilter: forward-time filter coefficients
  ---------------------------
  currently not used 07/19/00 i.d.
*/
void 
aPconvptsp8(double out[],double in[],int n,int npad,
	    double reversefilter[],double forwardfilter[])
{
  register int i;

  for (i=n; i<npad; i++) in[i]=0.; /* First, make sure pad is zero */

  /* Reverse part */
  {
    register double t0=reversefilter[0], t1=reversefilter[1],
                    t2=reversefilter[2], t3=reversefilter[3],
                    t4=reversefilter[4], t5=reversefilter[5],
                    t6=reversefilter[6], t7=reversefilter[7],
                    t8=reversefilter[8];
    for (i=npad-1; i>7; i--)
        in[i]=t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+
              t4*in[i-4]+t5*in[i-5]+t6*in[i-6]+t7*in[i-7]+
              t8*in[i-8];
    
    in[7]=t0*in[7]+t1*in[6]+t2*in[5]+t3*in[4]
         +t4*in[3]+t5*in[2]+t6*in[1]+t7*in[0];
    in[6]=t0*in[6]+t1*in[5]+t2*in[4]+t3*in[3]+t4*in[2]+t5*in[1]+t6*in[0];
    in[5]=t0*in[5]+t1*in[4]+t2*in[3]+t3*in[2]+t4*in[1]+t5*in[0];
    in[4]=t0*in[4]+t1*in[3]+t2*in[2]+t3*in[1]+t4*in[0];
    in[3]=t0*in[3]+t1*in[2]+t2*in[1]+t3*in[0];
    in[2]=t0*in[2]+t1*in[1]+t2*in[0];
    in[1]=t0*in[1]+t1*in[0];
    in[0]=t0*in[0];
  }

  /* Forward part (padding now ensures room!) */
  /* Note that even points on "in" are accumulated onto "out" */
  {
    register double t0=forwardfilter[0], t1=forwardfilter[1],
                    t2=forwardfilter[2], t3=forwardfilter[3],
                    t4=forwardfilter[4], t5=forwardfilter[5],
                    t6=forwardfilter[6], t7=forwardfilter[7],
                    t8=forwardfilter[8];
    for (i=0; i<n/2; i++)
	out[i]+=t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
	       +t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]
               +t8*in[2*i+8];
  }
}

/*
  Back and forth, linear convolution using padding
     (IN / -=OUT on ghosts)
  out: output array of just ghosts (upper level)
  in: input array (lower level)
  n: size of actual data set being convolved
  npad: size of padded region needed for "leakage" of foward filters
        over the edge
  fullfilter: convolution filter
*/
void 
aPconvptsm3(double out[],double in[],int n,int npad,double fullfilter[])
{
  register int i;
  /* First, make sure pad is zero */ 
  /*   for (i=n; i<npad; i++) in[i]=0.;i*/
  
  /* Note that even points on "in" are accumulated onto "out" */
  {
    register double t1=fullfilter[1], t3=fullfilter[3]; 
    i=0;
    out[i]-=in[2*i]+t1*in[2*i+1]+t3*in[2*i+3];
    i=1;
    out[i]-=t1*in[2*i-1]+in[2*i]+t1*in[2*i+1]+t3*in[2*i+3];
    for (i=2; i<n/2-2; i++)
        out[i]-= t3*in[2*i-3] +t1*in[2*i-1] +in[2*i]+t1*in[2*i+1]+t3*in[2*i+3];
    i=n/2-2;
    out[i]-= t3*in[2*i-3] + t1*in[2*i-1] + in[2*i]+t1*in[2*i+1];
    i=n/2-1;
    out[i]-= t3*in[2*i-3] + t1*in[2*i-1] + in[2*i];
  }
}

/*
  Back and forth, linear convolution using padding
  (IN / -=OUT on ghosts)
  out: output array of just ghosts (upper level)
  in: input array (lower level)
  n: size of actual data set being convolved
  npad: size of padded region needed for "leakage" of foward filters
        over the edge
  reversefilter: reverse-time filter coefficients	
  forwardfilter: forward-time filter coefficients
  -----------------------------------
  currently not used 07/19/00 i.d.
*/
void 
aPconvptsm8(double out[],double in[],int n,int npad,
	    double reversefilter[],double forwardfilter[])
{
  register int i;

  for (i=n; i<npad; i++) in[i]=0.; /* First, make sure pad is zero */

  /* Reverse part */
  {
    register double t0=reversefilter[0], t1=reversefilter[1],
                    t2=reversefilter[2], t3=reversefilter[3],
                    t4=reversefilter[4], t5=reversefilter[5],
                    t6=reversefilter[6], t7=reversefilter[7]; 
    register double t8=reversefilter[8];
    for (i=npad-1; i>7; i--)
        in[i]=t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+
              t4*in[i-4]+t5*in[i-5]+t6*in[i-6]+t7*in[i-7]+
              t8*in[i-8];

    in[7]=t0*in[7]+t1*in[6]+t2*in[5]+t3*in[4]
         +t4*in[3]+t5*in[2]+t6*in[1]+t7*in[0];
    in[6]=t0*in[6]+t1*in[5]+t2*in[4]+t3*in[3]+t4*in[2]+t5*in[1]+t6*in[0];
    in[5]=t0*in[5]+t1*in[4]+t2*in[3]+t3*in[2]+t4*in[1]+t5*in[0];
    in[4]=t0*in[4]+t1*in[3]+t2*in[2]+t3*in[1]+t4*in[0];
    in[3]=t0*in[3]+t1*in[2]+t2*in[1]+t3*in[0];
    in[2]=t0*in[2]+t1*in[1]+t2*in[0];
    in[1]=t0*in[1]+t1*in[0];
    in[0]=t0*in[0];
  }

  /* Forward part (padding now ensures room!) */
  /* Note that even points on "in" are accumulated onto "out" */
  {
    register double t0=forwardfilter[0], t1=forwardfilter[1],
                    t2=forwardfilter[2], t3=forwardfilter[3],
                    t4=forwardfilter[4], t5=forwardfilter[5],
                    t6=forwardfilter[6], t7=forwardfilter[7]; 
    register double t8=forwardfilter[8];
    for (i=0; i<n/2; i++)
      out[i]-=t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
	     +t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
  }
}

/*
  Full two-level 3d convolution of conjugate transform type (IN PLACE)
     Computes I +/- P I ~P, where P is projection onto ghosts, ~P is the
     projection onto non-ghosts, and I is the simple interpolet
     convolusion, and OBEYS REAL/COEFF SPACE CONVENTIONS on output.
  upper: Points to location in upper data array corresponding to
         origin of lower array.
  lower: Points to origin of lower array
  sign: Sign of transform, + is Idag, - is Jdag
  reversefilter: reverse-time filter coefficients (2d array: [x,y,z][filter])
  forwardfilter: forward-time filter coefficients (2d array: [x,y,z][filter])
  fullfilter:    non-factorized filter coefficients
  nux, nuy, nuz: dimensions of upper array
  nlx, nly, nlz: dimensions of lower array
  hfl: length of half-filters
  work, work_extra: workspaces of size 
                    at least (nlx+3*hfl)*(nly+3*hfl)*(nlz+3*hfl)
*/
void 
aPconv3d(double upper[], double lower[], int sign,
	 double reversefilter[], double forwardfilter[], 
	 double fullfilter[],
	 int nux, int nuy, int nuz,
	 int nlx, int nly, int nlz,
	 int hfl,
	 double work[],
	 double work_extra[])
{
  int i,j,k;
  int nlyz=nly*nlz;
  int nx=nlx+2*hfl, ny=nly+2*hfl, nz=nlz+2*hfl; 
  int nwx=nlx+3*hfl, nwy=nly+3*hfl, nwz=nlz+3*hfl;
  int nwyz=nwy*nwz;

  int fptl=2,lptl=2+nlz-1;
  double *workp,*lowerp;
  double *workp_extra;
  int flag;

  int padxb,padyb,padzb;

  padxb=2*(hfl/2);
  padyb=2*(hfl/2);
  padzb=2*(hfl/2);

  fptl=padxb; lptl=padxb+nlz-1;

/*   printf("HFL and reversefilter are %d %12.5lf\n",hfl,reversefilter[0]); */

  /* INDEXING NOTES:

     (1) Our basic indexing refers to the work array, which has a
         buffer region that circumscribes the lower level.  The
	 beginning of the lower level is thus at (2,2,2) in the
	 indexing below.

     (2) "upper" is defined to reference the point in the upper array
	 that corresponds to the origin of the lower array.  There are
	 two things needed to translate properly: a homogeneous scale
	 factor of two AND to adjust for the fact that the origin of
	 the work array refers to the point (-1,-1,-1) from the point
	 in the upper array which corresponds to the origin of the
	 lower.  The scale factor is handled explicitly in the code
	 below.  To adjust for the issue of the origin, we include a
	 shift +((i/2-1)*nuy+(j/2-1))*nuz-1 where upper is used below.

     (3) "lower" references the origin of the "lower" array where the
         input comes from.  In the lower arrray, the locations of the
	 origin of the work array is (-2,-2,-2), and thus we use
         lowerp=lower+(-2)*nlyz+(j-2)*nlz-2 below.
	 */


  /* Convolutions, in linear y chuncks, of yz planes along x, with
     transfer of data from lower to work */
  for (j=0; j<ny; j++) {
      workp=work+j*nwz;
      workp_extra=work_extra+j*nwz;
      lowerp=lower+(-padxb)*nlyz+(j-padyb)*nlz-padzb;
      
      if (j<padyb || j>=padyb+nly) {
	  /* fill in pads with zeros */
	  for (i=0; i<nx; i++)
              for (k=0; k<nz; k++) {
                  *(workp+i*nwyz+k)=0.;
                  *(workp_extra+i*nwyz+k)=0.;
	      }
      }
      else {
	  /* Convolutions along z in small chuncks, with transfer of
	     data from lower into work */ 
	  flag=j%2;
	  switch (hfl) {
	    case 3:
                pconvsetsi3new_(workp_extra,
                                lowerp,
                                lowerp-3*nlyz,lowerp-2*nlyz,lowerp-nlyz,
                                lowerp+nlyz,lowerp+2*nlyz,lowerp+3*nlyz,
                                &nz,&nwyz,&nx,&nwx,
                                &nlyz,&fptl,&lptl,
                                fullfilter+0*(hfl+1),&flag,&sign);
/*   	      pconvsetsi3old_( */
/*  			      workp, */
/*  			      workp-3*nwyz,workp-2*nwyz,workp-nwyz, */
/*  			      workp+nwyz,workp+2*nwyz,workp+3*nwyz, */
/*  			      lowerp, */
/*  			      lowerp-3*nlyz,lowerp-2*nlyz,lowerp-nlyz, */
/*  			      lowerp+nlyz,lowerp+2*nlyz,lowerp+3*nlyz, */
/*  			      &nz,&nwyz,&nx,&nwx, */
/*  			      &nlyz,&fptl,&lptl, */
/*  			      reversefilter+0*(hfl+1),forwardfilter+0*(hfl+1),&flag,&sign); */
                break;
	    case 8:
                pconvsetsi8new_(workp,
                                lowerp,
                                lowerp-8*nlyz,lowerp-7*nlyz,lowerp-6*nlyz,
                                lowerp-5*nlyz,lowerp-4*nlyz,lowerp-3*nlyz,
                                lowerp-2*nlyz,lowerp-1*nlyz,
                                lowerp+nlyz,lowerp+2*nlyz,lowerp+3*nlyz,
                                lowerp+4*nlyz,lowerp+5*nlyz,lowerp+6*nlyz,
                                lowerp+7*nlyz,lowerp+8*nlyz,
                                &nz,&nwyz,&nx,&nwx,
                                &nlyz,&fptl,&lptl,
                                fullfilter+0*(hfl+1),&flag,&sign);
/*   	      pconvsetsi8_( */
/*   			   workp, */
/*   			   workp-8*nwyz,workp-7*nwyz, */
/*   			   workp-6*nwyz,workp-5*nwyz,workp-4*nwyz, */
/*   			   workp-3*nwyz,workp-2*nwyz,workp-nwyz, */
/*   			   workp+nwyz,workp+2*nwyz,workp+3*nwyz, */
/*   			   workp+4*nwyz,workp+5*nwyz,workp+6*nwyz, */
/*   			   workp+7*nwyz,workp+8*nwyz, */
/*   			   lowerp, */
/*   			   lowerp-8*nlyz,lowerp-7*nlyz, */
/*   			   lowerp-6*nlyz,lowerp-5*nlyz,lowerp-4*nlyz, */
/*   			   lowerp-3*nlyz,lowerp-2*nlyz,lowerp-nlyz, */
/*   			   lowerp+nlyz,lowerp+2*nlyz,lowerp+3*nlyz, */
/*   			   lowerp+4*nlyz,lowerp+5*nlyz,lowerp+6*nlyz, */
/*   			   lowerp+7*nlyz,lowerp+8*nlyz, */
/*   			   &nz,&nwyz,&nx,&nwx, */
/*   			   &nlyz,&fptl,&lptl, */
/*   			   reversefilter+0*(hfl+1), */
/*   			   forwardfilter+0*(hfl+1),&flag,&sign); */
                break;
	    default:
                fprintf(stderr,
                        "\n\nhfl=%d not supported in aPconv3d (1)\n\n",hfl);
                exit(1);
          }
      }
  }
  
  /* Convolution of each yz plane */
  for (i=0; i<nx; i+=2) {
      /* Convolution of plane i along y */
      /* (This seems too big to inline, and fortran runs a lot faster!) */
      workp=work+i*nwyz;
      workp_extra=work_extra+i*nwyz;
	  switch (hfl) {
	    case 3:
                pconvsetsio3full_(workp,workp_extra,
                                  workp_extra-3*nwz,workp_extra-2*nwz,
                                  workp_extra-nwz,
                                  workp_extra+nwz,workp_extra+2*nwz,
                                  workp_extra+3*nwz,
                                  &nz,&nwz,&ny,&nwy,fullfilter+1*(hfl+1));
                break;
	    case 8:
                pconvsetsio8full_(workp_extra,workp,workp-8*nwz,
                                  workp-7*nwz,workp-6*nwz,
                                  workp-5*nwz,workp-4*nwz,
                                  workp-3*nwz,workp-2*nwz,
                                  workp-nwz,
                                  workp+nwz,workp+2*nwz,
                                  workp+3*nwz,workp+4*nwz,
                                  workp+5*nwz,workp+6*nwz,
                                  workp+7*nwz,workp+8*nwz,
                                  &nz,&nwz,&ny,&nwy,fullfilter+1*(hfl+1));

/* 	      pconvsets8_(workp, */
/* 			  workp-8*nwz,workp-7*nwz, */
/* 			  workp-6*nwz,workp-5*nwz,workp-4*nwz, */
/* 			  workp-3*nwz,workp-2*nwz,workp-nwz, */
/* 			  workp+nwz,workp+2*nwz,workp+3*nwz, */
/* 			  workp+4*nwz,workp+5*nwz,workp+6*nwz, */
/* 			  workp+7*nwz,workp+8*nwz, */
/* 			  &nz,&nwz,&ny,&nwy, */
/* 			  reversefilter+1*(hfl+1),forwardfilter+1*(hfl+1)); */
                break; 
	    default:
                fprintf(stderr,
                        "\n\nhfl=%d not supported in aPconv3d (1)\n\n",hfl);
                exit(1);
	    }
      
      /* Finally, convolution along z, from work with accumulation
	 onto upper */
      /* (Note: When inlined in C, this routine is faster than fortran!) */
          if (sign==1)
              for (j=0; j<ny; j+=2) {
                  switch (hfl) {
                    case 3:
                        aPconvptsp3(upper+((i/2-1)*nuy+(j/2-1))*nuz-1,
                                    work+(i*nwy+j)*nwz,nz,nwz,
                                    fullfilter+2*(hfl+1));
/*                   	     aPconvptsp3(upper+((i/2-1)*nuy+(j/2-1))*nuz-1,  */
/*  		 			  work+(i*nwy+j)*nwz,  */
/*  		 			  nz,nwz, */
/*  					  reversefilter+2*(hfl+1), */
/*  					  forwardfilter+2*(hfl+1));  */
                        break;
                    case 8:
                        aPconvptsp8full(upper+((i/2-padxb/2)*nuy+(j/2-padyb/2))
                                        *nuz-padzb/2,
                                        work_extra+(i*nwy+j)*nwz,nz,nwz,
                                        fullfilter+2*(hfl+1) );
/* 		aPconvptsp8(upper+((i/2-padxb/2)*nuy+(j/2-padyb/2)) */
/* 			    *nuz-padzb/2, */
/* 			    work+(i*nwy+j)*nwz,nz,nwz, */
/* 			    reversefilter+2*(hfl+1), */
/* 			    forwardfilter+2*(hfl+1) ); */
                        break;
                    default:
                        fprintf(stderr,
                                "\n\nhfl=%d not supported in aPconv3d (1)\n\n",hfl);
                        exit(1);
                  }
                  
                  /* Impose real/coef space constraint on output
                     (could be done in pconvsetsi3 with 2.5% speedup, but
                     this way makes code more managable */
                  if (i>=padxb && i<padxb+nlx && j>=padyb && j<padyb+nly)
                      for (k=padzb; k<padzb+nlz; k+=2)
                          *(lower+((i-padxb)*nly+(j-padyb))*nlz+(k-padzb))=0.;
              }
          else if (sign == -1 )
              for (j=0; j<ny; j+=2) 
              {
                  switch (hfl)
                  {
                    case 3:
                        aPconvptsm3(upper+((i/2-1)*nuy+(j/2-1))*nuz-1,work
                                    +(i*nwy+j)*nwz,
                                    nz,nwz,fullfilter+2*(hfl+1));
                        break;
                    default:
                        fprintf(stderr,
                           "\n\nhfl=%d not supported in aPconv3d (1)\n\n",hfl);
                        exit(1);
                  }
	    
	    /* Impose real/coef space constraint on output
	       (could be done in pconvsetsi3 with 2.5% speedup, but
	       this way makes code more managable */
	    if (i>=padxb && i<padxb+nlx && j>=padyb && j<padyb+nly)
                for (k=padzb; k<padzb+nlz; k+=2)
                    *(lower+((i-padxb)*nly+(j-padyb))*nlz+(k-padzb))=
                        *(upper+((i/2-padxb/2)*nuy+(j/2-padyb/2))*nuz+(k/2-padzb/2));
              }
          
  }
}

/*
  Full two-level 3d convolution of conjugate transform type (IN PLACE)
     Computes I +/- P I ~P, where P is projection onto ghosts, ~P is the
     projection onto non-ghosts, and I is the simple interpolet
     convolusion, and OBEYS REAL/COEFF SPACE CONVENTIONS on output.
  upper: Points to location in upper data array corresponding to
         origin of lower array.
  lower: Points to origin of lower array
  sign: Sign of transform, + is Idag, - is Jdag
  reversefilter: reverse-time filter coefficients (2d array: [x,y,z][filter])
  forwardfilter: forward-time filter coefficients (2d array: [x,y,z][filter])
  nux, nuy, nuz: dimensions of upper array
  nlx, nly, nlz: dimensions of lower array
  hfl: length of half-filters
  work: workspace of size at least (nlx+3*hfl)*(nly+3*hfl)*(nlz+3*hfl)
  -----------------------
  currently not used 07/19/00 i.d.
*/
void 
aPconv3dtest(double upper[], double lower[], int sign,
	     double reversefilter[], double forwardfilter[], 
	     double fullfilter[],
	     int nux, int nuy, int nuz,
	     int nlx, int nly, int nlz,
	     int hfl,
	     double work[])
{
  int i,j,k;
  int nlyz=nly*nlz;
  int nx=nlx+2*hfl, ny=nly+2*hfl, nz=nlz+2*hfl; 
  int nwx=nlx+3*hfl, nwy=nly+3*hfl, nwz=nlz+3*hfl;
  int nwyz=nwy*nwz;

  int fptl=2,lptl=2+nlz-1;
  double *workp,*lowerp;
  int flag;

  int padxb,padyb,padzb;

  padxb=2*(hfl/2);
  padyb=2*(hfl/2);
  padzb=2*(hfl/2);

  fptl=padxb; lptl=padxb+nlz-1;

  /* Convolutions, in linear y chuncks, of yz planes along x, with
     transfer of data from lower to work */
/*   j=2; */
  for (j=0; j<ny; j++) {
      workp=work+j*nwz;
      lowerp=lower+(-padxb)*nlyz+(j-padyb)*nlz-padzb;
      
      if (j<padyb || j>=padyb+nly) {
	  /* fill in pads with zeros */
	  for (i=0; i<nx; i++)
              for (k=0; k<nz; k++)
                  *(workp+i*nwyz+k)=0.;
      }
      else {
	  /* Convolutions along z in small chuncks, with transfer of
	     data from lower into work */ 
	  flag=j%2;
	  switch (hfl) {
	    case 3:
/*  	      pconvsetsi3new_( */
/*  			      workp, */
/*  			      lowerp, */
/*  			      lowerp-3*nlyz,lowerp-2*nlyz,lowerp-nlyz, */
/*  			      lowerp+nlyz,lowerp+2*nlyz,lowerp+3*nlyz, */
/*  			      &nz,&nwyz,&nx,&nwx, */
/*  			      &nlyz,&fptl,&lptl, */
/*  			      fullfilter+0*(hfl+1),&flag,&sign); */
                pconvsetsi3old_(workp,
                                workp-3*nwyz,workp-2*nwyz,workp-nwyz,
                                workp+nwyz,workp+2*nwyz,workp+3*nwyz,
                                lowerp,
                                lowerp-3*nlyz,lowerp-2*nlyz,lowerp-nlyz,
                                lowerp+nlyz,lowerp+2*nlyz,lowerp+3*nlyz,
                                &nz,&nwyz,&nx,&nwx,
                                &nlyz,&fptl,&lptl,
                                reversefilter+0*(hfl+1),
                                forwardfilter+0*(hfl+1),&flag,&sign);
                break;
	    default:
                fprintf(stderr,
                        "\n\nhfl=%d not supported in aPconv3d (1)\n\n",hfl);
          }
      }
  } 
  
}

/*
  Back and forth, linear convolution for +/- 3 filters using padding
     (IN / OUT on non-ghosts)
  out: output array (lower level)
  up: input array of just ghosts (upper level)
  n: size of actual data set being convolved
  npad: size of padded region needed for "leakage" of foward filters
        over the edge
  reversefilter: reverse-time filter coefficients	
  filt: forward-time filter coefficients
*/
void 
convptsP3(double out[],double up[],int n,int npad,double filt[])
{
  register int i;
  
  register double t1=filt[1],t3=filt[3]; 
  
  /* Main reverse loop exploiting zeros */
  for (i=n-5; i>2; i-=2) {
      out[i]= t3*up[(i-3)/2]+t1*up[(i-1)/2]+t1*up[(i+1)/2]+t3*up[(i+3)/2];
      out[i-1]=up[(i-1)/2];
    }
}

/*
  Back and forth, linear convolution for +/- 8 filters using padding
  (IN / OUT on non-ghosts)
  out: output array (lower level)
  up: input array of just ghosts (upper level)
  n: size of actual data set being convolved
  npad: size of padded region needed for "leakage" of foward filters
  over the edge
  reversefilter: reverse-time filter coefficients	
  forwardfilter: forward-time filter coefficients
*/
void 
convptsP8(double out[],double up[],int n,int npad,
	  double reversefilter[],double forwardfilter[])
{
  register int i;

  /* Reverse part */
  {
    register double
      t0=reversefilter[0],
      t1=reversefilter[1],
      t2=reversefilter[2],
      t3=reversefilter[3],
      t4=reversefilter[4],
      t5=reversefilter[5],
      t6=reversefilter[6],
      t7=reversefilter[7],
      t8=reversefilter[8];

    /* Pad region */
/*     out[npad-1]=0.; */
/*     out[npad-2]=t8*up[(npad-2-8)/2]; */
/*     out[npad-3]=t7*up[(npad-3-7)/2]; */
/*     out[npad-4]=t6*up[(npad-4-6)/2]+t8*up[(npad-4-8)/2]; */
/*     out[npad-5]=t5*up[(npad-5-5)/2]+t7*up[(npad-5-7)/2]; */
/*     out[npad-6]=t4*up[(npad-6-4)/2]+t6*up[(npad-6-6)/2]+t8*up[(npad-6-8)/2]; */
/*     out[npad-7]=t3*up[(npad-7-3)/2]+t5*up[(npad-7-5)/2]+t7*up[(npad-7-7)/2]; */
/*     out[npad-8]=t2*up[(npad-8-2)/2]+t4*up[(npad-8-4)/2]+t6*up[(npad-8-6)/2]+t8*up[(npad-8-8)/2]; */

    /* Main reverse loop exploiting zeros */
    for (i=n-1; i>=8; i-=2)
      {
	out[i]=t1*up[(i-1)/2]+t3*up[(i-3)/2]+t5*up[(i-5)/2]+t7*up[(i-7)/2];
	out[i-1]=t0*up[(i-1)/2]+t2*up[(i-1-2)/2]
	  +t4*up[(i-1-4)/2]+t6*up[(i-1-6)/2]+t8*up[(i-1-8)/2];
      }
    
    /* Fence posting */
/*     out[2]=t0*in[2]         +t2*in[0]; */
/*     out[7]=t1*up[(7-1)/2]+t3*up[(7-3)/2]+t5*up[(7-5)/2]+t7*up[(7-7)/2]; */
/*     out[6]=t0*up[(6)/2]+t2*up[(6-2)/2]+t4*up[(6-4)/2]+t6*up[(6-6)/2]; */
/*     out[5]=t1*up[(5-1)/2]+t3*up[(5-3)/2]+t5*up[(5-5)/2]; */
/*     out[4]=t0*up[(4)/2]+t2*up[(4-2)/2]+t4*up[(4-4)/2]; */
/*     out[3]=t1*up[(3-1)/2]+t3*up[(3-3)/2]; */
/*     out[2]=t0*up[(2)/2]+t2*up[(2-2)/2]; */
/*     out[1]=t1*up[(1-1)/2]; */
/*     out[0]=t0*up[(0)/2]; */
  }

  /* Forward part (padding now ensures room!) */
  {
    register double
      t0=forwardfilter[0],
      t1=forwardfilter[1],
      t2=forwardfilter[2],
      t3=forwardfilter[3],
      t4=forwardfilter[4],
      t5=forwardfilter[5],
      t6=forwardfilter[6],
      t7=forwardfilter[7],
      t8=forwardfilter[8];
    for (i=8; i<n-8; i++)
      out[i]=
	t0*out[i]
	+t1*out[i+1]+t2*out[i+2]+t3*out[i+3]+t4*out[i+4]
	+t5*out[i+5]+t6*out[i+6]+t7*out[i+7]+t8*out[i+8];
  }
}

void 
convptsP8full(double out[],double up[],int n,int npad,double fullfilt[])
{
  register int i;
  
  register double t0=fullfilt[0],t1=fullfilt[1],t2=fullfilt[2],
    t3=fullfilt[3],t4=fullfilt[4],t5=fullfilt[5],
    t6=fullfilt[6],t7=fullfilt[7],t8=fullfilt[8]; 
  
  if(t0 != 0.0)
    { /* symmetric filter */
      /* Main reverse loop exploiting zeros */
      for (i=n-9; i>8; i-=2)
	{
	  out[i]= t7*up[(i-7)/2]+t5*up[(i-5)/2]+t3*up[(i-3)/2]
	    +t1*up[(i-1)/2]+t1*up[(i+1)/2]+t3*up[(i+3)/2]
	    +t5*up[(i+5)/2]+t7*up[(i+7)/2];
	  out[i-1]= t8*up[(i-8)/2]+t6*up[(i-6)/2]+t4*up[(i-4)/2]
	    +t2*up[(i-2)/2]+t0*up[i/2]+t2*up[(i+2)/2]+t4*up[(i+4)/2]
	    +t6*up[(i+6)/2]	+t8*up[(i+8)/2];
	}
    }
  else
    { /* antisymmetric filter */
      /* Main reverse loop exploiting zeros */
      for (i=n-9; i>8; i-=2)
	{
	  out[i]= -1.*t7*up[(i-7)/2]-t5*up[(i-5)/2]-t3*up[(i-3)/2]
	    -t1*up[(i-1)/2]+t1*up[(i+1)/2]+t3*up[(i+3)/2]
	    +t5*up[(i+5)/2]+t7*up[(i+7)/2];
	  out[i-1]= -1.*t8*up[(i-8)/2]-t6*up[(i-6)/2]-t4*up[(i-4)/2]
	    -t2*up[(i-2)/2]+t0*up[i/2]+t2*up[(i+2)/2]+t4*up[(i+4)/2]
	    +t6*up[(i+6)/2]	+t8*up[(i+8)/2];
	}
    }
}


/* I'm not going to change this routine to use anti/symmetric filters, since
   it's optimized for L anyway (which is a symmetric filter). -- mhe 4/12/02 */
void 
convptsP8fullL(double out[],double up[],int n,int npad,double fullfilt[])
{
  register int i;
  
  register double t0=fullfilt[0],t1=fullfilt[1],t2=fullfilt[2],
    t3=fullfilt[3],t4=fullfilt[4],t6=fullfilt[6]; 
  
  /* Main reverse loop exploiting zeros */
  for (i=n-9; i>8; i-=2)
    {
      out[i]= t3*up[(i-3)/2]
	+t1*up[(i-1)/2]+t1*up[(i+1)/2]+t3*up[(i+3)/2];
      out[i-1]= t6*up[(i-6)/2]+t4*up[(i-4)/2]
	+t2*up[(i-2)/2]+t0*up[i/2]+t2*up[(i+2)/2]+t4*up[(i+4)/2]
	+t6*up[(i+6)/2];
    }
}


/*
  Full two-level 3d convolution of transform type (IN PLACE)
     Computes I +/- ~P I P, where P is projection onto ghosts, ~P is the
     projection onto non-ghosts, and I is the simple interpolet
     convolusion, and OBEYS REAL/COEFF SPACE CONVENTIONS on output,
     PROVIDED they are obeyed on input.
  upper: Points to location in upper data array corresponding to
         origin of lower array.
  lower: Points to origin of lower array
  sign: Sign of transform, + is Idag, - is Jdag 
  (should be I and J, non??(Torkel))
  reversefilter: reverse-time filter coefficients (2d array: [x,y,z][filter])
  forwardfilter: forward-time filter coefficients (2d array: [x,y,z][filter])
  fullfilter: full filter
  nux, nuy, nuz: dimensions of upper array
  nlx, nly, nlz: dimensions of lower array
  hfl: length of half-filters
  work: workspace of size at least (nlx+3*hfl)*(nly+3*hfl)*(nlz+3*hfl)
*/
void 
aconvP3d(double upper[], double lower[], int sign, 
	 double reversefilter[], double forwardfilter[],
	 double fullfilter[],
	 int nux, int nuy, int nuz,
	 int nlx, int nly, int nlz, 
	 int hfl, 
	 double work[],double work_extra[])
{
  int i,j;
  /* Constants giving region of workspace which must be computed */
  int nx=nlx+2*hfl;
  int ny=nly+2*hfl;
  int nz=nlz+2*hfl;

  /* Constants giving total size of workspace, including padding for
     factored convolultion strategy */
  int nwx=nx+hfl;
  int nwy=ny+hfl;
  int nwz=nz+hfl;
  
  int nwyz=nwy*nwz;
  
  /* Constants describing filling of results into lower region */
  int 
    nlyz=nly*nlz,
    fptl= 2*(hfl/2), lptl= 2*(hfl/2)+nlz-1, 
    fsetl=2*(hfl/2), lsetl=2*(hfl/2)+nlx-1;

  double *workp,*upperp,*lowerp;
  double *work_extrap;

  /* INDEXING NOTES:

     (1) Our basic indexing refers to the work array, which has a
	 buffer region that circumscribes the lower level.  The
	 beginning of the lower level is thus at
	 (2(hfl/2),2(hfl/2),2(hfl/2)) [in INTEGER arthimetic in the
	 indexing below.

     (2) "upper" is defined to reference the point in the upper array
	 that corresponds to the origin of the lower array.  There are
	 two things needed to translate properly: a homogeneous scale
	 factor of two AND to adjust for the fact that the origin of
	 the work array refers to the point
	 (-(hfl/2),-(hfl/2),-(hfl/2)) from the origin of the lower
	 array in the upper array.  The scale factor is handled
	 explicitly in the code below.  To adjust for the issue of the
	 origin, we now simply include this shift into the pointer.

     (3) "lower" references the origin of the "lower" array where the
         output is supposed to go.  We now shift it so that
	 convsetspo_ can use the referencing of the work array.

	 */


  upperp=upper-((1*nuy+1)*nuz+1)*(hfl/2);
  lowerp=lower-((2*nly+2)*nlz+2)*(hfl/2);
  /* Convolution of each yz plane */
  for (i=0; i<nx; i+=2) {
      workp=work+i*nwyz;
      work_extrap=work_extra+i*nwyz;
      
      /* Along z first, pulling data from upper into the work array */
      switch (hfl) {
	case 3:
            for (j=0; j<ny; j+=2)   
                convptsP3(work+(i*nwy+j)*nwz, upperp+(i*nuy+j)*nuz/2,
                          nz, nwz, fullfilter+2*(hfl+1));
	  break;
	case 8:
	  for (j=0; j<ny; j+=2)
	    convptsP8full(work+(i*nwy+j)*nwz, upperp+(i*nuy+j)*nuz/2,
			  nz, nwz,fullfilter+2*(hfl+1) );
/* 	    convptsP8(work+(i*nwy+j)*nwz,upperp+(i*nuy+j)*nuz/2, */
/* 		      nz,nwz,reversefilter+2*(hfl+1), */
/* 		      forwardfilter+2*(hfl+1) ); */
	  break;
	default:
	  fprintf(stderr,"\n\nhfl=%d not supported in aconvP3d (1)\n\n",hfl);
	  exit(1);
	}
      
      /* Now do plane along y */
      /* (This seems too big to inline, and fortran runs a lot faster!) */
      switch (hfl) {
	case 3:
	  convsetsp3_(work_extrap,
		      workp,
		      workp-3*nwz,workp-2*nwz,workp-nwz,
		      workp+nwz,workp+2*nwz,workp+3*nwz,
		      &nz,&nwz,&ny,&nwy,
		      reversefilter+1*(hfl+1),fullfilter+1*(hfl+1) );
/* 	  convsetsp3_(workp, */
/* 		     workp-3*nwz,workp-2*nwz,workp-nwz, */
/* 		     workp+nwz,workp+2*nwz,workp+3*nwz, */
/* 		     &nz,&nwz,&ny,&nwy,reversefilter+hfl+1,forwardfilter+hfl+1); */
	  break;
	case 8:
	  convsetsp8full_(work_extrap, workp,
			  workp-8*nwz,workp-7*nwz,workp-6*nwz,workp-5*nwz,
			  workp-4*nwz,workp-3*nwz,workp-2*nwz,workp-nwz,
			  workp+nwz,workp+2*nwz,workp+3*nwz,workp+4*nwz,
			  workp+5*nwz,workp+6*nwz,workp+7*nwz,workp+8*nwz,
			  &nz,&nwz,&ny,&nwy,
			  fullfilter+1*(hfl+1) );
/* 	  convsetsp8_(workp, */
/* 		      workp-8*nwz,workp-7*nwz,workp-6*nwz,workp-5*nwz, */
/* 		      workp-4*nwz,workp-3*nwz,workp-2*nwz,workp-nwz, */
/* 		      workp+nwz,workp+2*nwz,workp+3*nwz,workp+4*nwz, */
/* 		      workp+5*nwz,workp+6*nwz,workp+7*nwz,workp+8*nwz, */
/* 		      &nz,&nwz,&ny,&nwy, */
/* 		      reversefilter+1*(hfl+1),forwardfilter+1*(hfl+1) ); */
	  break;
	default:
	  fprintf(stderr,"\n\nhfl=%d not supported in aconvP3d (2)\n\n",hfl);
	  exit(1);
	}
    }

  /* Convolutions along z in small chunks, with accumulation from work
   onto lower */

  for (j=2*(hfl/2); j<2*(hfl/2)+nly; j++) {
      workp=work+j*nwz;
      work_extrap=work_extra+j*nwz;
      if (sign==1){
	  switch (hfl){
	    case 3:
	      convsetspop3_(lowerp+j*nlz,work_extrap,
			    work_extrap-3*nwyz,work_extrap-2*nwyz,
			    work_extrap-nwyz,
			    work_extrap+nwyz,work_extrap+2*nwyz,
			    work_extrap+3*nwyz,
			    &nz,&nwyz,&nx,&nwx,
			    &nlyz,&fptl,&lptl,&fsetl,&lsetl,
			    fullfilter+0*(hfl+1) );
	      break;
	    case 8:
/* 	      printf("Calling convsetspop8full\n\n"); */
	      convsetspop8full_(lowerp+j*nlz,work_extrap,
				work_extrap-8*nwyz,work_extrap-7*nwyz,
				work_extrap-6*nwyz,work_extrap-5*nwyz, 
				work_extrap-4*nwyz,work_extrap-3*nwyz,
				work_extrap-2*nwyz,work_extrap-nwyz,
				work_extrap+nwyz,work_extrap+2*nwyz,
				work_extrap+3*nwyz,work_extrap+4*nwyz,
				work_extrap+5*nwyz,work_extrap+6*nwyz,
				work_extrap+7*nwyz,work_extrap+8*nwyz,
				&nz,&nwyz,&nx,&nwx,
				&nlyz,&fptl,&lptl,&fsetl,&lsetl,
				fullfilter+0*(hfl+1));
/* 	      convsetspop8_(lowerp+j*nlz,workp, */
/* 			   workp-8*nwyz,workp-7*nwyz,workp-6*nwyz,workp-5*nwyz, */
/* 			   workp-4*nwyz,workp-3*nwyz,workp-2*nwyz,workp-nwyz, */
/* 			   workp+nwyz,workp+2*nwyz,workp+3*nwyz,workp+4*nwyz, */
/* 			   workp+5*nwyz,workp+6*nwyz,workp+7*nwyz,workp+8*nwyz, */
/* 			   &nz,&nwyz,&nx,&nwx, */
/* 			   &nlyz,&fptl,&lptl,&fsetl,&lsetl, */
/* 			   reversefilter+0*(hfl+1),forwardfilter+0*(hfl+1)); */
	      break;
	    default:
	      fprintf(stderr,
		      "\n\nhfl=%d not supported in aconvP3d (3a)\n\n",hfl);
	      exit(1);
	    }
	}
      /*New possibility, included by Torkel 5/10 */
      if (sign==0) {
	  switch (hfl) {
	    case 3:
	      convsetspo3_(lowerp+j*nlz,work_extrap,
			   work_extrap-3*nwyz,work_extrap-2*nwyz,
			   work_extrap-nwyz,
			   work_extrap+nwyz,work_extrap+2*nwyz,
			   work_extrap+3*nwyz,
			   &nz,&nwyz,&nx,&nwx,
			   &nlyz,&fptl,&lptl,&fsetl,&lsetl,
			   reversefilter+0*(hfl+1),forwardfilter+0*(hfl+1));
	      break;
	    case 8:
	      convsetspo8full_(lowerp+j*nlz,work_extrap,
			       work_extrap-8*nwyz,work_extrap-7*nwyz,
			       work_extrap-6*nwyz,work_extrap-5*nwyz,
			       work_extrap-4*nwyz,work_extrap-3*nwyz,
			       work_extrap-2*nwyz,work_extrap-nwyz,
			       work_extrap+nwyz,work_extrap+2*nwyz,
			       work_extrap+3*nwyz,work_extrap+4*nwyz,
			       work_extrap+5*nwyz,work_extrap+6*nwyz,
			       work_extrap+7*nwyz,work_extrap+8*nwyz,
			       &nz,&nwyz,&nx,&nwx,
			       &nlyz,&fptl,&lptl,&fsetl,&lsetl,
			       fullfilter+0*(hfl+1));
/* 	      convsetspo8_(lowerp+j*nlz,workp, */
/* 			  workp-8*nwyz,workp-7*nwyz, */
/* 			  workp-6*nwyz,workp-5*nwyz,workp-4*nwyz, */
/* 			  workp-3*nwyz,workp-2*nwyz,workp-nwyz, */
/* 			  workp+nwyz,workp+2*nwyz,workp+3*nwyz, */
/* 			  workp+4*nwyz,workp+5*nwyz,workp+6*nwyz, */
/* 			  workp+7*nwyz,workp+8*nwyz, */
/* 			  &nz,&nwyz,&nx,&nwx, */
/* 			  &nlyz,&fptl,&lptl,&fsetl,&lsetl, */
/* 			  reversefilter+0*(hfl+1),forwardfilter+0*(hfl+1)); */
	      break;
	    default:
	      fprintf(stderr,
		      "\n\nhfl=%d not supported in aconvP3d (3a)\n\n",hfl);
	      exit(1);
	    }
	}
      if (sign==-1)
	{
	  switch (hfl)
	    {
	    case 3:
	      convsetspom3_(lowerp+j*nlz,work_extrap,
			    work_extrap-3*nwyz,work_extrap-2*nwyz,
			    work_extrap-nwyz,
			    work_extrap+nwyz,work_extrap+2*nwyz,
			    work_extrap+3*nwyz,
			    &nz,&nwyz,&nx,&nwx,
			    &nlyz,&fptl,&lptl,&fsetl,&lsetl,
			    fullfilter+0*(hfl+1));
	      break;
	    default:
	      fprintf(stderr,
		      "\n\nhfl=%d not supported in aconvP3d (3b)\n\n",hfl);
	      exit(1);
	    }
	}
    }
}


/*
  Full two-level 3d convolution of transform type (IN PLACE)
     Computes I +/- ~P I P, where P is projection onto ghosts, ~P is the
     projection onto non-ghosts, and I is the simple interpolet
     convolusion, and OBEYS REAL/COEFF SPACE CONVENTIONS on output,
     PROVIDED they are obeyed on input.
  upper: Points to location in upper data array corresponding to
         origin of lower array.
  lower: Points to origin of lower array
  addon: adds this array to the lower array
  sign: Sign of transform, + is I, - is J 
  revfilt: reverse-time filter coefficients (2d array: [x,y,z][filter])
  forwfilt: forward-time filter coefficients (2d array: [x,y,z][filter])
  nux, nuy, nuz: dimensions of upper array
  nlx, nly, nlz: dimensions of lower array
  hfl: length of half-filters
  work: workspace of size at least (nlx+3*hfl)*(nly+3*hfl)*(nlz+3*hfl)
*/
void 
aconvP3dadd(double upper[], double lower[], double addon[],
	    double revfilt[], double forwfilt[], double fullfilt[],
	    int nux, int nuy, int nuz,
	    int nlx, int nly, int nlz,int hfl, 
	    double work[],double work_extra[])
{
  int i,j;
  /* Constants giving region of workspace which must be computed */
  int nx=nlx+2*hfl;
  int ny=nly+2*hfl;
  int nz=nlz+2*hfl;

  /* Constants giving total size of workspace, including padding for
     factored convolultion strategy */
  int nwx=nx+hfl;
  int nwy=ny+hfl;
  int nwz=nz+hfl;

  int nwyz=nwy*nwz;


  /* Constants describing filling of results into lower region */
  int 
    nlyz=nly*nlz,
    fptl= 2*(hfl/2), lptl= 2*(hfl/2)+nlz-1, 
    fsetl=2*(hfl/2), lsetl=2*(hfl/2)+nlx-1;

  double *workp,*upperp,*lowerp,*addonp;

  /*  seems unused 07/10/00 i.d.
  int padxb,padyb,padzb;
  
  padxb=2*(hfl/2);
  padyb=2*(hfl/2);
  padzb=2*(hfl/2);
  */
  
  /* INDEXING NOTES:

     (1) Our basic indexing refers to the work array, which has a
	 buffer region that circumscribes the lower level.  The
	 beginning of the lower level is thus at
	 (2(hfl/2),2(hfl/2),2(hfl/2)) [in INTEGER arthimetic in the
	 indexing below.

     (2) "upper" is defined to reference the point in the upper array
	 that corresponds to the origin of the lower array.  There are
	 two things needed to translate properly: a homogeneous scale
	 factor of two AND to adjust for the fact that the origin of
	 the work array refers to the point
	 (-(hfl/2),-(hfl/2),-(hfl/2)) from the origin of the lower
	 array in the upper array.  The scale factor is handled
	 explicitly in the code below.  To adjust for the issue of the
	 origin, we now simply include this shift into the pointer.

     (3) "lower" references the origin of the "lower" array where the
         output is supposed to go.  We now shift it so that
	 convsetspo_ can use the referencing of the work array.

  */


  upperp=upper-((1*nuy+1)*nuz+1)*(hfl/2);
  lowerp=lower-((2*nly+2)*nlz+2)*(hfl/2);
  addonp=addon-((2*nly+2)*nlz+2)*(hfl/2);
  /* Convolution of each yz plane */
  for (i=0; i<nx; i+=2){
      workp=work+i*nwyz;
      
      /* Along z first, pulling data from upper into the work array */
      switch (hfl){
	case 3:
            for (j=0; j<ny; j+=2)   
                convptsP3(work+(i*nwy+j)*nwz,upperp+(i*nuy+j)*nuz/2,
                          nz,nwz,fullfilt+2*(hfl+1) );
            break;
	default:
            fprintf(stderr,"\n\nhfl=%d not supported in aconvP3d (1)\n\n",hfl);
            exit(1);
      }
      
      /* Now do plane along y */
      /* (This seems too big to inline, and fortran runs a lot faster!) */
      switch (hfl){
	case 3:
            convsetsp3_(workp,
                        workp,
                        workp-3*nwz,workp-2*nwz,workp-nwz,
                        workp+nwz,workp+2*nwz,workp+3*nwz,
                        &nz,&nwz,&ny,&nwy,revfilt+hfl+1,fullfilt+hfl+1);
/* 	  convsetsp3_(work_extrap, */
/* 		      workp, */
/* 		      workp-3*nwz,workp-2*nwz,workp-nwz, */
/* 		      workp+nwz,workp+2*nwz,workp+3*nwz, */
/* 		      &nz,&nwz,&ny,&nwy,tr+hfl+1,fullfilter+hfl+1); */
            break;
	default:
            fprintf(stderr,"\n\nhfl=%d not supported in aconvP3d (2)\n\n",hfl);
            exit(1);
      }
    }

  /* Convolutions along z in small chunks, with accumulation from work
   onto lower */
  for (j=2*(hfl/2); j<2*(hfl/2)+nly; j++){
      workp=work+j*nwz;
      switch (hfl){
        case 3:
            convsetspoadd3_(lowerp+j*nlz,addonp+j*nlz,
                            workp,workp-3*nwyz,
                            workp-2*nwyz,workp-nwyz,
                            workp+nwyz,workp+2*nwyz,workp+3*nwyz,
                            &nz,&nwyz,&nx,&nwx,
                            &nlyz,&fptl,&lptl,&fsetl,&lsetl,
                            revfilt+0*(hfl+1),forwfilt+0*(hfl+1));
            break;
        default:
            fprintf(stderr,
                    "\n\nhfl=%d not supported in aconvP3d (3a)\n\n",hfl);
            exit(1);
      }
  }
}