/*
DFT++ is a density functional package developed by the research group
of Professor Tomas Arias
Copyright 1996-2003 Sohrab Ismail-Beigi
This file is part of DFT++.
DFT++ is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
DFT++ is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with DFT++; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Please see the file CREDITS for a list of authors.
For academic users, we request that publications using results obtained with
this software reference
"New algebraic formulation of density functional calculation," by Sohrab Ismail-Beigi
and T.A. Arias, Computer Physics Communications 128:1-2, 1-45 (June 2000).
and, if using the wavelet basis, further reference
"Multiresolution analysis of electronic structure: semicardinal and wavelet bases,"
T.A. Arias, Reviews of Modern Physics 71:1, 267-311 (January 1999).
and
"Robust ab initio calculation of condensed matter: transparent convergence through
semicardinal multiresolution analysis,'' I.P. Daykov, T.A. Arias, and
Torkel D. Engeness, Physical Review Letters, 90:21, 216402 (May 2003).
For your convenience, preprints of the above articles may be obtained from
http://arXiv.org/abs/cond-mat/9909130, 9805262, and 0204411, respectively.
*/
/* $Id: conv3d.cpp,v 1.1.2.8 2003/05/29 18:54:15 ivan Exp $ */
/*
* $Log: conv3d.cpp,v $
* Revision 1.1.2.8 2003/05/29 18:54:15 ivan
* Update the PRL reference
*
* Revision 1.1.2.7 2003/05/23 01:49:00 sahak
* added the new header
*
* Revision 1.1.2.6 2003/04/03 01:50:48 ivan
* minor cosmetic fixes to compile on balboa (Alpha) and fiber (SGI)
* with the native compilers
*
* Revision 1.1.2.5 2003/04/02 23:35:13 ivan
* Removing C-dependencies from the code. Now it needs only C++ compiler
*
* Revision 1.1.2.4 2002/10/28 22:07:29 matthew
* Non-orthogonal Laplaciancvs add coeff_d.c! Works for orthorhombic cells, still in need of debugging for other unit cells.
*
* Revision 1.2 2001/08/21 09:05:35 daykov
* Overlapping Grids: Idag working
* (still can do speed optimization and currently the deepest certical segment
* in the grid tree must be the first in order to visit everybody in the
* transforms - actually Idag)
*
* Revision 1.1.1.1 2001/04/18 17:15:13 daykov
* starting project
*
* Revision 1.1 2000/07/11 01:05:04 daykov
* Initial revision
*
* Revision 1.1 1999/09/14 19:39:52 torkel
* Initial revision
*
* Revision 1.14 1999/08/06 19:25:03 torkel
* *** empty log message ***
*
* Revision 1.13 1999/07/04 20:48:31 torkel
* Full O implemented.
*
* Revision 1.12 1999/07/02 07:03:54 torkel
* the convolution in Osame seems to work for this version.
*
* Revision 1.11 1999/07/02 03:12:07 torkel
* In the middle of making Osame work
*
* Revision 1.10 1999/06/23 22:19:26 torkel
* revised version where Obelow is dagger of Oabove for grid without
* siblings.
*
* Revision 1.9 1999/06/15 23:24:24 torkel
* debugging, trying to make Obelow work
*
* Revision 1.8 1999/06/11 12:52:48 torkel
* including convolutions for obelow
*
* Revision 1.5 1999/04/25 22:32:56 muchomas
* *** empty log message ***
*
* Revision 1.4 1999/04/25 18:35:39 muchomas
* Contains new routines for same-level operator convolutions
* Working toward more arbitrary length convolution routines
*
* Revision 1.3 1999/04/20 15:59:37 muchomas
* Fully documented code
*
* Revision 1.2 1999/04/19 18:58:39 muchomas
* Consistent set of calling conventions for aconv3d routines
*
* Revision 1.1 1999/04/19 18:12:11 muchomas
* Initial revision
*
* Revision 1.13 1999/04/17 22:00:09 muchomas
* Working dagger operators, still slow...
*
* Revision 1.12 1999/04/14 19:48:37 muchomas
* Working transpose transform, but doesn't take input/output arrays -- yet!
*
* Revision 1.11 1999/04/13 15:11:36 muchomas
* Does ghosts, making compact version, but still in larger array
*
* Revision 1.10 1999/04/13 14:32:53 muchomas
* Idag that computes only only ghosts, leaving sparse data distribution
*
* Revision 1.9 1999/04/11 20:21:47 muchomas
* Repackaged calling sequence for upper/lower forward/inverse transform package
*
* Revision 1.8 1999/04/11 20:03:03 muchomas
* Workspace dimensions now determined by aconvP3d
*
* Revision 1.7 1999/04/11 19:53:46 muchomas
* Forward/Inverse package working
*
* Revision 1.6 1999/04/11 03:06:31 muchomas
* Forward transform with proper indexing into "upper" array
*
* Revision 1.5 1999/04/10 23:01:44 muchomas
* Thinks of points and sets of points, rather than lines and planes
*
* Revision 1.4 1999/04/10 22:38:57 muchomas
* Now using log functionality
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "convsets.h"
/* High speed, 3d convolution routines */
/*
Back and forth, linear convolution for +/- 3 filters using padding (IN PLACE)
out: input/output array for in place computation
n: size of actual data set being convolved
npad: size of padded region needed for "leakage" of foward filters
over the edge
reversefilter: reverse-time filter coefficients
forwardfilter: forward-time filter coefficients
*/
void
convpts3(double inout[],int n,int npad,double reversefilter[],
double forwardfilter[])
{
register int i;
/* First, make sure pad is zero */
for (i=n; i<npad; i++) inout[i]=0.;
/* Reverse part */
{
register double t0=reversefilter[0],t1=reversefilter[1],
t2=reversefilter[2],t3=reversefilter[3];
for (i=npad-1; i>=3; i--)
inout[i]=t0*inout[i]+t1*inout[i-1]+t2*inout[i-2]+t3*inout[i-3];
/* Careful with back end of data! */
inout[2]=t0*inout[2]+t1*inout[1]+t2*inout[0];
inout[1]=t0*inout[1]+t1*inout[0];
inout[0]=t0*inout[0];
}
/* Forward part (padding now ensures room!) */
{
register double t0=forwardfilter[0],t1=forwardfilter[1],
t2=forwardfilter[2],t3=forwardfilter[3];
for (i=0; i<n; i++)
inout[i]=t0*inout[i]+t1*inout[i+1]+t2*inout[i+2]+t3*inout[i+3];
}
}
/*
Back and forth, linear convolution for +/- 5 filters using padding (IN PLACE)
out: input/output array for in place computation
n: size of actual data set being convolved
npad: size of padded region needed for "leakage" of foward filters
over the edge
reversefilter: reverse-time filter coefficients
forwardfilter: forward-time filter coefficients
*/
void
convpts5(double inout[],int n,int npad,
double reversefilter[],double forwardfilter[])
{
register int i;
/* First, make sure pad is zero */
for (i=n; i<npad; i++) inout[i]=0.;
/* Reverse part */
{
register double t0=reversefilter[0], t1=reversefilter[1],
t2=reversefilter[2], t3=reversefilter[3],
t4=reversefilter[4], t5=reversefilter[5];
for (i=npad-1; i>=5; i--)
inout[i]=t0*inout[i]+t1*inout[i-1]+t2*inout[i-2]+t3*inout[i-3]+
t4*inout[i-4]+t5*inout[i-5];
/* Careful with back end of data! */
inout[4]=t0*inout[4]+t1*inout[3]+t2*inout[2]+t3*inout[1]+t4*inout[0];
inout[3]=t0*inout[3]+t1*inout[2]+t2*inout[1]+t3*inout[0];
inout[2]=t0*inout[2]+t1*inout[1]+t2*inout[0];
inout[1]=t0*inout[1]+t1*inout[0];
inout[0]=t0*inout[0];
}
/* Forward part (padding now ensures room!) */
{
register double t0=forwardfilter[0], t1=forwardfilter[1],
t2=forwardfilter[2], t3=forwardfilter[3],
t4=forwardfilter[4], t5=forwardfilter[5];
for (i=0; i<n; i++)
inout[i]=t0*inout[i]+t1*inout[i+1]+t2*inout[i+2]+t3*inout[i+3]+
t4*inout[i+4]+t5*inout[i+5];
}
}
/*
Back and forth, linear convolution for +/- 5 filters using padding (IN PLACE)
out: input/output array for in place computation
n: size of actual data set being convolved
npad: size of padded region needed for "leakage" of foward filters
over the edge
reversefilter: reverse-time filter coefficients
forwardfilter: forward-time filter coefficients
*/
void
convptsioper5(double out[],double in[],int n,int npad,
double reversefilter[],double forwardfilter[])
{
register int i;
/* Reverse part */
{
register double t0=reversefilter[0], t1=reversefilter[1],
t2=reversefilter[2], t3=reversefilter[3],
t4=reversefilter[4], t5=reversefilter[5];
out[npad-1]=t0*in[4]+t1*in[3]+t2*in[2]+t3*in[1]+t4*in[0]+t5*in[n-1];
out[npad-2]=t0*in[3]+t1*in[2]+t2*in[1]+t3*in[0]
+t4*in[n-1]+t5*in[n-2];
out[npad-3]=t0*in[2]+t1*in[1]+t2*in[0]+t3*in[n-1]
+t4*in[n-2]+t5*in[n-3];
out[npad-4]=t0*in[1]+t1*in[0]
+t2*in[n-1]+t3*in[n-2]+t4*in[n-3]+t5*in[n-4];
out[npad-5]=t0*in[0]+t1*in[n-1]+t2*in[n-2]
+t3*in[n-3]+t4*in[n-4]+t5*in[n-5];
for (i=npad-6; i>=5; i--)
out[i]=t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+
t4*in[i-4]+t5*in[i-5];
out[4]=out[npad-1];
out[3]=out[npad-2];
out[2]=out[npad-3];
out[1]=out[npad-4];
out[0]=out[npad-5];
}
/* Forward part (padding now ensures room!) */
{
register double t0=forwardfilter[0], t1=forwardfilter[1],
t2=forwardfilter[2], t3=forwardfilter[3],
t4=forwardfilter[4], t5=forwardfilter[5];
for (i=0; i<n; i++)
out[i]=t0*out[i]+t1*out[i+1]+t2*out[i+2]+t3*out[i+3]+
t4*out[i+4]+t5*out[i+5];
}
}
/*
Back and forth, linear convolution for +/- 5 filters using padding (IN PLACE)
out: input/output array for in place computation
n: size of actual data set being convolved
npad: size of padded region needed for "leakage" of foward filters
over the edge
reversefilter: reverse-time filter coefficients
forwardfilter: forward-time filter coefficients
*/
void
convptsio5(double out[],double in[],int n,int npad,
double reversefilter[],double forwardfilter[])
{
register int i;
/* First, make sure pad is zero */
for (i=n; i<npad; i++) out[i]=0.;
/* Reverse part */
{
register double t0=reversefilter[0], t1=reversefilter[1],
t2=reversefilter[2], t3=reversefilter[3],
t4=reversefilter[4], t5=reversefilter[5];
out[npad-1]=t5*in[npad-6];
out[npad-2]=t4*in[npad-6]+t5*in[npad-7];
out[npad-3]=t3*in[npad-6]+t4*in[npad-7]+t5*in[npad-8];
out[npad-4]=t2*in[npad-6]+t3*in[npad-7]+t4*in[npad-8]+t5*in[npad-9];
out[npad-5]=t1*in[npad-6]+t2*in[npad-7]
+t3*in[npad-8]+t4*in[npad-9]+t5*in[npad-10];
for (i=npad-6; i>=5; i--)
out[i]=t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+
t4*in[i-4]+t5*in[i-5];
out[4]=t0*in[4]+t1*in[3]+t2*in[2]+t3*in[1]+t4*in[0];
out[3]=t0*in[3]+t1*in[2]+t2*in[1]+t3*in[0];
out[2]=t0*in[2]+t1*in[1]+t2*in[0];
out[1]=t0*in[1]+t1*in[0];
out[0]=t0*in[0];
}
/* Forward part (padding now ensures room!) */
{
register double t0=forwardfilter[0], t1=forwardfilter[1],
t2=forwardfilter[2], t3=forwardfilter[3],
t4=forwardfilter[4], t5=forwardfilter[5];
for (i=0; i<n; i++)
out[i]=t0*out[i]+t1*out[i+1]+t2*out[i+2]+t3*out[i+3]+
t4*out[i+4]+t5*out[i+5];
}
}
/*
Back and forth, linear convolution for +/- 5 filters using padding (IN PLACE)
out: input/output array for in place computation
n: size of actual data set being convolved
npad: size of padded region needed for "leakage" of foward filters
over the edge
reversefilter: reverse-time filter coefficients
forwardfilter: forward-time filter coefficients
*/
void
convptsio5full(double out[],double in[],int n,int npad,
double fullfilter[])
{
register int i;
register double t0=fullfilter[0], t1=fullfilter[1],
t2=fullfilter[2], t3=fullfilter[3],
t4=fullfilter[4], t5=fullfilter[5];
if(t0 != 0.0)
{ /* symmetric filter */
/*
i=0;
out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4]+t5*in[i-5];
---------------------------------------------------
i=0 case is doubled?? - probably left from cut and paste edditing
to correct the edges (avoiding pad)
seems that we are not using this function anymore, though
07/19/00 i.d.
*/
i=0;
out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i];
i=1;
out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]+t1*in[i-1];
i=2;
out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]+t1*in[i-1]+t2*in[i-2];
i=3;
out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3];
i=4;
out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4];
for (i=5; i<n-5; i++)
out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4]+t5*in[i-5];
i=n-5;
out[i]=t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4]+t5*in[i-5];
i=n-4;
out[i]=t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4]+t5*in[i-5];
i=n-3;
out[i]=t2*in[i+2]+t1*in[i+1]+
t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4]+t5*in[i-5];
i=n-2;
out[i]=t1*in[i+1]+
t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4]+t5*in[i-5];
i=n-1;
out[i]=t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+t4*in[i-4]+t5*in[i-5];
}
else
{ /* antisymmetric filter */
i=0;
out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i];
i=1;
out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]-t1*in[i-1];
i=2;
out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]-t1*in[i-1]-t2*in[i-2];
i=3;
out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3];
i=4;
out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3]-t4*in[i-4];
for (i=5; i<n-5; i++)
out[i]=t5*in[i+5]+t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3]-t4*in[i-4]-t5*in[i-5];
i=n-5;
out[i]=t4*in[i+4]+t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3]-t4*in[i-4]-t5*in[i-5];
i=n-4;
out[i]=t3*in[i+3]+t2*in[i+2]+t1*in[i+1]+
t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3]-t4*in[i-4]-t5*in[i-5];
i=n-3;
out[i]=t2*in[i+2]+t1*in[i+1]+
t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3]-t4*in[i-4]-t5*in[i-5];
i=n-2;
out[i]=t1*in[i+1]+
t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3]-t4*in[i-4]-t5*in[i-5];
i=n-1;
out[i]=t0*in[i]-t1*in[i-1]-t2*in[i-2]-t3*in[i-3]-t4*in[i-4]-t5*in[i-5];
}
}
/* Full 3d convolution using data transposition (IN PLACE)
(Verified and tested against conv3dtest for full 3D data sets. Note
that to pass that test, we need a padding region of size pd=n+3 for
third-order interpolets.)
datout: input/output data
nx, ny, nz: size of region actually being computed
npx, npy, npz: size of padded region needed for "leakage" of
foward filters
*** NOTE ***: must also equal DIMENSIONS of datout!
reversefilter: reverse-time filter coefficients (2d array: [x,y,z][filter])
forwardfilter: forward-time filter coefficients (2d array: [x,y,z][filter])
hfl: +/- reach of filter, filter "half" length
*/
void
conv3d(double datout[], double *reversefilter, double *forwardfilter,
double *fullfilter,
int nx, int ny, int nz,
int npx, int npy, int npz,
int hfl)
{
int i,j;
int npyz=npy*npz;
double *datoutp=datout;
/* Convolution of each yz plane */
for (i=0; i<nx; i++) {
datoutp=datout+i*npyz;
/* Along z first */
/* (Note: When inlined in C, this routine is faster than fortran!) */
switch (hfl) {
case 3:
for (j=0; j<ny; j++)
convpts3(datout+(i*npy+j)*npz,nz,npz,reversefilter,forwardfilter);
break;
case 5:
for (j=0; j<ny; j++)
convpts5(datout+(i*npy+j)*npz,nz,npz,reversefilter,forwardfilter);
break;
default:
fprintf(stderr,"\n\nhfl=%d not supported in conv3d (1)\n\n",hfl);
exit(1);
}
/* Now do plane along y */
/* (This seems too big to inline, and fortran runs a lot faster!) */
switch (hfl) {
case 3:
convsets3_(datoutp,
datoutp-3*npz,datoutp-2*npz,datoutp-npz,
datoutp+npz,datoutp+2*npz,datoutp+3*npz,
&nz,&npz,&ny,&npy,reversefilter+1*(hfl+1),
forwardfilter+1*(hfl+1));
break;
case 5:
convsets5_(datoutp,
datoutp-5*npz,datoutp-4*npz,
datoutp-3*npz,datoutp-2*npz,datoutp-npz,
datoutp+npz,datoutp+2*npz,datoutp+3*npz,
datoutp+4*npz,datoutp+5*npz,
&nz,&npz,&ny,&npy,
reversefilter+1*(hfl+1),forwardfilter+1*(hfl+1));
break;
default:
fprintf(stderr,"\n\nhfl=%d not supported in conv3d (2)\n\n",hfl);
exit(1);
}
}
/* Convolutions along z in small chunks */
for (j=0; j<ny; j++) {
datoutp=datout+j*npz;
switch (hfl) {
case 3: convsets3_(datoutp,
datoutp-3*npyz,datoutp-2*npyz,datoutp-npyz,
datoutp+npyz,datoutp+2*npyz,datoutp+3*npyz,
&nz,&npyz,&nx,&npx,
reversefilter+2*(hfl+1),forwardfilter+2*(hfl+1));
break;
case 5: convsets5_(datoutp,
datoutp-5*npyz,datoutp-4*npyz,
datoutp-3*npyz,datoutp-2*npyz,datoutp-npyz,
datoutp+npyz,datoutp+2*npyz,datoutp+3*npyz,
datoutp+4*npyz,datoutp+5*npyz,
&nz,&npyz,&nx,&npx,
reversefilter+2*(hfl+1),forwardfilter+2*(hfl+1));
break;
default:
fprintf(stderr,"\n\nhfl=%d not supported in conv3d (3)\n\n",hfl);
exit(1);
}
}
}
/* Full 3d convolution where results are placed on a different
grid than the input grid.
datin: input data
datout: output data, assumed to have same dimensions as datin
ifcum: specifies whether we accumulate results onto datout or not
reversefilter: reverse-time filter coefficients (2d array: [x,y,z][filter])
forwardfilter: forward-time filter coefficients (2d array: [x,y,z][filter])
nlx, nly, nlz: linear dimensions of data sets
hfl: +/- reach of filter, filter "half" length
work: worspace for convulution
*/
void
conv3dnew(double datin[],double datout[],int ifcum,int periodic,
double *reversefilter, double *forwardfilter, double *fullfilter,
int nlx, int nly, int nlz,
int hfl, double work[], double work_extra[])
{
int i,j;
/*Constants giving region of workspace which must be computed */
int nnx=nlx+hfl;
int nny=nly+hfl;
int nnz=nlz+hfl;
int nnyz=nny*nnz;
/* Constants describing filling of result into lower region*/
int nlyz=nly*nlz;
double *workp,*datoutp;
/* Convolution of each yz plane */
for (i=0; i<nlx; i++){
workp=work+i*nnyz;
/* Along z first */
/* (Note: When inlined in C, this routine is faster than fortran!) */
switch (hfl) {
case 5:
for (j=0; j<nly; j++)
if (periodic)
convptsioper5(work+(i*nny+j)*nnz,
datin+(i*nly+j)*nlz,
nlz,nnz,
reversefilter+2*(hfl+1),
forwardfilter+2*(hfl+1));
else
/* convptsio5full(work+(i*nny+j)*nnz, */
/* datin+(i*nly+j)*nlz, */
/* nlz,nnz,fullfilter); */
convptsio5(work+(i*nny+j)*nnz,
datin+(i*nly+j)*nlz,
nlz,nnz,
reversefilter+2*(hfl+1),
forwardfilter+2*(hfl+1));
break;
default:
fprintf(stderr,"\n\nhfl=%d not supported in conv3dnew (1)\n\n",hfl);
exit(1);
}
/* Now do plane along y */
/* (This seems too big to inline, and fortran runs a lot faster!) */
switch (hfl) {
case 5:
if (periodic)
convsetsper5_(workp,
workp-5*nnz,workp-4*nnz,
workp-3*nnz,workp-2*nnz,workp-nnz,
workp+nnz,workp+2*nnz,workp+3*nnz,
workp+4*nnz,workp+5*nnz,
&nlz,&nnz,&nly,&nny,
reversefilter+1*(hfl+1),forwardfilter+1*(hfl+1));
else
convsets5_(workp,
workp-5*nnz,workp-4*nnz,
workp-3*nnz,workp-2*nnz,workp-nnz,
workp+nnz,workp+2*nnz,workp+3*nnz,
workp+4*nnz,workp+5*nnz,
&nlz,&nnz,&nly,&nny,
reversefilter+1*(hfl+1),forwardfilter+1*(hfl+1));
break;
default:
fprintf(stderr,"\n\nhfl=%d not supported in conv3dnew (2)\n\n",hfl);
exit(1);
}
}
/* Convolutions along z in small chunks */
if (ifcum==0) /*No cumulative addition, just replacing data on out.*/
for (j=0; j<nly; j++) {
workp=work+j*nnz;
datoutp=datout+j*nlz;
switch (hfl) {
case 5:
if (periodic)
convsetsioppercum5_(datoutp,
workp,
workp-5*nnyz,workp-4*nnyz,
workp-3*nnyz,workp-2*nnyz,workp-1*nnyz,
workp+1*nnyz,workp+2*nnyz,workp+3*nnyz,
workp+4*nnyz,workp+5*nnyz,
&nlz,&nnyz,&nlx,&nnx,&nlyz,
reversefilter+0*(hfl+1),
forwardfilter+0*(hfl+1));
else
convsetsio5full_(datoutp,
workp,
workp-5*nnyz,workp-4*nnyz,
workp-3*nnyz,workp-2*nnyz,workp-1*nnyz,
workp+1*nnyz,workp+2*nnyz,workp+3*nnyz,
workp+4*nnyz,workp+5*nnyz,
&nlz,&nnyz,&nlx,&nnx,&nlyz,
fullfilter+0*(hfl+1) );
break;
default:
fprintf(stderr,"\n\nhfl=%d not supported in conv3dnew (3)\n\n",hfl);
exit(1);
}
}
if (ifcum == 1)
/*Add the values of the convolution to the values already on out*/
for (j=0; j<nly; j++) {
workp=work+j*nnz;
datoutp=datout+j*nlz;
switch (hfl) {
case 5:
if (periodic)
convsetsiopper5_(datoutp,
workp,
workp-5*nnyz,workp-4*nnyz,
workp-3*nnyz,workp-2*nnyz,workp-1*nnyz,
workp+1*nnyz,workp+2*nnyz,workp+3*nnyz,
workp+4*nnyz,workp+5*nnyz,
&nlz,&nnyz,&nlx,&nnx,&nlyz,
reversefilter+0*(hfl+1),
forwardfilter+0*(hfl+1));
else
convsetsiop5full_(datoutp,
workp,
workp-5*nnyz,workp-4*nnyz,
workp-3*nnyz,workp-2*nnyz,workp-1*nnyz,
workp+1*nnyz,workp+2*nnyz,workp+3*nnyz,
workp+4*nnyz,workp+5*nnyz,
&nlz,&nnyz,&nlx,&nnx,&nlyz,
fullfilter+0*(hfl+1));
/* convsetsiop5_(datoutp, */
/* workp, */
/* workp-5*nnyz,workp-4*nnyz, */
/* workp-3*nnyz,workp-2*nnyz,workp-1*nnyz, */
/* workp+1*nnyz,workp+2*nnyz,workp+3*nnyz, */
/* workp+4*nnyz,workp+5*nnyz, */
/* &nlz,&nnyz,&nlx,&nnx,&nlyz, */
/* reversefilter+0*(hfl+1), */
/* forwardfilter+0*(hfl+1)); */
break;
default:
fprintf(stderr,"\n\nhfl=%d not supported in conv3dnew (3)\n\n",hfl);
exit(1);
}
}
}
/* 3d conv computing onto ghost points only (IN PLACE)
(Key Lesson:
For this convolution, where our output data was "spotty", the trick
was to do things in the opposite from the forward transform, so
that our processing never involved skipping over "holes" in cache
lines.
This also teaches that once you have a good cache sequence for an
operation, a good sequence for the transpose operation involves
reversing the sequence, just as in (A*B*C)'=C'*B'*A'. Hmmm....)
inout: input/output data
nx, ny, nz: dimension of region actually being computed
npx, npy, npz: dimension of padded region needed for "leakage" of
foward filters
reversefilter: reverse-time filter coefficients (2d array: [x,y,z][filter])
forwardfilter: forward-time filter coefficients (2d array: [x,y,z][filter])
*/
void
Pconv3d(double inout[], int nx, int ny, int nz, int npx, int npy, int npz,
double reversefilter[3][4], double forwardfilter[3][4])
{
int i,j;
int npyz=npy*npz;
double *inoutp=inout;
/* Convolutions, in linear y chuncks, of yz planes along x */
for (j=0; j<ny; j++) {
inoutp=inout+j*npz;
pconvsets3_(inoutp,
inoutp-3*npyz,inoutp-2*npyz,inoutp-npyz,
inoutp+npyz,inoutp+2*npyz,inoutp+3*npyz,
&nz,&npyz,&nx,&npx,reversefilter[2],forwardfilter[2]);
}
/* Convolution of each yz plane */
for (i=0; i<nx; i+=2){
/* Plane i along y */
/* (This seems too big to inline, and fortran runs a lot faster!) */
inoutp=inout+i*npyz;
pconvsets3_(inoutp,
inoutp-3*npz,inoutp-2*npz,inoutp-npz,
inoutp+npz,inoutp+2*npz,inoutp+3*npz,
&nz,&npz,&ny,&npy,reversefilter[1],forwardfilter[1]);
/* Finally, along z */
/* (Note: When inlined in C, this routine is faster than fortran!) */
for (j=0; j<ny; j+=2)
convpts3(inout+(i*npy+j)*npz,nz,npz,reversefilter[0],forwardfilter[0]);
}
}
/*
Back and forth, linear convolution using padding
(IN / +=OUT on ghosts)
out: output array of just ghosts (upper level)
in: input array (lower level)
n: size of actual data set being convolved
npad: size of padded region needed for "leakage" of foward filters
over the edge
reversefilter: reverse-time filter coefficients
forwardfilter: forward-time filter coefficients
------------------
not used currently 07/19/00 i.d.
*/
void
aPconvptsp3(double out[],double in[],int n,
int npad,double fullfilter[])
{
{
register int i;
register double t1=fullfilter[1], t3=fullfilter[3];
i=0;
out[i]+=t3*in[2*i+3];
i=1;
out[i]+=in[2*i]+t1*in[2*i+1]+t3*in[2*i+3];
i=2;
out[i]+=t1*in[2*i-1]+in[2*i]+t1*in[2*i+1]+t3*in[2*i+3];
for (i=3; i<n/2-3; i++)
out[i]+=t3*in[2*i-3]+t1*in[2*i-1]
+in[2*i]+t1*in[2*i+1]+t3*in[2*i+3];
i=n/2-3;
out[i]+=t3*in[2*i-3]+t1*in[2*i-1]
+in[2*i]+t1*in[2*i+1];
i=n/2-2;
out[i]+=t3*in[2*i-3]+t1*in[2*i-1];
i=n/2-1;
out[i]+=t3*in[2*i-3];
}
}
/*
Back and forth, linear convolution using padding
(IN / +=OUT on ghosts)
out: output array of just ghosts (upper level)
in: input array (lower level)
n: size of actual data set being convolved
npad: size of padded region needed for "leakage" of foward filters
over the edge
reversefilter: reverse-time filter coefficients
forwardfilter: forward-time filter coefficients
*/
void
aPconvptsp8full(double out[],double in[],int n,
int npad,double fullfilter[])
{
register int i;
register double t0=fullfilter[0], t1=fullfilter[1], t2=fullfilter[2],
t3=fullfilter[3], t4=fullfilter[4], t5=fullfilter[5],
t6=fullfilter[6], t7=fullfilter[7], t8=fullfilter[8];
if(t0 != 0.0)
{ /* symmetric filter */
i=0;
out[i]+=t8*in[2*i+8];
i=1;
out[i]+=t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
i=2;
out[i]+=t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
i=3;
out[i]+=t2*in[2*i+2]+t3*in[2*i+3]+t4*in[2*i+4]
+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
i=4;
out[i]+=t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
i=5;
out[i]+=t2*in[2*i-2]+t1*in[2*i-1]+t0*in[2*i]+t1*in[2*i+1]
+t2*in[2*i+2]+t3*in[2*i+3]+t4*in[2*i+4]+t5*in[2*i+5]
+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
i=6;
out[i]+=t4*in[2*i-4]+t3*in[2*i-3]+t2*in[2*i-2]+t1*in[2*i-1]
+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]+t4*in[2*i+4]
+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
i=7;
out[i]+=t6*in[2*i-6]+t5*in[2*i-5]+t4*in[2*i-4]+t3*in[2*i-3]
+t2*in[2*i-2]+t1*in[2*i-1]+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]
+t3*in[2*i+3]+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]
+t7*in[2*i+7]+t8*in[2*i+8];
for (i=8; i<n/2-8; i++)
out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5]
+t4*in[2*i-4]+t3*in[2*i-3]+t2*in[2*i-2]+t1*in[2*i-1]
+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]
+t8*in[2*i+8];
i=n/2-8;
out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5]
+t4*in[2*i-4]+t3*in[2*i-3]+t2*in[2*i-2]+t1*in[2*i-1]
+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7];
i=n/2-7;
out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5]
+t4*in[2*i-4]+t3*in[2*i-3]+t2*in[2*i-2]+t1*in[2*i-1]
+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
+t4*in[2*i+4]+t5*in[2*i+5];
i=n/2-6;
out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5]
+t4*in[2*i-4]+t3*in[2*i-3]+t2*in[2*i-2]+t1*in[2*i-1]
+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3];
i=n/2-5;
out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5]
+t4*in[2*i-4]+t3*in[2*i-3]+t2*in[2*i-2]+t1*in[2*i-1]
+t0*in[2*i]+t1*in[2*i+1];
i=n/2-4;
out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5]
+t4*in[2*i-4]+t3*in[2*i-3]+t2*in[2*i-2]+t1*in[2*i-1];
i=n/2-3;
out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5]
+t4*in[2*i-4]+t3*in[2*i-3];
i=n/2-2;
out[i]+=t8*in[2*i-8]+t7*in[2*i-7]+t6*in[2*i-6]+t5*in[2*i-5];
i=n/2-1;
out[i]+=t8*in[2*i-8]+t7*in[2*i-7];
/* out[i]+=t3*in[2*i-3]+t1*in[2*i-1]+in[2*i]; */
}
else
{ /* antisymmetric filter */
i=0;
out[i]+=t8*in[2*i+8];
i=1;
out[i]+=t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
i=2;
out[i]+=t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
i=3;
out[i]+=t2*in[2*i+2]+t3*in[2*i+3]+t4*in[2*i+4]
+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
i=4;
out[i]+=t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
i=5;
out[i]+=-1.*t2*in[2*i-2]-t1*in[2*i-1]+t0*in[2*i]+t1*in[2*i+1]
+t2*in[2*i+2]+t3*in[2*i+3]+t4*in[2*i+4]+t5*in[2*i+5]
+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
i=6;
out[i]+=-1.*t4*in[2*i-4]-t3*in[2*i-3]-t2*in[2*i-2]-t1*in[2*i-1]
+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]+t4*in[2*i+4]
+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
i=7;
out[i]+=-1.*t6*in[2*i-6]-t5*in[2*i-5]-t4*in[2*i-4]-t3*in[2*i-3]
-t2*in[2*i-2]-t1*in[2*i-1]+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]
+t3*in[2*i+3]+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]
+t7*in[2*i+7]+t8*in[2*i+8];
for (i=8; i<n/2-8; i++)
out[i]+= -1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5]
-t4*in[2*i-4]-t3*in[2*i-3]-t2*in[2*i-2]-t1*in[2*i-1]
+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]
+t8*in[2*i+8];
i=n/2-8;
out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5]
-t4*in[2*i-4]-t3*in[2*i-3]-t2*in[2*i-2]-t1*in[2*i-1]
+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7];
i=n/2-7;
out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5]
-t4*in[2*i-4]-t3*in[2*i-3]-t2*in[2*i-2]-t1*in[2*i-1]
+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
+t4*in[2*i+4]+t5*in[2*i+5];
i=n/2-6;
out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5]
-t4*in[2*i-4]-t3*in[2*i-3]-t2*in[2*i-2]-t1*in[2*i-1]
+t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3];
i=n/2-5;
out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5]
-t4*in[2*i-4]-t3*in[2*i-3]-t2*in[2*i-2]-t1*in[2*i-1]
+t0*in[2*i]+t1*in[2*i+1];
i=n/2-4;
out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5]
-t4*in[2*i-4]-t3*in[2*i-3]-t2*in[2*i-2]-t1*in[2*i-1];
i=n/2-3;
out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5]
-t4*in[2*i-4]-t3*in[2*i-3];
i=n/2-2;
out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7]-t6*in[2*i-6]-t5*in[2*i-5];
i=n/2-1;
out[i]+=-1.*t8*in[2*i-8]-t7*in[2*i-7];
/* out[i]+=t3*in[2*i-3]+t1*in[2*i-1]+in[2*i]; */
}
}
/*
Back and forth, linear convolution using padding
(IN / +=OUT on ghosts)
out: output array of just ghosts (upper level)
in: input array (lower level)
n: size of actual data set being convolved
npad: size of padded region needed for "leakage" of foward filters
over the edge
reversefilter: reverse-time filter coefficients
forwardfilter: forward-time filter coefficients
---------------------------
currently not used 07/19/00 i.d.
*/
void
aPconvptsp8(double out[],double in[],int n,int npad,
double reversefilter[],double forwardfilter[])
{
register int i;
for (i=n; i<npad; i++) in[i]=0.; /* First, make sure pad is zero */
/* Reverse part */
{
register double t0=reversefilter[0], t1=reversefilter[1],
t2=reversefilter[2], t3=reversefilter[3],
t4=reversefilter[4], t5=reversefilter[5],
t6=reversefilter[6], t7=reversefilter[7],
t8=reversefilter[8];
for (i=npad-1; i>7; i--)
in[i]=t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+
t4*in[i-4]+t5*in[i-5]+t6*in[i-6]+t7*in[i-7]+
t8*in[i-8];
in[7]=t0*in[7]+t1*in[6]+t2*in[5]+t3*in[4]
+t4*in[3]+t5*in[2]+t6*in[1]+t7*in[0];
in[6]=t0*in[6]+t1*in[5]+t2*in[4]+t3*in[3]+t4*in[2]+t5*in[1]+t6*in[0];
in[5]=t0*in[5]+t1*in[4]+t2*in[3]+t3*in[2]+t4*in[1]+t5*in[0];
in[4]=t0*in[4]+t1*in[3]+t2*in[2]+t3*in[1]+t4*in[0];
in[3]=t0*in[3]+t1*in[2]+t2*in[1]+t3*in[0];
in[2]=t0*in[2]+t1*in[1]+t2*in[0];
in[1]=t0*in[1]+t1*in[0];
in[0]=t0*in[0];
}
/* Forward part (padding now ensures room!) */
/* Note that even points on "in" are accumulated onto "out" */
{
register double t0=forwardfilter[0], t1=forwardfilter[1],
t2=forwardfilter[2], t3=forwardfilter[3],
t4=forwardfilter[4], t5=forwardfilter[5],
t6=forwardfilter[6], t7=forwardfilter[7],
t8=forwardfilter[8];
for (i=0; i<n/2; i++)
out[i]+=t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]
+t8*in[2*i+8];
}
}
/*
Back and forth, linear convolution using padding
(IN / -=OUT on ghosts)
out: output array of just ghosts (upper level)
in: input array (lower level)
n: size of actual data set being convolved
npad: size of padded region needed for "leakage" of foward filters
over the edge
fullfilter: convolution filter
*/
void
aPconvptsm3(double out[],double in[],int n,int npad,double fullfilter[])
{
register int i;
/* First, make sure pad is zero */
/* for (i=n; i<npad; i++) in[i]=0.;i*/
/* Note that even points on "in" are accumulated onto "out" */
{
register double t1=fullfilter[1], t3=fullfilter[3];
i=0;
out[i]-=in[2*i]+t1*in[2*i+1]+t3*in[2*i+3];
i=1;
out[i]-=t1*in[2*i-1]+in[2*i]+t1*in[2*i+1]+t3*in[2*i+3];
for (i=2; i<n/2-2; i++)
out[i]-= t3*in[2*i-3] +t1*in[2*i-1] +in[2*i]+t1*in[2*i+1]+t3*in[2*i+3];
i=n/2-2;
out[i]-= t3*in[2*i-3] + t1*in[2*i-1] + in[2*i]+t1*in[2*i+1];
i=n/2-1;
out[i]-= t3*in[2*i-3] + t1*in[2*i-1] + in[2*i];
}
}
/*
Back and forth, linear convolution using padding
(IN / -=OUT on ghosts)
out: output array of just ghosts (upper level)
in: input array (lower level)
n: size of actual data set being convolved
npad: size of padded region needed for "leakage" of foward filters
over the edge
reversefilter: reverse-time filter coefficients
forwardfilter: forward-time filter coefficients
-----------------------------------
currently not used 07/19/00 i.d.
*/
void
aPconvptsm8(double out[],double in[],int n,int npad,
double reversefilter[],double forwardfilter[])
{
register int i;
for (i=n; i<npad; i++) in[i]=0.; /* First, make sure pad is zero */
/* Reverse part */
{
register double t0=reversefilter[0], t1=reversefilter[1],
t2=reversefilter[2], t3=reversefilter[3],
t4=reversefilter[4], t5=reversefilter[5],
t6=reversefilter[6], t7=reversefilter[7];
register double t8=reversefilter[8];
for (i=npad-1; i>7; i--)
in[i]=t0*in[i]+t1*in[i-1]+t2*in[i-2]+t3*in[i-3]+
t4*in[i-4]+t5*in[i-5]+t6*in[i-6]+t7*in[i-7]+
t8*in[i-8];
in[7]=t0*in[7]+t1*in[6]+t2*in[5]+t3*in[4]
+t4*in[3]+t5*in[2]+t6*in[1]+t7*in[0];
in[6]=t0*in[6]+t1*in[5]+t2*in[4]+t3*in[3]+t4*in[2]+t5*in[1]+t6*in[0];
in[5]=t0*in[5]+t1*in[4]+t2*in[3]+t3*in[2]+t4*in[1]+t5*in[0];
in[4]=t0*in[4]+t1*in[3]+t2*in[2]+t3*in[1]+t4*in[0];
in[3]=t0*in[3]+t1*in[2]+t2*in[1]+t3*in[0];
in[2]=t0*in[2]+t1*in[1]+t2*in[0];
in[1]=t0*in[1]+t1*in[0];
in[0]=t0*in[0];
}
/* Forward part (padding now ensures room!) */
/* Note that even points on "in" are accumulated onto "out" */
{
register double t0=forwardfilter[0], t1=forwardfilter[1],
t2=forwardfilter[2], t3=forwardfilter[3],
t4=forwardfilter[4], t5=forwardfilter[5],
t6=forwardfilter[6], t7=forwardfilter[7];
register double t8=forwardfilter[8];
for (i=0; i<n/2; i++)
out[i]-=t0*in[2*i]+t1*in[2*i+1]+t2*in[2*i+2]+t3*in[2*i+3]
+t4*in[2*i+4]+t5*in[2*i+5]+t6*in[2*i+6]+t7*in[2*i+7]+t8*in[2*i+8];
}
}
/*
Full two-level 3d convolution of conjugate transform type (IN PLACE)
Computes I +/- P I ~P, where P is projection onto ghosts, ~P is the
projection onto non-ghosts, and I is the simple interpolet
convolusion, and OBEYS REAL/COEFF SPACE CONVENTIONS on output.
upper: Points to location in upper data array corresponding to
origin of lower array.
lower: Points to origin of lower array
sign: Sign of transform, + is Idag, - is Jdag
reversefilter: reverse-time filter coefficients (2d array: [x,y,z][filter])
forwardfilter: forward-time filter coefficients (2d array: [x,y,z][filter])
fullfilter: non-factorized filter coefficients
nux, nuy, nuz: dimensions of upper array
nlx, nly, nlz: dimensions of lower array
hfl: length of half-filters
work, work_extra: workspaces of size
at least (nlx+3*hfl)*(nly+3*hfl)*(nlz+3*hfl)
*/
void
aPconv3d(double upper[], double lower[], int sign,
double reversefilter[], double forwardfilter[],
double fullfilter[],
int nux, int nuy, int nuz,
int nlx, int nly, int nlz,
int hfl,
double work[],
double work_extra[])
{
int i,j,k;
int nlyz=nly*nlz;
int nx=nlx+2*hfl, ny=nly+2*hfl, nz=nlz+2*hfl;
int nwx=nlx+3*hfl, nwy=nly+3*hfl, nwz=nlz+3*hfl;
int nwyz=nwy*nwz;
int fptl=2,lptl=2+nlz-1;
double *workp,*lowerp;
double *workp_extra;
int flag;
int padxb,padyb,padzb;
padxb=2*(hfl/2);
padyb=2*(hfl/2);
padzb=2*(hfl/2);
fptl=padxb; lptl=padxb+nlz-1;
/* printf("HFL and reversefilter are %d %12.5lf\n",hfl,reversefilter[0]); */
/* INDEXING NOTES:
(1) Our basic indexing refers to the work array, which has a
buffer region that circumscribes the lower level. The
beginning of the lower level is thus at (2,2,2) in the
indexing below.
(2) "upper" is defined to reference the point in the upper array
that corresponds to the origin of the lower array. There are
two things needed to translate properly: a homogeneous scale
factor of two AND to adjust for the fact that the origin of
the work array refers to the point (-1,-1,-1) from the point
in the upper array which corresponds to the origin of the
lower. The scale factor is handled explicitly in the code
below. To adjust for the issue of the origin, we include a
shift +((i/2-1)*nuy+(j/2-1))*nuz-1 where upper is used below.
(3) "lower" references the origin of the "lower" array where the
input comes from. In the lower arrray, the locations of the
origin of the work array is (-2,-2,-2), and thus we use
lowerp=lower+(-2)*nlyz+(j-2)*nlz-2 below.
*/
/* Convolutions, in linear y chuncks, of yz planes along x, with
transfer of data from lower to work */
for (j=0; j<ny; j++) {
workp=work+j*nwz;
workp_extra=work_extra+j*nwz;
lowerp=lower+(-padxb)*nlyz+(j-padyb)*nlz-padzb;
if (j<padyb || j>=padyb+nly) {
/* fill in pads with zeros */
for (i=0; i<nx; i++)
for (k=0; k<nz; k++) {
*(workp+i*nwyz+k)=0.;
*(workp_extra+i*nwyz+k)=0.;
}
}
else {
/* Convolutions along z in small chuncks, with transfer of
data from lower into work */
flag=j%2;
switch (hfl) {
case 3:
pconvsetsi3new_(workp_extra,
lowerp,
lowerp-3*nlyz,lowerp-2*nlyz,lowerp-nlyz,
lowerp+nlyz,lowerp+2*nlyz,lowerp+3*nlyz,
&nz,&nwyz,&nx,&nwx,
&nlyz,&fptl,&lptl,
fullfilter+0*(hfl+1),&flag,&sign);
/* pconvsetsi3old_( */
/* workp, */
/* workp-3*nwyz,workp-2*nwyz,workp-nwyz, */
/* workp+nwyz,workp+2*nwyz,workp+3*nwyz, */
/* lowerp, */
/* lowerp-3*nlyz,lowerp-2*nlyz,lowerp-nlyz, */
/* lowerp+nlyz,lowerp+2*nlyz,lowerp+3*nlyz, */
/* &nz,&nwyz,&nx,&nwx, */
/* &nlyz,&fptl,&lptl, */
/* reversefilter+0*(hfl+1),forwardfilter+0*(hfl+1),&flag,&sign); */
break;
case 8:
pconvsetsi8new_(workp,
lowerp,
lowerp-8*nlyz,lowerp-7*nlyz,lowerp-6*nlyz,
lowerp-5*nlyz,lowerp-4*nlyz,lowerp-3*nlyz,
lowerp-2*nlyz,lowerp-1*nlyz,
lowerp+nlyz,lowerp+2*nlyz,lowerp+3*nlyz,
lowerp+4*nlyz,lowerp+5*nlyz,lowerp+6*nlyz,
lowerp+7*nlyz,lowerp+8*nlyz,
&nz,&nwyz,&nx,&nwx,
&nlyz,&fptl,&lptl,
fullfilter+0*(hfl+1),&flag,&sign);
/* pconvsetsi8_( */
/* workp, */
/* workp-8*nwyz,workp-7*nwyz, */
/* workp-6*nwyz,workp-5*nwyz,workp-4*nwyz, */
/* workp-3*nwyz,workp-2*nwyz,workp-nwyz, */
/* workp+nwyz,workp+2*nwyz,workp+3*nwyz, */
/* workp+4*nwyz,workp+5*nwyz,workp+6*nwyz, */
/* workp+7*nwyz,workp+8*nwyz, */
/* lowerp, */
/* lowerp-8*nlyz,lowerp-7*nlyz, */
/* lowerp-6*nlyz,lowerp-5*nlyz,lowerp-4*nlyz, */
/* lowerp-3*nlyz,lowerp-2*nlyz,lowerp-nlyz, */
/* lowerp+nlyz,lowerp+2*nlyz,lowerp+3*nlyz, */
/* lowerp+4*nlyz,lowerp+5*nlyz,lowerp+6*nlyz, */
/* lowerp+7*nlyz,lowerp+8*nlyz, */
/* &nz,&nwyz,&nx,&nwx, */
/* &nlyz,&fptl,&lptl, */
/* reversefilter+0*(hfl+1), */
/* forwardfilter+0*(hfl+1),&flag,&sign); */
break;
default:
fprintf(stderr,
"\n\nhfl=%d not supported in aPconv3d (1)\n\n",hfl);
exit(1);
}
}
}
/* Convolution of each yz plane */
for (i=0; i<nx; i+=2) {
/* Convolution of plane i along y */
/* (This seems too big to inline, and fortran runs a lot faster!) */
workp=work+i*nwyz;
workp_extra=work_extra+i*nwyz;
switch (hfl) {
case 3:
pconvsetsio3full_(workp,workp_extra,
workp_extra-3*nwz,workp_extra-2*nwz,
workp_extra-nwz,
workp_extra+nwz,workp_extra+2*nwz,
workp_extra+3*nwz,
&nz,&nwz,&ny,&nwy,fullfilter+1*(hfl+1));
break;
case 8:
pconvsetsio8full_(workp_extra,workp,workp-8*nwz,
workp-7*nwz,workp-6*nwz,
workp-5*nwz,workp-4*nwz,
workp-3*nwz,workp-2*nwz,
workp-nwz,
workp+nwz,workp+2*nwz,
workp+3*nwz,workp+4*nwz,
workp+5*nwz,workp+6*nwz,
workp+7*nwz,workp+8*nwz,
&nz,&nwz,&ny,&nwy,fullfilter+1*(hfl+1));
/* pconvsets8_(workp, */
/* workp-8*nwz,workp-7*nwz, */
/* workp-6*nwz,workp-5*nwz,workp-4*nwz, */
/* workp-3*nwz,workp-2*nwz,workp-nwz, */
/* workp+nwz,workp+2*nwz,workp+3*nwz, */
/* workp+4*nwz,workp+5*nwz,workp+6*nwz, */
/* workp+7*nwz,workp+8*nwz, */
/* &nz,&nwz,&ny,&nwy, */
/* reversefilter+1*(hfl+1),forwardfilter+1*(hfl+1)); */
break;
default:
fprintf(stderr,
"\n\nhfl=%d not supported in aPconv3d (1)\n\n",hfl);
exit(1);
}
/* Finally, convolution along z, from work with accumulation
onto upper */
/* (Note: When inlined in C, this routine is faster than fortran!) */
if (sign==1)
for (j=0; j<ny; j+=2) {
switch (hfl) {
case 3:
aPconvptsp3(upper+((i/2-1)*nuy+(j/2-1))*nuz-1,
work+(i*nwy+j)*nwz,nz,nwz,
fullfilter+2*(hfl+1));
/* aPconvptsp3(upper+((i/2-1)*nuy+(j/2-1))*nuz-1, */
/* work+(i*nwy+j)*nwz, */
/* nz,nwz, */
/* reversefilter+2*(hfl+1), */
/* forwardfilter+2*(hfl+1)); */
break;
case 8:
aPconvptsp8full(upper+((i/2-padxb/2)*nuy+(j/2-padyb/2))
*nuz-padzb/2,
work_extra+(i*nwy+j)*nwz,nz,nwz,
fullfilter+2*(hfl+1) );
/* aPconvptsp8(upper+((i/2-padxb/2)*nuy+(j/2-padyb/2)) */
/* *nuz-padzb/2, */
/* work+(i*nwy+j)*nwz,nz,nwz, */
/* reversefilter+2*(hfl+1), */
/* forwardfilter+2*(hfl+1) ); */
break;
default:
fprintf(stderr,
"\n\nhfl=%d not supported in aPconv3d (1)\n\n",hfl);
exit(1);
}
/* Impose real/coef space constraint on output
(could be done in pconvsetsi3 with 2.5% speedup, but
this way makes code more managable */
if (i>=padxb && i<padxb+nlx && j>=padyb && j<padyb+nly)
for (k=padzb; k<padzb+nlz; k+=2)
*(lower+((i-padxb)*nly+(j-padyb))*nlz+(k-padzb))=0.;
}
else if (sign == -1 )
for (j=0; j<ny; j+=2)
{
switch (hfl)
{
case 3:
aPconvptsm3(upper+((i/2-1)*nuy+(j/2-1))*nuz-1,work
+(i*nwy+j)*nwz,
nz,nwz,fullfilter+2*(hfl+1));
break;
default:
fprintf(stderr,
"\n\nhfl=%d not supported in aPconv3d (1)\n\n",hfl);
exit(1);
}
/* Impose real/coef space constraint on output
(could be done in pconvsetsi3 with 2.5% speedup, but
this way makes code more managable */
if (i>=padxb && i<padxb+nlx && j>=padyb && j<padyb+nly)
for (k=padzb; k<padzb+nlz; k+=2)
*(lower+((i-padxb)*nly+(j-padyb))*nlz+(k-padzb))=
*(upper+((i/2-padxb/2)*nuy+(j/2-padyb/2))*nuz+(k/2-padzb/2));
}
}
}
/*
Full two-level 3d convolution of conjugate transform type (IN PLACE)
Computes I +/- P I ~P, where P is projection onto ghosts, ~P is the
projection onto non-ghosts, and I is the simple interpolet
convolusion, and OBEYS REAL/COEFF SPACE CONVENTIONS on output.
upper: Points to location in upper data array corresponding to
origin of lower array.
lower: Points to origin of lower array
sign: Sign of transform, + is Idag, - is Jdag
reversefilter: reverse-time filter coefficients (2d array: [x,y,z][filter])
forwardfilter: forward-time filter coefficients (2d array: [x,y,z][filter])
nux, nuy, nuz: dimensions of upper array
nlx, nly, nlz: dimensions of lower array
hfl: length of half-filters
work: workspace of size at least (nlx+3*hfl)*(nly+3*hfl)*(nlz+3*hfl)
-----------------------
currently not used 07/19/00 i.d.
*/
void
aPconv3dtest(double upper[], double lower[], int sign,
double reversefilter[], double forwardfilter[],
double fullfilter[],
int nux, int nuy, int nuz,
int nlx, int nly, int nlz,
int hfl,
double work[])
{
int i,j,k;
int nlyz=nly*nlz;
int nx=nlx+2*hfl, ny=nly+2*hfl, nz=nlz+2*hfl;
int nwx=nlx+3*hfl, nwy=nly+3*hfl, nwz=nlz+3*hfl;
int nwyz=nwy*nwz;
int fptl=2,lptl=2+nlz-1;
double *workp,*lowerp;
int flag;
int padxb,padyb,padzb;
padxb=2*(hfl/2);
padyb=2*(hfl/2);
padzb=2*(hfl/2);
fptl=padxb; lptl=padxb+nlz-1;
/* Convolutions, in linear y chuncks, of yz planes along x, with
transfer of data from lower to work */
/* j=2; */
for (j=0; j<ny; j++) {
workp=work+j*nwz;
lowerp=lower+(-padxb)*nlyz+(j-padyb)*nlz-padzb;
if (j<padyb || j>=padyb+nly) {
/* fill in pads with zeros */
for (i=0; i<nx; i++)
for (k=0; k<nz; k++)
*(workp+i*nwyz+k)=0.;
}
else {
/* Convolutions along z in small chuncks, with transfer of
data from lower into work */
flag=j%2;
switch (hfl) {
case 3:
/* pconvsetsi3new_( */
/* workp, */
/* lowerp, */
/* lowerp-3*nlyz,lowerp-2*nlyz,lowerp-nlyz, */
/* lowerp+nlyz,lowerp+2*nlyz,lowerp+3*nlyz, */
/* &nz,&nwyz,&nx,&nwx, */
/* &nlyz,&fptl,&lptl, */
/* fullfilter+0*(hfl+1),&flag,&sign); */
pconvsetsi3old_(workp,
workp-3*nwyz,workp-2*nwyz,workp-nwyz,
workp+nwyz,workp+2*nwyz,workp+3*nwyz,
lowerp,
lowerp-3*nlyz,lowerp-2*nlyz,lowerp-nlyz,
lowerp+nlyz,lowerp+2*nlyz,lowerp+3*nlyz,
&nz,&nwyz,&nx,&nwx,
&nlyz,&fptl,&lptl,
reversefilter+0*(hfl+1),
forwardfilter+0*(hfl+1),&flag,&sign);
break;
default:
fprintf(stderr,
"\n\nhfl=%d not supported in aPconv3d (1)\n\n",hfl);
}
}
}
}
/*
Back and forth, linear convolution for +/- 3 filters using padding
(IN / OUT on non-ghosts)
out: output array (lower level)
up: input array of just ghosts (upper level)
n: size of actual data set being convolved
npad: size of padded region needed for "leakage" of foward filters
over the edge
reversefilter: reverse-time filter coefficients
filt: forward-time filter coefficients
*/
void
convptsP3(double out[],double up[],int n,int npad,double filt[])
{
register int i;
register double t1=filt[1],t3=filt[3];
/* Main reverse loop exploiting zeros */
for (i=n-5; i>2; i-=2) {
out[i]= t3*up[(i-3)/2]+t1*up[(i-1)/2]+t1*up[(i+1)/2]+t3*up[(i+3)/2];
out[i-1]=up[(i-1)/2];
}
}
/*
Back and forth, linear convolution for +/- 8 filters using padding
(IN / OUT on non-ghosts)
out: output array (lower level)
up: input array of just ghosts (upper level)
n: size of actual data set being convolved
npad: size of padded region needed for "leakage" of foward filters
over the edge
reversefilter: reverse-time filter coefficients
forwardfilter: forward-time filter coefficients
*/
void
convptsP8(double out[],double up[],int n,int npad,
double reversefilter[],double forwardfilter[])
{
register int i;
/* Reverse part */
{
register double
t0=reversefilter[0],
t1=reversefilter[1],
t2=reversefilter[2],
t3=reversefilter[3],
t4=reversefilter[4],
t5=reversefilter[5],
t6=reversefilter[6],
t7=reversefilter[7],
t8=reversefilter[8];
/* Pad region */
/* out[npad-1]=0.; */
/* out[npad-2]=t8*up[(npad-2-8)/2]; */
/* out[npad-3]=t7*up[(npad-3-7)/2]; */
/* out[npad-4]=t6*up[(npad-4-6)/2]+t8*up[(npad-4-8)/2]; */
/* out[npad-5]=t5*up[(npad-5-5)/2]+t7*up[(npad-5-7)/2]; */
/* out[npad-6]=t4*up[(npad-6-4)/2]+t6*up[(npad-6-6)/2]+t8*up[(npad-6-8)/2]; */
/* out[npad-7]=t3*up[(npad-7-3)/2]+t5*up[(npad-7-5)/2]+t7*up[(npad-7-7)/2]; */
/* out[npad-8]=t2*up[(npad-8-2)/2]+t4*up[(npad-8-4)/2]+t6*up[(npad-8-6)/2]+t8*up[(npad-8-8)/2]; */
/* Main reverse loop exploiting zeros */
for (i=n-1; i>=8; i-=2)
{
out[i]=t1*up[(i-1)/2]+t3*up[(i-3)/2]+t5*up[(i-5)/2]+t7*up[(i-7)/2];
out[i-1]=t0*up[(i-1)/2]+t2*up[(i-1-2)/2]
+t4*up[(i-1-4)/2]+t6*up[(i-1-6)/2]+t8*up[(i-1-8)/2];
}
/* Fence posting */
/* out[2]=t0*in[2] +t2*in[0]; */
/* out[7]=t1*up[(7-1)/2]+t3*up[(7-3)/2]+t5*up[(7-5)/2]+t7*up[(7-7)/2]; */
/* out[6]=t0*up[(6)/2]+t2*up[(6-2)/2]+t4*up[(6-4)/2]+t6*up[(6-6)/2]; */
/* out[5]=t1*up[(5-1)/2]+t3*up[(5-3)/2]+t5*up[(5-5)/2]; */
/* out[4]=t0*up[(4)/2]+t2*up[(4-2)/2]+t4*up[(4-4)/2]; */
/* out[3]=t1*up[(3-1)/2]+t3*up[(3-3)/2]; */
/* out[2]=t0*up[(2)/2]+t2*up[(2-2)/2]; */
/* out[1]=t1*up[(1-1)/2]; */
/* out[0]=t0*up[(0)/2]; */
}
/* Forward part (padding now ensures room!) */
{
register double
t0=forwardfilter[0],
t1=forwardfilter[1],
t2=forwardfilter[2],
t3=forwardfilter[3],
t4=forwardfilter[4],
t5=forwardfilter[5],
t6=forwardfilter[6],
t7=forwardfilter[7],
t8=forwardfilter[8];
for (i=8; i<n-8; i++)
out[i]=
t0*out[i]
+t1*out[i+1]+t2*out[i+2]+t3*out[i+3]+t4*out[i+4]
+t5*out[i+5]+t6*out[i+6]+t7*out[i+7]+t8*out[i+8];
}
}
void
convptsP8full(double out[],double up[],int n,int npad,double fullfilt[])
{
register int i;
register double t0=fullfilt[0],t1=fullfilt[1],t2=fullfilt[2],
t3=fullfilt[3],t4=fullfilt[4],t5=fullfilt[5],
t6=fullfilt[6],t7=fullfilt[7],t8=fullfilt[8];
if(t0 != 0.0)
{ /* symmetric filter */
/* Main reverse loop exploiting zeros */
for (i=n-9; i>8; i-=2)
{
out[i]= t7*up[(i-7)/2]+t5*up[(i-5)/2]+t3*up[(i-3)/2]
+t1*up[(i-1)/2]+t1*up[(i+1)/2]+t3*up[(i+3)/2]
+t5*up[(i+5)/2]+t7*up[(i+7)/2];
out[i-1]= t8*up[(i-8)/2]+t6*up[(i-6)/2]+t4*up[(i-4)/2]
+t2*up[(i-2)/2]+t0*up[i/2]+t2*up[(i+2)/2]+t4*up[(i+4)/2]
+t6*up[(i+6)/2] +t8*up[(i+8)/2];
}
}
else
{ /* antisymmetric filter */
/* Main reverse loop exploiting zeros */
for (i=n-9; i>8; i-=2)
{
out[i]= -1.*t7*up[(i-7)/2]-t5*up[(i-5)/2]-t3*up[(i-3)/2]
-t1*up[(i-1)/2]+t1*up[(i+1)/2]+t3*up[(i+3)/2]
+t5*up[(i+5)/2]+t7*up[(i+7)/2];
out[i-1]= -1.*t8*up[(i-8)/2]-t6*up[(i-6)/2]-t4*up[(i-4)/2]
-t2*up[(i-2)/2]+t0*up[i/2]+t2*up[(i+2)/2]+t4*up[(i+4)/2]
+t6*up[(i+6)/2] +t8*up[(i+8)/2];
}
}
}
/* I'm not going to change this routine to use anti/symmetric filters, since
it's optimized for L anyway (which is a symmetric filter). -- mhe 4/12/02 */
void
convptsP8fullL(double out[],double up[],int n,int npad,double fullfilt[])
{
register int i;
register double t0=fullfilt[0],t1=fullfilt[1],t2=fullfilt[2],
t3=fullfilt[3],t4=fullfilt[4],t6=fullfilt[6];
/* Main reverse loop exploiting zeros */
for (i=n-9; i>8; i-=2)
{
out[i]= t3*up[(i-3)/2]
+t1*up[(i-1)/2]+t1*up[(i+1)/2]+t3*up[(i+3)/2];
out[i-1]= t6*up[(i-6)/2]+t4*up[(i-4)/2]
+t2*up[(i-2)/2]+t0*up[i/2]+t2*up[(i+2)/2]+t4*up[(i+4)/2]
+t6*up[(i+6)/2];
}
}
/*
Full two-level 3d convolution of transform type (IN PLACE)
Computes I +/- ~P I P, where P is projection onto ghosts, ~P is the
projection onto non-ghosts, and I is the simple interpolet
convolusion, and OBEYS REAL/COEFF SPACE CONVENTIONS on output,
PROVIDED they are obeyed on input.
upper: Points to location in upper data array corresponding to
origin of lower array.
lower: Points to origin of lower array
sign: Sign of transform, + is Idag, - is Jdag
(should be I and J, non??(Torkel))
reversefilter: reverse-time filter coefficients (2d array: [x,y,z][filter])
forwardfilter: forward-time filter coefficients (2d array: [x,y,z][filter])
fullfilter: full filter
nux, nuy, nuz: dimensions of upper array
nlx, nly, nlz: dimensions of lower array
hfl: length of half-filters
work: workspace of size at least (nlx+3*hfl)*(nly+3*hfl)*(nlz+3*hfl)
*/
void
aconvP3d(double upper[], double lower[], int sign,
double reversefilter[], double forwardfilter[],
double fullfilter[],
int nux, int nuy, int nuz,
int nlx, int nly, int nlz,
int hfl,
double work[],double work_extra[])
{
int i,j;
/* Constants giving region of workspace which must be computed */
int nx=nlx+2*hfl;
int ny=nly+2*hfl;
int nz=nlz+2*hfl;
/* Constants giving total size of workspace, including padding for
factored convolultion strategy */
int nwx=nx+hfl;
int nwy=ny+hfl;
int nwz=nz+hfl;
int nwyz=nwy*nwz;
/* Constants describing filling of results into lower region */
int
nlyz=nly*nlz,
fptl= 2*(hfl/2), lptl= 2*(hfl/2)+nlz-1,
fsetl=2*(hfl/2), lsetl=2*(hfl/2)+nlx-1;
double *workp,*upperp,*lowerp;
double *work_extrap;
/* INDEXING NOTES:
(1) Our basic indexing refers to the work array, which has a
buffer region that circumscribes the lower level. The
beginning of the lower level is thus at
(2(hfl/2),2(hfl/2),2(hfl/2)) [in INTEGER arthimetic in the
indexing below.
(2) "upper" is defined to reference the point in the upper array
that corresponds to the origin of the lower array. There are
two things needed to translate properly: a homogeneous scale
factor of two AND to adjust for the fact that the origin of
the work array refers to the point
(-(hfl/2),-(hfl/2),-(hfl/2)) from the origin of the lower
array in the upper array. The scale factor is handled
explicitly in the code below. To adjust for the issue of the
origin, we now simply include this shift into the pointer.
(3) "lower" references the origin of the "lower" array where the
output is supposed to go. We now shift it so that
convsetspo_ can use the referencing of the work array.
*/
upperp=upper-((1*nuy+1)*nuz+1)*(hfl/2);
lowerp=lower-((2*nly+2)*nlz+2)*(hfl/2);
/* Convolution of each yz plane */
for (i=0; i<nx; i+=2) {
workp=work+i*nwyz;
work_extrap=work_extra+i*nwyz;
/* Along z first, pulling data from upper into the work array */
switch (hfl) {
case 3:
for (j=0; j<ny; j+=2)
convptsP3(work+(i*nwy+j)*nwz, upperp+(i*nuy+j)*nuz/2,
nz, nwz, fullfilter+2*(hfl+1));
break;
case 8:
for (j=0; j<ny; j+=2)
convptsP8full(work+(i*nwy+j)*nwz, upperp+(i*nuy+j)*nuz/2,
nz, nwz,fullfilter+2*(hfl+1) );
/* convptsP8(work+(i*nwy+j)*nwz,upperp+(i*nuy+j)*nuz/2, */
/* nz,nwz,reversefilter+2*(hfl+1), */
/* forwardfilter+2*(hfl+1) ); */
break;
default:
fprintf(stderr,"\n\nhfl=%d not supported in aconvP3d (1)\n\n",hfl);
exit(1);
}
/* Now do plane along y */
/* (This seems too big to inline, and fortran runs a lot faster!) */
switch (hfl) {
case 3:
convsetsp3_(work_extrap,
workp,
workp-3*nwz,workp-2*nwz,workp-nwz,
workp+nwz,workp+2*nwz,workp+3*nwz,
&nz,&nwz,&ny,&nwy,
reversefilter+1*(hfl+1),fullfilter+1*(hfl+1) );
/* convsetsp3_(workp, */
/* workp-3*nwz,workp-2*nwz,workp-nwz, */
/* workp+nwz,workp+2*nwz,workp+3*nwz, */
/* &nz,&nwz,&ny,&nwy,reversefilter+hfl+1,forwardfilter+hfl+1); */
break;
case 8:
convsetsp8full_(work_extrap, workp,
workp-8*nwz,workp-7*nwz,workp-6*nwz,workp-5*nwz,
workp-4*nwz,workp-3*nwz,workp-2*nwz,workp-nwz,
workp+nwz,workp+2*nwz,workp+3*nwz,workp+4*nwz,
workp+5*nwz,workp+6*nwz,workp+7*nwz,workp+8*nwz,
&nz,&nwz,&ny,&nwy,
fullfilter+1*(hfl+1) );
/* convsetsp8_(workp, */
/* workp-8*nwz,workp-7*nwz,workp-6*nwz,workp-5*nwz, */
/* workp-4*nwz,workp-3*nwz,workp-2*nwz,workp-nwz, */
/* workp+nwz,workp+2*nwz,workp+3*nwz,workp+4*nwz, */
/* workp+5*nwz,workp+6*nwz,workp+7*nwz,workp+8*nwz, */
/* &nz,&nwz,&ny,&nwy, */
/* reversefilter+1*(hfl+1),forwardfilter+1*(hfl+1) ); */
break;
default:
fprintf(stderr,"\n\nhfl=%d not supported in aconvP3d (2)\n\n",hfl);
exit(1);
}
}
/* Convolutions along z in small chunks, with accumulation from work
onto lower */
for (j=2*(hfl/2); j<2*(hfl/2)+nly; j++) {
workp=work+j*nwz;
work_extrap=work_extra+j*nwz;
if (sign==1){
switch (hfl){
case 3:
convsetspop3_(lowerp+j*nlz,work_extrap,
work_extrap-3*nwyz,work_extrap-2*nwyz,
work_extrap-nwyz,
work_extrap+nwyz,work_extrap+2*nwyz,
work_extrap+3*nwyz,
&nz,&nwyz,&nx,&nwx,
&nlyz,&fptl,&lptl,&fsetl,&lsetl,
fullfilter+0*(hfl+1) );
break;
case 8:
/* printf("Calling convsetspop8full\n\n"); */
convsetspop8full_(lowerp+j*nlz,work_extrap,
work_extrap-8*nwyz,work_extrap-7*nwyz,
work_extrap-6*nwyz,work_extrap-5*nwyz,
work_extrap-4*nwyz,work_extrap-3*nwyz,
work_extrap-2*nwyz,work_extrap-nwyz,
work_extrap+nwyz,work_extrap+2*nwyz,
work_extrap+3*nwyz,work_extrap+4*nwyz,
work_extrap+5*nwyz,work_extrap+6*nwyz,
work_extrap+7*nwyz,work_extrap+8*nwyz,
&nz,&nwyz,&nx,&nwx,
&nlyz,&fptl,&lptl,&fsetl,&lsetl,
fullfilter+0*(hfl+1));
/* convsetspop8_(lowerp+j*nlz,workp, */
/* workp-8*nwyz,workp-7*nwyz,workp-6*nwyz,workp-5*nwyz, */
/* workp-4*nwyz,workp-3*nwyz,workp-2*nwyz,workp-nwyz, */
/* workp+nwyz,workp+2*nwyz,workp+3*nwyz,workp+4*nwyz, */
/* workp+5*nwyz,workp+6*nwyz,workp+7*nwyz,workp+8*nwyz, */
/* &nz,&nwyz,&nx,&nwx, */
/* &nlyz,&fptl,&lptl,&fsetl,&lsetl, */
/* reversefilter+0*(hfl+1),forwardfilter+0*(hfl+1)); */
break;
default:
fprintf(stderr,
"\n\nhfl=%d not supported in aconvP3d (3a)\n\n",hfl);
exit(1);
}
}
/*New possibility, included by Torkel 5/10 */
if (sign==0) {
switch (hfl) {
case 3:
convsetspo3_(lowerp+j*nlz,work_extrap,
work_extrap-3*nwyz,work_extrap-2*nwyz,
work_extrap-nwyz,
work_extrap+nwyz,work_extrap+2*nwyz,
work_extrap+3*nwyz,
&nz,&nwyz,&nx,&nwx,
&nlyz,&fptl,&lptl,&fsetl,&lsetl,
reversefilter+0*(hfl+1),forwardfilter+0*(hfl+1));
break;
case 8:
convsetspo8full_(lowerp+j*nlz,work_extrap,
work_extrap-8*nwyz,work_extrap-7*nwyz,
work_extrap-6*nwyz,work_extrap-5*nwyz,
work_extrap-4*nwyz,work_extrap-3*nwyz,
work_extrap-2*nwyz,work_extrap-nwyz,
work_extrap+nwyz,work_extrap+2*nwyz,
work_extrap+3*nwyz,work_extrap+4*nwyz,
work_extrap+5*nwyz,work_extrap+6*nwyz,
work_extrap+7*nwyz,work_extrap+8*nwyz,
&nz,&nwyz,&nx,&nwx,
&nlyz,&fptl,&lptl,&fsetl,&lsetl,
fullfilter+0*(hfl+1));
/* convsetspo8_(lowerp+j*nlz,workp, */
/* workp-8*nwyz,workp-7*nwyz, */
/* workp-6*nwyz,workp-5*nwyz,workp-4*nwyz, */
/* workp-3*nwyz,workp-2*nwyz,workp-nwyz, */
/* workp+nwyz,workp+2*nwyz,workp+3*nwyz, */
/* workp+4*nwyz,workp+5*nwyz,workp+6*nwyz, */
/* workp+7*nwyz,workp+8*nwyz, */
/* &nz,&nwyz,&nx,&nwx, */
/* &nlyz,&fptl,&lptl,&fsetl,&lsetl, */
/* reversefilter+0*(hfl+1),forwardfilter+0*(hfl+1)); */
break;
default:
fprintf(stderr,
"\n\nhfl=%d not supported in aconvP3d (3a)\n\n",hfl);
exit(1);
}
}
if (sign==-1)
{
switch (hfl)
{
case 3:
convsetspom3_(lowerp+j*nlz,work_extrap,
work_extrap-3*nwyz,work_extrap-2*nwyz,
work_extrap-nwyz,
work_extrap+nwyz,work_extrap+2*nwyz,
work_extrap+3*nwyz,
&nz,&nwyz,&nx,&nwx,
&nlyz,&fptl,&lptl,&fsetl,&lsetl,
fullfilter+0*(hfl+1));
break;
default:
fprintf(stderr,
"\n\nhfl=%d not supported in aconvP3d (3b)\n\n",hfl);
exit(1);
}
}
}
}
/*
Full two-level 3d convolution of transform type (IN PLACE)
Computes I +/- ~P I P, where P is projection onto ghosts, ~P is the
projection onto non-ghosts, and I is the simple interpolet
convolusion, and OBEYS REAL/COEFF SPACE CONVENTIONS on output,
PROVIDED they are obeyed on input.
upper: Points to location in upper data array corresponding to
origin of lower array.
lower: Points to origin of lower array
addon: adds this array to the lower array
sign: Sign of transform, + is I, - is J
revfilt: reverse-time filter coefficients (2d array: [x,y,z][filter])
forwfilt: forward-time filter coefficients (2d array: [x,y,z][filter])
nux, nuy, nuz: dimensions of upper array
nlx, nly, nlz: dimensions of lower array
hfl: length of half-filters
work: workspace of size at least (nlx+3*hfl)*(nly+3*hfl)*(nlz+3*hfl)
*/
void
aconvP3dadd(double upper[], double lower[], double addon[],
double revfilt[], double forwfilt[], double fullfilt[],
int nux, int nuy, int nuz,
int nlx, int nly, int nlz,int hfl,
double work[],double work_extra[])
{
int i,j;
/* Constants giving region of workspace which must be computed */
int nx=nlx+2*hfl;
int ny=nly+2*hfl;
int nz=nlz+2*hfl;
/* Constants giving total size of workspace, including padding for
factored convolultion strategy */
int nwx=nx+hfl;
int nwy=ny+hfl;
int nwz=nz+hfl;
int nwyz=nwy*nwz;
/* Constants describing filling of results into lower region */
int
nlyz=nly*nlz,
fptl= 2*(hfl/2), lptl= 2*(hfl/2)+nlz-1,
fsetl=2*(hfl/2), lsetl=2*(hfl/2)+nlx-1;
double *workp,*upperp,*lowerp,*addonp;
/* seems unused 07/10/00 i.d.
int padxb,padyb,padzb;
padxb=2*(hfl/2);
padyb=2*(hfl/2);
padzb=2*(hfl/2);
*/
/* INDEXING NOTES:
(1) Our basic indexing refers to the work array, which has a
buffer region that circumscribes the lower level. The
beginning of the lower level is thus at
(2(hfl/2),2(hfl/2),2(hfl/2)) [in INTEGER arthimetic in the
indexing below.
(2) "upper" is defined to reference the point in the upper array
that corresponds to the origin of the lower array. There are
two things needed to translate properly: a homogeneous scale
factor of two AND to adjust for the fact that the origin of
the work array refers to the point
(-(hfl/2),-(hfl/2),-(hfl/2)) from the origin of the lower
array in the upper array. The scale factor is handled
explicitly in the code below. To adjust for the issue of the
origin, we now simply include this shift into the pointer.
(3) "lower" references the origin of the "lower" array where the
output is supposed to go. We now shift it so that
convsetspo_ can use the referencing of the work array.
*/
upperp=upper-((1*nuy+1)*nuz+1)*(hfl/2);
lowerp=lower-((2*nly+2)*nlz+2)*(hfl/2);
addonp=addon-((2*nly+2)*nlz+2)*(hfl/2);
/* Convolution of each yz plane */
for (i=0; i<nx; i+=2){
workp=work+i*nwyz;
/* Along z first, pulling data from upper into the work array */
switch (hfl){
case 3:
for (j=0; j<ny; j+=2)
convptsP3(work+(i*nwy+j)*nwz,upperp+(i*nuy+j)*nuz/2,
nz,nwz,fullfilt+2*(hfl+1) );
break;
default:
fprintf(stderr,"\n\nhfl=%d not supported in aconvP3d (1)\n\n",hfl);
exit(1);
}
/* Now do plane along y */
/* (This seems too big to inline, and fortran runs a lot faster!) */
switch (hfl){
case 3:
convsetsp3_(workp,
workp,
workp-3*nwz,workp-2*nwz,workp-nwz,
workp+nwz,workp+2*nwz,workp+3*nwz,
&nz,&nwz,&ny,&nwy,revfilt+hfl+1,fullfilt+hfl+1);
/* convsetsp3_(work_extrap, */
/* workp, */
/* workp-3*nwz,workp-2*nwz,workp-nwz, */
/* workp+nwz,workp+2*nwz,workp+3*nwz, */
/* &nz,&nwz,&ny,&nwy,tr+hfl+1,fullfilter+hfl+1); */
break;
default:
fprintf(stderr,"\n\nhfl=%d not supported in aconvP3d (2)\n\n",hfl);
exit(1);
}
}
/* Convolutions along z in small chunks, with accumulation from work
onto lower */
for (j=2*(hfl/2); j<2*(hfl/2)+nly; j++){
workp=work+j*nwz;
switch (hfl){
case 3:
convsetspoadd3_(lowerp+j*nlz,addonp+j*nlz,
workp,workp-3*nwyz,
workp-2*nwyz,workp-nwyz,
workp+nwyz,workp+2*nwyz,workp+3*nwyz,
&nz,&nwyz,&nx,&nwx,
&nlyz,&fptl,&lptl,&fsetl,&lsetl,
revfilt+0*(hfl+1),forwfilt+0*(hfl+1));
break;
default:
fprintf(stderr,
"\n\nhfl=%d not supported in aconvP3d (3a)\n\n",hfl);
exit(1);
}
}
}
syntax highlighted by Code2HTML, v. 0.9.1