program main implicit double precision (a-h, o-z) dimension q(1) include 'msgtypesf.h' parameter (niter = 30, maxp = 2047) dimension jlo(0:maxp), nj(0:maxp) data iqoff, iqaddr/0, 0/ c c Simple iterative (Jacobi) linear equation solver (Ax = b) c c Initialize the message passing environment c call pbeginf c call evon c c Read in the dimension of the matrix on process 0 and c broadcast its value to everyone else c if (nodeid() .eq. 0) then write(6,1) 1 format(' Input dimension of matrix: '$) read(5,*) n endif call brdcst(1+MSGINT, n, mitob(1), 0) c c makjlo assigns a range of columns to each process. c ncolp is the number that this process is to do. c call makjlo(n, jlo, nj) ncolp = nj(nodeid()) c c allocate memory c need = n*ncolp + 2*n + 2*ncolp call getmem(need, q, iqaddr, iqoff) if (iqaddr.eq.0) call parerr(999) ia = iqoff + 1 ib = ia + n*ncolp ix = ib + ncolp is = ix + ncolp iw = is + n c c Make matrix (a), rhs vector (b) and initial guess (x) c call makabx(n, jlo(nodeid()), ncolp, q(ia), q(ib), q(ix)) c c Do niter iterations of the Jacobi algorithm. c Synchronize first for accurate timings. c call synch(13) rjunk = timer() call jacobi(n, jlo, nj, q(ia), q(ib), q(ix), q(is), q(iw), niter) used = timer() c c Print out results c rmflop = dble(niter*2*n)*dble(n) / (used*1.0d6) if (nodeid() .eq. 0) write(6,2) n, used, nnodes(), rmflop 2 format(' N=',i4,' used ',f6.2,' secs with ',i3,' processes', $ ', mflop=', f8.2) c call pend call fexit end subroutine jacobi(n, jlo, nj, a, b, x, s, work, niter) implicit double precision (a-h, o-z) dimension a(n, *), b(*), x(*), s(n), work(n), $ jlo(0:*), nj(0:*) c c Apply niter iterations of the Jacobi algorithm. c me = nodeid() do 10 iter = 1, niter c c Compute matrix vector product Ax ... this is the real work c c Do the part that we have c call mxv(a, n, x, nj(me), s) c c Now we have to add up the result over all the processors c Call dgop for simple but inefficient version. Call mxvadd c for a much more efficient version c c call dgop(2, s, n, '+') c call mxvadd(s, work, jlo, nj) c c Compute our part of the update vector and compute c the residual error (the error requires a global sum) c err = 0.0d0 do 20 j = jlo(me), jlo(me) + nj(me) - 1 jj = j - jlo(me) + 1 x(jj) = x(jj) + (b(jj)-s(j))/a(j,jj) err = err + abs((b(jj)-s(j))) 20 continue c c Write out results every now and again c if (mod(iter,10).eq.0) then call dgop(3, err, 1, '+') if (nodeid().eq.0) write(6,1) iter, err 1 format(' Iteration',i3,', Error',d9.2) endif 10 continue c end subroutine makabx(n, jlo, nj, a, b, x) implicit double precision (a-h, o-z) dimension a(n, nj), b(nj), x(nj) c jhi = jlo + nj - 1 do 10 j = jlo, jhi jj = j - jlo + 1 cvd$ novector do 20 i = 1, n a(i,jj) = dble(i+j) / dble(abs(i-j)*n+n/50+1) 20 continue b(jj) = dble(mod(j,3)) x(jj) = b(jj) / a(j,jj) 10 continue c end subroutine makjlo(n, jlo, nj) dimension jlo(0:*), nj(0:*) c ncolp = n / nnodes() next = n - (ncolp*nnodes()) jjlo = 1 do 10 iproc = 0, nnodes()-1 jlo(iproc) = jjlo jjlo = jjlo + ncolp if (iproc.lt.next) jjlo = jjlo + 1 nj(iproc) = jjlo - jlo(iproc) 10 continue c end subroutine mxvadd(s, work, jlo, nj) implicit real*8 (a-h, o-z) include 'msgtypesf.h' dimension s(*), work(*), jlo(0:*), nj(0:*) logical synch parameter (synch=.true.) c c We have in s(1:n) this process's contribution to the c matrix vector product A*x where we had nj(me) elements c of x starting at element jlo(me). Each process needs c to end up with the same elements of the full result c vector s. c c Thus we need to send to each process (ip) the elements c s(jlo(ip)+k-1), k=1,nj(ip). And we need to receive from c each process our piece of s which we add onto our result c vector. c c If communication is synchronous then we must explictly pair up c send/receive requests on this process with the matching c receive/send operations on other processes. c c There is potential for much more asynch stuff but the damned c iPSC-i860 hangs (irreproducibly) if we send off too many c unresolved asynchronous messages (how many is too much?). c me = nodeid() nproc = nnodes() nn = nproc + mod(nproc,2) c if (synch) then do 10 iter = 1, nn-1 call pairup(nn, me, iter, ip) if (ip.lt.nproc) then if (me. lt. ip) then call snd(3+MSGDBL, s(jlo(ip)), mdtob(nj(ip)), ip, 1) call rcv(3+MSGDBL, work, mdtob(nj(me)), lenmes, ip, & node, 1) else if (me.gt.ip) then call rcv(3+MSGDBL, work, mdtob(nj(me)), lenmes, ip, & node, 1) call snd(3+MSGDBL, s(jlo(ip)), mdtob(nj(ip)), ip, 1) endif call vvadd(nj(me), s(jlo(me)), work) endif 10 continue else do 20 iter = 1, nn-1 call pairup(nn, me, iter, ip) if (ip.lt.nproc) then call snd(3+MSGDBL, s(jlo(ip)), mdtob(nj(ip)), ip, 0) call rcv(3+MSGDBL, work, mdtob(nj(me)), lenmes, ip, node, 1) call vvadd(nj(me), s(jlo(me)), work) endif 20 continue call waitcom(-1) endif c end subroutine pairup(n, me, iter, ipair) c c one of many ways of generating maximally overlapped pairs c (not all that good on a hypercube though!) c if (iter.eq.1) then ipair = mod(n+1-me,n) else if (me.eq.0) then ipair = iter else if (me.eq.iter) then ipair = 0 else if (ipair.eq.0) ipair = me ipair = ipair + 2 if (ipair.ge.n) ipair = ipair + 1 - n endif end subroutine vvadd(n, a, b) implicit real*8 (a-h, o-z) dimension a(*), b(*) c do 10 i = 1,n a(i) = a(i) + b(i) 10 continue c end