program scf
      implicit double precision (a-h, o-z)
      include 'cscf.h'
      include 'msgtypesf.h'
      dimension orbs(nbfn*nbfn), dens(nnbfn), fock(nnbfn),
     $     work(nbfn*nbfn), evals(nbfn)
      data tinit, tonel, ttwoel, tdiag, tdens, tprint /6*0.0d0/
      data eone, etwo, energy, deltad /4*0.0d0/
c     
c     initalize the parallel message passing environment
c     
      call pbeginf
*      call ieeetrap
      me = nodeid()
      nproc = nnodes()
c     
c     initialize a bunch of stuff and initial density matrix
c     
      rjunk = timer()
      call ininrm
      call denges(dens, work)
      tinit = timer()
c
c     make initial orthogonal orbital set for jacobi diagonalizer
c
      call makeob(orbs, work)
c     
c     iterate
c     
      do 10 iter = 1, mxiter
c     
c     make info for sparsity test ... redone every iter to save space
c     
         call makesz(work, schwmax)
c     
c     make the one particle contribution to the fock matrix (in fock)
c     and the partial contribution to the energy
c     
         call oneel(dens, work, schwmax, fock, eone)
         call dgop(1, eone, 1, '+')
         tonel = tonel + timer()
c     
c     compute the two particle contribution and then add up the
c     contributions from each process with dgop
c     
         call twoel(dens, work, schwmax, fock, etwo)
         call dgop(2, etwo, 1, '+')
         call dgop(3, fock, nnbfn, '+')
         ttwoel = ttwoel + timer()
c     
c     only process 0 diagonalizes and updates the density
c     
         if (me.eq.0) then
c     
c     diagonalize the fock matrix ...
c     
            call diagon(fock, orbs, evals, work, tester, iter)
      call flush(6)
            tdiag = tdiag + timer()
c     
c     make the new density matrix in work from orbitals in orbs,
c     compute the norm of the change in the density matrix and
c     then update the density matrix in dens with damping.
c     
            call makden(orbs, work)
            deltad = dendif(dens, work)
            if (iter.eq.1) then
               scale = 0.0d0
            else if (iter .le. 5) then
               if (nbfn .gt. 60) then
                  scale = 0.5d0
               else
                  scale = 0.0d0
               endif
            else
               scale = 0.0d0
            endif
            call damp(scale, dens, work)
            tdens = tdens + timer()
c     
c     add up energy and print out convergence information
c     
            energy = enrep + eone + etwo
            call prnout(iter, energy, deltad, tester)
            tprint = tprint + timer()
         endif
c     
c     brodcast new density matrix and deltad to everyone
c     
         call brdcst(4+MSGDBL, dens, mdtob(nnbfn), 0)
         call brdcst(5+MSGDBL, deltad, mdtob(1), 0)
c     
c     if converged then exit iteration loop
c     
         if (deltad .lt. tol) goto 20
 10   continue
      if(me.eq.0)
     $     write(6,*) ' SCF failed to converge in ', mxiter, ' iters'
c     
c     finished ... print out eigenvalues and occupied orbitals
c     
 20   continue
      call igop(6, icut1, 1, '+')
      call igop(7, icut2, 1, '+')
      call igop(8, icut3, 1, '+')
      if (me.eq.0) then
c     
c     print out timing information
c     
         call prnfin(energy, evals, orbs)
         write(6,1) tinit, tonel, ttwoel, tdiag, tdens, tprint,
     $        nproc
 1       format(/'   init   onel  twoel   diag   dens   print  ncpu'/
     $        '  ------ ------ ------ ------ ------ ------ ------'/
     $        1x, 6f7.2, i7/)
c     
c     print out information on # integrals evaulated each iteration
c     
         nints = nnbfn*(nnbfn+1)/2
         frac  = dfloat(icut3)/dfloat(nints)
         write(6,2) icut1, icut2, icut3, nints, frac
 2       format(/'       No. of integrals screened or computed '
     $        /'       -------------------------------------'/
     $        /1x,'   #ij test   #kl test   #compute     #total',
     $        '  fraction',
     $        /1x,'  ---------  ---------  ---------  ---------',
     $        '  --------',
     $        /1x,4(2x,i9),f9.3)
         call stats
      endif
c     
      call pend
      call fexit
c     
      end
      subroutine makesz(schwarz, schwmax)
      implicit double precision (a-h, o-z)
      include 'cscf.h'
      include 'msgtypesf.h'
      dimension schwarz(nnbfn)
c
c     schwarz(ij) = (ij|ij) for sparsity test
c
      icut1 = 0
      icut2 = 0
      icut3 = 0
      ij = 0
      schwmax = 0.0d0
      call dfill(nnbfn, 0.0d0, schwarz, 1)
c
      me = nodeid()
      nproc = nnodes()
      do 10 i = 1, nbfn
         do 20 j = 1, i
            ij = ij + 1
            if (mod(ij,nproc).eq.me) then
               call g(gg, i, j, i, j)
               schwarz(ij) = sqrt(gg)
               schwmax = max(schwmax, schwarz(ij))
            endif
 20      continue
 10   continue
c
      call dgop(101+MSGDBL, schwarz, nnbfn, '+')
      call dgop(102+MSGDBL, schwmax, 1, 'max')
c
      end
      subroutine ininrm
      implicit double precision (a-h, o-z)
      include 'cscf.h'
c
c     write a little welcome message
c
      if (nodeid().eq.0) write(6,1) natom, nocc, nbfn, tol
1     format(/' Example Direct Self Consistent Field Program '/
     $        ' -------------------------------------------- '//
     $        ' no. of atoms ............... ',i3/
     $        ' no. of occupied orbitals ... ',i3/
     $        ' no. of basis functions ..... ',i3/
     $        ' convergence threshold ...... ',d9.2//)
c
c     generate normalisation coefficients for the basis functions
c     and the index array iky
c
      do 10 i = 1, nbfn
         iky(i) = i*(i-1)/2
 10   continue
c
      do 20 i = 1, nbfn
         rnorm(i) = (expnt(i)*2.0d0/pi)**0.75d0
 20   continue
c
c     initialize common for computing f0
c
      call setfm
c
      end
      double precision function h(i,j)
      implicit double precision (a-h, o-z)
      include 'cscf.h'
cvd$r novector
cvd$r noconcur
c
c     generate the one particle hamiltonian matrix element
c     over the normalized primitive 1s functions i and j
c
      f0val = 0.0d0
      sum = 0.0d0
      rab2 = (x(i)-x(j))**2 + (y(i)-y(j))**2 + (z(i)-z(j))**2
      facij = expnt(i)*expnt(j)/(expnt(i)+expnt(j))
      expij = exprjh(-facij*rab2)
      repij = (2.0d0*pi/(expnt(i)+expnt(j))) * expij
c
c     first do the nuclear attraction integrals
c
      do 10 iat = 1, natom
         xp = (x(i)*expnt(i) + x(j)*expnt(j))/(expnt(i)+expnt(j))
         yp = (y(i)*expnt(i) + y(j)*expnt(j))/(expnt(i)+expnt(j))
         zp = (z(i)*expnt(i) + z(j)*expnt(j))/(expnt(i)+expnt(j))
         rpc2 = (xp-ax(iat))**2 + (yp-ay(iat))**2 + (zp-az(iat))**2
c
         call f0(f0val, (expnt(i)+expnt(j))*rpc2)
         sum = sum - repij * q(iat) * f0val
 10   continue
c
c     add on the kinetic energy term
c
      sum = sum + facij*(3.0d0-2.0d0*facij*rab2) *
     $     (pi/(expnt(i)+expnt(j)))**1.5d0 * expij
c
c     finally multiply by the normalization constants
c
      h = sum * rnorm(i) * rnorm(j)
c
      end
      double precision function s(i,j)
      implicit double precision (a-h, o-z)
      include 'cscf.h'
c
c     generate the overlap matrix element between the normalized
c     primitve gaussian 1s functions i and j
c
      rab2 = (x(i)-x(j))**2 + (y(i)-y(j))**2 + (z(i)-z(j))**2
      facij = expnt(i)*expnt(j)/(expnt(i)+expnt(j))
      s = (pi/(expnt(i)+expnt(j)))**1.5d0 * exprjh(-facij*rab2) *
     $     rnorm(i)*rnorm(j)
c
      end
      subroutine makden(orbs, dens)
      implicit double precision (a-h, o-z)
      include 'cscf.h'
      dimension orbs(nbfn, nbfn), dens(nnbfn)
c
c     generate density matrix from orbitals in orbs. the first
c     nocc orbitals are doubly occupied. Note that the diagonal
c     elements are scaled by 0.5
c
      ij = 0
      do 10 i = 1, nbfn
         do 20 j = 1, i
            p = 0.0d0
            do 30 k = 1, nocc
               p = p + orbs(i,k)*orbs(j,k)
 30         continue
            ij = ij + 1
            dens(ij) = 2.0d0 * p
 20      continue
         dens(ij) = dens(ij)*0.5d0
 10   continue
c
      end
      subroutine oneel(dens, schwarz, schwmax, fock, eone)
      implicit double precision (a-h, o-z)
      include 'cscf.h'
      dimension dens(nnbfn), fock(nnbfn), schwarz(nnbfn)
c
c     fill in the one-electron part of the fock matrix and
c     compute the one-electron energy contribution
c
c     simple structure to share out the work between processes
c
      me = nodeid()
      nproc = nnodes()
c
      call dfill(nnbfn, 0.0d0, fock, 1)
      do 10 i = me+1, nbfn, nproc
         do 20 j = 1,i
            ij = iky(i) + j
            if (schwarz(ij)*schwmax.gt.tol2e) fock(ij) = h(i,j)
 20      continue
 10   continue
      eone = ddot(nnbfn, fock, 1, dens, 1)
c
      end
      integer function nxtask(nproc)
      parameter (ichunk = 10)
      save icount, nleft
      data nleft, icount /0, 0/
c
c     wrapper round nxtval() to increase granularity
c     and thus reduce no. of requests to shared counter
c
      if(nproc.gt.0) then
         if(nleft.eq.0) then
            icount = nxtval(nproc) * ichunk
            nleft = ichunk
         endif
         nxtask = icount
         icount = icount + 1
         nleft = nleft -1
      else
          nleft = 0
          nxtask = 0
          junk = nxtval(nproc)
      endif
c
c     following does dumb static load balancing
c
c$$$      if(nproc.gt.0) then
c$$$         if (nleft .eq. 0) then
c$$$            icount = nodeid()
c$$$            nleft = 1
c$$$         endif
c$$$         nxtask = icount
c$$$         icount = icount + nnodes()
c$$$      else
c$$$          nleft = 0
c$$$          nxtask = 0
c$$$      endif
      end
      subroutine twoel(dens, schwarz, schwmax, fock, etwo)
      implicit double precision (a-h, o-z)
      include 'cscf.h'
      dimension dens(nnbfn), fock(nnbfn), schwarz(nnbfn)
c     
c     add in the two-electron contribution to the fock matrix
c     
      nproc = nnodes()
c     
      gg = 0.0d0
c     
      next = nnbfn - nxtask(nproc)
      do 10 i = nbfn, 1, -1
         do 20 j = i, 1, -1
            ij = iky(i) + j
            if (ij .eq. next) then
               if (schwarz(ij)*schwmax .lt. tol2e) then
                  icut1 = icut1 + ij
               else
                  do 30 k = 1, i
                     lhi = k
                     if (k.eq.i) lhi = j
                     do 40 l = 1, lhi
                        kl = iky(k) + l
                        if (schwarz(ij)*schwarz(kl).lt.tol2e) then
                           icut2 = icut2 + 1
                        else
                           icut3 = icut3 + 1
c     
c     compute value of integral (ij|kl) and add into fock matrix
c     
                           call g(gg, i, j, k, l)
                           call addin(gg*0.5d0, i, j, k, l, fock,
     $                          dens, iky)
                        endif
 40                  continue
 30               continue
               endif
               next = nnbfn - nxtask(nproc)
            endif
 20      continue
 10   continue
c     
      etwo = ddot(nnbfn, fock, 1, dens, 1)
c     
c     wait for all processes to finish work
c     
      junk = nxtask(-nproc)
c     
      end
      subroutine damp(fac, dens, work)
      implicit double precision (a-h, o-z)
      include 'cscf.h'
      dimension dens(nnbfn), work(nnbfn)
c
      ofac = 1.0d0 - fac
      do 10 i = 1, nnbfn
         dens(i) = fac*dens(i) + ofac*work(i)
10    continue
c
      end
      subroutine prnout(iter, energy, deltad, tester)
      implicit double precision (a-h, o-z)
      include 'cscf.h'
c
c     printout results of each iteration
c
      write(6,1) iter, energy, deltad, tester
      call flush(6)
1     format(' iter=',i3,', energy=',f13.8,', deltad=',d9.2,
     $     ', deltaf=',d9.2)
c
      end
      double precision function dendif(dens, work)
      implicit double precision (a-h, o-z)
      include 'cscf.h'
      dimension dens(nnbfn), work(nnbfn)
c
c     compute largest change in density matrix elements
c
      denmax = 0.0d0
      do 10 i = 1, nnbfn
         denmax = max(denmax, abs(work(i)-dens(i)))
10    continue
      dendif = denmax
c
      end
      subroutine prnfin(energy, evals, orbs)
      implicit double precision (a-h, o-z)
      include 'cscf.h'
      dimension evals(nbfn), orbs(nbfn, nbfn)
c
c     printout final results
c
      write(6,1) energy
 1    format(//' final energy = ',f16.11//' eigenvalues')
      call output(evals, 1, min(nbfn,nocc+5), 1, 1, nbfn, 1, 1)
      write(6,2)
2     format(//' eigenvectors ')
      call output(orbs, 1, nbfn, 1, nocc, nbfn, nbfn, 1)
c
      end
      subroutine g(value,i,j,k,l)
      implicit double precision (a-h, o-z)
      include 'cscf.h'
c
c     compute the two electon integral (ij|kl) over normalized
c     primitive 1s gaussians
c
      f0val = 0.0d0
      rab2 = (x(i)-x(j))**2 + (y(i)-y(j))**2 + (z(i)-z(j))**2
      rcd2 = (x(k)-x(l))**2 + (y(k)-y(l))**2 + (z(k)-z(l))**2
      facij = expnt(i)*expnt(j)/(expnt(i)+expnt(j))
      fackl = expnt(k)*expnt(l)/(expnt(k)+expnt(l))
      exijkl = exprjh(- facij*rab2 - fackl*rcd2)
      denom = (expnt(i)+expnt(j))*(expnt(k)+expnt(l)) *
     $        sqrt(expnt(i)+expnt(j)+expnt(k)+expnt(l))
      fac = (expnt(i)+expnt(j))*(expnt(k)+expnt(l)) /
     $        (expnt(i)+expnt(j)+expnt(k)+expnt(l))
c
      xp = (x(i)*expnt(i) + x(j)*expnt(j))/(expnt(i)+expnt(j))
      yp = (y(i)*expnt(i) + y(j)*expnt(j))/(expnt(i)+expnt(j))
      zp = (z(i)*expnt(i) + z(j)*expnt(j))/(expnt(i)+expnt(j))
      xq = (x(k)*expnt(k) + x(l)*expnt(l))/(expnt(k)+expnt(l))
      yq = (y(k)*expnt(k) + y(l)*expnt(l))/(expnt(k)+expnt(l))
      zq = (z(k)*expnt(k) + z(l)*expnt(l))/(expnt(k)+expnt(l))
      rpq2 = (xp-xq)**2 + (yp-yq)**2 + (zp-zq)**2
c
      call f0(f0val, fac*rpq2)
      value = (2.0d0 * pi**2.5d0 / denom) * exijkl * f0val *
     $    rnorm(i)*rnorm(j)*rnorm(k)*rnorm(l)
c
      end
      subroutine diagon(fock, orbs, evals, work, tester, iter)
      implicit double precision (a-h, o-z)
      include 'cscf.h'
      dimension fock(nnbfn), orbs(nbfn, nbfn), evals(nbfn),
     $     work(nbfn*nbfn)
      dimension ilifq(nbfn), work2(nbfn*nbfn)
c
      do 10 i = 1, nbfn
         ilifq(i) = (i-1)*nbfn
 10   continue
c
c     transform fock matrix from AO to MO basis
c
      call zsqua(nbfn, fock, work)
      call dgemm('n', 'n', nbfn, nbfn, nbfn, 1.0d0, 
     $     work, nbfn, orbs, nbfn, 0.0d0, work2, nbfn)
      call dgemm('t', 'n', nbfn, nbfn, nbfn, 1.0d0, 
     $     orbs, nbfn, work2, nbfn, 0.0d0, work, nbfn)
      call zfold(nbfn, fock, work)
c
      iop1 = 1
      iop2 = 2
      tester = 0.0d0
      do 20 i = 2, nbfn
         do 30 j = 1, i-1
            tester = max(tester, abs(fock(iky(i)+j)))
 30      continue
 20   continue
c
      if (tester.gt.0.3d0) then
        shift = 0.3d0
      else
        if (nbfn .gt. 60) then
           shift = 0.1d0
        else
           shift = 0.0d0
        endif
      endif	
      if (iter.ge.2) then
         do 40 i = nocc+1, nbfn
            fock(iky(i)+i) = fock(iky(i)+i) + shift
 40      continue
      endif
      thresh = min(0.0001d0,max(1.0d-12,tester*0.01d0))
      call jacobi(fock, iky, nbfn, orbs, ilifq, nbfn, evals, iop1,
     *     iop2,thresh)
      if (iter.ge.2) then
         do 50 i = nocc+1, nbfn
            evals(i) = evals(i) - shift
 50      continue
      endif
c
      end
      subroutine makeob(orbs, work)
      implicit double precision (a-h, o-z)
      include 'cscf.h'
      include 'msgtypesf.h'
      dimension orbs(nbfn,nbfn), work(nbfn,nbfn)
      dimension tmp1(nbfn), tmp2(nbfn)
c
c     generate set of orthonormal vectors by orthoging twice
c     a set of random vectors ... don't do this at home!
c     ... should really diagonalize the overlap to get sym adaption
c
      if (nodeid().eq.0) then
         call srand48(12345)
         do 10 i = 1, nbfn
            do 20 j = 1, nbfn
               work(j,i) = s(i,j)
               orbs(j,i) = drand48(0)
 20         continue
 10      continue
         call orthv2(nbfn, orbs, work, tmp1, tmp2)
         call orthv2(nbfn, orbs, work, tmp1, tmp2)
      endif
      call brdcst(99+MSGDBL, orbs, mdtob(nbfn*nbfn), 0)
c
      end
      subroutine denges(dens, work)
      implicit double precision (a-h, o-z)
      include 'cscf.h'
      dimension dens(nnbfn), work(nbfn, nbfn)
c
c     Form guess density from superposition of atomic densities in the AO
c     basis set ... instead of doing the atomic SCF hardwire for this
c     small basis set for the Be atom.
c
      dimension atdens(15,15)
      data atdens/
     $     0.000002,0.000027,0.000129,0.000428,0.000950,0.001180,
     $     0.000457,-0.000270,-0.000271,0.000004,0.000004,0.000004,
     $     0.000004,0.000004,0.000004,0.000027,0.000102,0.000987,
     $     0.003269,0.007254,0.009007,0.003492,-0.002099,-0.002108,
     $     0.000035,0.000035,0.000035,0.000035,0.000035,0.000035,
     $     0.000129,0.000987,0.002381,0.015766,0.034988,0.043433,
     $     0.016835,-0.010038,-0.010082,0.000166,0.000166,0.000166,
     $     0.000166,0.000166,0.000166,0.000428,0.003269,0.015766,
     $     0.026100,0.115858,0.144064,0.055967,-0.035878,-0.035990,
     $     0.000584,0.000584,0.000584,0.000584,0.000584,0.000584,
     $     0.000950,0.007254,0.034988,0.115858,0.128586,0.320120,
     $     0.124539,-0.083334,-0.083536,0.001346,0.001346,0.001346,
     $     0.001346,0.001346,0.001346,0.001180,0.009007,0.043433,
     $     0.144064,0.320120,0.201952,0.159935,-0.162762,-0.162267,
     $     0.002471,0.002471,0.002471,0.002471,0.002471,0.002471,
     $     0.000457,0.003492,0.016835,0.055967,0.124539,0.159935,
     $     0.032378,-0.093780,-0.093202,0.001372,0.001372,0.001372,
     $     0.001372,0.001372,0.001372,-0.000270,-0.002099,-0.010038,
     $     -0.035878,-0.083334,-0.162762,-0.093780,0.334488,0.660918,
     $     -0.009090,-0.009090,-0.009090,-0.009090,-0.009090,-0.009090,
     $     -0.000271,-0.002108,-0.010082,-0.035990,-0.083536,-0.162267,
     $     -0.093202,0.660918,0.326482,-0.008982,-0.008982,-0.008981,
     $     -0.008981,-0.008981,-0.008982,0.000004,0.000035,0.000166,
     $     0.000584,0.001346,0.002471,0.001372,-0.009090,-0.008982,
     $     0.000062,0.000124,0.000124,0.000124,0.000124,0.000124,
     $     0.000004,0.000035,0.000166,0.000584,0.001346,0.002471,
     $     0.001372,-0.009090,-0.008982,0.000124,0.000062,0.000124,
     $     0.000124,0.000124,0.000124,0.000004,0.000035,0.000166,
     $     0.000584,0.001346,0.002471,0.001372,-0.009090,-0.008981,
     $     0.000124,0.000124,0.000062,0.000124,0.000124,0.000124,
     $     0.000004,0.000035,0.000166,0.000584,0.001346,0.002471,
     $     0.001372,-0.009090,-0.008981,0.000124,0.000124,0.000124,
     $     0.000062,0.000124,0.000124,0.000004,0.000035,0.000166,
     $     0.000584,0.001346,0.002471,0.001372,-0.009090,-0.008981,
     $     0.000124,0.000124,0.000124,0.000124,0.000062,0.000124,
     $     0.000004,0.000035,0.000166,0.000584,0.001346,0.002471,
     $     0.001372,-0.009090,-0.008982,0.000124,0.000124,0.000124,
     $     0.000124,0.000124,0.000062/
c
      call dfill(nbfn*nbfn,0.0d0,work,1)
      do 10 iat = 1,natom
         ioff = (iat-1)*15
         do 20 i = 1,15
            do 30 j = 1,15
               work(ioff+j,ioff+i) = atdens(j,i)*0.5d0
 30         continue
 20      continue
 10   continue
c
      call zfold(nbfn,dens,work)
c
      end