!>
!> @file ppde3d_pb.f90
!>
!> @brief
!>
!> @copyright
!> Copyright (©) 2021 EPFL (Ecole Polytechnique Fédérale de Lausanne)
!> SPC (Swiss Plasma Center)
!>
!> spclibs is free software: you can redistribute it and/or modify it under
!> the terms of the GNU Lesser General Public License as published by the Free
!> Software Foundation, either version 3 of the License, or (at your option)
!> any later version.
!>
!> spclibs is distributed in the hope that it will be useful, but WITHOUT ANY
!> WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
!> FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
!>
!> You should have received a copy of the GNU Lesser General Public License
!> along with this program. If not, see <https://www.gnu.org/licenses/>.
!>
!> @authors
!> (in alphabetical order)
!> @author Trach-Minh Tran <trach-minh.tran@epfl.ch>
!>
PROGRAM main
!
!  Solving the following 3d PDE using splines:
!
!    -d/dx[x d/dx]f - 1/x[d/dy]^2 f = 4(m+1)x^(m+1)cos(my)cos(z)^n, with f(x=1,y) = 0
!    exact solution: f(x,y) = (1-x^2) x^m cos(my)cos(z)^n
!
  USE futils
  USE fft
  USE pputils2, ONLY : pptransp
  USE ppde3d_pb_mod
!
  IMPLICIT NONE
!
  CHARACTER(len=128) :: infile="ppde3d_pb.in"
  INTEGER :: l
  INTEGER :: nx, ny, nz, nidbas(3), ngauss(3), mbess, npow, nterms
  INTEGER :: startz, endz, nzloc
  INTEGER :: start_rank, end_rank, nrank_loc
  LOGICAL :: nlppform
  INTEGER :: i, j, k, kk, ij, dimx, dimy, dimz, nrank, kl, ku
  INTEGER :: jder(3), it
  DOUBLE PRECISION :: pi, coefx(5)
  DOUBLE PRECISION :: dy, dz
  DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE   :: xgrid, ygrid, zgrid
  DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE   :: fftmass
  DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: rhs, sol, rhs_t
  DOUBLE COMPLEX, DIMENSION(:,:), ALLOCATABLE   :: crhs_t
!
  TYPE(spline2d1d), TARGET :: splxyz
  TYPE(spline2d), POINTER  :: splxy
  TYPE(spline1d)           :: splz
  TYPE(pbmat)              :: mat
!
  CHARACTER(len=128) :: file='ppde3d_pb.h5'
  INTEGER :: fid
  DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: arr
  DOUBLE PRECISION, DIMENSION(:,:,:), ALLOCATABLE :: bcoef, solcal, solana, errsol
  DOUBLE PRECISION :: seconds, mem, dopla
  DOUBLE PRECISION :: t0, tmat, tfact, tsolv, tgrid, gflops1, gflops2
  INTEGER :: nits=500
!
  INTEGER, PARAMETER :: npart=10000000
  INTEGER :: nploc
  DOUBLE PRECISION, DIMENSION(npart) :: xp, yp, zp, fp_calc, fp_anal
  DOUBLE PRECISION zsuml, zsumg, errnorm2
!
  INTEGER :: kmin, kmax
  DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE   :: fftmass_shifted
!
  NAMELIST /newrun/ nx, ny, nz, nidbas, ngauss, mbess, npow, nlppform, coefx
!===========================================================================
!              1.0 Prologue
!
!   Init MPI
!
  CALL mpi_init(ierr)
  CALL mpi_comm_size(MPI_COMM_WORLD, npes, ierr)
  CALL mpi_comm_rank(MPI_COMM_WORLD, me, ierr)
!
!   Get input file name from command argument
!
  IF( COMMAND_ARGUMENT_COUNT() .EQ. 1 ) THEN
     CALL GET_COMMAND_ARGUMENT(1, infile, l, ierr)
  END IF
!
!   Read in data specific to run
!
  nx = 8              ! Number of intervals in x
  ny = 8              ! Number of intervals in y
  nz = 8              ! Number of intervals in z
  nidbas = (/3,3,3/)  ! Degree of splines
  ngauss = (/4,4, 4/)    ! Number of Gauss points/interval
  mbess = 2           ! Exponent of differential problem
  npow = 2            ! Exponent of differential problem
  nterms = 2          ! Number of terms in weak form
  nlppform = .TRUE.   ! Use PPFORM for gridval or not
  coefx(1:5) = (/1.0d0, 0.d0, 0.d0, 0.d0, 1.d0/) ! Mesh point distribution function
!
  OPEN(unit=99, file=TRIM(infile), status='old', action='read')
  READ(99,newrun)
  IF( me.EQ.0) THEN
     WRITE(*,newrun)
  END IF
  CLOSE(99)
!
!   Define grid on x (=radial) & y (=poloidal) axis
!
  pi = 4.0d0*ATAN(1.0d0)
  ALLOCATE(xgrid(0:nx), ygrid(0:ny), zgrid(0:nz))
!
  xgrid(0) = 0.0d0;   xgrid(nx) = 1.0d0
  CALL meshdist(coefx, xgrid, nx)
! 
  dy = 2.d0*pi/REAL(ny,8)        ! Equidistant in y
  ygrid = (/ (j*dy, j=0,ny) /)
!
!   Partitionned toroidal grid z
!
  dz = 2.0d0*pi/REAL(nz,8)       ! Equidistant in z
  zgrid = (/ (k*dz, k=0,nz) /)
  CALL dist1d(0, nz, startz, nzloc)
  endz = startz+nzloc
!!$  PRINT*, 'PE', me, ' startz, endz, nzloc', startz, endz, nzloc
!
  IF( me.EQ.0) THEN
     WRITE(*,'(a/(10f8.3))') 'XGRID', xgrid(0:nx)
     WRITE(*,'(a/(10f8.3))') 'YGRID', ygrid(0:ny)
     WRITE(*,'(a/(10f8.3))') 'ZGRID', zgrid(0:nz)
  END IF
!
!   Create hdf5 file
!
  CALL creatf(file, fid, 'PDE2D Result File', real_prec='d', &
       &      mpicomm=MPI_COMM_WORLD)
  CALL attach(fid, '/', 'NX', nx)
  CALL attach(fid, '/', 'NY', ny)
  CALL attach(fid, '/', 'NZ', nz)
  CALL attach(fid, '/', 'NIDBAS1', nidbas(1))
  CALL attach(fid, '/', 'NIDBAS2', nidbas(2))
  CALL attach(fid, '/', 'NIDBAS3', nidbas(3))
  CALL attach(fid, '/', 'NGAUSS1', ngauss(1))
  CALL attach(fid, '/', 'NGAUSS2', ngauss(2))
  CALL attach(fid, '/', 'NGAUSS2', ngauss(3))
  CALL attach(fid, '/', 'MBESS', mbess)
  CALL putarr(fid, '/xgrid', xgrid, 'r')
  CALL putarr(fid, '/ygrid', ygrid, '\theta')
  CALL putarr(fid, '/zgrid', zgrid(0:nz-1), '\phi')
!===========================================================================
!              2.0 Discretize the PDE
!
!   Set up spline
!
  t0 = seconds()
  CALL set_spline(nidbas, ngauss, xgrid, ygrid, zgrid(startz:endz), &
       &          splxyz, (/.FALSE., .TRUE., .TRUE./), nlppform=nlppform)
  splxy => splxyz%sp12
!
  IF( me.EQ.0) THEN
     WRITE(*,'(/a,/(12(f8.3)))') 'KNOTS in X', splxy%sp1%knots
     WRITE(*,'(/a,/(12(f8.3)))') 'KNOTS in Y', splxy%sp2%knots
  END IF
  CALL disp(splxyz%sp3%knots, 'KNOTS in Z', MPI_COMM_WORLD)
!
!   2D FE matrix assembly (in plane x-y)
!
  nrank = (nx+nidbas(1))*ny    ! Rank of the FE matrix
  kl = (nidbas(1)+1)*ny -1     ! Number of sub-diagnonals
  ku = kl                      ! Number of super-diagnonals
  IF(me.EQ.0) THEN
     WRITE(*,'(a,5i6)') 'nrank, kl, ku', nrank, kl, ku
  END IF
!
  CALL init(ku, nrank, nterms, mat)
  CALL dismat(splxy, mat)
  ALLOCATE(arr(nrank))
!
!   BC on Matrix
!
  CALL ibcmat(mat, ny)
  tmat = seconds() - t0
!
!   3D RHS assembly
!
  ALLOCATE(rhs(nrank,0:nzloc+nidbas(3)-1)) ! With right guard cells nzloc:nzloc+nidbas3-1
  ALLOCATE(sol(nrank,0:nzloc-1))
  CALL disrhs3(mbess, npow, splxyz, rhs)
!
  zsuml = SUM(ABS(rhs(:,0:nzloc-1)))
  CALL mpi_reduce(zsuml, zsumg, 1, MPI_DOUBLE_PRECISION, MPI_SUM , 0, &
       &          MPI_COMM_WORLD, ierr)
  IF(me.EQ.0) PRINT*, 'sum of rhs after DISRHS3', zsumg
!
!   FFT in z of RHS
!
  CALL dist1d(1, nrank, start_rank, nrank_loc)
  end_rank = start_rank+nrank_loc-1
  ALLOCATE(rhs_t(0:nz-1,nrank_loc), crhs_t(0:nz-1,nrank_loc))
!
  CALL pptransp(MPI_COMM_WORLD, rhs(:,0:nzloc-1), rhs_t)
  crhs_t = rhs_t
  CALL fourcol(crhs_t,1)
  crhs_t = crhs_t/REAL(nz,8)
!
!   Apply Mass matrix to crhs
!
  CALL set_spline(nidbas(3), ngauss(3), zgrid, splz, .TRUE.)
  kmin =-nz/2
  kmax = nz/2-1
  CALL init_dft(splz, kmin, kmax)
  ALLOCATE(fftmass_shifted(kmin:kmax))
  ALLOCATE(fftmass(0:nz-1))
  CALL calc_fftmass_old(splz, fftmass_shifted)
  DO k=kmin,kmax
     fftmass(MODULO(k+nz,nz)) = fftmass_shifted(k)
  END DO
  IF(me.EQ.0) THEN
     WRITE(*,'(/a/(10(1pe12.3)))') 'Mass matrix', fftmass
  END IF
  DO k=0,nz-1
     crhs_t(k,:) = crhs_t(k,:)/fftmass(k)
  END DO
!
!   Fourier transform back crhs to real space in z
!
  CALL fourcol(crhs_t, -1)
  rhs_t(:,:) = REAL(crhs_t(:,:),8)
  CALL pptransp(MPI_COMM_WORLD, rhs_t, sol)  ! Put the final RHS in SOL
!
!   BC on RHS
!
  CALL ibcrhs3(sol, ny)
!
  IF(me.EQ.0) THEN
     WRITE(*,'(a,1pe12.3)') 'Mem used so far (MB)', mem()
  END IF
!===========================================================================
!              3.0 Solve the dicretized system
!
  t0 = seconds()
  CALL factor(mat)
  tfact = seconds() - t0
  gflops1 = dopla('DPBTRF',nrank,nrank,kl,ku,0) / tfact / 1.d9

  t0 = seconds()
  CALL bsolve(mat, sol)
!
!   Backtransform of solution
!
  DO k=0,nzloc-1
     sol(1:ny-1,k) = sol(ny,k)
  END DO
!
  zsuml = SUM(ABS(sol))
  CALL mpi_reduce(zsuml, zsumg, 1, MPI_DOUBLE_PRECISION, MPI_SUM , 0, &
       &          MPI_COMM_WORLD, ierr)
  IF(me.EQ.0) PRINT*, 'sum of sol', zsumg
!
  tsolv = seconds() - t0
  gflops2 = dopla('DPBTRS',nrank,1,kl,ku,0) / tsolv / 1.d9
!
!   Spline coefficients, taking into account of periodicity in y and z
!   Note: in SOL, y was numbered first.
!
  dimx = splxy%sp1%dim
  dimy = splxy%sp2%dim
  dimz = splxyz%sp3%dim
  ALLOCATE(bcoef(0:dimx-1, 0:dimy-1, 0:dimz-1))
!
!   Get 3D array of spline coefs.
!
  DO j=0,dimy-1
     DO i=0,dimx-1
        ij = MODULO(j,ny) + i*ny + 1
        DO k=0,nzloc-1
           bcoef(i,j,k) = sol(ij,k)
        END DO
     END DO
  END DO
!
!   Get missing coefs from remote guard cells
!
  prev = MODULO(me-1,npes)
  next = MODULO(me+1,npes)
  count = dimx*dimy
  DO i=0,nidbas(3)-1
     CALL mpi_sendrecv(bcoef(0,0,i), count, MPI_DOUBLE_PRECISION, prev, 0, &
          &            bcoef(0,0,nzloc+i), count, MPI_DOUBLE_PRECISION, next, 0, &
          &            MPI_COMM_WORLD, status, ierr)
  END DO
!
  IF(me.EQ.0) THEN
     WRITE(*,'(/a,3i6)') 'dimx, dimy, dimz =', dimx, dimy, dimz
  END IF
!===========================================================================
!              4.0 Check the solution
!
!   Check function values computed with various method
!
  CALL RANDOM_NUMBER(xp)
  CALL RANDOM_NUMBER(yp); yp = 2.d0*pi*yp
  CALL RANDOM_NUMBER(zp); zp = 2.d0*pi*zp
  nploc = 0
  DO i=1,npart
     IF(zp(i).GE.zgrid(startz) .AND. zp(i).LT.zgrid(endz)) THEN
        nploc = nploc+1
        xp(nploc) = xp(i)
        yp(nploc) = yp(i)
        zp(nploc) = zp(i)
     END IF
  END DO
  jder = (/0,0,0/)
  CALL gridval(splxyz, xp(1:nploc), yp(1:nploc), zp(1:nploc), fp_calc(1:nploc), jder, bcoef)
  DO i=1,nploc
     fp_anal(i) = (1-xp(i)**2) * xp(i)**mbess &
          &        * COS(mbess*yp(i)) * COS(zp(i))**npow
  END DO
  errnorm2 =  norm21(fp_calc(1:nploc)-fp_anal(1:nploc))/norm21(fp_calc(1:nploc))
  IF(me.EQ.0) THEN
     WRITE(*,'(a,2(1pe12.3))') 'Relative discretization errors, using random points', &
          &    errnorm2
  END IF
!
  ALLOCATE(solcal(0:nx,0:ny,0:nzloc-1))
  ALLOCATE(solana(0:nx,0:ny,0:nzloc-1))
  ALLOCATE(errsol(0:nx,0:ny,0:nzloc-1))
!
  DO i=0,nx
     DO j=0,ny
        DO k=0,nzloc-1
           kk=startz+k
           solana(i,j,k) = (1-xgrid(i)**2) * xgrid(i)**mbess &
                &        * COS(mbess*ygrid(j)) * COS(zgrid(kk))**npow
        END DO
     END DO
  END DO
!
  jder = (/0,0,0/)
  CALL gridval(splxyz, xgrid, ygrid, zgrid(startz:endz-1), solcal, jder, bcoef)
  t0 = seconds()
  CALL gridval(splxyz, xgrid, ygrid, zgrid(startz:endz-1), solcal, jder)
  tgrid = seconds()-t0
  errsol = solana - solcal
!
  errnorm2 =  norm2(errsol) / norm2(solana)
  IF(me.EQ.0) THEN
     WRITE(*,'(a,2(1pe12.3))') 'Relative discretization errors', &
          &    errnorm2
  END IF
  CALL putarr(fid, '/sol', solcal,pardim=3)
  CALL putarr(fid, '/solana', solana,pardim=3)
!
!   Check derivative d/dx
!
  DO i=0,nx
     DO j=0,ny
        DO k=0,nzloc-1
           IF( mbess .EQ. 0 ) THEN
              solana(i,j,k) = -2.0d0 * xgrid(i) * COS(zgrid(k+startz))**npow
           ELSE
              solana(i,j,k) = (-(mbess+2)*xgrid(i)**2 + mbess) * &
                   &        xgrid(i)**(mbess-1) * COS(mbess*ygrid(j)) * &
                   &        COS(zgrid(k+startz))**npow
           END IF
        END DO
     END DO
  END DO
!
  jder = (/1,0,0/)
  t0 = seconds()
  CALL gridval(splxyz, xgrid, ygrid, zgrid(startz:endz-1), solcal, jder)
  tgrid = tgrid + seconds()-t0
  errsol = solana - solcal
  errnorm2 =  norm2(errsol) / norm2(solana)
  IF(me.EQ.0) THEN
     WRITE(*,'(a,2(1pe12.3))') 'Relative error in d/dx', errnorm2
  END IF
  CALL putarr(fid, '/derivx', solcal, pardim=3)
  CALL putarr(fid, '/derivx_exact', solana,pardim=3)
!
!   Check derivative d/dy
!
  DO i=0,nx
     DO j=0,ny
        DO k=0,nzloc-1
           solana(i,j,k) = -mbess * (1-xgrid(i)**2) * xgrid(i)**mbess * &
                &        SIN(mbess*ygrid(j))* COS(zgrid(k+startz))**npow
        END DO
     END DO
  END DO
!
  jder = (/0,1,0/)
  t0 = seconds()
  CALL gridval(splxyz, xgrid, ygrid, zgrid(startz:endz-1), solcal, jder)
  tgrid = tgrid + seconds()-t0
  errsol = solana - solcal
  errnorm2 =  norm2(errsol) / norm2(solana)
  IF(me.EQ.0) THEN
     WRITE(*,'(a,2(1pe12.3))') 'Relative error in d/dy', errnorm2
  END IF
  CALL putarr(fid, '/derivy', solcal, pardim=3)
  CALL putarr(fid, '/derivy_exact', solana,pardim=3)
!
!   Check derivative d/dz
!
  DO i=0,nx
     DO j=0,ny
        DO k=0,nzloc-1
           solana(i,j,k) = -npow*(1-xgrid(i)**2) * xgrid(i)**mbess &
                &        * COS(mbess*ygrid(j)) * COS(zgrid(k+startz))**(npow-1) &
                &        * SIN(zgrid(k+startz))
        END DO
     END DO
  END DO
!
  jder = (/0,0,1/)
  t0 = seconds()
  IF(nlppform) THEN
     CALL gridval(splxyz, xgrid, ygrid, zgrid(startz:endz-1), solcal, jder)
  ELSE
     CALL gridval(splxyz, xgrid, ygrid, zgrid(startz:endz-1), solcal, jder, bcoef)
  END IF
  tgrid = tgrid + seconds()-t0
  errsol = solana - solcal
  errnorm2 =  norm2(errsol) / norm2(solana)
  IF(me.EQ.0) THEN
     WRITE(*,'(a,2(1pe12.3))') 'Relative error in d/dz', errnorm2
  END IF
  CALL putarr(fid, '/derivz', solcal, pardim=3)
  CALL putarr(fid, '/derivz_exact', solana,pardim=3)
!===========================================================================
!              9.0  Epilogue
!
  IF(me.EQ.0) THEN
     WRITE(*,'(/a)') '---'
     WRITE(*,'(a,1pe12.3)') 'Matrice construction time (s) ', tmat
     WRITE(*,'(a,1pe12.3)') 'Matrice factorisation time (s)', tfact
     WRITE(*,'(a,1pe12.3)') 'Backsolve time (s)            ', tsolv
     WRITE(*,'(a,1pe12.3)') 'gridval time (s)              ', tgrid
     WRITE(*,'(a,2f10.3)') 'Factor/solve Gflop/s', gflops1, gflops2
     WRITE(*,'(a,1pe12.3)') 'Mem used so far (MB)', mem()
  END IF
!
  DEALLOCATE(xgrid, ygrid, zgrid)
  DEALLOCATE(fftmass)
  DEALLOCATE(fftmass_shifted)
  DEALLOCATE(rhs)
  DEALLOCATE(sol)
  DEALLOCATE(rhs_t)
  DEALLOCATE(crhs_t)
  DEALLOCATE(bcoef)
  DEALLOCATE(solcal, solana, errsol)
  DEALLOCATE(arr)
  CALL destroy_sp(splxyz)
  CALL destroy_sp(splz)
  CALL destroy(mat)
!
  CALL closef(fid)
!
  CALL mpi_finalize(ierr)
!===========================================================================
!
CONTAINS
  FUNCTION norm2(x)
!
!  Compute the 2-norm of array x
!
    IMPLICIT NONE
    DOUBLE PRECISION :: norm2
    DOUBLE PRECISION, INTENT(in) :: x(:,:,:)
    DOUBLE PRECISION :: sum2, sum2g
    INTEGER :: i, j
!
    sum2 = 0.0d0
    DO i=1,SIZE(x,1)
       DO j=1,SIZE(x,2)
          DO k=1,SIZE(x,3)
             sum2 = sum2 + x(i,j,k)**2
          END DO
       END DO
    END DO
    CALL mpi_allreduce(sum2, sum2g, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
         &          MPI_COMM_WORLD, ierr)
    norm2 = SQRT(sum2g)
  END FUNCTION norm2
!
  FUNCTION norm21(x)
!
!  Compute the 2-norm of array x
!
    IMPLICIT NONE
    DOUBLE PRECISION :: norm21
    DOUBLE PRECISION, INTENT(in) :: x(:)
    DOUBLE PRECISION :: sum2, sum2g
    INTEGER :: i, j
!
    sum2 = DOT_PRODUCT(x,x)
    CALL mpi_allreduce(sum2, sum2g, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
         &          MPI_COMM_WORLD, ierr)
    norm21 = SQRT(sum2g)
  END FUNCTION norm21
  SUBROUTINE prnmat(label, mat)
    CHARACTER(len=*) :: label
    DOUBLE PRECISION, DIMENSION(:,:), INTENT(in) :: mat
    INTEGER :: i
    WRITE(*,'(/a)') TRIM(label)
    DO i=1,SIZE(mat,1)
       WRITE(*,'(10(1pe12.3))') mat(i,:)
    END DO
  END SUBROUTINE prnmat
END PROGRAM main
!
!+++