!>
!> @file ppde3d_mod.f90
!>
!> @brief
!>
!> @copyright
!> Copyright (©) 2021 EPFL (Ecole Polytechnique Fédérale de Lausanne)
!> SPC (Swiss Plasma Center)
!>
!> spclibs is free software: you can redistribute it and/or modify it under
!> the terms of the GNU Lesser General Public License as published by the Free
!> Software Foundation, either version 3 of the License, or (at your option)
!> any later version.
!>
!> spclibs is distributed in the hope that it will be useful, but WITHOUT ANY
!> WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
!> FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
!>
!> You should have received a copy of the GNU Lesser General Public License
!> along with this program. If not, see <https://www.gnu.org/licenses/>.
!>
!> @authors
!> (in alphabetical order)
!> @author Trach-Minh Tran <trach-minh.tran@epfl.ch>
!>
MODULE ppde3d_mod
  USE bsplines
  USE matrix
  IMPLICIT NONE
  INCLUDE "mpif.h"
!
  INTEGER :: me, npes
  INTEGER :: prev, next
  INTEGER :: count, status(MPI_STATUS_SIZE), ierr
!
CONTAINS
  SUBROUTINE dismat(spl, mat)
!
!   Assembly of FE matrix mat using spline spl
!
    TYPE(spline2d), INTENT(in) :: spl
    TYPE(gbmat), INTENT(inout) :: mat
!
    INTEGER :: n1, nidbas1, ndim1, ng1
    INTEGER :: n2, nidbas2, ndim2, ng2
    INTEGER :: i, j, ig1, ig2
    INTEGER :: iterm, iw1, iw2, igw1, igw2, it1, it2, igt1, igt2, irow, jcol
    DOUBLE PRECISION, ALLOCATABLE :: xg1(:), wg1(:), fun1(:,:)
    DOUBLE PRECISION, ALLOCATABLE :: xg2(:), wg2(:), fun2(:,:)
    DOUBLE PRECISION:: contrib
!
    INTEGER :: kterms         ! Number of terms in weak form
    INTEGER, DIMENSION(:,:), ALLOCATABLE :: idert, iderw  ! Derivative order
    DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE  :: coefs ! Terms in weak form
!===========================================================================
!              1.0 Prologue
!
!   Properties of spline space
!
    CALL get_dim(spl%sp1, ndim1, n1, nidbas1)
    CALL get_dim(spl%sp2, ndim2, n2, nidbas2)
!
!   Weak form
!
    kterms = mat%nterms
    ALLOCATE(idert(kterms,2), iderw(kterms,2), coefs(kterms))
!
    ALLOCATE(fun1(0:nidbas1,0:1)) ! Spline and 1st derivative
    ALLOCATE(fun2(0:nidbas2,0:1)) ! 
!
!   Gauss quadature
!
    CALL get_gauss(spl%sp1, ng1)
    CALL get_gauss(spl%sp2, ng2)
    ALLOCATE(xg1(ng1), wg1(ng1))
    ALLOCATE(xg2(ng1), wg2(ng1))
!===========================================================================
!              2.0 Assembly loop
!
    DO i=1,n1
       CALL get_gauss(spl%sp1, ng1, i, xg1, wg1)
       DO ig1=1,ng1
          CALL basfun(xg1(ig1), spl%sp1, fun1, i)
          DO j=1,n2
             CALL get_gauss(spl%sp2, ng2, j, xg2, wg2)
             DO ig2=1,ng2
                CALL basfun(xg2(ig2), spl%sp2, fun2, j)
                CALL coefeq(xg1(ig1), xg2(ig2), idert, iderw, coefs)
                DO iterm=1,kterms
                   DO iw1=0,nidbas1  ! Weight function in dir 1
                      igw1 = i+iw1
                      DO iw2=0,nidbas2  ! Weight function in dir 2
                         igw2 = MODULO(j+iw2-1, n2) + 1
                         irow = igw2 + (igw1-1)*n2
                         DO it1=0,nidbas1  ! Test function in dir 1
                            igt1 = i+it1
                            DO it2=0,nidbas2  ! Test function in dir 2
                               igt2 = MODULO(j+it2-1, n2) + 1
                               jcol = igt2 + (igt1-1)*n2
                               contrib = fun1(iw1,iderw(iterm,1)) * &
                                    &    fun2(iw2,iderw(iterm,2)) * &
                                    &    coefs(iterm) *             &
                                    &    fun2(it2,idert(iterm,2)) * &
                                    &    fun1(it1,idert(iterm,1)) * &
                                    &    wg1(ig1) * wg2(ig2)
                               CALL updtmat(mat, irow, jcol, contrib)
                            END DO
                         END DO
                      END DO
                   END DO
                END DO
             END DO
          END DO
       END DO
    END DO
!===========================================================================
!              9.0  Epilogue
!
    DEALLOCATE(xg1, wg1, fun1)
    DEALLOCATE(xg2, wg2, fun2)
    DEALLOCATE(idert, iderw, coefs)
!
  CONTAINS
    SUBROUTINE coefeq(x, y, idt, idw, c)
      DOUBLE PRECISION, INTENT(in) :: x, y
      INTEGER, INTENT(out) :: idt(:,:), idw(:,:)
      DOUBLE PRECISION, INTENT(out) :: c(SIZE(idt,1))
!
! Weak form = Int(x*dw/dx*dt/dx + 1/x*dw/dy*dt/dy)dxdy
!
      c(1) = x        ! 
      idt(1,1) = 1
      idt(1,2) = 0
      idw(1,1) = 1
      idw(1,2) = 0
!
      c(2) = 1.d0/x
      idt(2,1) = 0
      idt(2,2) = 1
      idw(2,1) = 0
      idw(2,2) = 1
    END SUBROUTINE coefeq
  END SUBROUTINE dismat
!++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  SUBROUTINE disrhs3(mbess, npow, spl, rhs)
!
!   Assembly the RHS using 3d spline spl
!
    INTEGER, INTENT(in)           :: mbess, npow
    TYPE(spline2d1d), TARGET      :: spl
    DOUBLE PRECISION, INTENT(out) :: rhs(:,:)
!
    TYPE(spline1d), POINTER :: sp1, sp2, sp3
    INTEGER :: n1, nidbas1, ndim1, ng1
    INTEGER :: n2, nidbas2, ndim2, ng2
    INTEGER :: n3, nidbas3, ndim3, ng3
    INTEGER :: i, j, k, ig1, ig2, ig3, k1, k2, k3, i1, j2, ij, kk, nrank
    DOUBLE PRECISION, ALLOCATABLE :: xg1(:), wg1(:), fun1(:,:)
    DOUBLE PRECISION, ALLOCATABLE :: xg2(:), wg2(:), fun2(:,:)
    DOUBLE PRECISION, ALLOCATABLE :: xg3(:), wg3(:), fun3(:,:)
    DOUBLE PRECISION, ALLOCATABLE :: buf(:)
    DOUBLE PRECISION:: contrib
!===========================================================================
!              1.0 Prologue
!
!   Properties of spline space
!
    sp1 => spl%sp12%sp1
    sp2 => spl%sp12%sp2
    sp3 => spl%sp3
!
    CALL get_dim(sp1, ndim1, n1, nidbas1)
    CALL get_dim(sp2, ndim2, n2, nidbas2) 
    CALL get_dim(sp3, ndim3, n3, nidbas3) 
!
    ALLOCATE(fun1(0:nidbas1,1)) ! needs only basis functions (no derivatives)
    ALLOCATE(fun2(0:nidbas2,1)) ! needs only basis functions (no derivatives)
    ALLOCATE(fun3(0:nidbas3,1)) ! needs only basis functions (no derivatives)
!
!   Gauss quadature
!
    CALL get_gauss(sp1, ng1)
    CALL get_gauss(sp2, ng2)
    CALL get_gauss(sp3, ng3)
    ALLOCATE(xg1(ng1), wg1(ng1))
    ALLOCATE(xg2(ng2), wg2(ng2))
    ALLOCATE(xg3(ng3), wg3(ng3))
!===========================================================================
!              2.0 Assembly loop
!
    nrank = SIZE(rhs,1)
    rhs = 0.0d0
!
    DO i=1,n1
       CALL get_gauss(sp1, ng1, i, xg1, wg1)
       DO ig1=1,ng1
          CALL basfun(xg1(ig1), sp1, fun1, i)
          DO j=1,n2
             CALL get_gauss(sp2, ng2, j, xg2, wg2)
             DO ig2=1,ng2
                CALL basfun(xg2(ig2), sp2, fun2, j)
                DO k=1,n3
                   CALL get_gauss(sp3, ng3, k, xg3, wg3)
                   DO ig3=1,ng3
                      CALL basfun(xg3(ig3), sp3, fun3, k)
                      contrib = wg1(ig1)*wg2(ig2)*wg3(ig3) * &
                           &    rhseq(xg1(ig1), xg2(ig2), xg3(ig3), mbess, npow)
                      DO k1=0,nidbas1
                         i1 = i+k1
                         DO k2=0,nidbas2
                            j2 = MODULO(j+k2-1,n2) + 1
                            ij = j2 + (i1-1)*n2
                            DO k3=0,nidbas3
                               kk = k+k3
                               rhs(ij,kk) = rhs(ij, kk) + &
                                    &  contrib*fun1(k1,1)*fun2(k2,1)*fun3(k3,1)
                            END DO
                         END DO
                      END DO
                   END DO
                END DO
             END DO
          END DO
       END DO
    END DO
!
!     Update from remote guard cells
!
    CALL mpi_comm_size(MPI_COMM_WORLD, npes, ierr)
    CALL mpi_comm_rank(MPI_COMM_WORLD, me, ierr)
    next = MODULO(me+1,npes)
    prev = MODULO(me-1,npes)
    count = nrank
    ALLOCATE(buf(nrank))
    DO i=nidbas3,1,-1
       CALL mpi_sendrecv(rhs(1,n3+i), count, MPI_DOUBLE_PRECISION, next, 0, &
         &             buf, count, MPI_DOUBLE_PRECISION, prev, 0, &
         &             MPI_COMM_WORLD, status, ierr)
       rhs(:,i) =  rhs(:,i) + buf(:)
    END DO
    DEALLOCATE(buf)
!===========================================================================
!              9.0  Epilogue
!
    DEALLOCATE(xg1, wg1, fun1)
    DEALLOCATE(xg2, wg2, fun2)
    DEALLOCATE(xg3, wg3, fun3)
!
  CONTAINS
    DOUBLE PRECISION FUNCTION rhseq(x1, x2, x3, m, n)
      DOUBLE PRECISION, INTENT(in) :: x1, x2, x3
      INTEGER, INTENT(in) :: m, n
      rhseq = REAL(4*(m+1),8)*x1**(m+1)*COS(REAL(m,8)*x2)*COS(x3)**n
    END FUNCTION rhseq
  END SUBROUTINE disrhs3
!++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  SUBROUTINE meshdist(c, x, nx)
!
!   Construct an 1d  non-equidistant mesh given a
!   mesh distribution function.
!
    DOUBLE PRECISION, INTENT(in) :: c(5)
    INTEGER, INTENT(iN) :: nx
    DOUBLE PRECISION, INTENT(inout) :: x(0:nx)
    INTEGER :: nintg
    DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: xint, fint
    DOUBLE PRECISION :: a, b, dx, f0, f1, scal
    INTEGER :: i, k
!
    a=x(0)
    b=x(nx)
    nintg = 10*nx
    ALLOCATE(xint(0:nintg), fint(0:nintg))
!
!  Mesh distribution
!
    dx = (b-a)/REAL(nintg)
    xint(0) = a
    fint(0) = 0.0d0
    f1 = fdist(xint(0))
    DO i=1,nintg
       f0 = f1
       xint(i) = xint(i-1) + dx
       f1 = fdist(xint(i))
       fint(i) = fint(i-1) + 0.5*(f0+f1)
    END DO
!
!  Normalization
!
    scal = REAL(nx) / fint(nintg)
    fint(0:nintg) = fint(0:nintg) * scal
!
!  Obtain mesh point by (inverse) interpolation
!
    k = 1
    DO i=1,nintg-1
       IF( fint(i) .GE. REAL(k) ) THEN
          x(k) = xint(i) + (xint(i+1)-xint(i))/(fint(i+1)-fint(i)) * &
               &   (k-fint(i))
          k = k+1
       END IF
    END DO
!
    DEALLOCATE(xint, fint)
  CONTAINS
    DOUBLE PRECISION FUNCTION fdist(x)
      DOUBLE PRECISION, INTENT(in) :: x
      fdist = c(1) + c(2)*x + c(3)*EXP(-((x-c(4))/c(5))**2)
    END FUNCTION fdist
  END SUBROUTINE meshdist
!++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  SUBROUTINE ibcmat(mat, ny)
!
!   Apply BC on matrix
!
    TYPE(gbmat), INTENT(inout) :: mat
    INTEGER, INTENT(in) :: ny
    INTEGER :: kl, ku, nrank, i, j
    DOUBLE PRECISION, ALLOCATABLE :: zsum(:), arr(:)
!===========================================================================
!              1.0 Prologue
!
    kl = mat%kl
    ku = mat%ku
    nrank = mat%rank
    ALLOCATE(zsum(nrank), arr(nrank))
!===========================================================================
!              2.0 Unicity at the axis
!
!   The vertical sum on the NY-th row
!
    zsum = 0.0d0
    DO i=1,ny
       arr = 0.0d0
       CALL getrow(mat, i, arr)
       DO j=1,ny+ku
          zsum(j) = zsum(j) + arr(j)
       END DO
    END DO
    CALL putrow(mat, ny, zsum)
!
!   The horizontal sum on the NY-th column
!
    zsum = 0.0d0
    DO j=1,ny
       arr = 0.0d0
       CALL getcol(mat, j, arr)
       DO i=ny,ny+kl
          zsum(i) = zsum(i) + arr(i)
       END DO
    END DO
    CALL putcol(mat, ny, zsum)
!
!   The away operator
!
    DO j = 1,ny-1
       arr = 0.0d0; arr(j) = 1.0d0
       CALL putcol(mat, j, arr)     
    END DO
!
    DO i = 1,ny-1
       arr = 0.0d0; arr(i) = 1.0d0
       CALL putrow(mat, i, arr)     
    END DO
!  
!===========================================================================
!              3.0 Dirichlet on right boundary
!
    DO j = nrank, nrank-ny+1, -1
       arr = 0.0d0; arr(j) = 1.0d0
       CALL putcol(mat, j, arr)     
    END DO
    !
    DO i = nrank, nrank-ny+1, -1
       arr = 0.0d0; arr(i) = 1.0d0
       CALL putrow(mat, i, arr)     
    END DO
!===========================================================================
!              9.0  Epilogue
!
    DEALLOCATE(zsum, arr)
!
  END SUBROUTINE ibcmat
!++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  SUBROUTINE ibcrhs3(rhs, ny)
!
!   Apply BC on RHS
!
    DOUBLE PRECISION, INTENT(inout) :: rhs(:,:)
    INTEGER, INTENT(in) :: ny
    INTEGER :: nrank, nz, k
    DOUBLE PRECISION :: zsum
!===========================================================================
!              1.0 Prologue
!
    nrank = SIZE(rhs,1)
    nz = SIZE(rhs,2)
!===========================================================================
!              2.0 Unicity at the axis
!
!   The vertical sum
!
    DO k=1,nz
       zsum = SUM(rhs(1:ny,k))
       rhs(ny,k) = zsum
       rhs(1:ny-1,k) = 0.0d0
    END DO
!===========================================================================
!              3.0 Dirichlet on right boundary
!
    DO k=1,nz
       rhs(nrank-ny+1:nrank,k) = 0.0d0
    END DO
  END SUBROUTINE ibcrhs3
!++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  SUBROUTINE dist1d(s0, ntot, s, nloc)
    INCLUDE 'mpif.h'
    INTEGER, INTENT(in) :: s0, ntot
    INTEGER, INTENT(out) :: s, nloc
    INTEGER :: me, npes, ierr, naver, rem
!
    CALL MPI_COMM_SIZE(MPI_COMM_WORLD, npes, ierr)
    CALL MPI_COMM_RANK(MPI_COMM_WORLD, me, ierr)
    naver = ntot/npes
    rem = MODULO(ntot,npes)
    s = s0 + MIN(rem,me) + me*naver
    nloc = naver
    IF( me.LT.rem ) nloc = nloc+1
  END SUBROUTINE dist1d
!++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  SUBROUTINE disp(a, str, comm)
!
!   Gather partitionned 1d array to 0 and print it
!
    INCLUDE 'mpif.h'
    DOUBLE PRECISION, INTENT(in) :: a(:)
    INTEGER, INTENT(in) :: comm
    CHARACTER(len=*), INTENT(in) :: str
    INTEGER :: n, ntot, npes, me, ierr, i
    DOUBLE PRECISION, ALLOCATABLE, DIMENSION(:) :: c
    INTEGER, ALLOCATABLE, DIMENSION(:) :: counts, displs
!
    CALL mpi_comm_rank(comm, me, ierr)
    CALL mpi_comm_size(comm, npes, ierr)
    n = SIZE(a)
    IF(me.EQ.0) THEN
       ALLOCATE(counts(npes), displs(npes+1))
    END IF
    CALL mpi_gather(n, 1, MPI_INTEGER, counts, 1, MPI_INTEGER, 0, comm, ierr)
    IF(me.EQ.0) THEN
       displs(1) = 0
       DO i=2,npes+1
          displs(i) = displs(i-1)+counts(i-1)
       END DO
       ntot = displs(npes+1)
       ALLOCATE(c(ntot))
       c = 0.0d0
    END IF
    CALL mpi_gatherv(a, n, MPI_DOUBLE_PRECISION, c, counts, displs, &
         &           MPI_DOUBLE_PRECISION, 0, comm, ierr)    
    IF( me.EQ.0 ) THEN
       WRITE(*,'(/a)') TRIM(str)
       DO i=1,npes
          WRITE(*,'(a,i3.3/(10(1pe12.3)))') 'PE', i-1,  &
               &        c(displs(i)+1:displs(i+1))
       END DO
       DEALLOCATE(c)
       DEALLOCATE(counts)
       DEALLOCATE(displs)
    END IF
  END SUBROUTINE disp
!++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
END MODULE ppde3d_mod