!>
!> @file ex7.f90
!>
!> @brief
!>
!> @copyright
!> Copyright (©) 2021 EPFL (Ecole Polytechnique Fédérale de Lausanne)
!> SPC (Swiss Plasma Center)
!>
!> SPClibs is free software: you can redistribute it and/or modify it under
!> the terms of the GNU Lesser General Public License as published by the Free
!> Software Foundation, either version 3 of the License, or (at your option)
!> any later version.
!>
!> SPClibs is distributed in the hope that it will be useful, but WITHOUT ANY
!> WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
!> FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
!>
!> You should have received a copy of the GNU Lesser General Public License
!> along with this program. If not, see <https://www.gnu.org/licenses/>.
!>
!> @authors
!> (in alphabetical order)
!> @author Trach-Minh Tran <trach-minh.tran@epfl.ch>
!>
PROGRAM main
!
!   Tranpsose of 3d matrices partitionned in 1 and 2 proc grid:
!     - A(n1,n2,n3/P)      -> AT(n3,n2,n1/P)
!     - B(n1,n2/P1,n3/P2)  -> BT(n3,n2/P1,n1/P2)
!   n1, n2, n3 NOT REQUIRED to be divided evenly by P
!
USE pputils2
  IMPLICIT NONE
  INCLUDE "mpif.h"
  INTEGER :: ierr, me, npes, comm=MPI_COMM_WORLD
  INTEGER ::  n1=15, n2=10, n3=20, n1p, n2p, n3p, s1, s2,s3
  DOUBLE PRECISION, DIMENSION(:,:,:), ALLOCATABLE :: a3, a3t
  DOUBLE PRECISION :: x
  INTEGER :: i, j, k, iglob, jglob, kglob, kerrors, nerrors
!
  INTEGER, PARAMETER :: ndims=2              ! N. of dims of proc. grid
  INTEGER, DIMENSION(ndims) :: dims, coords
  LOGICAL :: periods(ndims), reorder
  INTEGER :: cart, cartcol, cartrow
!================================================================================
!
!   Init MPI
  CALL mpi_init(ierr)
  CALL mpi_comm_size(comm, npes, ierr)
  CALL mpi_comm_rank(comm, me, ierr)
!
!--------------------------------------------------------------------------------
!
! 1D partition:
!
!   Define local array A3
  CALL dist1d(comm, 0, n1, s1, n1p)
  CALL dist1d(comm, 0, n3, s3, n3p)
  ALLOCATE( a3(n1,n2,n3p), a3t(n3,n2,n1p) )
  a3 = 0
  a3t = 0
  DO i=1,n1
     DO j=1,n2
        DO k=1,n3p
           kglob = s3 + k
           a3(i,j,k) = 10000*i + 100*j + kglob
        END DO
     END DO
  END DO
  IF( me.EQ. 0 ) THEN
     WRITE(*,'(a)') '*** 1D partition ***'
     WRITE(*,'(a,3i4)') 'Global dimensions of matrix a', n1, n2, n3
  END IF
!
!   Tranpose A3(n1,n2/P1,n3/P2) -> A3T(n3,n2/P1,n1/P2)
  CALL pptransp(comm, a3, a3t, 1, 3)
!
!   Check A3T
  kerrors = 0
  DO i=1,n1p
     iglob = s1 + i
     DO j=1,n2
        DO k=1,n3
           x = 10000*iglob + 100*j + k
           IF( x .NE. a3t(k,j,i) ) kerrors = kerrors+1
        END DO
     END DO
  END DO
  CALL mpi_reduce(kerrors, nerrors, 1, MPI_INTEGER, MPI_SUM, 0, &
       &          comm, ierr)
  IF( me .EQ. 0 ) WRITE(*,'(a,i6)') 'nerrors checking a3', nerrors
  DEALLOCATE(a3, a3t)
!--------------------------------------------------------------------------------
!
! 2D partition:
!
!   Create cartesian topololy
  dims    = (/2, 3/)
  periods = (/.FALSE., .FALSE./)
  reorder = .FALSE.
  IF( PRODUCT(dims) .NE. npes ) THEN
     IF( me .EQ. 0 ) THEN
        PRINT*,  PRODUCT(dims), " processors required!"
        CALL mpi_abort(comm, -1, ierr)
     END IF
  END IF
  CALL mpi_barrier(comm, ierr)
!
  CALL mpi_cart_create(comm, ndims, dims, periods, reorder, cart, ierr)
  CALL mpi_cart_coords(cart, me, ndims, coords, ierr)
  CALL mpi_cart_sub(cart, (/.TRUE., .FALSE. /), cartcol, ierr)
  CALL mpi_cart_sub(cart, (/.FALSE., .TRUE. /), cartrow, ierr)
!
!   Define local array A3
  CALL dist1d(cartrow, 0, n1, s1, n1p)
  CALL dist1d(cartcol, 0, n2, s2, n2p)
  CALL dist1d(cartrow, 0, n3, s3, n3p)
  ALLOCATE( a3(n1,n2p,n3p), a3t(n3,n2p,n1p) )
  a3 = 0
  a3t = 0
  DO i=1,n1
     DO j=1,n2p
        jglob = s2 + j
        DO k=1,n3p
           kglob = s3 + k
           a3(i,j,k) = 10000*i + 100*jglob + kglob
        END DO
     END DO
  END DO
  IF( me.EQ. 0 ) THEN
     WRITE(*,'(a)') '*** 2D partition ***'
     WRITE(*,'(a,3i4)') 'Global dimensions of matrix a', n1, n2, n3
  END IF
!
!   Tranpose A3(n1,n2/P1,n3/P2) -> A3T(n3,n2/P1,n1/P2)
  CALL pptransp(cartrow, a3, a3t, 1, 3)
!
!   Check A3T
  kerrors = 0
  DO i=1,n1p
     iglob = s1 + i
     DO j=1,n2p
        jglob = s2 + j
        DO k=1,n3
           x = 10000*iglob + 100*jglob + k
           IF( x .NE. a3t(k,j,i) ) kerrors = kerrors+1
        END DO
     END DO
  END DO
  CALL mpi_reduce(kerrors, nerrors, 1, MPI_INTEGER, MPI_SUM, 0, &
       &          comm, ierr)
  IF( me .EQ. 0 ) WRITE(*,'(a,i6)') 'nerrors checking a3', nerrors
  DEALLOCATE(a3, a3t)
!--------------------------------------------------------------------------------
! Epilogue
!
  CALL mpi_finalize(ierr)
END PROGRAM main