diff --git a/Jacobi_simple.F90 b/Jacobi_simple.F90 new file mode 100644 index 0000000..fe45acc --- /dev/null +++ b/Jacobi_simple.F90 @@ -0,0 +1,199 @@ +program jacobi + use iso_fortran_env, only: di=>int64, dp=>real64 +use omp_lib + implicit none + + integer, parameter :: iter_max = 10 + integer, parameter :: N = 8*1024 + integer :: i,j,iterk,istat + + real(dp), allocatable,target :: A(:, :), Anew(:, :) + real(dp), allocatable,target :: A_omp(:, :), Anew_omp(:, :) + real(dp), pointer :: A_pointer1(:,:) + real(dp), pointer :: A_pointer2(:,:) + real(dp) :: rate,startomp,endomp,startcpu,endcpu + integer(di) :: startc, endc + logical :: isDevice + integer :: nteams, nthreads + + + call system_clock(count_rate=rate) + + allocate(A(N, N)) + allocate(Anew(N, N)) + +! Jacobi on CPU + call init_array(A, N, 7.0_dp) + call init_array(Anew, N, 7.0_dp) + A_pointer1 => A + A_pointer2 => Anew + call system_clock(startc) + do iterk=1,iter_max + call jacobi_CPU(A_pointer1, A_pointer2, N) + call swap(A_pointer1,A_pointer2) + enddo + call system_clock(endc) + write(*,*) "timing for Jacobi CPU: ", real(endc-startc, dp)/rate*1000, "ms" +! + allocate(A_omp(N, N)) + allocate(Anew_omp(N, N)) +! Jacobi openmp + call init_array(A_omp, N, 7.0_dp) + call init_array(Anew_omp, N, 7.0_dp) + call system_clock(startc) + call jacobi_openmp(A_omp, Anew_omp, N, iter_max) + call system_clock(endc) + write(*,*) "timing for Jacobi CPU openmp: ", real(endc-startc, dp)/rate*1000, "ms" + call test_array(A_omp,A,N) + +! print details teams threads for openmp +! isDevice = omp_is_initial_device() +! if (isDevice) then + ! write(*,*) "we are on the GPU" +! !$omp target teams !!! defaultmap(tofrom:scalar) +! nteams = omp_get_num_teams() +! !$omp parallel +! !$omp single +! nthreads = omp_get_num_threads() +! !$omp end single +! !$omp end parallel +! !$omp end target teams +! write(*,*) "number of teams", nteams +! write(*,*) "number of threads", nthreads +! else +! write(*,*) "we are on the CPU" +! endif + +! Jacobi openmp offload + call init_array(A_omp, N, 7.0_dp) + call init_array(Anew_omp, N, 7.0_dp) + A_pointer1 => A_omp + A_pointer2 => Anew_omp + !$omp target enter data map(to: A_omp, Anew_omp) + call system_clock(startc) + call cpu_time(startcpu) + startomp=omp_get_wtime() +! !$omp target enter data map(to: A_omp, Anew_omp) + do iterk=1,iter_max + !$omp target + !$omp teams distribute + do j = 2, N-1 + call jacobi_offload(A_pointer1, A_pointer2, N, j) + end do + !$omp end teams distribute + !$omp end target + call swap(A_pointer1,A_pointer2) + enddo + endomp=omp_get_wtime() + call cpu_time(endcpu) + call system_clock(endc) + write(*,*) "timing for Jacobi GPU openmp: ", real(endc-startc, dp)/rate*1000, "ms" + write(*,*) "timing for Jacobi GPU openmp timeomp: ", (endomp-startomp)*1000, "ms" + write(*,*) "timing for Jacobi GPU openmp timecpu: ", (endcpu-startcpu)*1000, "ms" + !$omp target update from(A_omp) + call test_array(A_omp,A,N) +! +! +contains + + + subroutine init_array(Tab, N, val) + real(dp), intent(inout), allocatable :: Tab(:,:) + integer, intent(in) :: N + real(dp), intent(in) :: val + + integer i, j + + Tab = 0.0_dp + + do i = 1, N + Tab(1, i) = val + Tab(N, i) = val/2.0_dp + Tab(i, 1) = val/3.0_dp + Tab(i, N) = val/5.0_dp + end do + end subroutine init_array + + subroutine test_array(Tab, Aref, N) + real(dp), intent(in), allocatable :: Tab(:,:) + real(dp), intent(in), allocatable :: Aref(:,:) + integer, intent(in) :: N + + integer i, j + + do j = 2, N-1 + do i = 2, N-1 + if (abs((Tab(i, j) - Aref(i, j))/Aref(i, j)) > 100.0 * epsilon(1.0_dp)) then + write(*,*) "Error: ", abs((Tab(i, j) - Aref(i, j))/Aref(i, j)) + return + end if + end do + end do + end subroutine test_array + + subroutine jacobi_CPU(Tab, TabNew, N) + real(dp), intent(in) :: Tab(N,N) + real(dp), intent(inout) :: TabNew(N,N) + integer, intent(in) :: N + integer i, j + + do j = 2, N-1 + do i = 2, N-1 + TabNew(i, j) = 0.25 * (Tab(i, j+1) + Tab(i, j-1) + Tab(i+1, j) + Tab(i-1, j)) + end do + end do + + end subroutine jacobi_CPU + + subroutine jacobi_offload(Tab, TabNew, N, j) + real(dp), intent(in) :: Tab(N,N) + real(dp), intent(inout) :: TabNew(N,N) + integer, intent(in) :: N,j + integer :: i + + !$omp parallel do + do i = 2, N-1 + TabNew(i, j) = 0.25 * (Tab(i, j+1) + Tab(i, j-1) + Tab(i+1, j) + Tab(i-1, j)) + end do + !$omp end parallel do + + end subroutine jacobi_offload + + subroutine jacobi_openmp(Tab, Tabnew, N , iter_max) + real(dp), intent(inout),target :: Tab(N,N) + real(dp), intent(inout), target :: TabNew(N,N) + real(dp), pointer :: Tab_pointer1(:,:) + real(dp), pointer :: Tab_pointer2(:,:) + integer, intent(in) :: N + integer, intent(in) :: iter_max + + integer i, j, iter,k + real(dp) :: err + Tab_pointer1 => Tab + Tab_pointer2 => TabNew + do iter=1,iter_max + !$omp parallel + !$omp do + do j = 2, N-1 + !$omp simd + do i = 2, N-1 + Tab_pointer2(i, j) = 0.25 * (Tab_pointer1(i, j+1) + Tab_pointer1(i, j-1) + Tab_pointer1(i+1, j) + Tab_pointer1(i-1, j)) + end do + !$omp end simd + end do + !$omp end do + !$omp end parallel + call swap(Tab_pointer1,Tab_pointer2) + enddo + + end subroutine jacobi_openmp + subroutine swap(Tab_pointer1,Tab_pointer2) + real(dp), pointer, intent(inout) :: Tab_pointer1(:,:) + real(dp), pointer, intent(inout) :: Tab_pointer2(:,:) + real(dp), pointer :: tmp_pointer(:,:) +!!$omp declare target + tmp_pointer => Tab_pointer1 + Tab_pointer1 => Tab_pointer2 + Tab_pointer2 => tmp_pointer + end subroutine swap +end program jacobi diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d238847 --- /dev/null +++ b/Makefile @@ -0,0 +1,17 @@ +EXEC=Jacobi_CCEoffload # Excutable name +# on cray +NVFC=ftn # NVHPC compiler on cray +CCE=ftn # CCE compiler on cray +NVOMPFLAGS= -O3 -static-nvidia -mp=gpu -gpu=cc60 # options for openMP with NVHPC on cray +CCEOMPFLAGS= -O3 -h omp -e Z # options for openmp with CCE on cray +CUDA_HOME=$CUDATOOLKIT_HOME +CUDAFORFLAGS= -LCUDA_HOME -Mcuda=cc60,cuda11.0 -lcublas -lblas -v -Wl,-t # options for cuda fortran on cray +# +all: $(EXEC) + +Jacobi_offload: Jacobi_simple.F90 + $(GNUFC) $(GNUOMPFLAGS) Jacobi_simple.F90 -o Jacobi_offload +Jacobi_CCEoffload: Jacobi_simple.F90 + $(CCE) $(CCEOMPFLAGS) Jacobi_simple.F90 -o Jacobi_CCEoffload +clean: + rm -f $(EXEC) Jacobi_offload *.mod *.o *~