program jacobi use iso_fortran_env, only: di=>int64, dp=>real64 use omp_lib use kernel implicit none integer, parameter :: iter_max = 2 integer, parameter :: N = 8*1024 integer :: i,j,iterk,istat real(dp), allocatable,target :: A(:, :), Anew(:, :) real(dp), allocatable,target :: A_omp(:, :), Anew_omp(:, :) real(dp), pointer :: A_pointer1(:,:) real(dp), pointer :: A_pointer2(:,:) real(dp) :: rate,startomp,endomp,startcpu,endcpu integer(di) :: startc, endc logical :: isDevice integer :: nteams, nthreads !$omp declare target(jacobi_offload_j2) call system_clock(count_rate=rate) allocate(A(N, N)) allocate(Anew(N, N)) ! Jacobi on CPU call init_array(A, N, 7.0_dp) call init_array(Anew, N, 7.0_dp) A_pointer1 => A A_pointer2 => Anew call system_clock(startc) do iterk=1,iter_max call jacobi_CPU(A_pointer1, A_pointer2, N) call swap(A_pointer1,A_pointer2) enddo call system_clock(endc) write(*,*) "timing for Jacobi CPU: ", real(endc-startc, dp)/rate*1000, "ms" ! allocate(A_omp(N, N)) allocate(Anew_omp(N, N)) ! Jacobi openmp call init_array(A_omp, N, 7.0_dp) call init_array(Anew_omp, N, 7.0_dp) call system_clock(startc) call jacobi_openmp(A_omp, Anew_omp, N, iter_max) call system_clock(endc) write(*,*) "timing for Jacobi CPU openmp: ", real(endc-startc, dp)/rate*1000, "ms" call test_array(A_omp,A,N) ! print details teams threads for openmp ! isDevice = omp_is_initial_device() ! if (isDevice) then ! write(*,*) "we are on the GPU" ! !$omp target teams !!! defaultmap(tofrom:scalar) ! nteams = omp_get_num_teams() ! !$omp parallel ! !$omp single ! nthreads = omp_get_num_threads() ! !$omp end single ! !$omp end parallel ! !$omp end target teams ! write(*,*) "number of teams", nteams ! write(*,*) "number of threads", nthreads ! else ! write(*,*) "we are on the CPU" ! endif ! Jacobi openmp offload V1 call init_array(A_omp, N, 7.0_dp) call init_array(Anew_omp, N, 7.0_dp) A_pointer1 => A_omp A_pointer2 => Anew_omp !$omp target enter data map(to: A_omp, Anew_omp) call system_clock(startc) call cpu_time(startcpu) startomp=omp_get_wtime() ! !$omp target enter data map(to: A_omp, Anew_omp) do iterk=1,iter_max ! !$omp target ! !$omp teams distribute ! do j = 2, N-1 ! nteams = omp_get_num_teams() call jacobi_offload(A_pointer1, A_pointer2, N, j) ! end do ! !$omp end teams distribute ! !$omp end target call swap(A_pointer1,A_pointer2) enddo endomp=omp_get_wtime() call cpu_time(endcpu) call system_clock(endc) write(*,*) "timing for Jacobi GPU openmp: ", real(endc-startc, dp)/rate*1000, "ms" write(*,*) "timing for Jacobi GPU openmp timeomp: ", (endomp-startomp)*1000, "ms" write(*,*) "timing for Jacobi GPU openmp timecpu: ", (endcpu-startcpu)*1000, "ms" write(*,*) "number of teams", nteams write(*,*) "number of threads", nthreads !$omp target update from(A_omp) call test_array(A_omp,A,N) ! ! Jacobi openmp offload V2 call init_array(A_omp, N, 7.0_dp) call init_array(Anew_omp, N, 7.0_dp) A_pointer1 => A_omp A_pointer2 => Anew_omp !$omp target update to(A_omp, Anew_omp) call system_clock(startc) call cpu_time(startcpu) startomp=omp_get_wtime() do iterk=1,iter_max !$omp target !$omp teams distribute do j = 2, N-1 ! nteams = omp_get_num_teams() call jacobi_offload_j(A_pointer1, A_pointer2, N, j) end do !$omp end teams distribute !$omp end target call swap(A_pointer1,A_pointer2) enddo endomp=omp_get_wtime() call cpu_time(endcpu) call system_clock(endc) write(*,*) "timing for Jacobi GPU openmp V2: ", real(endc-startc, dp)/rate*1000, "ms" write(*,*) "timing for Jacobi GPU openmp timeomp V2: ", (endomp-startomp)*1000, "ms" write(*,*) "timing for Jacobi GPU openmp timecpu V2: ", (endcpu-startcpu)*1000, "ms" write(*,*) "number of teams", nteams write(*,*) "number of threads", nthreads !$omp target update from(A_omp) call test_array(A_omp,A,N) ! ! Jacobi openmp offload V3 call init_array(A_omp, N, 7.0_dp) call init_array(Anew_omp, N, 7.0_dp) A_pointer1 => A_omp A_pointer2 => Anew_omp !$omp target update to(A_omp, Anew_omp) call system_clock(startc) call cpu_time(startcpu) startomp=omp_get_wtime() do iterk=1,iter_max !$omp target !$omp teams distribute do j = 2, N-1 ! nteams = omp_get_num_teams() call jacobi_offload_j2(A_pointer1, A_pointer2, N, j) end do !$omp end teams distribute !$omp end target call swap(A_pointer1,A_pointer2) enddo endomp=omp_get_wtime() call cpu_time(endcpu) call system_clock(endc) write(*,*) "timing for Jacobi GPU openmp V3: ", real(endc-startc, dp)/rate*1000, "ms" write(*,*) "timing for Jacobi GPU openmp timeomp V3: ", (endomp-startomp)*1000, "ms" write(*,*) "timing for Jacobi GPU openmp timecpu V3: ", (endcpu-startcpu)*1000, "ms" write(*,*) "number of teams", nteams write(*,*) "number of threads", nthreads !$omp target update from(A_omp) call test_array(A_omp,A,N) ! contains subroutine init_array(Tab, N, val) real(dp), intent(inout), allocatable :: Tab(:,:) integer, intent(in) :: N real(dp), intent(in) :: val integer i, j Tab = 0.0_dp do i = 1, N Tab(1, i) = val Tab(N, i) = val/2.0_dp Tab(i, 1) = val/3.0_dp Tab(i, N) = val/5.0_dp end do end subroutine init_array subroutine test_array(Tab, Aref, N) real(dp), intent(in), allocatable :: Tab(:,:) real(dp), intent(in), allocatable :: Aref(:,:) integer, intent(in) :: N integer i, j do j = 2, N-1 do i = 2, N-1 if (abs((Tab(i, j) - Aref(i, j))/Aref(i, j)) > 100.0 * epsilon(1.0_dp)) then write(*,*) "Error: ", abs((Tab(i, j) - Aref(i, j))/Aref(i, j)) return end if end do end do end subroutine test_array subroutine jacobi_CPU(Tab, TabNew, N) real(dp), intent(in) :: Tab(N,N) real(dp), intent(inout) :: TabNew(N,N) integer, intent(in) :: N integer i, j do j = 2, N-1 do i = 2, N-1 TabNew(i, j) = 0.25 * (Tab(i, j+1) + Tab(i, j-1) + Tab(i+1, j) + Tab(i-1, j)) end do end do end subroutine jacobi_CPU subroutine jacobi_offload(Tab, TabNew, N, j) real(dp), intent(in) :: Tab(N,N) real(dp), intent(inout) :: TabNew(N,N) integer, intent(in) :: N integer :: i,j !$omp target !$omp teams distribute ! thread_limit(1) do j = 2, N-1 !$omp parallel do ! simd do i = 2, N-1 TabNew(i, j) = 0.25 * (Tab(i, j+1) + Tab(i, j-1) + Tab(i+1, j) + Tab(i-1, j)) end do !$omp end parallel do ! simd end do !$omp end teams distribute !$omp end target end subroutine jacobi_offload subroutine jacobi_offload_j(Tab, TabNew, N, j) real(dp), intent(in) :: Tab(N,N) real(dp), intent(inout) :: TabNew(N,N) integer, intent(in) :: N integer :: i,j !$omp declare target !$omp parallel do ! simd do i = 2, N-1 TabNew(i, j) = 0.25 * (Tab(i, j+1) + Tab(i, j-1) + Tab(i+1, j) + Tab(i-1, j)) end do !$omp end parallel do ! simd end subroutine jacobi_offload_j subroutine jacobi_openmp(Tab, Tabnew, N , iter_max) real(dp), intent(inout),target :: Tab(N,N) real(dp), intent(inout), target :: TabNew(N,N) real(dp), pointer :: Tab_pointer1(:,:) real(dp), pointer :: Tab_pointer2(:,:) integer, intent(in) :: N integer, intent(in) :: iter_max integer i, j, iter,k real(dp) :: err Tab_pointer1 => Tab Tab_pointer2 => TabNew do iter=1,iter_max !$omp parallel !$omp do do j = 2, N-1 !$omp simd do i = 2, N-1 Tab_pointer2(i, j) = 0.25 * (Tab_pointer1(i, j+1) + Tab_pointer1(i, j-1) + Tab_pointer1(i+1, j) + Tab_pointer1(i-1, j)) end do !$omp end simd end do !$omp end do !$omp end parallel call swap(Tab_pointer1,Tab_pointer2) enddo end subroutine jacobi_openmp subroutine swap(Tab_pointer1,Tab_pointer2) real(dp), pointer, intent(inout) :: Tab_pointer1(:,:) real(dp), pointer, intent(inout) :: Tab_pointer2(:,:) real(dp), pointer :: tmp_pointer(:,:) !!$omp declare target tmp_pointer => Tab_pointer1 Tab_pointer1 => Tab_pointer2 Tab_pointer2 => tmp_pointer end subroutine swap end program jacobi