Patch summary: fix the loop start in the saxpy_cuda kernel.

Before this change the kernel loop began at j=1 in every thread, so the
indices visited did not depend on the thread index i computed just above.
The loop now starts at i and advances by blockDim%x*gridDim%x, so each
thread handles its own strided subset of the elements.

NOTE(review): this patch was recovered from a copy that had been collapsed
onto a single line. Indentation and the positions of the six blank context
lines are reconstructed so that the hunk matches its 53-line header; confirm
against the base file before applying. The kernel launch appeared as an empty
pair of chevrons in that copy; it is restored here as <<<grid, tBlock>>>,
the only dim3 variables the program sets up -- TODO confirm.

diff --git a/exercises/ex5/saxpy_cuda.F90 b/exercises/ex5/saxpy_cuda.F90
index a0e5e0e..97159c8 100644
--- a/exercises/ex5/saxpy_cuda.F90
+++ b/exercises/ex5/saxpy_cuda.F90
@@ -1,53 +1,53 @@
 module mathOps
 contains
   attributes(global) subroutine saxpy_cuda(x, y, a)
     implicit none
     real :: x(:), y(:)
     real, value :: a
     integer :: i, j, n
     n = size(x)
     i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
-    do j=1, n, blockDim%x*gridDim%x
+    do j=i, n, blockDim%x*gridDim%x
       y(j) = y(j) + a*x(j)
     end do
   end subroutine saxpy_cuda
 
   subroutine saxpy_cpu(x, y, a)
     implicit none
     real :: x(:), y(:)
     real, value :: a
     integer :: i, n
     n = size(x)
     do i=1, n
       y(i) = y(i) + a*x(i)
     end do
   end subroutine saxpy_cpu
 end module mathOps
 
 program testSaxpy
   use mathOps
   use cudafor
   implicit none
   integer, parameter :: N = 4000
   integer istat, i
   real :: x(N), y(N), a, norm2_cpu, norm2_gpu
   real, managed :: x_d(N), y_d(N)
   type(dim3) :: grid, tBlock
 
   tBlock = dim3(256,1,1)
   grid = dim3(ceiling(real(N)/tBlock%x),1,1)
 
   do i=1, N
     y(i) = 0.0
     x(i) = i*0.458
     a = 12.1
   end do
 
   x_d = x
   y_d = y
   call saxpy_cuda<<<grid, tBlock>>>(x_d, y_d, a)
   istat = cudaDeviceSynchronize()
   call saxpy_cpu(x, y, a)
 
   write(*,*) 'L2 norm: ', norm2(y-y_d)
 end program testSaxpy