module sgs use precision use dimen use cudafor use optim_sd_gpu_m contains subroutine sgs_stag_gpu(u,v,w,& dudx,dudy,dudz,& dvdx,dvdy,dvdz,& dwdx,dwdy,dwdz, & LM_old,MM_old,QN_old,NN_old,cs2, & txx,txy,txz,tyy,tyz,tzz,t,device_id,nall) !--------------------------------------------------------------------------- ! declaration !--------------------------------------------------------------------------- implicit none integer*4 i,j,k,t,kstart,stagger_flag,istat,nall integer :: device_id real(fp),dimension(nx,ny,nz2),intent(in),device :: u,v,w,& dudx,dudy,dudz,& dvdx,dvdy,dvdz,& dwdx,dwdy,dwdz real(fp),dimension(nx,ny,nz2),intent(inout),device :: LM_old,MM_old,QN_old,NN_old,cs2 real(fp),dimension(nx,ny,nz2),intent(out),device :: txx,txy,txz,tyy,tyz,tzz real(fp),dimension(:,:,:,:),allocatable,device :: Sij real(fp),dimension(:,:,:),allocatable,device :: u_lag,v_lag,w_lag real(fp),dimension(:,:),allocatable,device :: txzp,tyzp ! Cuda variable type(dim3) :: grid, tBlock tBlock = dim3(16,16,2) grid = dim3(nx/16,ny/16,nzb/2) save u_lag,v_lag,w_lag,Sij,u_lag,v_lag,w_lag,txzp,tyzp !--------------------------------------------------------------------------- ! 0. prepare (at uvp nodes) !--------------------------------------------------------------------------- if (t == 1) then allocate(Sij(nx,ny,nz2,6)) allocate(u_lag(nx,ny,nz2)) allocate(v_lag(nx,ny,nz2)) allocate(w_lag(nx,ny,nz2)) allocate(txzp(nx,ny)) allocate(tyzp(nx,ny)) end if ! stress boundary condition at bottom has been given, ! thus, it needs to be saved and restored. here is for saving bc if (device_id == 0) then !$cuf kernel do <<<*,*>>> do j=1,ny do i=1,nx txzp(i,j)=txz(i,j,2) tyzp(i,j)=tyz(i,j,2) end do end do end if call get_strain<<>>(Sij,dudx,dudy,dudz,& dvdx,dvdy,dvdz,& dwdx,dwdy,dwdz,& device_id,0) !--------------------------------------------------------------------------- ! 1. get cs2 (at uvp nodes) !--------------------------------------------------------------------------- !--------------------------------------------------------------------------- ! lasd ! Initialize cs with Smagorinsky coefficient and lagarangian velocity with 0 if (t.eq.1) then cs2 = co**2 u_lag = 0. v_lag = 0. w_lag = 0. end if ! Compute lagarangian velocity with 0 ! TODO: CHECK IF it is necessary or not call get_u_lag<<>>(u_lag,v_lag,w_lag,u,v,w) if (mod(t-1,cs_count).eq.0) then call optim_sd_gpu(Cs2,u,v,w,u_lag,v_lag,w_lag,Sij,LM_old,MM_old,QN_old,NN_old,t,me,nall) u_lag = 0. v_lag = 0. w_lag = 0. end if ! call update_uv(cs2,me,nall) !--------------------------------------------------------------------------- ! 2. get Tau !--------------------------------------------------------------------------- !--------------------------------------------------------------------------- ! 2.1 u, v layer call get_stress_uv<<>>(txx,txy,tyy,tzz,cs2,Sij,device_id) !--------------------------------------------------------------------------- ! 2.2 w layer call get_strain<<>>(Sij,dudx,dudy,dudz,& dvdx,dvdy,dvdz,& dwdx,dwdy,dwdz,& device_id,1) call get_stress_w<<>>(txz,tyz,cs2,Sij,device_id) !--------------------------------------------------------------------------- ! 3. finalke !--------------------------------------------------------------------------- ! enforce b.c. of Tau if (device_id==0) then !$cuf kernel do <<<*,*>>> do j=1,ny do i=1,nx txz(i,j,2)=txzp(i,j) tyz(i,j,2)=tyzp(i,j) end do end do end if if (device_id==nprocs-1) then !$cuf kernel do <<<*,*>>> do j=1,ny do i=1,nx txz(i,j,nzb+1)=0.0d0 tyz(i,j,nzb+1)=0.0d0 end do end do end if end subroutine sgs_stag_gpu attributes(global) subroutine get_strain(Sij,dudx,dudy,dudz,& dvdx,dvdy,dvdz,& dwdx,dwdy,dwdz,device_id,stagger_flag) implicit none !--------------------------------------------------------------------------- ! declaration !--------------------------------------------------------------------------- real(fp),dimension(nx,ny,nz2,6) :: Sij real(fp),dimension(nx,ny,nz2),intent(in) :: dudx,dudy,dudz,& dvdx,dvdy,dvdz,& dwdx,dwdy,dwdz integer,value,intent(in) :: device_id,stagger_flag real(fp) ux_,uy_,uz_,vx_,vy_,vz_,wx_,wy_,wz_ integer :: i, j, k i = (blockIdx%x - 1)* blockDim%x + threadIdx%x; j = (blockIdx%y - 1)* blockDim%y + threadIdx%y; k = (blockIdx%z - 1)* blockDim%z + threadIdx%z; k = k + 1 ! the only reason doing that is: only at bottom(me=0,k=2), ! dudz and dvdz are computed at u layer if (stagger_flag == 0) then if (k == 2 .and. device_id == 0) then ux_ = dudx(i,j,k) uy_ = dudy(i,j,k) uz_ = dudz(i,j,k) vx_ = dvdx(i,j,k) vy_ = dvdy(i,j,k) vz_ = dvdz(i,j,k) wx_ = 0.5d0*(dwdx(i,j,k)+dwdx(i,j,k+1)) wy_ = 0.5d0*(dwdy(i,j,k)+dwdy(i,j,k+1)) wz_ = dwdz(i,j,k) else ux_ = dudx(i,j,k) uy_ = dudy(i,j,k) uz_ = 0.5d0*(dudz(i,j,k)+dudz(i,j,k+1)) vx_ = dvdx(i,j,k) vy_ = dvdy(i,j,k) vz_ = 0.5d0*(dvdz(i,j,k)+dvdz(i,j,k+1)) wx_ = 0.5d0*(dwdx(i,j,k)+dwdx(i,j,k+1)) wy_ = 0.5d0*(dwdy(i,j,k)+dwdy(i,j,k+1)) wz_ = dwdz(i,j,k) end if else if (stagger_flag == 1) then ux_ = 0.5d0*(dudx(i,j,k-1)+dudx(i,j,k)) uy_ = 0.5d0*(dudy(i,j,k-1)+dudy(i,j,k)) uz_ = dudz(i,j,k) vx_ = 0.5d0*(dvdx(i,j,k-1)+dvdx(i,j,k)) vy_ = 0.5d0*(dvdy(i,j,k-1)+dvdy(i,j,k)) vz_ = dvdz(i,j,k) wx_ = dwdx(i,j,k) wy_ = dwdy(i,j,k) wz_ = 0.5d0*(dwdz(i,j,k-1)+dwdz(i,j,k)) end if Sij(i,j,k,1) = ux_ ! S11 Sij(i,j,k,4) = vy_ ! S22 Sij(i,j,k,6) = wz_ ! S33 Sij(i,j,k,2) = 0.5d0*(uy_+vx_) ! S12 Sij(i,j,k,3) = 0.5d0*(uz_+wx_) ! S13 Sij(i,j,k,5) = 0.5d0*(vz_+wy_) ! S23 end subroutine get_strain attributes(global) subroutine get_stress_uv(txx,txy,tyy,tzz,cs2,Sij,device_id) implicit none real(fp),dimension(nx,ny,nz2),intent(out) :: txx,txy,tyy,tzz real(fp),dimension(nx,ny,nz2,6),intent(in) :: Sij real(fp),dimension(nx,ny,nz2),intent(in) :: cs2 real(fp) a(6),factor,S integer,value,intent(in) :: device_id integer :: i, j, k,kk, kstart i = (blockIdx%x - 1)* blockDim%x + threadIdx%x; j = (blockIdx%y - 1)* blockDim%y + threadIdx%y; k = (blockIdx%z - 1)* blockDim%z + threadIdx%z; k = k + 1 do kk = 1,6 a(kk) = Sij(i,j,k,kk) end do call symm_dot_prod(S,a,a) S = sqrt(2.d0*S) factor = -2*cs2(i,j,k)*delta**2*s factor = factor -2*iRe/(u_scale*z_i) Txx(i,j,k) = factor*Sij(i,j,k,1) Txy(i,j,k) = factor*Sij(i,j,k,2) Tyy(i,j,k) = factor*Sij(i,j,k,4) Tzz(i,j,k) = factor*Sij(i,j,k,6) end subroutine get_stress_uv attributes(global) subroutine get_stress_w(txz,tyz,cs2,Sij,device_id) implicit none real(fp),dimension(nx,ny,nz2),intent(out) :: txz,tyz real(fp),dimension(nx,ny,nz2,6),intent(in) :: Sij real(fp),dimension(nx,ny,nz2),intent(in) :: cs2 real(fp) a(6),factor,S integer,value,intent(in) :: device_id integer :: i, j, k,kk, kstart i = (blockIdx%x - 1)* blockDim%x + threadIdx%x; j = (blockIdx%y - 1)* blockDim%y + threadIdx%y; k = (blockIdx%z - 1)* blockDim%z + threadIdx%z; k = k + 1 do kk = 1,6 a(kk) = Sij(i,j,k,kk) end do call symm_dot_prod(S,a,a) S = sqrt(2.d0*S) if (device_id == 0) then kstart = 3 else kstart = 2 end if if (k >= kstart) then factor = -(cs2(i,j,k)+cs2(i,j,k-1))*delta**2*S factor = factor-2*iRe/(u_scale*z_i) txz(i,j,k) = factor*Sij(i,j,k,3) ! Txz tyz(i,j,k) = factor*Sij(i,j,k,5) ! Tyz end if end subroutine get_stress_w attributes(global) subroutine get_u_lag(u_lag,v_lag,w_lag,u,v,w) implicit none real(fp),dimension(nx,ny,nz2),intent(in) :: u,v,w real(fp),dimension(nx,ny,nz2),intent(inout) :: u_lag,v_lag,w_lag integer i,j,k,ii,jj,kk i = (blockIdx%x - 1)* blockDim%x + threadIdx%x; j = (blockIdx%y - 1)* blockDim%y + threadIdx%y; k = (blockIdx%z - 1)* blockDim%z + threadIdx%z; k = k + 1 ! k = 2,nzb+1 u_lag(i,j,k)=u_lag(i,j,k)+u(i,j,k)/cs_count v_lag(i,j,k)=v_lag(i,j,k)+v(i,j,k)/cs_count w_lag(i,j,k)=w_lag(i,j,k)+0.5*(w(i,j,k)+w(i,j,k+1))/cs_count end subroutine get_u_lag end module sgs