module optim_sd_gpu_m use iso_c_binding use cudafor use precision use dimen use cufft_m use filter_gpu ! For lasd model real(fp), parameter :: const = 2.d0*(delta**2) real(fp), parameter :: tf1=fgr*tfr real(fp), parameter :: tf2=fgr*tfr*tfr real(fp), parameter :: tf1_2=tf1**2 real(fp), parameter :: tf2_2=tf2**2 real(fp), parameter :: beta_min = 1.d0/(tf1*tf2) real(fp), parameter :: beta_max = 100 real(fp), parameter :: cs2_min = 0.001*0.001 real(fp), parameter :: cs2_max = 0.9*0.9 contains subroutine optim_sd_gpu (Cs2,u,v,w,u_lag,v_lag,w_lag,Sij,LM_old,MM_old,QN_old,NN_old,t,device_id,nall) !--------------------------------------------------------------------------- ! declaration !--------------------------------------------------------------------------- implicit none integer*4 i,j,k,t,kend,kk,istat,device_id,flag,nall real(fp),dimension(nx,ny,nz2),intent(inout),device :: cs2 real(fp),dimension(nx,ny,nz2),intent(in),device :: u,v,w,u_lag,v_lag,w_lag ! 11,12,13,22,23,33 real(fp),dimension(nx,ny,nz2,6),intent(in),device :: Sij ! LM,MM,QN,NN real(fp),dimension(nx,ny,nz2),intent(inout),device :: LM_old,MM_old,QN_old,NN_old real(fp),dimension(:,:,:),allocatable,device :: LM,MM,QN,NN real(fp),dimension(:,:,:,:),allocatable,device :: SSij,uu,u_ ! Cuda variable type(dim3) :: grid, tBlock tBlock = dim3(16,16,2) grid = dim3(nx/16,ny/16,nzb/2) ! FFT plans type(c_ptr) plan(2) save plan,LM,MM,QN,NN,SSij,uu,u_ !--------------------------------------------------------------------------- ! main code !--------------------------------------------------------------------------- if(t == 1) then allocate(LM(nx,ny,nz2)) allocate(MM(nx,ny,nz2)) allocate(QN(nx,ny,nz2)) allocate(NN(nx,ny,nz2)) allocate(SSij(nx,ny,nz2,6)) allocate(uu(nx,ny,nz2,6)) allocate(u_(nx,ny,nz2,3)) end if call get_uu<<>>(uu,u_,u,v,w) call get_SSij<<>>(SSij,Sij) call get_LMQN(LM,MM,u_,uu,Sij,SSij,tf1,t,1) call get_LMQN(QN,NN,u_,uu,Sij,SSij,tf2,t,0) if (t.eq.1 .and. resub_flag == 0 .and. sim_flag < 2) then call load_LMQN_old<<>>(LM,MM,LM_old,MM_old,0) call load_LMQN_old<<>>(QN,NN,QN_old,NN_old,0) end if ! ! must be updated top and bot for intep3d ! call update(LM_old,me,nall) ! call update(MM_old,me,nall) ! call update(QN_old,me,nall) ! call update(NN_old,me,nall) if (me==0) then !$cuf kernel do(2) <<<*,*>>> do j=1,ny do i=1,nx LM_old(i,j,1) = LM_old(i,j,2) MM_old(i,j,1) = MM_old(i,j,2) QN_old(i,j,1) = QN_old(i,j,2) NN_old(i,j,1) = NN_old(i,j,2) end do end do end if if (me==nprocs-1) then !$cuf kernel do(2) <<<*,*>>> do j=1,ny do i=1,nx LM_old(i,j,nzb+1) = LM_old(i,j,nzb) MM_old(i,j,nzb+1) = MM_old(i,j,nzb) QN_old(i,j,nzb+1) = QN_old(i,j,nzb) NN_old(i,j,nzb+1) = NN_old(i,j,nzb) end do end do end if call get_cs2(cs2,LM,MM,QN,NN,LM_old,MM_old,QN_old,NN_old,u_lag,v_lag,w_lag,t,device_id) if (me==0) then !$cuf kernel do(2) <<<*,*>>> do j=1,ny do i=1,nx Cs2(i,j,1)=Cs2(i,j,2) end do end do end if ! warning This needs to be checked !!! if (me==nprocs-1) then !$cuf kernel do(2) <<<*,*>>> do j=1,ny do i=1,nx Cs2(i,j,nzb+1)=Cs2(i,j,nzb) end do end do end if call load_LMQN_old<<>>(LM,MM,LM_old,MM_old,1) call load_LMQN_old<<>>(QN,NN,QN_old,NN_old,1) end subroutine optim_sd_gpu attributes(global) subroutine get_uu(uu,u_,u,v,w) implicit none real(fp),dimension(nx,ny,nz2),intent(in) :: u,v,w real(fp),dimension(nx,ny,nz2,6),intent(out) :: uu real(fp),dimension(nx,ny,nz2,3),intent(out) :: u_ integer i,j,k,ii,jj,kk i = (blockIdx%x - 1)* blockDim%x + threadIdx%x; j = (blockIdx%y - 1)* blockDim%y + threadIdx%y; k = (blockIdx%z - 1)* blockDim%z + threadIdx%z; k = k + 1 ! k = 2,nzb+1 u_(i,j,k,1) = u(i,j,k) u_(i,j,k,2) = v(i,j,k) u_(i,j,k,3) = (w(i,j,k)+w(i,j,k+1))/2.d0 ! loop through upper triangle kk = 1 do ii = 1,3 do jj = ii,3 uu(i,j,k,kk) = u_(i,j,k,ii)*u_(i,j,k,jj) kk = kk + 1 end do end do end subroutine get_uu attributes(global) subroutine get_SSij(SSij,Sij) implicit none real(fp),dimension(nx,ny,nz2,6),intent(in) :: Sij real(fp),dimension(nx,ny,nz2,6),intent(out) :: SSij real(fp),dimension(6) :: a real(fp) :: S integer i,j,k,ii i = (blockIdx%x - 1)* blockDim%x + threadIdx%x; j = (blockIdx%y - 1)* blockDim%y + threadIdx%y; k = (blockIdx%z - 1)* blockDim%z + threadIdx%z; k = k + 1 ! k = 2,nzb+1 a = Sij(i,j,k,:) call symm_dot_prod(S,a,a) S = sqrt(2.d0*S) do ii = 1,6 SSij(i,j,k,ii) = S*Sij(i,j,k,ii) end do end subroutine get_SSij attributes(global) subroutine LMQN_kernel(LM,MM,uu_hat,u_hat,SSij_hat,Sij_hat,tf) implicit none real(fp),dimension(nx,ny,nz2),intent(out):: LM,MM real(fp),dimension(nx,ny,nz2,6),intent(in) :: uu_hat,SSij_hat,Sij_hat real(fp),dimension(nx,ny,nz2,3),intent(in) :: u_hat real(fp),intent(in),value :: tf real(fp) :: S_hat,a(6),Lij(6),Mij(6) integer i,j,k,ii,jj,kk i = (blockIdx%x - 1)* blockDim%x + threadIdx%x; j = (blockIdx%y - 1)* blockDim%y + threadIdx%y; k = (blockIdx%z - 1)* blockDim%z + threadIdx%z; k = k + 1 ! k = 2,nzb+1 ! Compute the average strain a = Sij_hat(i,j,k,:) call symm_dot_prod(S_hat,a,a) S_hat = sqrt(2.d0*S_hat) ! Leonard stress kk = 1 do ii = 1,3 do jj = ii,3 Lij(kk) = uu_hat(i,j,k,kk) - u_hat(i,j,k,ii)*u_hat(i,j,k,jj) Mij(kk) = const*(SSij_hat(i,j,k,kk)-tf**2*S_hat*Sij_hat(i,j,k,kk)) kk = kk + 1 end do end do ! LM(i,j,k) = Mij(1) !u_hat(i,j,k,1)!uu_hat(i,j,k,1) call symm_dot_prod(LM(i,j,k),Lij,Mij) ! LM & QN call symm_dot_prod(MM(i,j,k),Mij,Mij) ! MM & NN end subroutine LMQN_kernel attributes(global) subroutine load_LMQN_old(LM,MM,LM_old,MM_old,flag) implicit none real(fp),dimension(nx,ny,nz2),intent(inout) :: LM,MM real(fp),dimension(nx,ny,nz2),intent(inout) :: LM_old,MM_old integer,value,intent(in) :: flag integer i,j,k,kk i = (blockIdx%x - 1)* blockDim%x + threadIdx%x; j = (blockIdx%y - 1)* blockDim%y + threadIdx%y; k = (blockIdx%z - 1)* blockDim%z + threadIdx%z; k = k + 1 ! k = 2,nzb+1 if (flag == 0) then LM_old(i,j,k) = 0.03d0*MM(i,j,k) MM_old(i,j,k) = MM(i,j,k) else LM_old(i,j,k) = LM(i,j,k) MM_old(i,j,k) = MM(i,j,k) end if end subroutine load_LMQN_old subroutine get_cs2(cs2,LM,MM,QN,NN,LM_old,MM_old,QN_old,NN_old,u_lag,v_lag,w_lag,t,device_id) implicit none real(fp),dimension(nx,ny,nz2),intent(out),device :: cs2 real(fp),dimension(nx,ny,nz2),intent(inout),device :: LM,MM,QN,NN real(fp),dimension(nx,ny,nz2),intent(inout),device :: LM_old,MM_old,QN_old,NN_old real(fp),dimension(nx,ny,nz2),intent(in),device :: u_lag,v_lag,w_lag real(fp),dimension(nx,ny,nz2) :: xx integer,value,intent(in) :: t,device_id ! Cuda variable type(dim3) :: grid, tBlock tBlock = dim3(16,16,2) grid = dim3(nx/16,ny/16,nzb/2) call lagrng_sd_gpu<<>>(LM,MM,LM_old,MM_old,u_lag,v_lag,w_lag,t,device_id) call lagrng_sd_gpu<<>>(QN,NN,QN_old,NN_old,u_lag,v_lag,w_lag,t,device_id) call cs2_kernel<<>>(cs2,LM,MM,QN,NN,t,device_id) end subroutine get_cs2 subroutine get_LMQN(LM,MM,u_,uu,Sij,SSij,tf,t,flag) implicit none real(fp),dimension(nx,ny,nz2),intent(out),device:: LM,MM real(fp),dimension(nx,ny,nz2,6),intent(in),device :: uu,SSij,Sij real(fp),dimension(nx,ny,nz2,3),intent(in),device :: u_ real(fp),intent(in) :: tf integer,intent(in) :: t,flag integer fft_flag,istat integer :: ii type(c_ptr) :: plan(2) complex(fp),dimension(:,:,:),allocatable,device :: f_hat_ft real(fp),dimension(:,:,:),allocatable,device :: f_hat real(fp),dimension(:,:,:,:),allocatable,device :: Sij_hat,SSij_hat,uu_hat,u_hat real(fp),dimension(nx,ny,nz2) :: xx type(dim3) :: grid, tBlock, grid_fft, tBlock_fft tBlock = dim3(16,16,2) grid = dim3(nx/16,ny/16,nzb/2) tBlock_fft = dim3(16,16,2) grid_fft = dim3((nx/2)/16,ny/16,nzb/2) save plan,Sij_hat,SSij_hat,uu_hat,u_hat,f_hat_ft if(t == 1 .and. flag == 1) then if (fp==singlePrecision) then call cufftPlanMany(plan(1),2,(/ny,nx/),(/ny,nx/),1,nx*ny,(/ny,nx/2+1/),1,(nx/2+1)*ny,cufft_R2C,nzb) call cufftPlanMany(plan(2),2,(/ny,nx/),(/ny,nx/2+1/),1,(nx/2+1)*ny,(/ny,nx/),1,nx*ny,cufft_C2R,nzb) else call cufftPlanMany(plan(1),2,(/ny,nx/),(/ny,nx/),1,nx*ny,(/ny,nx/2+1/),1,(nx/2+1)*ny,cufft_D2Z,nzb) call cufftPlanMany(plan(2),2,(/ny,nx/),(/ny,nx/2+1/),1,(nx/2+1)*ny,(/ny,nx/),1,nx*ny,cufft_Z2D,nzb) end if allocate(Sij_hat(nx,ny,nz2,6)) allocate(SSij_hat(nx,ny,nz2,6)) allocate(uu_hat(nx,ny,nz2,6)) allocate(u_hat(nx,ny,nz2,3)) allocate(f_hat_ft(nx/2+1,ny,nzb)) end if do ii = 1,3 ! filter u call cufftExec(plan(1),u_(:,:,2:nzb+1,ii),f_hat_ft) call test_filter_kernel<<>>(f_hat_ft,tf) call cufftExec(plan(2),f_hat_ft,u_hat(:,:,2:nzb+1,ii)) end do do ii = 1,6 ! filter uu call cufftExec(plan(1),uu(:,:,2:nzb+1,ii),f_hat_ft) call test_filter_kernel<<>>(f_hat_ft,tf) call cufftExec(plan(2),f_hat_ft,uu_hat(:,:,2:nzb+1,ii)) ! filter Sij call cufftExec(plan(1),Sij(:,:,2:nzb+1,ii),f_hat_ft) call test_filter_kernel<<>>(f_hat_ft,tf) call cufftExec(plan(2),f_hat_ft,Sij_hat(:,:,2:nzb+1,ii)) ! filter SSij call cufftExec(plan(1),SSij(:,:,2:nzb+1,ii),f_hat_ft) call test_filter_kernel<<>>(f_hat_ft,tf) call cufftExec(plan(2),f_hat_ft,SSij_hat(:,:,2:nzb+1,ii)) end do ! Compute LM,MM or QN,NN call LMQN_kernel<<>>(LM,MM,uu_hat,u_hat,SSij_hat,Sij_hat,tf) end subroutine get_LMQN attributes(global) subroutine lagrng_sd_gpu(a,b,a_old,b_old,u_lag,v_lag,w_lag,t,device_id) !--------------------------------------------------------------------------- ! declaration !--------------------------------------------------------------------------- implicit none real(fp), dimension(nx,ny,nz2),intent(inout) :: a,b real(fp), dimension(nx,ny,nz2),intent(in) :: a_old,b_old real(fp),dimension(nx,ny,nz2),intent(in) :: u_lag,v_lag,w_lag integer,value,intent(in) :: t,device_id real(fp) eps,Tn,TLM,TMM,a_interp,b_interp,interp_pt(3),xint,yint,zint integer*4 i,j,k,kend !--------------------------------------------------------------------------- ! main code !--------------------------------------------------------------------------- i = (blockIdx%x - 1)* blockDim%x + threadIdx%x; j = (blockIdx%y - 1)* blockDim%y + threadIdx%y; k = (blockIdx%z - 1)* blockDim%z + threadIdx%z; k = k + 1 ! k = 2,nzb+1 if (device_id==nprocs-1) then kend=nzb else kend=nzb+1 end if if (k<=kend) then xint = - u_lag(i,j,k)*dtl yint = - v_lag(i,j,k)*dtl zint = - w_lag(i,j,k)*dtl TLM=a_old(i,j,k) TMM=b_old(i,j,k) if (TLM.le.0.0.or.TMM.lt.(1e-24)) then eps=0.0 else Tn=1.5*delta*(TLM*TMM)**(-1./8.) Tn=max(1e-24,Tn) eps=(dtl/Tn)/(1.+dtl/Tn) end if ! if(abs(xint).gt.dx.or.abs(yint).gt.dy.or.abs(zint).gt.dz)then ! write(*,*)'timestep too large for lagrangian averaging' ! stop ! end if call interp3d_gpu(xint,yint,zint,i,j,k,a_old,a_interp,device_id,t) call interp3d_gpu(xint,yint,zint,i,j,k,b_old,b_interp,device_id,t) a(i,j,k) = eps*a(i,j,k)+(1.-eps)*a_interp b(i,j,k) = eps*b(i,j,k)+(1.-eps)*b_interp end if end subroutine lagrng_sd_gpu attributes(global) subroutine cs2_kernel(cs2,LM,MM,QN,NN,t,device_id) !--------------------------------------------------------------------------- ! declaration !--------------------------------------------------------------------------- implicit none real(fp),dimension(nx,ny,nz2),intent(out) :: cs2 real(fp),dimension(nx,ny,nz2),intent(inout) :: LM,MM,QN,NN integer,value,intent(in) :: t,device_id real(fp) Cs2_2d,Cs2_4d,beta integer*4 i,j,k,kend !--------------------------------------------------------------------------- ! main code !--------------------------------------------------------------------------- i = (blockIdx%x - 1)* blockDim%x + threadIdx%x; j = (blockIdx%y - 1)* blockDim%y + threadIdx%y; k = (blockIdx%z - 1)* blockDim%z + threadIdx%z; k = k + 1 ! k = 2,nzb+1 if (device_id==nprocs-1) then kend=nzb else kend=nzb+1 end if if (k<=kend) then if (LM(i,j,k).lt.0.d0.or.QN(i,j,k).lt.0.d0) then Cs2(i,j,k)=0.d0 LM(i,j,k)=0.d0 ! LM QN(i,j,k)=0.d0 ! QN else Cs2_2d = LM(i,j,k)/MM(i,j,k) ! LM/MM Cs2_2d = max(1E-24,Cs2_2d) Cs2_4d = QN(i,j,k)/NN(i,j,k) ! QN/NN Cs2_4d = max(1E-24,Cs2_4d) Beta = (Cs2_4d/Cs2_2d)**(log(tf1)/(log(tf2)-log(tf1))) Beta = max(Beta,beta_min) Beta = min(Beta,beta_max) Cs2(i,j,k)=Cs2_2d/Beta Cs2(i,j,k)=max(Cs2(i,j,k),cs2_min) Cs2(i,j,k)=min(Cs2(i,j,k),cs2_max) if (cs2(i,j,k).lt.0) then cs2(i,j,k) = 0.0 endif end if end if end subroutine cs2_kernel attributes(device) subroutine interp3d_gpu(xi,yi,zi,i,j,k,U,ui,device_id,t) !--------------------------------------------------------------------------- ! declaration !--------------------------------------------------------------------------- implicit none real(fp) xi,yi,zi,ui,U(Nx,Ny,Nz2),iw,jw,kw,ui_hgh,ui_low integer*4 i,j,k,ii,jj,kk,iplus,jplus,t,device_id !--------------------------------------------------------------------------- ! main code !--------------------------------------------------------------------------- !--------------------------------------------------------------------------- ! x-comp if (xi.lt.0.0) then iw=(dx-abs(xi))/dx if (i.eq.1) then ii=nx else ii=i-1 endif else iw=(xi)/dx ii=i endif !--------------------------------------------------------------------------- ! y-comp if (yi.lt.0.0) then jw=(dy-abs(yi))/dy if (j.eq.1) then jj=ny else jj=j-1 endif else jw=(yi)/dy jj=j endif !--------------------------------------------------------------------------- ! z-comp if (zi.lt.0.0) then kw=(dz-abs(zi))/dz kk=k-1 else kw=(zi)/dz kk=k endif !--------------------------------------------------------------------------- ! compute if (ii.eq.nx) then iplus=-(nx-1) else iplus=1 endif if (jj.eq.ny) then jplus=-(ny-1) else jplus=1 endif if (kw.eq.0.0) then ui = U(ii,jj,kk)*(1.-iw)*(1.-jw)+ & U(ii+iplus,jj,kk)*iw*(1.-jw)+ & U(ii+iplus,jj+jplus,kk)*iw*jw+ & U(ii,jj+jplus,kk)*(1.-iw)*jw else ui_low = U(ii,jj,kk)*(1.-iw)*(1.-jw)+ & U(ii+iplus,jj,kk)*iw*(1.-jw)+ & U(ii+iplus,jj+jplus,kk)*iw*jw+ & U(ii,jj+jplus,kk)*(1.-iw)*jw ui_hgh = U(ii,jj,kk+1)*(1.-iw)*(1.-jw)+ & U(ii+iplus,jj,kk+1)*iw*(1.-jw)+ & U(ii+iplus,jj+jplus,kk+1)*iw*jw+ & U(ii,jj+jplus,kk+1)*(1.-iw)*jw ui = ui_low*(1.-kw)+ui_hgh*kw endif end subroutine interp3d_gpu attributes(device) subroutine symm_dot_prod(c,a,b) ! Dot product of two flattened sysmetric 3 x 3 matrix (only contains upper tri) ! c = a11*b11+...+ 2*(a12*b12+.....) implicit none real(fp),dimension(6),intent(in) :: a,b real(fp),intent(out) :: c real(fp) alpha integer i,j,k,ii,jj,kk ! loop through upper triagnle c = 0.d0 kk = 1 C = 1.d0*a(1)*b(1) + 2.d0*a(2)*b(2) + 2.d0*a(3)*b(3) & + 1.d0*a(4)*b(4) + 2.d0*a(5)*b(5) & + 1.d0*a(6)*b(6) end subroutine symm_dot_prod end module optim_sd_gpu_m