diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 index 4273ddadf026988dfcb71676ddad44a92e4cec2b..794ac3a6f7c58f7c1bfdfb1baf02ed5b37386273 100644 --- a/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 +++ b/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 @@ -1092,6 +1092,37 @@ contains ! zst => a%st ! +#ifdef MNH_GPUDIRECT + if (LUseT) then + ! + ! Pack the send buffers on the accelerator, asynchronously on OpenACC queue 1 + ! + ! Send to south + ztab_halo_st_haloTin => tab_halo_st(level,m)%haloTin + !$acc parallel loop collapse(3) async(1) + do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) + ztab_halo_st_haloTin(ii,ij,ik) = zst(ii,ij+a_n-halo_size,ik-1) + end do + ! Send to north + ztab_halo_nt_haloTin => tab_halo_nt(level,m)%haloTin + !$acc parallel loop collapse(3) async(1) + do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) + ztab_halo_nt_haloTin(ii,ij,ik) = zst(ii,ij,ik-1) + end do + ! Send to east + ztab_halo_et_haloTin => tab_halo_et(level,m)%haloTin + !$acc parallel loop collapse(3) async(1) + do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) + ztab_halo_et_haloTin(ii,ij,ik) = zst(ii+a_n-halo_size,ij-halo_size,ik-1) + end do + ! Send to west + ztab_halo_wt_haloTin => tab_halo_wt(level,m)%haloTin + !$acc parallel loop collapse(3) async(1) + do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) + ztab_halo_wt_haloTin(ii,ij,ik) = zst(ii,ij-halo_size,ik-1) + end do + end if +#endif ! Receive from north recvtag = 1002 if (LUseO) call mpi_irecv(a%s(0,0-(halo_size-1),1),1, & @@ -1132,6 +1163,12 @@ contains MPI_COMM_HORIZ, requests_nsT(2), ierr) #endif end if +#ifdef MNH_GPUDIRECT + if (LUseT) then + ! wait for the async packing of the send buffers (queue 1) to finish before the sends below + !$acc wait(1) + end if +#endif !
Send to south sendtag = 1002 if (LUseO) call mpi_isend(a%s(0,a_n-(halo_size-1),1),1, & @@ -1140,11 +1177,6 @@ contains sendtag = 1012 if (LUseT) then #ifdef MNH_GPUDIRECT - ztab_halo_st_haloTin => tab_halo_st(level,m)%haloTin - !$acc parallel loop collapse(3) - do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) - ztab_halo_st_haloTin(ii,ij,ik) = zst(ii,ij+a_n-halo_size,ik-1) - end do !$acc host_data use_device(ztab_halo_st_haloTin) call mpi_isend(ztab_halo_st_haloTin,size(ztab_halo_st_haloTin), & MPI_DOUBLE_PRECISION,neighbour_s_rank,sendtag, & @@ -1164,11 +1196,6 @@ contains sendtag = 1013 if (LUseT) then #ifdef MNH_GPUDIRECT - ztab_halo_nt_haloTin => tab_halo_nt(level,m)%haloTin - !$acc parallel loop collapse(3) - do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) - ztab_halo_nt_haloTin(ii,ij,ik) = zst(ii,ij,ik-1) - end do !$acc host_data use_device(ztab_halo_nt_haloTin) call mpi_isend(ztab_halo_nt_haloTin,size(ztab_halo_nt_haloTin), & MPI_DOUBLE_PRECISION,neighbour_n_rank,sendtag, & @@ -1234,11 +1261,6 @@ contains sendtag = 1010 if (LUseT) then #ifdef MNH_GPUDIRECT - ztab_halo_et_haloTin => tab_halo_et(level,m)%haloTin - !$acc parallel loop collapse(3) - do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) - ztab_halo_et_haloTin(ii,ij,ik) = zst(ii+a_n-halo_size,ij-halo_size,ik-1) - end do !$acc host_data use_device(ztab_halo_et_haloTin) call mpi_isend(ztab_halo_et_haloTin,size(ztab_halo_et_haloTin), & MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, & @@ -1258,11 +1280,6 @@ contains recvtag = 1011 if (LUseT) then #ifdef MNH_GPUDIRECT - ztab_halo_wt_haloTin => tab_halo_wt(level,m)%haloTin - !$acc parallel loop collapse(3) - do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) - ztab_halo_wt_haloTin(ii,ij,ik) = zst(ii,ij-halo_size,ik-1) - end do !$acc host_data use_device(ztab_halo_wt_haloTin) call mpi_isend(ztab_halo_wt_haloTin,size(ztab_halo_wt_haloTin), & MPI_DOUBLE_PRECISION,neighbour_w_rank,sendtag, & @@ -1285,25 +1302,26 @@ contains 
#ifdef MNH_GPUDIRECT if (LUseT) then ! copy north halo for GPU managed - !$acc parallel loop collapse(3) + !$acc parallel loop collapse(3) async(3) do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) zst(ii,ij-halo_size,ik-1) = ztab_halo_nt_haloTout(ii,ij,ik) end do ! copy south halo for GPU managed - !$acc parallel loop collapse(3) + !$acc parallel loop collapse(3) async(3) do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) zst(ii,ij+a_n,ik-1) = ztab_halo_st_haloTout(ii,ij,ik) end do ! copy west halo for GPU managed - !$acc parallel loop collapse(3) + !$acc parallel loop collapse(3) async(3) do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) zst(ii-halo_size,ij-halo_size,ik-1) = ztab_halo_wt_haloTout(ii,ij,ik) end do ! copy east halo for GPU managed - !$acc parallel loop collapse(3) + !$acc parallel loop collapse(3) async(3) do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) zst(ii+a_n,ij-halo_size,ik-1) = ztab_halo_et_haloTout(ii,ij,ik) - end do + end do + !$acc wait(3) end if #endif end if! (stepsize == 1) ...