diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 index 368cd949bcc41e1b68ab86b1c1fe5608655ff5d4..243365e917d19e5db6f708089de2c86396304274 100644 --- a/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 +++ b/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 @@ -1026,6 +1026,15 @@ contains integer, dimension(4) :: requests_nsT integer, dimension(4) :: requests_ewT + integer :: ii,ij,ik + real , pointer , contiguous , dimension(:,:,:) :: zst + ! + real , pointer , contiguous , dimension(:,:,:) :: ztab_halo_st_haloTin,ztab_halo_nt_haloTin + real , pointer , contiguous , dimension(:,:,:) :: ztab_halo_et_haloTin,ztab_halo_wt_haloTin + ! + real , pointer , contiguous , dimension(:,:,:) :: ztab_halo_nt_haloTout,ztab_halo_st_haloTout + real , pointer , contiguous , dimension(:,:,:) :: ztab_halo_wt_haloTout,ztab_halo_et_haloTout + halo_size = comm_param%halo_size ! Do nothing if we are only using one processor @@ -1059,6 +1068,9 @@ contains yoffset = 1-halo_size blocklength = (a_n+2*halo_size)*(nz+2)*halo_size end if + ! + zst => a%st + ! ! Receive from north recvtag = 1002 if (LUseO) call mpi_irecv(a%s(0,0-(halo_size-1),1),1, & @@ -1067,9 +1079,12 @@ contains recvtag = 1012 if (LUseT) then #ifdef MNH_GPUDIRECT - call mpi_irecv(tab_halo_nt(level,m)%haloTout,size(tab_halo_nt(level,m)%haloTout), & + ztab_halo_nt_haloTout => tab_halo_nt(level,m)%haloTout + !$acc host_data use_device(ztab_halo_nt_haloTout) + call mpi_irecv(ztab_halo_nt_haloTout,size(ztab_halo_nt_haloTout), & MPI_DOUBLE_PRECISION,neighbour_n_rank,recvtag, & MPI_COMM_HORIZ, requests_nsT(1), ierr) + !$acc end host_data #else call mpi_irecv(a%st(1,0-(halo_size-1),0),1, & halo_nst(level,m),neighbour_n_rank,recvtag, & @@ -1084,9 +1099,12 @@ contains recvtag = 1013 if (LUseT) then #ifdef MNH_GPUDIRECT - call mpi_irecv(tab_halo_st(level,m)%haloTout,size(tab_halo_st(level,m)%haloTout), & + ztab_halo_st_haloTout => tab_halo_st(level,m)%haloTout + !$acc host_data use_device (ztab_halo_st_haloTout) + call mpi_irecv(ztab_halo_st_haloTout,size(ztab_halo_st_haloTout), & MPI_DOUBLE_PRECISION,neighbour_s_rank,recvtag, & MPI_COMM_HORIZ, requests_nsT(2), ierr) + !$acc end host_data #else call mpi_irecv(a%st(1,a_n+1,0),1, & halo_nst(level,m),neighbour_s_rank,recvtag, & @@ -1101,11 +1119,16 @@ contains sendtag = 1012 if (LUseT) then #ifdef MNH_GPUDIRECT - tab_halo_st(level,m)%haloTin(1:a_n,1:halo_size,1:nz+2) = & - a%st(1:a_n,a_n-(halo_size-1):a_n,0:nz+1) - call mpi_isend(tab_halo_st(level,m)%haloTin,size(tab_halo_st(level,m)%haloTin), & + ztab_halo_st_haloTin => tab_halo_st(level,m)%haloTin + !$acc parallel loop collapse(3) + do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) + ztab_halo_st_haloTin(ii,ij,ik) = zst(ii,ij+a_n-halo_size,ik-1) + end do + !$acc host_data use_device(ztab_halo_st_haloTin) + call mpi_isend(ztab_halo_st_haloTin,size(ztab_halo_st_haloTin), & MPI_DOUBLE_PRECISION,neighbour_s_rank,sendtag, & - MPI_COMM_HORIZ, requests_nsT(3), ierr) + MPI_COMM_HORIZ, requests_nsT(3), ierr) + !$acc end host_data #else call mpi_isend(a%st(1,a_n-(halo_size-1),0),1, & halo_nst(level,m),neighbour_s_rank,sendtag, & @@ -1120,11 +1143,16 @@ contains sendtag = 1013 if (LUseT) then #ifdef MNH_GPUDIRECT - tab_halo_nt(level,m)%haloTin(1:a_n,1:halo_size,1:nz+2) = & - a%st(1:a_n,1:halo_size,0:nz+1) - call mpi_isend(tab_halo_nt(level,m)%haloTin,size(tab_halo_nt(level,m)%haloTin), & + ztab_halo_nt_haloTin => tab_halo_nt(level,m)%haloTin + !$acc parallel loop collapse(3) + do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) + ztab_halo_nt_haloTin(ii,ij,ik) = zst(ii,ij,ik-1) + end do + !$acc host_data use_device(ztab_halo_nt_haloTin) + call mpi_isend(ztab_halo_nt_haloTin,size(ztab_halo_nt_haloTin), & MPI_DOUBLE_PRECISION,neighbour_n_rank,sendtag, & MPI_COMM_HORIZ, requests_nsT(4), ierr) + !$acc end host_data #else call mpi_isend(a%st(1,1,0),1, & halo_nst(level,m),neighbour_n_rank,sendtag, & @@ -1144,9 +1172,12 @@ contains recvtag = 1010 if (LUseT) then #ifdef MNH_GPUDIRECT - call mpi_irecv(tab_halo_wt(level,m)%haloTout,size(tab_halo_wt(level,m)%haloTout), & + ztab_halo_wt_haloTout => tab_halo_wt(level,m)%haloTout + !$acc host_data use_device(ztab_halo_wt_haloTout) + call mpi_irecv(ztab_halo_wt_haloTout,size(ztab_halo_wt_haloTout), & MPI_DOUBLE_PRECISION,neighbour_w_rank,recvtag, & MPI_COMM_HORIZ, requests_ewT(1), ierr) + !$acc end host_data #else call mpi_irecv(a%st(0-(halo_size-1),0,0),1, & halo_wet(level,m),neighbour_w_rank,recvtag, & @@ -1161,9 +1192,12 @@ contains sendtag = 1011 if (LUseT) then #ifdef MNH_GPUDIRECT - call mpi_irecv(tab_halo_et(level,m)%haloTout,size(tab_halo_et(level,m)%haloTout), & + ztab_halo_et_haloTout => tab_halo_et(level,m)%haloTout + !$acc host_data use_device(ztab_halo_et_haloTout) + call mpi_irecv(ztab_halo_et_haloTout,size(ztab_halo_et_haloTout), & MPI_DOUBLE_PRECISION,neighbour_e_rank,recvtag, & MPI_COMM_HORIZ, requests_ewT(2), ierr) + !$acc end host_data #else call mpi_irecv(a%st(a_n+1,0,0),1, & halo_wet(level,m),neighbour_e_rank,recvtag, & @@ -1179,11 +1213,16 @@ contains sendtag = 1010 if (LUseT) then #ifdef MNH_GPUDIRECT - tab_halo_et(level,m)%haloTin(1:halo_size,1:a_n+2*halo_size,1:nz+2) = & - a%st(a_n-(halo_size-1):a_n,1-halo_size:a_n+halo_size,0:nz+1) - call mpi_isend(tab_halo_et(level,m)%haloTin,size(tab_halo_et(level,m)%haloTin), & + ztab_halo_et_haloTin => tab_halo_et(level,m)%haloTin + !$acc parallel loop collapse(3) + do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) + ztab_halo_et_haloTin(ii,ij,ik) = zst(ii+a_n-halo_size,ij-halo_size,ik-1) + end do + !$acc host_data use_device(ztab_halo_et_haloTin) + call mpi_isend(ztab_halo_et_haloTin,size(ztab_halo_et_haloTin), & MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, & MPI_COMM_HORIZ, requests_ewT(3), ierr) + !$acc end host_data #else call mpi_isend(a%st(a_n-(halo_size-1),0,0),1, & halo_wet(level,m),neighbour_e_rank,sendtag, & @@ -1198,11 +1237,16 @@ contains recvtag = 1011 if (LUseT) then #ifdef MNH_GPUDIRECT - tab_halo_wt(level,m)%haloTin(1:halo_size,1:a_n+2*halo_size,1:nz+2) = & - a%st(1:halo_size,1-halo_size:a_n+halo_size,0:nz+1) - call mpi_isend(tab_halo_wt(level,m)%haloTin,size(tab_halo_wt(level,m)%haloTin), & + ztab_halo_wt_haloTin => tab_halo_wt(level,m)%haloTin + !$acc parallel loop collapse(3) + do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) + ztab_halo_wt_haloTin(ii,ij,ik) = zst(ii,ij-halo_size,ik-1) + end do + !$acc host_data use_device(ztab_halo_wt_haloTin) + call mpi_isend(ztab_halo_wt_haloTin,size(ztab_halo_wt_haloTin), & MPI_DOUBLE_PRECISION,neighbour_w_rank,sendtag, & MPI_COMM_HORIZ, requests_ewT(4), ierr) + !$acc end host_data #else call mpi_isend(a%st(1,0,0),1, & halo_wet(level,m),neighbour_w_rank,sendtag, & @@ -1220,17 +1264,25 @@ contains #ifdef MNH_GPUDIRECT if (LUseT) then ! copy north halo for GPU managed - a%st(1:a_n,0-(halo_size-1):0,0:nz+1) = & - tab_halo_nt(level,m)%haloTout(1:a_n,1:halo_size,1:nz+2) + !$acc parallel loop collapse(3) + do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) + zst(ii,ij-halo_size,ik-1) = ztab_halo_nt_haloTout(ii,ij,ik) + end do ! copy south halo for GPU managed - a%st(1:a_n,a_n+1:a_n+halo_size,0:nz+1) = & - tab_halo_st(level,m)%haloTout(1:a_n,1:halo_size,1:nz+2) + !$acc parallel loop collapse(3) + do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) + zst(ii,ij+a_n,ik-1) = ztab_halo_st_haloTout(ii,ij,ik) + end do ! copy west halo for GPU managed - a%st(0-(halo_size-1):0,1-halo_size:a_n+halo_size,0:nz+1) = & - tab_halo_wt(level,m)%haloTout(1:halo_size,1:a_n+2*halo_size,1:nz+2) + !$acc parallel loop collapse(3) + do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) + zst(ii-halo_size,ij-halo_size,ik-1) = ztab_halo_wt_haloTout(ii,ij,ik) + end do ! copy east halo for GPU managed - a%st(a_n+1:a_n+halo_size,1-halo_size:a_n+halo_size,0:nz+1) = & - tab_halo_et(level,m)%haloTout(1:halo_size,1:a_n+2*halo_size,1:nz+2) + !$acc parallel loop collapse(3) + do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) + zst(ii+a_n,ij-halo_size,ik-1) = ztab_halo_et_haloTout(ii,ij,ik) + end do end if #endif end if! (stepsize == 1) ...