From 87c311f9cd766128d23643f12b91e40ae4a1f9b8 Mon Sep 17 00:00:00 2001 From: Juan ESCOBAR <juan.escobar@aero.obs-mip.fr> Date: Tue, 15 Mar 2022 18:13:22 +0100 Subject: [PATCH] Juan 15/03/2022:tensor/communication.f90, Cray OpenACC optimization, discard collapse + do concurrent --- .../communication.f90 | 138 +++++++++++++++--- 1 file changed, 114 insertions(+), 24 deletions(-) diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 index 6ec3b197d..6d5c71c75 100644 --- a/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 +++ b/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 @@ -561,7 +561,9 @@ contains za_st => a%st zb_st => b%st !$acc kernels +#ifdef MNH_COMPILER_NVHPC !$acc loop collapse(3) +#endif do iz=0,nz+1 do iy=icompy_min,icompy_max do ix=icompx_min,icompx_max @@ -1133,34 +1135,50 @@ contains ! Send to south if (Gneighbour_s) then ztab_halo_st_haloTin => tab_halo_st(level,m)%haloTin - !$acc parallel loop collapse(3) async(IS_SOUTH) + !$acc kernels async(IS_SOUTH) +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) ztab_halo_st_haloTin(ii,ij,ik) = zst(ii,ij+a_n-halo_size,ik-1) end do + !$acc end kernels end if ! Send to north if (Gneighbour_n) then ztab_halo_nt_haloTin => tab_halo_nt(level,m)%haloTin - !$acc parallel loop collapse(3) async(IS_NORTH) + !$acc kernels async(IS_NORTH) +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) ztab_halo_nt_haloTin(ii,ij,ik) = zst(ii,ij,ik-1) end do + !$acc end kernels end if ! 
Send to east if (Gneighbour_e) then ztab_halo_et_haloTin => tab_halo_et(level,m)%haloTin - !$acc parallel loop collapse(3) async(IS_EAST) + !$acc kernels async(IS_EAST) +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) ztab_halo_et_haloTin(ii,ij,ik) = zst(ii+a_n-halo_size,ij-halo_size,ik-1) end do + !$acc end kernels end if ! Send to west if (Gneighbour_w) then ztab_halo_wt_haloTin => tab_halo_wt(level,m)%haloTin - !$acc parallel loop collapse(3) async(IS_WEST) + !$acc kernels async(IS_WEST) +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) ztab_halo_wt_haloTin(ii,ij,ik) = zst(ii,ij-halo_size,ik-1) end do + !$acc end kernels end if end if #endif @@ -1368,31 +1386,47 @@ contains if (LUseT) then if (Gneighbour_n) then ! copy north halo for GPU managed - !$acc parallel loop collapse(3) async(IS_NORTH) + !$acc kernels async(IS_NORTH) +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) zst(ii,ij-halo_size,ik-1) = ztab_halo_nt_haloTout(ii,ij,ik) end do + !$acc end kernels end if if (Gneighbour_s) then ! copy south halo for GPU managed - !$acc parallel loop collapse(3) async(IS_SOUTH) + !$acc kernels async(IS_SOUTH) +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) zst(ii,ij+a_n,ik-1) = ztab_halo_st_haloTout(ii,ij,ik) - end do + end do + !$acc end kernels end if if (Gneighbour_w) then ! copy west halo for GPU managed - !$acc parallel loop collapse(3) async(IS_WEST) + !$acc kernels async(IS_WEST) +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) zst(ii-halo_size,ij-halo_size,ik-1) = ztab_halo_wt_haloTout(ii,ij,ik) end do + !$acc end kernels end if if (Gneighbour_e) then ! 
copy east halo for GPU managed - !$acc parallel loop collapse(3) async(IS_EAST) + !$acc kernels async(IS_EAST) +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) zst(ii+a_n,ij-halo_size,ik-1) = ztab_halo_et_haloTout(ii,ij,ik) end do + !$acc end kernels end if ! wait for async copy of send buffer to GPU call acc_wait_haloswap_mnh() @@ -1690,10 +1724,14 @@ contains #ifdef MNH_GPUDIRECT zb_st => b%st za_st => a%st - !$acc parallel loop collapse(3) + !$acc kernels +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent (ii=1:a_n,ij=1:a_n,ik=1:nz+2) zb_st(ii,ij,ik-1) = za_st(ii,ij,ik-1) end do + !$acc end kernels #else b%st(1:a_n,1:a_n,0:nz+1) = a%st(1:a_n,1:a_n,0:nz+1) #endif @@ -1706,20 +1744,32 @@ contains zb_st => b%st ! copy from buffer for GPU DIRECT ! Receive from NE - !$acc parallel loop collapse(3) + !$acc kernels +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent (ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2) zb_st(ii+b_n/2,ij,ik-1) = ztab_sub_interiorT_ne_m_1_haloTout(ii,ij,ik) end do + !$acc end kernels ! Receive from SW - !$acc parallel loop collapse(3) + !$acc kernels +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent (ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2) zb_st(ii,ij+b_n/2,ik-1) = ztab_sub_interiorT_sw_m_1_haloTout(ii,ij,ik) end do + !$acc end kernels ! 
Receive from SE - !$acc parallel loop collapse(3) + !$acc kernels +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent (ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2) zb_st(ii+b_n/2,ij+b_n/2,ik-1) = ztab_sub_interiorT_se_m_1_haloTout(ii,ij,ik) end do + !$acc end kernels end if #endif end if @@ -1738,10 +1788,14 @@ contains if (LUseT) then #ifdef MNH_GPUDIRECT ztab_interiorT_ne_m_haloTin => tab_interiorT_ne(level,m)%haloTin - !$acc parallel loop collapse(3) + !$acc kernels +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent (ii=1:a_n,ij=1:a_n,ik=1:nz+2) ztab_interiorT_ne_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1) end do + !$acc end kernels !$acc host_data use_device(ztab_interiorT_ne_m_haloTin) call mpi_send(ztab_interiorT_ne_m_haloTin,size(ztab_interiorT_ne_m_haloTin), & MPI_DOUBLE_PRECISION,dest_rank,send_tag,MPI_COMM_HORIZ,ierr) @@ -1770,10 +1824,14 @@ contains if (LUseT) then #ifdef MNH_GPUDIRECT ztab_interiorT_sw_m_haloTin => tab_interiorT_sw(level,m)%haloTin - !$acc parallel loop collapse(3) + !$acc kernels +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent (ii=1:a_n,ij=1:a_n,ik=1:nz+2) ztab_interiorT_sw_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1) end do + !$acc end kernels !$acc host_data use_device(ztab_interiorT_sw_m_haloTin) call mpi_send(ztab_interiorT_sw_m_haloTin,size(ztab_interiorT_sw_m_haloTin), & MPI_DOUBLE_PRECISION,dest_rank,send_tag,MPI_COMM_HORIZ,ierr) @@ -1802,10 +1860,14 @@ contains if (LUseT) then #ifdef MNH_GPUDIRECT ztab_interiorT_se_m_haloTin => tab_interiorT_se(level,m)%haloTin - !$acc parallel loop collapse(3) + !$acc kernels +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent (ii=1:a_n,ij=1:a_n,ik=1:nz+2) ztab_interiorT_se_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1) end do + !$acc end kernels !$acc host_data use_device(ztab_interiorT_se_m_haloTin) call mpi_send(ztab_interiorT_se_m_haloTin,size(ztab_interiorT_se_m_haloTin), & 
MPI_DOUBLE_PRECISION,dest_rank,send_tag,MPI_COMM_HORIZ,ierr) @@ -1909,10 +1971,14 @@ contains if (LUseT) then #ifdef MNH_GPUDIRECT ztab_sub_interiorT_ne_m_1_haloTin => tab_sub_interiorT_ne(level,m-1)%haloTin - !$acc parallel loop collapse(3) + !$acc kernels +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent (ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2) ztab_sub_interiorT_ne_m_1_haloTin(ii,ij,ik) = za_st(ii+a_n/2,ij,ik-1) end do + !$acc end kernels !$acc host_data use_device(ztab_sub_interiorT_ne_m_1_haloTin) call mpi_isend(ztab_sub_interiorT_ne_m_1_haloTin,size(ztab_sub_interiorT_ne_m_1_haloTin), & MPI_DOUBLE_PRECISION,dest_rank, send_tag, & @@ -1939,10 +2005,14 @@ contains if (LUseT) then #ifdef MNH_GPUDIRECT ztab_sub_interiorT_sw_m_1_haloTin => tab_sub_interiorT_sw(level,m-1)%haloTin - !$acc parallel loop collapse(3) + !$acc kernels +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent (ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2) ztab_sub_interiorT_sw_m_1_haloTin(ii,ij,ik) = za_st(ii,ij+a_n/2,ik-1) end do + !$acc end kernels !$acc host_data use_device(ztab_sub_interiorT_sw_m_1_haloTin) call mpi_isend(ztab_sub_interiorT_sw_m_1_haloTin,size(ztab_sub_interiorT_sw_m_1_haloTin), & MPI_DOUBLE_PRECISION, dest_rank, send_tag, & @@ -1970,10 +2040,14 @@ contains if (LUseT) then #ifdef MNH_GPUDIRECT ztab_sub_interiorT_se_m_1_haloTin => tab_sub_interiorT_se(level,m-1)%haloTin - !$acc parallel loop collapse(3) + !$acc kernels +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent (ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2) ztab_sub_interiorT_se_m_1_haloTin(ii,ij,ik) = za_st(ii+a_n/2,ij+a_n/2,ik-1) end do + !$acc end kernels !$acc host_data use_device(ztab_sub_interiorT_se_m_1_haloTin) call mpi_isend(ztab_sub_interiorT_se_m_1_haloTin,size(ztab_sub_interiorT_se_m_1_haloTin), & MPI_DOUBLE_PRECISION, dest_rank, send_tag, & @@ -1994,10 +2068,14 @@ contains #ifdef MNH_GPUDIRECT zb_st => b%st za_st => a%st - !$acc 
parallel loop collapse(3) + !$acc kernels +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent (ii=1:b_n,ij=1:b_n,ik=1:nz+2) zb_st(ii,ij,ik-1) = za_st(ii,ij,ik-1) end do + !$acc end kernels #else b%st(1:b_n,1:b_n,0:nz+1) = a%st(1:b_n,1:b_n,0:nz+1) #endif @@ -2026,10 +2104,14 @@ contains call mpi_recv(ztab_interiorT_ne_m_haloTout,size(ztab_interiorT_ne_m_haloTout), & MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) !$acc end host_data - !$acc parallel loop collapse(3) + !$acc kernels +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent (ii=1:b_n,ij=1:b_n,ik=1:nz+2) zb_st(ii,ij,ik-1) = ztab_interiorT_ne_m_haloTout(ii,ij,ik) end do + !$acc end kernels #else call mpi_recv(b%st(1,1,0),1,interiorT(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) #endif @@ -2058,10 +2140,14 @@ contains call mpi_recv(ztab_interiorT_sw_m_haloTout,size(ztab_interiorT_sw_m_haloTout), & MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) !$acc end host_data - !$acc parallel loop collapse(3) + !$acc kernels +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent (ii=1:b_n,ij=1:b_n,ik=1:nz+2) zb_st(ii,ij,ik-1) = ztab_interiorT_sw_m_haloTout(ii,ij,ik) end do + !$acc end kernels #else call mpi_recv(b%st(1,1,0),1,interiorT(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) #endif @@ -2090,10 +2176,14 @@ contains call mpi_recv(ztab_interiorT_se_m_haloTout,size(ztab_interiorT_se_m_haloTout), & MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) !$acc end host_data - !$acc parallel loop collapse(3) + !$acc kernels +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif do concurrent (ii=1:b_n,ij=1:b_n,ik=1:nz+2) zb_st(ii,ij,ik-1) = ztab_interiorT_se_m_haloTout(ii,ij,ik) - end do + end do + !$acc end kernels #else call mpi_recv(b%st(1,1,0),1,interiorT(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) #endif -- GitLab