From 87c311f9cd766128d23643f12b91e40ae4a1f9b8 Mon Sep 17 00:00:00 2001
From: Juan ESCOBAR <juan.escobar@aero.obs-mip.fr>
Date: Tue, 15 Mar 2022 18:13:22 +0100
Subject: [PATCH] Juan 15/03/2022:tensor/communication.f90, Cray OpenACC
 optimization, discard collapse + do concurrent

---
 .../communication.f90                         | 138 +++++++++++++++---
 1 file changed, 114 insertions(+), 24 deletions(-)

diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90
index 6ec3b197d..6d5c71c75 100644
--- a/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90
+++ b/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90
@@ -561,7 +561,9 @@ contains
            za_st => a%st
            zb_st => b%st
            !$acc kernels
+#ifdef MNH_COMPILER_NVHPC           
            !$acc loop collapse(3)
+#endif
            do iz=0,nz+1
               do iy=icompy_min,icompy_max
                  do ix=icompx_min,icompx_max
@@ -1133,34 +1135,50 @@ contains
            ! Send to south
            if (Gneighbour_s) then
            ztab_halo_st_haloTin => tab_halo_st(level,m)%haloTin
-           !$acc parallel loop collapse(3) async(IS_SOUTH)
+           !$acc kernels async(IS_SOUTH)
+#ifdef MNH_COMPILER_NVHPC            
+           !$acc loop independent collapse(3)
+#endif           
            do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 )
               ztab_halo_st_haloTin(ii,ij,ik) = zst(ii,ij+a_n-halo_size,ik-1)
            end do
+           !$acc end kernels
            end if
            ! Send to north
            if (Gneighbour_n) then
            ztab_halo_nt_haloTin => tab_halo_nt(level,m)%haloTin
-           !$acc parallel loop collapse(3) async(IS_NORTH) 
+           !$acc kernels async(IS_NORTH)
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif           
            do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 )           
               ztab_halo_nt_haloTin(ii,ij,ik) = zst(ii,ij,ik-1)
            end do
+           !$acc end kernels
            end if
            ! Send to east
            if (Gneighbour_e) then
            ztab_halo_et_haloTin => tab_halo_et(level,m)%haloTin
-           !$acc parallel loop collapse(3) async(IS_EAST)
+           !$acc kernels async(IS_EAST)
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif           
            do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) 
               ztab_halo_et_haloTin(ii,ij,ik) = zst(ii+a_n-halo_size,ij-halo_size,ik-1)
            end do
+           !$acc end kernels
            end if
            ! Send to west
            if (Gneighbour_w) then
            ztab_halo_wt_haloTin => tab_halo_wt(level,m)%haloTin
-           !$acc parallel loop collapse(3) async(IS_WEST)
+           !$acc kernels async(IS_WEST)
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif           
            do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) 
               ztab_halo_wt_haloTin(ii,ij,ik) = zst(ii,ij-halo_size,ik-1)
            end do
+           !$acc end kernels
            end if
         end if
 #endif
@@ -1368,31 +1386,47 @@ contains
         if (LUseT) then
            if (Gneighbour_n) then
            ! copy north halo for GPU managed
-           !$acc parallel loop collapse(3) async(IS_NORTH)
+           !$acc kernels async(IS_NORTH)
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif              
            do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 )
               zst(ii,ij-halo_size,ik-1) = ztab_halo_nt_haloTout(ii,ij,ik)
            end do
+           !$acc end kernels
            end if
            if (Gneighbour_s) then
            ! copy south halo for GPU managed
-           !$acc parallel loop collapse(3) async(IS_SOUTH)
+           !$acc kernels async(IS_SOUTH)
+#ifdef MNH_COMPILER_NVHPC              
+           !$acc loop independent collapse(3)   
+#endif              
            do concurrent ( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 )
               zst(ii,ij+a_n,ik-1) = ztab_halo_st_haloTout(ii,ij,ik)
-           end do          
+           end do
+           !$acc end kernels
            end if
            if (Gneighbour_w) then
            ! copy west halo for GPU managed
-           !$acc parallel loop collapse(3) async(IS_WEST)
+           !$acc kernels async(IS_WEST)
+#ifdef MNH_COMPILER_NVHPC              
+           !$acc loop independent collapse(3)
+#endif              
            do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 )
               zst(ii-halo_size,ij-halo_size,ik-1) = ztab_halo_wt_haloTout(ii,ij,ik)
            end do
+           !$acc end kernels
            end if
            if (Gneighbour_e) then
            ! copy east halo for GPU managed
-           !$acc parallel loop collapse(3) async(IS_EAST)
+           !$acc kernels async(IS_EAST)
+#ifdef MNH_COMPILER_NVHPC
+           !$acc loop independent collapse(3)
+#endif              
            do concurrent ( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 )
               zst(ii+a_n,ij-halo_size,ik-1) = ztab_halo_et_haloTout(ii,ij,ik)
            end do
+           !$acc end kernels           
            end if 
            ! wait for async copy of send buffer to GPU
            call acc_wait_haloswap_mnh()           
@@ -1690,10 +1724,14 @@ contains
 #ifdef MNH_GPUDIRECT           
            zb_st => b%st
            za_st => a%st
-           !$acc parallel loop collapse(3)
+           !$acc kernels
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif           
            do concurrent (ii=1:a_n,ij=1:a_n,ik=1:nz+2)
               zb_st(ii,ij,ik-1) = za_st(ii,ij,ik-1)
            end do
+           !$acc end kernels
 #else           
            b%st(1:a_n,1:a_n,0:nz+1) = a%st(1:a_n,1:a_n,0:nz+1)
 #endif
@@ -1706,20 +1744,32 @@ contains
            zb_st => b%st
            ! copy from buffer for GPU DIRECT
            ! Receive from NE
-           !$acc parallel loop collapse(3)
+           !$acc kernels
+#ifdef MNH_COMPILER_NVHPC
+           !$acc loop independent collapse(3)
+#endif
            do concurrent (ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2)
               zb_st(ii+b_n/2,ij,ik-1) = ztab_sub_interiorT_ne_m_1_haloTout(ii,ij,ik)
            end do
+           !$acc end kernels           
            ! Receive from SW
-           !$acc parallel loop collapse(3)
+           !$acc kernels
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif
            do concurrent (ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2)
               zb_st(ii,ij+b_n/2,ik-1) = ztab_sub_interiorT_sw_m_1_haloTout(ii,ij,ik)
            end do
+           !$acc end kernels
            ! Receive from SE
-           !$acc parallel loop collapse(3)
+           !$acc kernels
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif           
             do concurrent (ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2)
                zb_st(ii+b_n/2,ij+b_n/2,ik-1) = ztab_sub_interiorT_se_m_1_haloTout(ii,ij,ik)
             end do
+            !$acc end kernels
         end if
 #endif
       end if
@@ -1738,10 +1788,14 @@ contains
         if (LUseT) then
 #ifdef MNH_GPUDIRECT
            ztab_interiorT_ne_m_haloTin => tab_interiorT_ne(level,m)%haloTin
-           !$acc parallel loop collapse(3)
+           !$acc kernels
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif
            do concurrent (ii=1:a_n,ij=1:a_n,ik=1:nz+2)
               ztab_interiorT_ne_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1)
            end do
+           !$acc end kernels           
            !$acc host_data use_device(ztab_interiorT_ne_m_haloTin)
            call mpi_send(ztab_interiorT_ne_m_haloTin,size(ztab_interiorT_ne_m_haloTin), &
                 MPI_DOUBLE_PRECISION,dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
@@ -1770,10 +1824,14 @@ contains
         if (LUseT) then
 #ifdef MNH_GPUDIRECT
            ztab_interiorT_sw_m_haloTin => tab_interiorT_sw(level,m)%haloTin
-           !$acc parallel loop collapse(3)
+           !$acc kernels
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif
            do concurrent (ii=1:a_n,ij=1:a_n,ik=1:nz+2)
               ztab_interiorT_sw_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1)
            end do
+           !$acc end kernels           
            !$acc host_data use_device(ztab_interiorT_sw_m_haloTin)
            call mpi_send(ztab_interiorT_sw_m_haloTin,size(ztab_interiorT_sw_m_haloTin), &
                 MPI_DOUBLE_PRECISION,dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
@@ -1802,10 +1860,14 @@ contains
         if (LUseT) then
 #ifdef MNH_GPUDIRECT
            ztab_interiorT_se_m_haloTin => tab_interiorT_se(level,m)%haloTin
-           !$acc parallel loop collapse(3)
+           !$acc kernels
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif
            do concurrent (ii=1:a_n,ij=1:a_n,ik=1:nz+2)
               ztab_interiorT_se_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1)
            end do
+           !$acc end kernels
            !$acc host_data use_device(ztab_interiorT_se_m_haloTin)
            call mpi_send(ztab_interiorT_se_m_haloTin,size(ztab_interiorT_se_m_haloTin), &
                 MPI_DOUBLE_PRECISION,dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
@@ -1909,10 +1971,14 @@ contains
         if (LUseT) then
 #ifdef MNH_GPUDIRECT
            ztab_sub_interiorT_ne_m_1_haloTin => tab_sub_interiorT_ne(level,m-1)%haloTin
-           !$acc parallel loop collapse(3)
+           !$acc kernels
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif
            do concurrent (ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2)
               ztab_sub_interiorT_ne_m_1_haloTin(ii,ij,ik) = za_st(ii+a_n/2,ij,ik-1)
            end do
+           !$acc end kernels           
            !$acc host_data use_device(ztab_sub_interiorT_ne_m_1_haloTin)
            call mpi_isend(ztab_sub_interiorT_ne_m_1_haloTin,size(ztab_sub_interiorT_ne_m_1_haloTin), &
                 MPI_DOUBLE_PRECISION,dest_rank, send_tag, &
@@ -1939,10 +2005,14 @@ contains
         if (LUseT) then
 #ifdef MNH_GPUDIRECT
            ztab_sub_interiorT_sw_m_1_haloTin => tab_sub_interiorT_sw(level,m-1)%haloTin
-           !$acc parallel loop collapse(3)
+           !$acc kernels
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif           
            do concurrent (ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2)
               ztab_sub_interiorT_sw_m_1_haloTin(ii,ij,ik) = za_st(ii,ij+a_n/2,ik-1)
            end do
+           !$acc end kernels           
            !$acc host_data use_device(ztab_sub_interiorT_sw_m_1_haloTin)
            call mpi_isend(ztab_sub_interiorT_sw_m_1_haloTin,size(ztab_sub_interiorT_sw_m_1_haloTin), &
                 MPI_DOUBLE_PRECISION, dest_rank, send_tag, &
@@ -1970,10 +2040,14 @@ contains
         if (LUseT) then
 #ifdef MNH_GPUDIRECT
            ztab_sub_interiorT_se_m_1_haloTin => tab_sub_interiorT_se(level,m-1)%haloTin
-           !$acc parallel loop collapse(3)
+           !$acc kernels
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif           
            do concurrent (ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2)
               ztab_sub_interiorT_se_m_1_haloTin(ii,ij,ik) = za_st(ii+a_n/2,ij+a_n/2,ik-1)
            end do
+           !$acc end kernels           
            !$acc host_data use_device(ztab_sub_interiorT_se_m_1_haloTin)
            call mpi_isend(ztab_sub_interiorT_se_m_1_haloTin,size(ztab_sub_interiorT_se_m_1_haloTin), &
                 MPI_DOUBLE_PRECISION, dest_rank, send_tag, &
@@ -1994,10 +2068,14 @@ contains
 #ifdef MNH_GPUDIRECT                     
            zb_st => b%st
            za_st => a%st
-           !$acc parallel loop collapse(3)
+           !$acc kernels
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif           
            do concurrent (ii=1:b_n,ij=1:b_n,ik=1:nz+2)
               zb_st(ii,ij,ik-1) = za_st(ii,ij,ik-1)
            end do
+           !$acc end kernels
 #else
            b%st(1:b_n,1:b_n,0:nz+1) = a%st(1:b_n,1:b_n,0:nz+1)
 #endif
@@ -2026,10 +2104,14 @@ contains
            call mpi_recv(ztab_interiorT_ne_m_haloTout,size(ztab_interiorT_ne_m_haloTout), &
                 MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)
            !$acc end host_data
-           !$acc parallel loop collapse(3)
+           !$acc kernels
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif           
            do concurrent (ii=1:b_n,ij=1:b_n,ik=1:nz+2)
               zb_st(ii,ij,ik-1) = ztab_interiorT_ne_m_haloTout(ii,ij,ik)
            end do
+           !$acc end kernels           
 #else
            call mpi_recv(b%st(1,1,0),1,interiorT(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)           
 #endif
@@ -2058,10 +2140,14 @@ contains
            call mpi_recv(ztab_interiorT_sw_m_haloTout,size(ztab_interiorT_sw_m_haloTout), &           
                 MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)
            !$acc end host_data
-           !$acc parallel loop collapse(3)
+           !$acc kernels
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif           
            do concurrent (ii=1:b_n,ij=1:b_n,ik=1:nz+2)
               zb_st(ii,ij,ik-1) = ztab_interiorT_sw_m_haloTout(ii,ij,ik)
            end do
+           !$acc end kernels
 #else
            call mpi_recv(b%st(1,1,0),1,interiorT(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)           
 #endif
@@ -2090,10 +2176,14 @@ contains
            call mpi_recv(ztab_interiorT_se_m_haloTout,size(ztab_interiorT_se_m_haloTout), &
                 MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)
            !$acc end host_data
-           !$acc parallel loop collapse(3)
+           !$acc kernels
+#ifdef MNH_COMPILER_NVHPC           
+           !$acc loop independent collapse(3)
+#endif           
            do concurrent (ii=1:b_n,ij=1:b_n,ik=1:nz+2)
               zb_st(ii,ij,ik-1) = ztab_interiorT_se_m_haloTout(ii,ij,ik)
-           end do           
+           end do
+           !$acc end kernels
 #else
            call mpi_recv(b%st(1,1,0),1,interiorT(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)           
 #endif
-- 
GitLab