From 8987f40b445ce4e847058126a38ce63495f2dc2a Mon Sep 17 00:00:00 2001
From: Juan ESCOBAR <juan.escobar@aero.obs-mip.fr>
Date: Fri, 15 Apr 2022 19:02:35 +0200
Subject: [PATCH] Juan 15/04/2022:MNH/ZSOLVER/turb_hor_dyn_corr.f90, Cray GPU
 Opt/Bug bypass, add present_cr + acc_nv, where needed

---
 src/MNH/turb_hor_dyn_corr.f90     | 222 ++++++++++++++++++++----------
 src/ZSOLVER/turb_hor_dyn_corr.f90 | 144 +++++--------------
 2 files changed, 181 insertions(+), 185 deletions(-)

diff --git a/src/MNH/turb_hor_dyn_corr.f90 b/src/MNH/turb_hor_dyn_corr.f90
index 902dd306b..8d5717d5c 100644
--- a/src/MNH/turb_hor_dyn_corr.f90
+++ b/src/MNH/turb_hor_dyn_corr.f90
@@ -456,7 +456,10 @@ ELSE
 END IF
 !
 !$acc kernels async(2)
-ZFLX(:,:,IKE+1) = ZFLX(:,:,IKE) 
+!$acc_nv loop independent collapse(2) 
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZFLX(JI,JJ,IKE+1) = ZFLX(JI,JJ,IKE) 
+ENDDO
 !$acc end kernels
 !
 !* prescription of du/dz and dv/dz with uncentered gradient at the surface
@@ -470,12 +473,15 @@ ZDZZ(:,:,:) = MXM(PDZZ(:,:,IKB:IKB+2))
 CALL MXM_DEVICE(PDZZ(:,:,IKB:IKB+2),ZDZZ(:,:,:))
 #endif
 !$acc kernels async(3)
-ZCOEFF(:,:,IKB+2)= - ZDZZ(:,:,2) /      &
-       ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,3) )
-ZCOEFF(:,:,IKB+1)=   (ZDZZ(:,:,3)+ZDZZ(:,:,2)) /      &
-       ( ZDZZ(:,:,2) * ZDZZ(:,:,3) )
-ZCOEFF(:,:,IKB)= - (ZDZZ(:,:,3)+2.*ZDZZ(:,:,2)) /      &
-       ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,2) )
+!$acc_nv loop independent collapse(2) 
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZCOEFF(JI,JJ,IKB+2)= - ZDZZ(JI,JJ,2) /      &
+        ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,3) )
+   ZCOEFF(JI,JJ,IKB+1)=   (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) /      &
+        ( ZDZZ(JI,JJ,2) * ZDZZ(JI,JJ,3) )
+   ZCOEFF(JI,JJ,IKB)= - (ZDZZ(JI,JJ,3)+2.*ZDZZ(JI,JJ,2)) /      &
+        ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,2) )
+ENDDO
 !$acc end kernels
 !
 #ifndef MNH_OPENACC
@@ -488,10 +494,13 @@ ZDU_DZ_DZS_DX(:,:,:)=MXF ((ZCOEFF(:,:,IKB+2:IKB+2)*PUM(:,:,IKB+2:IKB+2)       &
 ZDZZ(:,:,:) = MYM(PDZZ(:,:,IKB:IKB+2))
 #else
 !$acc kernels async(3)
-ZTMP1_DEVICE(:,:,1) = (ZCOEFF(:,:,IKB+2)*PUM(:,:,IKB+2)       &
-                          +ZCOEFF(:,:,IKB+1)*PUM(:,:,IKB+1)       &
-                          +ZCOEFF(:,:,IKB )*PUM(:,:,IKB)       &
-                          )* 0.5 * ( PDZX(:,:,IKB+1)+PDZX(:,:,IKB))
+!$acc_nv loop independent collapse(2) 
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZTMP1_DEVICE(JI,JJ,1) = (ZCOEFF(JI,JJ,IKB+2)*PUM(JI,JJ,IKB+2)       &
+                          +ZCOEFF(JI,JJ,IKB+1)*PUM(JI,JJ,IKB+1)       &
+                          +ZCOEFF(JI,JJ,IKB)*PUM(JI,JJ,IKB)       &
+                          )* 0.5 * ( PDZX(JI,JJ,IKB+1)+PDZX(JI,JJ,IKB))
+ENDDO
 !$acc end kernels
 !
 !!! wait for the computation of ZCOEFF and ZTMP1_DEVICE
@@ -500,18 +509,24 @@ ZTMP1_DEVICE(:,:,1) = (ZCOEFF(:,:,IKB+2)*PUM(:,:,IKB+2)       &
 CALL MXF_DEVICE(ZTMP1_DEVICE(:,:,1:1), ZTMP2_DEVICE(:,:,1:1))
 CALL MXF_DEVICE(PDXX(:,:,IKB:IKB), ZTMP1_DEVICE(:,:,1:1))
 !$acc kernels async(3)
-ZDU_DZ_DZS_DX(:,:,1) = ZTMP2_DEVICE(:,:,1) / ZTMP1_DEVICE(:,:,1)
+!$acc_nv loop independent collapse(2) 
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZDU_DZ_DZS_DX(JI,JJ,1) = ZTMP2_DEVICE(JI,JJ,1) / ZTMP1_DEVICE(JI,JJ,1)
+ENDDO
 !$acc end kernels
 !
 CALL MYM_DEVICE(PDZZ(:,:,IKB:IKB+2),ZDZZ(:,:,:))
 #endif
 !$acc kernels async(4)
-ZCOEFF(:,:,IKB+2)= - ZDZZ(:,:,2) /      &
-       ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,3) )
-ZCOEFF(:,:,IKB+1)=   (ZDZZ(:,:,3)+ZDZZ(:,:,2)) /      &
-       ( ZDZZ(:,:,2) * ZDZZ(:,:,3) )
-ZCOEFF(:,:,IKB)= - (ZDZZ(:,:,3)+2.*ZDZZ(:,:,2)) /      &
-       ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,2) )
+!$acc_nv loop independent collapse(2) 
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZCOEFF(JI,JJ,IKB+2)= - ZDZZ(JI,JJ,2) /      &
+        ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,3) )
+   ZCOEFF(JI,JJ,IKB+1)=   (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) /      &
+        ( ZDZZ(JI,JJ,2) * ZDZZ(JI,JJ,3) )
+   ZCOEFF(JI,JJ,IKB)= - (ZDZZ(JI,JJ,3)+2.*ZDZZ(JI,JJ,2)) /      &
+        ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,2) )
+ENDDO
 !$acc end kernels
 !
 #ifndef MNH_OPENACC
@@ -522,10 +537,13 @@ ZDV_DZ_DZS_DY(:,:,:)=MYF ((ZCOEFF(:,:,IKB+2:IKB+2)*PVM(:,:,IKB+2:IKB+2)       &
                          )/ MYF(PDYY(:,:,IKB:IKB))
 #else
 !$acc kernels async(4)
-ZTMP3_DEVICE(:,:,1) = (ZCOEFF(:,:,IKB+2)*PVM(:,:,IKB+2)       &
-                          +ZCOEFF(:,:,IKB+1)*PVM(:,:,IKB+1)       &
-                          +ZCOEFF(:,:,IKB)*PVM(:,:,IKB)       &
-                          )* 0.5 * ( PDZY(:,:,IKB+1)+PDZY(:,:,IKB))
+!$acc_nv loop independent collapse(2) 
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZTMP3_DEVICE(JI,JJ,1) = (ZCOEFF(JI,JJ,IKB+2)*PVM(JI,JJ,IKB+2)       &
+                          +ZCOEFF(JI,JJ,IKB+1)*PVM(JI,JJ,IKB+1)       &
+                          +ZCOEFF(JI,JJ,IKB)*PVM(JI,JJ,IKB)       &
+                          )* 0.5 * ( PDZY(JI,JJ,IKB+1)+PDZY(JI,JJ,IKB))
+ENDDO
 !$acc end kernels
 !
 !!! wait for the computation of ZCOEFF and ZTMP3_DEVICE
@@ -541,7 +559,7 @@ ZDV_DY(:,:,:)=  DYF(PVM(:,:,IKB:IKB)) / MYF(PDYY(:,:,IKB:IKB)) &
 #else
 CALL MYF_DEVICE(ZTMP3_DEVICE(:,:,1:1), ZTMP4_DEVICE(:,:,1:1))
 CALL MYF_DEVICE(PDYY(:,:,IKB:IKB), ZTMP3_DEVICE(:,:,1:1))
-!$acc kernels async(4)
+!$acc kernels async(4) present_cr(ZDV_DZ_DZS_DY) 
 ZDV_DZ_DZS_DY(:,:,1)= ZTMP4_DEVICE(:,:,1) / ZTMP3_DEVICE(:,:,1)
 !$acc end kernels
 !
@@ -552,7 +570,10 @@ ZDV_DZ_DZS_DY(:,:,1)= ZTMP4_DEVICE(:,:,1) / ZTMP3_DEVICE(:,:,1)
 CALL DXF_DEVICE(PUM(:,:,IKB:IKB),ZTMP1_DEVICE(:,:,1:1))
 CALL MXF_DEVICE(PDXX(:,:,IKB:IKB),ZTMP2_DEVICE(:,:,1:1))
 !$acc kernels async(3)
-ZDU_DX(:,:,1)=  ZTMP1_DEVICE(:,:,1) / ZTMP2_DEVICE(:,:,1) - ZDU_DZ_DZS_DX(:,:,1)
+!$acc_nv loop independent collapse(2) 
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZDU_DX(JI,JJ,1)=  ZTMP1_DEVICE(JI,JJ,1) / ZTMP2_DEVICE(JI,JJ,1) - ZDU_DZ_DZS_DX(JI,JJ,1)
+ENDDO
 !$acc end kernels
 
 !!! wait for the computation of ZDV_DZ_DZS_DY
@@ -560,8 +581,11 @@ ZDU_DX(:,:,1)=  ZTMP1_DEVICE(:,:,1) / ZTMP2_DEVICE(:,:,1) - ZDU_DZ_DZS_DX(:,:,1)
 !
 CALL DYF_DEVICE(PVM(:,:,IKB:IKB),ZTMP3_DEVICE(:,:,1:1))
 CALL MYF_DEVICE(PDYY(:,:,IKB:IKB),ZTMP4_DEVICE(:,:,1:1))
-!$acc kernels! async(4)
-ZDV_DY(:,:,1)=  ZTMP3_DEVICE(:,:,1) / ZTMP4_DEVICE(:,:,1) - ZDV_DZ_DZS_DY(:,:,1)
+!$acc kernels async(4)
+!$acc_nv loop independent collapse(2) 
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZDV_DY(JI,JJ,1)=  ZTMP3_DEVICE(JI,JJ,1) / ZTMP4_DEVICE(JI,JJ,1) - ZDV_DZ_DZS_DY(JI,JJ,1)
+ENDDO
 !$acc end kernels
 !
 !
@@ -570,7 +594,10 @@ ZDV_DY(:,:,1)=  ZTMP3_DEVICE(:,:,1) / ZTMP4_DEVICE(:,:,1) - ZDV_DZ_DZS_DY(:,:,1)
 #endif
 !
 !$acc kernels async(4)
-ZDW_DZ(:,:,:)=-ZDU_DX(:,:,:)-ZDV_DY(:,:,:)
+!$acc_nv loop independent collapse(2) 
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZDW_DZ(JI,JJ,1)=-ZDU_DX(JI,JJ,1)-ZDV_DY(JI,JJ,1)
+ENDDO
 !$acc end kernels
 !
 !* computation 
@@ -586,11 +613,13 @@ ZDW_DZ(:,:,:)=-ZDU_DX(:,:,:)-ZDV_DY(:,:,:)
 ! du coup je ne peux pas faire de update self asynchrone...
 !
 !$acc kernels async(3)
-ZFLX(:,:,IKB)   = (2./3.) * PTKEM(:,:,IKB)                           &
-  - XCMFS * PK(:,:,IKB) * 2. * ZDU_DX(:,:,1)
+!$acc_nv loop independent collapse(2) 
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZFLX(JI,JJ,IKB)   = (2./3.) * PTKEM(JI,JJ,IKB)                           &
+        - XCMFS * PK(JI,JJ,IKB) * 2. * ZDU_DX(JI,JJ,1)
+ENDDO
 !$acc end kernels
 
-
 !!  &  to be tested later
 !! + XCMFB * PLM(:,:,IKB:IKB) /SQRT(PTKEM(:,:,IKB:IKB)) *        &
 !!   (-2./3.) * PTP(:,:,IKB:IKB)
@@ -601,7 +630,7 @@ ZFLX(:,:,IKB)   = (2./3.) * PTKEM(:,:,IKB)                           &
 !!! wait for the computation of ZDIRSINZW
 !$acc wait(1)
 !
-!$acc kernels async(4)
+!$acc kernels async(4) present_cr(ZFLX)
 #ifndef MNH_BITREP
 ZFLX(:,:,IKB-1) =                                                            &
         PTAU11M(:,:) * PCOSSLOPE(:,:)**2 * PDIRCOSZW(:,:)**2                 &
@@ -613,9 +642,7 @@ ZFLX(:,:,IKB-1) =                                                            &
     - PUSLOPEM(:,:) * PCOSSLOPE(:,:)**2 * ZDIRSINZW(:,:) * PDIRCOSZW(:,:)    )
 #else
 !PW: BUG: commented 'acc loop independent collapse(2)' to workaround compiler bug (NVHPC 21.1)
-#ifdef MNH_COMPILER_NVHPC
-!acc loop independent collapse(2)
-#endif
+!$acc_nv loop independent collapse(2)
 DO CONCURRENT ( JI=1:JIU,JJ=1:JJU )
 ZFLX(JI,JJ,IKB-1) =                                                             &
         PTAU11M(JI,JJ) * BR_P2(PCOSSLOPE(JI,JJ)) * BR_P2(PDIRCOSZW(JI,JJ))          &
@@ -633,7 +660,10 @@ END DO ! CONCURRENT
 !$acc wait(3) async(4)
 !
 !$acc kernels async(4)
-ZFLX(:,:,IKB-1) = 2. * ZFLX(:,:,IKB-1) -  ZFLX(:,:,IKB)
+!$acc_nv loop independent collapse(2) 
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZFLX(JI,JJ,IKB-1) = 2. * ZFLX(JI,JJ,IKB-1) -  ZFLX(JI,JJ,IKB)
+ENDDO
 !$acc end kernels
 !
 !
@@ -755,8 +785,11 @@ IF (KSPLT==1) THEN
   ! evaluate the dynamic production at w(IKB+1) in PDP(IKB)
   !
   !$acc kernels async(2)
-  ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDU_DX(:,:,1) + ZWORK(:,:,IKB+1) )
-  !$acc end kernels
+   !$acc_nv loop independent collapse(2) 
+   DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+      ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDU_DX(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) )
+   ENDDO
+   !$acc end kernels
   !
   !$acc kernels async(2)
   PDP(:,:,:) = PDP(:,:,:) + ZWORK(:,:,:)
@@ -777,7 +810,7 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
   !$acc wait(2)
   !
   !$acc kernels
-  ZTMP1_DEVICE = -ZWORK
+  ZTMP1_DEVICE(:,:,:) = -ZWORK(:,:,:)
   !$acc end kernels
   CALL LES_MEAN_SUBGRID( ZTMP1_DEVICE, X_LES_RES_ddxa_U_SBG_UaU , .TRUE.)
   !
@@ -811,11 +844,14 @@ IF (.NOT. L2D) THEN
   !!  + XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
   !
 ELSE
-  !$acc kernels async(3)
-  ZFLX(:,:,:)= (2./3.) * PTKEM                                  &
-    - XCMFS * PK *(-(2./3.) * ( GX_U_M_PUM                      &
-                               +GZ_W_M_PWM                ) )  
-  !$acc end kernels
+   !$acc kernels async(3)
+   !$acc_nv loop independent collapse(3)
+   DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+      ZFLX(JI,JJ,JK)= (2./3.) * PTKEM(JI,JJ,JK)                           &
+           - XCMFS * PK(JI,JJ,JK) *(-(2./3.) * ( GX_U_M_PUM(JI,JJ,JK)        &
+                                      +GZ_W_M_PWM(JI,JJ,JK)     ) )  
+   ENDDO
+   !$acc end kernels
   !! &  to be tested
   !!  + XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
   !
@@ -830,8 +866,11 @@ ZFLX(:,:,IKE+1) = ZFLX(:,:,IKE)
 ! ! !$acc update self(ZFLX(:,:,IKB+1:)) async(10)
 !
 !$acc kernels async(3)
-ZFLX(:,:,IKB)   = (2./3.) * PTKEM(:,:,IKB)                           &
-  - XCMFS * PK(:,:,IKB) * 2. * ZDV_DY(:,:,1)
+!$acc_nv loop independent collapse(2) 
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZFLX(JI,JJ,IKB)   = (2./3.) * PTKEM(JI,JJ,IKB)                           &
+        - XCMFS * PK(JI,JJ,IKB) * 2. * ZDV_DY(JI,JJ,1)
+ENDDO
 !$acc end kernels
 
 !!           & to be tested
@@ -839,7 +878,7 @@ ZFLX(:,:,IKB)   = (2./3.) * PTKEM(:,:,IKB)                           &
 !!   (-2./3.) * PTP(:,:,IKB:IKB)
 !
 ! extrapolates this flux under the ground with the surface flux
-!$acc kernels async(3)
+!$acc kernels async(3) present_cr(ZFLX) 
 #ifndef MNH_BITREP
 ZFLX(:,:,IKB-1) =                                                            &
         PTAU11M(:,:) * PSINSLOPE(:,:)**2 * PDIRCOSZW(:,:)**2                 &         
@@ -851,9 +890,7 @@ ZFLX(:,:,IKB-1) =                                                            &
     + PVSLOPEM(:,:) * PCOSSLOPE(:,:)    * PSINSLOPE(:,:) * ZDIRSINZW(:,:)    )
 #else
 !PW: BUG: commented 'acc loop independent collapse(2)' to workaround compiler bug (NVHPC 21.1)
-#ifdef MNH_COMPILER_NVHPC
-!acc loop independent collapse(2)
-#endif
+!$acc_nv loop independent collapse(2)
 DO CONCURRENT ( JI=1:JIU,JJ=1:JJU )
 ZFLX(JI,JJ,IKB-1) =                                                             &
         PTAU11M(JI,JJ) * BR_P2(PSINSLOPE(JI,JJ)) * BR_P2(PDIRCOSZW(JI,JJ))          &
@@ -999,9 +1036,12 @@ IF (KSPLT==1) THEN
   !
   ! evaluate the dynamic production at w(IKB+1) in PDP(IKB)
   !
-  !$acc kernels async(2)
-  ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDV_DY(:,:,1) + ZWORK(:,:,IKB+1) )
-  !$acc end kernels
+   !$acc kernels async(2)
+   !$acc_nv loop independent collapse(2)
+   DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+      ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDV_DY(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) )
+   ENDDO
+   !$acc end kernels
   !
   !$acc kernels async(2)
   PDP(:,:,:) = PDP(:,:,:) + ZWORK(:,:,:)
@@ -1022,7 +1062,10 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
   !$acc wait(2)
   !
   !$acc kernels
-  ZTMP1_DEVICE = -ZWORK
+  !$acc_nv loop independent collapse(3) 
+  DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+     ZTMP1_DEVICE(JI,JJ,JK) = -ZWORK(JI,JJ,JK)
+  ENDDO
   !$acc end kernels
   CALL LES_MEAN_SUBGRID( ZTMP1_DEVICE, X_LES_RES_ddxa_V_SBG_UaV , .TRUE.)
   !
@@ -1051,11 +1094,14 @@ IF (.NOT. L2D) THEN
   !!  &  to be tested
   !!    -2.* XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
 ELSE
-  !$acc kernels async(2)
-  ZFLX(:,:,:)= (2./3.) * PTKEM                                  &
-    - XCMFS * PK *( (4./3.) * GZ_W_M_PWM                        &
-                   -(2./3.) * ( GX_U_M_PUM                ) ) 
-  !$acc end kernels
+   !$acc kernels async(2)
+   !$acc_nv loop independent collapse(3) 
+   DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+      ZFLX(JI,JJ,JK)= (2./3.) * PTKEM(JI,JJ,JK)                           &
+           - XCMFS * PK(JI,JJ,JK) *( (4./3.) * GZ_W_M_PWM(JI,JJ,JK)          &
+           -(2./3.) * ( GX_U_M_PUM(JI,JJ,JK)           ) ) 
+   ENDDO
+   !$acc end kernels
   !!  &  to be tested
   !!    -2.* XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
 END IF
@@ -1069,8 +1115,12 @@ ZFLX(:,:,IKE+1)= ZFLX(:,:,IKE)
 !
 !
 !$acc kernels async(2)
-ZFLX(:,:,IKB)   = (2./3.) * PTKEM(:,:,IKB)                           &
-  - XCMFS * PK(:,:,IKB) * 2. * ZDW_DZ(:,:,1)
+!$acc_nv loop independent collapse(2) 
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZFLX(JI,JJ,IKB)   = (2./3.) * PTKEM(JI,JJ,IKB)                           &
+        - XCMFS * PK(JI,JJ,IKB) * 2. * ZDW_DZ(JI,JJ,1)
+ENDDO
+
 !$acc end kernels
 !
 
@@ -1078,7 +1128,7 @@ ZFLX(:,:,IKB)   = (2./3.) * PTKEM(:,:,IKB)                           &
 !   - 2.* XCMFB * PLM(:,:,IKB:IKB) /SQRT(PTKEM(:,:,IKB:IKB)) *             &
 !  (-2./3.) * PTP(:,:,IKB:IKB)
 ! extrapolates this flux under the ground with the surface flux
-!$acc kernels async(3)
+!$acc kernels async(3) present_cr(ZFLX) 
 #ifndef MNH_BITREP
 ZFLX(:,:,IKB-1) = &    
         PTAU11M(:,:) * ZDIRSINZW(:,:)**2                                &
@@ -1086,9 +1136,7 @@ ZFLX(:,:,IKB-1) = &
   +2. * PCDUEFF(:,:)* PUSLOPEM(:,:)  * ZDIRSINZW(:,:) * PDIRCOSZW(:,:)
 #else
 !PW: BUG: commented 'acc loop independent collapse(2)' to workaround compiler bug (NVHPC 21.1)
-#ifdef MNH_COMPILER_NVHPC
-!acc loop independent collapse(2)
-#endif
+!$acc_nv loop independent collapse(2)
 DO CONCURRENT ( JI=1:JIU,JJ=1:JJU )        
 ZFLX(JI,JJ,IKB-1) = &
         PTAU11M(JI,JJ) * BR_P2(ZDIRSINZW(JI,JJ))                                &
@@ -1184,9 +1232,12 @@ IF (KSPLT==1) THEN
   !
   ! evaluate the dynamic production at w(IKB+1) in PDP(IKB)
   !
-  !$acc kernels async(2)
-  ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDW_DZ(:,:,1) + ZWORK(:,:,IKB+1) )
-  !$acc end kernels
+   !$acc kernels async(2)
+   !$acc_nv loop independent collapse(2)
+   DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+      ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDW_DZ(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) )
+   ENDDO
+   !$acc end kernels
   !
   !$acc kernels async(2)
   PDP(:,:,:) = PDP(:,:,:) + ZWORK(:,:,:)
@@ -1225,20 +1276,29 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
   !$acc wait(2)
   !
   !$acc kernels
-  ZTMP1_DEVICE = -ZWORK
+  !$acc_nv loop independent collapse(3) 
+  DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+     ZTMP1_DEVICE(JI,JJ,JK) = -ZWORK(JI,JJ,JK)
+  ENDDO
   !$acc end kernels
   CALL LES_MEAN_SUBGRID( ZTMP1_DEVICE, X_LES_RES_ddxa_W_SBG_UaW , .TRUE.)
   !
   CALL GZ_M_M_DEVICE(PTHLM,PDZZ,ZTMP1_DEVICE)
   !$acc kernels
-  ZTMP2_DEVICE = ZTMP1_DEVICE * ZFLX
+  !$acc_nv loop independent collapse(3) 
+  DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+     ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK) * ZFLX(JI,JJ,JK)
+  ENDDO
   !$acc end kernels
   CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, X_LES_RES_ddxa_Thl_SBG_UaW , .TRUE.)
   !
   CALL GZ_M_W_DEVICE(1,IKU,1,PTHLM,PDZZ,ZTMP1_DEVICE)
   CALL MZF_DEVICE( ZTMP1_DEVICE, ZTMP2_DEVICE )
   !$acc kernels
-  ZTMP3_DEVICE = ZFLX*ZTMP2_DEVICE
+  !$acc_nv loop independent collapse(3) 
+  DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+     ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK)
+  ENDDO
   !$acc end kernels
   CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE,X_LES_RES_ddz_Thl_SBG_W2)
   !
@@ -1249,14 +1309,20 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
     !
     CALL GZ_M_M_DEVICE(PRM(:,:,:,1),PDZZ,ZTMP1_DEVICE)
     !$acc kernels
-    ZTMP2_DEVICE = ZTMP1_DEVICE*ZFLX
+    !$acc_nv loop independent collapse(3) 
+    DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+       ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK)*ZFLX(JI,JJ,JK)
+    ENDDO
     !$acc end kernels
     CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, X_LES_RES_ddxa_Rt_SBG_UaW , .TRUE.)
     !
     CALL GZ_M_W_DEVICE(1,IKU,1,PRM(:,:,:,1),PDZZ,ZTMP1_DEVICE)
     CALL MZF_DEVICE( ZTMP1_DEVICE, ZTMP2_DEVICE )
     !$acc kernels
-    ZTMP3_DEVICE = ZFLX*ZTMP2_DEVICE
+    !$acc_nv loop independent collapse(3) 
+    DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+       ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK)
+    ENDDO
     !$acc end kernels
     CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE, X_LES_RES_ddz_Rt_SBG_W2)
     !
@@ -1268,15 +1334,21 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
     !
     CALL GZ_M_M_DEVICE(PSVM(:,:,:,JSV),PDZZ,ZTMP1_DEVICE)
     !$acc kernels
-    ZTMP2_DEVICE = ZTMP1_DEVICE*ZFLX
+    !$acc_nv loop independent collapse(3)
+    DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+       ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK)*ZFLX(JI,JJ,JK)
+    ENDDO
     !$acc end kernels
     CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, &
                            X_LES_RES_ddxa_Sv_SBG_UaW(:,:,:,JSV) , .TRUE.)
     !
     CALL GZ_M_W_DEVICE(1,IKU,1,PSVM(:,:,:,JSV),PDZZ,ZTMP1_DEVICE)
-    CALL MZF_DEVICE( ZTMP1_DEVICE,ZTMP2_DEVICE)
+    CALL MZF_DEVICE(ZTMP1_DEVICE,ZTMP2_DEVICE)
     !$acc kernels
-    ZTMP3_DEVICE = ZFLX*ZTMP2_DEVICE
+    !$acc_nv loop independent collapse(3)
+    DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+       ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK)
+    ENDDO
     !$acc end kernels
     CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE, X_LES_RES_ddz_Sv_SBG_W2(:,:,:,JSV))
     !
diff --git a/src/ZSOLVER/turb_hor_dyn_corr.f90 b/src/ZSOLVER/turb_hor_dyn_corr.f90
index 9ce23f59a..850e010d2 100644
--- a/src/ZSOLVER/turb_hor_dyn_corr.f90
+++ b/src/ZSOLVER/turb_hor_dyn_corr.f90
@@ -461,13 +461,10 @@ ELSE
 END IF
 !
 !$acc kernels async(2)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2) 
-#endif
+!$acc_nv loop independent collapse(2) 
 DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
    ZFLX(JI,JJ,IKE+1) = ZFLX(JI,JJ,IKE) 
 ENDDO
-
 !$acc end kernels
 !
 !* prescription of du/dz and dv/dz with uncentered gradient at the surface
@@ -481,9 +478,7 @@ ZDZZ(:,:,:) = MXM(PDZZ(:,:,IKB:IKB+2))
 CALL MXM_DEVICE(PDZZ(:,:,IKB:IKB+2),ZDZZ(:,:,:))
 #endif
 !$acc kernels async(3)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2) 
-#endif
+!$acc_nv loop independent collapse(2) 
 DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
    ZCOEFF(JI,JJ,IKB+2)= - ZDZZ(JI,JJ,2) /      &
         ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,3) )
@@ -493,7 +488,6 @@ DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
         ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,2) )
 ENDDO
 !$acc end kernels
-
 !
 #ifndef MNH_OPENACC
 ZDU_DZ_DZS_DX(:,:,:)=MXF ((ZCOEFF(:,:,IKB+2:IKB+2)*PUM(:,:,IKB+2:IKB+2)       &
@@ -505,9 +499,7 @@ ZDU_DZ_DZS_DX(:,:,:)=MXF ((ZCOEFF(:,:,IKB+2:IKB+2)*PUM(:,:,IKB+2:IKB+2)       &
 ZDZZ(:,:,:) = MYM(PDZZ(:,:,IKB:IKB+2))
 #else
 !$acc kernels async(3)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2) 
-#endif
+!$acc_nv loop independent collapse(2) 
 DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
    ZTMP1_DEVICE(JI,JJ,1) = (ZCOEFF(JI,JJ,IKB+2)*PUM(JI,JJ,IKB+2)       &
                           +ZCOEFF(JI,JJ,IKB+1)*PUM(JI,JJ,IKB+1)       &
@@ -522,9 +514,7 @@ ENDDO
 CALL MXF_DEVICE(ZTMP1_DEVICE(:,:,1:1), ZTMP2_DEVICE(:,:,1:1))
 CALL MXF_DEVICE(PDXX(:,:,IKB:IKB), ZTMP1_DEVICE(:,:,1:1))
 !$acc kernels async(3)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2) 
-#endif
+!$acc_nv loop independent collapse(2) 
 DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
    ZDU_DZ_DZS_DX(JI,JJ,1) = ZTMP2_DEVICE(JI,JJ,1) / ZTMP1_DEVICE(JI,JJ,1)
 ENDDO
@@ -533,9 +523,7 @@ ENDDO
 CALL MYM_DEVICE(PDZZ(:,:,IKB:IKB+2),ZDZZ(:,:,:))
 #endif
 !$acc kernels async(4)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2) 
-#endif
+!$acc_nv loop independent collapse(2) 
 DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
    ZCOEFF(JI,JJ,IKB+2)= - ZDZZ(JI,JJ,2) /      &
         ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,3) )
@@ -544,10 +532,8 @@ DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
    ZCOEFF(JI,JJ,IKB)= - (ZDZZ(JI,JJ,3)+2.*ZDZZ(JI,JJ,2)) /      &
         ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,2) )
 ENDDO
-
 !$acc end kernels
 !
-
 #ifndef MNH_OPENACC
 ZDV_DZ_DZS_DY(:,:,:)=MYF ((ZCOEFF(:,:,IKB+2:IKB+2)*PVM(:,:,IKB+2:IKB+2)       &
                           +ZCOEFF(:,:,IKB+1:IKB+1)*PVM(:,:,IKB+1:IKB+1)       &
@@ -556,9 +542,7 @@ ZDV_DZ_DZS_DY(:,:,:)=MYF ((ZCOEFF(:,:,IKB+2:IKB+2)*PVM(:,:,IKB+2:IKB+2)       &
                          )/ MYF(PDYY(:,:,IKB:IKB))
 #else
 !$acc kernels async(4)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2) 
-#endif
+!$acc_nv loop independent collapse(2) 
 DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
    ZTMP3_DEVICE(JI,JJ,1) = (ZCOEFF(JI,JJ,IKB+2)*PVM(JI,JJ,IKB+2)       &
                           +ZCOEFF(JI,JJ,IKB+1)*PVM(JI,JJ,IKB+1)       &
@@ -580,11 +564,7 @@ ZDV_DY(:,:,:)=  DYF(PVM(:,:,IKB:IKB)) / MYF(PDYY(:,:,IKB:IKB)) &
 #else
 CALL MYF_DEVICE(ZTMP3_DEVICE(:,:,1:1), ZTMP4_DEVICE(:,:,1:1))
 CALL MYF_DEVICE(PDYY(:,:,IKB:IKB), ZTMP3_DEVICE(:,:,1:1))
-#ifdef MNH_COMPILER_CCE
-!$acc kernels present(ZDV_DZ_DZS_DY) async(4)
-#else
-!$acc kernels async(4)
-#endif
+!$acc kernels async(4) present_cr(ZDV_DZ_DZS_DY) 
 ZDV_DZ_DZS_DY(:,:,1)= ZTMP4_DEVICE(:,:,1) / ZTMP3_DEVICE(:,:,1)
 !$acc end kernels
 !
@@ -595,9 +575,7 @@ ZDV_DZ_DZS_DY(:,:,1)= ZTMP4_DEVICE(:,:,1) / ZTMP3_DEVICE(:,:,1)
 CALL DXF_DEVICE(PUM(:,:,IKB:IKB),ZTMP1_DEVICE(:,:,1:1))
 CALL MXF_DEVICE(PDXX(:,:,IKB:IKB),ZTMP2_DEVICE(:,:,1:1))
 !$acc kernels async(3)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2) 
-#endif
+!$acc_nv loop independent collapse(2) 
 DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
    ZDU_DX(JI,JJ,1)=  ZTMP1_DEVICE(JI,JJ,1) / ZTMP2_DEVICE(JI,JJ,1) - ZDU_DZ_DZS_DX(JI,JJ,1)
 ENDDO
@@ -609,9 +587,7 @@ ENDDO
 CALL DYF_DEVICE(PVM(:,:,IKB:IKB),ZTMP3_DEVICE(:,:,1:1))
 CALL MYF_DEVICE(PDYY(:,:,IKB:IKB),ZTMP4_DEVICE(:,:,1:1))
 !$acc kernels async(4)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2) 
-#endif
+!$acc_nv loop independent collapse(2) 
 DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
    ZDV_DY(JI,JJ,1)=  ZTMP3_DEVICE(JI,JJ,1) / ZTMP4_DEVICE(JI,JJ,1) - ZDV_DZ_DZS_DY(JI,JJ,1)
 ENDDO
@@ -642,9 +618,7 @@ ENDDO
 ! du coup je ne peux pas faire de update self asynchrone...
 !
 !$acc kernels async(3)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2) 
-#endif
+!$acc_nv loop independent collapse(2) 
 DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
    ZFLX(JI,JJ,IKB)   = (2./3.) * PTKEM(JI,JJ,IKB)                           &
         - XCMFS * PK(JI,JJ,IKB) * 2. * ZDU_DX(JI,JJ,1)
@@ -661,11 +635,7 @@ ENDDO
 !!! wait for the computation of ZDIRSINZW
 !$acc wait(1)
 !
-#ifdef MNH_COMPILER_CCE
-!$acc kernels present(ZFLX) async(4)
-#else
-!$acc kernels async(4)
-#endif
+!$acc kernels async(4) present_cr(ZFLX)
 #ifndef MNH_BITREP
 ZFLX(:,:,IKB-1) =                                                            &
         PTAU11M(:,:) * PCOSSLOPE(:,:)**2 * PDIRCOSZW(:,:)**2                 &
@@ -677,9 +647,7 @@ ZFLX(:,:,IKB-1) =                                                            &
     - PUSLOPEM(:,:) * PCOSSLOPE(:,:)**2 * ZDIRSINZW(:,:) * PDIRCOSZW(:,:)    )
 #else
 !PW: BUG: commented 'acc loop independent collapse(2)' to workaround compiler bug (NVHPC 21.1)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2)
-#endif
+!$acc_nv loop independent collapse(2)
 DO CONCURRENT ( JI=1:JIU,JJ=1:JJU )
 ZFLX(JI,JJ,IKB-1) =                                                             &
         PTAU11M(JI,JJ) * BR_P2(PCOSSLOPE(JI,JJ)) * BR_P2(PDIRCOSZW(JI,JJ))          &
@@ -697,9 +665,7 @@ END DO ! CONCURRENT
 !$acc wait(3) async(4)
 !
 !$acc kernels async(4)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2) 
-#endif
+!$acc_nv loop independent collapse(2) 
 DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
    ZFLX(JI,JJ,IKB-1) = 2. * ZFLX(JI,JJ,IKB-1) -  ZFLX(JI,JJ,IKB)
 ENDDO
@@ -825,10 +791,8 @@ IF (KSPLT==1) THEN
   !
   ! evaluate the dynamic production at w(IKB+1) in PDP(IKB)
   !
-   !$acc kernels async(2)
-#ifdef MNH_COMPILER_NVHPC   
-   !$acc loop independent collapse(2) 
-#endif
+  !$acc kernels async(2)
+   !$acc_nv loop independent collapse(2) 
    DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
       ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDU_DX(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) )
    ENDDO
@@ -888,9 +852,7 @@ IF (.NOT. L2D) THEN
   !
 ELSE
    !$acc kernels async(3)
-#ifdef MNH_COMPILER_NVHPC
-   !$acc loop independent collapse(3)
-#endif   
+   !$acc_nv loop independent collapse(3)
    DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
       ZFLX(JI,JJ,JK)= (2./3.) * PTKEM(JI,JJ,JK)                           &
            - XCMFS * PK(JI,JJ,JK) *(-(2./3.) * ( GX_U_M_PUM(JI,JJ,JK)        &
@@ -911,9 +873,7 @@ ZFLX(:,:,IKE+1) = ZFLX(:,:,IKE)
 ! ! !$acc update self(ZFLX(:,:,IKB+1:)) async(10)
 !
 !$acc kernels async(3)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2) 
-#endif
+!$acc_nv loop independent collapse(2) 
 DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
    ZFLX(JI,JJ,IKB)   = (2./3.) * PTKEM(JI,JJ,IKB)                           &
         - XCMFS * PK(JI,JJ,IKB) * 2. * ZDV_DY(JI,JJ,1)
@@ -925,11 +885,7 @@ ENDDO
 !!   (-2./3.) * PTP(:,:,IKB:IKB)
 !
 ! extrapolates this flux under the ground with the surface flux
-#ifdef MNH_COMPILER_CCE
-!$acc kernels present(ZFLX) async(3)
-#else
-!$acc kernels async(3)
-#endif
+!$acc kernels async(3) present_cr(ZFLX) 
 #ifndef MNH_BITREP
 ZFLX(:,:,IKB-1) =                                                            &
         PTAU11M(:,:) * PSINSLOPE(:,:)**2 * PDIRCOSZW(:,:)**2                 &         
@@ -941,9 +897,7 @@ ZFLX(:,:,IKB-1) =                                                            &
     + PVSLOPEM(:,:) * PCOSSLOPE(:,:)    * PSINSLOPE(:,:) * ZDIRSINZW(:,:)    )
 #else
 !PW: BUG: commented 'acc loop independent collapse(2)' to workaround compiler bug (NVHPC 21.1)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2)
-#endif
+!$acc_nv loop independent collapse(2)
 DO CONCURRENT ( JI=1:JIU,JJ=1:JJU )
 ZFLX(JI,JJ,IKB-1) =                                                             &
         PTAU11M(JI,JJ) * BR_P2(PSINSLOPE(JI,JJ)) * BR_P2(PDIRCOSZW(JI,JJ))          &
@@ -1092,9 +1046,7 @@ IF (KSPLT==1) THEN
   ! evaluate the dynamic production at w(IKB+1) in PDP(IKB)
   !
    !$acc kernels async(2)
-#ifdef MNH_COMPILER_NVHPC   
-   !$acc loop independent collapse(2)
-#endif   
+   !$acc_nv loop independent collapse(2)
    DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
       ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDV_DY(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) )
    ENDDO
@@ -1119,9 +1071,7 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
   !$acc wait(2)
   !
   !$acc kernels
-#ifdef MNH_COMPILER_NVHPC  
-  !$acc loop independent collapse(3) 
-#endif
+  !$acc_nv loop independent collapse(3) 
   DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
      ZTMP1_DEVICE(JI,JJ,JK) = -ZWORK(JI,JJ,JK)
   ENDDO
@@ -1154,9 +1104,7 @@ IF (.NOT. L2D) THEN
   !!    -2.* XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
 ELSE
    !$acc kernels async(2)
-#ifdef MNH_COMPILER_NVHPC   
-   !$acc loop independent collapse(3) 
-#endif
+   !$acc_nv loop independent collapse(3) 
    DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
       ZFLX(JI,JJ,JK)= (2./3.) * PTKEM(JI,JJ,JK)                           &
            - XCMFS * PK(JI,JJ,JK) *( (4./3.) * GZ_W_M_PWM(JI,JJ,JK)          &
@@ -1176,9 +1124,7 @@ ZFLX(:,:,IKE+1)= ZFLX(:,:,IKE)
 !
 !
 !$acc kernels async(2)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2) 
-#endif
+!$acc_nv loop independent collapse(2) 
 DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
    ZFLX(JI,JJ,IKB)   = (2./3.) * PTKEM(JI,JJ,IKB)                           &
         - XCMFS * PK(JI,JJ,IKB) * 2. * ZDW_DZ(JI,JJ,1)
@@ -1191,11 +1137,7 @@ ENDDO
 !   - 2.* XCMFB * PLM(:,:,IKB:IKB) /SQRT(PTKEM(:,:,IKB:IKB)) *             &
 !  (-2./3.) * PTP(:,:,IKB:IKB)
 ! extrapolates this flux under the ground with the surface flux
-#ifdef MNH_COMPILER_CCE
-!$acc kernels present(ZFLX) async(3)
-#else
-!$acc kernels async(3)
-#endif
+!$acc kernels async(3) present_cr(ZFLX) 
 #ifndef MNH_BITREP
 ZFLX(:,:,IKB-1) = &    
         PTAU11M(:,:) * ZDIRSINZW(:,:)**2                                &
@@ -1203,9 +1145,7 @@ ZFLX(:,:,IKB-1) = &
   +2. * PCDUEFF(:,:)* PUSLOPEM(:,:)  * ZDIRSINZW(:,:) * PDIRCOSZW(:,:)
 #else
 !PW: BUG: commented 'acc loop independent collapse(2)' to workaround compiler bug (NVHPC 21.1)
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2)
-#endif
+!$acc_nv loop independent collapse(2)
 DO CONCURRENT ( JI=1:JIU,JJ=1:JJU )        
 ZFLX(JI,JJ,IKB-1) = &
         PTAU11M(JI,JJ) * BR_P2(ZDIRSINZW(JI,JJ))                                &
@@ -1302,9 +1242,7 @@ IF (KSPLT==1) THEN
   ! evaluate the dynamic production at w(IKB+1) in PDP(IKB)
   !
    !$acc kernels async(2)
-#ifdef MNH_COMPILER_NVHPC   
-   !$acc loop independent collapse(2)
-#endif   
+   !$acc_nv loop independent collapse(2)
    DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
       ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDW_DZ(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) )
    ENDDO
@@ -1347,9 +1285,7 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
   !$acc wait(2)
   !
   !$acc kernels
-#ifdef MNH_COMPILER_NVHPC  
-  !$acc loop independent collapse(3) 
-#endif
+  !$acc_nv loop independent collapse(3) 
   DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
      ZTMP1_DEVICE(JI,JJ,JK) = -ZWORK(JI,JJ,JK)
   ENDDO
@@ -1358,9 +1294,7 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
   !
   CALL GZ_M_M_DEVICE(PTHLM,PDZZ,ZTMP1_DEVICE)
   !$acc kernels
-#ifdef MNH_COMPILER_NVHPC  
-  !$acc loop independent collapse(3) 
-#endif
+  !$acc_nv loop independent collapse(3) 
   DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
      ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK) * ZFLX(JI,JJ,JK)
   ENDDO
@@ -1370,9 +1304,7 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
   CALL GZ_M_W_DEVICE(1,IKU,1,PTHLM,PDZZ,ZTMP1_DEVICE)
   CALL MZF_DEVICE( ZTMP1_DEVICE, ZTMP2_DEVICE )
   !$acc kernels
-#ifdef MNH_COMPILER_NVHPC  
-  !$acc loop independent collapse(3) 
-#endif
+  !$acc_nv loop independent collapse(3) 
   DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
      ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK)
   ENDDO
@@ -1386,9 +1318,7 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
     !
     CALL GZ_M_M_DEVICE(PRM(:,:,:,1),PDZZ,ZTMP1_DEVICE)
     !$acc kernels
-#ifdef MNH_COMPILER_NVHPC    
-    !$acc loop independent collapse(3) 
-#endif
+    !$acc_nv loop independent collapse(3) 
     DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
        ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK)*ZFLX(JI,JJ,JK)
     ENDDO
@@ -1398,9 +1328,7 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
     CALL GZ_M_W_DEVICE(1,IKU,1,PRM(:,:,:,1),PDZZ,ZTMP1_DEVICE)
     CALL MZF_DEVICE( ZTMP1_DEVICE, ZTMP2_DEVICE )
     !$acc kernels
-#ifdef MNH_COMPILER_NVHPC    
-    !$acc loop independent collapse(3) 
-#endif
+    !$acc_nv loop independent collapse(3) 
     DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
        ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK)
     ENDDO
@@ -1415,9 +1343,7 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
     !
     CALL GZ_M_M_DEVICE(PSVM(:,:,:,JSV),PDZZ,ZTMP1_DEVICE)
     !$acc kernels
-#ifdef MNH_COMPILER_NVHPC    
-    !$acc loop independent collapse(3)
-#endif    
+    !$acc_nv loop independent collapse(3)
     DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
        ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK)*ZFLX(JI,JJ,JK)
     ENDDO
@@ -1426,11 +1352,9 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
                            X_LES_RES_ddxa_Sv_SBG_UaW(:,:,:,JSV) , .TRUE.)
     !
     CALL GZ_M_W_DEVICE(1,IKU,1,PSVM(:,:,:,JSV),PDZZ,ZTMP1_DEVICE)
-    CALL MZF_DEVICE( ZTMP1_DEVICE, ZTMP2_DEVICE )
+    CALL MZF_DEVICE(ZTMP1_DEVICE,ZTMP2_DEVICE)
     !$acc kernels
-#ifdef MNH_COMPILER_NVHPC    
-    !$acc loop independent collapse(3)
-#endif    
+    !$acc_nv loop independent collapse(3)
     DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
        ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK)
     ENDDO
-- 
GitLab