From b85eafc116825b219a09a08b339547cbab0ecbf0 Mon Sep 17 00:00:00 2001
From: ESCOBAR Juan <escj@nuwa>
Date: Thu, 3 Mar 2022 16:22:03 +0100
Subject: [PATCH] Juan 03/03/2022:ZSOLVER/turb_hor_dyn_corr.f90, nvhpc22.2
 bug/optimisation -> replace WHERE+ARRAY SYNTAX -> DO CONCURRENT

---
 src/ZSOLVER/turb_hor_dyn_corr.f90 | 265 ++++++++++++++++++++++--------
 1 file changed, 201 insertions(+), 64 deletions(-)

diff --git a/src/ZSOLVER/turb_hor_dyn_corr.f90 b/src/ZSOLVER/turb_hor_dyn_corr.f90
index 2e0fa87bc..e255ed69a 100644
--- a/src/ZSOLVER/turb_hor_dyn_corr.f90
+++ b/src/ZSOLVER/turb_hor_dyn_corr.f90
@@ -461,7 +461,13 @@ ELSE
 END IF
 !
 !$acc kernels async(2)
-ZFLX(:,:,IKE+1) = ZFLX(:,:,IKE) 
+#ifdef MNH_COMPILER_NVHPC
+!$acc loop independent collapse(2) 
+#endif
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZFLX(JI,JJ,IKE+1) = ZFLX(JI,JJ,IKE) 
+ENDDO
+
 !$acc end kernels
 !
 !* prescription of du/dz and dv/dz with uncentered gradient at the surface
@@ -475,13 +481,19 @@ ZDZZ(:,:,:) = MXM(PDZZ(:,:,IKB:IKB+2))
 CALL MXM_DEVICE(PDZZ(:,:,IKB:IKB+2),ZDZZ(:,:,:))
 #endif
 !$acc kernels async(3)
-ZCOEFF(:,:,IKB+2)= - ZDZZ(:,:,2) /      &
-       ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,3) )
-ZCOEFF(:,:,IKB+1)=   (ZDZZ(:,:,3)+ZDZZ(:,:,2)) /      &
-       ( ZDZZ(:,:,2) * ZDZZ(:,:,3) )
-ZCOEFF(:,:,IKB)= - (ZDZZ(:,:,3)+2.*ZDZZ(:,:,2)) /      &
-       ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,2) )
+#ifdef MNH_COMPILER_NVHPC
+!$acc loop independent collapse(2) 
+#endif
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZCOEFF(JI,JJ,IKB+2)= - ZDZZ(JI,JJ,2) /      &
+        ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,3) )
+   ZCOEFF(JI,JJ,IKB+1)=   (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) /      &
+        ( ZDZZ(JI,JJ,2) * ZDZZ(JI,JJ,3) )
+   ZCOEFF(JI,JJ,IKB)= - (ZDZZ(JI,JJ,3)+2.*ZDZZ(JI,JJ,2)) /      &
+        ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,2) )
+ENDDO
 !$acc end kernels
+
 !
 #ifndef MNH_OPENACC
 ZDU_DZ_DZS_DX(:,:,:)=MXF ((ZCOEFF(:,:,IKB+2:IKB+2)*PUM(:,:,IKB+2:IKB+2)       &
@@ -493,10 +505,15 @@ ZDU_DZ_DZS_DX(:,:,:)=MXF ((ZCOEFF(:,:,IKB+2:IKB+2)*PUM(:,:,IKB+2:IKB+2)       &
 ZDZZ(:,:,:) = MYM(PDZZ(:,:,IKB:IKB+2))
 #else
 !$acc kernels async(3)
-ZTMP1_DEVICE(:,:,1) = (ZCOEFF(:,:,IKB+2)*PUM(:,:,IKB+2)       &
-                          +ZCOEFF(:,:,IKB+1)*PUM(:,:,IKB+1)       &
-                          +ZCOEFF(:,:,IKB )*PUM(:,:,IKB)       &
-                          )* 0.5 * ( PDZX(:,:,IKB+1)+PDZX(:,:,IKB))
+#ifdef MNH_COMPILER_NVHPC
+!$acc loop independent collapse(2) 
+#endif
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZTMP1_DEVICE(JI,JJ,1) = (ZCOEFF(JI,JJ,IKB+2)*PUM(JI,JJ,IKB+2)       &
+                          +ZCOEFF(JI,JJ,IKB+1)*PUM(JI,JJ,IKB+1)       &
+                          +ZCOEFF(JI,JJ,IKB)*PUM(JI,JJ,IKB)       &
+                          )* 0.5 * ( PDZX(JI,JJ,IKB+1)+PDZX(JI,JJ,IKB))
+ENDDO
 !$acc end kernels
 !
 !!! wait for the computation of ZCOEFF and ZTMP1_DEVICE
@@ -505,20 +522,32 @@ ZTMP1_DEVICE(:,:,1) = (ZCOEFF(:,:,IKB+2)*PUM(:,:,IKB+2)       &
 CALL MXF_DEVICE(ZTMP1_DEVICE(:,:,1:1), ZTMP2_DEVICE(:,:,1:1))
 CALL MXF_DEVICE(PDXX(:,:,IKB:IKB), ZTMP1_DEVICE(:,:,1:1))
 !$acc kernels async(3)
-ZDU_DZ_DZS_DX(:,:,1) = ZTMP2_DEVICE(:,:,1) / ZTMP1_DEVICE(:,:,1)
+#ifdef MNH_COMPILER_NVHPC
+!$acc loop independent collapse(2) 
+#endif
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZDU_DZ_DZS_DX(JI,JJ,1) = ZTMP2_DEVICE(JI,JJ,1) / ZTMP1_DEVICE(JI,JJ,1)
+ENDDO
 !$acc end kernels
 !
 CALL MYM_DEVICE(PDZZ(:,:,IKB:IKB+2),ZDZZ(:,:,:))
 #endif
 !$acc kernels async(4)
-ZCOEFF(:,:,IKB+2)= - ZDZZ(:,:,2) /      &
-       ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,3) )
-ZCOEFF(:,:,IKB+1)=   (ZDZZ(:,:,3)+ZDZZ(:,:,2)) /      &
-       ( ZDZZ(:,:,2) * ZDZZ(:,:,3) )
-ZCOEFF(:,:,IKB)= - (ZDZZ(:,:,3)+2.*ZDZZ(:,:,2)) /      &
-       ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,2) )
+#ifdef MNH_COMPILER_NVHPC
+!$acc loop independent collapse(2) 
+#endif
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZCOEFF(JI,JJ,IKB+2)= - ZDZZ(JI,JJ,2) /      &
+        ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,3) )
+   ZCOEFF(JI,JJ,IKB+1)=   (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) /      &
+        ( ZDZZ(JI,JJ,2) * ZDZZ(JI,JJ,3) )
+   ZCOEFF(JI,JJ,IKB)= - (ZDZZ(JI,JJ,3)+2.*ZDZZ(JI,JJ,2)) /      &
+        ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,2) )
+ENDDO
+
 !$acc end kernels
 !
+
 #ifndef MNH_OPENACC
 ZDV_DZ_DZS_DY(:,:,:)=MYF ((ZCOEFF(:,:,IKB+2:IKB+2)*PVM(:,:,IKB+2:IKB+2)       &
                           +ZCOEFF(:,:,IKB+1:IKB+1)*PVM(:,:,IKB+1:IKB+1)       &
@@ -527,10 +556,15 @@ ZDV_DZ_DZS_DY(:,:,:)=MYF ((ZCOEFF(:,:,IKB+2:IKB+2)*PVM(:,:,IKB+2:IKB+2)       &
                          )/ MYF(PDYY(:,:,IKB:IKB))
 #else
 !$acc kernels async(4)
-ZTMP3_DEVICE(:,:,1) = (ZCOEFF(:,:,IKB+2)*PVM(:,:,IKB+2)       &
-                          +ZCOEFF(:,:,IKB+1)*PVM(:,:,IKB+1)       &
-                          +ZCOEFF(:,:,IKB)*PVM(:,:,IKB)       &
-                          )* 0.5 * ( PDZY(:,:,IKB+1)+PDZY(:,:,IKB))
+#ifdef MNH_COMPILER_NVHPC
+!$acc loop independent collapse(2) 
+#endif
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZTMP3_DEVICE(JI,JJ,1) = (ZCOEFF(JI,JJ,IKB+2)*PVM(JI,JJ,IKB+2)       &
+                          +ZCOEFF(JI,JJ,IKB+1)*PVM(JI,JJ,IKB+1)       &
+                          +ZCOEFF(JI,JJ,IKB)*PVM(JI,JJ,IKB)       &
+                          )* 0.5 * ( PDZY(JI,JJ,IKB+1)+PDZY(JI,JJ,IKB))
+ENDDO
 !$acc end kernels
 !
 !!! wait for the computation of ZCOEFF and ZTMP3_DEVICE
@@ -557,7 +591,12 @@ ZDV_DZ_DZS_DY(:,:,1)= ZTMP4_DEVICE(:,:,1) / ZTMP3_DEVICE(:,:,1)
 CALL DXF_DEVICE(PUM(:,:,IKB:IKB),ZTMP1_DEVICE(:,:,1:1))
 CALL MXF_DEVICE(PDXX(:,:,IKB:IKB),ZTMP2_DEVICE(:,:,1:1))
 !$acc kernels async(3)
-ZDU_DX(:,:,1)=  ZTMP1_DEVICE(:,:,1) / ZTMP2_DEVICE(:,:,1) - ZDU_DZ_DZS_DX(:,:,1)
+#ifdef MNH_COMPILER_NVHPC
+!$acc loop independent collapse(2) 
+#endif
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZDU_DX(JI,JJ,1)=  ZTMP1_DEVICE(JI,JJ,1) / ZTMP2_DEVICE(JI,JJ,1) - ZDU_DZ_DZS_DX(JI,JJ,1)
+ENDDO
 !$acc end kernels
 
 !!! wait for the computation of ZDV_DZ_DZS_DY
@@ -565,8 +604,13 @@ ZDU_DX(:,:,1)=  ZTMP1_DEVICE(:,:,1) / ZTMP2_DEVICE(:,:,1) - ZDU_DZ_DZS_DX(:,:,1)
 !
 CALL DYF_DEVICE(PVM(:,:,IKB:IKB),ZTMP3_DEVICE(:,:,1:1))
 CALL MYF_DEVICE(PDYY(:,:,IKB:IKB),ZTMP4_DEVICE(:,:,1:1))
-!$acc kernels! async(4)
-ZDV_DY(:,:,1)=  ZTMP3_DEVICE(:,:,1) / ZTMP4_DEVICE(:,:,1) - ZDV_DZ_DZS_DY(:,:,1)
+!$acc kernels async(4)
+#ifdef MNH_COMPILER_NVHPC
+!$acc loop independent collapse(2) 
+#endif
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZDV_DY(JI,JJ,1)=  ZTMP3_DEVICE(JI,JJ,1) / ZTMP4_DEVICE(JI,JJ,1) - ZDV_DZ_DZS_DY(JI,JJ,1)
+ENDDO
 !$acc end kernels
 !
 !
@@ -575,7 +619,12 @@ ZDV_DY(:,:,1)=  ZTMP3_DEVICE(:,:,1) / ZTMP4_DEVICE(:,:,1) - ZDV_DZ_DZS_DY(:,:,1)
 #endif
 !
 !$acc kernels async(4)
-ZDW_DZ(:,:,:)=-ZDU_DX(:,:,:)-ZDV_DY(:,:,:)
+#ifdef MNH_COMPILER_NVHPC
+!$acc loop independent collapse(3) 
+#endif
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+   ZDW_DZ(JI,JJ,JK)=-ZDU_DX(JI,JJ,JK)-ZDV_DY(JI,JJ,JK)
+ENDDO
 !$acc end kernels
 !
 !* computation 
@@ -591,11 +640,15 @@ ZDW_DZ(:,:,:)=-ZDU_DX(:,:,:)-ZDV_DY(:,:,:)
 ! du coup je ne peux pas faire de update self asynchrone...
 !
 !$acc kernels async(3)
-ZFLX(:,:,IKB)   = (2./3.) * PTKEM(:,:,IKB)                           &
-  - XCMFS * PK(:,:,IKB) * 2. * ZDU_DX(:,:,1)
+#ifdef MNH_COMPILER_NVHPC
+!$acc loop independent collapse(2) 
+#endif
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZFLX(JI,JJ,IKB)   = (2./3.) * PTKEM(JI,JJ,IKB)                           &
+        - XCMFS * PK(JI,JJ,IKB) * 2. * ZDU_DX(JI,JJ,1)
+ENDDO
 !$acc end kernels
 
-
 !!  &  to be tested later
 !! + XCMFB * PLM(:,:,IKB:IKB) /SQRT(PTKEM(:,:,IKB:IKB)) *        &
 !!   (-2./3.) * PTP(:,:,IKB:IKB)
@@ -617,8 +670,9 @@ ZFLX(:,:,IKB-1) =                                                            &
       PVSLOPEM(:,:) * PCOSSLOPE(:,:)    * PSINSLOPE(:,:) * ZDIRSINZW(:,:)    &
     - PUSLOPEM(:,:) * PCOSSLOPE(:,:)**2 * ZDIRSINZW(:,:) * PDIRCOSZW(:,:)    )
 #else
+!PW: BUG: 'acc loop independent collapse(2)' was once commented to workaround compiler bug (NVHPC 21.1); it is active below
 #ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2)   
+!$acc loop independent collapse(2)
 #endif
 DO CONCURRENT ( JI=1:JIU,JJ=1:JJU )
 ZFLX(JI,JJ,IKB-1) =                                                             &
@@ -637,7 +691,12 @@ END DO ! CONCURRENT
 !$acc wait(3) async(4)
 !
 !$acc kernels async(4)
-ZFLX(:,:,IKB-1) = 2. * ZFLX(:,:,IKB-1) -  ZFLX(:,:,IKB)
+#ifdef MNH_COMPILER_NVHPC
+!$acc loop independent collapse(2) 
+#endif
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZFLX(JI,JJ,IKB-1) = 2. * ZFLX(JI,JJ,IKB-1) -  ZFLX(JI,JJ,IKB)
+ENDDO
 !$acc end kernels
 !
 !
@@ -760,9 +819,14 @@ IF (KSPLT==1) THEN
   !
   ! evaluate the dynamic production at w(IKB+1) in PDP(IKB)
   !
-  !$acc kernels async(2)
-  ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDU_DX(:,:,1) + ZWORK(:,:,IKB+1) )
-  !$acc end kernels
+   !$acc kernels async(2)
+#ifdef MNH_COMPILER_NVHPC   
+   !$acc loop independent collapse(2) 
+#endif
+   DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+      ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDU_DX(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) )
+   ENDDO
+   !$acc end kernels
   !
   !$acc kernels async(2)
   PDP(:,:,:) = PDP(:,:,:) + ZWORK(:,:,:)
@@ -783,7 +847,7 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
   !$acc wait(2)
   !
   !$acc kernels
-  ZTMP1_DEVICE = -ZWORK
+  ZTMP1_DEVICE(:,:,:) = -ZWORK(:,:,:)
   !$acc end kernels
   CALL LES_MEAN_SUBGRID( ZTMP1_DEVICE, X_LES_RES_ddxa_U_SBG_UaU , .TRUE.)
   !
@@ -817,11 +881,16 @@ IF (.NOT. L2D) THEN
   !!  + XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
   !
 ELSE
-  !$acc kernels async(3)
-  ZFLX(:,:,:)= (2./3.) * PTKEM                                  &
-    - XCMFS * PK *(-(2./3.) * ( GX_U_M_PUM                      &
-                               +GZ_W_M_PWM                ) )  
-  !$acc end kernels
+   !$acc kernels async(3)
+#ifdef MNH_COMPILER_NVHPC
+   !$acc loop independent collapse(3)
+#endif   
+   DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+      ZFLX(JI,JJ,JK)= (2./3.) * PTKEM(JI,JJ,JK)                           &
+           - XCMFS * PK(JI,JJ,JK) *(-(2./3.) * ( GX_U_M_PUM(JI,JJ,JK)        &
+                                      +GZ_W_M_PWM(JI,JJ,JK)     ) )  
+   ENDDO
+   !$acc end kernels
   !! &  to be tested
   !!  + XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
   !
@@ -836,8 +905,13 @@ ZFLX(:,:,IKE+1) = ZFLX(:,:,IKE)
 ! ! !$acc update self(ZFLX(:,:,IKB+1:)) async(10)
 !
 !$acc kernels async(3)
-ZFLX(:,:,IKB)   = (2./3.) * PTKEM(:,:,IKB)                           &
-  - XCMFS * PK(:,:,IKB) * 2. * ZDV_DY(:,:,1)
+#ifdef MNH_COMPILER_NVHPC
+!$acc loop independent collapse(2) 
+#endif
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZFLX(JI,JJ,IKB)   = (2./3.) * PTKEM(JI,JJ,IKB)                           &
+        - XCMFS * PK(JI,JJ,IKB) * 2. * ZDV_DY(JI,JJ,1)
+ENDDO
 !$acc end kernels
 
 !!           & to be tested
@@ -856,8 +930,9 @@ ZFLX(:,:,IKB-1) =                                                            &
       PUSLOPEM(:,:) * PSINSLOPE(:,:)**2 * ZDIRSINZW(:,:) * PDIRCOSZW(:,:)    &
     + PVSLOPEM(:,:) * PCOSSLOPE(:,:)    * PSINSLOPE(:,:) * ZDIRSINZW(:,:)    )
 #else
+!PW: BUG: 'acc loop independent collapse(2)' was once commented to workaround compiler bug (NVHPC 21.1); it is active below
 #ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2)   
+!$acc loop independent collapse(2)
 #endif
 DO CONCURRENT ( JI=1:JIU,JJ=1:JJU )
 ZFLX(JI,JJ,IKB-1) =                                                             &
@@ -1006,9 +1081,14 @@ IF (KSPLT==1) THEN
   !
   ! evaluate the dynamic production at w(IKB+1) in PDP(IKB)
   !
-  !$acc kernels async(2)
-  ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDV_DY(:,:,1) + ZWORK(:,:,IKB+1) )
-  !$acc end kernels
+   !$acc kernels async(2)
+#ifdef MNH_COMPILER_NVHPC   
+   !$acc loop independent collapse(2)
+#endif   
+   DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+      ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDV_DY(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) )
+   ENDDO
+   !$acc end kernels
   !
   !$acc kernels async(2)
   PDP(:,:,:) = PDP(:,:,:) + ZWORK(:,:,:)
@@ -1029,7 +1109,12 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
   !$acc wait(2)
   !
   !$acc kernels
-  ZTMP1_DEVICE = -ZWORK
+#ifdef MNH_COMPILER_NVHPC  
+  !$acc loop independent collapse(3) 
+#endif
+  DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+     ZTMP1_DEVICE(JI,JJ,JK) = -ZWORK(JI,JJ,JK)
+  ENDDO
   !$acc end kernels
   CALL LES_MEAN_SUBGRID( ZTMP1_DEVICE, X_LES_RES_ddxa_V_SBG_UaV , .TRUE.)
   !
@@ -1058,11 +1143,16 @@ IF (.NOT. L2D) THEN
   !!  &  to be tested
   !!    -2.* XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
 ELSE
-  !$acc kernels async(2)
-  ZFLX(:,:,:)= (2./3.) * PTKEM                                  &
-    - XCMFS * PK *( (4./3.) * GZ_W_M_PWM                        &
-                   -(2./3.) * ( GX_U_M_PUM                ) ) 
-  !$acc end kernels
+   !$acc kernels async(2)
+#ifdef MNH_COMPILER_NVHPC   
+   !$acc loop independent collapse(3) 
+#endif
+   DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+      ZFLX(JI,JJ,JK)= (2./3.) * PTKEM(JI,JJ,JK)                           &
+           - XCMFS * PK(JI,JJ,JK) *( (4./3.) * GZ_W_M_PWM(JI,JJ,JK)          &
+           -(2./3.) * ( GX_U_M_PUM(JI,JJ,JK)           ) ) 
+   ENDDO
+   !$acc end kernels
   !!  &  to be tested
   !!    -2.* XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
 END IF
@@ -1076,8 +1166,14 @@ ZFLX(:,:,IKE+1)= ZFLX(:,:,IKE)
 !
 !
 !$acc kernels async(2)
-ZFLX(:,:,IKB)   = (2./3.) * PTKEM(:,:,IKB)                           &
-  - XCMFS * PK(:,:,IKB) * 2. * ZDW_DZ(:,:,1)
+#ifdef MNH_COMPILER_NVHPC
+!$acc loop independent collapse(2) 
+#endif
+DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+   ZFLX(JI,JJ,IKB)   = (2./3.) * PTKEM(JI,JJ,IKB)                           &
+        - XCMFS * PK(JI,JJ,IKB) * 2. * ZDW_DZ(JI,JJ,1)
+ENDDO
+
 !$acc end kernels
 !
 
@@ -1092,6 +1188,7 @@ ZFLX(:,:,IKB-1) = &
   +     PTAU33M(:,:) * PDIRCOSZW(:,:)**2                                &
   +2. * PCDUEFF(:,:)* PUSLOPEM(:,:)  * ZDIRSINZW(:,:) * PDIRCOSZW(:,:)
 #else
+!PW: BUG: 'acc loop independent collapse(2)' was once commented to workaround compiler bug (NVHPC 21.1); it is active below
 #ifdef MNH_COMPILER_NVHPC
 !$acc loop independent collapse(2)
 #endif
@@ -1190,9 +1287,14 @@ IF (KSPLT==1) THEN
   !
   ! evaluate the dynamic production at w(IKB+1) in PDP(IKB)
   !
-  !$acc kernels async(2)
-  ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDW_DZ(:,:,1) + ZWORK(:,:,IKB+1) )
-  !$acc end kernels
+   !$acc kernels async(2)
+#ifdef MNH_COMPILER_NVHPC   
+   !$acc loop independent collapse(2)
+#endif   
+   DO CONCURRENT (JI=1:JIU,JJ=1:JJU)
+      ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDW_DZ(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) )
+   ENDDO
+   !$acc end kernels
   !
   !$acc kernels async(2)
   PDP(:,:,:) = PDP(:,:,:) + ZWORK(:,:,:)
@@ -1231,20 +1333,35 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
   !$acc wait(2)
   !
   !$acc kernels
-  ZTMP1_DEVICE = -ZWORK
+#ifdef MNH_COMPILER_NVHPC  
+  !$acc loop independent collapse(3) 
+#endif
+  DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+     ZTMP1_DEVICE(JI,JJ,JK) = -ZWORK(JI,JJ,JK)
+  ENDDO
   !$acc end kernels
   CALL LES_MEAN_SUBGRID( ZTMP1_DEVICE, X_LES_RES_ddxa_W_SBG_UaW , .TRUE.)
   !
   CALL GZ_M_M_DEVICE(PTHLM,PDZZ,ZTMP1_DEVICE)
   !$acc kernels
-  ZTMP2_DEVICE = ZTMP1_DEVICE * ZFLX
+#ifdef MNH_COMPILER_NVHPC  
+  !$acc loop independent collapse(3) 
+#endif
+  DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+     ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK) * ZFLX(JI,JJ,JK)
+  ENDDO
   !$acc end kernels
   CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, X_LES_RES_ddxa_Thl_SBG_UaW , .TRUE.)
   !
   CALL GZ_M_W_DEVICE(1,IKU,1,PTHLM,PDZZ,ZTMP1_DEVICE)
   CALL MZF_DEVICE( ZTMP1_DEVICE, ZTMP2_DEVICE )
   !$acc kernels
-  ZTMP3_DEVICE = ZFLX*ZTMP2_DEVICE
+#ifdef MNH_COMPILER_NVHPC  
+  !$acc loop independent collapse(3) 
+#endif
+  DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+     ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK)
+  ENDDO
   !$acc end kernels
   CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE,X_LES_RES_ddz_Thl_SBG_W2)
   !
@@ -1255,14 +1372,24 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
     !
     CALL GZ_M_M_DEVICE(PRM(:,:,:,1),PDZZ,ZTMP1_DEVICE)
     !$acc kernels
-    ZTMP2_DEVICE = ZTMP1_DEVICE*ZFLX
+#ifdef MNH_COMPILER_NVHPC    
+    !$acc loop independent collapse(3) 
+#endif
+    DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+       ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK)*ZFLX(JI,JJ,JK)
+    ENDDO
     !$acc end kernels
     CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, X_LES_RES_ddxa_Rt_SBG_UaW , .TRUE.)
     !
     CALL GZ_M_W_DEVICE(1,IKU,1,PRM(:,:,:,1),PDZZ,ZTMP1_DEVICE)
     CALL MZF_DEVICE( ZTMP1_DEVICE, ZTMP2_DEVICE )
     !$acc kernels
-    ZTMP3_DEVICE = ZFLX*ZTMP2_DEVICE
+#ifdef MNH_COMPILER_NVHPC    
+    !$acc loop independent collapse(3) 
+#endif
+    DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+       ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK)
+    ENDDO
     !$acc end kernels
     CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE, X_LES_RES_ddz_Rt_SBG_W2)
     !
@@ -1274,7 +1401,12 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
     !
     CALL GZ_M_M_DEVICE(PSVM(:,:,:,JSV),PDZZ,ZTMP1_DEVICE)
     !$acc kernels
-    ZTMP2_DEVICE = ZTMP1_DEVICE*ZFLX
+#ifdef MNH_COMPILER_NVHPC    
+    !$acc loop independent collapse(3)
+#endif    
+    DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+       ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK)*ZFLX(JI,JJ,JK)
+    ENDDO
     !$acc end kernels
     CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, &
                            X_LES_RES_ddxa_Sv_SBG_UaW(:,:,:,JSV) , .TRUE.)
@@ -1282,7 +1414,12 @@ IF (LLES_CALL .AND. KSPLT==1) THEN
     CALL GZ_M_W_DEVICE(1,IKU,1,PSVM(:,:,:,JSV),PDZZ,ZTMP1_DEVICE)
     CALL MZF_DEVICE( ZTMP1_DEVICE, ZTMP2_DEVICE )
     !$acc kernels
-    ZTMP3_DEVICE = ZFLX*ZTMP2_DEVICE
+#ifdef MNH_COMPILER_NVHPC    
+    !$acc loop independent collapse(3)
+#endif    
+    DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+       ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK)
+    ENDDO
     !$acc end kernels
     CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE, X_LES_RES_ddz_Sv_SBG_W2(:,:,:,JSV))
     !
-- 
GitLab