From a5af36d5bf5fd89a0948fe87eb326317316e0f47 Mon Sep 17 00:00:00 2001
From: Juan ESCOBAR <juan.escobar@aero.obs-mip.fr>
Date: Fri, 16 Dec 2022 14:18:41 +0100
Subject: [PATCH] Juan 16/12/2022: ZSOLVER/*, GPU loop optimization in the
 right order: DO CONCURRENT -> !$mnh_do_concurrent

---
 src/ZSOLVER/advection_metsv.f90   | 97 +++++++++++++++++++------------
 src/ZSOLVER/advection_uvw.f90     | 14 ++---
 src/ZSOLVER/advection_uvw_cen.f90 |  3 +-
 src/ZSOLVER/contrav.f90           | 40 +++++--------
 src/ZSOLVER/dotprod.f90           | 20 +++----
 src/ZSOLVER/get_halo.f90          | 42 ++++++++++---
 src/ZSOLVER/ppm.f90               |  9 +--
 src/ZSOLVER/turb.f90              | 93 ++++++++++++++---------------
 8 files changed, 170 insertions(+), 148 deletions(-)

diff --git a/src/ZSOLVER/advection_metsv.f90 b/src/ZSOLVER/advection_metsv.f90
index 69e179229..410aece0a 100644
--- a/src/ZSOLVER/advection_metsv.f90
+++ b/src/ZSOLVER/advection_metsv.f90
@@ -191,9 +191,9 @@ USE MODI_ADV_BOUNDARIES
 #if defined(MNH_BITREP) || defined(MNH_BITREP_OMP)
 USE MODI_BITREP
 #endif
-#ifdef MNH_COMPILER_CCE
-!$mnh_undef(LOOP)
-!$mnh_undef(OPENACC)
+#if defined(MNH_COMPILER_CCE) && defined(MNH_BITREP_OMP)
+! mnh_undef(LOOP)
+! mnh_undef(OPENACC)
 #endif
 
 USE MODI_CONTRAV
@@ -539,29 +539,40 @@ IF (.NOT. L1D) THEN
   !$acc end kernels
   IF (LIBM) THEN
     !$acc kernels
-!$mnh_expand_array(JI=IIB:IIE,JJ=IJB:IJE,JK=1:JKU)
 #if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP)
+!$mnh_expand_array(JI=IIB:IIE,JJ=IJB:IJE,JK=1:JKU)
     ZCFLU(IIB:IIE,IJB:IJE,:) = ZCFLU(IIB:IIE,IJB:IJE,:)*(1.-exp(-(XIBM_LS(IIB:IIE,IJB:IJE,:,2)/&
                                                         (XRHODJ(IIB:IIE,IJB:IJE,:)/XRHODREF(IIB:IIE,IJB:IJE,:))**(1./3.))**2.))
     ZCFLV(IIB:IIE,IJB:IJE,:) = ZCFLV(IIB:IIE,IJB:IJE,:)*(1.-exp(-(XIBM_LS(IIB:IIE,IJB:IJE,:,3)/&
                                                         (XRHODJ(IIB:IIE,IJB:IJE,:)/XRHODREF(IIB:IIE,IJB:IJE,:))**(1./3.))**2.))
     ZCFLW(IIB:IIE,IJB:IJE,:) = ZCFLW(IIB:IIE,IJB:IJE,:)*(1.-exp(-(XIBM_LS(IIB:IIE,IJB:IJE,:,4)/&
                                                         (XRHODJ(IIB:IIE,IJB:IJE,:)/XRHODREF(IIB:IIE,IJB:IJE,:))**(1./3.))**2.))
+!$mnh_end_expand_array()
 #else
+#if defined(MNH_COMPILER_CCE) && defined(MNH_BITREP_OMP)
+DO CONCURRENT (JK=1:JKU,JJ=IJB:IJE,JI=IIB:IIE)
+#else
+!$mnh_expand_array(JI=IIB:IIE,JJ=IJB:IJE,JK=1:JKU)
+#endif    
     ZCFLU(IIB:IIE,IJB:IJE,:) = ZCFLU(IIB:IIE,IJB:IJE,:)*(1.-Br_exp(-Br_pow(XIBM_LS(IIB:IIE,IJB:IJE,:,2)/&
                                                         Br_pow(XRHODJ(IIB:IIE,IJB:IJE,:)/XRHODREF(IIB:IIE,IJB:IJE,:),1./3.),2.)))
     ZCFLV(IIB:IIE,IJB:IJE,:) = ZCFLV(IIB:IIE,IJB:IJE,:)*(1.-Br_exp(-Br_pow(XIBM_LS(IIB:IIE,IJB:IJE,:,3)/&
                                                         Br_pow(XRHODJ(IIB:IIE,IJB:IJE,:)/XRHODREF(IIB:IIE,IJB:IJE,:),1./3.),2.)))
     ZCFLW(IIB:IIE,IJB:IJE,:) = ZCFLW(IIB:IIE,IJB:IJE,:)*(1.-Br_exp(-Br_pow(XIBM_LS(IIB:IIE,IJB:IJE,:,4)/&
                                                         Br_pow(XRHODJ(IIB:IIE,IJB:IJE,:)/XRHODREF(IIB:IIE,IJB:IJE,:),1./3.),2.)))
-#endif
+#if defined(MNH_COMPILER_CCE) && defined(MNH_BITREP_OMP)
+END DO ! CONCURRENT 
+#else
 !$mnh_end_expand_array()
+#endif
+#endif
     WHERE (XIBM_LS(IIB:IIE,IJB:IJE,:,2).GT.(-ZIBM_EPSI)) ZCFLU(IIB:IIE,IJB:IJE,:)=0.
     WHERE (XIBM_LS(IIB:IIE,IJB:IJE,:,3).GT.(-ZIBM_EPSI)) ZCFLV(IIB:IIE,IJB:IJE,:)=0.
     WHERE (XIBM_LS(IIB:IIE,IJB:IJE,:,4).GT.(-ZIBM_EPSI)) ZCFLW(IIB:IIE,IJB:IJE,:)=0.
     !$acc end kernels
   ENDIF
-#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP)
+! Previous guard, kept for reference: if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP)
+#if !defined(MNH_BITREP) 
   IF (.NOT. L2D) THEN
      !$acc kernels present_cr(ZCFL) 
      ZCFL(:,:,:)  = SQRT(ZCFLU(:,:,:)**2+ZCFLV(:,:,:)**2+ZCFLW(:,:,:)**2)
@@ -574,17 +585,15 @@ IF (.NOT. L1D) THEN
 #else
   IF (.NOT. L2D) THEN
      !$acc kernels
-     !$acc_nv loop independent collapse(3)
-     DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU )
+     !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
         ZCFL(JI,JJ,JK)  = SQRT(BR_P2(ZCFLU(JI,JJ,JK))+BR_P2(ZCFLV(JI,JJ,JK))+BR_P2(ZCFLW(JI,JJ,JK)))
-     END DO
+     !$mnh_end_do()
      !$acc end kernels
   ELSE
      !$acc kernels
-     !$acc_nv loop independent collapse(3)
-     DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU )
+     !$mnh_do_concurrent( JI=1:JIU,JJ=1:JJU,JK=1:JKU )
         ZCFL(JI,JJ,JK)  = SQRT(BR_P2(ZCFLU(JI,JJ,JK))+BR_P2(ZCFLW(JI,JJ,JK)))
-     END DO
+     !$mnh_end_do()
      !$acc end kernels
   END IF
 #endif 
@@ -592,13 +601,13 @@ ELSE
    !$acc kernels
    ZCFLU(:,:,:) = 0.0 ; ZCFLV(:,:,:) = 0.0 ;  ZCFLW(:,:,:) = 0.0
    ZCFLW(IIB:IIE,IJB:IJE,:) = ABS(ZRWCPPM(IIB:IIE,IJB:IJE,:) * PTSTEP)
-#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP)
+! Previous guard, kept for reference: if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP)
+#if !defined(MNH_BITREP)
    ZCFL(:,:,:) = SQRT(ZCFLW(:,:,:)**2)
-#else
-   !$acc_nv loop independent collapse(3)
-   DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU )
+#else  
+   !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU )
       ZCFL(JI,JJ,JK) = SQRT(BR_P2(ZCFLW(JI,JJ,JK)))
-   END DO
+   !$mnh_end_do()
 #endif
    !$acc end kernels
 END IF
@@ -678,7 +687,7 @@ ZCFL_MAX  = MAXVAL(ZCFL (IIB:IIE,IJB:IJE,IKB:IKE))
 #else
 ZCFLU_MAX = 0.0 ; ZCFLV_MAX = 0.0 ; ZCFLW_MAX = 0.0 ; ZCFL_MAX = 0.0
 !$acc parallel reduction(max:ZCFLU_MAX,ZCFLV_MAX,ZCFLW_MAX,ZCFL_MAX)
-!$mnh_do_concurrent (JI=IIB:IIE,JJ=IJB:IJE,JK=IKB:IKE)
+!$mnh_do_concurrent(JI=IIB:IIE,JJ=IJB:IJE,JK=IKB:IKE)
  ZCFLU_MAX = MAX(ZCFLU_MAX,ZCFLU(JI,JJ,JK))
  ZCFLV_MAX = MAX(ZCFLV_MAX,ZCFLV(JI,JJ,JK))
  ZCFLW_MAX = MAX(ZCFLW_MAX,ZCFLW(JI,JJ,JK))
@@ -768,7 +777,9 @@ ZRWCPPM(:,:,:) = ZRWCPPM(:,:,:)*ZTSTEP_PPM
 !dir$ concurrent
 ZRTHS_OTHER(:,:,:) = PRTHS(:,:,:) - PTHT(:,:,:) * PRHODJ(:,:,:) / PTSTEP
 !dir$ concurrent
-IF (GTKE) ZRTKES_OTHER(:,:,:) = PRTKES(:,:,:) - PTKET(:,:,:) * PRHODJ(:,:,:) / PTSTEP
+IF (GTKE) THEN
+   ZRTKES_OTHER(:,:,:) = PRTKES(:,:,:) - PTKET(:,:,:) * PRHODJ(:,:,:) / PTSTEP
+END IF
 DO JR = 1, KRR
  !dir$ concurrent
  ZRRS_OTHER(:,:,:,JR) = PRRS(:,:,:,JR) - PRT(:,:,:,JR) * PRHODJ(:,:,:) / PTSTEP
@@ -864,10 +875,18 @@ CALL PPM_RHODJ(HLBCX,HLBCY, ZRUCPPM, ZRVCPPM, ZRWCPPM,              &
 !$acc kernels
 !dir$ concurrent
 ZTH(:,:,:)  = PTHT(:,:,:)
-!dir$ concurrent
-IF (KRR /=0 ) ZR(:,:,:,:)  = PRT(:,:,:,:)
-!dir$ concurrent
-IF (KSV /=0 ) ZSV(:,:,:,:) = PSVT(:,:,:,:)
+!dir concurrent
+IF (KRR /=0 ) THEN
+ !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU,JR=1:KRR )
+   ZR(:,:,:,:)  = PRT(:,:,:,:)
+ !$mnh_end_expand_array()
+END IF
+!dir concurrent
+IF (KSV /=0 ) THEN
+ !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU,JSV=1:KSV)
+   ZSV(:,:,:,:) = PSVT(:,:,:,:)
+ !$mnh_end_expand_array()
+END IF
 !
 IF (GTKE)  THEN
    PRTKES_ADV(:,:,:)  = 0.
@@ -914,15 +933,21 @@ DO JSPL=1,KSPLIT
 ! Tendencies of PPM
 !
 ! acc kernels
-   !$acc kernels
-   !dir$ concurrent
+   !$acc kernels present_cr(PRTHS,ZRTHS_PPM)
    PRTHS(:,:,:)                      = PRTHS     (:,:,:)   + ZRTHS_PPM (:,:,:)   / KSPLIT
-   !dir$ concurrent
-   IF (GTKE)     PRTKES_ADV(:,:,:)   = PRTKES_ADV(:,:,:)   + ZRTKES_PPM(:,:,:)   / KSPLIT
-   !dir$ concurrent
-   IF (KRR /=0)  PRRS      (:,:,:,:) = PRRS      (:,:,:,:) + ZRRS_PPM  (:,:,:,:) / KSPLIT
-   !dir$ concurrent
-   IF (KSV /=0 ) PRSVS     (:,:,:,:) = PRSVS     (:,:,:,:) + ZRSVS_PPM (:,:,:,:) / KSPLIT
+   IF (GTKE) THEN
+      PRTKES_ADV(:,:,:)   = PRTKES_ADV(:,:,:)   + ZRTKES_PPM(:,:,:)   / KSPLIT
+   END IF
+   IF (KRR /=0)  THEN
+      !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU,JR=1:KRR)
+      PRRS      (:,:,:,:) = PRRS      (:,:,:,:) + ZRRS_PPM  (:,:,:,:) / KSPLIT
+      !$mnh_end_expand_array()
+   END IF
+   IF (KSV /=0 ) THEN
+      !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU,JSV=1:KSV)
+      PRSVS     (:,:,:,:) = PRSVS     (:,:,:,:) + ZRSVS_PPM (:,:,:,:) / KSPLIT
+      !$mnh_end_expand_array()
+   END IF
    !$acc end kernels
 !
    IF (JSPL<KSPLIT) THEN
@@ -939,18 +964,16 @@ DO JSPL=1,KSPLIT
       !$acc end kernels
    END IF
    !$acc kernels
-   !$acc_nv loop independent collapse(4)
-   DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU, JR=1:KRR )
+   !$mnh_do_concurrent( JI=1:JIU,JJ=1:JJU,JK=1:JKU, JR=1:KRR )
       ZR(JI,JJ,JK,JR) = ZR(JI,JJ,JK,JR) + ( ZRRS_PPM(JI,JJ,JK,JR) + ZRRS_OTHER(JI,JJ,JK,JR) + PRRS_CLD(JI,JJ,JK,JR) ) &
            * ZTSTEP_PPM / PRHODJ(JI,JJ,JK)
-   END DO !CONCURRENT 
+   !$mnh_end_do() !CONCURRENT 
    !$acc loop seq
    DO JSV = 1, KSV
-      !$acc_nv loop independent collapse(3)
-      DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+      !$mnh_do_concurrent ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
          ZSV(JI,JJ,JK,JSV) = ZSV(JI,JJ,JK,JSV) + ( ZRSVS_PPM(JI,JJ,JK,JSV) + ZRSVS_OTHER(JI,JJ,JK,JSV) +  &
               PRSVS_CLD(JI,JJ,JK,JSV) ) * ZTSTEP_PPM / PRHODJ(JI,JJ,JK)
-      END DO !CONCURRENT 
+      !$mnh_end_do() !CONCURRENT 
    END DO
    !$acc end kernels
    END IF
diff --git a/src/ZSOLVER/advection_uvw.f90 b/src/ZSOLVER/advection_uvw.f90
index 6885bca91..58ba42fe5 100644
--- a/src/ZSOLVER/advection_uvw.f90
+++ b/src/ZSOLVER/advection_uvw.f90
@@ -437,30 +437,24 @@ DO JSPL=1,ISPLIT
 ! Tendencies on wind
 ! acc update device(ZRUS_ADV,ZRVS_ADV,ZRWS_ADV)
 !$acc kernels
-#ifdef MNH_COMPILER_NVHPC  
-!$acc loop independent collapse(3)
-#endif     
-DO CONCURRENT (JI=1:IIU , JJ=1:IJU , JK=1:IKU )
+!$mnh_do_concurrent(JI=1:IIU,JJ=1:IJU,JK=1:IKU )
   PRUS(JI,JJ,JK) = PRUS(JI,JJ,JK) + ZRUS_ADV(JI,JJ,JK) / ISPLIT
   PRVS(JI,JJ,JK) = PRVS(JI,JJ,JK) + ZRVS_ADV(JI,JJ,JK) / ISPLIT
   PRWS(JI,JJ,JK) = PRWS(JI,JJ,JK) + ZRWS_ADV(JI,JJ,JK) / ISPLIT
-END DO
+!$mnh_end_do()
   IF (JSPL<ISPLIT) THEN
 !
 ! Guesses for next time splitting loop
 !
 !
-#ifdef MNH_COMPILER_NVHPC  
-!$acc loop independent collapse(3)
-#endif     
-DO CONCURRENT (JI=1:IIU , JJ=1:IJU , JK=1:IKU )
+!$mnh_do_concurrent(JI=1:IIU,JJ=1:IJU,JK=1:IKU)
   ZU(JI,JJ,JK) = ZU(JI,JJ,JK) + ZTSTEP / ZMXM_RHODJ(JI,JJ,JK) *  &
               (ZRUS_OTHER(JI,JJ,JK) + ZRUS_ADV(JI,JJ,JK))
   ZV(JI,JJ,JK) = ZV(JI,JJ,JK) + ZTSTEP / ZMYM_RHODJ(JI,JJ,JK) *  &
               (ZRVS_OTHER(JI,JJ,JK) + ZRVS_ADV(JI,JJ,JK))
   ZW(JI,JJ,JK) = ZW(JI,JJ,JK) + ZTSTEP / ZMZM_RHODJ(JI,JJ,JK) *  &
               (ZRWS_OTHER(JI,JJ,JK) + ZRWS_ADV(JI,JJ,JK))
-END DO
+!$mnh_end_do()
 END IF
 !$acc end kernels
 !
diff --git a/src/ZSOLVER/advection_uvw_cen.f90 b/src/ZSOLVER/advection_uvw_cen.f90
index 313a36457..a4055eda1 100644
--- a/src/ZSOLVER/advection_uvw_cen.f90
+++ b/src/ZSOLVER/advection_uvw_cen.f90
@@ -403,7 +403,8 @@ ELSEIF (HUVW_ADV_SCHEME=='CEN4TH') THEN
 !
 END IF
 !
-!$acc kernels present( ZRUS, ZRVS, ZRWS, ZMXM_RHODJ, ZMYM_RHODJ, ZMZM_RHODJ )
+!$acc kernels present(ZRUS,ZRVS,ZRWS,ZMXM_RHODJ,ZMYM_RHODJ,ZMZM_RHODJ) &
+!$acc present_cr(PRUS,PRVS,PRWS,PDUM,PDWM)
 ZUS(:,:,:) = ZRUS(:,:,:)/ZMXM_RHODJ(:,:,:)*2.*PTSTEP
 ZVS(:,:,:) = ZRVS(:,:,:)/ZMYM_RHODJ(:,:,:)*2.*PTSTEP
 ZWS(:,:,:) = ZRWS(:,:,:)/ZMZM_RHODJ(:,:,:)*2.*PTSTEP
diff --git a/src/ZSOLVER/contrav.f90 b/src/ZSOLVER/contrav.f90
index 78b9f656f..2976ea754 100644
--- a/src/ZSOLVER/contrav.f90
+++ b/src/ZSOLVER/contrav.f90
@@ -708,23 +708,20 @@ IF (KADV_ORDER == 2 ) THEN
 #endif
 !$acc kernels  
 !
-!$acc_nv loop independent collapse(3)
-  do concurrent (ji=iib:iie,jj=1:iju,jk=ikb:ike+1)
+  !$mnh_do_concurrent(ji=iib:iie,jj=1:iju,jk=ikb:ike+1)
      Z1(ji, jj, jk ) =   ( PRUCT(ji,     jj, jk ) + PRUCT(ji,     jj, jk - 1 ) ) * PDZX (ji,     jj, jk ) * 0.25 &
                        + ( PRUCT(ji + 1, jj, jk ) + PRUCT(ji + 1, jj, jk - 1 ) ) * PDZX (ji + 1, jj, jk ) * 0.25
-  end do
-!$acc_nv loop independent collapse(3)
-  do concurrent (ji=1:iiu,jj=ijb:ije,jk=ikb:ike+1)
+  !$mnh_end_do()
+  !$mnh_do_concurrent(ji=1:iiu,jj=ijb:ije,jk=ikb:ike+1)
      Z2(ji, jj, jk ) =   ( PRVCT(ji, jj,     jk) + PRVCT( ji, jj,    jk - 1) ) * PDZY(ji, jj,     jk) * 0.25 &
                        + ( PRVCT(ji, jj + 1, jk) + PRVCT( ji, jj + 1,jk - 1) ) * PDZY(ji, jj + 1, jk) * 0.25
-  end do
+  !$mnh_end_do()
 
   PRWCT(:,:,:)=0.
 
-!$acc_nv loop independent collapse(3)
-  do concurrent (ji=iib:iie,jj=ijb:ije,jk=ikb:ike+1)
+  !$mnh_do_concurrent(ji=iib:iie,jj=ijb:ije,jk=ikb:ike+1)
      PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk )
-  end do
+  !$mnh_end_do()
 !
 !$acc end kernels  
 ELSE IF (KADV_ORDER == 4 ) THEN
@@ -777,25 +774,23 @@ ELSE IF (KADV_ORDER == 4 ) THEN
 !PW: OpenACC remarks: *computing only ztmp2 and reusing it at next iteration works
 !                      but ji loop can not be collapsed -> 10x slower on GPU
 !                     *ztmp1 and ztmp2 are not necessary but improve readability (no impact on performance)
-!$acc_nv loop independent collapse(3)      
-  do concurrent(ji=IW:IE,jj=1:iju,jk=IKB:IKE+1)
+  !$mnh_do_concurrent(ji=IW:IE,jj=1:iju,jk=IKB:IKE+1)
         ztmp1 = ( 9.0 * PDZX(ji,   jj, jk ) - ( PDZX(ji+1, jj, jk ) + PDZX(ji,   jj, jk ) + PDZX(ji-1, jj, jk ) ) / 3.0 ) / 16.0
         ztmp2 = ( 9.0 * PDZX(ji+1, jj, jk ) - ( PDZX(ji+2, jj, jk ) + PDZX(ji+1, jj, jk ) + PDZX(ji,   jj, jk ) ) / 3.0 ) / 16.0
         Z1(ji, jj, jk ) =  7.0 * (  ( PRUCT(ji,   jj, jk ) + PRUCT(ji,   jj, jk-1 ) ) * ztmp1                        &
                                   + ( PRUCT(ji+1, jj, jk ) + PRUCT(ji+1, jj, jk-1 ) ) * ztmp2               ) / 12.0 &
                          - 0.5 * (  ( PRUCT(ji-1, jj, jk ) + PRUCT(ji-1, jj, jk-1 ) ) * PDZX(ji-1, jj, jk)           &
                                   + ( PRUCT(ji+2, jj, jk ) + PRUCT(ji+2, jj, jk-1 ) ) * PDZX(ji+2, jj, jk)  ) / 12.0
-  end do
+  !$mnh_end_do()
 !
-!$acc_nv loop independent collapse(3)
-  do concurrent(ji=1:iiu,jj=is:in,jk=IKB:IKE+1)
+  !$mnh_do_concurrent(ji=1:iiu,jj=is:in,jk=IKB:IKE+1)
         ztmp1 = ( 9.0 * PDZY(ji, jj,   jk ) - ( PDZY(ji, jj+1, jk ) + PDZY(ji, jj,   jk ) + PDZY(ji, jj-1, jk ) ) / 3.0 ) / 16.0
         ztmp2 = ( 9.0 * PDZY(ji, jj+1, jk ) - ( PDZY(ji, jj+2, jk ) + PDZY(ji, jj+1, jk ) + PDZY(ji, jj,   jk ) ) / 3.0 ) / 16.0
         Z2(ji, jj, jk ) =  7.0 * (  ( PRVCT(ji, jj,   jk ) + PRVCT(ji, jj,   jk-1 ) ) * ztmp1                         &
                                   + ( PRVCT(ji, jj+1, jk ) + PRVCT(ji, jj+1, jk-1 ) ) * ztmp2                ) / 12.0 &
                          - 0.5 * (  ( PRVCT(ji, jj-1, jk ) + PRVCT(ji, jj-1, jk-1 ) ) * PDZY(ji, jj-1, jk )           &
                                   + ( PRVCT(ji, jj+2, jk ) + PRVCT(ji, jj+2, jk-1 ) ) * PDZY(ji, jj+2, jk )  ) / 12.0
-  end do
+  !$mnh_end_do()
 !$acc end kernels
 !
 !!$CALL MPPDB_CHECK3DM("contrav_device :: dom Z1/Z2",PRECISION,Z1,Z2)  
@@ -804,27 +799,25 @@ ELSE IF (KADV_ORDER == 4 ) THEN
 !
 !!$  IF (NHALO==1) THEN
 !$acc kernels async
-!$acc_nv loop independent collapse(2)
-    do concurrent(jj=1:iju,jk=IKB:IKE+1)
+    !$mnh_do_concurrent(jj=1:iju,jk=IKB:IKE+1)
       ztmp1 = ( 9.0 * PDZX(IIE,   jj, jk ) - ( PDZX(IIE+1, jj, jk ) + PDZX(IIE,   jj, jk ) + PDZX(IIE-1, jj, jk ) ) / 3.0 ) / 16.0
       ztmp2 = ( 9.0 * PDZX(IIE+1, jj, jk ) - ( ZDZX_EAST(jj, jk )   + PDZX(IIE+1, jj, jk ) + PDZX(IIE,   jj, jk ) ) / 3.0 ) / 16.0
       Z1(IIE, jj, jk ) =  7.0 * (  ( PRUCT(IIE,   jj, jk ) + PRUCT(IIE,   jj, jk-1 ) ) * ztmp1                        &
                                  + ( PRUCT(IIE+1, jj, jk ) + PRUCT(IIE+1, jj, jk-1 ) ) * ztmp2               ) / 12.0 &
                         - 0.5 * (  ( PRUCT(IIE-1, jj, jk ) + PRUCT(IIE-1, jj, jk-1 ) ) * PDZX(IIE-1, jj, jk)          &
                                  + ( ZU_EAST     (jj, jk ) + ZU_EAST     (jj, jk-1 ) ) * ZDZX_EAST  (jj, jk)  ) / 12.0
-   end do
+   !$mnh_end_do()
 !$acc end kernels   
 !
 !$acc kernels async   
-!$acc_nv loop independent collapse(2)
-    do concurrent(ji=1:iiu,jk=IKB:IKE+1)
+    !$mnh_do_concurrent(ji=1:iiu,jk=IKB:IKE+1)
       ztmp1 = ( 9.0 * PDZY(ji, IJE,   jk) - ( PDZY      (ji, IJE+1, jk) + PDZY(ji, IJE,   jk) + PDZY(ji, IJE-1, jk) ) / 3.0 ) / 16.0
       ztmp2 = ( 9.0 * PDZY(ji, IJE+1, jk) - ( ZDZY_NORTH(ji,        jk) + PDZY(ji, IJE+1, jk) + PDZY(ji, IJE,   jk) ) / 3.0 ) / 16.0
       Z2(ji, IJE, jk ) =  7.0 * (  ( PRVCT   (ji, IJE,   jk ) + PRVCT   (ji, IJE,   jk-1 ) ) * ztmp1                               &
                                  + ( PRVCT   (ji, IJE+1, jk ) + PRVCT   (ji, IJE+1, jk-1 ) ) * ztmp2                      ) / 12.0 &
                         - 0.5 * (  ( PRVCT   (ji, IJE-1, jk ) + PRVCT   (ji, IJE-1, jk-1 ) ) * PDZY      (ji, IJE-1, jk )          &
                                  + ( ZV_NORTH(ji,        jk ) + ZV_NORTH(ji,        jk-1 ) ) * ZDZY_NORTH(ji,        jk ) ) / 12.0
-   end do
+   !$mnh_end_do()
 !$acc end kernels
 !$acc wait
 !!$  END IF
@@ -871,10 +864,9 @@ ELSE IF (KADV_ORDER == 4 ) THEN
 !!$
 !!$  CALL MPPDB_CHECK3DM("contrav_device ::Z1/Z2/ PDZZ",PRECISION,Z1,Z2,PDZZ)
   PRWCT(:,:,:)=0.
-!$acc_nv loop independent collapse(3)
-  do concurrent (ji=iib:iie,jj=ijb:ije,jk=ikb:ike+1)
+  !$mnh_do_concurrent (ji=iib:iie,jj=ijb:ije,jk=ikb:ike+1)
         PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk )
-  end do
+  !$mnh_end_do()
 !$acc end kernels 
 !
 CALL MPPDB_CHECK3DM("contrav_device :: PRWCT/Z1/Z2",PRECISION,PRWCT,Z1,Z2)
diff --git a/src/ZSOLVER/dotprod.f90 b/src/ZSOLVER/dotprod.f90
index 2c4791cc6..dab2e6a88 100644
--- a/src/ZSOLVER/dotprod.f90
+++ b/src/ZSOLVER/dotprod.f90
@@ -185,18 +185,16 @@ CALL MNH_MEM_GET(ZDOTPROD, ILBXB,ILBXE ,ILBYB,ILBYE )
 #endif
 !$acc kernels present(ZDOTPROD)
 ZDOTPROD(:,:)    = 0.
-!$acc loop seq
-!dir nextscalar
-DO JK = IKB-1,IKE+1
-   !DO CONCURRENT (JI=ILBXB:ILBXE,JJ=ILBYB:ILBYE)
-   !$acc loop collapse(2) independent
-   DO JJ = ILBYB,ILBYE
-      DO JI = ILBXB,ILBXE  
-         ZDOTPROD(JI,JJ) = ZDOTPROD(JI,JJ) + PA(JI,JJ,JK) * PB(JI,JJ,JK)
-      END DO
-   END DO
-END DO
 !$acc end kernels
+!$acc parallel 
+!$mnh_do_concurrent(JI=ILBXB:ILBXE,JJ=ILBYB:ILBYE)
+  !dir$ nextscalar
+  !$acc loop seq
+  DO JK = IKB-1,IKE+1
+         ZDOTPROD(JI,JJ) = ZDOTPROD(JI,JJ) + PA(JI,JJ,JK) * PB(JI,JJ,JK)
+  END DO
+!$mnh_end_do()
+!$acc end parallel
 !$acc update host(ZDOTPROD)
 PDOTPROD = SUM_DD_R2_ll(ZDOTPROD)
 !JUAN16
diff --git a/src/ZSOLVER/get_halo.f90 b/src/ZSOLVER/get_halo.f90
index 719fa0a19..e53338d1f 100644
--- a/src/ZSOLVER/get_halo.f90
+++ b/src/ZSOLVER/get_halo.f90
@@ -464,6 +464,11 @@ INTEGER,PARAMETER :: IS_WEST=1 , IS_EAST=2, IS_SOUTH=3, IS_NORTH=4
 LOGICAL      :: LX , LY
 INTEGER      :: NB_REQ, IERR
 !
+INTEGER :: JI,JJ,JK, JIU,JJU,JKU
+
+JIU = SIZE(PSRC,1)
+JJU = SIZE(PSRC,2)
+JKU = SIZE(PSRC,3)
 
 CALL INIT_HALO_D()
 
@@ -553,12 +558,16 @@ END IF
 IF (LX) THEN
    IF (.NOT. GWEST) THEN
       !$acc kernels async(IS_WEST)
-      ZWEST_IN ( IIB:IIB+IHALO_1  ,    IJB:IJE  , : )  = PSRC( IIB:IIB+IHALO_1  ,  IJB:IJE  , : )
+      !$mnh_expand_array(JI=IIB:IIB+IHALO_1 , JJ=IJB:IJE , JK=1:JKU )
+           ZWEST_IN ( IIB:IIB+IHALO_1  ,    IJB:IJE  , : )  = PSRC( IIB:IIB+IHALO_1  ,  IJB:IJE  , : )
+      !$mnh_end_expand_array()
       !$acc end kernels
    END IF
    IF (.NOT.GEAST) THEN
       !$acc kernels async(IS_EAST)
-      ZEAST_IN ( IIE-IHALO_1:IIE  ,    IJB:IJE  , : )  = PSRC( IIE-IHALO_1:IIE  ,  IJB:IJE  , : )
+      !$mnh_expand_array(JI=IIE-IHALO_1:IIE , JJ=IJB:IJE , JK=1:JKU)
+           ZEAST_IN ( IIE-IHALO_1:IIE  ,    IJB:IJE  , : )  = PSRC( IIE-IHALO_1:IIE  ,  IJB:IJE  , : )
+      !$mnh_end_expand_array()
       !$acc end kernels
    ENDIF
 END IF
@@ -566,12 +575,16 @@ END IF
 IF (LY) THEN
    IF (.NOT.GSOUTH) THEN
       !$acc kernels async(IS_SOUTH)
-      ZSOUTH_IN ( IIB:IIE  ,    IJB:IJB+IHALO_1  , : ) = PSRC( IIB:IIE  ,    IJB:IJB+IHALO_1  , : )
+      !$mnh_expand_array(JI=IIB:IIE , JJ=IJB:IJB+IHALO_1 , JK=1:JKU )
+           ZSOUTH_IN ( IIB:IIE  ,    IJB:IJB+IHALO_1  , : ) = PSRC( IIB:IIE  ,    IJB:IJB+IHALO_1  , : )
+      !$mnh_end_expand_array()
       !$acc end kernels
    ENDIF
    IF (.NOT.GNORTH) THEN
       !$acc kernels async(IS_NORTH)
-      ZNORTH_IN ( IIB:IIE  ,    IJE-IHALO_1:IJE  , : ) = PSRC( IIB:IIE  ,    IJE-IHALO_1:IJE  , : )
+      !$mnh_expand_array(JI=IIB:IIE , JJ=IJE-IHALO_1:IJE , JK=1:JKU )
+           ZNORTH_IN ( IIB:IIE  ,    IJE-IHALO_1:IJE  , : ) = PSRC( IIB:IIE  ,    IJE-IHALO_1:IJE  , : )
+      !$mnh_end_expand_array()
       !$acc end kernels
    ENDIF   
 ENDIF
@@ -676,6 +689,11 @@ INTEGER,PARAMETER :: IS_WEST=1 , IS_EAST=2, IS_SOUTH=3, IS_NORTH=4
 LOGICAL      :: LX , LY
 INTEGER      :: NB_REQ, IERR
 !
+INTEGER :: JI,JJ,JK, JIU,JJU,JKU
+
+JIU = SIZE(PSRC,1)
+JJU = SIZE(PSRC,2)
+JKU = SIZE(PSRC,3)
 
 CALL INIT_HALO_D()
 
@@ -715,7 +733,9 @@ IF (LX) THEN
    !$acc update device(ZWEST_OUT) async(IS_WEST)
 #endif
    !$acc kernels async(IS_WEST)
-   PSRC( 1:IIB-1  ,      IJB:IJE      , : ) = ZWEST_OUT( 1:IIB-1  ,   IJB:IJE    , : )
+   !$mnh_expand_array(JI=1:IIB-1 , JJ=IJB:IJE , JK=1:JKU )
+        PSRC( 1:IIB-1  ,      IJB:IJE      , : ) = ZWEST_OUT( 1:IIB-1  ,   IJB:IJE    , : )
+   !$mnh_end_expand_array()
    !$acc end kernels
    ENDIF
    IF (.NOT.GEAST) THEN
@@ -723,7 +743,9 @@ IF (LX) THEN
    !$acc update device(ZEAST_OUT) async(IS_EAST)
 #endif
    !$acc kernels async(IS_EAST)
-   PSRC( IIE+1:IIU  ,      IJB:IJE      , : ) = ZEAST_OUT( IIE+1:IIU  ,   IJB:IJE    , : )  
+   !$mnh_expand_array(JI=IIE+1:IIU , JJ=IJB:IJE , JK=1:JKU )
+        PSRC( IIE+1:IIU  ,      IJB:IJE      , : ) = ZEAST_OUT( IIE+1:IIU  ,   IJB:IJE    , : )  
+   !$mnh_end_expand_array()
    !$acc end kernels
    ENDIF
 END IF
@@ -733,7 +755,9 @@ IF (LY) THEN
    !$acc update device(ZSOUTH_OUT) async(IS_SOUTH)
 #endif
    !$acc kernels async(IS_SOUTH)
-   PSRC(      IIB:IIE       ,  1:IJB-1 , : ) = ZSOUTH_OUT(  IIB:IIE     , 1:IJB-1  , : )
+   !$mnh_expand_array(JI=IIB:IIE , JJ=1:IJB-1 , JK=1:JKU )
+        PSRC(      IIB:IIE       ,  1:IJB-1 , : ) = ZSOUTH_OUT(  IIB:IIE     , 1:IJB-1  , : )
+   !$mnh_end_expand_array()
    !$acc end kernels
    ENDIF
    IF (.NOT.GNORTH) THEN
@@ -741,7 +765,9 @@ IF (LY) THEN
    !$acc update device(ZNORTH_OUT) async(IS_NORTH)
 #endif
    !$acc kernels async(IS_NORTH)
-   PSRC(      IIB:IIE       , IJE+1:IJU , : ) = ZNORTH_OUT (  IIB:IIE     , IJE+1:IJU  , : )
+   !$mnh_expand_array(JI=IIB:IIE , JJ=IJE+1:IJU , JK=1:JKU )
+        PSRC(      IIB:IIE       , IJE+1:IJU , : ) = ZNORTH_OUT (  IIB:IIE     , IJE+1:IJU  , : )
+   !$mnh_end_expand_array()
    !$acc end kernels
    ENDIF
 END IF
diff --git a/src/ZSOLVER/ppm.f90 b/src/ZSOLVER/ppm.f90
index 608a9b2ae..51bf1a52a 100644
--- a/src/ZSOLVER/ppm.f90
+++ b/src/ZSOLVER/ppm.f90
@@ -525,10 +525,7 @@ ZFNEG(:,:,:) = PSRC(:,:,:)
 CALL GET_HALO_D(PSRC,HDIR="01_X", HNAME='PSRC')
 !
 !$acc kernels 
-!$acc loop independent collapse(3)
-  do jk = 1, iku
-    do jj = 1, iju
-      do ji = 1, iiu
+!$mnh_do_concurrent (ji=1:iiu,jj=1:iju,jk=1:iku)
         PR   (ji, jj, jk ) = PSRC(ji, jj, jk )
         ZQL  (ji, jj, jk ) = PSRC(ji, jj, jk )
         ZQR  (ji, jj, jk ) = PSRC(ji, jj, jk )
@@ -538,9 +535,7 @@ CALL GET_HALO_D(PSRC,HDIR="01_X", HNAME='PSRC')
         ZQL0 (ji, jj, jk ) = PSRC(ji, jj, jk )
         ZQR0 (ji, jj, jk ) = PSRC(ji, jj, jk )
         ZQ60 (ji, jj, jk ) = PSRC(ji, jj, jk )
-    end do
-  end do
-end do
+!$mnh_end_do()
 !
 #if 0
 ZFPOS(:,1:IJS,:)=PSRC(:,1:IJS,:)
diff --git a/src/ZSOLVER/turb.f90 b/src/ZSOLVER/turb.f90
index a173a86c6..c72f7d21d 100644
--- a/src/ZSOLVER/turb.f90
+++ b/src/ZSOLVER/turb.f90
@@ -14,7 +14,7 @@ module mode_turb
 #if defined(MNH_BITREP) || defined(MNH_BITREP_OMP)
 use modi_bitrep
 #endif
-#ifdef MNH_COMPILER_CCE
+#if defined(MNH_COMPILER_CCE) && defined(MNH_BITREP_OMP)
 !$mnh_undef(LOOP)
 !$mnh_undef(OPENACC)  
 #endif
@@ -760,9 +760,9 @@ ELSE
 #if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP)
   ZEXN(:,:,:) = (PPABST(:,:,:)/XP00) ** (XRD/XCPD)
 #else
-DO CONCURRENT(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+!$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
   ZEXN(JI,JJ,JK) = BR_POW(PPABST(JI,JJ,JK)/XP00,XRD/XCPD)
-END DO
+!$mnh_end_do()
 #endif
 END IF
 !
@@ -811,18 +811,20 @@ IF (KRRL >=1) THEN
                                  ZLSOCPEXNM,ZAMOIST_ICE,ZATHETA_ICE)
 !
 !$acc kernels present_cr( zamoist, zatheta, zlocpexnm, zlvocpexnm, zlsocpexnm, zamoist_ice, zatheta_ice )    
-    DO CONCURRENT(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+    !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
        IF (PRT(JI,JJ,JK,2)+PRT(JI,JJ,JK,4)>0.0) THEN
           ZFRAC_ICE(JI,JJ,JK) = PRT(JI,JJ,JK,4) / ( PRT(JI,JJ,JK,2)+PRT(JI,JJ,JK,4) )
        END IF
-    END DO
+    !$mnh_end_do()
 !
+  !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
     ZLOCPEXNM(:,:,:) = (1.0-ZFRAC_ICE(:,:,:))*ZLVOCPEXNM(:,:,:) &
                            +ZFRAC_ICE(:,:,:) *ZLSOCPEXNM(:,:,:)
     ZAMOIST(:,:,:) = (1.0-ZFRAC_ICE(:,:,:))*ZAMOIST(:,:,:) &
                          +ZFRAC_ICE(:,:,:) *ZAMOIST_ICE(:,:,:)
     ZATHETA(:,:,:) = (1.0-ZFRAC_ICE(:,:,:))*ZATHETA(:,:,:) &
                          +ZFRAC_ICE(:,:,:) *ZATHETA_ICE(:,:,:)
+  !$mnh_end_expand_array()
 !$acc end kernels
 
 !$acc end data
@@ -877,7 +879,7 @@ END IF              ! loop end on KRRL >= 1
 IF ( KRRL >= 1 ) THEN
 !$acc kernels present_cr( zlocpexnm )
   IF ( KRRI >= 1 ) THEN
-    DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+    !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
        ! Rnp at t
        PRT(JI,JJ,JK,1)  = PRT(JI,JJ,JK,1)  + PRT(JI,JJ,JK,2)  + PRT(JI,JJ,JK,4)
        PRRS(JI,JJ,JK,1) = PRRS(JI,JJ,JK,1) + PRRS(JI,JJ,JK,2) + PRRS(JI,JJ,JK,4)
@@ -886,16 +888,16 @@ IF ( KRRL >= 1 ) THEN
             - ZLSOCPEXNM(JI,JJ,JK) * PRT(JI,JJ,JK,4)
        PRTHLS(JI,JJ,JK) = PRTHLS(JI,JJ,JK) - ZLVOCPEXNM(JI,JJ,JK) * PRRS(JI,JJ,JK,2) &
             - ZLSOCPEXNM(JI,JJ,JK) * PRRS(JI,JJ,JK,4)
-    ENDDO
+    !$mnh_end_do()
  ELSE
-    DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+    !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
        ! Rnp at t   
        PRT(JI,JJ,JK,1)  = PRT(JI,JJ,JK,1)  + PRT(JI,JJ,JK,2) 
        PRRS(JI,JJ,JK,1) = PRRS(JI,JJ,JK,1) + PRRS(JI,JJ,JK,2)
        ! Theta_l at t
        PTHLT(JI,JJ,JK)  = PTHLT(JI,JJ,JK)  - ZLOCPEXNM(JI,JJ,JK) * PRT(JI,JJ,JK,2)
        PRTHLS(JI,JJ,JK) = PRTHLS(JI,JJ,JK) - ZLOCPEXNM(JI,JJ,JK) * PRRS(JI,JJ,JK,2)
-    ENDDO
+    !$mnh_end_do()
  END IF
 !$acc end kernels
 END IF
@@ -1083,11 +1085,10 @@ ENDIF
   ZCDUEFF(:,:) =-SQRT ( (PSFU(:,:)**2 + PSFV(:,:)**2) /                  &
                         (XMNH_TINY + ZUSLOPE(:,:)**2 + ZVSLOPE(:,:)**2 ) )
 #else
-  !$acc_nv loop independent collapse(2)
-  DO CONCURRENT ( JI=1:JIU,JJ=1:JJU )
+  !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU)
      ZCDUEFF(JI,JJ) =-SQRT ( (BR_P2(PSFU(JI,JJ)) + BR_P2(PSFV(JI,JJ))) /                  &
                     (XMNH_TINY + BR_P2(ZUSLOPE(JI,JJ)) + BR_P2(ZVSLOPE(JI,JJ)) ) )     
-  END DO
+  !$mnh_end_do()
 #endif
 !$acc end kernels
 !
@@ -1879,9 +1880,9 @@ CALL MNH_MEM_GET( zdrvsatdt, size( pexn, 1 ), size( pexn, 2 ), size( pexn, 3 ) )
 #if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP)
   ZRVSAT(:,:,:) =  EXP( PALP - PBETA/PT(:,:,:) - PGAM*ALOG( PT(:,:,:) ) )
 #else
-  DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+  !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
      ZRVSAT(JI,JJ,JK) =  BR_EXP( PALP - PBETA/PT(JI,JJ,JK) - PGAM*BR_LOG( PT(JI,JJ,JK) ) )
-  END DO
+  !$mnh_end_do()
 #endif
 !$acc end kernels  
 !$acc kernels present_cr(ZRVSAT,ZDRVSATDT) 
@@ -1915,8 +1916,7 @@ CALL MNH_MEM_GET( zdrvsatdt, size( pexn, 1 ), size( pexn, 2 ), size( pexn, 3 ) )
          - ZDRVSATDT(:,:,:)                                                  &
         )
 #else
-!$acc_nv loop independent collapse(3)
-DO CONCURRENT(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+!$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
  PATHETA(JI,JJ,JK)= PAMOIST(JI,JJ,JK) * PEXN(JI,JJ,JK) *                             &
         ( ( ZRVSAT(JI,JJ,JK) - PRT(JI,JJ,JK,1) ) * PLOCPEXN(JI,JJ,JK) /              &
           ( 1. + ZDRVSATDT(JI,JJ,JK) * PLOCPEXN(JI,JJ,JK) )        *                 &
@@ -1928,16 +1928,15 @@ DO CONCURRENT(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
           )                                                                          &
          - ZDRVSATDT(JI,JJ,JK)                                                       &
         )
-ENDDO
+!$mnh_end_do()
 #endif
 !$acc end kernels
 !*      1.7 Lv/Cph/Exner at t-1
 !
 !$acc kernels present_cr(PLOCPEXN)
-!$acc_nv loop independent collapse(3)
-DO CONCURRENT(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+!$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
    PLOCPEXN(JI,JJ,JK) = PLOCPEXN(JI,JJ,JK) / PEXN(JI,JJ,JK)
-END DO
+!$mnh_end_do()
 !$acc end kernels
 
   if ( mppdb_initialized ) then
@@ -2275,9 +2274,9 @@ IF (ODZ) THEN
 #if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP)
       PLM(:,:,:) = ( PLM(:,:,:) * ZTMP1_DEVICE(:,:,:) * ZTMP2_DEVICE(:,:,:) ) ** (1./3.)
 #else
-DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+!$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
       PLM(JI,JJ,JK) = BR_POW( PLM(JI,JJ,JK) * ZTMP1_DEVICE(JI,JJ,JK) * ZTMP2_DEVICE(JI,JJ,JK), 1./3. )
-ENDDO
+!$mnh_end_do()
 #endif
 !$acc end kernels
 #endif
@@ -2309,9 +2308,9 @@ ELSE
 #if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP)
       PLM(:,:,:) = ( ZTMP1_DEVICE * ZTMP2_DEVICE ) ** (1./2.)
 #else
-   DO CONCURRENT( JI=1:JIU, JJ=1:JJU, JK=1:JKU )
+   !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
       PLM(JI,JJ,JK) = BR_POW( ZTMP1_DEVICE(JI,JJ,JK) * ZTMP2_DEVICE(JI,JJ,JK),  1. / 2. )
-   END DO
+   !$mnh_end_do()
 #endif
 !$acc end kernels
 #endif
@@ -2360,12 +2359,12 @@ IF (.NOT. ORMC01) THEN
 END IF
 !
 !$acc kernels
-DO CONCURRENT(JI=1:JIU , JJ=1:JJU )
-   PLM(JI,JJ,KKA) = PLM(JI,JJ,KKB  )
-END DO
-DO CONCURRENT(JI=1:JIU , JJ=1:JJU )
-   PLM(JI,JJ,KKU  ) = PLM(JI,JJ,KKE)
-END DO
+!$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU)
+   PLM(JI,JJ,KKA) = PLM(JI,JJ,KKB)
+! Former loop boundary, disabled so that both boundary copies share one loop:
+! mnh_end_do() / mnh_do_concurrent(JI=1:JIU,JJ=1:JJU)
+   PLM(JI,JJ,KKU) = PLM(JI,JJ,KKE)
+!$mnh_end_do()
 !$acc end kernels
 
 !$acc end data
@@ -2580,9 +2579,9 @@ IF ( HTURBDIM /= '1DIM' ) THEN  ! 3D turbulence scheme
        call Mppdb_check( plm, "Dear mid1:plm" )
     end if
 !$acc kernels
-DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+!$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU)
     PLM(JI,JJ,JK) = BR_POW( PLM(JI,JJ,JK)*ZTMP1_DEVICE(JI,JJ,JK)    *ZTMP2_DEVICE(JI,JJ,JK)     , 1./3. )
-ENDDO
+!$mnh_end_do()
 !$acc end kernels
     if ( mppdb_initialized ) then
        call Mppdb_check( plm, "Dear mid2:plm" )
@@ -2603,8 +2602,8 @@ CALL EMOIST(KRR,KRRI,PTHLT,PRT,PLOCPEXNM,PAMOIST,PSRCT,ZEMOIST)
 !
 !$acc kernels present(ZWORK2D,PLM)
 IF (KRR>0) THEN
-   !$acc_nv loop independent collapse(3) private(ZVAR)
-   DO CONCURRENT( JI=1:JIU, JJ=1:JJU, JK = KKTB+1:KKTE-1)          
+   ! acc loop private(ZVAR)
+   !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=KKTB+1:KKTE-1)          
             ZDTHLDZ(JI,JJ,JK)= 0.5*((PTHLT(JI,JJ,JK+KKL)-PTHLT(JI,JJ,JK    ))/PDZZ(JI,JJ,JK+KKL)+ &
                  (PTHLT(JI,JJ,JK    )-PTHLT(JI,JJ,JK-KKL))/PDZZ(JI,JJ,JK    ))
             ZDRTDZ(JI,JJ,JK) = 0.5*((PRT(JI,JJ,JK+KKL,1)-PRT(JI,JJ,JK    ,1))/PDZZ(JI,JJ,JK+KKL)+ &
@@ -2620,10 +2619,10 @@ IF (KRR>0) THEN
                PLM(JI,JJ,JK)=MAX(XMNH_EPSILON,MIN(PLM(JI,JJ,JK), &
                     0.76* SQRT(PTKET(JI,JJ,JK)/ZVAR)))
             END IF
-   END DO
+   !$mnh_end_do()
 ELSE! For dry atmos or unsalted ocean runs
-   !$acc_nv loop independent collapse(3) private(ZVAR)
-   DO CONCURRENT( JI=1:JIU, JJ=1:JJU, JK = KKTB+1:KKTE-1)
+   ! acc loop private(ZVAR)
+   !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=KKTB+1:KKTE-1)
             ZDTHLDZ(JI,JJ,JK)= 0.5*((PTHLT(JI,JJ,JK+KKL)-PTHLT(JI,JJ,JK    ))/PDZZ(JI,JJ,JK+KKL)+ &
                  (PTHLT(JI,JJ,JK    )-PTHLT(JI,JJ,JK-KKL))/PDZZ(JI,JJ,JK    ))
             IF (GOCEAN) THEN
@@ -2636,7 +2635,7 @@ ELSE! For dry atmos or unsalted ocean runs
                PLM(JI,JJ,JK)=MAX(XMNH_EPSILON,MIN(PLM(JI,JJ,JK), &
                     0.76* SQRT(PTKET(JI,JJ,JK)/ZVAR)))
             END IF
-   END DO
+   !$mnh_end_do()
 END IF
 !  special case near the surface
 ZDTHLDZ(:,:,KKB)=(PTHLT(:,:,KKB+KKL)-PTHLT(:,:,KKB))/PDZZ(:,:,KKB+KKL)
@@ -2653,12 +2652,9 @@ IF (GOCEAN) THEN
   ZWORK2D(:,:)=XG*(XALPHAOC*ZDTHLDZ(:,:,KKB)-XBETAOC*ZDRTDZ(:,:,KKB))
 #else
 !PW: bug: nvhpc 21.11 does not parallelize this loop even with loop independent directive!
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2)
-#endif
-  DO CONCURRENT( JI = 1 : JIU, JJ = 1 : JJU )
+  !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU)
     ZWORK2D(JI,JJ)=XG*(XALPHAOC*ZDTHLDZ(JI,JJ,KKB)-XBETAOC*ZDRTDZ(JI,JJ,KKB))
-  END DO
+  !$mnh_end_do()
 #endif
 ELSE
 #if 0
@@ -2667,21 +2663,18 @@ ELSE
               (ZETHETA(:,:,KKB)*ZDTHLDZ(:,:,KKB)+ZEMOIST(:,:,KKB)*ZDRTDZ(:,:,KKB))
 #else
 !PW: bug: nvhpc 21.11 does not parallelize this loop even with loop independent directive!
-#ifdef MNH_COMPILER_NVHPC
-!$acc loop independent collapse(2)
-#endif
-  DO CONCURRENT( JI = 1 : JIU, JJ = 1 : JJU )
+  !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU)
     ZWORK2D(JI,JJ)=XG/PTHVREF(JI,JJ,KKB)*                                           &
               (ZETHETA(JI,JJ,KKB)*ZDTHLDZ(JI,JJ,KKB)+ZEMOIST(JI,JJ,KKB)*ZDRTDZ(JI,JJ,KKB))
-  END DO
+  !$mnh_end_do()
 #endif
 END IF
-DO CONCURRENT(JI=1:JIU,JJ=1:JJU)
+!$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU)
    IF (ZWORK2D(JI,JJ)>0.) THEN
       PLM(JI,JJ,KKB)=MAX(XMNH_EPSILON,MIN( PLM(JI,JJ,KKB),                 &
            0.76* SQRT(PTKET(JI,JJ,KKB)/ZWORK2D(JI,JJ))))
    END IF
-END DO
+!$mnh_end_do()
 !
 !  mixing length limited by the distance normal to the surface (with the same factor as for BL89)
 !
-- 
GitLab