From fc613008cf3cfbfe8ddbf5d25ce3a345e2ac7395 Mon Sep 17 00:00:00 2001
From: Philippe WAUTELET <philippe.wautelet@aero.obs-mip.fr>
Date: Wed, 26 Jun 2019 15:08:13 +0200
Subject: [PATCH] Philippe 26/06/2019: OPENACC: optimisation for GPU + improve
 readability

---
 src/MNH/contrav.f90 | 256 +++++++++++++++++++++-----------------------
 1 file changed, 124 insertions(+), 132 deletions(-)

diff --git a/src/MNH/contrav.f90 b/src/MNH/contrav.f90
index b0c14210a..4a1d6da58 100644
--- a/src/MNH/contrav.f90
+++ b/src/MNH/contrav.f90
@@ -316,7 +316,7 @@ ELSE IF (KADV_ORDER == 4 ) THEN
 !!$   END IF
    !
    !
-   !*       3.1    interior of the processor subdomain
+   !*       3.1    interior of the process subdomain
 !
 !
    Z1(IW:IE,:,IKB:IKE+1)=                                               &
@@ -344,7 +344,7 @@ ELSE IF (KADV_ORDER == 4 ) THEN
             +(PRVCT(:,IS+2:IN+2,IKB:IKE+1)+PRVCT(:,IS+2:IN+2,IKB-1:IKE)) &
             *PDZY(:,IS+2:IN+2,IKB:IKE+1) *0.5)/12.0
 !
-!*       3.2    limits of the processor subdomain (inside the whole domain or in cyclic conditions)
+!*       3.2    limits of the process subdomain (inside the whole domain or in cyclic conditions)
 !
 !!$  IF (NHALO==1) THEN
 
@@ -519,6 +519,7 @@ END SUBROUTINE CONTRAV
 !!      Corrections 19/01/11 (by J.P. Pinty) WC 4th order
 !!      Corrections 28/03/11 (by V.Masson) // of WC 4th order
 !!      J.Escobar 21/03/2013: for HALOK comment all NHALO=1 test
+!  P. Wautelet 26/06/2019: optimisation for GPU + improve readability
 !----------------------------------------------------------------------------
 !
 !*       0.    DECLARATIONS
@@ -562,22 +563,22 @@ REAL, DIMENSION(:,:,:),  INTENT(OUT)   ::  PRWCT    ! Contrav comp along z-bar
 INTEGER,                 INTENT(IN)    ::  KADV_ORDER ! Order of the advection scheme
 REAL, DIMENSION(:,:,:),  INTENT(OUT)   :: Z1,Z2       ! Work arrays
 !$acc declare present(Z1,Z2)
-LOGICAL, OPTIONAL, INTENT(IN) :: ODATA_ON_DEVICE ! Is some of the data on the accelerator device
+LOGICAL,   OPTIONAL,     INTENT(IN)    :: ODATA_ON_DEVICE ! Is some of the data on the accelerator device
 !
 !
 !*       0.2   declarations of local variables
 !
-REAL, DIMENSION(:,:),ALLOCATABLE::ZU_EAST,ZV_NORTH,ZDZX_EAST,ZDZY_NORTH
-!$acc declare create(ZU_EAST,ZV_NORTH,ZDZX_EAST,ZDZY_NORTH)
-INTEGER :: IIB,IIE,IJB,IJE,IKB,IKE
-INTEGER :: IIU, IJU, IKU
-INTEGER:: IW,IE,IS,IN   ! Coordinate of forth order diffusion area
-!
-TYPE(LIST_ll),      POINTER :: TZFIELD_U, TZFIELD_V, TZFIELD_DZX, TZFIELD_DZY
-TYPE(HALO2LIST_ll), POINTER :: TZHALO2_U, TZHALO2_V, TZHALO2_DZX, TZHALO2_DZY
-INTEGER                     :: IINFO_ll
-LOGICAL :: GDATA_ON_DEVICE
-
+integer                             :: ji, jj, jk
+INTEGER                             :: IIB, IIE, IJB, IJE, IKB, IKE
+INTEGER                             :: IIU, IJU, IKU
+INTEGER                             :: IW, IE, IS, IN   ! Coordinate of fourth order diffusion area
+INTEGER                             :: IINFO_ll
+LOGICAL                             :: GDATA_ON_DEVICE
+real                                :: ZTMP1, ZTMP2 ! Intermediate work variables
+REAL,   DIMENSION(:,:), ALLOCATABLE :: ZU_EAST, ZV_NORTH, ZDZX_EAST, ZDZY_NORTH
+!$acc declare create(ZU_EAST, ZV_NORTH, ZDZX_EAST, ZDZY_NORTH)
+TYPE(LIST_ll),          POINTER     :: TZFIELD_U, TZFIELD_V, TZFIELD_DZX, TZFIELD_DZY
+TYPE(HALO2LIST_ll),     POINTER     :: TZHALO2_U, TZHALO2_V, TZHALO2_DZX, TZHALO2_DZY
 
 IF ( PRESENT(ODATA_ON_DEVICE) ) THEN
   GDATA_ON_DEVICE = ODATA_ON_DEVICE
@@ -612,7 +613,7 @@ IKE=IKU - JPVEXT
 !
 IF (GDATA_ON_DEVICE) THEN
 !PW TODO:remplacer (ailleurs aussi...) 1/PDXX... par PINV_PDXX (fait pour la turbulence...) cfr MNH/turb_hor_splt.f90
-!$acc kernels present(PRUCT,PRVCT,PRUT,PRVT,PDXX,PDYY)
+!$acc kernels
   PRUCT(:,:,:) = PRUT(:,:,:) / PDXX(:,:,:)
   PRVCT(:,:,:) = PRVT(:,:,:) / PDYY(:,:,:)
 !$acc end kernels
@@ -656,7 +657,7 @@ IF (KADV_ORDER == 4 ) THEN
   ZDZX_EAST(:,:)  = TZHALO2_DZX%HALO2%EAST
   ZV_NORTH(:,:)   = TZHALO2_V%HALO2%NORTH
   ZDZY_NORTH(:,:) = TZHALO2_DZY%HALO2%NORTH
-  !$acc update device(ZU_EAST,ZV_NORTH,ZDZX_EAST,ZDZY_NORTH)
+!$acc update device(ZU_EAST,ZV_NORTH,ZDZX_EAST,ZDZY_NORTH)
  END IF
 END IF
 !
@@ -666,7 +667,7 @@ END IF
 !
 IF (LFLAT) THEN
 IF (GDATA_ON_DEVICE) THEN
-!$acc kernels present(PRWCT,PRWT,PDZZ)
+!$acc kernels
   PRWCT(:,:,:) = PRWT(:,:,:) / PDZZ(:,:,:)
 !$acc end kernels
 !$acc update self(PRWCT)
@@ -679,11 +680,9 @@ END IF
 !*       3.    Compute the vertical contravariant components (general case)
 !              ------------------------------------
 !
-!$acc kernels present(Z1,Z2) present(PRUCT,PRVCT,PRWCT) present(PDXX,PDYY,PDZZ,PDZX,PDZY) &
-!$acc & present(ZU_EAST,ZV_NORTH,ZDZX_EAST,ZDZY_NORTH)
-!PW:TODO:initialize Z1,Z2,PRWCT only if necessary (or part of it)
-Z1(:,:,:) = 0.
-Z2(:,:,:) = 0.
+!$acc kernels
+! Z1(:,:,:) = 0.
+! Z2(:,:,:) = 0.
 !
 IF (KADV_ORDER == 2 ) THEN
 #ifdef _OPENACC
@@ -691,26 +690,36 @@ PRINT *,'OPENACC: contrav::KADV_ORDER=2 and LFLAT=.TRUE. not yet tested'
 CALL ABORT
 #endif
 !
-!PW:TODO: not parallelized with openacc
-  Z1(IIB:IIE,:,IKB:IKE+1)=                                             &
-      (PRUCT(IIB:IIE,:,IKB:IKE+1)+PRUCT(IIB:IIE,:,IKB-1:IKE) )         &
-       *PDZX(IIB:IIE,:,IKB:IKE+1) *0.25                                &
-     +(PRUCT(IIB+1:IIE+1,:,IKB:IKE+1)+PRUCT(IIB+1:IIE+1,:,IKB-1:IKE) ) &
-       *PDZX(IIB+1:IIE+1,:,IKB:IKE+1) *0.25   
-                        
-!PW:TODO: not parallelized with openacc
-  Z2(:,IJB:IJE,IKB:IKE+1)=                                             &
-      (PRVCT(:,IJB:IJE,IKB:IKE+1)+PRVCT(:,IJB:IJE,IKB-1:IKE) )         &
-       *PDZY(:,IJB:IJE,IKB:IKE+1) *0.25                                &
-     +(PRVCT(:,IJB+1:IJE+1,IKB:IKE+1)+PRVCT(:,IJB+1:IJE+1,IKB-1:IKE) ) &
-       *PDZY(:,IJB+1:IJE+1,IKB:IKE+1) *0.25   
+!$acc loop independent collapse(3)
+  do jk = ikb, ike + 1
+    do jj = 1, iju
+      do ji = iib, iie
+        Z1(ji, jj, jk ) =   ( PRUCT(ji,     jj, jk ) + PRUCT(ji,     jj, jk - 1 ) ) * PDZX (ji,     jj, jk ) * 0.25 &
+                          + ( PRUCT(ji + 1, jj, jk ) + PRUCT(ji + 1, jj, jk - 1 ) ) * PDZX (ji + 1, jj, jk ) * 0.25
+      end do
+    end do
+  end do
+
+!$acc loop independent collapse(3)
+  do jk = ikb, ike + 1
+    do jj = ijb, ije
+      do ji = 1, iiu
+        Z2(:, jj, jk ) =   ( PRVCT(:, jj,     jk) + PRVCT( :, jj,    jk - 1) ) * PDZY(:, jj,     jk) * 0.25 &
+                         + ( PRVCT(:, jj + 1, jk) + PRVCT( :, jj + 1,jk - 1) ) * PDZY(:, jj + 1, jk) * 0.25
+      end do
+    end do
+  end do
+
   PRWCT(:,:,:)=0.
-!PW:TODO: not parallelized with openacc
-  PRWCT(IIB:IIE,IJB:IJE,IKB:IKE+1) =                 &
-      (   PRWT(IIB:IIE,IJB:IJE,IKB:IKE+1)            &
-        - Z1(IIB:IIE,IJB:IJE,IKB:IKE+1)              &
-        - Z2(IIB:IIE,IJB:IJE,IKB:IKE+1)              &
-      ) / PDZZ(IIB:IIE,IJB:IJE,IKB:IKE+1)  
+
+!$acc loop independent collapse(3)
+  do jk = ikb, ike + 1
+    do jj = ijb, ije
+      do ji = iib, iie
+        PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk )
+      end do
+    end do
+  end do
 !
 ELSE IF (KADV_ORDER == 4 ) THEN
 !
@@ -755,109 +764,90 @@ ELSE IF (KADV_ORDER == 4 ) THEN
 !!$   END IF
    !
    !
-   !*       3.1    interior of the processor subdomain
-!
-!
-   Z1(IW:IE,:,IKB:IKE+1)=                                               &
-       7.0*( (PRUCT(IW:IE,:,IKB:IKE+1)+PRUCT(IW:IE,:,IKB-1:IKE)) &
-            *( 9.0*PDZX(IW:IE,:,IKB:IKE+1)-(PDZX(IW+1:IE+1,:,IKB:IKE+1) &
-                  +PDZX(IW:IE,:,IKB:IKE+1)+PDZX(IW-1:IE-1,:,IKB:IKE+1))/3.0)/8.0 * 0.5 &
-           +(PRUCT(IW+1:IE+1,:,IKB:IKE+1)+PRUCT(IW+1:IE+1,:,IKB-1:IKE)) &
-            *( 9.0*PDZX(IW+1:IE+1,:,IKB:IKE+1)-(PDZX(IW+2:IE+2,:,IKB:IKE+1) &
-                  +PDZX(IW+1:IE+1,:,IKB:IKE+1)+PDZX(IW:IE,:,IKB:IKE+1))/3.0)/8.0 * 0.5 )/12.0 &
-          -( (PRUCT(IW-1:IE-1,:,IKB:IKE+1)+PRUCT(IW-1:IE-1,:,IKB-1:IKE)) &
-            *PDZX(IW-1:IE-1,:,IKB:IKE+1) *0.5 &
-            +(PRUCT(IW+2:IE+2,:,IKB:IKE+1)+PRUCT(IW+2:IE+2,:,IKB-1:IKE)) &
-            *PDZX(IW+2:IE+2,:,IKB:IKE+1) *0.5)/12.0
-
-!
-   Z2(:,IS:IN,IKB:IKE+1)=                                               &
-       7.0*( (PRVCT(:,IS:IN,IKB:IKE+1)+PRVCT(:,IS:IN,IKB-1:IKE)) &
-            *( 9.0*PDZY(:,IS:IN,IKB:IKE+1)-(PDZY(:,IS+1:IN+1,IKB:IKE+1) &
-                  +PDZY(:,IS:IN,IKB:IKE+1)+PDZY(:,IS-1:IN-1,IKB:IKE+1))/3.0)/8.0 * 0.5 &
-           +(PRVCT(:,IS+1:IN+1,IKB:IKE+1)+PRVCT(:,IS+1:IN+1,IKB-1:IKE)) &
-            *( 9.0*PDZY(:,IS+1:IN+1,IKB:IKE+1)-(PDZY(:,IS+2:IN+2,IKB:IKE+1) &
-                  +PDZY(:,IS+1:IN+1,IKB:IKE+1)+PDZY(:,IS:IN,IKB:IKE+1))/3.0)/8.0 * 0.5 )/12.0 &
-          -( (PRVCT(:,IS-1:IN-1,IKB:IKE+1)+PRVCT(:,IS-1:IN-1,IKB-1:IKE)) &
-            *PDZY(:,IS-1:IN-1,IKB:IKE+1) *0.5 &
-            +(PRVCT(:,IS+2:IN+2,IKB:IKE+1)+PRVCT(:,IS+2:IN+2,IKB-1:IKE)) &
-            *PDZY(:,IS+2:IN+2,IKB:IKE+1) *0.5)/12.0
-!
-!*       3.2    limits of the processor subdomain (inside the whole domain or in cyclic conditions)
+   !*       3.1    interior of the process subdomain
+!
+!
+!PW: OpenACC remarks: *computing only ztmp2 and reusing it at next iteration works
+!                      but ji loop can not be collapsed -> 10x slower on GPU
+!                     *ztmp1 and ztmp2 are not necessary but improve readability (no impact on performance)
+!$acc loop independent collapse(3) private(ztmp1, ztmp2)
+  do jk = IKB, IKE + 1
+    do jj = 1, iju
+      do ji = IW, IE
+        ztmp1 = ( 9.0 * PDZX(ji,   jj, jk ) - ( PDZX(ji+1, jj, jk ) + PDZX(ji,   jj, jk ) + PDZX(ji-1, jj, jk ) ) / 3.0 ) / 16.0
+        ztmp2 = ( 9.0 * PDZX(ji+1, jj, jk ) - ( PDZX(ji+2, jj, jk ) + PDZX(ji+1, jj, jk ) + PDZX(ji,   jj, jk ) ) / 3.0 ) / 16.0
+        Z1(ji, jj, jk ) =  7.0 * (  ( PRUCT(ji,   jj, jk ) + PRUCT(ji,   jj, jk-1 ) ) * ztmp1                        &
+                                  + ( PRUCT(ji+1, jj, jk ) + PRUCT(ji+1, jj, jk-1 ) ) * ztmp2               ) / 12.0 &
+                         - 0.5 * (  ( PRUCT(ji-1, jj, jk ) + PRUCT(ji-1, jj, jk-1 ) ) * PDZX(ji-1, jj, jk)           &
+                                  + ( PRUCT(ji+2, jj, jk ) + PRUCT(ji+2, jj, jk-1 ) ) * PDZX(ji+2, jj, jk)  ) / 12.0
+      end do
+    end do
+  end do
+!
+!$acc loop independent collapse(3)
+  do jk = IKB, IKE + 1
+    do jj = is, in
+      do ji = 1, iiu
+        ztmp1 = ( 9.0 * PDZY(ji, jj,   jk ) - ( PDZY(ji, jj+1, jk ) + PDZY(ji, jj,   jk ) + PDZY(ji, jj-1, jk ) ) / 3.0 ) / 16.0
+        ztmp2 = ( 9.0 * PDZY(ji, jj+1, jk ) - ( PDZY(ji, jj+2, jk ) + PDZY(ji, jj+1, jk ) + PDZY(ji, jj,   jk ) ) / 3.0 ) / 16.0
+        Z2(ji, jj, jk ) =  7.0 * (  ( PRVCT(ji, jj,   jk ) + PRVCT(ji, jj,   jk-1 ) ) * ztmp1                         &
+                                  + ( PRVCT(ji, jj+1, jk ) + PRVCT(ji, jj+1, jk-1 ) ) * ztmp2                ) / 12.0 &
+                         - 0.5 * (  ( PRVCT(ji, jj-1, jk ) + PRVCT(ji, jj-1, jk-1 ) ) * PDZY(ji, jj-1, jk )           &
+                                  + ( PRVCT(ji, jj+2, jk ) + PRVCT(ji, jj+2, jk-1 ) ) * PDZY(ji, jj+2, jk )  ) / 12.0
+      end do
+    end do
+  end do
+!
+!*       3.2    limits of the process subdomain (inside the whole domain or in cyclic conditions)
 !
 !!$  IF (NHALO==1) THEN
-
-   Z1(IIE,:,IKB:IKE+1)=                                               &
-       7.0*( (PRUCT(IIE,:,IKB:IKE+1)+PRUCT(IIE,:,IKB-1:IKE)) &
-            *( 9.0*PDZX(IIE,:,IKB:IKE+1)-(PDZX(IIE+1,:,IKB:IKE+1) &
-                  +PDZX(IIE,:,IKB:IKE+1)+PDZX(IIE-1,:,IKB:IKE+1))/3.0)/8.0 * 0.5 &
-           +(PRUCT(IIE+1,:,IKB:IKE+1)+PRUCT(IIE+1,:,IKB-1:IKE)) &
-!            *( 9.0*PDZX(IIE+1,:,IKB:IKE+1)-(TZHALO2_DZX%HALO2%EAST(:,IKB:IKE+1) &
-            *( 9.0*PDZX(IIE+1,:,IKB:IKE+1)-(ZDZX_EAST(:,IKB:IKE+1) &
-                  +PDZX(IIE+1,:,IKB:IKE+1)+PDZX(IIE,:,IKB:IKE+1))/3.0)/8.0 * 0.5 )/12.0 &
-          -( (PRUCT(IIE-1,:,IKB:IKE+1)+PRUCT(IIE-1,:,IKB-1:IKE)) &
-            *PDZX(IIE-1,:,IKB:IKE+1) *0.5 &
-!            +(TZHALO2_U%HALO2%EAST(:,IKB:IKE+1)+TZHALO2_U%HALO2%EAST(:,IKB-1:IKE)) &
-            +(ZU_EAST(:,IKB:IKE+1)+ZU_EAST(:,IKB-1:IKE)) &
-!            *TZHALO2_DZX%HALO2%EAST(:,IKB:IKE+1) *0.5)/12.0
-            *ZDZX_EAST(:,IKB:IKE+1) *0.5)/12.0
-!
-   Z2(:,IJE,IKB:IKE+1)=                                               &
-       7.0*( (PRVCT(:,IJE,IKB:IKE+1)+PRVCT(:,IJE,IKB-1:IKE)) &
-            *( 9.0*PDZY(:,IJE,IKB:IKE+1)-(PDZY(:,IJE+1,IKB:IKE+1) &
-                  +PDZY(:,IJE,IKB:IKE+1)+PDZY(:,IJE-1,IKB:IKE+1))/3.0)/8.0 * 0.5 &
-           +(PRVCT(:,IJE+1,IKB:IKE+1)+PRVCT(:,IJE+1,IKB-1:IKE)) &
-!            *( 9.0*PDZY(:,IJE+1,IKB:IKE+1)-(TZHALO2_DZY%HALO2%NORTH(:,IKB:IKE+1) &
-            *( 9.0*PDZY(:,IJE+1,IKB:IKE+1)-(ZDZY_NORTH(:,IKB:IKE+1) &
-                  +PDZY(:,IJE+1,IKB:IKE+1)+PDZY(:,IJE,IKB:IKE+1))/3.0)/8.0 * 0.5 )/12.0 &
-          -( (PRVCT(:,IJE-1,IKB:IKE+1)+PRVCT(:,IJE-1,IKB-1:IKE)) &
-            *PDZY(:,IJE-1,IKB:IKE+1) *0.5 &
-!            +(TZHALO2_V%HALO2%NORTH(:,IKB:IKE+1)+TZHALO2_V%HALO2%NORTH(:,IKB-1:IKE)) &
-            +(ZV_NORTH(:,IKB:IKE+1)+ZV_NORTH(:,IKB-1:IKE)) &
-!            *TZHALO2_DZY%HALO2%NORTH(:,IKB:IKE+1) *0.5)/12.0
-            *ZDZY_NORTH(:,IKB:IKE+1) *0.5)/12.0
+!$acc loop independent collapse(2)
+  do jk = IKB, IKE + 1
+    do jj = 1, iju
+      ztmp1 = ( 9.0 * PDZX(IIE,   jj, jk ) - ( PDZX(IIE+1, jj, jk ) + PDZX(IIE,   jj, jk ) + PDZX(IIE-1, jj, jk ) ) / 3.0 ) / 16.0
+      ztmp2 = ( 9.0 * PDZX(IIE+1, jj, jk ) - ( ZDZX_EAST(jj, jk )   + PDZX(IIE+1, jj, jk ) + PDZX(IIE,   jj, jk ) ) / 3.0 ) / 16.0
+      Z1(IIE, jj, jk ) =  7.0 * (  ( PRUCT(IIE,   jj, jk ) + PRUCT(IIE,   jj, jk-1 ) ) * ztmp1                        &
+                                 + ( PRUCT(IIE+1, jj, jk ) + PRUCT(IIE+1, jj, jk-1 ) ) * ztmp2               ) / 12.0 &
+                        - 0.5 * (  ( PRUCT(IIE-1, jj, jk ) + PRUCT(IIE-1, jj, jk-1 ) ) * PDZX(IIE-1, jj, jk)          &
+                                 + ( ZU_EAST     (jj, jk ) + ZU_EAST     (jj, jk-1 ) ) * ZDZX_EAST  (jj, jk)  ) / 12.0
+    end do
+  end do
+!
+!$acc loop independent collapse(2)
+  do jk = IKB, IKE + 1
+    do ji = 1, iiu
+      ztmp1 = ( 9.0 * PDZY(ji, IJE,   jk) - ( PDZY      (ji, IJE+1, jk) + PDZY(ji, IJE,   jk) + PDZY(ji, IJE-1, jk) ) / 3.0 ) / 16.0
+      ztmp2 = ( 9.0 * PDZY(ji, IJE+1, jk) - ( ZDZY_NORTH(ji,        jk) + PDZY(ji, IJE+1, jk) + PDZY(ji, IJE,   jk) ) / 3.0 ) / 16.0
+      Z2(ji, IJE, jk ) =  7.0 * (  ( PRVCT   (ji, IJE,   jk ) + PRVCT   (ji, IJE,   jk-1 ) ) * ztmp1                               &
+                                 + ( PRVCT   (ji, IJE+1, jk ) + PRVCT   (ji, IJE+1, jk-1 ) ) * ztmp2                      ) / 12.0 &
+                        - 0.5 * (  ( PRVCT   (ji, IJE-1, jk ) + PRVCT   (ji, IJE-1, jk-1 ) ) * PDZY      (ji, IJE-1, jk )          &
+                                 + ( ZV_NORTH(ji,        jk ) + ZV_NORTH(ji,        jk-1 ) ) * ZDZY_NORTH(ji,        jk ) ) / 12.0
+    end do
+  end do
 !!$  END IF
 !
 !*       3.3    non-CYCLIC CASE IN THE X DIRECTION: 2nd order case
 !
   IF (HLBCX(1)/='CYCL' .AND. LWEST_ll()) THEN
-!
-   Z1(IIB,:,IKB:IKE+1)=                                     &
-       (PRUCT(IIB,:,IKB:IKE+1)+PRUCT(IIB,:,IKB-1:IKE) )     &
-        *PDZX(IIB,:,IKB:IKE+1) *0.25                        &
-      +(PRUCT(IIB+1,:,IKB:IKE+1)+PRUCT(IIB+1,:,IKB-1:IKE) ) &
-        *PDZX(IIB+1,:,IKB:IKE+1) *0.25   
+    Z1(IIB, :, IKB:IKE+1 ) =  ( PRUCT(IIB,   :, IKB:IKE+1 ) + PRUCT(IIB,   :, IKB-1:IKE ) ) * PDZX(IIB,   :, IKB:IKE+1 ) * 0.25 &
+                            + ( PRUCT(IIB+1, :, IKB:IKE+1 ) + PRUCT(IIB+1, :, IKB-1:IKE ) ) * PDZX(IIB+1, :, IKB:IKE+1 ) * 0.25
   END IF
 !
   IF (HLBCX(2)/='CYCL' .AND. LEAST_ll()) THEN
-!
-   Z1(IIE,:,IKB:IKE+1)=                                     &
-       (PRUCT(IIE,:,IKB:IKE+1)+PRUCT(IIE,:,IKB-1:IKE) )     &
-        *PDZX(IIE,:,IKB:IKE+1) *0.25                        &
-      +(PRUCT(IIE+1,:,IKB:IKE+1)+PRUCT(IIE+1,:,IKB-1:IKE) ) &
-        *PDZX(IIE+1,:,IKB:IKE+1) *0.25   
+    Z1(IIE, :, IKB:IKE+1 ) =  ( PRUCT(IIE,   :, IKB:IKE+1 ) + PRUCT(IIE,   :, IKB-1:IKE ) ) * PDZX(IIE,   :, IKB:IKE+1 ) * 0.25 &
+                            + ( PRUCT(IIE+1, :, IKB:IKE+1 ) + PRUCT(IIE+1, :, IKB-1:IKE ) ) * PDZX(IIE+1, :, IKB:IKE+1 ) * 0.25
   END IF
 !
 !*       3.4    non-CYCLIC CASE IN THE Y DIRECTION: 2nd order case
 !
   IF (HLBCY(1)/='CYCL' .AND. LSOUTH_ll()) THEN
-!
-   Z2(:,IJB,IKB:IKE+1)=                                     &
-       (PRVCT(:,IJB,IKB:IKE+1)+PRVCT(:,IJB,IKB-1:IKE) )     &
-        *PDZY(:,IJB,IKB:IKE+1) *0.25                        &
-      +(PRVCT(:,IJB+1,IKB:IKE+1)+PRVCT(:,IJB+1,IKB-1:IKE) ) &
-        *PDZY(:,IJB+1,IKB:IKE+1) *0.25   
-!
+    Z2(:, IJB, IKB:IKE+1 ) =  ( PRVCT(:, IJB,   IKB:IKE+1 ) + PRVCT(:, IJB,   IKB-1:IKE ) ) * PDZY(:, IJB,   IKB:IKE+1 ) * 0.25 &
+                            + ( PRVCT(:, IJB+1, IKB:IKE+1 ) + PRVCT(:, IJB+1, IKB-1:IKE ) ) * PDZY(:, IJB+1, IKB:IKE+1 ) * 0.25
   END IF
 !
   IF (HLBCY(2)/='CYCL' .AND. LNORTH_ll()) THEN
-!
-   Z2(:,IJE,IKB:IKE+1)=                                     &
-       (PRVCT(:,IJE,IKB:IKE+1)+PRVCT(:,IJE,IKB-1:IKE) )     &
-        *PDZY(:,IJE,IKB:IKE+1) *0.25                        &
-      +(PRVCT(:,IJE+1,IKB:IKE+1)+PRVCT(:,IJE+1,IKB-1:IKE) ) &
-        *PDZY(:,IJE+1,IKB:IKE+1) *0.25   
-!
+    Z2(:, IJE, IKB:IKE+1 ) =  ( PRVCT(:, IJE,   IKB:IKE+1 ) + PRVCT(:, IJE,   IKB-1:IKE ) ) * PDZY(:, IJE,   IKB:IKE+1 ) * 0.25 &
+                            + ( PRVCT(:, IJE+1, IKB:IKE+1 ) + PRVCT(:, IJE+1, IKB-1:IKE ) ) * PDZY(:, IJE+1, IKB:IKE+1 ) * 0.25
   END IF
 !
 !*       3.5    Vertical contyravariant wind
@@ -868,12 +858,14 @@ ELSE IF (KADV_ORDER == 4 ) THEN
 !!$
 !!$  CALL MPPDB_CHECK3DM("contrav_device ::Z1/Z2/ PDZZ",PRECISION,Z1,Z2,PDZZ)
   PRWCT(:,:,:)=0.
-!PW:TODO: not parallelized with openacc
-  PRWCT(IIB:IIE,IJB:IJE,IKB:IKE+1) =                 &
-     (   PRWT(IIB:IIE,IJB:IJE,IKB:IKE+1)            &
-       - Z1(IIB:IIE,IJB:IJE,IKB:IKE+1)              &
-       - Z2(IIB:IIE,IJB:IJE,IKB:IKE+1)              &
-     ) / PDZZ(IIB:IIE,IJB:IJE,IKB:IKE+1)  
+!$acc loop independent collapse(3)
+  do jk = ikb, ike + 1
+    do jj = ijb, ije
+      do ji = iib, iie
+        PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk )
+      end do
+    end do
+  end do
 !
 !
 END IF
-- 
GitLab