From a3a67ba11f605ce2347cc41dec92849f77eb66a0 Mon Sep 17 00:00:00 2001
From: Juan Escobar <juan.escobar@aero.obs-mip.fr>
Date: Thu, 9 Sep 2021 17:38:08 +0200
Subject: [PATCH] Juan 09/09/2021:contrav.f90, add do concurrent + async loop
 for boundary

---
 src/ZSOLVER/contrav.f90 | 108 +++++++++++++++++++---------------------
 1 file changed, 51 insertions(+), 57 deletions(-)

diff --git a/src/ZSOLVER/contrav.f90 b/src/ZSOLVER/contrav.f90
index 4032bffe0..ebef79590 100644
--- a/src/ZSOLVER/contrav.f90
+++ b/src/ZSOLVER/contrav.f90
@@ -568,7 +568,9 @@ real                                :: ZTMP1, ZTMP2 ! Intermediate work variable
 REAL,   DIMENSION(:,:), POINTER , CONTIGUOUS :: ZU_EAST, ZV_NORTH, ZDZX_EAST, ZDZY_NORTH
 TYPE(LIST_ll),          POINTER     :: TZFIELD_U, TZFIELD_V, TZFIELD_DZX, TZFIELD_DZY
 TYPE(HALO2LIST_ll),     POINTER     :: TZHALO2_U, TZHALO2_V, TZHALO2_DZX, TZHALO2_DZY
-
+!
+LOGICAL :: GWEST,GEAST,GSOUTH,GNORTH
+!
 !$acc data present( PRUT, PRVT, PRWT, PDXX, PDYY, PDZZ, PDZX, PDZY, PRUCT, PRVCT, PRWCT, Z1, Z2 )
 
 IF ( PRESENT(ODATA_ON_DEVICE) ) THEN
@@ -597,6 +599,11 @@ IIU= SIZE(PDXX,1)
 IJU= SIZE(PDXX,2)
 IKU= SIZE(PDXX,3)
 !
+GWEST  = ( HLBCX(1) /= 'CYCL' .AND. LWEST_ll() )
+GEAST  = ( HLBCX(2) /= 'CYCL' .AND. LEAST_ll() )
+GSOUTH = ( HLBCY(1) /= 'CYCL' .AND. LSOUTH_ll() )
+GNORTH = ( HLBCY(2) /= 'CYCL' .AND. LNORTH_ll() )
+!
 CALL GET_INDICE_ll( IIB,IJB,IIE,IJE)
 !
 IKB=1+JPVEXT
@@ -679,50 +686,38 @@ ELSE
 !*       3.    Compute the vertical contravariant components (general case)
 !              ------------------------------------
 !
-!$acc kernels
 ! Z1(:,:,:) = 0.
-! Z2(:,:,:) = 0.
+! Z2(:,:,:) = 0.   
 !
 IF (KADV_ORDER == 2 ) THEN
 #ifdef MNH_OPENACC
   call Print_msg( NVERB_WARNING, 'GEN', 'CONTRAV', 'OpenACC: KADV_ORDER=2 and LFLAT=.TRUE. not yet tested' )
 #endif
+!$acc kernels  
 !
 !$acc loop independent collapse(3)
-  do jk = ikb, ike + 1
-    do jj = 1, iju
-      do ji = iib, iie
-        Z1(ji, jj, jk ) =   ( PRUCT(ji,     jj, jk ) + PRUCT(ji,     jj, jk - 1 ) ) * PDZX (ji,     jj, jk ) * 0.25 &
-                          + ( PRUCT(ji + 1, jj, jk ) + PRUCT(ji + 1, jj, jk - 1 ) ) * PDZX (ji + 1, jj, jk ) * 0.25
-      end do
-    end do
+  do concurrent (ji=iib:iie,jj=1:iju,jk=ikb:ike+1)
+     Z1(ji, jj, jk ) =   ( PRUCT(ji,     jj, jk ) + PRUCT(ji,     jj, jk - 1 ) ) * PDZX (ji,     jj, jk ) * 0.25 &
+                       + ( PRUCT(ji + 1, jj, jk ) + PRUCT(ji + 1, jj, jk - 1 ) ) * PDZX (ji + 1, jj, jk ) * 0.25
   end do
-
 !$acc loop independent collapse(3)
-  do jk = ikb, ike + 1
-    do jj = ijb, ije
-      do ji = 1, iiu
-        Z2(:, jj, jk ) =   ( PRVCT(:, jj,     jk) + PRVCT( :, jj,    jk - 1) ) * PDZY(:, jj,     jk) * 0.25 &
-                         + ( PRVCT(:, jj + 1, jk) + PRVCT( :, jj + 1,jk - 1) ) * PDZY(:, jj + 1, jk) * 0.25
-      end do
-    end do
+  do concurrent (ji=1:iiu,jj=ijb:ije,jk=ikb:ike+1)
+     Z2(ji, jj, jk ) =   ( PRVCT(ji, jj,     jk) + PRVCT( ji, jj,    jk - 1) ) * PDZY(ji, jj,     jk) * 0.25 &
+                       + ( PRVCT(ji, jj + 1, jk) + PRVCT( ji, jj + 1,jk - 1) ) * PDZY(ji, jj + 1, jk) * 0.25
   end do
 
   PRWCT(:,:,:)=0.
 
 !$acc loop independent collapse(3)
-  do jk = ikb, ike + 1
-    do jj = ijb, ije
-      do ji = iib, iie
-        PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk )
-      end do
-    end do
+  do concurrent (ji=iib:iie,jj=ijb:ije,jk=ikb:ike+1)
+     PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk )
   end do
 !
+!$acc end kernels  
 ELSE IF (KADV_ORDER == 4 ) THEN
 !
 !!$   IF (NHALO == 1) THEN
-      IF ( LWEST_ll() .AND. HLBCX(1)/='CYCL' ) THEN
+      IF ( GWEST ) THEN
          IW=IIB+2 -1
       ELSE
          IW=IIB+1 -1
@@ -742,7 +737,7 @@ ELSE IF (KADV_ORDER == 4 ) THEN
 !!$   END IF
    !
 !!$   IF(NHALO == 1) THEN
-      IF ( LSOUTH_ll() .AND. HLBCY(1)/='CYCL' ) THEN
+      IF ( GSOUTH ) THEN
          IS=IJB+2 -1
       ELSE
          IS=IJB+1 -1
@@ -763,45 +758,38 @@ ELSE IF (KADV_ORDER == 4 ) THEN
    !
    !
    !*       3.1    interior of the process subdomain
+!$acc kernels      
 !
 !
 !PW: OpenACC remarks: *computing only ztmp2 and reusing it at next iteration works
 !                      but ji loop can not be collapsed -> 10x slower on GPU
 !                     *ztmp1 and ztmp2 are not necessary but improve readability (no impact on performance)
 !$acc loop independent collapse(3) private(ztmp1, ztmp2)
-  do jk = IKB, IKE + 1
-    do jj = 1, iju
-      do ji = IW, IE
+  do concurrent(ji=IW:IE,jj=1:iju,jk=IKB:IKE+1)
         ztmp1 = ( 9.0 * PDZX(ji,   jj, jk ) - ( PDZX(ji+1, jj, jk ) + PDZX(ji,   jj, jk ) + PDZX(ji-1, jj, jk ) ) / 3.0 ) / 16.0
         ztmp2 = ( 9.0 * PDZX(ji+1, jj, jk ) - ( PDZX(ji+2, jj, jk ) + PDZX(ji+1, jj, jk ) + PDZX(ji,   jj, jk ) ) / 3.0 ) / 16.0
         Z1(ji, jj, jk ) =  7.0 * (  ( PRUCT(ji,   jj, jk ) + PRUCT(ji,   jj, jk-1 ) ) * ztmp1                        &
                                   + ( PRUCT(ji+1, jj, jk ) + PRUCT(ji+1, jj, jk-1 ) ) * ztmp2               ) / 12.0 &
                          - 0.5 * (  ( PRUCT(ji-1, jj, jk ) + PRUCT(ji-1, jj, jk-1 ) ) * PDZX(ji-1, jj, jk)           &
                                   + ( PRUCT(ji+2, jj, jk ) + PRUCT(ji+2, jj, jk-1 ) ) * PDZX(ji+2, jj, jk)  ) / 12.0
-      end do
-    end do
   end do
 !
 !$acc loop independent collapse(3)
-  do jk = IKB, IKE + 1
-    do jj = is, in
-      do ji = 1, iiu
+  do concurrent(ji=1:iiu,jj=is:in,jk=IKB:IKE+1)
         ztmp1 = ( 9.0 * PDZY(ji, jj,   jk ) - ( PDZY(ji, jj+1, jk ) + PDZY(ji, jj,   jk ) + PDZY(ji, jj-1, jk ) ) / 3.0 ) / 16.0
         ztmp2 = ( 9.0 * PDZY(ji, jj+1, jk ) - ( PDZY(ji, jj+2, jk ) + PDZY(ji, jj+1, jk ) + PDZY(ji, jj,   jk ) ) / 3.0 ) / 16.0
         Z2(ji, jj, jk ) =  7.0 * (  ( PRVCT(ji, jj,   jk ) + PRVCT(ji, jj,   jk-1 ) ) * ztmp1                         &
                                   + ( PRVCT(ji, jj+1, jk ) + PRVCT(ji, jj+1, jk-1 ) ) * ztmp2                ) / 12.0 &
                          - 0.5 * (  ( PRVCT(ji, jj-1, jk ) + PRVCT(ji, jj-1, jk-1 ) ) * PDZY(ji, jj-1, jk )           &
                                   + ( PRVCT(ji, jj+2, jk ) + PRVCT(ji, jj+2, jk-1 ) ) * PDZY(ji, jj+2, jk )  ) / 12.0
-      end do
-    end do
   end do
+!$acc end kernels     
 !
 !*       3.2    limits of the process subdomain (inside the whole domain or in cyclic conditions)
 !
 !!$  IF (NHALO==1) THEN
-!$acc loop independent collapse(2)
-  do jk = IKB, IKE + 1
-    do jj = 1, iju
+!$acc parallel loop independent collapse(2) async
+    do concurrent(jj=1:iju,jk=IKB:IKE+1)
       ztmp1 = ( 9.0 * PDZX(IIE,   jj, jk ) - ( PDZX(IIE+1, jj, jk ) + PDZX(IIE,   jj, jk ) + PDZX(IIE-1, jj, jk ) ) / 3.0 ) / 16.0
       ztmp2 = ( 9.0 * PDZX(IIE+1, jj, jk ) - ( ZDZX_EAST(jj, jk )   + PDZX(IIE+1, jj, jk ) + PDZX(IIE,   jj, jk ) ) / 3.0 ) / 16.0
       Z1(IIE, jj, jk ) =  7.0 * (  ( PRUCT(IIE,   jj, jk ) + PRUCT(IIE,   jj, jk-1 ) ) * ztmp1                        &
@@ -809,65 +797,71 @@ ELSE IF (KADV_ORDER == 4 ) THEN
                         - 0.5 * (  ( PRUCT(IIE-1, jj, jk ) + PRUCT(IIE-1, jj, jk-1 ) ) * PDZX(IIE-1, jj, jk)          &
                                  + ( ZU_EAST     (jj, jk ) + ZU_EAST     (jj, jk-1 ) ) * ZDZX_EAST  (jj, jk)  ) / 12.0
     end do
-  end do
 !
-!$acc loop independent collapse(2)
-  do jk = IKB, IKE + 1
-    do ji = 1, iiu
+!$acc parallel loop independent collapse(2) async
+    do concurrent(ji=1:iiu,jk=IKB:IKE+1)
       ztmp1 = ( 9.0 * PDZY(ji, IJE,   jk) - ( PDZY      (ji, IJE+1, jk) + PDZY(ji, IJE,   jk) + PDZY(ji, IJE-1, jk) ) / 3.0 ) / 16.0
       ztmp2 = ( 9.0 * PDZY(ji, IJE+1, jk) - ( ZDZY_NORTH(ji,        jk) + PDZY(ji, IJE+1, jk) + PDZY(ji, IJE,   jk) ) / 3.0 ) / 16.0
       Z2(ji, IJE, jk ) =  7.0 * (  ( PRVCT   (ji, IJE,   jk ) + PRVCT   (ji, IJE,   jk-1 ) ) * ztmp1                               &
                                  + ( PRVCT   (ji, IJE+1, jk ) + PRVCT   (ji, IJE+1, jk-1 ) ) * ztmp2                      ) / 12.0 &
                         - 0.5 * (  ( PRVCT   (ji, IJE-1, jk ) + PRVCT   (ji, IJE-1, jk-1 ) ) * PDZY      (ji, IJE-1, jk )          &
                                  + ( ZV_NORTH(ji,        jk ) + ZV_NORTH(ji,        jk-1 ) ) * ZDZY_NORTH(ji,        jk ) ) / 12.0
-    end do
-  end do
+   end do
+!$acc wait
 !!$  END IF
 !
 !*       3.3    non-CYCLIC CASE IN THE X DIRECTION: 2nd order case
 !
-  IF (HLBCX(1)/='CYCL' .AND. LWEST_ll()) THEN
+   IF ( GWEST ) THEN
+    !$acc kernels async  
     Z1(IIB, :, IKB:IKE+1 ) =  ( PRUCT(IIB,   :, IKB:IKE+1 ) + PRUCT(IIB,   :, IKB-1:IKE ) ) * PDZX(IIB,   :, IKB:IKE+1 ) * 0.25 &
                             + ( PRUCT(IIB+1, :, IKB:IKE+1 ) + PRUCT(IIB+1, :, IKB-1:IKE ) ) * PDZX(IIB+1, :, IKB:IKE+1 ) * 0.25
+    !$acc end kernels
   END IF
 !
-  IF (HLBCX(2)/='CYCL' .AND. LEAST_ll()) THEN
+  IF ( GEAST ) THEN
+    !$acc kernels async 
     Z1(IIE, :, IKB:IKE+1 ) =  ( PRUCT(IIE,   :, IKB:IKE+1 ) + PRUCT(IIE,   :, IKB-1:IKE ) ) * PDZX(IIE,   :, IKB:IKE+1 ) * 0.25 &
                             + ( PRUCT(IIE+1, :, IKB:IKE+1 ) + PRUCT(IIE+1, :, IKB-1:IKE ) ) * PDZX(IIE+1, :, IKB:IKE+1 ) * 0.25
-  END IF
+    !$acc end kernels
+ END IF
 !
 !*       3.4    non-CYCLIC CASE IN THE Y DIRECTION: 2nd order case
 !
-  IF (HLBCY(1)/='CYCL' .AND. LSOUTH_ll()) THEN
+ IF ( GSOUTH ) THEN
+    !$acc kernels async
     Z2(:, IJB, IKB:IKE+1 ) =  ( PRVCT(:, IJB,   IKB:IKE+1 ) + PRVCT(:, IJB,   IKB-1:IKE ) ) * PDZY(:, IJB,   IKB:IKE+1 ) * 0.25 &
                             + ( PRVCT(:, IJB+1, IKB:IKE+1 ) + PRVCT(:, IJB+1, IKB-1:IKE ) ) * PDZY(:, IJB+1, IKB:IKE+1 ) * 0.25
-  END IF
+    !$acc end kernels
+ END IF
 !
-  IF (HLBCY(2)/='CYCL' .AND. LNORTH_ll()) THEN
+ IF ( GNORTH ) THEN
+     !$acc kernels async
     Z2(:, IJE, IKB:IKE+1 ) =  ( PRVCT(:, IJE,   IKB:IKE+1 ) + PRVCT(:, IJE,   IKB-1:IKE ) ) * PDZY(:, IJE,   IKB:IKE+1 ) * 0.25 &
                             + ( PRVCT(:, IJE+1, IKB:IKE+1 ) + PRVCT(:, IJE+1, IKB-1:IKE ) ) * PDZY(:, IJE+1, IKB:IKE+1 ) * 0.25
-  END IF
+    !$acc end kernels
+ END IF
+!$acc wait 
 !
 !*       3.5    Vertical contyravariant wind
 !
 !
+!$acc kernels  
 !!$  CALL GET_HALO(Z1)
 !!$  CALL GET_HALO(Z2)
 !!$
 !!$  CALL MPPDB_CHECK3DM("contrav_device ::Z1/Z2/ PDZZ",PRECISION,Z1,Z2,PDZZ)
   PRWCT(:,:,:)=0.
 !$acc loop independent collapse(3)
-  do jk = ikb, ike + 1
-    do jj = ijb, ije
-      do ji = iib, iie
+  do concurrent (ji=iib:iie,jj=ijb:ije,jk=ikb:ike+1)
         PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk )
-      end do
-    end do
   end do
+!$acc end kernels 
 !
 !
 END IF
 !
+!$acc kernels
 PRWCT(:,:,1) = - PRWCT(:,:,3)     ! Mirror hypothesis
 !$acc end kernels
 !$acc update self(PRWCT)
-- 
GitLab