From 522c87d2fedda8794056e79536cd2f395aa7211b Mon Sep 17 00:00:00 2001
From: Juan ESCOBAR <juan.escobar@aero.obs-mip.fr>
Date: Mon, 21 Mar 2022 18:14:06 +0100
Subject: [PATCH] Juan 21/03/2022: Cray GPU opt in ZSOLVER source , add "!dirs$
 concurrent"

---
 src/ZSOLVER/advection_metsv.f90 | 14 +++++-
 src/ZSOLVER/dotprod.f90         |  4 ++
 src/ZSOLVER/p_abs.f90           | 15 ++++--
 src/ZSOLVER/pressurez.f90       |  9 ++--
 src/ZSOLVER/tke_eps_sources.f90 |  1 +
 src/ZSOLVER/turb.f90            | 88 +++++++++++++++++++--------------
 6 files changed, 85 insertions(+), 46 deletions(-)

diff --git a/src/ZSOLVER/advection_metsv.f90 b/src/ZSOLVER/advection_metsv.f90
index e0e2c003d..dda0df699 100644
--- a/src/ZSOLVER/advection_metsv.f90
+++ b/src/ZSOLVER/advection_metsv.f90
@@ -489,7 +489,7 @@ IF(LBLOWSNOW) THEN    ! Put 2D Canopy blowing snow variables into a 3D array for
 #endif
   ZSNWC_INIT(:,:,:,:) = 0.
   ZRSNWCS(:,:,:,:) = 0.
-
+  !dir$ concurrent
   DO JSV=1,(NBLOWSNOW_2D)
      ZSNWC_INIT(:,:,IKB,JSV) = XSNWCANO(:,:,JSV)
      ZRSNWCS(:,:,IKB,JSV)    = XRSNWCANOS(:,:,JSV)
@@ -752,17 +752,22 @@ ZRWCPPM(:,:,:) = ZRWCPPM(:,:,:)*ZTSTEP_PPM
 !  Clouds    related processes from previous time-step are     taken into account in PRTHS_CLD
 !  Advection related processes from previous time-step will be taken into account in ZRTHS_PPM
 !
+!dir$ concurrent
 ZRTHS_OTHER(:,:,:) = PRTHS(:,:,:) - PTHT(:,:,:) * PRHODJ(:,:,:) / PTSTEP
+!dir$ concurrent
 IF (GTKE) ZRTKES_OTHER(:,:,:) = PRTKES(:,:,:) - PTKET(:,:,:) * PRHODJ(:,:,:) / PTSTEP
 DO JR = 1, KRR
+ !dir$ concurrent
  ZRRS_OTHER(:,:,:,JR) = PRRS(:,:,:,JR) - PRT(:,:,:,JR) * PRHODJ(:,:,:) / PTSTEP
 END DO
 DO JSV = 1, KSV
+ !dir$ concurrent   
  ZRSVS_OTHER(:,:,:,JSV) = PRSVS(:,:,:,JSV) - PSVT(:,:,:,JSV) * PRHODJ / PTSTEP
 END DO
 !$acc end kernels
 IF(LBLOWSNOW) THEN
    DO JSV = 1, (NBLOWSNOW_2D)
+     !dir$ concurrent 
      ZRSNWCS_OTHER(:,:,:,JSV) = ZRSNWCS(:,:,:,JSV) - ZSNWC_INIT(:,:,:,JSV) * PRHODJ / PTSTEP
    END DO   
 ENDIF
@@ -844,8 +849,11 @@ CALL PPM_RHODJ(HLBCX,HLBCY, ZRUCPPM, ZRVCPPM, ZRWCPPM,              &
 !
 !* values of the fields at the beginning of the time splitting loop
 !$acc kernels
+!dir$ concurrent
 ZTH(:,:,:)  = PTHT(:,:,:)
+!dir$ concurrent
 IF (KRR /=0 ) ZR(:,:,:,:)  = PRT(:,:,:,:)
+!dir$ concurrent
 IF (KSV /=0 ) ZSV(:,:,:,:) = PSVT(:,:,:,:)
 !
 IF (GTKE)  THEN
@@ -894,9 +902,13 @@ DO JSPL=1,KSPLIT
 !
 ! acc kernels
    !$acc kernels
+   !dir$ concurrent
    PRTHS(:,:,:)                      = PRTHS     (:,:,:)   + ZRTHS_PPM (:,:,:)   / KSPLIT
+   !dir$ concurrent
    IF (GTKE)     PRTKES_ADV(:,:,:)   = PRTKES_ADV(:,:,:)   + ZRTKES_PPM(:,:,:)   / KSPLIT
+   !dir$ concurrent
    IF (KRR /=0)  PRRS      (:,:,:,:) = PRRS      (:,:,:,:) + ZRRS_PPM  (:,:,:,:) / KSPLIT
+   !dir$ concurrent
    IF (KSV /=0 ) PRSVS     (:,:,:,:) = PRSVS     (:,:,:,:) + ZRSVS_PPM (:,:,:,:) / KSPLIT
    !$acc end kernels
 !
diff --git a/src/ZSOLVER/dotprod.f90 b/src/ZSOLVER/dotprod.f90
index fefef61ba..48aa61fa0 100644
--- a/src/ZSOLVER/dotprod.f90
+++ b/src/ZSOLVER/dotprod.f90
@@ -187,8 +187,12 @@ CALL MNH_MEM_GET(ZDOTPROD, ILBXB,ILBXE ,ILBYB,ILBYE )
 ZDOTPROD    = 0.
 !$acc loop seq
 DO JK = IKB-1,IKE+1
+#ifdef MNH_COMPILER_NVHPC   
    !$acc loop independent collapse(2) 
+#endif
+!dir$ doconcurrent
    DO JJ = ILBYB,ILBYE
+      !dir$ doconcurrent
       DO JI = ILBXB,ILBXE
          ZDOTPROD(JI,JJ) = ZDOTPROD(JI,JJ) + PA(JI,JJ,JK) * PB(JI,JJ,JK)
       END DO
diff --git a/src/ZSOLVER/p_abs.f90 b/src/ZSOLVER/p_abs.f90
index db94cb0fe..5fdb583cc 100644
--- a/src/ZSOLVER/p_abs.f90
+++ b/src/ZSOLVER/p_abs.f90
@@ -289,9 +289,13 @@ IF ( CEQNSYS=='DUR' .OR. CEQNSYS=='MAE' ) THEN
     !
     !$acc loop seq
     DO JK = IKB,IKE
-      !$acc loop independent collapse(2) 
+#ifdef MNH_COMPILER_NVHPC       
+      !$acc loop independent collapse(2)
+#endif
+      !dir$ concurrent ! collapse(JJ,JI) 
       DO JJ = IJB,IJE
-        DO JI = IIB,IIE
+         !dir$ concurrent
+         DO JI = IIB,IIE
            ZMASSGUESS_2D(JI,JJ)  = ZMASSGUESS_2D(JI,JJ) +                          &
 #ifndef MNH_BITREP
                 (PEXNREF(JI,JJ,JK)+PPHIT(JI,JJ,JK))**ZCVD_O_RD   &
@@ -346,8 +350,12 @@ IF ( CEQNSYS=='DUR' .OR. CEQNSYS=='MAE' ) THEN
      !$acc kernels
      !$acc loop seq 
      DO JK = IKB,IKE
-        !$acc loop independent collapse(2)  
+#ifdef MNH_COMPILER_NVHPC        
+        !$acc loop independent collapse(2)
+#endif
+        !dir$ concurrent ! collapse(JJ,JI)        
         DO JJ = IJB,IJE
+           !dir$ concurrent
            DO JI = IIB,IIE
               ZMASSGUESS_2D(JI,JJ)  = ZMASSGUESS_2D(JI,JJ) +                               &
                    (PEXNREF(JI,JJ,JK)+PPHIT(JI,JJ,JK))**ZCVD_O_RD          &
@@ -406,7 +414,6 @@ ELSEIF( CEQNSYS == 'LHE' ) THEN
   !   compute the mixing ratio of the total water (ZRRTOT)
     ZRV_O_RD = XRV / XRD
     ZRTOT(:,:,:) = PRT(:,:,:,1)
-    !$acc loop seq
     DO JWATER = 2 , 1+KRRL+KRRI                
       ZRTOT(:,:,:) = ZRTOT(:,:,:) + PRT(:,:,:,JWATER)
     END DO
diff --git a/src/ZSOLVER/pressurez.f90 b/src/ZSOLVER/pressurez.f90
index 4cd5cad08..56428f4d8 100644
--- a/src/ZSOLVER/pressurez.f90
+++ b/src/ZSOLVER/pressurez.f90
@@ -856,8 +856,10 @@ IF(CEQNSYS=='MAE' .OR. CEQNSYS=='DUR') THEN
   CALL MZM_DEVICE(ZPRHODJ, ZMZM_PRHODJ)
   CALL GZ_M_W_DEVICE(1,IKU,1,ZPHIT,PDZZ,ZGZ_M_W)
   !$acc kernels
-  PRUS = PRUS - ZMXM_PRHODJ * ZDV_SOURCE
-  PRWS = PRWS - ZMZM_PRHODJ * ZGZ_M_W
+  !dir$ concurrent
+  PRUS(:,:,:) = PRUS(:,:,:) - ZMXM_PRHODJ(:,:,:) * ZDV_SOURCE(:,:,:)
+  !dir$ concurrent
+  PRWS(:,:,:) = PRWS(:,:,:) - ZMZM_PRHODJ(:,:,:) * ZGZ_M_W(:,:,:)
   !$acc end kernels
 #endif
 ELSEIF(CEQNSYS=='LHE') THEN
@@ -923,7 +925,8 @@ END IF
 #else
      CALL MYM_DEVICE(ZPRHODJ,ZMYM_PRHODJ)
      !$acc kernels
-     PRVS = PRVS - ZMYM_PRHODJ * ZDV_SOURCE
+     !dir$ concurrent
+     PRVS(:,:,:) = PRVS(:,:,:) - ZMYM_PRHODJ(:,:,:) * ZDV_SOURCE(:,:,:)
      !$acc end kernels
 #endif
   ELSEIF(CEQNSYS=='LHE') THEN
diff --git a/src/ZSOLVER/tke_eps_sources.f90 b/src/ZSOLVER/tke_eps_sources.f90
index 15c82a0df..91f078a6d 100644
--- a/src/ZSOLVER/tke_eps_sources.f90
+++ b/src/ZSOLVER/tke_eps_sources.f90
@@ -553,6 +553,7 @@ if (lbudget_tke) then
 end if
 
 !$acc kernels
+!dir$ concurrent
 PRTKES(:,:,:) = ZRES(:,:,:) * PRHODJ(:,:,:) / PTSTEP -  PRTKESM(:,:,:)
 !$acc end kernels
 !
diff --git a/src/ZSOLVER/turb.f90 b/src/ZSOLVER/turb.f90
index c24ffa86a..9a7977551 100644
--- a/src/ZSOLVER/turb.f90
+++ b/src/ZSOLVER/turb.f90
@@ -862,6 +862,7 @@ IF ( KRRL >= 1 ) THEN
     PRRS(:,:,:,1) = PRRS(:,:,:,1) + PRRS(:,:,:,2)
     ! Theta_l at t
     PTHLT(:,:,:)  = PTHLT(:,:,:)  - ZLOCPEXNM(:,:,:) * PRT(:,:,:,2)
+    !dir$ concurrent
     PRTHLS(:,:,:) = PRTHLS(:,:,:) - ZLOCPEXNM(:,:,:) * PRRS(:,:,:,2)
   END IF
 !$acc end kernels
@@ -1458,6 +1459,7 @@ IF ( KRRL >= 1 ) THEN
     PRRS(:,:,:,1) = PRRS(:,:,:,1) - PRRS(:,:,:,2) - PRRS(:,:,:,4)
     PTHLT(:,:,:)  = PTHLT(:,:,:)  + ZLVOCPEXNM(:,:,:) * PRT(:,:,:,2) &
                                   + ZLSOCPEXNM(:,:,:) * PRT(:,:,:,4)
+    !dir$ concurrent
     PRTHLS(:,:,:) = PRTHLS(:,:,:) + ZLVOCPEXNM(:,:,:) * PRRS(:,:,:,2) &
                                   + ZLSOCPEXNM(:,:,:) * PRRS(:,:,:,4)
 !$acc end kernels
@@ -2539,48 +2541,58 @@ CALL EMOIST(KRR,KRRI,PTHLT,PRT,PLOCPEXNM,PAMOIST,PSRCT,ZEMOIST)
 !
 !$acc kernels
 IF (KRR>0) THEN
-!$acc loop independent collapse(3) private(ZVAR)
-  DO JK = KKTB+1,KKTE-1
-    DO JJ=1,SIZE(PLM,2)
-      DO JI=1,SIZE(PLM,1)
-        ZDTHLDZ(JI,JJ,JK)= 0.5*((PTHLT(JI,JJ,JK+KKL)-PTHLT(JI,JJ,JK    ))/PDZZ(JI,JJ,JK+KKL)+ &
-                                (PTHLT(JI,JJ,JK    )-PTHLT(JI,JJ,JK-KKL))/PDZZ(JI,JJ,JK    ))
-        ZDRTDZ(JI,JJ,JK) = 0.5*((PRT(JI,JJ,JK+KKL,1)-PRT(JI,JJ,JK    ,1))/PDZZ(JI,JJ,JK+KKL)+ &
-                                (PRT(JI,JJ,JK    ,1)-PRT(JI,JJ,JK-KKL,1))/PDZZ(JI,JJ,JK    ))
-        IF (GOCEAN) THEN
-          ZVAR=XG*(XALPHAOC*ZDTHLDZ(JI,JJ,JK)-XBETAOC*ZDRTDZ(JI,JJ,JK))
-        ELSE
-          ZVAR=XG/PTHVREF(JI,JJ,JK)*                                                  &
+#ifdef MNH_COMPILER_NVHPC
+   !$acc loop independent collapse(3) private(ZVAR)
+#endif
+   !dir$ concurrent 
+   DO JK = KKTB+1,KKTE-1
+      !dir$ concurrent
+      DO JJ=1,SIZE(PLM,2)
+         !dir$ concurrent
+         DO JI=1,SIZE(PLM,1)          
+            ZDTHLDZ(JI,JJ,JK)= 0.5*((PTHLT(JI,JJ,JK+KKL)-PTHLT(JI,JJ,JK    ))/PDZZ(JI,JJ,JK+KKL)+ &
+                 (PTHLT(JI,JJ,JK    )-PTHLT(JI,JJ,JK-KKL))/PDZZ(JI,JJ,JK    ))
+            ZDRTDZ(JI,JJ,JK) = 0.5*((PRT(JI,JJ,JK+KKL,1)-PRT(JI,JJ,JK    ,1))/PDZZ(JI,JJ,JK+KKL)+ &
+                 (PRT(JI,JJ,JK    ,1)-PRT(JI,JJ,JK-KKL,1))/PDZZ(JI,JJ,JK    ))
+            IF (GOCEAN) THEN
+               ZVAR=XG*(XALPHAOC*ZDTHLDZ(JI,JJ,JK)-XBETAOC*ZDRTDZ(JI,JJ,JK))
+            ELSE
+               ZVAR=XG/PTHVREF(JI,JJ,JK)*                                                  &
              (ZETHETA(JI,JJ,JK)*ZDTHLDZ(JI,JJ,JK)+ZEMOIST(JI,JJ,JK)*ZDRTDZ(JI,JJ,JK))
-        END IF
-        !
-        IF (ZVAR>0.) THEN
-          PLM(JI,JJ,JK)=MAX(XMNH_EPSILON,MIN(PLM(JI,JJ,JK), &
-                        0.76* SQRT(PTKET(JI,JJ,JK)/ZVAR)))
-        END IF
+            END IF
+            !
+            IF (ZVAR>0.) THEN
+               PLM(JI,JJ,JK)=MAX(XMNH_EPSILON,MIN(PLM(JI,JJ,JK), &
+                    0.76* SQRT(PTKET(JI,JJ,JK)/ZVAR)))
+            END IF
+         END DO
       END DO
-    END DO
-  END DO
+   END DO
 ELSE! For dry atmos or unsalted ocean runs
-!$acc loop independent collapse(3) private(ZVAR)
-  DO JK = KKTB+1,KKTE-1
-    DO JJ=1,SIZE(PLM,2)
-      DO JI=1,SIZE(PLM,1)
-        ZDTHLDZ(JI,JJ,JK)= 0.5*((PTHLT(JI,JJ,JK+KKL)-PTHLT(JI,JJ,JK    ))/PDZZ(JI,JJ,JK+KKL)+ &
-                                (PTHLT(JI,JJ,JK    )-PTHLT(JI,JJ,JK-KKL))/PDZZ(JI,JJ,JK    ))
-        IF (GOCEAN) THEN
-          ZVAR= XG*XALPHAOC*ZDTHLDZ(JI,JJ,JK)
-        ELSE
-          ZVAR= XG/PTHVREF(JI,JJ,JK)*ZETHETA(JI,JJ,JK)*ZDTHLDZ(JI,JJ,JK)
-        END IF
-!
-        IF (ZVAR>0.) THEN
-          PLM(JI,JJ,JK)=MAX(XMNH_EPSILON,MIN(PLM(JI,JJ,JK), &
-                        0.76* SQRT(PTKET(JI,JJ,JK)/ZVAR)))
-        END IF
+#ifdef MNH_COMPILER_NVHPC
+   !$acc loop independent collapse(3) private(ZVAR)
+#endif
+   !dir$ concurrent   
+   DO JK = KKTB+1,KKTE-1
+      !dir$ concurrent  
+      DO JJ=1,SIZE(PLM,2)
+         !dir$ concurrent
+         DO JI=1,SIZE(PLM,1)
+            ZDTHLDZ(JI,JJ,JK)= 0.5*((PTHLT(JI,JJ,JK+KKL)-PTHLT(JI,JJ,JK    ))/PDZZ(JI,JJ,JK+KKL)+ &
+                 (PTHLT(JI,JJ,JK    )-PTHLT(JI,JJ,JK-KKL))/PDZZ(JI,JJ,JK    ))
+            IF (GOCEAN) THEN
+               ZVAR= XG*XALPHAOC*ZDTHLDZ(JI,JJ,JK)
+            ELSE
+               ZVAR= XG/PTHVREF(JI,JJ,JK)*ZETHETA(JI,JJ,JK)*ZDTHLDZ(JI,JJ,JK)
+            END IF
+            !
+            IF (ZVAR>0.) THEN
+               PLM(JI,JJ,JK)=MAX(XMNH_EPSILON,MIN(PLM(JI,JJ,JK), &
+                    0.76* SQRT(PTKET(JI,JJ,JK)/ZVAR)))
+            END IF
+         END DO
       END DO
-    END DO
-  END DO
+   END DO
 END IF
 !  special case near the surface
 ZDTHLDZ(:,:,KKB)=(PTHLT(:,:,KKB+KKL)-PTHLT(:,:,KKB))/PDZZ(:,:,KKB+KKL)
-- 
GitLab