From 2080de7cd92f74a2ef0b70e007e237a8963d7a52 Mon Sep 17 00:00:00 2001 From: Juan ESCOBAR <juan.escobar@aero.obs-mip.fr> Date: Fri, 4 Nov 2022 18:40:16 +0100 Subject: [PATCH] Juan 04/11/2022:MNH/tridiag*.f90, CCE Optimization, use "!$acc parallel" to avoid multiple kernel launches inside seq loop --- src/MNH/tridiag_thermo.f90 | 48 ++++++++++++++++++++------------ src/MNH/tridiag_tke.f90 | 57 +++++++++++++++++++++++++------------- src/MNH/tridiag_w.f90 | 16 ++++++++++- src/MNH/tridiag_wind.f90 | 30 +++++++++++++------- 4 files changed, 104 insertions(+), 47 deletions(-) diff --git a/src/MNH/tridiag_thermo.f90 b/src/MNH/tridiag_thermo.f90 index c96d99382..871581a06 100644 --- a/src/MNH/tridiag_thermo.f90 +++ b/src/MNH/tridiag_thermo.f90 @@ -263,7 +263,7 @@ CALL MZM_DEVICE(PRHODJ,ZMZM_RHODJ) ZRHODJ_DFDDTDZ_O_DZ2(:,:,:) = ZMZM_RHODJ(:,:,:)*PDFDDTDZ(:,:,:)/PDZZ(:,:,:)**2 #else !$acc_nv loop independent collapse(3) -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) +DO CONCURRENT (JK=1:JKU,JJ=1:JJU,JI=1:JIU) ZRHODJ_DFDDTDZ_O_DZ2(JI,JJ,JK) = ZMZM_RHODJ(JI,JJ,JK)*PDFDDTDZ(JI,JJ,JK)/BR_P2(PDZZ(JI,JJ,JK)) END DO !CONCURRENT #endif @@ -285,7 +285,7 @@ ZY=0. 
#ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) +DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZY(JI,JJ,IKB) = PRHODJ(JI,JJ,IKB)*PVARM(JI,JJ,IKB)/PTSTEP & - ZMZM_RHODJ(JI,JJ,IKB+KKL) * PF(JI,JJ,IKB+KKL)/PDZZ(JI,JJ,IKB+KKL) & + ZMZM_RHODJ(JI,JJ,IKB ) * PF(JI,JJ,IKB )/PDZZ(JI,JJ,IKB ) & @@ -298,7 +298,7 @@ END DO !CONCURRENT #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(3) #endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=IKTB+1:IKTE-1) +DO CONCURRENT (JK=IKTB+1:IKTE-1,JJ=1:JJU,JI=1:JIU) ZY(JI,JJ,JK) = PRHODJ(JI,JJ,JK)*PVARM(JI,JJ,JK)/PTSTEP & - ZMZM_RHODJ(JI,JJ,JK+KKL) * PF(JI,JJ,JK+KKL)/PDZZ(JI,JJ,JK+KKL) & + ZMZM_RHODJ(JI,JJ,JK ) * PF(JI,JJ,JK )/PDZZ(JI,JJ,JK ) & @@ -313,7 +313,7 @@ END DO !CONCURRENT #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) +DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZY(JI,JJ,IKE) = PRHODJ(JI,JJ,IKE)*PVARM(JI,JJ,IKE)/PTSTEP & - ZMZM_RHODJ(JI,JJ,IKE+KKL) * PF(JI,JJ,IKE+KKL)/PDZZ(JI,JJ,IKE+KKL) & + ZMZM_RHODJ(JI,JJ,IKE ) * PF(JI,JJ,IKE )/PDZZ(JI,JJ,IKE ) & @@ -336,7 +336,7 @@ IF ( PIMPL > 1.E-10 ) THEN #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) +DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZB(JI,JJ,IKB) = PRHODJ(JI,JJ,IKB)/PTSTEP & - ZRHODJ_DFDDTDZ_O_DZ2(JI,JJ,IKB+KKL) * PIMPL END DO !CONCURRENT @@ -346,7 +346,7 @@ END DO !CONCURRENT #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) +DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZC(JI,JJ,IKB) = ZRHODJ_DFDDTDZ_O_DZ2(JI,JJ,IKB+KKL) * PIMPL END DO !CONCURRENT !$acc end kernels @@ -355,7 +355,7 @@ END DO !CONCURRENT #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(3) #endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=IKTB+1:IKTE-1) +DO CONCURRENT (JK=IKTB+1:IKTE-1,JJ=1:JJU,JI=1:JIU) ZA(JI,JJ,JK) = ZRHODJ_DFDDTDZ_O_DZ2(JI,JJ,JK) * PIMPL ZB(JI,JJ,JK) = PRHODJ(JI,JJ,JK)/PTSTEP & - 
ZRHODJ_DFDDTDZ_O_DZ2(JI,JJ,JK+KKL) * PIMPL & @@ -368,7 +368,7 @@ END DO !CONCURRENT #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) +DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZA(JI,JJ,IKE) = ZRHODJ_DFDDTDZ_O_DZ2(JI,JJ,IKE ) * PIMPL ZB(JI,JJ,IKE) = PRHODJ(JI,JJ,IKE)/PTSTEP & - ZRHODJ_DFDDTDZ_O_DZ2(JI,JJ,IKE ) * PIMPL @@ -385,16 +385,22 @@ END DO !CONCURRENT #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) +DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZBET(JI,JJ) = ZB(JI,JJ,IKB) ! bet = b(ikb) PVARP(JI,JJ,IKB) = ZY(JI,JJ,IKB) / ZBET(JI,JJ) END DO !CONCURRENT +!$acc end kernels ! +!$acc parallel !$acc loop seq DO JK = IKB+KKL,IKE-KKL,KKL +#ifdef MNH_COMPILER_NVHPC ! gang+vector needed or parallisation vector only - !$acc_nv loop independent gang, vector collapse(2) - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) + !$acc loop independent gang, vector collapse(2) +#else + !$acc loop independent +#endif + DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZGAM(JI,JJ,JK) = ZC(JI,JJ,JK-KKL) / ZBET(JI,JJ) ! gam(k) = c(k-1) / bet ZBET(JI,JJ) = ZB(JI,JJ,JK) - ZA(JI,JJ,JK) * ZGAM(JI,JJ,JK) @@ -403,11 +409,13 @@ DO JK = IKB+KKL,IKE-KKL,KKL ! res(k) = (y(k) -a(k)*res(k-1))/ bet END DO !CONCURRENT END DO +!$acc end parallel +!$acc kernels ! special treatment for the last level #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) +DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZGAM(JI,JJ,IKE) = ZC(JI,JJ,IKE-KKL) / ZBET(JI,JJ) ! gam(k) = c(k-1) / bet ZBET(JI,JJ) = ZB(JI,JJ,IKE) - ZA(JI,JJ,IKE) * ZGAM(JI,JJ,IKE) @@ -415,19 +423,25 @@ DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) PVARP(JI,JJ,IKE)= ( ZY(JI,JJ,IKE) - ZA(JI,JJ,IKE) * PVARP(JI,JJ,IKE-KKL) ) / ZBET(JI,JJ) ! res(k) = (y(k) -a(k)*res(k-1))/ bet END DO !CONCURRENT +!$acc end kernels ! !* 3.3 going down ! ---------- ! +!$acc parallel !$acc loop seq DO JK = IKE-KKL,IKB,-1*KKL +#ifdef MNH_COMPILER_NVHPC ! 
gang+vector needed or parallisation vector only - !$acc_nv loop independent gang, vector collapse(2) - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) + !$acc loop independent gang, vector collapse(2) +#else + !$acc loop independent +#endif + DO CONCURRENT (JJ=1:JJU,JI=1:JIU) PVARP(JI,JJ,JK) = PVARP(JI,JJ,JK) - ZGAM(JI,JJ,JK+KKL) * PVARP(JI,JJ,JK+KKL) END DO !CONCURRENT END DO -!$acc end kernels +!$acc end parallel ! ELSE ! @@ -435,7 +449,7 @@ ELSE #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(3) #endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=IKTB:IKTE) +DO CONCURRENT (JK=IKTB:IKTE,JJ=1:JJU,JI=1:JIU) PVARP(JI,JJ,JK) = ZY(JI,JJ,JK) * PTSTEP / PRHODJ(JI,JJ,JK) END DO !CONCURRENT !$acc end kernels @@ -450,7 +464,7 @@ END IF #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) +DO CONCURRENT (JJ=1:JJU,JI=1:JIU) PVARP(JI,JJ,KKA)=PVARP(JI,JJ,IKB) PVARP(JI,JJ,KKU)=PVARP(JI,JJ,IKE) END DO !CONCURRENT diff --git a/src/MNH/tridiag_tke.f90 b/src/MNH/tridiag_tke.f90 index 4cf3e15c4..89840fb59 100644 --- a/src/MNH/tridiag_tke.f90 +++ b/src/MNH/tridiag_tke.f90 @@ -211,28 +211,34 @@ CALL MNH_MEM_GET( zbet, JIU, JJU ) !* 1. COMPUTE THE RIGHT HAND SIDE ! --------------------------- ! -!$acc kernels + IKT=SIZE(PVARM,3) IKTB=1+JPVEXT_TURB IKTE=IKT-JPVEXT_TURB IKB=KKA+JPVEXT_TURB*KKL IKE=KKU-JPVEXT_TURB*KKL +!$acc kernels ! ! #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) +DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZY(JI,JJ,IKB) = PVARM(JI,JJ,IKB) + PTSTEP*PSOURCE(JI,JJ,IKB) - & PEXPL / PRHODJ(JI,JJ,IKB) * PA(JI,JJ,IKB+KKL) * (PVARM(JI,JJ,IKB+KKL) - PVARM(JI,JJ,IKB)) END DO !CONCURRENT +!$acc end kernels ! 
+!$acc parallel +!$acc loop seq DO JK=IKTB+1,IKTE-1 #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) +#else + !$acc loop independent #endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) + DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZY(JI,JJ,JK)= PVARM(JI,JJ,JK) + PTSTEP*PSOURCE(JI,JJ,JK) - & PEXPL / PRHODJ(JI,JJ,JK) * & ( PVARM(JI,JJ,JK-KKL)*PA(JI,JJ,JK) & @@ -241,42 +247,46 @@ DO JK=IKTB+1,IKTE-1 ) END DO !CONCURRENT END DO +!$acc end parallel ! +!$acc kernels #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) +DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZY(JI,JJ,IKE)= PVARM(JI,JJ,IKE) + PTSTEP*PSOURCE(JI,JJ,IKE) + & PEXPL / PRHODJ(JI,JJ,IKE) * PA(JI,JJ,IKE) * (PVARM(JI,JJ,IKE)-PVARM(JI,JJ,IKE-KKL)) END DO !CONCURRENT +!$acc end kernels ! ! !* 2. INVERSION OF THE TRIDIAGONAL SYSTEM ! ----------------------------------- ! IF ( PIMPL > 1.E-10 ) THEN -! +!$acc kernels ! ! going up - ! + ! #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) + DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZBET(JI,JJ) = 1. + PIMPL * (PDIAG(JI,JJ,IKB)-PA(JI,JJ,IKB+KKL) / PRHODJ(JI,JJ,IKB)) ! bet = b(ikb) PVARP(JI,JJ,IKB) = ZY(JI,JJ,IKB) / ZBET(JI,JJ) END DO !CONCURRENT ! +!$acc end kernels +!$acc parallel !$acc loop seq DO JK = IKB+KKL,IKE-KKL,KKL #ifdef MNH_COMPILER_NVHPC !$acc loop gang, vector collapse(2) independent +#else + !$acc loop independent #endif - !dir$ concurrent ! collapse(JJ,JI) - DO JJ=1,JJU - !dir$ concurrent - DO JI=1,JIU + DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZGAM(JI,JJ,JK) = PIMPL * PA(JI,JJ,JK) / PRHODJ(JI,JJ,JK-KKL) / ZBET(JI,JJ) ! gam(k) = c(k-1) / bet ZBET(JI,JJ) = 1. + PIMPL * ( PDIAG(JI,JJ,JK) - & @@ -288,14 +298,15 @@ IF ( PIMPL > 1.E-10 ) THEN * PVARP(JI,JJ,JK-KKL) & ) / ZBET(JI,JJ) ! res(k) = (y(k) -a(k)*res(k-1))/ bet - END DO - END DO + END DO END DO +!$acc end parallel +!$acc kernels ! 
special treatment for the last level #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) + DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZGAM(JI,JJ,IKE) = PIMPL * PA(JI,JJ,IKE) / PRHODJ(JI,JJ,IKE-KKL) / ZBET(JI,JJ) ! gam(k) = c(k-1) / bet ZBET(JI,JJ) = 1. + PIMPL * ( PDIAG(JI,JJ,IKE) - & @@ -307,38 +318,46 @@ IF ( PIMPL > 1.E-10 ) THEN ) / ZBET(JI,JJ) ! res(k) = (y(k) -a(k)*res(k-1))/ bet END DO !CONCURRENT +!$acc end kernels ! ! going down ! +!$acc parallel !$acc loop seq DO JK = IKE-KKL,IKB,-1*KKL #ifdef MNH_COMPILER_NVHPC !$acc loop gang, vector collapse(2) +#else + !$acc loop independent #endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) + DO CONCURRENT (JJ=1:JJU,JI=1:JIU) PVARP(JI,JJ,JK) = PVARP(JI,JJ,JK) - ZGAM(JI,JJ,JK+KKL) * PVARP(JI,JJ,JK+KKL) END DO !CONCURRENT END DO -! +!$acc end parallel +! ELSE ! +!$acc kernels #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) + DO CONCURRENT (JJ=1:JJU,JI=1:JIU) PVARP(JI,JJ,IKTB:IKTE) = ZY(JI,JJ,IKTB:IKTE) END DO !CONCURRENT ! -END IF +!$acc end kernels +END IF ! ! !* 3. FILL THE UPPER AND LOWER EXTERNAL VALUES ! ---------------------------------------- ! +!$acc kernels #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) +DO CONCURRENT (JJ=1:JJU,JI=1:JIU) PVARP(JI,JJ,KKA)=PVARP(JI,JJ,IKB) PVARP(JI,JJ,KKU)=PVARP(JI,JJ,IKE) END DO !CONCURRENT diff --git a/src/MNH/tridiag_w.f90 b/src/MNH/tridiag_w.f90 index a0647140e..62561c15f 100644 --- a/src/MNH/tridiag_w.f90 +++ b/src/MNH/tridiag_w.f90 @@ -160,7 +160,7 @@ USE MODI_SHUMAN_DEVICE #if defined(MNH_BITREP) || defined(MNH_BITREP_OMP) USE MODI_BITREP #endif -#ifdef MNH_BITREP_OMP +#ifdef MNH_COMPILER_CCE !$mnh_undef(LOOP) !$mnh_undef(OPENACC) #endif @@ -380,9 +380,14 @@ ZY=0. ZBET(JI,JJ) = ZB(JI,JJ,IKB) ! bet = b(ikb) PVARP(JI,JJ,IKB) = ZY(JI,JJ,IKB) / ZBET(JI,JJ) !$mnh_end_do() +!$acc end kernels ! 
+!$acc parallel !$acc loop seq DO JK = IKB+1,IKE-1 +#ifdef MNH_COMPILER_CCE + !$acc loop independent +#endif !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU) ZGAM(JI,JJ,JK) = ZC(JI,JJ,JK-1) / ZBET(JI,JJ) ! gam(k) = c(k-1) / bet @@ -392,7 +397,9 @@ DO JK = IKB+1,IKE-1 ! res(k) = (y(k) -a(k)*res(k-1))/ bet !$mnh_end_do() END DO +!$acc end parallel ! special treatment for the last level +!$acc kernels !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU) ZGAM(JI,JJ,IKE) = ZC(JI,JJ,IKE-1) / ZBET(JI,JJ) ! gam(k) = c(k-1) / bet @@ -401,21 +408,28 @@ END DO PVARP(JI,JJ,IKE)= ( ZY(JI,JJ,IKE) - ZA(JI,JJ,IKE) * PVARP(JI,JJ,IKE-1) ) / ZBET(JI,JJ) ! res(k) = (y(k) -a(k)*res(k-1))/ bet !$mnh_end_do() +!$acc end kernels ! !* 3.3 going down ! ---------- ! +!$acc parallel !$acc loop seq DO JK = IKE-1,IKB,-1 +#ifdef MNH_COMPILER_CCE + !$acc loop independent +#endif !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU) PVARP(JI,JJ,JK) = PVARP(JI,JJ,JK) - ZGAM(JI,JJ,JK+1) * PVARP(JI,JJ,JK+1) !$mnh_end_do() END DO +!$acc end parallel ! ! !* 4. FILL THE UPPER AND LOWER EXTERNAL VALUES ! ---------------------------------------- ! +!$acc kernels !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU) PVARP(JI,JJ,IKB-1)=PVARP(JI,JJ,IKB) PVARP(JI,JJ,IKE+1)=0. diff --git a/src/MNH/tridiag_wind.f90 b/src/MNH/tridiag_wind.f90 index d0c16c3fb..c26ecad00 100644 --- a/src/MNH/tridiag_wind.f90 +++ b/src/MNH/tridiag_wind.f90 @@ -227,14 +227,14 @@ IKE=KKU-JPVEXT_TURB*KKL ! ! !$acc kernels ! async -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) +DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZY(JI,JJ,IKB) = PVARM(JI,JJ,IKB) + PTSTEP*PSOURCE(JI,JJ,IKB) - & PEXPL / PRHODJA(JI,JJ,IKB) * PA(JI,JJ,IKB+KKL) * (PVARM(JI,JJ,IKB+KKL) - PVARM(JI,JJ,IKB)) END DO !CONCURRENT !$acc end kernels ! !$acc kernels ! 
async -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=IKTB+1:IKTE-1) +DO CONCURRENT (JK=IKTB+1:IKTE-1,JJ=1:JJU,JI=1:JIU) ZY(JI,JJ,JK)= PVARM(JI,JJ,JK) + PTSTEP*PSOURCE(JI,JJ,JK) - & PEXPL / PRHODJA(JI,JJ,JK) * & ( PVARM(JI,JJ,JK-KKL)*PA(JI,JJ,JK) & @@ -245,7 +245,7 @@ END DO !CONCURRENT !$acc end kernels ! !$acc kernels ! async -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) +DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZY(JI,JJ,IKE)= PVARM(JI,JJ,IKE) + PTSTEP*PSOURCE(JI,JJ,IKE) + & PEXPL / PRHODJA(JI,JJ,IKE) * PA(JI,JJ,IKE) * (PVARM(JI,JJ,IKE)-PVARM(JI,JJ,IKE-KKL)) END DO !CONCURRENT @@ -262,20 +262,24 @@ IF ( PIMPL > 1.E-10 ) THEN ! ! going up ! - !$acc kernels + !$acc kernels #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) + DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZBET(JI,JJ) = 1. - PIMPL * ( PA(JI,JJ,IKB+KKL) / PRHODJA(JI,JJ,IKB) & + PCOEFS(JI,JJ) * PTSTEP ) ! bet = b(ikb) PVARP(JI,JJ,IKB) = ZY(JI,JJ,IKB) / ZBET(JI,JJ) END DO !CONCURRENT + !$acc end kernels ! + !$acc parallel !$acc loop seq DO JK = IKB+KKL,IKE-KKL,KKL #ifdef MNH_COMPILER_NVHPC !$acc loop independent gang, vector collapse(2) +#else + !$acc loop independent #endif DO CONCURRENT ( JJ=1:JJU , JI=1:JIU ) ZGAM(JI,JJ,JK) = PIMPL * PA(JI,JJ,JK) / PRHODJA(JI,JJ,JK-KKL) / ZBET(JI,JJ) @@ -290,11 +294,13 @@ IF ( PIMPL > 1.E-10 ) THEN ! res(k) = (y(k) -a(k)*res(k-1))/ bet END DO ! CONCURRENT END DO + !$acc end parallel + !$acc kernels ! special treatment for the last level #ifdef MNH_COMPILER_NVHPC !$acc loop independent gang, vector collapse(2) #endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) + DO CONCURRENT (JJ=1:JJU,JI=1:JIU) ZGAM(JI,JJ,IKE) = PIMPL * PA(JI,JJ,IKE) / PRHODJA(JI,JJ,IKE-KKL) / ZBET(JI,JJ) ! gam(k) = c(k-1) / bet ZBET(JI,JJ) = 1. - PIMPL * ( PA(JI,JJ,IKE) * (1. + ZGAM(JI,JJ,IKE)) & @@ -305,24 +311,28 @@ IF ( PIMPL > 1.E-10 ) THEN ) / ZBET(JI,JJ) ! res(k) = (y(k) -a(k)*res(k-1))/ bet END DO !CONCURRENT + !$acc end kernels ! ! going down ! 
+ !$acc parallel !$acc loop seq DO JK = IKE-KKL,IKB,-1*KKL #ifdef MNH_COMPILER_NVHPC !$acc loop gang, vector collapse(2) +#else + !$acc loop independent #endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) + DO CONCURRENT (JJ=1:JJU,JI=1:JIU) PVARP(JI,JJ,JK) = PVARP(JI,JJ,JK) - ZGAM(JI,JJ,JK+KKL) * PVARP(JI,JJ,JK+KKL) END DO !CONCURRENT END DO -!$acc end kernels + !$acc end parallel ! ELSE ! !$acc kernels - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=IKTB:IKTE) + DO CONCURRENT (JK=IKTB:IKTE,JJ=1:JJU,JI=1:JIU) PVARP(JI,JJ,JK) = ZY(JI,JJ,JK) END DO !CONCURRENT !$acc end kernels @@ -334,7 +344,7 @@ END IF ! ---------------------------------------- ! !$acc kernels -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU) +DO CONCURRENT (JJ=1:JJU,JI=1:JIU) PVARP(JI,JJ,KKA)=PVARP(JI,JJ,IKB) PVARP(JI,JJ,KKU)=PVARP(JI,JJ,IKE) END DO !CONCURRENT -- GitLab