From a5af36d5bf5fd89a0948fe87eb326317316e0f47 Mon Sep 17 00:00:00 2001 From: Juan ESCOBAR <juan.escobar@aero.obs-mip.fr> Date: Fri, 16 Dec 2022 14:18:41 +0100 Subject: [PATCH] Juan 16/12/2022:ZSOLVER/* , GPU loop optimization in good order : DO CONCURRENT -> !$mnh_do_concurrent --- src/ZSOLVER/advection_metsv.f90 | 97 +++++++++++++++++++------------ src/ZSOLVER/advection_uvw.f90 | 14 ++--- src/ZSOLVER/advection_uvw_cen.f90 | 3 +- src/ZSOLVER/contrav.f90 | 40 +++++-------- src/ZSOLVER/dotprod.f90 | 20 +++---- src/ZSOLVER/get_halo.f90 | 42 ++++++++++--- src/ZSOLVER/ppm.f90 | 9 +-- src/ZSOLVER/turb.f90 | 93 ++++++++++++++--------------- 8 files changed, 170 insertions(+), 148 deletions(-) diff --git a/src/ZSOLVER/advection_metsv.f90 b/src/ZSOLVER/advection_metsv.f90 index 69e179229..410aece0a 100644 --- a/src/ZSOLVER/advection_metsv.f90 +++ b/src/ZSOLVER/advection_metsv.f90 @@ -191,9 +191,9 @@ USE MODI_ADV_BOUNDARIES #if defined(MNH_BITREP) || defined(MNH_BITREP_OMP) USE MODI_BITREP #endif -#ifdef MNH_COMPILER_CCE -!$mnh_undef(LOOP) -!$mnh_undef(OPENACC) +#if defined(MNH_COMPILER_CCE) && defined(MNH_BITREP_OMP) +! mnh_undef(LOOP) +! mnh_undef(OPENACC) #endif USE MODI_CONTRAV @@ -539,29 +539,40 @@ IF (.NOT. L1D) THEN !$acc end kernels IF (LIBM) THEN !$acc kernels -!$mnh_expand_array(JI=IIB:IIE,JJ=IJB:IJE,JK=1:JKU) #if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) +!$mnh_expand_array(JI=IIB:IIE,JJ=IJB:IJE,JK=1:JKU) ZCFLU(IIB:IIE,IJB:IJE,:) = ZCFLU(IIB:IIE,IJB:IJE,:)*(1.-exp(-(XIBM_LS(IIB:IIE,IJB:IJE,:,2)/& (XRHODJ(IIB:IIE,IJB:IJE,:)/XRHODREF(IIB:IIE,IJB:IJE,:))**(1./3.))**2.)) ZCFLV(IIB:IIE,IJB:IJE,:) = ZCFLV(IIB:IIE,IJB:IJE,:)*(1.-exp(-(XIBM_LS(IIB:IIE,IJB:IJE,:,3)/& (XRHODJ(IIB:IIE,IJB:IJE,:)/XRHODREF(IIB:IIE,IJB:IJE,:))**(1./3.))**2.)) ZCFLW(IIB:IIE,IJB:IJE,:) = ZCFLW(IIB:IIE,IJB:IJE,:)*(1.-exp(-(XIBM_LS(IIB:IIE,IJB:IJE,:,4)/& (XRHODJ(IIB:IIE,IJB:IJE,:)/XRHODREF(IIB:IIE,IJB:IJE,:))**(1./3.))**2.)) +!$mnh_end_expand_array() #else +#if defined(MNH_COMPILER_CCE) && defined(MNH_BITREP_OMP) +DO CONCURRENT (JK=1:JKU,JJ=IJB:IJE,JI=IIB:IIE) +#else +!$mnh_expand_array(JI=IIB:IIE,JJ=IJB:IJE,JK=1:JKU) +#endif ZCFLU(IIB:IIE,IJB:IJE,:) = ZCFLU(IIB:IIE,IJB:IJE,:)*(1.-Br_exp(-Br_pow(XIBM_LS(IIB:IIE,IJB:IJE,:,2)/& Br_pow(XRHODJ(IIB:IIE,IJB:IJE,:)/XRHODREF(IIB:IIE,IJB:IJE,:),1./3.),2.))) ZCFLV(IIB:IIE,IJB:IJE,:) = ZCFLV(IIB:IIE,IJB:IJE,:)*(1.-Br_exp(-Br_pow(XIBM_LS(IIB:IIE,IJB:IJE,:,3)/& Br_pow(XRHODJ(IIB:IIE,IJB:IJE,:)/XRHODREF(IIB:IIE,IJB:IJE,:),1./3.),2.))) ZCFLW(IIB:IIE,IJB:IJE,:) = ZCFLW(IIB:IIE,IJB:IJE,:)*(1.-Br_exp(-Br_pow(XIBM_LS(IIB:IIE,IJB:IJE,:,4)/& Br_pow(XRHODJ(IIB:IIE,IJB:IJE,:)/XRHODREF(IIB:IIE,IJB:IJE,:),1./3.),2.))) -#endif +#if defined(MNH_COMPILER_CCE) && defined(MNH_BITREP_OMP) +END DO ! CONCURRENT +#else !$mnh_end_expand_array() +#endif +#endif WHERE (XIBM_LS(IIB:IIE,IJB:IJE,:,2).GT.(-ZIBM_EPSI)) ZCFLU(IIB:IIE,IJB:IJE,:)=0. WHERE (XIBM_LS(IIB:IIE,IJB:IJE,:,3).GT.(-ZIBM_EPSI)) ZCFLV(IIB:IIE,IJB:IJE,:)=0. WHERE (XIBM_LS(IIB:IIE,IJB:IJE,:,4).GT.(-ZIBM_EPSI)) ZCFLW(IIB:IIE,IJB:IJE,:)=0. !$acc end kernels ENDIF -#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) +!if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) +#if !defined(MNH_BITREP) IF (.NOT. L2D) THEN !$acc kernels present_cr(ZCFL) ZCFL(:,:,:) = SQRT(ZCFLU(:,:,:)**2+ZCFLV(:,:,:)**2+ZCFLW(:,:,:)**2) @@ -574,17 +585,15 @@ IF (.NOT. L1D) THEN #else IF (.NOT. L2D) THEN !$acc kernels - !$acc_nv loop independent collapse(3) - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU ) + !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU) ZCFL(JI,JJ,JK) = SQRT(BR_P2(ZCFLU(JI,JJ,JK))+BR_P2(ZCFLV(JI,JJ,JK))+BR_P2(ZCFLW(JI,JJ,JK))) - END DO + !$mnh_end_do() !$acc end kernels ELSE !$acc kernels - !$acc_nv loop independent collapse(3) - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU ) + !$mnh_do_concurrent( JI=1:JIU,JJ=1:JJU,JK=1:JKU ) ZCFL(JI,JJ,JK) = SQRT(BR_P2(ZCFLU(JI,JJ,JK))+BR_P2(ZCFLW(JI,JJ,JK))) - END DO + !$mnh_end_do() !$acc end kernels END IF #endif @@ -592,13 +601,13 @@ ELSE !$acc kernels ZCFLU(:,:,:) = 0.0 ; ZCFLV(:,:,:) = 0.0 ; ZCFLW(:,:,:) = 0.0 ZCFLW(IIB:IIE,IJB:IJE,:) = ABS(ZRWCPPM(IIB:IIE,IJB:IJE,:) * PTSTEP) -#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) +!if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) +#if !defined(MNH_BITREP) ZCFL(:,:,:) = SQRT(ZCFLW(:,:,:)**2) -#else - !$acc_nv loop independent collapse(3) - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU ) +#else + !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU ) ZCFL(JI,JJ,JK) = SQRT(BR_P2(ZCFLW(JI,JJ,JK))) - END DO + !$mnh_end_do() #endif !$acc end kernels END IF @@ -678,7 +687,7 @@ ZCFL_MAX = MAXVAL(ZCFL (IIB:IIE,IJB:IJE,IKB:IKE)) #else ZCFLU_MAX = 0.0 ; ZCFLV_MAX = 0.0 ; ZCFLW_MAX = 0.0 ; ZCFL_MAX = 0.0 !$acc parallel reduction(max:ZCFLU_MAX,ZCFLV_MAX,ZCFLW_MAX,ZCFL_MAX) -!$mnh_do_concurrent (JI=IIB:IIE,JJ=IJB:IJE,JK=IKB:IKE) +!$mnh_do_concurrent(JI=IIB:IIE,JJ=IJB:IJE,JK=IKB:IKE) ZCFLU_MAX = MAX(ZCFLU_MAX,ZCFLU(JI,JJ,JK)) ZCFLV_MAX = MAX(ZCFLV_MAX,ZCFLV(JI,JJ,JK)) ZCFLW_MAX = MAX(ZCFLW_MAX,ZCFLW(JI,JJ,JK)) @@ -768,7 +777,9 @@ ZRWCPPM(:,:,:) = ZRWCPPM(:,:,:)*ZTSTEP_PPM !dir$ concurrent ZRTHS_OTHER(:,:,:) = PRTHS(:,:,:) - PTHT(:,:,:) * PRHODJ(:,:,:) / PTSTEP !dir$ concurrent -IF (GTKE) ZRTKES_OTHER(:,:,:) = PRTKES(:,:,:) - PTKET(:,:,:) * PRHODJ(:,:,:) / PTSTEP +IF (GTKE) THEN + ZRTKES_OTHER(:,:,:) = PRTKES(:,:,:) - PTKET(:,:,:) * PRHODJ(:,:,:) / PTSTEP +END IF DO JR = 1, KRR !dir$ concurrent ZRRS_OTHER(:,:,:,JR) = PRRS(:,:,:,JR) - PRT(:,:,:,JR) * PRHODJ(:,:,:) / PTSTEP @@ -864,10 +875,18 @@ CALL PPM_RHODJ(HLBCX,HLBCY, ZRUCPPM, ZRVCPPM, ZRWCPPM, & !$acc kernels !dir$ concurrent ZTH(:,:,:) = PTHT(:,:,:) -!dir$ concurrent -IF (KRR /=0 ) ZR(:,:,:,:) = PRT(:,:,:,:) -!dir$ concurrent -IF (KSV /=0 ) ZSV(:,:,:,:) = PSVT(:,:,:,:) +!dir concurrent +IF (KRR /=0 ) THEN + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU,JR=1:KRR ) + ZR(:,:,:,:) = PRT(:,:,:,:) + !$mnh_end_expand_array() +END IF +!dir concurrent +IF (KSV /=0 ) THEN + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU,JSV=1:KSV) + ZSV(:,:,:,:) = PSVT(:,:,:,:) + !$mnh_end_expand_array() +END IF ! IF (GTKE) THEN PRTKES_ADV(:,:,:) = 0. @@ -914,15 +933,21 @@ DO JSPL=1,KSPLIT ! Tendencies of PPM ! ! acc kernels - !$acc kernels - !dir$ concurrent + !$acc kernels present_cr(PRTHS,ZRTHS_PPM) PRTHS(:,:,:) = PRTHS (:,:,:) + ZRTHS_PPM (:,:,:) / KSPLIT - !dir$ concurrent - IF (GTKE) PRTKES_ADV(:,:,:) = PRTKES_ADV(:,:,:) + ZRTKES_PPM(:,:,:) / KSPLIT - !dir$ concurrent - IF (KRR /=0) PRRS (:,:,:,:) = PRRS (:,:,:,:) + ZRRS_PPM (:,:,:,:) / KSPLIT - !dir$ concurrent - IF (KSV /=0 ) PRSVS (:,:,:,:) = PRSVS (:,:,:,:) + ZRSVS_PPM (:,:,:,:) / KSPLIT + IF (GTKE) THEN + PRTKES_ADV(:,:,:) = PRTKES_ADV(:,:,:) + ZRTKES_PPM(:,:,:) / KSPLIT + END IF + IF (KRR /=0) THEN + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU,JR=1:KRR) + PRRS (:,:,:,:) = PRRS (:,:,:,:) + ZRRS_PPM (:,:,:,:) / KSPLIT + !$mnh_end_expand_array() + END IF + IF (KSV /=0 ) THEN + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU,JSV=1:KSV) + PRSVS (:,:,:,:) = PRSVS (:,:,:,:) + ZRSVS_PPM (:,:,:,:) / KSPLIT + !$mnh_end_expand_array() + END IF !$acc end kernels ! IF (JSPL<KSPLIT) THEN @@ -939,18 +964,16 @@ DO JSPL=1,KSPLIT !$acc end kernels END IF !$acc kernels - !$acc_nv loop independent collapse(4) - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU, JR=1:KRR ) + !$mnh_do_concurrent( JI=1:JIU,JJ=1:JJU,JK=1:JKU, JR=1:KRR ) ZR(JI,JJ,JK,JR) = ZR(JI,JJ,JK,JR) + ( ZRRS_PPM(JI,JJ,JK,JR) + ZRRS_OTHER(JI,JJ,JK,JR) + PRRS_CLD(JI,JJ,JK,JR) ) & * ZTSTEP_PPM / PRHODJ(JI,JJ,JK) - END DO !CONCURRENT + !$mnh_end_do() !CONCURRENT !$acc loop seq DO JSV = 1, KSV - !$acc_nv loop independent collapse(3) - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) + !$mnh_do_concurrent ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) ZSV(JI,JJ,JK,JSV) = ZSV(JI,JJ,JK,JSV) + ( ZRSVS_PPM(JI,JJ,JK,JSV) + ZRSVS_OTHER(JI,JJ,JK,JSV) + & PRSVS_CLD(JI,JJ,JK,JSV) ) * ZTSTEP_PPM / PRHODJ(JI,JJ,JK) - END DO !CONCURRENT + !$mnh_end_do() !CONCURRENT END DO !$acc end kernels END IF diff --git a/src/ZSOLVER/advection_uvw.f90 b/src/ZSOLVER/advection_uvw.f90 index 6885bca91..58ba42fe5 100644 --- a/src/ZSOLVER/advection_uvw.f90 +++ b/src/ZSOLVER/advection_uvw.f90 @@ -437,30 +437,24 @@ DO JSPL=1,ISPLIT ! Tendencies on wind ! acc update device(ZRUS_ADV,ZRVS_ADV,ZRWS_ADV) !$acc kernels -#ifdef MNH_COMPILER_NVHPC -!$acc loop independent collapse(3) -#endif -DO CONCURRENT (JI=1:IIU , JJ=1:IJU , JK=1:IKU ) +!$mnh_do_concurrent(JI=1:IIU,JJ=1:IJU,JK=1:IKU ) PRUS(JI,JJ,JK) = PRUS(JI,JJ,JK) + ZRUS_ADV(JI,JJ,JK) / ISPLIT PRVS(JI,JJ,JK) = PRVS(JI,JJ,JK) + ZRVS_ADV(JI,JJ,JK) / ISPLIT PRWS(JI,JJ,JK) = PRWS(JI,JJ,JK) + ZRWS_ADV(JI,JJ,JK) / ISPLIT -END DO +!$mnh_end_do() IF (JSPL<ISPLIT) THEN ! ! Guesses for next time splitting loop ! ! -#ifdef MNH_COMPILER_NVHPC -!$acc loop independent collapse(3) -#endif -DO CONCURRENT (JI=1:IIU , JJ=1:IJU , JK=1:IKU ) +!$mnh_do_concurrent(JI=1:IIU,JJ=1:IJU,JK=1:IKU) ZU(JI,JJ,JK) = ZU(JI,JJ,JK) + ZTSTEP / ZMXM_RHODJ(JI,JJ,JK) * & (ZRUS_OTHER(JI,JJ,JK) + ZRUS_ADV(JI,JJ,JK)) ZV(JI,JJ,JK) = ZV(JI,JJ,JK) + ZTSTEP / ZMYM_RHODJ(JI,JJ,JK) * & (ZRVS_OTHER(JI,JJ,JK) + ZRVS_ADV(JI,JJ,JK)) ZW(JI,JJ,JK) = ZW(JI,JJ,JK) + ZTSTEP / ZMZM_RHODJ(JI,JJ,JK) * & (ZRWS_OTHER(JI,JJ,JK) + ZRWS_ADV(JI,JJ,JK)) -END DO +!$mnh_end_do() END IF !$acc end kernels ! diff --git a/src/ZSOLVER/advection_uvw_cen.f90 b/src/ZSOLVER/advection_uvw_cen.f90 index 313a36457..a4055eda1 100644 --- a/src/ZSOLVER/advection_uvw_cen.f90 +++ b/src/ZSOLVER/advection_uvw_cen.f90 @@ -403,7 +403,8 @@ ELSEIF (HUVW_ADV_SCHEME=='CEN4TH') THEN ! END IF ! -!$acc kernels present( ZRUS, ZRVS, ZRWS, ZMXM_RHODJ, ZMYM_RHODJ, ZMZM_RHODJ ) +!$acc kernels present(ZRUS,ZRVS,ZRWS,ZMXM_RHODJ,ZMYM_RHODJ,ZMZM_RHODJ) & +!$acc present_cr(PRUS,PRVS,PRWS,PDUM,PDWM) ZUS(:,:,:) = ZRUS(:,:,:)/ZMXM_RHODJ(:,:,:)*2.*PTSTEP ZVS(:,:,:) = ZRVS(:,:,:)/ZMYM_RHODJ(:,:,:)*2.*PTSTEP ZWS(:,:,:) = ZRWS(:,:,:)/ZMZM_RHODJ(:,:,:)*2.*PTSTEP diff --git a/src/ZSOLVER/contrav.f90 b/src/ZSOLVER/contrav.f90 index 78b9f656f..2976ea754 100644 --- a/src/ZSOLVER/contrav.f90 +++ b/src/ZSOLVER/contrav.f90 @@ -708,23 +708,20 @@ IF (KADV_ORDER == 2 ) THEN #endif !$acc kernels ! -!$acc_nv loop independent collapse(3) - do concurrent (ji=iib:iie,jj=1:iju,jk=ikb:ike+1) + !$mnh_do_concurrent(ji=iib:iie,jj=1:iju,jk=ikb:ike+1) Z1(ji, jj, jk ) = ( PRUCT(ji, jj, jk ) + PRUCT(ji, jj, jk - 1 ) ) * PDZX (ji, jj, jk ) * 0.25 & + ( PRUCT(ji + 1, jj, jk ) + PRUCT(ji + 1, jj, jk - 1 ) ) * PDZX (ji + 1, jj, jk ) * 0.25 - end do -!$acc_nv loop independent collapse(3) - do concurrent (ji=1:iiu,jj=ijb:ije,jk=ikb:ike+1) + !$mnh_end_do() + !$mnh_do_concurrent(ji=1:iiu,jj=ijb:ije,jk=ikb:ike+1) Z2(ji, jj, jk ) = ( PRVCT(ji, jj, jk) + PRVCT( ji, jj, jk - 1) ) * PDZY(ji, jj, jk) * 0.25 & + ( PRVCT(ji, jj + 1, jk) + PRVCT( ji, jj + 1,jk - 1) ) * PDZY(ji, jj + 1, jk) * 0.25 - end do + !$mnh_end_do() PRWCT(:,:,:)=0. -!$acc_nv loop independent collapse(3) - do concurrent (ji=iib:iie,jj=ijb:ije,jk=ikb:ike+1) + !$mnh_do_concurrent(ji=iib:iie,jj=ijb:ije,jk=ikb:ike+1) PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk ) - end do + !$mnh_end_do() ! !$acc end kernels ELSE IF (KADV_ORDER == 4 ) THEN @@ -777,25 +774,23 @@ ELSE IF (KADV_ORDER == 4 ) THEN !PW: OpenACC remarks: *computing only ztmp2 and reusing it at next iteration works ! but ji loop can not be collapsed -> 10x slower on GPU ! *ztmp1 and ztmp2 are not necessary but improve readability (no impact on performance) -!$acc_nv loop independent collapse(3) - do concurrent(ji=IW:IE,jj=1:iju,jk=IKB:IKE+1) + !$mnh_do_concurrent(ji=IW:IE,jj=1:iju,jk=IKB:IKE+1) ztmp1 = ( 9.0 * PDZX(ji, jj, jk ) - ( PDZX(ji+1, jj, jk ) + PDZX(ji, jj, jk ) + PDZX(ji-1, jj, jk ) ) / 3.0 ) / 16.0 ztmp2 = ( 9.0 * PDZX(ji+1, jj, jk ) - ( PDZX(ji+2, jj, jk ) + PDZX(ji+1, jj, jk ) + PDZX(ji, jj, jk ) ) / 3.0 ) / 16.0 Z1(ji, jj, jk ) = 7.0 * ( ( PRUCT(ji, jj, jk ) + PRUCT(ji, jj, jk-1 ) ) * ztmp1 & + ( PRUCT(ji+1, jj, jk ) + PRUCT(ji+1, jj, jk-1 ) ) * ztmp2 ) / 12.0 & - 0.5 * ( ( PRUCT(ji-1, jj, jk ) + PRUCT(ji-1, jj, jk-1 ) ) * PDZX(ji-1, jj, jk) & + ( PRUCT(ji+2, jj, jk ) + PRUCT(ji+2, jj, jk-1 ) ) * PDZX(ji+2, jj, jk) ) / 12.0 - end do + !$mnh_end_do() ! -!$acc_nv loop independent collapse(3) - do concurrent(ji=1:iiu,jj=is:in,jk=IKB:IKE+1) + !$mnh_do_concurrent(ji=1:iiu,jj=is:in,jk=IKB:IKE+1) ztmp1 = ( 9.0 * PDZY(ji, jj, jk ) - ( PDZY(ji, jj+1, jk ) + PDZY(ji, jj, jk ) + PDZY(ji, jj-1, jk ) ) / 3.0 ) / 16.0 ztmp2 = ( 9.0 * PDZY(ji, jj+1, jk ) - ( PDZY(ji, jj+2, jk ) + PDZY(ji, jj+1, jk ) + PDZY(ji, jj, jk ) ) / 3.0 ) / 16.0 Z2(ji, jj, jk ) = 7.0 * ( ( PRVCT(ji, jj, jk ) + PRVCT(ji, jj, jk-1 ) ) * ztmp1 & + ( PRVCT(ji, jj+1, jk ) + PRVCT(ji, jj+1, jk-1 ) ) * ztmp2 ) / 12.0 & - 0.5 * ( ( PRVCT(ji, jj-1, jk ) + PRVCT(ji, jj-1, jk-1 ) ) * PDZY(ji, jj-1, jk ) & + ( PRVCT(ji, jj+2, jk ) + PRVCT(ji, jj+2, jk-1 ) ) * PDZY(ji, jj+2, jk ) ) / 12.0 - end do + !$mnh_end_do() !$acc end kernels ! !!$CALL MPPDB_CHECK3DM("contrav_device :: dom Z1/Z2",PRECISION,Z1,Z2) @@ -804,27 +799,25 @@ ELSE IF (KADV_ORDER == 4 ) THEN ! !!$ IF (NHALO==1) THEN !$acc kernels async -!$acc_nv loop independent collapse(2) - do concurrent(jj=1:iju,jk=IKB:IKE+1) + !$mnh_do_concurrent(jj=1:iju,jk=IKB:IKE+1) ztmp1 = ( 9.0 * PDZX(IIE, jj, jk ) - ( PDZX(IIE+1, jj, jk ) + PDZX(IIE, jj, jk ) + PDZX(IIE-1, jj, jk ) ) / 3.0 ) / 16.0 ztmp2 = ( 9.0 * PDZX(IIE+1, jj, jk ) - ( ZDZX_EAST(jj, jk ) + PDZX(IIE+1, jj, jk ) + PDZX(IIE, jj, jk ) ) / 3.0 ) / 16.0 Z1(IIE, jj, jk ) = 7.0 * ( ( PRUCT(IIE, jj, jk ) + PRUCT(IIE, jj, jk-1 ) ) * ztmp1 & + ( PRUCT(IIE+1, jj, jk ) + PRUCT(IIE+1, jj, jk-1 ) ) * ztmp2 ) / 12.0 & - 0.5 * ( ( PRUCT(IIE-1, jj, jk ) + PRUCT(IIE-1, jj, jk-1 ) ) * PDZX(IIE-1, jj, jk) & + ( ZU_EAST (jj, jk ) + ZU_EAST (jj, jk-1 ) ) * ZDZX_EAST (jj, jk) ) / 12.0 - end do + !$mnh_end_do() !$acc end kernels ! !$acc kernels async -!$acc_nv loop independent collapse(2) - do concurrent(ji=1:iiu,jk=IKB:IKE+1) + !$mnh_do_concurrent(ji=1:iiu,jk=IKB:IKE+1) ztmp1 = ( 9.0 * PDZY(ji, IJE, jk) - ( PDZY (ji, IJE+1, jk) + PDZY(ji, IJE, jk) + PDZY(ji, IJE-1, jk) ) / 3.0 ) / 16.0 ztmp2 = ( 9.0 * PDZY(ji, IJE+1, jk) - ( ZDZY_NORTH(ji, jk) + PDZY(ji, IJE+1, jk) + PDZY(ji, IJE, jk) ) / 3.0 ) / 16.0 Z2(ji, IJE, jk ) = 7.0 * ( ( PRVCT (ji, IJE, jk ) + PRVCT (ji, IJE, jk-1 ) ) * ztmp1 & + ( PRVCT (ji, IJE+1, jk ) + PRVCT (ji, IJE+1, jk-1 ) ) * ztmp2 ) / 12.0 & - 0.5 * ( ( PRVCT (ji, IJE-1, jk ) + PRVCT (ji, IJE-1, jk-1 ) ) * PDZY (ji, IJE-1, jk ) & + ( ZV_NORTH(ji, jk ) + ZV_NORTH(ji, jk-1 ) ) * ZDZY_NORTH(ji, jk ) ) / 12.0 - end do + !$mnh_end_do() !$acc end kernels !$acc wait !!$ END IF @@ -871,10 +864,9 @@ ELSE IF (KADV_ORDER == 4 ) THEN !!$ !!$ CALL MPPDB_CHECK3DM("contrav_device ::Z1/Z2/ PDZZ",PRECISION,Z1,Z2,PDZZ) PRWCT(:,:,:)=0. -!$acc_nv loop independent collapse(3) - do concurrent (ji=iib:iie,jj=ijb:ije,jk=ikb:ike+1) + !$mnh_do_concurrent (ji=iib:iie,jj=ijb:ije,jk=ikb:ike+1) PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk ) - end do + !$mnh_end_do() !$acc end kernels ! CALL MPPDB_CHECK3DM("contrav_device :: PRWCT/Z1/Z2",PRECISION,PRWCT,Z1,Z2) diff --git a/src/ZSOLVER/dotprod.f90 b/src/ZSOLVER/dotprod.f90 index 2c4791cc6..dab2e6a88 100644 --- a/src/ZSOLVER/dotprod.f90 +++ b/src/ZSOLVER/dotprod.f90 @@ -185,18 +185,16 @@ CALL MNH_MEM_GET(ZDOTPROD, ILBXB,ILBXE ,ILBYB,ILBYE ) #endif !$acc kernels present(ZDOTPROD) ZDOTPROD(:,:) = 0. -!$acc loop seq -!dir nextscalar -DO JK = IKB-1,IKE+1 - !DO CONCURRENT (JI=ILBXB:ILBXE,JJ=ILBYB:ILBYE) - !$acc loop collapse(2) independent - DO JJ = ILBYB,ILBYE - DO JI = ILBXB,ILBXE - ZDOTPROD(JI,JJ) = ZDOTPROD(JI,JJ) + PA(JI,JJ,JK) * PB(JI,JJ,JK) - END DO - END DO -END DO !$acc end kernels +!$acc parallel +!$mnh_do_concurrent(JI=ILBXB:ILBXE,JJ=ILBYB:ILBYE) + !dir$ nextscalar + !$acc loop seq + DO JK = IKB-1,IKE+1 + ZDOTPROD(JI,JJ) = ZDOTPROD(JI,JJ) + PA(JI,JJ,JK) * PB(JI,JJ,JK) + END DO +!$mnh_end_do() +!$acc end parallel !$acc update host(ZDOTPROD) PDOTPROD = SUM_DD_R2_ll(ZDOTPROD) !JUAN16 diff --git a/src/ZSOLVER/get_halo.f90 b/src/ZSOLVER/get_halo.f90 index 719fa0a19..e53338d1f 100644 --- a/src/ZSOLVER/get_halo.f90 +++ b/src/ZSOLVER/get_halo.f90 @@ -464,6 +464,11 @@ INTEGER,PARAMETER :: IS_WEST=1 , IS_EAST=2, IS_SOUTH=3, IS_NORTH=4 LOGICAL :: LX , LY INTEGER :: NB_REQ, IERR ! +INTEGER :: JI,JJ,JK, JIU,JJU,JKU + +JIU = SIZE(PSRC,1) +JJU = SIZE(PSRC,2) +JKU = SIZE(PSRC,3) CALL INIT_HALO_D() @@ -553,12 +558,16 @@ END IF IF (LX) THEN IF (.NOT. GWEST) THEN !$acc kernels async(IS_WEST) - ZWEST_IN ( IIB:IIB+IHALO_1 , IJB:IJE , : ) = PSRC( IIB:IIB+IHALO_1 , IJB:IJE , : ) + !$mnh_expand_array(JI=IIB:IIB+IHALO_1 , JJ=IJB:IJE , JK=1:JKU ) + ZWEST_IN ( IIB:IIB+IHALO_1 , IJB:IJE , : ) = PSRC( IIB:IIB+IHALO_1 , IJB:IJE , : ) + !$mnh_end_expand_array() !$acc end kernels END IF IF (.NOT.GEAST) THEN !$acc kernels async(IS_EAST) - ZEAST_IN ( IIE-IHALO_1:IIE , IJB:IJE , : ) = PSRC( IIE-IHALO_1:IIE , IJB:IJE , : ) + !$mnh_expand_array(JI=IIE-IHALO_1:IIE , JJ=IJB:IJE , JK=1:JKU) + ZEAST_IN ( IIE-IHALO_1:IIE , IJB:IJE , : ) = PSRC( IIE-IHALO_1:IIE , IJB:IJE , : ) + !$mnh_end_expand_array() !$acc end kernels ENDIF END IF @@ -566,12 +575,16 @@ END IF IF (LY) THEN IF (.NOT.GSOUTH) THEN !$acc kernels async(IS_SOUTH) - ZSOUTH_IN ( IIB:IIE , IJB:IJB+IHALO_1 , : ) = PSRC( IIB:IIE , IJB:IJB+IHALO_1 , : ) + !$mnh_expand_array(JI=IIB:IIE , JJ=IJB:IJB+IHALO_1 , JK=1:JKU ) + ZSOUTH_IN ( IIB:IIE , IJB:IJB+IHALO_1 , : ) = PSRC( IIB:IIE , IJB:IJB+IHALO_1 , : ) + !$mnh_end_expand_array() !$acc end kernels ENDIF IF (.NOT.GNORTH) THEN !$acc kernels async(IS_NORTH) - ZNORTH_IN ( IIB:IIE , IJE-IHALO_1:IJE , : ) = PSRC( IIB:IIE , IJE-IHALO_1:IJE , : ) + !$mnh_expand_array(JI=IIB:IIE , JJ=IJE-IHALO_1:IJE , JK=1:JKU ) + ZNORTH_IN ( IIB:IIE , IJE-IHALO_1:IJE , : ) = PSRC( IIB:IIE , IJE-IHALO_1:IJE , : ) + !$mnh_end_expand_array() !$acc end kernels ENDIF ENDIF @@ -676,6 +689,11 @@ INTEGER,PARAMETER :: IS_WEST=1 , IS_EAST=2, IS_SOUTH=3, IS_NORTH=4 LOGICAL :: LX , LY INTEGER :: NB_REQ, IERR ! +INTEGER :: JI,JJ,JK, JIU,JJU,JKU + +JIU = SIZE(PSRC,1) +JJU = SIZE(PSRC,2) +JKU = SIZE(PSRC,3) CALL INIT_HALO_D() @@ -715,7 +733,9 @@ IF (LX) THEN !$acc update device(ZWEST_OUT) async(IS_WEST) #endif !$acc kernels async(IS_WEST) - PSRC( 1:IIB-1 , IJB:IJE , : ) = ZWEST_OUT( 1:IIB-1 , IJB:IJE , : ) + !$mnh_expand_array(JI=1:IIB-1 , JJ=IJB:IJE , JK=1:JKU ) + PSRC( 1:IIB-1 , IJB:IJE , : ) = ZWEST_OUT( 1:IIB-1 , IJB:IJE , : ) + !$mnh_end_expand_array() !$acc end kernels ENDIF IF (.NOT.GEAST) THEN @@ -723,7 +743,9 @@ IF (LX) THEN !$acc update device(ZEAST_OUT) async(IS_EAST) #endif !$acc kernels async(IS_EAST) - PSRC( IIE+1:IIU , IJB:IJE , : ) = ZEAST_OUT( IIE+1:IIU , IJB:IJE , : ) + !$mnh_expand_array(JI=IIE+1:IIU , JJ=IJB:IJE , JK=1:JKU ) + PSRC( IIE+1:IIU , IJB:IJE , : ) = ZEAST_OUT( IIE+1:IIU , IJB:IJE , : ) + !$mnh_end_expand_array() !$acc end kernels ENDIF END IF @@ -733,7 +755,9 @@ IF (LY) THEN !$acc update device(ZSOUTH_OUT) async(IS_SOUTH) #endif !$acc kernels async(IS_SOUTH) - PSRC( IIB:IIE , 1:IJB-1 , : ) = ZSOUTH_OUT( IIB:IIE , 1:IJB-1 , : ) + !$mnh_expand_array(JI=IIB:IIE , JJ=1:IJB-1 , JK=1:JKU ) + PSRC( IIB:IIE , 1:IJB-1 , : ) = ZSOUTH_OUT( IIB:IIE , 1:IJB-1 , : ) + !$mnh_end_expand_array() !$acc end kernels ENDIF IF (.NOT.GNORTH) THEN @@ -741,7 +765,9 @@ IF (LY) THEN !$acc update device(ZNORTH_OUT) async(IS_NORTH) #endif !$acc kernels async(IS_NORTH) - PSRC( IIB:IIE , IJE+1:IJU , : ) = ZNORTH_OUT ( IIB:IIE , IJE+1:IJU , : ) + !$mnh_expand_array(JI=IIB:IIE , JJ=IJE+1:IJU , JK=1:JKU ) + PSRC( IIB:IIE , IJE+1:IJU , : ) = ZNORTH_OUT ( IIB:IIE , IJE+1:IJU , : ) + !$mnh_end_expand_array() !$acc end kernels ENDIF END IF diff --git a/src/ZSOLVER/ppm.f90 b/src/ZSOLVER/ppm.f90 index 608a9b2ae..51bf1a52a 100644 --- a/src/ZSOLVER/ppm.f90 +++ b/src/ZSOLVER/ppm.f90 @@ -525,10 +525,7 @@ ZFNEG(:,:,:) = PSRC(:,:,:) CALL GET_HALO_D(PSRC,HDIR="01_X", HNAME='PSRC') ! !$acc kernels -!$acc loop independent collapse(3) - do jk = 1, iku - do jj = 1, iju - do ji = 1, iiu +!$mnh_do_concurrent (ji=1:iiu,jj=1:iju,jk=1:iku) PR (ji, jj, jk ) = PSRC(ji, jj, jk ) ZQL (ji, jj, jk ) = PSRC(ji, jj, jk ) ZQR (ji, jj, jk ) = PSRC(ji, jj, jk ) @@ -538,9 +535,7 @@ CALL GET_HALO_D(PSRC,HDIR="01_X", HNAME='PSRC') ZQL0 (ji, jj, jk ) = PSRC(ji, jj, jk ) ZQR0 (ji, jj, jk ) = PSRC(ji, jj, jk ) ZQ60 (ji, jj, jk ) = PSRC(ji, jj, jk ) - end do - end do -end do +!$mnh_end_do() ! #if 0 ZFPOS(:,1:IJS,:)=PSRC(:,1:IJS,:) diff --git a/src/ZSOLVER/turb.f90 b/src/ZSOLVER/turb.f90 index a173a86c6..c72f7d21d 100644 --- a/src/ZSOLVER/turb.f90 +++ b/src/ZSOLVER/turb.f90 @@ -14,7 +14,7 @@ module mode_turb #if defined(MNH_BITREP) || defined(MNH_BITREP_OMP) use modi_bitrep #endif -#ifdef MNH_COMPILER_CCE +#if defined(MNH_COMPILER_CCE) && defined(MNH_BITREP_OMP) !$mnh_undef(LOOP) !$mnh_undef(OPENACC) #endif @@ -760,9 +760,9 @@ ELSE #if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) ZEXN(:,:,:) = (PPABST(:,:,:)/XP00) ** (XRD/XCPD) #else -DO CONCURRENT(JI=1:JIU,JJ=1:JJU,JK=1:JKU) +!$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU) ZEXN(JI,JJ,JK) = BR_POW(PPABST(JI,JJ,JK)/XP00,XRD/XCPD) -END DO +!$mnh_end_do() #endif END IF ! @@ -811,18 +811,20 @@ IF (KRRL >=1) THEN ZLSOCPEXNM,ZAMOIST_ICE,ZATHETA_ICE) ! !$acc kernels present_cr( zamoist, zatheta, zlocpexnm, zlvocpexnm, zlsocpexnm, zamoist_ice, zatheta_ice ) - DO CONCURRENT(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU) IF (PRT(JI,JJ,JK,2)+PRT(JI,JJ,JK,4)>0.0) THEN ZFRAC_ICE(JI,JJ,JK) = PRT(JI,JJ,JK,4) / ( PRT(JI,JJ,JK,2)+PRT(JI,JJ,JK,4) ) END IF - END DO + !$mnh_end_do() ! + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) ZLOCPEXNM(:,:,:) = (1.0-ZFRAC_ICE(:,:,:))*ZLVOCPEXNM(:,:,:) & +ZFRAC_ICE(:,:,:) *ZLSOCPEXNM(:,:,:) ZAMOIST(:,:,:) = (1.0-ZFRAC_ICE(:,:,:))*ZAMOIST(:,:,:) & +ZFRAC_ICE(:,:,:) *ZAMOIST_ICE(:,:,:) ZATHETA(:,:,:) = (1.0-ZFRAC_ICE(:,:,:))*ZATHETA(:,:,:) & +ZFRAC_ICE(:,:,:) *ZATHETA_ICE(:,:,:) + !$mnh_end_expand_array() !$acc end kernels !$acc end data @@ -877,7 +879,7 @@ END IF ! loop end on KRRL >= 1 IF ( KRRL >= 1 ) THEN !$acc kernels present_cr( zlocpexnm ) IF ( KRRI >= 1 ) THEN - DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) + !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU) ! Rnp at t PRT(JI,JJ,JK,1) = PRT(JI,JJ,JK,1) + PRT(JI,JJ,JK,2) + PRT(JI,JJ,JK,4) PRRS(JI,JJ,JK,1) = PRRS(JI,JJ,JK,1) + PRRS(JI,JJ,JK,2) + PRRS(JI,JJ,JK,4) @@ -886,16 +888,16 @@ IF ( KRRL >= 1 ) THEN - ZLSOCPEXNM(JI,JJ,JK) * PRT(JI,JJ,JK,4) PRTHLS(JI,JJ,JK) = PRTHLS(JI,JJ,JK) - ZLVOCPEXNM(JI,JJ,JK) * PRRS(JI,JJ,JK,2) & - ZLSOCPEXNM(JI,JJ,JK) * PRRS(JI,JJ,JK,4) - ENDDO + !$mnh_end_do() ELSE - DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) + !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU) ! Rnp at t PRT(JI,JJ,JK,1) = PRT(JI,JJ,JK,1) + PRT(JI,JJ,JK,2) PRRS(JI,JJ,JK,1) = PRRS(JI,JJ,JK,1) + PRRS(JI,JJ,JK,2) ! Theta_l at t PTHLT(JI,JJ,JK) = PTHLT(JI,JJ,JK) - ZLOCPEXNM(JI,JJ,JK) * PRT(JI,JJ,JK,2) PRTHLS(JI,JJ,JK) = PRTHLS(JI,JJ,JK) - ZLOCPEXNM(JI,JJ,JK) * PRRS(JI,JJ,JK,2) - ENDDO + !$mnh_end_do() END IF !$acc end kernels END IF @@ -1083,11 +1085,10 @@ ENDIF ZCDUEFF(:,:) =-SQRT ( (PSFU(:,:)**2 + PSFV(:,:)**2) / & (XMNH_TINY + ZUSLOPE(:,:)**2 + ZVSLOPE(:,:)**2 ) ) #else - !$acc_nv loop independent collapse(2) - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU ) + !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU) ZCDUEFF(JI,JJ) =-SQRT ( (BR_P2(PSFU(JI,JJ)) + BR_P2(PSFV(JI,JJ))) / & (XMNH_TINY + BR_P2(ZUSLOPE(JI,JJ)) + BR_P2(ZVSLOPE(JI,JJ)) ) ) - END DO + !$mnh_end_do() #endif !$acc end kernels ! @@ -1879,9 +1880,9 @@ CALL MNH_MEM_GET( zdrvsatdt, size( pexn, 1 ), size( pexn, 2 ), size( pexn, 3 ) ) #if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) ZRVSAT(:,:,:) = EXP( PALP - PBETA/PT(:,:,:) - PGAM*ALOG( PT(:,:,:) ) ) #else - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) + !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU) ZRVSAT(JI,JJ,JK) = BR_EXP( PALP - PBETA/PT(JI,JJ,JK) - PGAM*BR_LOG( PT(JI,JJ,JK) ) ) - END DO + !$mnh_end_do() #endif !$acc end kernels !$acc kernels present_cr(ZRVSAT,ZDRVSATDT) @@ -1915,8 +1916,7 @@ CALL MNH_MEM_GET( zdrvsatdt, size( pexn, 1 ), size( pexn, 2 ), size( pexn, 3 ) ) - ZDRVSATDT(:,:,:) & ) #else -!$acc_nv loop independent collapse(3) -DO CONCURRENT(JI=1:JIU,JJ=1:JJU,JK=1:JKU) +!$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU) PATHETA(JI,JJ,JK)= PAMOIST(JI,JJ,JK) * PEXN(JI,JJ,JK) * & ( ( ZRVSAT(JI,JJ,JK) - PRT(JI,JJ,JK,1) ) * PLOCPEXN(JI,JJ,JK) / & ( 1. + ZDRVSATDT(JI,JJ,JK) * PLOCPEXN(JI,JJ,JK) ) * & @@ -1928,16 +1928,15 @@ DO CONCURRENT(JI=1:JIU,JJ=1:JJU,JK=1:JKU) ) & - ZDRVSATDT(JI,JJ,JK) & ) -ENDDO +!$mnh_end_do() #endif !$acc end kernels !* 1.7 Lv/Cph/Exner at t-1 ! !$acc kernels present_cr(PLOCPEXN) -!$acc_nv loop independent collapse(3) -DO CONCURRENT(JI=1:JIU,JJ=1:JJU,JK=1:JKU) +!$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU) PLOCPEXN(JI,JJ,JK) = PLOCPEXN(JI,JJ,JK) / PEXN(JI,JJ,JK) -END DO +!$mnh_end_do() !$acc end kernels if ( mppdb_initialized ) then @@ -2275,9 +2274,9 @@ IF (ODZ) THEN #if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) PLM(:,:,:) = ( PLM(:,:,:) * ZTMP1_DEVICE(:,:,:) * ZTMP2_DEVICE(:,:,:) ) ** (1./3.) #else -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) +!$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU) PLM(JI,JJ,JK) = BR_POW( PLM(JI,JJ,JK) * ZTMP1_DEVICE(JI,JJ,JK) * ZTMP2_DEVICE(JI,JJ,JK), 1./3. ) -ENDDO +!$mnh_end_do() #endif !$acc end kernels #endif @@ -2309,9 +2308,9 @@ ELSE #if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) PLM(:,:,:) = ( ZTMP1_DEVICE * ZTMP2_DEVICE ) ** (1./2.) #else - DO CONCURRENT( JI=1:JIU, JJ=1:JJU, JK=1:JKU ) + !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU) PLM(JI,JJ,JK) = BR_POW( ZTMP1_DEVICE(JI,JJ,JK) * ZTMP2_DEVICE(JI,JJ,JK), 1. / 2. ) - END DO + !$mnh_end_do() #endif !$acc end kernels #endif @@ -2360,12 +2359,12 @@ IF (.NOT. ORMC01) THEN END IF ! !$acc kernels -DO CONCURRENT(JI=1:JIU , JJ=1:JJU ) - PLM(JI,JJ,KKA) = PLM(JI,JJ,KKB ) -END DO -DO CONCURRENT(JI=1:JIU , JJ=1:JJU ) - PLM(JI,JJ,KKU ) = PLM(JI,JJ,KKE) -END DO +!$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU) + PLM(JI,JJ,KKA) = PLM(JI,JJ,KKB) +! mnh_end_do() +! mnh_do_concurrent(JI=1:JIU,JJ=1:JJU) + PLM(JI,JJ,KKU) = PLM(JI,JJ,KKE) +!$mnh_end_do() !$acc end kernels !$acc end data @@ -2580,9 +2579,9 @@ IF ( HTURBDIM /= '1DIM' ) THEN ! 3D turbulence scheme call Mppdb_check( plm, "Dear mid1:plm" ) end if !$acc kernels -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) +!$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=1:JKU) PLM(JI,JJ,JK) = BR_POW( PLM(JI,JJ,JK)*ZTMP1_DEVICE(JI,JJ,JK) *ZTMP2_DEVICE(JI,JJ,JK) , 1./3. ) -ENDDO +!$mnh_end_do() !$acc end kernels if ( mppdb_initialized ) then call Mppdb_check( plm, "Dear mid2:plm" ) @@ -2603,8 +2602,8 @@ CALL EMOIST(KRR,KRRI,PTHLT,PRT,PLOCPEXNM,PAMOIST,PSRCT,ZEMOIST) ! !$acc kernels present(ZWORK2D,PLM) IF (KRR>0) THEN - !$acc_nv loop independent collapse(3) private(ZVAR) - DO CONCURRENT( JI=1:JIU, JJ=1:JJU, JK = KKTB+1:KKTE-1) + ! acc loop private(ZVAR) + !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=KKTB+1:KKTE-1) ZDTHLDZ(JI,JJ,JK)= 0.5*((PTHLT(JI,JJ,JK+KKL)-PTHLT(JI,JJ,JK ))/PDZZ(JI,JJ,JK+KKL)+ & (PTHLT(JI,JJ,JK )-PTHLT(JI,JJ,JK-KKL))/PDZZ(JI,JJ,JK )) ZDRTDZ(JI,JJ,JK) = 0.5*((PRT(JI,JJ,JK+KKL,1)-PRT(JI,JJ,JK ,1))/PDZZ(JI,JJ,JK+KKL)+ & @@ -2620,10 +2619,10 @@ IF (KRR>0) THEN PLM(JI,JJ,JK)=MAX(XMNH_EPSILON,MIN(PLM(JI,JJ,JK), & 0.76* SQRT(PTKET(JI,JJ,JK)/ZVAR))) END IF - END DO + !$mnh_end_do() ELSE! For dry atmos or unsalted ocean runs - !$acc_nv loop independent collapse(3) private(ZVAR) - DO CONCURRENT( JI=1:JIU, JJ=1:JJU, JK = KKTB+1:KKTE-1) + ! acc loop private(ZVAR) + !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU,JK=KKTB+1:KKTE-1) ZDTHLDZ(JI,JJ,JK)= 0.5*((PTHLT(JI,JJ,JK+KKL)-PTHLT(JI,JJ,JK ))/PDZZ(JI,JJ,JK+KKL)+ & (PTHLT(JI,JJ,JK )-PTHLT(JI,JJ,JK-KKL))/PDZZ(JI,JJ,JK )) IF (GOCEAN) THEN @@ -2636,7 +2635,7 @@ ELSE! For dry atmos or unsalted ocean runs PLM(JI,JJ,JK)=MAX(XMNH_EPSILON,MIN(PLM(JI,JJ,JK), & 0.76* SQRT(PTKET(JI,JJ,JK)/ZVAR))) END IF - END DO + !$mnh_end_do() END IF ! special case near the surface ZDTHLDZ(:,:,KKB)=(PTHLT(:,:,KKB+KKL)-PTHLT(:,:,KKB))/PDZZ(:,:,KKB+KKL) @@ -2653,12 +2652,9 @@ IF (GOCEAN) THEN ZWORK2D(:,:)=XG*(XALPHAOC*ZDTHLDZ(:,:,KKB)-XBETAOC*ZDRTDZ(:,:,KKB)) #else !PW: bug: nvhpc 21.11 does not parallelize this loop even with loop independent directive! -#ifdef MNH_COMPILER_NVHPC -!$acc loop independent collapse(2) -#endif - DO CONCURRENT( JI = 1 : JIU, JJ = 1 : JJU ) + !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU) ZWORK2D(JI,JJ)=XG*(XALPHAOC*ZDTHLDZ(JI,JJ,KKB)-XBETAOC*ZDRTDZ(JI,JJ,KKB)) - END DO + !$mnh_end_do() #endif ELSE #if 0 @@ -2667,21 +2663,18 @@ ELSE (ZETHETA(:,:,KKB)*ZDTHLDZ(:,:,KKB)+ZEMOIST(:,:,KKB)*ZDRTDZ(:,:,KKB)) #else !PW: bug: nvhpc 21.11 does not parallelize this loop even with loop independent directive! -#ifdef MNH_COMPILER_NVHPC -!$acc loop independent collapse(2) -#endif - DO CONCURRENT( JI = 1 : JIU, JJ = 1 : JJU ) + !$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU) ZWORK2D(JI,JJ)=XG/PTHVREF(JI,JJ,KKB)* & (ZETHETA(JI,JJ,KKB)*ZDTHLDZ(JI,JJ,KKB)+ZEMOIST(JI,JJ,KKB)*ZDRTDZ(JI,JJ,KKB)) - END DO + !$mnh_end_do() #endif END IF -DO CONCURRENT(JI=1:JIU,JJ=1:JJU) +!$mnh_do_concurrent(JI=1:JIU,JJ=1:JJU) IF (ZWORK2D(JI,JJ)>0.) THEN PLM(JI,JJ,KKB)=MAX(XMNH_EPSILON,MIN( PLM(JI,JJ,KKB), & 0.76* SQRT(PTKET(JI,JJ,KKB)/ZWORK2D(JI,JJ)))) END IF -END DO +!$mnh_end_do() ! ! mixing length limited by the distance normal to the surface (with the same factor as for BL89) ! -- GitLab