diff --git a/src/ZSOLVER/advection_metsv.f90 b/src/ZSOLVER/advection_metsv.f90 index c9cbce0aeec1af60169da227853743a30d66a052..034505c02822ae387020f56a68dd01e87c82c9d3 100644 --- a/src/ZSOLVER/advection_metsv.f90 +++ b/src/ZSOLVER/advection_metsv.f90 @@ -188,9 +188,14 @@ use mode_sum_ll, only: MAX_ll use mode_tools_ll, only: GET_INDICE_ll, lnorth_ll, lsouth_ll, least_ll, lwest_ll ! USE MODI_ADV_BOUNDARIES -#ifdef MNH_BITREP +#if defined(MNH_BITREP) || defined(MNH_BITREP_OMP) USE MODI_BITREP #endif +#ifdef MNH_BITREP_OMP +!$mnh_undef(LOOP) +!$mnh_undef(OPENACC) +#endif + USE MODI_CONTRAV USE MODI_GET_HALO USE MODI_PPM_RHODJ @@ -534,7 +539,8 @@ IF (.NOT. L1D) THEN !$acc end kernels IF (LIBM) THEN !$acc kernels -#ifndef MNH_BITREP +!$mnh_expand_array(JI=IIB:IIE,JJ=IJB:IJE,JK=1:JKU) +#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) ZCFLU(IIB:IIE,IJB:IJE,:) = ZCFLU(IIB:IIE,IJB:IJE,:)*(1.-exp(-(XIBM_LS(IIB:IIE,IJB:IJE,:,2)/& (XRHODJ(IIB:IIE,IJB:IJE,:)/XRHODREF(IIB:IIE,IJB:IJE,:))**(1./3.))**2.)) ZCFLV(IIB:IIE,IJB:IJE,:) = ZCFLV(IIB:IIE,IJB:IJE,:)*(1.-exp(-(XIBM_LS(IIB:IIE,IJB:IJE,:,3)/& @@ -549,12 +555,13 @@ IF (.NOT. L1D) THEN ZCFLW(IIB:IIE,IJB:IJE,:) = ZCFLW(IIB:IIE,IJB:IJE,:)*(1.-Br_exp(-Br_pow(XIBM_LS(IIB:IIE,IJB:IJE,:,4)/& Br_pow(XRHODJ(IIB:IIE,IJB:IJE,:)/XRHODREF(IIB:IIE,IJB:IJE,:),1./3.),2.))) #endif +!$mnh_end_expand_array() WHERE (XIBM_LS(IIB:IIE,IJB:IJE,:,2).GT.(-ZIBM_EPSI)) ZCFLU(IIB:IIE,IJB:IJE,:)=0. WHERE (XIBM_LS(IIB:IIE,IJB:IJE,:,3).GT.(-ZIBM_EPSI)) ZCFLV(IIB:IIE,IJB:IJE,:)=0. WHERE (XIBM_LS(IIB:IIE,IJB:IJE,:,4).GT.(-ZIBM_EPSI)) ZCFLW(IIB:IIE,IJB:IJE,:)=0. !$acc end kernels ENDIF -#ifndef MNH_BITREP +#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) IF (.NOT. L2D) THEN !$acc kernels present_cr(ZCFL) ZCFL(:,:,:) = SQRT(ZCFLU(:,:,:)**2+ZCFLV(:,:,:)**2+ZCFLW(:,:,:)**2) @@ -585,7 +592,7 @@ ELSE !$acc kernels ZCFLU(:,:,:) = 0.0 ; ZCFLV(:,:,:) = 0.0 ; ZCFLW(:,:,:) = 0.0 ZCFLW(IIB:IIE,IJB:IJE,:) = ABS(ZRWCPPM(IIB:IIE,IJB:IJE,:) * PTSTEP) -#ifndef MNH_BITREP +#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) ZCFL(:,:,:) = SQRT(ZCFLW(:,:,:)**2) #else !$acc_nv loop independent collapse(3) diff --git a/src/ZSOLVER/ppm.f90 b/src/ZSOLVER/ppm.f90 index 83f0849e7424d7c723d17c9f989e793359f142eb..608a9b2ae8cabd2515bc1241ae786f4910cd2a04 100644 --- a/src/ZSOLVER/ppm.f90 +++ b/src/ZSOLVER/ppm.f90 @@ -397,9 +397,12 @@ use mode_mppdb use mode_msg #endif -#ifdef MNH_BITREP +#if defined(MNH_BITREP) || defined(MNH_BITREP_OMP) USE MODI_BITREP #endif +#ifdef MNH_BITREP_OMP +USE MODI_BITREPZ +#endif USE MODI_GET_HALO #ifndef MNH_OPENACC USE MODI_SHUMAN @@ -470,7 +473,11 @@ INTEGER :: IJS,IJN #endif LOGICAL :: GWEST , GEAST !------------------------------------------------------------------------------- - +! +#ifdef MNH_BITREP_OMP +CALL SBR_FZ(PSRC(:,:,:)) +#endif +! !$acc data present( PSRC, PCR, PRHO, PR , & !$acc & ZQL, ZQR, ZDQ, ZQ6, ZDMQ, ZQL0, ZQR0, ZQ60, ZFPOS, ZFNEG ) @@ -632,7 +639,7 @@ CASE ('CYCL','WALL') ! In that case one must have HLBCX(1) == HLBCX(2) ZQL(:,IJS:IJN,:) = PSRC(:,IJS:IJN,:) ZQR(:,IJS:IJN,:) = PSRC(:,IJS:IJN,:) ZQ6(:,IJS:IJN,:) = 0.0 -#ifndef MNH_BITREP +#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) ELSEWHERE ( ZQ60(:,IJS:IJN,:)*ZDQ(:,IJS:IJN,:) < -(ZDQ(:,IJS:IJN,:))**2 ) #else ELSEWHERE ( ZQ60(:,IJS:IJN,:)*ZDQ(:,IJS:IJN,:) < -BR_P2(ZDQ(:,IJS:IJN,:)) ) @@ -640,7 +647,7 @@ CASE ('CYCL','WALL') ! In that case one must have HLBCX(1) == HLBCX(2) ZQ6(:,IJS:IJN,:) = 3.0*(ZQL0(:,IJS:IJN,:) - PSRC(:,IJS:IJN,:)) ZQR(:,IJS:IJN,:) = ZQL0(:,IJS:IJN,:) - ZQ6(:,IJS:IJN,:) ZQL(:,IJS:IJN,:) = ZQL0(:,IJS:IJN,:) -#ifndef MNH_BITREP +#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) ELSEWHERE ( ZQ60(:,IJS:IJN,:)*ZDQ(:,IJS:IJN,:) > (ZDQ(:,IJS:IJN,:))**2 ) #else ELSEWHERE ( ZQ60(:,IJS:IJN,:)*ZDQ(:,IJS:IJN,:) > BR_P2(ZDQ(:,IJS:IJN,:)) ) @@ -970,6 +977,10 @@ ENDDO ; ENDDO ; ENDDO ! END SELECT ! +#ifdef MNH_BITREP_OMP +CALL SBR_FZ(PR(:,:,:)) +#endif +! IF (MPPDB_INITIALIZED) THEN !Check all INOUT arrays CALL MPPDB_CHECK(PSRC,"PPM_01_X end:PSRC") @@ -1197,9 +1208,12 @@ use mode_msg #endif use mode_mppdb -#ifdef MNH_BITREP +#if defined(MNH_BITREP) || defined(MNH_BITREP_OMP) USE MODI_BITREP #endif +#ifdef MNH_BITREP_OMP +USE MODI_BITREPZ +#endif USE MODI_GET_HALO #ifndef MNH_OPENACC USE MODI_SHUMAN @@ -1272,7 +1286,11 @@ INTEGER :: IJN,IJS #endif integer :: ji, jj, jk !------------------------------------------------------------------------------- - +! +#ifdef MNH_BITREP_OMP +CALL SBR_FZ(PSRC(:,:,:)) +#endif +! !$acc data present( PSRC, PCR, PRHO, PR, & !$acc & ZQL, ZQR, ZDQ, ZQ6, ZDMQ, ZQL0, ZQR0, ZQ60, ZFPOS, ZFNEG ) @@ -1850,6 +1868,10 @@ CALL GET_HALO_D(ZQL0,HDIR="01_Y", HNAME='ZQL0') ! END SELECT ! +#ifdef MNH_BITREP_OMP +CALL SBR_FZ(PR(:,:,:)) +#endif +! IF (MPPDB_INITIALIZED) THEN !Check all INOUT arrays CALL MPPDB_CHECK(PSRC,"PPM_01_Y end:PSRC") @@ -2076,9 +2098,12 @@ USE MODI_SHUMAN USE MODI_SHUMAN_DEVICE #endif USE MODI_GET_HALO -#ifdef MNH_BITREP +#if defined(MNH_BITREP) || defined(MNH_BITREP_OMP) USE MODI_BITREP #endif +#ifdef MNH_BITREP_OMP +USE MODI_BITREPZ +#endif ! USE MODD_CONF USE MODD_PARAMETERS @@ -2147,7 +2172,11 @@ INTEGER :: I,J,K integer :: ji, jj, jk ! !------------------------------------------------------------------------------- - +! +#ifdef MNH_BITREP_OMP +CALL SBR_FZ(PSRC(:,:,:)) +#endif +! !$acc data present( PSRC, PCR, PRHO, PR, & !$acc & ZQL, ZQR, ZDQ, ZQ6, ZDMQ, ZQL0, ZQR0, ZQ60, ZFPOS, ZFNEG ) IF (MPPDB_INITIALIZED) THEN @@ -2290,7 +2319,7 @@ WHERE ( ZDMQ == 0.0 ) ZQL = PSRC ZQR = PSRC ZQ6 = 0.0 -#ifndef MNH_BITREP +#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) ELSEWHERE ( ZQ60*ZDQ < -(ZDQ)**2 ) #else ELSEWHERE ( ZQ60*ZDQ < -BR_P2(ZDQ) ) @@ -2298,7 +2327,7 @@ ELSEWHERE ( ZQ60*ZDQ < -BR_P2(ZDQ) ) ZQ6 = 3.0*(ZQL0 - PSRC) ZQR = ZQL0 - ZQ6 ZQL = ZQL0 -#ifndef MNH_BITREP +#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) ELSEWHERE ( ZQ60*ZDQ > (ZDQ)**2 ) #else ELSEWHERE ( ZQ60*ZDQ > BR_P2(ZDQ) ) @@ -2419,6 +2448,10 @@ end do !Unnecessary CALL GET_HALO_D(PR) #endif ! +#ifdef MNH_BITREP_OMP +CALL SBR_FZ(PR(:,:,:)) +#endif +! IF (MPPDB_INITIALIZED) THEN !Check all INOUT arrays CALL MPPDB_CHECK(PSRC,"PPM_01_Z end:PSRC") diff --git a/src/ZSOLVER/turb_hor_dyn_corr.f90 b/src/ZSOLVER/turb_hor_dyn_corr.f90 index 850e010d2323a0b7673c250f90808939bfe408fd..b07a15f38506bdd88d9451cef10fd89635332227 100644 --- a/src/ZSOLVER/turb_hor_dyn_corr.f90 +++ b/src/ZSOLVER/turb_hor_dyn_corr.f90 @@ -174,9 +174,13 @@ USE MODI_SHUMAN_DEVICE #endif USE MODI_TRIDIAG_W ! -#ifdef MNH_BITREP +#if defined(MNH_BITREP) || defined(MNH_BITREP_OMP) USE MODI_BITREP #endif +#ifdef MNH_BITREP_OMP +!$mnh_undef(LOOP) +!$mnh_undef(OPENACC) +#endif ! IMPLICIT NONE ! @@ -395,15 +399,12 @@ IKU = SIZE(PUM,3) ! ! !$acc kernels async(1) -#ifndef MNH_BITREP +#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) ZDIRSINZW(:,:) = SQRT( 1. - PDIRCOSZW(:,:)**2 ) #else -#ifdef MNH_COMPILER_NVHPC -!$acc loop independent collapse(2) -#endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU ) - ZDIRSINZW(JI,JJ) = SQRT( 1. - BR_P2(PDIRCOSZW(JI,JJ)) ) -END DO +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU ) + ZDIRSINZW(:,:) = SQRT( 1. - BR_P2(PDIRCOSZW(:,:)) ) +!$mnh_end_expand_array() #endif !$acc end kernels ! @@ -437,16 +438,13 @@ CALL ADD3DFIELD_ll( TZFIELDS_ll, ZFLX, 'TURB_HOR_DYN_CORR::ZFLX' ) ! ! Computes the U variance IF (.NOT. L2D) THEN - !$acc kernels async(2) -#ifdef MNH_COMPILER_NVHPC - !$acc loop independent collapse(3) -#endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZFLX(JI,JJ,JK)= (2./3.) * PTKEM(JI,JJ,JK) & - - XCMFS * PK(JI,JJ,JK) *( (4./3.) * GX_U_M_PUM(JI,JJ,JK) & - -(2./3.) * ( GY_V_M_PVM(JI,JJ,JK) & - +GZ_W_M_PWM(JI,JJ,JK) ) ) - END DO !CONCURRENT + !$acc kernels async(2) present_cr(zflx,gz_w_m_pwm) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZFLX(:,:,:)= (2./3.) * PTKEM(:,:,:) & + - XCMFS * PK(:,:,:) *( (4./3.) * GX_U_M_PUM(:,:,:) & + -(2./3.) * ( GY_V_M_PVM(:,:,:) & + +GZ_W_M_PWM(:,:,:) ) ) + !$mnh_end_expand_array() !$acc end kernels !! & to be tested later !! + XCMFB * PLM / SQRT(PTKEM) * (-2./3.) * PTP @@ -461,10 +459,9 @@ ELSE END IF ! !$acc kernels async(2) -!$acc_nv loop independent collapse(2) -DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZFLX(JI,JJ,IKE+1) = ZFLX(JI,JJ,IKE) -ENDDO +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZFLX(:,:,IKE+1) = ZFLX(:,:,IKE) +!$mnh_end_expand_array() !$acc end kernels ! !* prescription of du/dz and dv/dz with uncentered gradient at the surface @@ -477,16 +474,15 @@ ZDZZ(:,:,:) = MXM(PDZZ(:,:,IKB:IKB+2)) #else CALL MXM_DEVICE(PDZZ(:,:,IKB:IKB+2),ZDZZ(:,:,:)) #endif -!$acc kernels async(3) -!$acc_nv loop independent collapse(2) -DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZCOEFF(JI,JJ,IKB+2)= - ZDZZ(JI,JJ,2) / & - ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,3) ) - ZCOEFF(JI,JJ,IKB+1)= (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) / & - ( ZDZZ(JI,JJ,2) * ZDZZ(JI,JJ,3) ) - ZCOEFF(JI,JJ,IKB)= - (ZDZZ(JI,JJ,3)+2.*ZDZZ(JI,JJ,2)) / & - ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,2) ) -ENDDO +!$acc kernels async(3) present_cr(zdzz,zcoeff) +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZCOEFF(:,:,IKB+2)= - ZDZZ(:,:,2) / & + ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,3) ) + ZCOEFF(:,:,IKB+1)= (ZDZZ(:,:,3)+ZDZZ(:,:,2)) / & + ( ZDZZ(:,:,2) * ZDZZ(:,:,3) ) + ZCOEFF(:,:,IKB)= - (ZDZZ(:,:,3)+2.*ZDZZ(:,:,2)) / & + ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,2) ) +!$mnh_end_expand_array() !$acc end kernels ! #ifndef MNH_OPENACC @@ -498,14 +494,13 @@ ZDU_DZ_DZS_DX(:,:,:)=MXF ((ZCOEFF(:,:,IKB+2:IKB+2)*PUM(:,:,IKB+2:IKB+2) & ! ZDZZ(:,:,:) = MYM(PDZZ(:,:,IKB:IKB+2)) #else -!$acc kernels async(3) -!$acc_nv loop independent collapse(2) -DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZTMP1_DEVICE(JI,JJ,1) = (ZCOEFF(JI,JJ,IKB+2)*PUM(JI,JJ,IKB+2) & - +ZCOEFF(JI,JJ,IKB+1)*PUM(JI,JJ,IKB+1) & - +ZCOEFF(JI,JJ,IKB)*PUM(JI,JJ,IKB) & - )* 0.5 * ( PDZX(JI,JJ,IKB+1)+PDZX(JI,JJ,IKB)) -ENDDO +!$acc kernels async(3) present_cr(pum,ztmp1_device) +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZTMP1_DEVICE(:,:,1) = (ZCOEFF(:,:,IKB+2)*PUM(:,:,IKB+2) & + +ZCOEFF(:,:,IKB+1)*PUM(:,:,IKB+1) & + +ZCOEFF(:,:,IKB)*PUM(:,:,IKB) & + )* 0.5 * ( PDZX(:,:,IKB+1)+PDZX(:,:,IKB)) +!$mnh_end_expand_array() !$acc end kernels ! !!! wait for the computation of ZCOEFF and ZTMP1_DEVICE @@ -513,25 +508,23 @@ ENDDO ! CALL MXF_DEVICE(ZTMP1_DEVICE(:,:,1:1), ZTMP2_DEVICE(:,:,1:1)) CALL MXF_DEVICE(PDXX(:,:,IKB:IKB), ZTMP1_DEVICE(:,:,1:1)) -!$acc kernels async(3) -!$acc_nv loop independent collapse(2) -DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZDU_DZ_DZS_DX(JI,JJ,1) = ZTMP2_DEVICE(JI,JJ,1) / ZTMP1_DEVICE(JI,JJ,1) -ENDDO +!$acc kernels async(3) present_cr(ztmp1_device,zdu_dz_dzs_dx) +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZDU_DZ_DZS_DX(:,:,1) = ZTMP2_DEVICE(:,:,1) / ZTMP1_DEVICE(:,:,1) +!$mnh_end_expand_array() !$acc end kernels ! CALL MYM_DEVICE(PDZZ(:,:,IKB:IKB+2),ZDZZ(:,:,:)) #endif -!$acc kernels async(4) -!$acc_nv loop independent collapse(2) -DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZCOEFF(JI,JJ,IKB+2)= - ZDZZ(JI,JJ,2) / & - ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,3) ) - ZCOEFF(JI,JJ,IKB+1)= (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) / & - ( ZDZZ(JI,JJ,2) * ZDZZ(JI,JJ,3) ) - ZCOEFF(JI,JJ,IKB)= - (ZDZZ(JI,JJ,3)+2.*ZDZZ(JI,JJ,2)) / & - ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,2) ) -ENDDO +!$acc kernels async(4) present_cr(zdzz,zcoeff) +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZCOEFF(:,:,IKB+2)= - ZDZZ(:,:,2) / & + ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,3) ) + ZCOEFF(:,:,IKB+1)= (ZDZZ(:,:,3)+ZDZZ(:,:,2)) / & + ( ZDZZ(:,:,2) * ZDZZ(:,:,3) ) + ZCOEFF(:,:,IKB)= - (ZDZZ(:,:,3)+2.*ZDZZ(:,:,2)) / & + ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,2) ) +!$mnh_end_expand_array() !$acc end kernels ! #ifndef MNH_OPENACC @@ -541,14 +534,13 @@ ZDV_DZ_DZS_DY(:,:,:)=MYF ((ZCOEFF(:,:,IKB+2:IKB+2)*PVM(:,:,IKB+2:IKB+2) & )* 0.5 * ( PDZY(:,:,IKB+1:IKB+1)+PDZY(:,:,IKB:IKB)) & )/ MYF(PDYY(:,:,IKB:IKB)) #else -!$acc kernels async(4) -!$acc_nv loop independent collapse(2) -DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZTMP3_DEVICE(JI,JJ,1) = (ZCOEFF(JI,JJ,IKB+2)*PVM(JI,JJ,IKB+2) & - +ZCOEFF(JI,JJ,IKB+1)*PVM(JI,JJ,IKB+1) & - +ZCOEFF(JI,JJ,IKB)*PVM(JI,JJ,IKB) & - )* 0.5 * ( PDZY(JI,JJ,IKB+1)+PDZY(JI,JJ,IKB)) -ENDDO +!$acc kernels async(4) present_cr(pvm,ztmp3_device) +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZTMP3_DEVICE(:,:,1) = (ZCOEFF(:,:,IKB+2)*PVM(:,:,IKB+2) & + +ZCOEFF(:,:,IKB+1)*PVM(:,:,IKB+1) & + +ZCOEFF(:,:,IKB)*PVM(:,:,IKB) & + )* 0.5 * ( PDZY(:,:,IKB+1)+PDZY(:,:,IKB)) +!$mnh_end_expand_array() !$acc end kernels ! !!! wait for the computation of ZCOEFF and ZTMP3_DEVICE @@ -574,11 +566,10 @@ ZDV_DZ_DZS_DY(:,:,1)= ZTMP4_DEVICE(:,:,1) / ZTMP3_DEVICE(:,:,1) ! CALL DXF_DEVICE(PUM(:,:,IKB:IKB),ZTMP1_DEVICE(:,:,1:1)) CALL MXF_DEVICE(PDXX(:,:,IKB:IKB),ZTMP2_DEVICE(:,:,1:1)) -!$acc kernels async(3) -!$acc_nv loop independent collapse(2) -DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZDU_DX(JI,JJ,1)= ZTMP1_DEVICE(JI,JJ,1) / ZTMP2_DEVICE(JI,JJ,1) - ZDU_DZ_DZS_DX(JI,JJ,1) -ENDDO +!$acc kernels async(3) present_cr(zdu_dz_dzs_dx,zdu_dx) +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZDU_DX(:,:,1)= ZTMP1_DEVICE(:,:,1) / ZTMP2_DEVICE(:,:,1) - ZDU_DZ_DZS_DX(:,:,1) +!$mnh_end_expand_array() !$acc end kernels !!! wait for the computation of ZDV_DZ_DZS_DY @@ -586,11 +577,10 @@ ENDDO ! CALL DYF_DEVICE(PVM(:,:,IKB:IKB),ZTMP3_DEVICE(:,:,1:1)) CALL MYF_DEVICE(PDYY(:,:,IKB:IKB),ZTMP4_DEVICE(:,:,1:1)) -!$acc kernels async(4) -!$acc_nv loop independent collapse(2) -DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZDV_DY(JI,JJ,1)= ZTMP3_DEVICE(JI,JJ,1) / ZTMP4_DEVICE(JI,JJ,1) - ZDV_DZ_DZS_DY(JI,JJ,1) -ENDDO +!$acc kernels async(4) present_cr(zdv_dz_dzs_dy,zdv_dy) +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZDV_DY(:,:,1)= ZTMP3_DEVICE(:,:,1) / ZTMP4_DEVICE(:,:,1) - ZDV_DZ_DZS_DY(:,:,1) +!$mnh_end_expand_array() !$acc end kernels ! ! @@ -598,11 +588,10 @@ ENDDO !$acc wait(3) async(4) #endif ! -!$acc kernels async(4) -!$acc_nv loop independent collapse(2) -DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZDW_DZ(JI,JJ,1)=-ZDU_DX(JI,JJ,1)-ZDV_DY(JI,JJ,1) -ENDDO +!$acc kernels async(4) present_cr(zdv_dy,zdw_dz) +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZDW_DZ(:,:,1)=-ZDU_DX(:,:,1)-ZDV_DY(:,:,1) +!$mnh_end_expand_array() !$acc end kernels ! !* computation @@ -617,12 +606,11 @@ ENDDO !attention !!!!! je ne comprends pas pourquoi mais ce update plante à l'execution... ! du coup je ne peux pas faire de update self asynchrone... ! -!$acc kernels async(3) -!$acc_nv loop independent collapse(2) -DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZFLX(JI,JJ,IKB) = (2./3.) * PTKEM(JI,JJ,IKB) & - - XCMFS * PK(JI,JJ,IKB) * 2. * ZDU_DX(JI,JJ,1) -ENDDO +!$acc kernels async(3) present_cr(zdu_dx,zflx) +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZFLX(:,:,IKB) = (2./3.) * PTKEM(:,:,IKB) & + - XCMFS * PK(:,:,IKB) * 2. * ZDU_DX(:,:,1) +!$mnh_end_expand_array() !$acc end kernels !! & to be tested later @@ -635,8 +623,8 @@ ENDDO !!! wait for the computation of ZDIRSINZW !$acc wait(1) ! -!$acc kernels async(4) present_cr(ZFLX) -#ifndef MNH_BITREP +!$acc kernels async(4) present_cr(ZFLX,ZDIRSINZW) +#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) ZFLX(:,:,IKB-1) = & PTAU11M(:,:) * PCOSSLOPE(:,:)**2 * PDIRCOSZW(:,:)**2 & -2. * PTAU12M(:,:) * PCOSSLOPE(:,:)* PSINSLOPE(:,:) * PDIRCOSZW(:,:) & @@ -647,17 +635,16 @@ ZFLX(:,:,IKB-1) = & - PUSLOPEM(:,:) * PCOSSLOPE(:,:)**2 * ZDIRSINZW(:,:) * PDIRCOSZW(:,:) ) #else !PW: BUG: commented 'acc loop independent collapse(2)' to workaround compiler bug (NVHPC 21.1) -!$acc_nv loop independent collapse(2) -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU ) -ZFLX(JI,JJ,IKB-1) = & - PTAU11M(JI,JJ) * BR_P2(PCOSSLOPE(JI,JJ)) * BR_P2(PDIRCOSZW(JI,JJ)) & - -2. * PTAU12M(JI,JJ) * PCOSSLOPE(JI,JJ)* PSINSLOPE(JI,JJ) * PDIRCOSZW(JI,JJ) & - + PTAU22M(JI,JJ) * BR_P2(PSINSLOPE(JI,JJ)) & - + PTAU33M(JI,JJ) * BR_P2(PCOSSLOPE(JI,JJ)) * BR_P2(ZDIRSINZW(JI,JJ)) & - +2. * PCDUEFF(JI,JJ) * ( & - PVSLOPEM(JI,JJ) * PCOSSLOPE(JI,JJ) * PSINSLOPE(JI,JJ) * ZDIRSINZW(JI,JJ) & - - PUSLOPEM(JI,JJ) * BR_P2(PCOSSLOPE(JI,JJ)) * ZDIRSINZW(JI,JJ) * PDIRCOSZW(JI,JJ) ) -END DO ! CONCURRENT +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU) +ZFLX(:,:,IKB-1) = & + PTAU11M(:,:) * BR_P2(PCOSSLOPE(:,:)) * BR_P2(PDIRCOSZW(:,:)) & + -2. * PTAU12M(:,:) * PCOSSLOPE(:,:)* PSINSLOPE(:,:) * PDIRCOSZW(:,:) & + + PTAU22M(:,:) * BR_P2(PSINSLOPE(:,:)) & + + PTAU33M(:,:) * BR_P2(PCOSSLOPE(:,:)) * BR_P2(ZDIRSINZW(:,:)) & + +2. * PCDUEFF(:,:) * ( & + PVSLOPEM(:,:) * PCOSSLOPE(:,:) * PSINSLOPE(:,:) * ZDIRSINZW(:,:) & + - PUSLOPEM(:,:) * BR_P2(PCOSSLOPE(:,:)) * ZDIRSINZW(:,:) * PDIRCOSZW(:,:) ) +!$mnh_end_expand_array() #endif !$acc end kernels ! @@ -665,10 +652,9 @@ END DO ! CONCURRENT !$acc wait(3) async(4) ! !$acc kernels async(4) -!$acc_nv loop independent collapse(2) -DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZFLX(JI,JJ,IKB-1) = 2. * ZFLX(JI,JJ,IKB-1) - ZFLX(JI,JJ,IKB) -ENDDO +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZFLX(:,:,IKB-1) = 2. * ZFLX(:,:,IKB-1) - ZFLX(:,:,IKB) +!$mnh_end_expand_array() !$acc end kernels ! ! @@ -724,13 +710,10 @@ ELSE END IF #else CALL MXF_DEVICE(PDXX, ZTMP1_DEVICE) -!$acc kernels async(10) -#ifdef MNH_COMPILER_NVHPC -!$acc loop independent collapse(3) -#endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP2_DEVICE(JI,JJ,JK) = PRHODJ(JI,JJ,JK) * ZFLX(JI,JJ,JK) / ZTMP1_DEVICE(JI,JJ,JK) -END DO !CONCURRENT +!$acc kernels async(10) present_cr(ztmp1_device,ztmp2_device) +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP2_DEVICE(:,:,:) = PRHODJ(:,:,:) * ZFLX(:,:,:) / ZTMP1_DEVICE(:,:,:) +!$mnh_end_expand_array() !$acc end kernels ! !!! wait for the computation of ZTMP2_DEVICE and the update of ZFLX @@ -739,31 +722,22 @@ END DO !CONCURRENT CALL DXM_DEVICE(ZTMP2_DEVICE, ZTMP3_DEVICE) IF (.NOT. LFLAT) THEN CALL MZM_DEVICE(PDXX,ZTMP1_DEVICE) - !$acc kernels -#ifdef MNH_COMPILER_NVHPC - !$acc loop independent collapse(3) -#endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP2_DEVICE(JI,JJ,JK) = PRHODJ(JI,JJ,JK) * ZFLX(JI,JJ,JK) - END DO !CONCURRENT + !$acc kernels present_cr(zflx,ztmp2_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP2_DEVICE(:,:,:) = PRHODJ(:,:,:) * ZFLX(:,:,:) + !$mnh_end_expand_array() !$acc end kernels CALL MZM_DEVICE(ZTMP2_DEVICE,ZTMP4_DEVICE) - !$acc kernels -#ifdef MNH_COMPILER_NVHPC - !$acc loop independent collapse(3) -#endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP2_DEVICE(JI,JJ,JK) = ZTMP4_DEVICE(JI,JJ,JK) * PINV_PDZZ(JI,JJ,JK) - END DO !CONCURRENT + !$acc kernels present_cr(ztmp4_device,ztmp2_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP2_DEVICE(:,:,:) = ZTMP4_DEVICE(:,:,:) * PINV_PDZZ(:,:,:) + !$mnh_end_expand_array() !$acc end kernels CALL MXM_DEVICE( ZTMP2_DEVICE, ZTMP4_DEVICE ) - !$acc kernels -#ifdef MNH_COMPILER_NVHPC - !$acc loop independent collapse(3) -#endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP2_DEVICE(JI,JJ,JK) = PDZX(JI,JJ,JK) / ZTMP1_DEVICE(JI,JJ,JK) * ZTMP4_DEVICE(JI,JJ,JK) - END DO !CONCURRENT + !$acc kernels present_cr(ztmp4_device,ztmp2_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP2_DEVICE(:,:,:) = PDZX(:,:,:) / ZTMP1_DEVICE(:,:,:) * ZTMP4_DEVICE(:,:,:) + !$mnh_end_expand_array() !$acc end kernels CALL DZF_DEVICE( ZTMP2_DEVICE, ZTMP1_DEVICE ) !$acc kernels async(1) @@ -780,22 +754,18 @@ END IF ! IF (KSPLT==1) THEN ! Contribution to the dynamic production of TKE: - !$acc kernels async(2) -#ifdef MNH_COMPILER_NVHPC - !$acc loop independent collapse(3) -#endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZWORK(JI,JJ,JK) = - ZFLX(JI,JJ,JK) * GX_U_M_PUM(JI,JJ,JK) - END DO !CONCURRENT + !$acc kernels async(2) present_cr(gx_u_m_pum,zwork) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZWORK(:,:,:) = - ZFLX(:,:,:) * GX_U_M_PUM(:,:,:) + !$mnh_end_expand_array() !$acc end kernels ! ! evaluate the dynamic production at w(IKB+1) in PDP(IKB) ! - !$acc kernels async(2) - !$acc_nv loop independent collapse(2) - DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDU_DX(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) ) - ENDDO + !$acc kernels async(2) present_cr(zdu_dx,zwork) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDU_DX(:,:,1) + ZWORK(:,:,IKB+1) ) + !$mnh_end_expand_array() !$acc end kernels ! !$acc kernels async(2) @@ -836,28 +806,24 @@ END IF ! ! Computes the V variance IF (.NOT. L2D) THEN - !$acc kernels async(3) -#ifdef MNH_COMPILER_NVHPC - !$acc loop independent collapse(3) -#endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZFLX(JI,JJ,JK)= (2./3.) * PTKEM(JI,JJ,JK) & - - XCMFS * PK(JI,JJ,JK) *( (4./3.) * GY_V_M_PVM(JI,JJ,JK) & - -(2./3.) * ( GX_U_M_PUM(JI,JJ,JK) & - +GZ_W_M_PWM(JI,JJ,JK) ) ) - END DO !CONCURRENT + !$acc kernels async(3) present_cr(gz_w_m_pwm,zflx) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZFLX(:,:,:)= (2./3.) * PTKEM(:,:,:) & + - XCMFS * PK(:,:,:) *( (4./3.) * GY_V_M_PVM(:,:,:) & + -(2./3.) * ( GX_U_M_PUM(:,:,:) & + +GZ_W_M_PWM(:,:,:) ) ) + !$mnh_end_expand_array() !$acc end kernels !! & to be tested !! + XCMFB * PLM / SQRT(PTKEM) * (-2./3.) * PTP ! ELSE - !$acc kernels async(3) - !$acc_nv loop independent collapse(3) - DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZFLX(JI,JJ,JK)= (2./3.) * PTKEM(JI,JJ,JK) & - - XCMFS * PK(JI,JJ,JK) *(-(2./3.) * ( GX_U_M_PUM(JI,JJ,JK) & - +GZ_W_M_PWM(JI,JJ,JK) ) ) - ENDDO + !$acc kernels async(3) present_cr(gz_w_m_pwm,zflx) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZFLX(:,:,:)= (2./3.) * PTKEM(:,:,:) & + - XCMFS * PK(:,:,:) *(-(2./3.) * ( GX_U_M_PUM(:,:,:) & + +GZ_W_M_PWM(:,:,:) ) ) + !$mnh_end_expand_array() !$acc end kernels !! & to be tested !! + XCMFB * PLM / SQRT(PTKEM) * (-2./3.) * PTP @@ -872,12 +838,11 @@ ZFLX(:,:,IKE+1) = ZFLX(:,:,IKE) ! ! !$acc wait(3) ! ! !$acc update self(ZFLX(:,:,IKB+1:)) async(10) ! -!$acc kernels async(3) -!$acc_nv loop independent collapse(2) -DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZFLX(JI,JJ,IKB) = (2./3.) * PTKEM(JI,JJ,IKB) & - - XCMFS * PK(JI,JJ,IKB) * 2. * ZDV_DY(JI,JJ,1) -ENDDO +!$acc kernels async(3) present_cr(zdv_dy,zflx) +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZFLX(:,:,IKB) = (2./3.) * PTKEM(:,:,IKB) & + - XCMFS * PK(:,:,IKB) * 2. * ZDV_DY(:,:,1) +!$mnh_end_expand_array() !$acc end kernels !! & to be tested @@ -886,7 +851,7 @@ ENDDO ! ! extrapolates this flux under the ground with the surface flux !$acc kernels async(3) present_cr(ZFLX) -#ifndef MNH_BITREP +#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) ZFLX(:,:,IKB-1) = & PTAU11M(:,:) * PSINSLOPE(:,:)**2 * PDIRCOSZW(:,:)**2 & +2. * PTAU12M(:,:) * PCOSSLOPE(:,:)* PSINSLOPE(:,:) * PDIRCOSZW(:,:) & @@ -897,17 +862,16 @@ ZFLX(:,:,IKB-1) = & + PVSLOPEM(:,:) * PCOSSLOPE(:,:) * PSINSLOPE(:,:) * ZDIRSINZW(:,:) ) #else !PW: BUG: commented 'acc loop independent collapse(2)' to workaround compiler bug (NVHPC 21.1) -!$acc_nv loop independent collapse(2) -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU ) -ZFLX(JI,JJ,IKB-1) = & - PTAU11M(JI,JJ) * BR_P2(PSINSLOPE(JI,JJ)) * BR_P2(PDIRCOSZW(JI,JJ)) & - +2. * PTAU12M(JI,JJ) * PCOSSLOPE(JI,JJ)* PSINSLOPE(JI,JJ) * PDIRCOSZW(JI,JJ) & - + PTAU22M(JI,JJ) * BR_P2(PCOSSLOPE(JI,JJ)) & - + PTAU33M(JI,JJ) * BR_P2(PSINSLOPE(JI,JJ)) * BR_P2(ZDIRSINZW(JI,JJ)) & - -2. * PCDUEFF(JI,JJ)* ( & - PUSLOPEM(JI,JJ) * BR_P2(PSINSLOPE(JI,JJ)) * ZDIRSINZW(JI,JJ) * PDIRCOSZW(JI,JJ) & - + PVSLOPEM(JI,JJ) * PCOSSLOPE(JI,JJ) * PSINSLOPE(JI,JJ) * ZDIRSINZW(JI,JJ) ) -END DO ! CONCURRENT +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU ) +ZFLX(:,:,IKB-1) = & + PTAU11M(:,:) * BR_P2(PSINSLOPE(:,:)) * BR_P2(PDIRCOSZW(:,:)) & + +2. * PTAU12M(:,:) * PCOSSLOPE(:,:)* PSINSLOPE(:,:) * PDIRCOSZW(:,:) & + + PTAU22M(:,:) * BR_P2(PCOSSLOPE(:,:)) & + + PTAU33M(:,:) * BR_P2(PSINSLOPE(:,:)) * BR_P2(ZDIRSINZW(:,:)) & + -2. * PCDUEFF(:,:)* ( & + PUSLOPEM(:,:) * BR_P2(PSINSLOPE(:,:)) * ZDIRSINZW(:,:) * PDIRCOSZW(:,:) & + + PVSLOPEM(:,:) * PCOSSLOPE(:,:) * PSINSLOPE(:,:) * ZDIRSINZW(:,:) ) +!$mnh_end_expand_array() #endif !$acc end kernels ! @@ -966,13 +930,10 @@ IF (.NOT. L2D) THEN IF (KSPLT==1) ZWORK(:,:,:) = - ZFLX(:,:,:) * GY_V_M_PVM #else CALL MYF_DEVICE(PDYY, ZTMP1_DEVICE) - !$acc kernels async(10) -#ifdef MNH_COMPILER_NVHPC - !$acc loop independent collapse(3) -#endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP2_DEVICE(JI,JJ,JK) = PRHODJ(JI,JJ,JK) * ZFLX(JI,JJ,JK) / ZTMP1_DEVICE(JI,JJ,JK) - END DO !CONCURRENT + !$acc kernels async(10) present_cr(ztmp1_device,ztmp2_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP2_DEVICE(:,:,:) = PRHODJ(:,:,:) * ZFLX(:,:,:) / ZTMP1_DEVICE(:,:,:) + !$mnh_end_expand_array() !$acc end kernels ! !!! wait for the computation of ZTMP2_DEVICE and the update of ZFLX @@ -981,31 +942,22 @@ IF (.NOT. L2D) THEN CALL DYM_DEVICE( ZTMP2_DEVICE,ZTMP3_DEVICE ) IF (.NOT. LFLAT) THEN CALL MZM_DEVICE(PDYY,ZTMP1_DEVICE) - !$acc kernels -#ifdef MNH_COMPILER_NVHPC - !$acc loop independent collapse(3) -#endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP2_DEVICE(JI,JJ,JK) = PRHODJ(JI,JJ,JK) * ZFLX(JI,JJ,JK) - END DO !CONCURRENT + !$acc kernels present_cr(zflx,ztmp2_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP2_DEVICE(:,:,:) = PRHODJ(:,:,:) * ZFLX(:,:,:) + !$mnh_end_expand_array() !$acc end kernels CALL MZM_DEVICE(ZTMP2_DEVICE,ZTMP4_DEVICE) - !$acc kernels -#ifdef MNH_COMPILER_NVHPC - !$acc loop independent collapse(3) -#endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP2_DEVICE(JI,JJ,JK) = ZTMP4_DEVICE(JI,JJ,JK) * PINV_PDZZ(JI,JJ,JK) - END DO !CONCURRENT + !$acc kernels present_cr(ztmp4_device,ztmp2_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP2_DEVICE(:,:,:) = ZTMP4_DEVICE(:,:,:) * PINV_PDZZ(:,:,:) + !$mnh_end_expand_array() !$acc end kernels CALL MYM_DEVICE( ZTMP2_DEVICE,ZTMP4_DEVICE ) - !$acc kernels -#ifdef MNH_COMPILER_NVHPC - !$acc loop independent collapse(3) -#endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP2_DEVICE(JI,JJ,JK) = PDZY(JI,JJ,JK) / ZTMP1_DEVICE(JI,JJ,JK) * ZTMP4_DEVICE(JI,JJ,JK) - END DO !CONCURRENT + !$acc kernels present_cr(ztmp4_device,ztmp2_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP2_DEVICE(:,:,:) = PDZY(:,:,:) / ZTMP1_DEVICE(:,:,:) * ZTMP4_DEVICE(:,:,:) + !$mnh_end_expand_array() !$acc end kernels CALL DZF_DEVICE( ZTMP2_DEVICE, ZTMP4_DEVICE ) !$acc kernels async(1) @@ -1015,23 +967,17 @@ IF (.NOT. L2D) THEN !$acc end kernels ELSE !$acc kernels async(1) -#ifdef MNH_COMPILER_NVHPC - !$acc loop independent collapse(3) -#endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - PRVS(JI,JJ,JK)=PRVS(JI,JJ,JK) - ZTMP3_DEVICE(JI,JJ,JK) - END DO !CONCURRENT + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + PRVS(:,:,:)=PRVS(:,:,:) - ZTMP3_DEVICE(:,:,:) + !$mnh_end_expand_array() !$acc end kernels END IF ! Contribution to the dynamic production of TKE: IF (KSPLT==1) THEN - !$acc kernels async(2) -#ifdef MNH_COMPILER_NVHPC - !$acc loop independent collapse(3) -#endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZWORK(JI,JJ,JK) = - ZFLX(JI,JJ,JK) * GY_V_M_PVM(JI,JJ,JK) - END DO !CONCURRENT + !$acc kernels async(2) present_cr(gy_v_m_pvm,zwork) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZWORK(:,:,:) = - ZFLX(:,:,:) * GY_V_M_PVM(:,:,:) + !$mnh_end_expand_array() !$acc end kernels ENDIF #endif @@ -1045,11 +991,10 @@ IF (KSPLT==1) THEN ! ! evaluate the dynamic production at w(IKB+1) in PDP(IKB) ! - !$acc kernels async(2) - !$acc_nv loop independent collapse(2) - DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDV_DY(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) ) - ENDDO + !$acc kernels async(2) present_cr(zdv_dy,zwork) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDV_DY(:,:,1) + ZWORK(:,:,IKB+1) ) + !$mnh_end_expand_array() !$acc end kernels ! !$acc kernels async(2) @@ -1070,11 +1015,10 @@ IF (LLES_CALL .AND. KSPLT==1) THEN !!! wait for the computation of ZWORK and PDP !$acc wait(2) ! - !$acc kernels - !$acc_nv loop independent collapse(3) - DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP1_DEVICE(JI,JJ,JK) = -ZWORK(JI,JJ,JK) - ENDDO + !$acc kernels present_cr(zwork,ztmp1_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP1_DEVICE(:,:,:) = -ZWORK(:,:,:) + !$mnh_end_expand_array() !$acc end kernels CALL LES_MEAN_SUBGRID( ZTMP1_DEVICE, X_LES_RES_ddxa_V_SBG_UaV , .TRUE.) ! @@ -1089,27 +1033,23 @@ END IF ! ! Computes the W variance IF (.NOT. L2D) THEN - !$acc kernels async(2) -#ifdef MNH_COMPILER_NVHPC - !$acc loop independent collapse(3) -#endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZFLX(JI,JJ,JK) = (2./3.) * PTKEM(JI,JJ,JK) & - - XCMFS * PK(JI,JJ,JK) *( (4./3.) * GZ_W_M_PWM(JI,JJ,JK) & - -(2./3.) * ( GX_U_M_PUM(JI,JJ,JK) & - +GY_V_M_PVM(JI,JJ,JK) ) ) - END DO !CONCURRENT + !$acc kernels async(2) present_cr(gy_v_m_pvm,zflx) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZFLX(:,:,:) = (2./3.) * PTKEM(:,:,:) & + - XCMFS * PK(:,:,:) *( (4./3.) * GZ_W_M_PWM(:,:,:) & + -(2./3.) * ( GX_U_M_PUM(:,:,:) & + +GY_V_M_PVM(:,:,:) ) ) + !$mnh_end_expand_array() !$acc end kernels !! & to be tested !! -2.* XCMFB * PLM / SQRT(PTKEM) * (-2./3.) * PTP ELSE - !$acc kernels async(2) - !$acc_nv loop independent collapse(3) - DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZFLX(JI,JJ,JK)= (2./3.) * PTKEM(JI,JJ,JK) & - - XCMFS * PK(JI,JJ,JK) *( (4./3.) * GZ_W_M_PWM(JI,JJ,JK) & - -(2./3.) * ( GX_U_M_PUM(JI,JJ,JK) ) ) - ENDDO + !$acc kernels async(2) present_cr(gx_u_m_pum,zflx) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZFLX(:,:,:)= (2./3.) * PTKEM(:,:,:) & + - XCMFS * PK(:,:,:) *( (4./3.) * GZ_W_M_PWM(:,:,:) & + -(2./3.) * ( GX_U_M_PUM(:,:,:) ) ) + !$mnh_end_expand_array() !$acc end kernels !! & to be tested !! -2.* XCMFB * PLM / SQRT(PTKEM) * (-2./3.) * PTP @@ -1123,12 +1063,11 @@ ZFLX(:,:,IKE+1)= ZFLX(:,:,IKE) !$acc wait(2) ! ! -!$acc kernels async(2) -!$acc_nv loop independent collapse(2) -DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZFLX(JI,JJ,IKB) = (2./3.) * PTKEM(JI,JJ,IKB) & - - XCMFS * PK(JI,JJ,IKB) * 2. * ZDW_DZ(JI,JJ,1) -ENDDO +!$acc kernels async(2) present_cr(zdw_dz,zflx) +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZFLX(:,:,IKB) = (2./3.) * PTKEM(:,:,IKB) & + - XCMFS * PK(:,:,IKB) * 2. * ZDW_DZ(:,:,1) +!$mnh_end_expand_array() !$acc end kernels ! @@ -1138,20 +1077,19 @@ ENDDO ! (-2./3.) * PTP(:,:,IKB:IKB) ! extrapolates this flux under the ground with the surface flux !$acc kernels async(3) present_cr(ZFLX) -#ifndef MNH_BITREP +#if !defined(MNH_BITREP) && !defined(MNH_BITREP_OMP) ZFLX(:,:,IKB-1) = & PTAU11M(:,:) * ZDIRSINZW(:,:)**2 & + PTAU33M(:,:) * PDIRCOSZW(:,:)**2 & +2. * PCDUEFF(:,:)* PUSLOPEM(:,:) * ZDIRSINZW(:,:) * PDIRCOSZW(:,:) #else !PW: BUG: commented 'acc loop independent collapse(2)' to workaround compiler bug (NVHPC 21.1) -!$acc_nv loop independent collapse(2) -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU ) -ZFLX(JI,JJ,IKB-1) = & - PTAU11M(JI,JJ) * BR_P2(ZDIRSINZW(JI,JJ)) & - + PTAU33M(JI,JJ) * BR_P2(PDIRCOSZW(JI,JJ)) & - +2. * PCDUEFF(JI,JJ)* PUSLOPEM(JI,JJ) * ZDIRSINZW(JI,JJ) * PDIRCOSZW(JI,JJ) -END DO ! CONCURRENT +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU ) +ZFLX(:,:,IKB-1) = & + PTAU11M(:,:) * BR_P2(ZDIRSINZW(:,:)) & + + PTAU33M(:,:) * BR_P2(PDIRCOSZW(:,:)) & + +2. * PCDUEFF(:,:)* PUSLOPEM(:,:) * ZDIRSINZW(:,:) * PDIRCOSZW(:,:) +!$mnh_end_expand_array() #endif !$acc end kernels ! @@ -1218,34 +1156,27 @@ GZ_W_M_ZWP = GZ_W_M(ZWP,PDZZ) #else CALL GZ_W_M_DEVICE(ZWP,PDZZ,GZ_W_M_ZWP) #endif -!$acc kernels async(2) -#ifdef MNH_COMPILER_NVHPC -!$acc loop independent collapse(3) -#endif -DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=IKB+1:JKU) - ZFLX(JI,JJ,JK)=ZFLX(JI,JJ,JK) & - - XCMFS * PK(JI,JJ,JK) * (4./3.) * (GZ_W_M_ZWP(JI,JJ,JK) - GZ_W_M_PWM(JI,JJ,JK)) -END DO !CONCURRENT +!$acc kernels async(2) present_cr(gz_w_m_pwm,zflx) +!$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=IKB+1:JKU) + ZFLX(:,:,:)=ZFLX(:,:,:) & + - XCMFS * PK(:,:,:) * (4./3.) * (GZ_W_M_ZWP(:,:,:) - GZ_W_M_PWM(:,:,:)) +!$mnh_end_expand_array() !$acc end kernels ! IF (KSPLT==1) THEN !Contribution to the dynamic production of TKE: - !$acc kernels async(2) -#ifdef MNH_COMPILER_NVHPC - !$acc loop independent collapse(3) -#endif - DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZWORK(JI,JJ,JK) = - ZFLX(JI,JJ,JK) * GZ_W_M_ZWP(JI,JJ,JK) - END DO !CONCURRENT + !$acc kernels async(2) present_cr(gz_w_m_zwp,zwork) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZWORK(:,:,:) = - ZFLX(:,:,:) * GZ_W_M_ZWP(:,:,:) + !$mnh_end_expand_array() !$acc end kernels ! ! evaluate the dynamic production at w(IKB+1) in PDP(IKB) ! - !$acc kernels async(2) - !$acc_nv loop independent collapse(2) - DO CONCURRENT (JI=1:JIU,JJ=1:JJU) - ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDW_DZ(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) ) - ENDDO + !$acc kernels async(2) present_cr(zdw_dz,zwork) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU) + ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDW_DZ(:,:,1) + ZWORK(:,:,IKB+1) ) + !$mnh_end_expand_array() !$acc end kernels ! !$acc kernels async(2) @@ -1284,30 +1215,27 @@ IF (LLES_CALL .AND. KSPLT==1) THEN !!! wait for the computation of ZFLX, ZDP and ZWORK !$acc wait(2) ! - !$acc kernels - !$acc_nv loop independent collapse(3) - DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP1_DEVICE(JI,JJ,JK) = -ZWORK(JI,JJ,JK) - ENDDO + !$acc kernels present_cr(zwork,ztmp1_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP1_DEVICE(:,:,:) = -ZWORK(:,:,:) + !$mnh_end_expand_array() !$acc end kernels CALL LES_MEAN_SUBGRID( ZTMP1_DEVICE, X_LES_RES_ddxa_W_SBG_UaW , .TRUE.) ! CALL GZ_M_M_DEVICE(PTHLM,PDZZ,ZTMP1_DEVICE) - !$acc kernels - !$acc_nv loop independent collapse(3) - DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK) * ZFLX(JI,JJ,JK) - ENDDO + !$acc kernels present_cr(zflx,ztmp2_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP2_DEVICE(:,:,:) = ZTMP1_DEVICE(:,:,:) * ZFLX(:,:,:) + !$mnh_end_expand_array() !$acc end kernels CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, X_LES_RES_ddxa_Thl_SBG_UaW , .TRUE.) ! CALL GZ_M_W_DEVICE(1,IKU,1,PTHLM,PDZZ,ZTMP1_DEVICE) CALL MZF_DEVICE( ZTMP1_DEVICE, ZTMP2_DEVICE ) - !$acc kernels - !$acc_nv loop independent collapse(3) - DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK) - ENDDO + !$acc kernels present_cr(ztmp2_device,ztmp3_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP3_DEVICE(:,:,:) = ZFLX(:,:,:)*ZTMP2_DEVICE(:,:,:) + !$mnh_end_expand_array() !$acc end kernels CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE,X_LES_RES_ddz_Thl_SBG_W2) ! @@ -1317,21 +1245,19 @@ IF (LLES_CALL .AND. KSPLT==1) THEN !$acc data copy(X_LES_RES_ddxa_Rt_SBG_UaW,X_LES_RES_ddz_Rt_SBG_W2) ! CALL GZ_M_M_DEVICE(PRM(:,:,:,1),PDZZ,ZTMP1_DEVICE) - !$acc kernels - !$acc_nv loop independent collapse(3) - DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK)*ZFLX(JI,JJ,JK) - ENDDO + !$acc kernels present_cr(zflx,ztmp2_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP2_DEVICE(:,:,:) = ZTMP1_DEVICE(:,:,:)*ZFLX(:,:,:) + !$mnh_end_expand_array() !$acc end kernels CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, X_LES_RES_ddxa_Rt_SBG_UaW , .TRUE.) ! CALL GZ_M_W_DEVICE(1,IKU,1,PRM(:,:,:,1),PDZZ,ZTMP1_DEVICE) CALL MZF_DEVICE( ZTMP1_DEVICE, ZTMP2_DEVICE ) - !$acc kernels - !$acc_nv loop independent collapse(3) - DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK) - ENDDO + !$acc kernels present_cr(ztmp2_device,ztmp3_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP3_DEVICE(:,:,:) = ZFLX(:,:,:)*ZTMP2_DEVICE(:,:,:) + !$mnh_end_expand_array() !$acc end kernels CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE, X_LES_RES_ddz_Rt_SBG_W2) ! @@ -1342,22 +1268,20 @@ IF (LLES_CALL .AND. KSPLT==1) THEN ! ! CALL GZ_M_M_DEVICE(PSVM(:,:,:,JSV),PDZZ,ZTMP1_DEVICE) - !$acc kernels - !$acc_nv loop independent collapse(3) - DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK)*ZFLX(JI,JJ,JK) - ENDDO + !$acc kernels present_cr(zflx,ztmp2_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP2_DEVICE(:,:,:) = ZTMP1_DEVICE(:,:,:)*ZFLX(:,:,:) + !$mnh_end_expand_array() !$acc end kernels CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, & X_LES_RES_ddxa_Sv_SBG_UaW(:,:,:,JSV) , .TRUE.) ! CALL GZ_M_W_DEVICE(1,IKU,1,PSVM(:,:,:,JSV),PDZZ,ZTMP1_DEVICE) CALL MZF_DEVICE(ZTMP1_DEVICE,ZTMP2_DEVICE) - !$acc kernels - !$acc_nv loop independent collapse(3) - DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) - ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK) - ENDDO + !$acc kernels present_cr(ztmp2_device,ztmp3_device) + !$mnh_expand_array(JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP3_DEVICE(:,:,:) = ZFLX(:,:,:)*ZTMP2_DEVICE(:,:,:) + !$mnh_end_expand_array() !$acc end kernels CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE, X_LES_RES_ddz_Sv_SBG_W2(:,:,:,JSV)) !