From b85eafc116825b219a09a08b339547cbab0ecbf0 Mon Sep 17 00:00:00 2001 From: ESCOBAR Juan <escj@nuwa> Date: Thu, 3 Mar 2022 16:22:03 +0100 Subject: [PATCH] Juan 03/03/2022:ZSOLVER/turb_hor_dyn_corr.f90, nvhpc22.2 bug/optimisation -> replace WHERE+ARRAY SYNTAX -> DO CONCURRENT --- src/ZSOLVER/turb_hor_dyn_corr.f90 | 265 ++++++++++++++++++++++-------- 1 file changed, 201 insertions(+), 64 deletions(-) diff --git a/src/ZSOLVER/turb_hor_dyn_corr.f90 b/src/ZSOLVER/turb_hor_dyn_corr.f90 index 2e0fa87bc..e255ed69a 100644 --- a/src/ZSOLVER/turb_hor_dyn_corr.f90 +++ b/src/ZSOLVER/turb_hor_dyn_corr.f90 @@ -461,7 +461,13 @@ ELSE END IF ! !$acc kernels async(2) -ZFLX(:,:,IKE+1) = ZFLX(:,:,IKE) +#ifdef MNH_COMPILER_NVHPC +!$acc loop independent collapse(2) +#endif +DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZFLX(JI,JJ,IKE+1) = ZFLX(JI,JJ,IKE) +ENDDO + !$acc end kernels ! !* prescription of du/dz and dv/dz with uncentered gradient at the surface @@ -475,13 +481,19 @@ ZDZZ(:,:,:) = MXM(PDZZ(:,:,IKB:IKB+2)) CALL MXM_DEVICE(PDZZ(:,:,IKB:IKB+2),ZDZZ(:,:,:)) #endif !$acc kernels async(3) -ZCOEFF(:,:,IKB+2)= - ZDZZ(:,:,2) / & - ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,3) ) -ZCOEFF(:,:,IKB+1)= (ZDZZ(:,:,3)+ZDZZ(:,:,2)) / & - ( ZDZZ(:,:,2) * ZDZZ(:,:,3) ) -ZCOEFF(:,:,IKB)= - (ZDZZ(:,:,3)+2.*ZDZZ(:,:,2)) / & - ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,2) ) +#ifdef MNH_COMPILER_NVHPC +!$acc loop independent collapse(2) +#endif +DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZCOEFF(JI,JJ,IKB+2)= - ZDZZ(JI,JJ,2) / & + ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,3) ) + ZCOEFF(JI,JJ,IKB+1)= (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) / & + ( ZDZZ(JI,JJ,2) * ZDZZ(JI,JJ,3) ) + ZCOEFF(JI,JJ,IKB)= - (ZDZZ(JI,JJ,3)+2.*ZDZZ(JI,JJ,2)) / & + ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,2) ) +ENDDO !$acc end kernels + ! #ifndef MNH_OPENACC ZDU_DZ_DZS_DX(:,:,:)=MXF ((ZCOEFF(:,:,IKB+2:IKB+2)*PUM(:,:,IKB+2:IKB+2) & @@ -493,10 +505,15 @@ ZDU_DZ_DZS_DX(:,:,:)=MXF ((ZCOEFF(:,:,IKB+2:IKB+2)*PUM(:,:,IKB+2:IKB+2) & ZDZZ(:,:,:) = MYM(PDZZ(:,:,IKB:IKB+2)) #else !$acc kernels async(3) -ZTMP1_DEVICE(:,:,1) = (ZCOEFF(:,:,IKB+2)*PUM(:,:,IKB+2) & - +ZCOEFF(:,:,IKB+1)*PUM(:,:,IKB+1) & - +ZCOEFF(:,:,IKB )*PUM(:,:,IKB) & - )* 0.5 * ( PDZX(:,:,IKB+1)+PDZX(:,:,IKB)) +#ifdef MNH_COMPILER_NVHPC +!$acc loop independent collapse(2) +#endif +DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZTMP1_DEVICE(JI,JJ,1) = (ZCOEFF(JI,JJ,IKB+2)*PUM(JI,JJ,IKB+2) & + +ZCOEFF(JI,JJ,IKB+1)*PUM(JI,JJ,IKB+1) & + +ZCOEFF(JI,JJ,IKB)*PUM(JI,JJ,IKB) & + )* 0.5 * ( PDZX(JI,JJ,IKB+1)+PDZX(JI,JJ,IKB)) +ENDDO !$acc end kernels ! !!! wait for the computation of ZCOEFF and ZTMP1_DEVICE @@ -505,20 +522,32 @@ ZTMP1_DEVICE(:,:,1) = (ZCOEFF(:,:,IKB+2)*PUM(:,:,IKB+2) & CALL MXF_DEVICE(ZTMP1_DEVICE(:,:,1:1), ZTMP2_DEVICE(:,:,1:1)) CALL MXF_DEVICE(PDXX(:,:,IKB:IKB), ZTMP1_DEVICE(:,:,1:1)) !$acc kernels async(3) -ZDU_DZ_DZS_DX(:,:,1) = ZTMP2_DEVICE(:,:,1) / ZTMP1_DEVICE(:,:,1) +#ifdef MNH_COMPILER_NVHPC +!$acc loop independent collapse(2) +#endif +DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZDU_DZ_DZS_DX(JI,JJ,1) = ZTMP2_DEVICE(JI,JJ,1) / ZTMP1_DEVICE(JI,JJ,1) +ENDDO !$acc end kernels ! CALL MYM_DEVICE(PDZZ(:,:,IKB:IKB+2),ZDZZ(:,:,:)) #endif !$acc kernels async(4) -ZCOEFF(:,:,IKB+2)= - ZDZZ(:,:,2) / & - ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,3) ) -ZCOEFF(:,:,IKB+1)= (ZDZZ(:,:,3)+ZDZZ(:,:,2)) / & - ( ZDZZ(:,:,2) * ZDZZ(:,:,3) ) -ZCOEFF(:,:,IKB)= - (ZDZZ(:,:,3)+2.*ZDZZ(:,:,2)) / & - ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,2) ) +#ifdef MNH_COMPILER_NVHPC +!$acc loop independent collapse(2) +#endif +DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZCOEFF(JI,JJ,IKB+2)= - ZDZZ(JI,JJ,2) / & + ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,3) ) + ZCOEFF(JI,JJ,IKB+1)= (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) / & + ( ZDZZ(JI,JJ,2) * ZDZZ(JI,JJ,3) ) + ZCOEFF(JI,JJ,IKB)= - (ZDZZ(JI,JJ,3)+2.*ZDZZ(JI,JJ,2)) / & + ( (ZDZZ(JI,JJ,3)+ZDZZ(JI,JJ,2)) * ZDZZ(JI,JJ,2) ) +ENDDO + !$acc end kernels ! + #ifndef MNH_OPENACC ZDV_DZ_DZS_DY(:,:,:)=MYF ((ZCOEFF(:,:,IKB+2:IKB+2)*PVM(:,:,IKB+2:IKB+2) & +ZCOEFF(:,:,IKB+1:IKB+1)*PVM(:,:,IKB+1:IKB+1) & @@ -527,10 +556,15 @@ ZDV_DZ_DZS_DY(:,:,:)=MYF ((ZCOEFF(:,:,IKB+2:IKB+2)*PVM(:,:,IKB+2:IKB+2) & )/ MYF(PDYY(:,:,IKB:IKB)) #else !$acc kernels async(4) -ZTMP3_DEVICE(:,:,1) = (ZCOEFF(:,:,IKB+2)*PVM(:,:,IKB+2) & - +ZCOEFF(:,:,IKB+1)*PVM(:,:,IKB+1) & - +ZCOEFF(:,:,IKB)*PVM(:,:,IKB) & - )* 0.5 * ( PDZY(:,:,IKB+1)+PDZY(:,:,IKB)) +#ifdef MNH_COMPILER_NVHPC +!$acc loop independent collapse(2) +#endif +DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZTMP3_DEVICE(JI,JJ,1) = (ZCOEFF(JI,JJ,IKB+2)*PVM(JI,JJ,IKB+2) & + +ZCOEFF(JI,JJ,IKB+1)*PVM(JI,JJ,IKB+1) & + +ZCOEFF(JI,JJ,IKB)*PVM(JI,JJ,IKB) & + )* 0.5 * ( PDZY(JI,JJ,IKB+1)+PDZY(JI,JJ,IKB)) +ENDDO !$acc end kernels ! !!! wait for the computation of ZCOEFF and ZTMP3_DEVICE @@ -557,7 +591,12 @@ ZDV_DZ_DZS_DY(:,:,1)= ZTMP4_DEVICE(:,:,1) / ZTMP3_DEVICE(:,:,1) CALL DXF_DEVICE(PUM(:,:,IKB:IKB),ZTMP1_DEVICE(:,:,1:1)) CALL MXF_DEVICE(PDXX(:,:,IKB:IKB),ZTMP2_DEVICE(:,:,1:1)) !$acc kernels async(3) -ZDU_DX(:,:,1)= ZTMP1_DEVICE(:,:,1) / ZTMP2_DEVICE(:,:,1) - ZDU_DZ_DZS_DX(:,:,1) +#ifdef MNH_COMPILER_NVHPC +!$acc loop independent collapse(2) +#endif +DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZDU_DX(JI,JJ,1)= ZTMP1_DEVICE(JI,JJ,1) / ZTMP2_DEVICE(JI,JJ,1) - ZDU_DZ_DZS_DX(JI,JJ,1) +ENDDO !$acc end kernels !!! wait for the computation of ZDV_DZ_DZS_DY @@ -565,8 +604,13 @@ ZDU_DX(:,:,1)= ZTMP1_DEVICE(:,:,1) / ZTMP2_DEVICE(:,:,1) - ZDU_DZ_DZS_DX(:,:,1) ! CALL DYF_DEVICE(PVM(:,:,IKB:IKB),ZTMP3_DEVICE(:,:,1:1)) CALL MYF_DEVICE(PDYY(:,:,IKB:IKB),ZTMP4_DEVICE(:,:,1:1)) -!$acc kernels! async(4) -ZDV_DY(:,:,1)= ZTMP3_DEVICE(:,:,1) / ZTMP4_DEVICE(:,:,1) - ZDV_DZ_DZS_DY(:,:,1) +!$acc kernels async(4) +#ifdef MNH_COMPILER_NVHPC +!$acc loop independent collapse(2) +#endif +DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZDV_DY(JI,JJ,1)= ZTMP3_DEVICE(JI,JJ,1) / ZTMP4_DEVICE(JI,JJ,1) - ZDV_DZ_DZS_DY(JI,JJ,1) +ENDDO !$acc end kernels ! ! @@ -575,7 +619,12 @@ ZDV_DY(:,:,1)= ZTMP3_DEVICE(:,:,1) / ZTMP4_DEVICE(:,:,1) - ZDV_DZ_DZS_DY(:,:,1) #endif ! !$acc kernels async(4) -ZDW_DZ(:,:,:)=-ZDU_DX(:,:,:)-ZDV_DY(:,:,:) +#ifdef MNH_COMPILER_NVHPC +!$acc loop independent collapse(3) +#endif +DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZDW_DZ(JI,JJ,JK)=-ZDU_DX(JI,JJ,JK)-ZDV_DY(JI,JJ,JK) +ENDDO !$acc end kernels ! !* computation @@ -591,11 +640,15 @@ ZDW_DZ(:,:,:)=-ZDU_DX(:,:,:)-ZDV_DY(:,:,:) ! du coup je ne peux pas faire de update self asynchrone... ! !$acc kernels async(3) -ZFLX(:,:,IKB) = (2./3.) * PTKEM(:,:,IKB) & - - XCMFS * PK(:,:,IKB) * 2. * ZDU_DX(:,:,1) +#ifdef MNH_COMPILER_NVHPC +!$acc loop independent collapse(2) +#endif +DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZFLX(JI,JJ,IKB) = (2./3.) * PTKEM(JI,JJ,IKB) & + - XCMFS * PK(JI,JJ,IKB) * 2. * ZDU_DX(JI,JJ,1) +ENDDO !$acc end kernels - !! & to be tested later !! + XCMFB * PLM(:,:,IKB:IKB) /SQRT(PTKEM(:,:,IKB:IKB)) * & !! (-2./3.) * PTP(:,:,IKB:IKB) @@ -617,8 +670,9 @@ ZFLX(:,:,IKB-1) = & PVSLOPEM(:,:) * PCOSSLOPE(:,:) * PSINSLOPE(:,:) * ZDIRSINZW(:,:) & - PUSLOPEM(:,:) * PCOSSLOPE(:,:)**2 * ZDIRSINZW(:,:) * PDIRCOSZW(:,:) ) #else +!PW: BUG: commented 'acc loop independent collapse(2)' to workaround compiler bug (NVHPC 21.1) #ifdef MNH_COMPILER_NVHPC -!$acc loop independent collapse(2) +!$acc loop independent collapse(2) #endif DO CONCURRENT ( JI=1:JIU,JJ=1:JJU ) ZFLX(JI,JJ,IKB-1) = & @@ -637,7 +691,12 @@ END DO ! CONCURRENT !$acc wait(3) async(4) ! !$acc kernels async(4) -ZFLX(:,:,IKB-1) = 2. * ZFLX(:,:,IKB-1) - ZFLX(:,:,IKB) +#ifdef MNH_COMPILER_NVHPC +!$acc loop independent collapse(2) +#endif +DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZFLX(JI,JJ,IKB-1) = 2. * ZFLX(JI,JJ,IKB-1) - ZFLX(JI,JJ,IKB) +ENDDO !$acc end kernels ! ! @@ -760,9 +819,14 @@ IF (KSPLT==1) THEN ! ! evaluate the dynamic production at w(IKB+1) in PDP(IKB) ! - !$acc kernels async(2) - ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDU_DX(:,:,1) + ZWORK(:,:,IKB+1) ) - !$acc end kernels + !$acc kernels async(2) +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(2) +#endif + DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDU_DX(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) ) + ENDDO + !$acc end kernels ! !$acc kernels async(2) PDP(:,:,:) = PDP(:,:,:) + ZWORK(:,:,:) @@ -783,7 +847,7 @@ IF (LLES_CALL .AND. KSPLT==1) THEN !$acc wait(2) ! !$acc kernels - ZTMP1_DEVICE = -ZWORK + ZTMP1_DEVICE(:,:,:) = -ZWORK(:,:,:) !$acc end kernels CALL LES_MEAN_SUBGRID( ZTMP1_DEVICE, X_LES_RES_ddxa_U_SBG_UaU , .TRUE.) ! @@ -817,11 +881,16 @@ IF (.NOT. L2D) THEN !! + XCMFB * PLM / SQRT(PTKEM) * (-2./3.) * PTP ! ELSE - !$acc kernels async(3) - ZFLX(:,:,:)= (2./3.) * PTKEM & - - XCMFS * PK *(-(2./3.) * ( GX_U_M_PUM & - +GZ_W_M_PWM ) ) - !$acc end kernels + !$acc kernels async(3) +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif + DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZFLX(JI,JJ,JK)= (2./3.) * PTKEM(JI,JJ,JK) & + - XCMFS * PK(JI,JJ,JK) *(-(2./3.) * ( GX_U_M_PUM(JI,JJ,JK) & + +GZ_W_M_PWM(JI,JJ,JK) ) ) + ENDDO + !$acc end kernels !! & to be tested !! + XCMFB * PLM / SQRT(PTKEM) * (-2./3.) * PTP ! @@ -836,8 +905,13 @@ ZFLX(:,:,IKE+1) = ZFLX(:,:,IKE) ! ! !$acc update self(ZFLX(:,:,IKB+1:)) async(10) ! !$acc kernels async(3) -ZFLX(:,:,IKB) = (2./3.) * PTKEM(:,:,IKB) & - - XCMFS * PK(:,:,IKB) * 2. * ZDV_DY(:,:,1) +#ifdef MNH_COMPILER_NVHPC +!$acc loop independent collapse(2) +#endif +DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZFLX(JI,JJ,IKB) = (2./3.) * PTKEM(JI,JJ,IKB) & + - XCMFS * PK(JI,JJ,IKB) * 2. * ZDV_DY(JI,JJ,1) +ENDDO !$acc end kernels !! & to be tested @@ -856,8 +930,9 @@ ZFLX(:,:,IKB-1) = & PUSLOPEM(:,:) * PSINSLOPE(:,:)**2 * ZDIRSINZW(:,:) * PDIRCOSZW(:,:) & + PVSLOPEM(:,:) * PCOSSLOPE(:,:) * PSINSLOPE(:,:) * ZDIRSINZW(:,:) ) #else +!PW: BUG: commented 'acc loop independent collapse(2)' to workaround compiler bug (NVHPC 21.1) #ifdef MNH_COMPILER_NVHPC -!$acc loop independent collapse(2) +!$acc loop independent collapse(2) #endif DO CONCURRENT ( JI=1:JIU,JJ=1:JJU ) ZFLX(JI,JJ,IKB-1) = & @@ -1006,9 +1081,14 @@ IF (KSPLT==1) THEN ! ! evaluate the dynamic production at w(IKB+1) in PDP(IKB) ! - !$acc kernels async(2) - ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDV_DY(:,:,1) + ZWORK(:,:,IKB+1) ) - !$acc end kernels + !$acc kernels async(2) +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(2) +#endif + DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDV_DY(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) ) + ENDDO + !$acc end kernels ! !$acc kernels async(2) PDP(:,:,:) = PDP(:,:,:) + ZWORK(:,:,:) @@ -1029,7 +1109,12 @@ IF (LLES_CALL .AND. KSPLT==1) THEN !$acc wait(2) ! !$acc kernels - ZTMP1_DEVICE = -ZWORK +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif + DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP1_DEVICE(JI,JJ,JK) = -ZWORK(JI,JJ,JK) + ENDDO !$acc end kernels CALL LES_MEAN_SUBGRID( ZTMP1_DEVICE, X_LES_RES_ddxa_V_SBG_UaV , .TRUE.) ! @@ -1058,11 +1143,16 @@ IF (.NOT. L2D) THEN !! & to be tested !! -2.* XCMFB * PLM / SQRT(PTKEM) * (-2./3.) * PTP ELSE - !$acc kernels async(2) - ZFLX(:,:,:)= (2./3.) * PTKEM & - - XCMFS * PK *( (4./3.) * GZ_W_M_PWM & - -(2./3.) * ( GX_U_M_PUM ) ) - !$acc end kernels + !$acc kernels async(2) +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif + DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZFLX(JI,JJ,JK)= (2./3.) * PTKEM(JI,JJ,JK) & + - XCMFS * PK(JI,JJ,JK) *( (4./3.) * GZ_W_M_PWM(JI,JJ,JK) & + -(2./3.) * ( GX_U_M_PUM(JI,JJ,JK) ) ) + ENDDO + !$acc end kernels !! & to be tested !! -2.* XCMFB * PLM / SQRT(PTKEM) * (-2./3.) * PTP END IF @@ -1076,8 +1166,14 @@ ZFLX(:,:,IKE+1)= ZFLX(:,:,IKE) ! ! !$acc kernels async(2) -ZFLX(:,:,IKB) = (2./3.) * PTKEM(:,:,IKB) & - - XCMFS * PK(:,:,IKB) * 2. * ZDW_DZ(:,:,1) +#ifdef MNH_COMPILER_NVHPC +!$acc loop independent collapse(2) +#endif +DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZFLX(JI,JJ,IKB) = (2./3.) * PTKEM(JI,JJ,IKB) & + - XCMFS * PK(JI,JJ,IKB) * 2. * ZDW_DZ(JI,JJ,1) +ENDDO + !$acc end kernels ! @@ -1092,6 +1188,7 @@ ZFLX(:,:,IKB-1) = & + PTAU33M(:,:) * PDIRCOSZW(:,:)**2 & +2. * PCDUEFF(:,:)* PUSLOPEM(:,:) * ZDIRSINZW(:,:) * PDIRCOSZW(:,:) #else +!PW: BUG: commented 'acc loop independent collapse(2)' to workaround compiler bug (NVHPC 21.1) #ifdef MNH_COMPILER_NVHPC !$acc loop independent collapse(2) #endif @@ -1190,9 +1287,14 @@ IF (KSPLT==1) THEN ! ! evaluate the dynamic production at w(IKB+1) in PDP(IKB) ! - !$acc kernels async(2) - ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDW_DZ(:,:,1) + ZWORK(:,:,IKB+1) ) - !$acc end kernels + !$acc kernels async(2) +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(2) +#endif + DO CONCURRENT (JI=1:JIU,JJ=1:JJU) + ZWORK(JI,JJ,IKB) = 0.5* ( -ZFLX(JI,JJ,IKB)*ZDW_DZ(JI,JJ,1) + ZWORK(JI,JJ,IKB+1) ) + ENDDO + !$acc end kernels ! !$acc kernels async(2) PDP(:,:,:) = PDP(:,:,:) + ZWORK(:,:,:) @@ -1231,20 +1333,35 @@ IF (LLES_CALL .AND. KSPLT==1) THEN !$acc wait(2) ! !$acc kernels - ZTMP1_DEVICE = -ZWORK +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif + DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP1_DEVICE(JI,JJ,JK) = -ZWORK(JI,JJ,JK) + ENDDO !$acc end kernels CALL LES_MEAN_SUBGRID( ZTMP1_DEVICE, X_LES_RES_ddxa_W_SBG_UaW , .TRUE.) ! CALL GZ_M_M_DEVICE(PTHLM,PDZZ,ZTMP1_DEVICE) !$acc kernels - ZTMP2_DEVICE = ZTMP1_DEVICE * ZFLX +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif + DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK) * ZFLX(JI,JJ,JK) + ENDDO !$acc end kernels CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, X_LES_RES_ddxa_Thl_SBG_UaW , .TRUE.) ! CALL GZ_M_W_DEVICE(1,IKU,1,PTHLM,PDZZ,ZTMP1_DEVICE) CALL MZF_DEVICE( ZTMP1_DEVICE, ZTMP2_DEVICE ) !$acc kernels - ZTMP3_DEVICE = ZFLX*ZTMP2_DEVICE +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif + DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK) + ENDDO !$acc end kernels CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE,X_LES_RES_ddz_Thl_SBG_W2) ! @@ -1255,14 +1372,24 @@ IF (LLES_CALL .AND. KSPLT==1) THEN ! CALL GZ_M_M_DEVICE(PRM(:,:,:,1),PDZZ,ZTMP1_DEVICE) !$acc kernels - ZTMP2_DEVICE = ZTMP1_DEVICE*ZFLX +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif + DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK)*ZFLX(JI,JJ,JK) + ENDDO !$acc end kernels CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, X_LES_RES_ddxa_Rt_SBG_UaW , .TRUE.) ! CALL GZ_M_W_DEVICE(1,IKU,1,PRM(:,:,:,1),PDZZ,ZTMP1_DEVICE) CALL MZF_DEVICE( ZTMP1_DEVICE, ZTMP2_DEVICE ) !$acc kernels - ZTMP3_DEVICE = ZFLX*ZTMP2_DEVICE +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif + DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK) + ENDDO !$acc end kernels CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE, X_LES_RES_ddz_Rt_SBG_W2) ! @@ -1274,7 +1401,12 @@ IF (LLES_CALL .AND. KSPLT==1) THEN ! CALL GZ_M_M_DEVICE(PSVM(:,:,:,JSV),PDZZ,ZTMP1_DEVICE) !$acc kernels - ZTMP2_DEVICE = ZTMP1_DEVICE*ZFLX +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif + DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP2_DEVICE(JI,JJ,JK) = ZTMP1_DEVICE(JI,JJ,JK)*ZFLX(JI,JJ,JK) + ENDDO !$acc end kernels CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, & X_LES_RES_ddxa_Sv_SBG_UaW(:,:,:,JSV) , .TRUE.) @@ -1282,7 +1414,12 @@ IF (LLES_CALL .AND. KSPLT==1) THEN CALL GZ_M_W_DEVICE(1,IKU,1,PSVM(:,:,:,JSV),PDZZ,ZTMP1_DEVICE) CALL MZF_DEVICE( ZTMP1_DEVICE, ZTMP2_DEVICE ) !$acc kernels - ZTMP3_DEVICE = ZFLX*ZTMP2_DEVICE +#ifdef MNH_COMPILER_NVHPC + !$acc loop independent collapse(3) +#endif + DO CONCURRENT (JI=1:JIU,JJ=1:JJU,JK=1:JKU) + ZTMP3_DEVICE(JI,JJ,JK) = ZFLX(JI,JJ,JK)*ZTMP2_DEVICE(JI,JJ,JK) + ENDDO !$acc end kernels CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE, X_LES_RES_ddz_Sv_SBG_W2(:,:,:,JSV)) ! -- GitLab