From a3a67ba11f605ce2347cc41dec92849f77eb66a0 Mon Sep 17 00:00:00 2001 From: Juan Escobar <juan.escobar@aero.obs-mip.fr> Date: Thu, 9 Sep 2021 17:38:08 +0200 Subject: [PATCH] Juan 09/09/2021:contrav.f90, add do concurrent + async loop for boundary --- src/ZSOLVER/contrav.f90 | 108 +++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 57 deletions(-) diff --git a/src/ZSOLVER/contrav.f90 b/src/ZSOLVER/contrav.f90 index 4032bffe0..ebef79590 100644 --- a/src/ZSOLVER/contrav.f90 +++ b/src/ZSOLVER/contrav.f90 @@ -568,7 +568,9 @@ real :: ZTMP1, ZTMP2 ! Intermediate work variable REAL, DIMENSION(:,:), POINTER , CONTIGUOUS :: ZU_EAST, ZV_NORTH, ZDZX_EAST, ZDZY_NORTH TYPE(LIST_ll), POINTER :: TZFIELD_U, TZFIELD_V, TZFIELD_DZX, TZFIELD_DZY TYPE(HALO2LIST_ll), POINTER :: TZHALO2_U, TZHALO2_V, TZHALO2_DZX, TZHALO2_DZY - +! +LOGICAL :: GWEST,GEAST,GSOUTH,GNORTH +! !$acc data present( PRUT, PRVT, PRWT, PDXX, PDYY, PDZZ, PDZX, PDZY, PRUCT, PRVCT, PRWCT, Z1, Z2 ) IF ( PRESENT(ODATA_ON_DEVICE) ) THEN @@ -597,6 +599,11 @@ IIU= SIZE(PDXX,1) IJU= SIZE(PDXX,2) IKU= SIZE(PDXX,3) ! +GWEST = ( HLBCX(1) /= 'CYCL' .AND. LWEST_ll() ) +GEAST = ( HLBCX(2) /= 'CYCL' .AND. LEAST_ll() ) +GSOUTH = ( HLBCY(1) /= 'CYCL' .AND. LSOUTH_ll() ) +GNORTH = ( HLBCY(2) /= 'CYCL' .AND. LNORTH_ll() ) +! CALL GET_INDICE_ll( IIB,IJB,IIE,IJE) ! IKB=1+JPVEXT @@ -679,50 +686,38 @@ ELSE !* 3. Compute the vertical contravariant components (general case) ! ------------------------------------ ! -!$acc kernels ! Z1(:,:,:) = 0. -! Z2(:,:,:) = 0. +! Z2(:,:,:) = 0. ! IF (KADV_ORDER == 2 ) THEN #ifdef MNH_OPENACC call Print_msg( NVERB_WARNING, 'GEN', 'CONTRAV', 'OpenACC: KADV_ORDER=2 and LFLAT=.TRUE. not yet tested' ) #endif +!$acc kernels ! !$acc loop independent collapse(3) - do jk = ikb, ike + 1 - do jj = 1, iju - do ji = iib, iie - Z1(ji, jj, jk ) = ( PRUCT(ji, jj, jk ) + PRUCT(ji, jj, jk - 1 ) ) * PDZX (ji, jj, jk ) * 0.25 & - + ( PRUCT(ji + 1, jj, jk ) + PRUCT(ji + 1, jj, jk - 1 ) ) * PDZX (ji + 1, jj, jk ) * 0.25 - end do - end do + do concurrent (ji=iib:iie,jj=1:iju,jk=ikb:ike+1) + Z1(ji, jj, jk ) = ( PRUCT(ji, jj, jk ) + PRUCT(ji, jj, jk - 1 ) ) * PDZX (ji, jj, jk ) * 0.25 & + + ( PRUCT(ji + 1, jj, jk ) + PRUCT(ji + 1, jj, jk - 1 ) ) * PDZX (ji + 1, jj, jk ) * 0.25 end do - !$acc loop independent collapse(3) - do jk = ikb, ike + 1 - do jj = ijb, ije - do ji = 1, iiu - Z2(:, jj, jk ) = ( PRVCT(:, jj, jk) + PRVCT( :, jj, jk - 1) ) * PDZY(:, jj, jk) * 0.25 & - + ( PRVCT(:, jj + 1, jk) + PRVCT( :, jj + 1,jk - 1) ) * PDZY(:, jj + 1, jk) * 0.25 - end do - end do + do concurrent (ji=1:iiu,jj=ijb:ije,jk=ikb:ike+1) + Z2(ji, jj, jk ) = ( PRVCT(ji, jj, jk) + PRVCT( ji, jj, jk - 1) ) * PDZY(ji, jj, jk) * 0.25 & + + ( PRVCT(ji, jj + 1, jk) + PRVCT( ji, jj + 1,jk - 1) ) * PDZY(ji, jj + 1, jk) * 0.25 end do PRWCT(:,:,:)=0. !$acc loop independent collapse(3) - do jk = ikb, ike + 1 - do jj = ijb, ije - do ji = iib, iie - PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk ) - end do - end do + do concurrent (ji=iib:iie,jj=ijb:ije,jk=ikb:ike+1) + PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk ) end do ! +!$acc end kernels ELSE IF (KADV_ORDER == 4 ) THEN ! !!$ IF (NHALO == 1) THEN - IF ( LWEST_ll() .AND. HLBCX(1)/='CYCL' ) THEN + IF ( GWEST ) THEN IW=IIB+2 -1 ELSE IW=IIB+1 -1 @@ -742,7 +737,7 @@ ELSE IF (KADV_ORDER == 4 ) THEN !!$ END IF ! !!$ IF(NHALO == 1) THEN - IF ( LSOUTH_ll() .AND. HLBCY(1)/='CYCL' ) THEN + IF ( GSOUTH ) THEN IS=IJB+2 -1 ELSE IS=IJB+1 -1 @@ -763,45 +758,38 @@ ELSE IF (KADV_ORDER == 4 ) THEN ! ! !* 3.1 interior of the process subdomain +!$acc kernels ! ! !PW: OpenACC remarks: *computing only ztmp2 and reusing it at next iteration works ! but ji loop can not be collapsed -> 10x slower on GPU ! *ztmp1 and ztmp2 are not necessary but improve readability (no impact on performance) !$acc loop independent collapse(3) private(ztmp1, ztmp2) - do jk = IKB, IKE + 1 - do jj = 1, iju - do ji = IW, IE + do concurrent(ji=IW:IE,jj=1:iju,jk=IKB:IKE+1) ztmp1 = ( 9.0 * PDZX(ji, jj, jk ) - ( PDZX(ji+1, jj, jk ) + PDZX(ji, jj, jk ) + PDZX(ji-1, jj, jk ) ) / 3.0 ) / 16.0 ztmp2 = ( 9.0 * PDZX(ji+1, jj, jk ) - ( PDZX(ji+2, jj, jk ) + PDZX(ji+1, jj, jk ) + PDZX(ji, jj, jk ) ) / 3.0 ) / 16.0 Z1(ji, jj, jk ) = 7.0 * ( ( PRUCT(ji, jj, jk ) + PRUCT(ji, jj, jk-1 ) ) * ztmp1 & + ( PRUCT(ji+1, jj, jk ) + PRUCT(ji+1, jj, jk-1 ) ) * ztmp2 ) / 12.0 & - 0.5 * ( ( PRUCT(ji-1, jj, jk ) + PRUCT(ji-1, jj, jk-1 ) ) * PDZX(ji-1, jj, jk) & + ( PRUCT(ji+2, jj, jk ) + PRUCT(ji+2, jj, jk-1 ) ) * PDZX(ji+2, jj, jk) ) / 12.0 - end do - end do end do ! !$acc loop independent collapse(3) - do jk = IKB, IKE + 1 - do jj = is, in - do ji = 1, iiu + do concurrent(ji=1:iiu,jj=is:in,jk=IKB:IKE+1) ztmp1 = ( 9.0 * PDZY(ji, jj, jk ) - ( PDZY(ji, jj+1, jk ) + PDZY(ji, jj, jk ) + PDZY(ji, jj-1, jk ) ) / 3.0 ) / 16.0 ztmp2 = ( 9.0 * PDZY(ji, jj+1, jk ) - ( PDZY(ji, jj+2, jk ) + PDZY(ji, jj+1, jk ) + PDZY(ji, jj, jk ) ) / 3.0 ) / 16.0 Z2(ji, jj, jk ) = 7.0 * ( ( PRVCT(ji, jj, jk ) + PRVCT(ji, jj, jk-1 ) ) * ztmp1 & + ( PRVCT(ji, jj+1, jk ) + PRVCT(ji, jj+1, jk-1 ) ) * ztmp2 ) / 12.0 & - 0.5 * ( ( PRVCT(ji, jj-1, jk ) + PRVCT(ji, jj-1, jk-1 ) ) * PDZY(ji, jj-1, jk ) & + ( PRVCT(ji, jj+2, jk ) + PRVCT(ji, jj+2, jk-1 ) ) * PDZY(ji, jj+2, jk ) ) / 12.0 - end do - end do end do +!$acc end kernels ! !* 3.2 limits of the process subdomain (inside the whole domain or in cyclic conditions) ! !!$ IF (NHALO==1) THEN -!$acc loop independent collapse(2) - do jk = IKB, IKE + 1 - do jj = 1, iju +!$acc parallel loop independent collapse(2) async + do concurrent(jj=1:iju,jk=IKB:IKE+1) ztmp1 = ( 9.0 * PDZX(IIE, jj, jk ) - ( PDZX(IIE+1, jj, jk ) + PDZX(IIE, jj, jk ) + PDZX(IIE-1, jj, jk ) ) / 3.0 ) / 16.0 ztmp2 = ( 9.0 * PDZX(IIE+1, jj, jk ) - ( ZDZX_EAST(jj, jk ) + PDZX(IIE+1, jj, jk ) + PDZX(IIE, jj, jk ) ) / 3.0 ) / 16.0 Z1(IIE, jj, jk ) = 7.0 * ( ( PRUCT(IIE, jj, jk ) + PRUCT(IIE, jj, jk-1 ) ) * ztmp1 & @@ -809,65 +797,71 @@ ELSE IF (KADV_ORDER == 4 ) THEN - 0.5 * ( ( PRUCT(IIE-1, jj, jk ) + PRUCT(IIE-1, jj, jk-1 ) ) * PDZX(IIE-1, jj, jk) & + ( ZU_EAST (jj, jk ) + ZU_EAST (jj, jk-1 ) ) * ZDZX_EAST (jj, jk) ) / 12.0 end do - end do ! -!$acc loop independent collapse(2) - do jk = IKB, IKE + 1 - do ji = 1, iiu +!$acc parallel loop independent collapse(2) async + do concurrent(ji=1:iiu,jk=IKB:IKE+1) ztmp1 = ( 9.0 * PDZY(ji, IJE, jk) - ( PDZY (ji, IJE+1, jk) + PDZY(ji, IJE, jk) + PDZY(ji, IJE-1, jk) ) / 3.0 ) / 16.0 ztmp2 = ( 9.0 * PDZY(ji, IJE+1, jk) - ( ZDZY_NORTH(ji, jk) + PDZY(ji, IJE+1, jk) + PDZY(ji, IJE, jk) ) / 3.0 ) / 16.0 Z2(ji, IJE, jk ) = 7.0 * ( ( PRVCT (ji, IJE, jk ) + PRVCT (ji, IJE, jk-1 ) ) * ztmp1 & + ( PRVCT (ji, IJE+1, jk ) + PRVCT (ji, IJE+1, jk-1 ) ) * ztmp2 ) / 12.0 & - 0.5 * ( ( PRVCT (ji, IJE-1, jk ) + PRVCT (ji, IJE-1, jk-1 ) ) * PDZY (ji, IJE-1, jk ) & + ( ZV_NORTH(ji, jk ) + ZV_NORTH(ji, jk-1 ) ) * ZDZY_NORTH(ji, jk ) ) / 12.0 - end do - end do + end do +!$acc wait !!$ END IF ! !* 3.3 non-CYCLIC CASE IN THE X DIRECTION: 2nd order case ! - IF (HLBCX(1)/='CYCL' .AND. LWEST_ll()) THEN + IF ( GWEST ) THEN + !$acc kernels async Z1(IIB, :, IKB:IKE+1 ) = ( PRUCT(IIB, :, IKB:IKE+1 ) + PRUCT(IIB, :, IKB-1:IKE ) ) * PDZX(IIB, :, IKB:IKE+1 ) * 0.25 & + ( PRUCT(IIB+1, :, IKB:IKE+1 ) + PRUCT(IIB+1, :, IKB-1:IKE ) ) * PDZX(IIB+1, :, IKB:IKE+1 ) * 0.25 + !$acc end kernels END IF ! - IF (HLBCX(2)/='CYCL' .AND. LEAST_ll()) THEN + IF ( GEAST ) THEN + !$acc kernels async Z1(IIE, :, IKB:IKE+1 ) = ( PRUCT(IIE, :, IKB:IKE+1 ) + PRUCT(IIE, :, IKB-1:IKE ) ) * PDZX(IIE, :, IKB:IKE+1 ) * 0.25 & + ( PRUCT(IIE+1, :, IKB:IKE+1 ) + PRUCT(IIE+1, :, IKB-1:IKE ) ) * PDZX(IIE+1, :, IKB:IKE+1 ) * 0.25 - END IF + !$acc end kernels + END IF ! !* 3.4 non-CYCLIC CASE IN THE Y DIRECTION: 2nd order case ! - IF (HLBCY(1)/='CYCL' .AND. LSOUTH_ll()) THEN + IF ( GSOUTH ) THEN + !$acc kernels async Z2(:, IJB, IKB:IKE+1 ) = ( PRVCT(:, IJB, IKB:IKE+1 ) + PRVCT(:, IJB, IKB-1:IKE ) ) * PDZY(:, IJB, IKB:IKE+1 ) * 0.25 & + ( PRVCT(:, IJB+1, IKB:IKE+1 ) + PRVCT(:, IJB+1, IKB-1:IKE ) ) * PDZY(:, IJB+1, IKB:IKE+1 ) * 0.25 - END IF + !$acc end kernels + END IF ! - IF (HLBCY(2)/='CYCL' .AND. LNORTH_ll()) THEN + IF ( GNORTH ) THEN + !$acc kernels async Z2(:, IJE, IKB:IKE+1 ) = ( PRVCT(:, IJE, IKB:IKE+1 ) + PRVCT(:, IJE, IKB-1:IKE ) ) * PDZY(:, IJE, IKB:IKE+1 ) * 0.25 & + ( PRVCT(:, IJE+1, IKB:IKE+1 ) + PRVCT(:, IJE+1, IKB-1:IKE ) ) * PDZY(:, IJE+1, IKB:IKE+1 ) * 0.25 - END IF + !$acc end kernels + END IF +!$acc wait ! !* 3.5 Vertical contyravariant wind ! ! +!$acc kernels !!$ CALL GET_HALO(Z1) !!$ CALL GET_HALO(Z2) !!$ !!$ CALL MPPDB_CHECK3DM("contrav_device ::Z1/Z2/ PDZZ",PRECISION,Z1,Z2,PDZZ) PRWCT(:,:,:)=0. !$acc loop independent collapse(3) - do jk = ikb, ike + 1 - do jj = ijb, ije - do ji = iib, iie + do concurrent (ji=iib:iie,jj=ijb:ije,jk=ikb:ike+1) PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk ) - end do - end do end do +!$acc end kernels ! ! END IF ! +!$acc kernels PRWCT(:,:,1) = - PRWCT(:,:,3) ! Mirror hypothesis !$acc end kernels !$acc update self(PRWCT) -- GitLab