diff --git a/src/MNH/contrav.f90 b/src/MNH/contrav.f90 index be1726c65c51d625538769cd37a9d70987b1f535..5918c3455968297b7c425734f95e56282f5cbac2 100644 --- a/src/MNH/contrav.f90 +++ b/src/MNH/contrav.f90 @@ -609,7 +609,7 @@ IJU= SIZE(PDXX,2) IKU= SIZE(PDXX,3) !Pin positions in the pools of MNH memory -CALL MNH_MEM_POSITION_PIN( 'CONTRAV_DEVICE 1' ) +CALL MNH_MEM_POSITION_PIN( 'CONTRAV_DEVICE' ) CALL MNH_MEM_GET( Z1, IIU, IJU, IKU ) CALL MNH_MEM_GET( Z2, IIU, IJU, IKU ) @@ -624,6 +624,41 @@ CALL GET_INDICE_ll( IIB,IJB,IIE,IJE) IKB=1+JPVEXT IKE=IKU - JPVEXT +CALL CONTRAV_DEVICE_DIM(& + PRUT,PRVT,PRWT,& + PDXX,PDYY,PDZZ,PDZX,PDZY,& + PRUCT,PRVCT,PRWCT,Z1,Z2) +! ZU_EAST,ZV_NORTH,ZDZX_EAST,ZDZY_NORTH) + +!Release all memory allocated with MNH_MEM_GET calls since last call to MNH_MEM_POSITION_PIN +CALL MNH_MEM_RELEASE( 'CONTRAV_DEVICE' ) + +IF (MPPDB_INITIALIZED) THEN + !Check all OUT arrays + CALL MPPDB_CHECK(PRUCT,"CONTRAV end:PRUCT") + CALL MPPDB_CHECK(PRVCT,"CONTRAV end:PRVCT") + CALL MPPDB_CHECK(PRWCT,"CONTRAV end:PRWCT") +END IF + +CONTAINS + + SUBROUTINE CONTRAV_DEVICE_DIM(& + PRUT,PRVT,PRWT,& + PDXX,PDYY,PDZZ,PDZX,PDZY,& + PRUCT,PRVCT,PRWCT,Z1,Z2) +! ZU_EAST,ZV_NORTH,ZDZX_EAST,ZDZY_NORTH) + + IMPLICIT NONE + + REAL, DIMENSION(IIU, IJU, IKU) :: & + PRUT,PRVT,PRWT,& + PDXX,PDYY,PDZZ,PDZX,PDZY,& + PRUCT,PRVCT,PRWCT,Z1,Z2 +! REAL :: ZU_EAST(IJU,IKU),ZV_NORTH(IIU,IKU),ZDZX_EAST(IJU,IKU),ZDZY_NORTH(IIU,IKU) +! +! Begin Compute +! + !$acc data present( PRUT, PRVT, PRWT, PDXX, PDYY, PDZZ, PDZX, PDZY, PRUCT, PRVCT, PRWCT, Z1, Z2 ) IF (GDATA_ON_DEVICE) THEN @@ -703,46 +738,31 @@ ELSE ! Z1(:,:,:) = 0. ! Z2(:,:,:) = 0. ! +CALL MPPDB_CHECK3DM("contrav_device :: init Z1/Z2",PRECISION,Z1,Z2) +! IF (KADV_ORDER == 2 ) THEN #ifdef MNH_OPENACC call Print_msg( NVERB_WARNING, 'GEN', 'CONTRAV', 'OpenACC: KADV_ORDER=2 and LFLAT=.TRUE. not yet tested' ) #endif -! !$acc kernels -!$acc loop independent collapse(3) - do jk = ikb, ike + 1 - do jj = 1, iju - do ji = iib, iie - Z1(ji, jj, jk ) = ( PRUCT(ji, jj, jk ) + PRUCT(ji, jj, jk - 1 ) ) * PDZX (ji, jj, jk ) * 0.25 & - + ( PRUCT(ji + 1, jj, jk ) + PRUCT(ji + 1, jj, jk - 1 ) ) * PDZX (ji + 1, jj, jk ) * 0.25 - end do - end do - end do - -!$acc loop independent collapse(3) - do jk = ikb, ike + 1 - do jj = ijb, ije - do ji = 1, iiu - Z2(:, jj, jk ) = ( PRVCT(:, jj, jk) + PRVCT( :, jj, jk - 1) ) * PDZY(:, jj, jk) * 0.25 & - + ( PRVCT(:, jj + 1, jk) + PRVCT( :, jj + 1,jk - 1) ) * PDZY(:, jj + 1, jk) * 0.25 - end do - end do - end do +! + !$mnh_do_concurrent(ji=iib:iie,jj=1:iju,jk=ikb:ike+1) + Z1(ji, jj, jk ) = ( PRUCT(ji, jj, jk ) + PRUCT(ji, jj, jk - 1 ) ) * PDZX (ji, jj, jk ) * 0.25 & + + ( PRUCT(ji + 1, jj, jk ) + PRUCT(ji + 1, jj, jk - 1 ) ) * PDZX (ji + 1, jj, jk ) * 0.25 + !$mnh_end_do() + !$mnh_do_concurrent(ji=1:iiu,jj=ijb:ije,jk=ikb:ike+1) + Z2(ji, jj, jk ) = ( PRVCT(ji, jj, jk) + PRVCT( ji, jj, jk - 1) ) * PDZY(ji, jj, jk) * 0.25 & + + ( PRVCT(ji, jj + 1, jk) + PRVCT( ji, jj + 1,jk - 1) ) * PDZY(ji, jj + 1, jk) * 0.25 + !$mnh_end_do() PRWCT(:,:,:)=0. -!$acc loop independent collapse(3) - do jk = ikb, ike + 1 - do jj = ijb, ije - do ji = iib, iie - PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk ) - end do - end do - end do -!$acc end kernels + !$mnh_do_concurrent(ji=iib:iie,jj=ijb:ije,jk=ikb:ike+1) + PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk ) + !$mnh_end_do() ! +!$acc end kernels ELSE IF (KADV_ORDER == 4 ) THEN -!$acc kernels present( zu_east, zv_north, zdzx_east, zdzy_north ) ! !!$ IF (NHALO == 1) THEN IF ( GWEST ) THEN @@ -786,109 +806,108 @@ ELSE IF (KADV_ORDER == 4 ) THEN ! ! !* 3.1 interior of the process subdomain +!$acc kernels ! ! !PW: OpenACC remarks: *computing only ztmp2 and reusing it at next iteration works ! but ji loop can not be collapsed -> 10x slower on GPU ! *ztmp1 and ztmp2 are not necessary but improve readability (no impact on performance) -!$acc loop independent collapse(3) private(ztmp1, ztmp2) - do jk = IKB, IKE + 1 - do jj = 1, iju - do ji = IW, IE + !$mnh_do_concurrent(ji=IW:IE,jj=1:iju,jk=IKB:IKE+1) ztmp1 = ( 9.0 * PDZX(ji, jj, jk ) - ( PDZX(ji+1, jj, jk ) + PDZX(ji, jj, jk ) + PDZX(ji-1, jj, jk ) ) / 3.0 ) / 16.0 ztmp2 = ( 9.0 * PDZX(ji+1, jj, jk ) - ( PDZX(ji+2, jj, jk ) + PDZX(ji+1, jj, jk ) + PDZX(ji, jj, jk ) ) / 3.0 ) / 16.0 Z1(ji, jj, jk ) = 7.0 * ( ( PRUCT(ji, jj, jk ) + PRUCT(ji, jj, jk-1 ) ) * ztmp1 & + ( PRUCT(ji+1, jj, jk ) + PRUCT(ji+1, jj, jk-1 ) ) * ztmp2 ) / 12.0 & - 0.5 * ( ( PRUCT(ji-1, jj, jk ) + PRUCT(ji-1, jj, jk-1 ) ) * PDZX(ji-1, jj, jk) & + ( PRUCT(ji+2, jj, jk ) + PRUCT(ji+2, jj, jk-1 ) ) * PDZX(ji+2, jj, jk) ) / 12.0 - end do - end do - end do -! -!$acc loop independent collapse(3) - do jk = IKB, IKE + 1 - do jj = is, in - do ji = 1, iiu + !$mnh_end_do() +! + !$mnh_do_concurrent(ji=1:iiu,jj=is:in,jk=IKB:IKE+1) ztmp1 = ( 9.0 * PDZY(ji, jj, jk ) - ( PDZY(ji, jj+1, jk ) + PDZY(ji, jj, jk ) + PDZY(ji, jj-1, jk ) ) / 3.0 ) / 16.0 ztmp2 = ( 9.0 * PDZY(ji, jj+1, jk ) - ( PDZY(ji, jj+2, jk ) + PDZY(ji, jj+1, jk ) + PDZY(ji, jj, jk ) ) / 3.0 ) / 16.0 Z2(ji, jj, jk ) = 7.0 * ( ( PRVCT(ji, jj, jk ) + PRVCT(ji, jj, jk-1 ) ) * ztmp1 & + ( PRVCT(ji, jj+1, jk ) + PRVCT(ji, jj+1, jk-1 ) ) * ztmp2 ) / 12.0 & - 0.5 * ( ( PRVCT(ji, jj-1, jk ) + PRVCT(ji, jj-1, jk-1 ) ) * PDZY(ji, jj-1, jk ) & + ( PRVCT(ji, jj+2, jk ) + PRVCT(ji, jj+2, jk-1 ) ) * PDZY(ji, jj+2, jk ) ) / 12.0 - end do - end do - end do + !$mnh_end_do() +!$acc end kernels +! +!!$CALL MPPDB_CHECK3DM("contrav_device :: dom Z1/Z2",PRECISION,Z1,Z2) ! !* 3.2 limits of the process subdomain (inside the whole domain or in cyclic conditions) ! !!$ IF (NHALO==1) THEN -!$acc loop independent collapse(2) - do jk = IKB, IKE + 1 - do jj = 1, iju +!$acc kernels async + !$mnh_do_concurrent(jj=1:iju,jk=IKB:IKE+1) ztmp1 = ( 9.0 * PDZX(IIE, jj, jk ) - ( PDZX(IIE+1, jj, jk ) + PDZX(IIE, jj, jk ) + PDZX(IIE-1, jj, jk ) ) / 3.0 ) / 16.0 ztmp2 = ( 9.0 * PDZX(IIE+1, jj, jk ) - ( ZDZX_EAST(jj, jk ) + PDZX(IIE+1, jj, jk ) + PDZX(IIE, jj, jk ) ) / 3.0 ) / 16.0 Z1(IIE, jj, jk ) = 7.0 * ( ( PRUCT(IIE, jj, jk ) + PRUCT(IIE, jj, jk-1 ) ) * ztmp1 & + ( PRUCT(IIE+1, jj, jk ) + PRUCT(IIE+1, jj, jk-1 ) ) * ztmp2 ) / 12.0 & - 0.5 * ( ( PRUCT(IIE-1, jj, jk ) + PRUCT(IIE-1, jj, jk-1 ) ) * PDZX(IIE-1, jj, jk) & + ( ZU_EAST (jj, jk ) + ZU_EAST (jj, jk-1 ) ) * ZDZX_EAST (jj, jk) ) / 12.0 - end do - end do + !$mnh_end_do() +!$acc end kernels ! -!$acc loop independent collapse(2) - do jk = IKB, IKE + 1 - do ji = 1, iiu +!$acc kernels async + !$mnh_do_concurrent(ji=1:iiu,jk=IKB:IKE+1) ztmp1 = ( 9.0 * PDZY(ji, IJE, jk) - ( PDZY (ji, IJE+1, jk) + PDZY(ji, IJE, jk) + PDZY(ji, IJE-1, jk) ) / 3.0 ) / 16.0 ztmp2 = ( 9.0 * PDZY(ji, IJE+1, jk) - ( ZDZY_NORTH(ji, jk) + PDZY(ji, IJE+1, jk) + PDZY(ji, IJE, jk) ) / 3.0 ) / 16.0 Z2(ji, IJE, jk ) = 7.0 * ( ( PRVCT (ji, IJE, jk ) + PRVCT (ji, IJE, jk-1 ) ) * ztmp1 & + ( PRVCT (ji, IJE+1, jk ) + PRVCT (ji, IJE+1, jk-1 ) ) * ztmp2 ) / 12.0 & - 0.5 * ( ( PRVCT (ji, IJE-1, jk ) + PRVCT (ji, IJE-1, jk-1 ) ) * PDZY (ji, IJE-1, jk ) & + ( ZV_NORTH(ji, jk ) + ZV_NORTH(ji, jk-1 ) ) * ZDZY_NORTH(ji, jk ) ) / 12.0 - end do - end do + !$mnh_end_do() +!$acc end kernels +!$acc wait !!$ END IF ! !* 3.3 non-CYCLIC CASE IN THE X DIRECTION: 2nd order case ! - IF ( GWEST ) THEN + IF ( GWEST ) THEN + !$acc kernels async Z1(IIB, :, IKB:IKE+1 ) = ( PRUCT(IIB, :, IKB:IKE+1 ) + PRUCT(IIB, :, IKB-1:IKE ) ) * PDZX(IIB, :, IKB:IKE+1 ) * 0.25 & + ( PRUCT(IIB+1, :, IKB:IKE+1 ) + PRUCT(IIB+1, :, IKB-1:IKE ) ) * PDZX(IIB+1, :, IKB:IKE+1 ) * 0.25 + !$acc end kernels END IF ! IF ( GEAST ) THEN + !$acc kernels async Z1(IIE, :, IKB:IKE+1 ) = ( PRUCT(IIE, :, IKB:IKE+1 ) + PRUCT(IIE, :, IKB-1:IKE ) ) * PDZX(IIE, :, IKB:IKE+1 ) * 0.25 & + ( PRUCT(IIE+1, :, IKB:IKE+1 ) + PRUCT(IIE+1, :, IKB-1:IKE ) ) * PDZX(IIE+1, :, IKB:IKE+1 ) * 0.25 - END IF + !$acc end kernels + END IF ! !* 3.4 non-CYCLIC CASE IN THE Y DIRECTION: 2nd order case ! - IF ( GSOUTH ) THEN + IF ( GSOUTH ) THEN + !$acc kernels async Z2(:, IJB, IKB:IKE+1 ) = ( PRVCT(:, IJB, IKB:IKE+1 ) + PRVCT(:, IJB, IKB-1:IKE ) ) * PDZY(:, IJB, IKB:IKE+1 ) * 0.25 & + ( PRVCT(:, IJB+1, IKB:IKE+1 ) + PRVCT(:, IJB+1, IKB-1:IKE ) ) * PDZY(:, IJB+1, IKB:IKE+1 ) * 0.25 - END IF + !$acc end kernels + END IF ! - IF ( GNORTH ) THEN + IF ( GNORTH ) THEN + !$acc kernels async Z2(:, IJE, IKB:IKE+1 ) = ( PRVCT(:, IJE, IKB:IKE+1 ) + PRVCT(:, IJE, IKB-1:IKE ) ) * PDZY(:, IJE, IKB:IKE+1 ) * 0.25 & + ( PRVCT(:, IJE+1, IKB:IKE+1 ) + PRVCT(:, IJE+1, IKB-1:IKE ) ) * PDZY(:, IJE+1, IKB:IKE+1 ) * 0.25 - END IF + !$acc end kernels + END IF +!$acc wait ! !* 3.5 Vertical contyravariant wind ! ! +!$acc kernels !!$ CALL GET_HALO(Z1) !!$ CALL GET_HALO(Z2) !!$ !!$ CALL MPPDB_CHECK3DM("contrav_device ::Z1/Z2/ PDZZ",PRECISION,Z1,Z2,PDZZ) PRWCT(:,:,:)=0. -!$acc loop independent collapse(3) - do jk = ikb, ike + 1 - do jj = ijb, ije - do ji = iib, iie + !$mnh_do_concurrent (ji=iib:iie,jj=ijb:ije,jk=ikb:ike+1) PRWCT(ji ,jj, jk ) = ( PRWT(ji ,jj, jk ) - Z1(ji ,jj, jk ) - Z2(ji ,jj, jk ) ) / PDZZ(ji ,jj, jk ) - end do - end do - end do + !$mnh_end_do() !$acc end kernels ! +CALL MPPDB_CHECK3DM("contrav_device :: PRWCT/Z1/Z2",PRECISION,PRWCT,Z1,Z2) ! END IF ! @@ -915,15 +934,11 @@ END IF FLAT !$acc end data -!Release all memory allocated with MNH_MEM_GET calls since last call to MNH_MEM_POSITION_PIN -CALL MNH_MEM_RELEASE( 'CONTRAV_DEVICE 1' ) +! +! End Compute +! -IF (MPPDB_INITIALIZED) THEN - !Check all OUT arrays - CALL MPPDB_CHECK(PRUCT,"CONTRAV end:PRUCT") - CALL MPPDB_CHECK(PRVCT,"CONTRAV end:PRVCT") - CALL MPPDB_CHECK(PRWCT,"CONTRAV end:PRWCT") -END IF + END SUBROUTINE CONTRAV_DEVICE_DIM !----------------------------------------------------------------------- END SUBROUTINE CONTRAV_DEVICE