diff --git a/tensorproductmultigrid_Source/communication.f90 b/tensorproductmultigrid_Source/communication.f90 index b5b2091857c91d9c08322e01e9b531dddbd1bc6a..e362a6f2d1b59b90967586d9a93d6b3ba6121e8b 100644 --- a/tensorproductmultigrid_Source/communication.f90 +++ b/tensorproductmultigrid_Source/communication.f90 @@ -372,15 +372,24 @@ contains !================================================================== ! Finalise communication routines !================================================================== - subroutine comm_finalise(n_lev, & ! } - lev_split) ! } Multigrid parameters + subroutine comm_finalise(n_lev, & ! } Multigrid parameters + lev_split, & !} + grid_param ) ! } Grid parameters implicit none integer, intent(in) :: n_lev integer, intent(in) :: lev_split + type(grid_parameters), intent(in) :: grid_param + ! local var logical :: reduced_m integer :: level, m integer :: ierr + integer :: nlocal,n character(len=80) :: s + + ! Local size of (horizontal) grid + n = grid_param%n + nlocal = n/2**pproc + m = pproc level = n_lev reduced_m = .false. @@ -400,16 +409,18 @@ contains ! --- Free interior data types --- if (LUseO) call mpi_type_free(interior(level,m),ierr) if (LUseO) call mpi_type_free(sub_interior(level,m),ierr) - if (LUseT) call mpi_type_free(interiorT(level,m),ierr) - if (LUseT) call mpi_type_free(sub_interiorT(level,m),ierr) + if (LUseT .and. (nlocal /= 0 ) ) call mpi_type_free(interiorT(level,m),ierr) + if (LUseT .and. ( (nlocal/2) /= 0 ) ) call mpi_type_free(sub_interiorT(level,m),ierr) ! If we are below L_split, split data if ( (level .le. lev_split) .and. (m > 0) .and. (.not. reduced_m)) then reduced_m = .true. m = m-1 + nlocal = 2*nlocal cycle end if reduced_m = .false. level = level-1 + nlocal = nlocal/2 end do do m=pproc,0,-1 write(s,'("m = ",I3)') m @@ -445,6 +456,7 @@ contains type(scalar3d), intent(in) :: a type(scalar3d), intent(in) :: b real(kind=rl), intent(out) :: s + !local var integer :: nprocs, rank, ierr integer :: p_horiz(2) integer :: stepsize @@ -455,8 +467,17 @@ contains integer :: ix,iy,iz real(kind=rl) :: ddot + integer :: iy_min,iy_max, ix_min,ix_max + real , dimension(:,:,:) , pointer :: za_st,zb_st + nlocal = a%ix_max-a%ix_min+1 nz = a%grid_param%nz + + iy_min = a%iy_min + iy_max = a%iy_max + ix_min = a%ix_min + ix_max = a%ix_max + ! Work out coordinates of processor call mpi_comm_size(MPI_COMM_HORIZ,nprocs,ierr) call mpi_comm_rank(MPI_COMM_HORIZ,rank,ierr) @@ -504,15 +525,19 @@ contains end do endif if (LUseT) then + za_st => a%st + zb_st => b%st global_sumt = 0.0_rl + !$acc kernels do iz=0,nz+1 - do iy=a%iy_min,a%iy_max - do ix=a%ix_min,a%ix_max + do iy=iy_min,iy_max + do ix=ix_min,ix_max global_sumt = global_sumt & - + a%st(ix,iy,iz)*b%st(ix,iy,iz) + + za_st(ix,iy,iz)*zb_st(ix,iy,iz) end do end do end do + !$acc end kernels endif end if if (LUseO) then @@ -601,6 +626,9 @@ contains !local var integer :: n, ix_min,ix_max,iy_min,iy_max + integer :: icompx_max,icompy_max + + real , dimension(:,:,:) , pointer :: za_st ! Update Real Boundary for Newman case u(0) = u(1) , etc ... @@ -627,34 +655,36 @@ contains endif if (LUseT) then ! transpose + + za_st => a%st + icompx_max = a%icompx_max + icompy_max = a%icompy_max + + !$acc kernels if ( ix_min == 1 ) then - a%st(0,:,:) = a%st(1,:,:) + !acc kernels + za_st(0,:,:) = za_st(1,:,:) + !acc end kernels endif if ( ix_max == n ) then - a%st(a%icompx_max+1,:,:) = a%st(a%icompx_max,:,:) + !acc kernels + za_st(icompx_max+1,:,:) = za_st(icompx_max,:,:) + !acc end kernels endif if ( iy_min == 1 ) then - a%st(:,0,:) = a%st(:,1,:) + !acc kernels + za_st(:,0,:) = za_st(:,1,:) + !acc end kernels endif if ( iy_max == n ) then - a%st(:,a%icompy_max+1,:) = a%st(:,a%icompy_max,:) + !acc kernels + za_st(:,icompy_max+1,:) = za_st(:,icompy_max,:) + !acc end kernels endif + !$acc end kernels + endif - ! - ! corner ( inutile , deja réalisé au dessus ) - ! -!!$ if ( ( ix_min == 1 ) .and. ( iy_min == 1 ) ) then -!!$ a%s(:,0,0) = a%s(:,1,1) -!!$ endif -!!$ if ( ( ix_min == 1 ) .and. ( iy_max == n ) ) then -!!$ a%s(:,a%icompy_max+1,0) = a%s(:,a%icompy_max,1) -!!$ end if -!!$ if ( ( ix_max == n ) .and. ( iy_min == 1 ) ) then -!!$ a%s(:,0,a%icompx_max+1) = a%s(:,1,a%icompx_max) -!!$ end if -!!$ if ( ( ix_max == n ) .and. ( iy_max == n ) ) then -!!$ a%s(:,a%icompy_max+1,a%icompx_max+1) = a%s(:,a%icompy_max,a%icompx_max) -!!$ end if + end subroutine boundary_mnh !================================================================== ! Initiate asynchronous halo exchange diff --git a/tensorproductmultigrid_Source/datatypes.f90 b/tensorproductmultigrid_Source/datatypes.f90 index 76dc6445a323529e638ae1eeba5d073be9a44c53..ef71df25c8e791932abd914914712bbeadf38ce2 100644 --- a/tensorproductmultigrid_Source/datatypes.f90 +++ b/tensorproductmultigrid_Source/datatypes.f90 @@ -76,7 +76,7 @@ module datatypes logical :: isactive ! Is this field active, i.e. used on one of the ! active processes on coarser grids? real(kind=rl),allocatable :: s(:,:,:) - real(kind=rl),allocatable :: st(:,:,:) + real(kind=rl),pointer :: st(:,:,:) type(grid_parameters) :: grid_param end type scalar3d @@ -309,6 +309,7 @@ private implicit none type(scalar3d), intent(in) :: phi logical, optional :: phi_is_volumeintegral + !local var integer :: ix, iy, iz real(kind=rl) :: tmp, global_tmp real(kind=rl) :: tmpt, global_tmpt @@ -318,6 +319,8 @@ private logical :: divide_by_volume real(kind=rl) :: volume_factor + real , dimension(:,:,:) , pointer :: zphi_st + nlocalx = phi%ix_max-phi%ix_min+1 nlocaly = phi%iy_max-phi%iy_min+1 nz = phi%grid_param%nz @@ -337,13 +340,16 @@ private end if if (LUseT) then + zphi_st => phi%st + !$acc kernels loop collapse(3) do iz=1,nz do iy=1,nlocaly do ix=1,nlocalx - tmpt = tmpt + phi%st(ix,iy,iz)**2 ! * volume_factor + tmpt = tmpt + zphi_st(ix,iy,iz)**2 ! * volume_factor end do end do end do + !$acc end kernels end if end if diff --git a/tensorproductmultigrid_Source/dblas.f90 b/tensorproductmultigrid_Source/dblas.f90 index 487c947c77a9589c637aa12799d6ce7822169610..48d5dcdce4f328566fd6e968cd8ba21311c5dd30 100644 --- a/tensorproductmultigrid_Source/dblas.f90 +++ b/tensorproductmultigrid_Source/dblas.f90 @@ -38,6 +38,7 @@ subroutine dcopy(n,sx,incx,sy,incy) 30 continue if( n .lt. 7 ) return 40 mp1 = m + 1 + !$acc kernels do 50 i = mp1,n,7 sy(i) = sx(i) sy(i + 1) = sx(i + 1) @@ -47,6 +48,7 @@ subroutine dcopy(n,sx,incx,sy,incy) sy(i + 5) = sx(i + 5) sy(i + 6) = sx(i + 6) 50 continue + !$acc end kernels return end subroutine dcopy @@ -94,12 +96,14 @@ SUBROUTINE DAXPY( N, SA, SX, INCX, SY, INCY ) 20 CONTINUE ENDIF ! ** UNROLL LOOP FOR SPEED + !$acc kernels DO 30 I = M+1, N, 4 SY(I) = SY(I) + SA * SX(I) SY(I+1) = SY(I+1) + SA * SX(I+1) SY(I+2) = SY(I+2) + SA * SX(I+2) SY(I+3) = SY(I+3) + SA * SX(I+3) 30 CONTINUE + !$acc end kernels ELSE ! ** NONEQUAL OR NONPOSITIVE INCREMENTS. diff --git a/tensorproductmultigrid_Source/discretisation.f90 b/tensorproductmultigrid_Source/discretisation.f90 index 5717460c3b94aebf8db62a45a56851076b789398..7fca1ce705d2beb19dc7d8732596d8fb2b0aaf00 100644 --- a/tensorproductmultigrid_Source/discretisation.f90 +++ b/tensorproductmultigrid_Source/discretisation.f90 @@ -123,10 +123,10 @@ private ! Data structure for storing the vertical discretisation type vertical_coefficients - real(kind=rl), allocatable :: a(:) - real(kind=rl), allocatable :: b(:) - real(kind=rl), allocatable :: c(:) - real(kind=rl), allocatable :: d(:) + real(kind=rl), pointer :: a(:) + real(kind=rl), pointer :: b(:) + real(kind=rl), pointer :: c(:) + real(kind=rl), pointer :: d(:) end type vertical_coefficients ! Stoarge for vertical coefficients @@ -553,6 +553,8 @@ end subroutine construct_vertical_coeff integer :: ix,iy,iz integer :: iib,iie,ijb,ije,ikb,ike + real , dimension(:,:,:) , pointer :: zr_st , zb_st + ! r <- A.u !call boundary_mnh(u) call apply_mnh(u,r) @@ -581,7 +583,12 @@ end subroutine construct_vertical_coeff ije=u%icompy_max ikb=1 ike=u%grid_param%nz - r%st(iib:iie,ijb:ije,ikb:ike) = b%st(iib:iie,ijb:ije,ikb:ike) - r%st(iib:iie,ijb:ije,ikb:ike) + + zr_st => r%st + zb_st => b%st + !$acc kernels + zr_st(iib:iie,ijb:ije,ikb:ike) = zb_st(iib:iie,ijb:ije,ikb:ike) - zr_st(iib:iie,ijb:ije,ikb:ike) + !$acc end kernels endif end subroutine calculate_residual_mnh @@ -624,6 +631,10 @@ end subroutine construct_vertical_coeff real(kind=rl) :: tmp integer :: iib,iie,ijb,ije + real, dimension(:,:,:) , pointer :: zv_st , zu_st + real, dimension(:) , pointer :: za_k, zb_k, zc_k, zd_k + integer :: ii,ij + call boundary_mnh(u) if (LUseO) then @@ -660,63 +671,42 @@ end subroutine construct_vertical_coeff endif if (LUseT) then call construct_alpha_T_cst_mnh(u%grid_param,alpha_T,Tij) -!!$ do iz=1,u%grid_param%nz -!!$ do iy=u%icompy_min,u%icompy_max -!!$ do ix=u%icompx_min,u%icompx_max -!!$ ! Construct horizontal part of stencil -!!$ b_k = vert_coeff%b(iz) -!!$ c_k = vert_coeff%c(iz) -!!$ d_k = vert_coeff%d(iz) -!!$ tmp = ((-b_k-c_k)*Tij ) * u%st(ix,iy,iz) -!!$ if (iz < grid_param%nz) then -!!$ tmp = tmp + b_k*Tij * u%st(ix,iy,iz+1) -!!$ end if -!!$ if (iz > 1) then -!!$ tmp = tmp + c_k*Tij * u%st(ix,iy,iz-1) -!!$ end if -!!$ if ((iz > 1) .and. (iz < grid_param%nz)) then -!!$ tmp = tmp - alpha_T(5) * u%st(ix, iy ,iz ) & -!!$ + alpha_T(1) * u%st(ix+1, iy ,iz) & -!!$ + alpha_T(2) * u%st(ix-1, iy ,iz) & -!!$ + alpha_T(3) * u%st(ix , iy+1,iz) & -!!$ + alpha_T(4) * u%st(ix , iy-1,iz) -!!$ end if -!!$ v%st(ix,iy,iz) = d_k*tmp -!!$ end do -!!$ end do -!!$ end do !----------------------------------------------------------- iib=u%icompx_min iie=u%icompx_max ijb=u%icompy_min ije=u%icompy_max ! + zv_st => v%st + zu_st => u%st + zb_k => vert_coeff%b + zc_k => vert_coeff%c + zd_k => vert_coeff%d + + !$acc kernels iz=1 - b_k = vert_coeff%b(iz) - c_k = vert_coeff%c(iz) - d_k = vert_coeff%d(iz) - v%st(iib:iie,ijb:ije,iz) = d_k* ( (-b_k-c_k)*Tij * u%st(iib:iie,ijb:ije,iz ) & - +b_k *Tij * u%st(iib:iie,ijb:ije,iz+1) ) + zv_st(iib:iie,ijb:ije,iz) = zd_k(iz)* ( (-zb_k(iz)-zc_k(iz))*Tij * zu_st(iib:iie,ijb:ije,iz ) & + +zb_k(iz) *Tij * zu_st(iib:iie,ijb:ije,iz+1) ) ! do iz=2,u%grid_param%nz-1 - b_k = vert_coeff%b(iz) - c_k = vert_coeff%c(iz) - d_k = vert_coeff%d(iz) - v%st(iib:iie,ijb:ije,iz) = d_k* ( ((-b_k-c_k)*Tij - 4.0_rl ) * u%st(iib:iie,ijb:ije,iz) & - +b_k *Tij * u%st(iib:iie,ijb:ije,iz+1) & - +c_k *Tij * u%st(iib:iie,ijb:ije,iz-1) & - + u%st(iib+1:iie+1,ijb:ije,iz) & - + u%st(iib-1:iie-1,ijb:ije,iz) & - + u%st(iib:iie,ijb+1:ije+1,iz) & - + u%st(iib:iie,ijb-1:ije-1,iz) ) + do ij=ijb,ije + do ii=iib,iie + zv_st(ii,ij,iz) = zd_k(iz)* ( ((-zb_k(iz)-zc_k(iz))*Tij - 4.0_rl ) * zu_st(ii,ij,iz) & + +zb_k(iz) *Tij * zu_st(ii,ij,iz+1) & + +zc_k(iz) *Tij * zu_st(ii,ij,iz-1) & + + zu_st(ii+1,ij,iz) & + + zu_st(ii-1,ij,iz) & + + zu_st(ii,ij+1,iz) & + + zu_st(ii,ij-1,iz) & + ) + end do + end do end do ! iz=u%grid_param%nz - b_k = vert_coeff%b(iz) - c_k = vert_coeff%c(iz) - d_k = vert_coeff%d(iz) - v%st(iib:iie,ijb:ije,iz) = d_k* ( (-b_k-c_k)*Tij * u%st(iib:iie,ijb:ije,iz) & - +c_k *Tij * u%st(iib:iie,ijb:ije,iz-1) ) + zv_st(iib:iie,ijb:ije,iz) = zd_k(iz)* ( (-zb_k(iz)-zc_k(iz))*Tij * zu_st(iib:iie,ijb:ije,iz) & + +zc_k(iz) *Tij * zu_st(iib:iie,ijb:ije,iz-1) ) + !$acc end kernels endif end subroutine apply_mnh @@ -1315,6 +1305,8 @@ end subroutine construct_vertical_coeff integer :: iib,iie,ijb,ije type(scalar3d) :: Sr,Sc,Sut0,Sutmp + real , dimension(:,:,:) , pointer :: zSut0_st , zu_st + call boundary_mnh(u) ! Set optimal smoothing parameter on each level @@ -1374,7 +1366,12 @@ end subroutine construct_vertical_coeff 1-halo_size:nlocal+halo_size, & 0:u%grid_param%nz+1) ) if (LUseO) u0(:,:,:) = u%s(:,:,:) - if (LUseT) ut0(:,:,:) = u%st(:,:,:) + if (LUseT) then + zu_st => u%st + !$acc kernels + ut0(:,:,:) = zu_st(:,:,:) + !$acc end kernels + end if ! Create residual vector allocate(r(nz)) ! Allocate memory for auxiliary vectors for Thomas algorithm @@ -1384,16 +1381,21 @@ end subroutine construct_vertical_coeff allocate(Sr%st(1-halo_size:nlocal+halo_size, & 1-halo_size:nlocal+halo_size, & 0:u%grid_param%nz+1) ) -!!$ allocate(Sc%st(1-halo_size:nlocal+halo_size, & -!!$ 1-halo_size:nlocal+halo_size, & -!!$ 0:u%grid_param%nz+1) ) allocate(Sut0%st(1-halo_size:nlocal+halo_size, & 1-halo_size:nlocal+halo_size, & 0:u%grid_param%nz+1) ) - Sut0%st(:,:,:) = u%st(:,:,:) + + zSut0_st => Sut0%st + zu_st => u%st + + !$acc kernels + zSut0_st(:,:,:) = zu_st(:,:,:) + !$acc end kernels + allocate(Sutmp%st(1-halo_size:nlocal+halo_size, & 1-halo_size:nlocal+halo_size, & 0:u%grid_param%nz+1) ) + endif ! Loop over grid @@ -1433,6 +1435,8 @@ end subroutine construct_vertical_coeff implicit none integer, intent(in) :: iblock integer :: ix,iy,iz + + real , dimension(:,:,:) , pointer :: zu_st , zSutmp_st , zSut0_st if (LUseO) then do ix=ixmin(iblock),ixmax(iblock) @@ -1451,30 +1455,23 @@ end subroutine construct_vertical_coeff end do end if if (LUseT) then -!!$ do ix=ixmin(iblock),ixmax(iblock) -!!$ do iy=iymin(iblock),iymax(iblock) -!!$ call apply_tridiag_solve_mnhT(ix,iy,r,c,b, & -!!$ ut0(ix+1,iy ,1:nz), & -!!$ ut0(ix-1,iy ,1:nz), & -!!$ ut0(ix ,iy+1,1:nz), & -!!$ ut0(ix ,iy-1,1:nz), & -!!$ utmp) -!!$ ! Add correction -!!$ do iz=1,nz -!!$ u%st(ix,iy,iz) = rho*utmp(iz) + (1.0_rl-rho)*ut0(ix,iy,iz) -!!$ end do -!!$ end do -!!$ end do iib=ixmin(iblock) iie=ixmax(iblock) ijb=iymin(iblock) ije=iymax(iblock) + + zu_st => u%st + zSutmp_st => Sutmp%st + zSut0_st => Sut0%st + call apply_tridiag_solve_mnh_allT(iib,iie,ijb,ije,Sr,c,b, & Sut0, & Sutmp ) - u%st(iib:iie,ijb:ije,1:nz) = & - rho*Sutmp%st(iib:iie,ijb:ije,1:nz) & - + (1.0_rl-rho)*Sut0%st(iib:iie,ijb:ije,1:nz) + !$acc kernels + zu_st(iib:iie,ijb:ije,1:nz) = & + rho*zSutmp_st(iib:iie,ijb:ije,1:nz) & + + (1.0_rl-rho)*zSut0_st(iib:iie,ijb:ije,1:nz) + !$acc end kernels end if end subroutine loop_over_grid_jacobi_mnh @@ -1794,55 +1791,86 @@ end subroutine line_Jacobi_mnh real(kind=rl) :: alpha_div_Tij, tmp, b_k_tmp, c_k_tmp integer :: iz, nz + real, dimension(:,:,:) , pointer :: zSr_st , zb_st , zSu_in_st , zSu_out_st + real, dimension(:) , pointer :: za_k, zb_k, zc_k, zd_k + integer :: ii,ij + nz = b%grid_param%nz call construct_alpha_T_cst_mnh(b%grid_param,alpha_T,Tij) ! Calculate r_i = b_i - A_{ij} u_i if (LUseT ) then + + zSr_st => Sr%st + zb_st => b%st + zSu_in_st => Su_in%st + zSu_out_st => Su_out%st + za_k => vert_coeff%a + zb_k => vert_coeff%b + zc_k => vert_coeff%c + zd_k => vert_coeff%d + + !$acc kernels iz=1 - Sr%st(iib:iie,ijb:ije,iz) = b%st(iib:iie,ijb:ije,iz) + zSr_st(iib:iie,ijb:ije,iz) = zb_st(iib:iie,ijb:ije,iz) do iz=2,nz-1 - Sr%st(iib:iie,ijb:ije,iz) = b%st(iib:iie,ijb:ije,iz) - vert_coeff%d(iz) * ( & - Su_in%st(iib+1:iie+1,ijb:ije,iz) + & - Su_in%st(iib-1:iie-1,ijb:ije,iz) + & - Su_in%st(iib:iie,ijb+1:ije+1,iz) + & - Su_in%st(iib:iie,ijb-1:ije-1,iz) ) + zSr_st(iib:iie,ijb:ije,iz) = zb_st(iib:iie,ijb:ije,iz) - zd_k(iz) * ( & + zSu_in_st(iib+1:iie+1,ijb:ije,iz) + & + zSu_in_st(iib-1:iie-1,ijb:ije,iz) + & + zSu_in_st(iib:iie,ijb+1:ije+1,iz) + & + zSu_in_st(iib:iie,ijb-1:ije-1,iz) ) end do iz=nz - Sr%st(iib:iie,ijb:ije,iz) = b%st(iib:iie,ijb:ije,iz) + zSr_st(iib:iie,ijb:ije,iz) = zb_st(iib:iie,ijb:ije,iz) ! Thomas algorithm ! STEP 1: Create modified coefficients iz = 1 alpha_div_Tij = alpha_T(5)/Tij - tmp = (vert_coeff%a(iz)-vert_coeff%b(iz)-vert_coeff%c(iz)) + tmp = (za_k(iz)-vert_coeff%b(iz)-vert_coeff%c(iz)) c(iz) = vert_coeff%b(iz)/tmp - Su_out%st(iib:iie,ijb:ije,iz) = Sr%st(iib:iie,ijb:ije,iz) / (tmp*Tij*vert_coeff%d(iz)) + zSu_out_st(iib:iie,ijb:ije,iz) = zSr_st(iib:iie,ijb:ije,iz) / (tmp*Tij*zd_k(iz)) ! + !acc loop seq do iz=2,nz-1 - b_k_tmp = vert_coeff%b(iz) - c_k_tmp = vert_coeff%c(iz) - tmp = ((vert_coeff%a(iz)-b_k_tmp-c_k_tmp)-alpha_div_Tij) & + b_k_tmp = zb_k(iz) + c_k_tmp = zc_k(iz) + tmp = ((za_k(iz)-b_k_tmp-c_k_tmp)-alpha_div_Tij) & - c(iz-1)*c_k_tmp c(iz) = b_k_tmp / tmp - Su_out%st(iib:iie,ijb:ije,iz) = (Sr%st(iib:iie,ijb:ije,iz) / (Tij*vert_coeff%d(iz)) & - - Su_out%st(iib:iie,ijb:ije,iz-1)*c_k_tmp) / tmp + +!!$ zSu_out_st(iib:iie,ijb:ije,iz) = (zSr_st(iib:iie,ijb:ije,iz) / (Tij*zd_k(iz)) & +!!$ - zSu_out_st(iib:iie,ijb:ije,iz-1)*c_k_tmp) / tmp + + !$acc loop independent collapse(2) + do ij=ijb,ije + do ii=iib,iie + zSu_out_st(ii,ij,iz) = (zSr_st(ii,ij,iz) / (Tij*zd_k(iz)) & + - zSu_out_st(ii,ij,iz-1)*c_k_tmp) / tmp + end do + end do +!!$ do concurrent ( ij=ijb:ije , ii=iib:iie ) +!!$ zSu_out_st(ii,ij,iz) = (zSr_st(ii,ij,iz) / (Tij*zd_k(iz)) & +!!$ - zSu_out_st(ii,ij,iz-1)*c_k_tmp) / tmp +!!$ end do + end do ! iz=nz - b_k_tmp = vert_coeff%b(iz) - c_k_tmp = vert_coeff%c(iz) - tmp = ((vert_coeff%a(iz)-b_k_tmp-c_k_tmp)) & + b_k_tmp = zb_k(iz) + c_k_tmp = zc_k(iz) + tmp = ((za_k(iz)-b_k_tmp-c_k_tmp)) & - c(iz-1)*c_k_tmp c(iz) = b_k_tmp / tmp - Su_out%st(iib:iie,ijb:ije,iz) = (Sr%st(iib:iie,ijb:ije,iz) / (Tij*vert_coeff%d(iz)) & - - Su_out%st(iib:iie,ijb:ije,iz-1)*c_k_tmp) / tmp + zSu_out_st(iib:iie,ijb:ije,iz) = (zSr_st(iib:iie,ijb:ije,iz) / (Tij*zd_k(iz)) & + - zSu_out_st(iib:iie,ijb:ije,iz-1)*c_k_tmp) / tmp ! STEP 2: back substitution do iz=nz-1,1,-1 - Su_out%st(iib:iie,ijb:ije,iz) = Su_out%st(iib:iie,ijb:ije,iz) & - - c(iz) * Su_out%st(iib:iie,ijb:ije,iz+1) + zSu_out_st(iib:iie,ijb:ije,iz) = zSu_out_st(iib:iie,ijb:ije,iz) & + - c(iz) * zSu_out_st(iib:iie,ijb:ije,iz+1) end do + !$acc end kernels end if end subroutine apply_tridiag_solve_mnh_allT diff --git a/tensorproductmultigrid_Source/mode_mg.f90 b/tensorproductmultigrid_Source/mode_mg.f90 index c0bf52f57638595a3c2b12c20f7506b71d3186b5..c388cee404dfd606e9a815d13adbd4677cce2ee1 100644 --- a/tensorproductmultigrid_Source/mode_mg.f90 +++ b/tensorproductmultigrid_Source/mode_mg.f90 @@ -310,7 +310,7 @@ implicit none ! Finalise communications ... - call comm_finalise(mg_param%n_lev,mg_param%lev_split) + call comm_finalise(mg_param%n_lev,mg_param%lev_split,grid_param) call finish_timer(t_finalise) call print_timerinfo("# --- Main timing results ---") call print_elapsed(t_readparam,.true.,1.0_rl) diff --git a/tensorproductmultigrid_Source/multigrid.f90 b/tensorproductmultigrid_Source/multigrid.f90 index a96e7a1362ee8bce2e62668068d6cf662aee57ab..2a594d28ef2e0d7ab82d2fb10ff8b8e955a1ffa7 100644 --- a/tensorproductmultigrid_Source/multigrid.f90 +++ b/tensorproductmultigrid_Source/multigrid.f90 @@ -452,10 +452,13 @@ contains implicit none type(scalar3d), intent(in) :: phifine type(scalar3d), intent(inout) :: phicoarse + ! local var integer :: ix,iy,iz integer :: ix_min, ix_max, iy_min, iy_max, n real(kind=rl) :: xn,xs,xw,xe + real , dimension(:,:,:) , pointer :: zphifine_st , zphicoarse_st + n = phicoarse%grid_param%n ix_min = phicoarse%icompx_min ix_max = phicoarse%icompx_max @@ -478,17 +481,21 @@ contains end do endif if (LUseT) then + zphifine_st => phifine%st + zphicoarse_st => phicoarse%st + !$acc kernels loop independent collapse(3) do iz=1,phicoarse%grid_param%nz do iy=iy_min,iy_max do ix=ix_min,ix_max - phicoarse%st(ix,iy,iz) = & - phifine%st(2*ix ,2*iy ,iz) + & - phifine%st(2*ix ,2*iy-1,iz) + & - phifine%st(2*ix-1,2*iy ,iz) + & - phifine%st(2*ix-1,2*iy-1,iz) + zphicoarse_st(ix,iy,iz) = & + zphifine_st(2*ix ,2*iy ,iz) + & + zphifine_st(2*ix ,2*iy-1,iz) + & + zphifine_st(2*ix-1,2*iy ,iz) + & + zphifine_st(2*ix-1,2*iy-1,iz) end do end do end do + !$acc end kernels endif elseif(mg_param%restriction == REST_KHALIL) then @@ -769,6 +776,8 @@ contains !local var integer :: ix,iy,iz + real , dimension(:,:,:) , pointer :: zphifine_st , zphicoarse_st + ! optimisation for newman MNH case : all coef constant rhox = 0.25_rl rhoy = 0.25_rl @@ -794,22 +803,28 @@ contains end if if (LUseT) then ! Piecewise linear interpolation + + zphifine_st => phifine%st + zphicoarse_st => phicoarse%st + + !$acc kernels loop independent collapse(5) do iz=1,phicoarse%grid_param%nz do diy = -1,0 do dix = -1,0 do iy=iymin(iblock),iymax(iblock) do ix=ixmin(iblock),ixmax(iblock) - phifine%st(2*ix+dix,2*iy+diy,iz) = & - phicoarse%st(ix,iy,iz) + & - rhox*(phicoarse%st(ix+(2*dix+1),iy,iz) & - - phicoarse%st(ix,iy,iz)) + & - rhoy*(phicoarse%st(ix,iy+(2*diy+1),iz) & - - phicoarse%st(ix,iy,iz)) + zphifine_st(2*ix+dix,2*iy+diy,iz) = & + zphicoarse_st(ix,iy,iz) + & + rhox*(zphicoarse_st(ix+(2*dix+1),iy,iz) & + - zphicoarse_st(ix,iy,iz)) + & + rhoy*(zphicoarse_st(ix,iy+(2*diy+1),iz) & + - zphicoarse_st(ix,iy,iz)) end do end do end do end do end do + !$acc end kernels end if end subroutine loop_over_grid_linear_mnh @@ -1159,10 +1174,13 @@ contains integer, intent(in) :: splitlevel integer, intent(in) :: level integer, intent(in) :: m + !local var integer :: n_gridpoints integer :: nlocalx, nlocaly integer :: halo_size + real , dimension(:,:,:) , pointer :: zu_level_1_m_st + nlocalx = u(level,m)%ix_max-u(level,m)%ix_min+1 nlocaly = u(level,m)%iy_max-u(level,m)%iy_min+1 halo_size = u(level,m)%halo_size @@ -1212,7 +1230,12 @@ contains else ! Set initial solution on coarser level to zero (incl. halos!) if (LUseO) u(level-1,m)%s(:,:,:) = 0.0_rl - if (LUseT) u(level-1,m)%st(:,:,:) = 0.0_rl + if (LUseT) then + zu_level_1_m_st => u(level-1,m)%st(:,:,:) + !$acc kernels + zu_level_1_m_st(:,:,:) = 0.0_rl + !$acc end kernels + end if ! solve on coarser grid call mg_vcycle_mnh(b,u,r,finelevel,splitlevel,level-1,m) end if @@ -1433,7 +1456,7 @@ contains type(time) :: t_prec, t_res, t_apply, t_l2norm, t_scalprod, t_mainloop type(scalar3d) :: pp type(scalar3d) :: q - type(scalar3d) :: zone + type(scalar3d) :: z_one real(kind=rl) :: alpha, beta, pq, rz, rz_old integer :: ierr @@ -1460,27 +1483,27 @@ contains * (nlocaly+2*halo_size) & * (usolution%grid_param%nz+2) ! - ! Init 1 vector = zone - call create_scalar3d(MPI_COMM_HORIZ,bRHS%grid_param,halo_size,zone) + ! Init 1 vector = z_one + call create_scalar3d(MPI_COMM_HORIZ,bRHS%grid_param,halo_size,z_one) if (LUseO) then - zone%s(:,:,:) = 0.0_rl - zone%s(1:zone%grid_param%nz,1:zone%icompy_max,1:zone%icompx_max) = 1.0_rl + z_one%s(:,:,:) = 0.0_rl + z_one%s(1:z_one%grid_param%nz,1:z_one%icompy_max,1:z_one%icompx_max) = 1.0_rl end if if (LUseT) then - zone%st(:,:,:) = 0.0_rl - zone%st(1:zone%icompx_max,1:zone%icompy_max,1:zone%grid_param%nz) = 1.0_rl + z_one%st(:,:,:) = 0.0_rl + z_one%st(1:z_one%icompx_max,1:z_one%icompy_max,1:z_one%grid_param%nz) = 1.0_rl end if ! Mean / Norm of B - call scalarprod_mnh(pproc,zone,zone, mean_initial ) - call scalarprod_mnh(pproc,zone,bRHS, mean_initial ) + call scalarprod_mnh(pproc,z_one,z_one, mean_initial ) + call scalarprod_mnh(pproc,z_one,bRHS, mean_initial ) norm_initial = l2norm_mnh(bRHS,.true.) - mean_initial = mean_initial / (( zone%grid_param%nz ) * ( zone%grid_param%n )**2) + mean_initial = mean_initial / (( z_one%grid_param%nz ) * ( z_one%grid_param%n )**2) norm_initial = mean_initial / norm_initial if (LMean) then - ! b <- b -mean_initial * zone - if (LUseO) call daxpy(n_gridpoints,-mean_initial,zone%s,1,bRHS%s,1) - if (LUseT) call daxpy(n_gridpoints,-mean_initial,zone%st,1,bRHS%st,1) - call scalarprod_mnh(pproc,zone,bRHS, mean_initial ) + ! b <- b -mean_initial * z_one + if (LUseO) call daxpy(n_gridpoints,-mean_initial,z_one%s,1,bRHS%s,1) + if (LUseT) call daxpy(n_gridpoints,-mean_initial,z_one%st,1,bRHS%st,1) + call scalarprod_mnh(pproc,z_one,bRHS, mean_initial ) endif ! ! Copy b to b(1) @@ -1492,7 +1515,7 @@ contains ! Scale with volume of grid cells call volscale_scalar3d_mnh(xb_mg(level,pproc),1) - call scalarprod_mnh(pproc,zone,xb_mg(level,pproc), mean_initial ) + call scalarprod_mnh(pproc,z_one,xb_mg(level,pproc), mean_initial ) call start_timer(t_res) call calculate_residual_mnh(level, pproc, & @@ -1624,6 +1647,9 @@ contains if (LUseO) call dcopy(n_gridpoints,xu_mg(level,pproc)%s,1,usolution%s,1) if (LUseT) call dcopy(n_gridpoints,xu_mg(level,pproc)%st,1,usolution%st,1) end if + + call destroy_scalar3d(z_one) + call finish_timer(t_mainloop) ! Print out solver information