From 26c43f9c96d021fdfeb71e17f233b677a8de4265 Mon Sep 17 00:00:00 2001 From: Juan Escobar <juan.escobar@aero.obs-mip.fr> Date: Tue, 3 Aug 2021 11:43:16 +0200 Subject: [PATCH] Juan 03/08/2021: discretisation.f90 , replace all temp array by preallocated save/on_type one --- .../discretisation.f90 | 259 +++++++++++++----- 1 file changed, 184 insertions(+), 75 deletions(-) diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90 index 9fa41dd27..9e5ca7427 100644 --- a/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90 +++ b/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90 @@ -1304,12 +1304,8 @@ end subroutine construct_vertical_coeff integer, intent(in) :: m type(scalar3d), intent(in) :: b type(scalar3d), intent(inout) :: u - real(kind=rl), allocatable :: r(:) integer :: ix,iy,iz, nz real(kind=rl), dimension(5) :: alpha_T - real(kind=rl), allocatable :: c(:), utmp(:) - real(kind=rl), allocatable :: u0(:,:,:) - real(kind=rl), allocatable :: ut0(:,:,:) integer :: nlocal, halo_size real(kind=rl) :: rho logical :: overlap_comms @@ -1320,20 +1316,86 @@ end subroutine construct_vertical_coeff integer :: iblock, ierr integer :: iib,iie,ijb,ije - type(scalar3d) :: Sr,Sc,Sut0,Sutmp + + real , dimension(:,:,:) , pointer , contiguous :: zSut0_st , zu_st - real , dimension(:,:,:) , pointer :: zSut0_st , zu_st + integer , parameter :: max_lev = 128 + logical , save :: Ljacob_first_call = .true. + logical , save , dimension(max_lev) :: Ljacob_first_call_level = .true. - call boundary_mnh(u) + real(kind=rl), pointer , contiguous :: r(:) + real(kind=rl), pointer , contiguous :: c(:), utmp(:) + real(kind=rl), pointer , contiguous :: u0(:,:,:) + real(kind=rl), pointer , contiguous :: ut0(:,:,:) + type(scalar3d) , pointer :: Sr,Sc,Sut0,Sutmp - ! Set optimal smoothing parameter on each level - !rho = 2.0_rl/(2.0_rl+4.0_rl*model_param%omega2*u%grid_param%n**2/(1.0_rl+4.0_rl*model_param%omega2*u%grid_param%n**2)) - rho = smoother_param%rho + type Temp_jacobi + real(kind=rl), pointer , contiguous :: r(:) + real(kind=rl), pointer , contiguous :: c(:), utmp(:) + real(kind=rl), pointer , contiguous :: u0(:,:,:) + real(kind=rl), pointer , contiguous :: ut0(:,:,:) + type(scalar3d) , pointer :: Sr,Sc,Sut0,Sutmp + end type Temp_jacobi + type (Temp_jacobi) , save , dimension(max_lev) :: Tjacobi + + ! + ! init size , param + ! nz = u%grid_param%nz nlocal = u%ix_max -u%ix_min + 1 halo_size = u%halo_size + ! Set optimal smoothing parameter on each level + !rho = 2.0_rl/(2.0_rl+4.0_rl*model_param%omega2*u%grid_param%n**2/(1.0_rl+4.0_rl*model_param%omega2*u%grid_param%n**2)) + rho = smoother_param%rho + + ! Allocate data one for all by level + if (Ljacob_first_call_level(level)) then + Ljacob_first_call_level(level) = .false. + if (LUseO) then + allocate(Tjacobi(level)%u0(0:u%grid_param%nz+1, & + 1-halo_size:nlocal+halo_size, & + 1-halo_size:nlocal+halo_size) ) + !$acc enter data create(Tjacobi(level)%u0) + end if + if (LUseT) then + allocate(Tjacobi(level)%ut0(1-halo_size:nlocal+halo_size, & + 1-halo_size:nlocal+halo_size, & + 0:u%grid_param%nz+1) ) + !$acc enter data create(Tjacobi(level)%ut0) + end if + ! Create residual vector + allocate(Tjacobi(level)%r(nz)) + !$acc enter data create(Tjacobi(level)%r) + ! Allocate memory for auxiliary vectors for Thomas algorithm + allocate(Tjacobi(level)%c(nz)) + !$acc enter data create(Tjacobi(level)%c) + allocate(Tjacobi(level)%utmp(nz)) + !$acc enter data create(Tjacobi(level)%utmp) + if (LUseT) then + allocate(Tjacobi(level)%Sr) + allocate(Tjacobi(level)%Sr%st(1-halo_size:nlocal+halo_size, & + 1-halo_size:nlocal+halo_size, & + 0:u%grid_param%nz+1) ) + !$acc enter data create(Tjacobi(level)%Sr%st) + allocate(Tjacobi(level)%Sut0) + allocate(Tjacobi(level)%Sut0%st(1-halo_size:nlocal+halo_size, & + 1-halo_size:nlocal+halo_size, & + 0:u%grid_param%nz+1) ) + !$acc enter data create(Tjacobi(level)%Sut0%st) + allocate(Tjacobi(level)%Sutmp) + allocate(Tjacobi(level)%Sutmp%st(1-halo_size:nlocal+halo_size, & + 1-halo_size:nlocal+halo_size, & + 0:u%grid_param%nz+1) ) + !$acc enter data create(Tjacobi(level)%Sutmp%st) + end if + end if + + call boundary_mnh(u) + + + #ifdef OVERLAPCOMMS overlap_comms = (nlocal > 2) #else @@ -1375,13 +1437,19 @@ end subroutine construct_vertical_coeff iymax(5) = nlocal end if - ! Temporary vector - if (LUseO) allocate(u0(0:u%grid_param%nz+1, & - 1-halo_size:nlocal+halo_size, & - 1-halo_size:nlocal+halo_size) ) - if (LUseT) allocate(ut0(1-halo_size:nlocal+halo_size, & - 1-halo_size:nlocal+halo_size, & - 0:u%grid_param%nz+1) ) + ! Temporary vector + if (LUseO) then +!!$ allocate(u0(0:u%grid_param%nz+1, & +!!$ 1-halo_size:nlocal+halo_size, & +!!$ 1-halo_size:nlocal+halo_size) ) + u0 => Tjacobi(level)%u0 + end if + if (LUseT) then +!!$ allocate(ut0(1-halo_size:nlocal+halo_size, & +!!$ 1-halo_size:nlocal+halo_size, & +!!$ 0:u%grid_param%nz+1) ) + ut0 => Tjacobi(level)%ut0 + end if if (LUseO) u0(:,:,:) = u%s(:,:,:) if (LUseT) then zu_st => u%st @@ -1390,18 +1458,23 @@ end subroutine construct_vertical_coeff !$acc end kernels end if ! Create residual vector - allocate(r(nz)) +!!$ allocate(r(nz)) + r => Tjacobi(level)%r ! Allocate memory for auxiliary vectors for Thomas algorithm - allocate(c(nz)) - allocate(utmp(nz)) +!!$ allocate(c(nz)) + c => Tjacobi(level)%c +!!$ allocate(utmp(nz)) + utmp => Tjacobi(level)%utmp if (LUseT) then - allocate(Sr%st(1-halo_size:nlocal+halo_size, & - 1-halo_size:nlocal+halo_size, & - 0:u%grid_param%nz+1) ) - allocate(Sut0%st(1-halo_size:nlocal+halo_size, & - 1-halo_size:nlocal+halo_size, & - 0:u%grid_param%nz+1) ) - +!!$ allocate(Sr%st(1-halo_size:nlocal+halo_size, & +!!$ 1-halo_size:nlocal+halo_size, & +!!$ 0:u%grid_param%nz+1) ) + Sr => Tjacobi(level)%Sr +!!$ allocate(Sut0%st(1-halo_size:nlocal+halo_size, & +!!$ 1-halo_size:nlocal+halo_size, & +!!$ 0:u%grid_param%nz+1) ) + Sut0 => Tjacobi(level)%Sut0 + zSut0_st => Sut0%st zu_st => u%st @@ -1409,9 +1482,10 @@ end subroutine construct_vertical_coeff zSut0_st(:,:,:) = zu_st(:,:,:) !$acc end kernels - allocate(Sutmp%st(1-halo_size:nlocal+halo_size, & - 1-halo_size:nlocal+halo_size, & - 0:u%grid_param%nz+1) ) +!!$ allocate(Sutmp%st(1-halo_size:nlocal+halo_size, & +!!$ 1-halo_size:nlocal+halo_size, & +!!$ 0:u%grid_param%nz+1) ) + Sutmp => Tjacobi(level)%Sutmp endif @@ -1439,12 +1513,12 @@ end subroutine construct_vertical_coeff end if ! Free memory again - deallocate(r) - deallocate(c) - if (LUseO) deallocate(u0) - if (LUseT) deallocate(ut0) - deallocate(utmp) - if (LUseT) deallocate(Sr%st,Sut0%st,Sutmp%st) +!!$ deallocate(r) +!!$ deallocate(c) +!!$ if (LUseO) deallocate(u0) +!!$ if (LUseT) deallocate(ut0) +!!$ deallocate(utmp) +!!$ if (LUseT) deallocate(Sr%st,Sut0%st,Sutmp%st) contains @@ -1483,11 +1557,14 @@ end subroutine construct_vertical_coeff call apply_tridiag_solve_mnh_allT(iib,iie,ijb,ije,Sr,c,b, & Sut0, & - Sutmp ) + Sutmp,level ) !$acc kernels - zu_st(iib:iie,ijb:ije,1:nz) = & - rho*zSutmp_st(iib:iie,ijb:ije,1:nz) & - + (1.0_rl-rho)*zSut0_st(iib:iie,ijb:ije,1:nz) + !$acc loop independent collapse(3) + do concurrent ( ix=iib:iie,iy=ijb:ije,iz=1:nz ) + zu_st(ix,iy,iz) = & + rho*zSutmp_st(ix,iy,iz) & + + (1.0_rl-rho)*zSut0_st(ix,iy,iz) + end do ! concurrent !$acc end kernels end if @@ -1792,7 +1869,7 @@ end subroutine line_Jacobi_mnh ! tranpose version all xyz !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! subroutine apply_tridiag_solve_mnh_allT(iib,iie,ijb,ije, & - Sr,c,b,Su_in,Su_out) + Sr,c,b,Su_in,Su_out,level) implicit none integer, intent(in) :: iib,iie,ijb,ije @@ -1801,23 +1878,40 @@ end subroutine line_Jacobi_mnh type(scalar3d), intent(in) :: b type(scalar3d), intent(in) :: Su_in type(scalar3d), intent(inout) :: Su_out + integer, intent(in) :: level !local - real(kind=rl), dimension(5) :: alpha_T + !real(kind=rl), dimension(5) :: alpha_T real(kind=rl) :: Tij - real(kind=rl) :: alpha_div_Tij, tmp, b_k_tmp, c_k_tmp + real(kind=rl) :: alpha_div_Tij ! b_k_tmp, c_k_tmp integer :: iz, nz - real, dimension(:,:,:) , pointer :: zSr_st , zb_st , zSu_in_st , zSu_out_st - real, dimension(:) , pointer :: za_k, zb_k, zc_k, zd_k , tmp_k + real, dimension(:,:,:) , pointer, contiguous :: zSr_st , zb_st , zSu_in_st , zSu_out_st + real, dimension(:) , pointer, contiguous :: za_k, zb_k, zc_k, zd_k integer :: ii,ij - nz = b%grid_param%nz + integer , parameter :: max_lev = 128 + logical , save :: Lfirst_call_tridiag_mnhallT = .true. + logical , save , dimension(max_lev) :: Lfirst_call_level_tridiag_mnhallT = .true. - call construct_alpha_T_cst_mnh(b%grid_param,alpha_T,Tij) - ! Calculate r_i = b_i - A_{ij} u_i - if (LUseT ) then + real, dimension(:), pointer, contiguous :: tmp_k,c_k + + type Temp_tridiag_mnh + real, dimension(:), pointer, contiguous :: tmp_k,c_k + end type Temp_tridiag_mnh + + type(Temp_tridiag_mnh) , save , dimension(max_lev) :: Ttridiag_mnh + if (LUseT ) then + + nz = b%grid_param%nz + + !call construct_alpha_T_cst_mnh(b%grid_param,alpha_T,Tij) + Tij = ( b%grid_param%L/b%grid_param%n ) ** 2 + alpha_div_Tij = 4.0_rl / Tij + !print*,"level=",level," Tij=",Tij," alpha_div_Tij=",alpha_div_Tij + ! Calculate r_i = b_i - A_{ij} u_i + zSr_st => Sr%st zb_st => b%st zSu_in_st => Su_in%st @@ -1826,8 +1920,38 @@ end subroutine line_Jacobi_mnh zb_k => vert_coeff%b zc_k => vert_coeff%c zd_k => vert_coeff%d - allocate(tmp_k(size(zb_k))) + if ( Lfirst_call_level_tridiag_mnhallT(level) ) then + Lfirst_call_level_tridiag_mnhallT(level) = .false. + allocate(Ttridiag_mnh(level)%tmp_k(size(zb_k))) + allocate(Ttridiag_mnh(level)%c_k(size(zb_k))) + + tmp_k => Ttridiag_mnh(level)%tmp_k + c_k => Ttridiag_mnh(level)%c_k + ! Thomas algorithm + ! STEP 1: Create modified coefficients + iz = 1 + tmp_k(iz) = (za_k(iz)-zb_k(iz)-zc_k(iz)) + c_k(iz) = zb_k(iz)/tmp_k(iz) + ! + do iz=2,nz-1 + tmp_k(iz) = ((za_k(iz)-zb_k(iz)-zc_k(iz))-alpha_div_Tij) & + - c_k(iz-1)*zc_k(iz) + c_k(iz) = zb_k(iz) / tmp_k(iz) + end do + ! + iz=nz + tmp_k(iz) = ((za_k(iz)-zb_k(iz)-zc_k(iz))) & + - c_k(iz-1)*zc_k(iz) + c_k(iz) = zb_k(iz) / tmp_k(iz) + + !$acc enter data copyin(tmp_k,c_k) + + endif + + tmp_k => Ttridiag_mnh(level)%tmp_k + c_k => Ttridiag_mnh(level)%c_k + !$acc kernels iz=1 zSr_st(iib:iie,ijb:ije,iz) = zb_st(iib:iie,ijb:ije,iz) @@ -1840,50 +1964,35 @@ end subroutine line_Jacobi_mnh end do iz=nz zSr_st(iib:iie,ijb:ije,iz) = zb_st(iib:iie,ijb:ije,iz) - + ! ! Thomas algorithm - ! STEP 1: Create modified coefficients - iz = 1 - alpha_div_Tij = alpha_T(5)/Tij - tmp = (za_k(iz)-vert_coeff%b(iz)-vert_coeff%c(iz)) - c(iz) = vert_coeff%b(iz)/tmp - zSu_out_st(iib:iie,ijb:ije,iz) = zSr_st(iib:iie,ijb:ije,iz) / (tmp*Tij*zd_k(iz)) + ! + iz = 1 + zSu_out_st(iib:iie,ijb:ije,iz) = zSr_st(iib:iie,ijb:ije,iz) / (tmp_k(iz)*Tij*zd_k(iz)) ! - do iz=2,nz-1 - b_k_tmp = zb_k(iz) - c_k_tmp = zc_k(iz) - tmp_k(iz) = ((za_k(iz)-b_k_tmp-c_k_tmp)-alpha_div_Tij) & - - c(iz-1)*c_k_tmp - c(iz) = b_k_tmp / tmp_k(iz) - end do + !$acc loop seq do iz=2,nz-1 !$acc loop independent collapse(2) do ij=ijb,ije do ii=iib,iie zSu_out_st(ii,ij,iz) = (zSr_st(ii,ij,iz) / (Tij*zd_k(iz)) & - - zSu_out_st(ii,ij,iz-1)*c_k_tmp) / tmp_k(iz) + - zSu_out_st(ii,ij,iz-1)*zc_k(iz)) / tmp_k(iz) end do end do end do ! iz=nz - b_k_tmp = zb_k(iz) - c_k_tmp = zc_k(iz) - tmp = ((za_k(iz)-b_k_tmp-c_k_tmp)) & - - c(iz-1)*c_k_tmp - c(iz) = b_k_tmp / tmp zSu_out_st(iib:iie,ijb:ije,iz) = (zSr_st(iib:iie,ijb:ije,iz) / (Tij*zd_k(iz)) & - - zSu_out_st(iib:iie,ijb:ije,iz-1)*c_k_tmp) / tmp + - zSu_out_st(iib:iie,ijb:ije,iz-1)*zc_k(iz)) / tmp_k(iz) ! STEP 2: back substitution + !$acc loop seq do iz=nz-1,1,-1 zSu_out_st(iib:iie,ijb:ije,iz) = zSu_out_st(iib:iie,ijb:ije,iz) & - - c(iz) * zSu_out_st(iib:iie,ijb:ije,iz+1) + - c_k(iz) * zSu_out_st(iib:iie,ijb:ije,iz+1) end do !$acc end kernels - - deallocate(tmp_k) - + end if end subroutine apply_tridiag_solve_mnh_allT -- GitLab