From 26c43f9c96d021fdfeb71e17f233b677a8de4265 Mon Sep 17 00:00:00 2001
From: Juan Escobar <juan.escobar@aero.obs-mip.fr>
Date: Tue, 3 Aug 2021 11:43:16 +0200
Subject: [PATCH] Juan 03/08/2021: discretisation.f90 , replace all temp array
 by preallocated save/on_type one

---
 .../discretisation.f90                        | 259 +++++++++++++-----
 1 file changed, 184 insertions(+), 75 deletions(-)

diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90
index 9fa41dd27..9e5ca7427 100644
--- a/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90
+++ b/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90
@@ -1304,12 +1304,8 @@ end subroutine construct_vertical_coeff
     integer, intent(in)                :: m
     type(scalar3d), intent(in)         :: b
     type(scalar3d), intent(inout)      :: u
-    real(kind=rl), allocatable :: r(:)
     integer :: ix,iy,iz, nz
     real(kind=rl), dimension(5) :: alpha_T
-    real(kind=rl), allocatable :: c(:), utmp(:)
-    real(kind=rl), allocatable :: u0(:,:,:) 
-    real(kind=rl), allocatable :: ut0(:,:,:) 
     integer :: nlocal, halo_size
     real(kind=rl) :: rho
     logical :: overlap_comms
@@ -1320,20 +1316,86 @@ end subroutine construct_vertical_coeff
     integer :: iblock, ierr
 
     integer :: iib,iie,ijb,ije
-    type(scalar3d) :: Sr,Sc,Sut0,Sutmp
+    
+    real , dimension(:,:,:) , pointer , contiguous :: zSut0_st , zu_st
 
-    real , dimension(:,:,:) , pointer :: zSut0_st , zu_st
+    integer , parameter :: max_lev = 128
+    logical , save                      :: Ljacob_first_call = .true.
+    logical , save , dimension(max_lev) :: Ljacob_first_call_level = .true.
 
-    call boundary_mnh(u)
+    real(kind=rl), pointer , contiguous :: r(:)
+    real(kind=rl), pointer , contiguous :: c(:), utmp(:)
+    real(kind=rl), pointer , contiguous :: u0(:,:,:) 
+    real(kind=rl), pointer , contiguous :: ut0(:,:,:)
+    type(scalar3d) , pointer :: Sr,Sc,Sut0,Sutmp
 
-    ! Set optimal smoothing parameter on each level
-    !rho = 2.0_rl/(2.0_rl+4.0_rl*model_param%omega2*u%grid_param%n**2/(1.0_rl+4.0_rl*model_param%omega2*u%grid_param%n**2))
-    rho = smoother_param%rho
+    type Temp_jacobi
+       real(kind=rl), pointer , contiguous :: r(:)
+       real(kind=rl), pointer , contiguous :: c(:), utmp(:)
+       real(kind=rl), pointer , contiguous :: u0(:,:,:) 
+       real(kind=rl), pointer , contiguous :: ut0(:,:,:)
+       type(scalar3d) , pointer            :: Sr,Sc,Sut0,Sutmp
+    end type Temp_jacobi
 
+    type (Temp_jacobi) , save , dimension(max_lev) :: Tjacobi
+
+    !
+    !  init size , param
+    !
     nz = u%grid_param%nz
     nlocal = u%ix_max -u%ix_min + 1
     halo_size = u%halo_size
 
+    ! Set optimal smoothing parameter on each level
+    !rho = 2.0_rl/(2.0_rl+4.0_rl*model_param%omega2*u%grid_param%n**2/(1.0_rl+4.0_rl*model_param%omega2*u%grid_param%n**2))
+    rho = smoother_param%rho
+    
+    ! Allocate data one for all by level
+    if (Ljacob_first_call_level(level)) then
+       Ljacob_first_call_level(level) = .false.
+       if (LUseO) then
+          allocate(Tjacobi(level)%u0(0:u%grid_param%nz+1,            &
+               1-halo_size:nlocal+halo_size,   &
+               1-halo_size:nlocal+halo_size) )
+          !$acc enter data create(Tjacobi(level)%u0)
+       end if
+       if (LUseT) then
+          allocate(Tjacobi(level)%ut0(1-halo_size:nlocal+halo_size,   &
+               1-halo_size:nlocal+halo_size,   &
+               0:u%grid_param%nz+1) )
+          !$acc enter data create(Tjacobi(level)%ut0)
+       end if
+       ! Create residual vector       
+       allocate(Tjacobi(level)%r(nz))
+       !$acc enter data create(Tjacobi(level)%r)
+       ! Allocate memory for auxiliary vectors for Thomas algorithm      
+       allocate(Tjacobi(level)%c(nz))
+       !$acc enter data create(Tjacobi(level)%c)
+       allocate(Tjacobi(level)%utmp(nz))
+       !$acc enter data create(Tjacobi(level)%utmp)
+       if (LUseT) then
+          allocate(Tjacobi(level)%Sr)
+          allocate(Tjacobi(level)%Sr%st(1-halo_size:nlocal+halo_size,   &
+               1-halo_size:nlocal+halo_size,   &
+               0:u%grid_param%nz+1) )
+          !$acc enter data create(Tjacobi(level)%Sr%st)
+          allocate(Tjacobi(level)%Sut0)
+          allocate(Tjacobi(level)%Sut0%st(1-halo_size:nlocal+halo_size,   &
+               1-halo_size:nlocal+halo_size,   &
+               0:u%grid_param%nz+1) )
+          !$acc enter data create(Tjacobi(level)%Sut0%st)
+          allocate(Tjacobi(level)%Sutmp)
+          allocate(Tjacobi(level)%Sutmp%st(1-halo_size:nlocal+halo_size,   &
+                            1-halo_size:nlocal+halo_size,   &
+                            0:u%grid_param%nz+1) )
+          !$acc enter data create(Tjacobi(level)%Sutmp%st)
+       end if
+    end if
+
+    call boundary_mnh(u)
+
+
+
 #ifdef OVERLAPCOMMS
     overlap_comms = (nlocal > 2)
 #else
@@ -1375,13 +1437,19 @@ end subroutine construct_vertical_coeff
       iymax(5) = nlocal
     end if
 
-    ! Temporary vector
-    if (LUseO) allocate(u0(0:u%grid_param%nz+1,            &
-                1-halo_size:nlocal+halo_size,   &
-                1-halo_size:nlocal+halo_size) )
-    if (LUseT) allocate(ut0(1-halo_size:nlocal+halo_size,   &
-                            1-halo_size:nlocal+halo_size,   &
-                            0:u%grid_param%nz+1) )
+    ! Temporary vector 
+    if (LUseO) then
+!!$       allocate(u0(0:u%grid_param%nz+1,            &
+!!$                1-halo_size:nlocal+halo_size,   &
+!!$                1-halo_size:nlocal+halo_size) )
+       u0 => Tjacobi(level)%u0
+    end if
+    if (LUseT) then
+!!$       allocate(ut0(1-halo_size:nlocal+halo_size,   &
+!!$                            1-halo_size:nlocal+halo_size,   &
+!!$                            0:u%grid_param%nz+1) )
+       ut0 => Tjacobi(level)%ut0 
+    end if
     if (LUseO) u0(:,:,:) = u%s(:,:,:)
     if (LUseT) then
        zu_st =>  u%st
@@ -1390,18 +1458,23 @@ end subroutine construct_vertical_coeff
        !$acc end kernels
     end if
     ! Create residual vector
-    allocate(r(nz))
+!!$    allocate(r(nz))
+    r => Tjacobi(level)%r
     ! Allocate memory for auxiliary vectors for Thomas algorithm
-    allocate(c(nz))
-    allocate(utmp(nz))
+!!$    allocate(c(nz))
+    c => Tjacobi(level)%c    
+!!$    allocate(utmp(nz))
+    utmp => Tjacobi(level)%utmp
     if (LUseT) then
-       allocate(Sr%st(1-halo_size:nlocal+halo_size,   &
-                            1-halo_size:nlocal+halo_size,   &
-                            0:u%grid_param%nz+1) )
-       allocate(Sut0%st(1-halo_size:nlocal+halo_size,   &
-                            1-halo_size:nlocal+halo_size,   &
-                            0:u%grid_param%nz+1) )
-
+!!$       allocate(Sr%st(1-halo_size:nlocal+halo_size,   &
+!!$                            1-halo_size:nlocal+halo_size,   &
+!!$                            0:u%grid_param%nz+1) )
+       Sr => Tjacobi(level)%Sr
+!!$       allocate(Sut0%st(1-halo_size:nlocal+halo_size,   &
+!!$                            1-halo_size:nlocal+halo_size,   &
+!!$                            0:u%grid_param%nz+1) )
+       Sut0 => Tjacobi(level)%Sut0
+       
        zSut0_st => Sut0%st
        zu_st => u%st
 
@@ -1409,9 +1482,10 @@ end subroutine construct_vertical_coeff
        zSut0_st(:,:,:) = zu_st(:,:,:)
        !$acc end kernels
 
-       allocate(Sutmp%st(1-halo_size:nlocal+halo_size,   &
-                            1-halo_size:nlocal+halo_size,   &
-                            0:u%grid_param%nz+1) )
+!!$       allocate(Sutmp%st(1-halo_size:nlocal+halo_size,   &
+!!$                            1-halo_size:nlocal+halo_size,   &
+!!$                            0:u%grid_param%nz+1) )
+       Sutmp => Tjacobi(level)%Sutmp
        
     endif
 
@@ -1439,12 +1513,12 @@ end subroutine construct_vertical_coeff
     end if
 
     ! Free memory again
-    deallocate(r)
-    deallocate(c)
-    if (LUseO) deallocate(u0)
-    if (LUseT) deallocate(ut0)
-    deallocate(utmp)
-    if (LUseT) deallocate(Sr%st,Sut0%st,Sutmp%st)
+!!$    deallocate(r)
+!!$    deallocate(c)
+!!$    if (LUseO) deallocate(u0)
+!!$    if (LUseT) deallocate(ut0)
+!!$    deallocate(utmp)
+!!$    if (LUseT) deallocate(Sr%st,Sut0%st,Sutmp%st)
 
   contains
 
@@ -1483,11 +1557,14 @@ end subroutine construct_vertical_coeff
 
        call apply_tridiag_solve_mnh_allT(iib,iie,ijb,ije,Sr,c,b,        &
                   Sut0, &
-                  Sutmp )
+                  Sutmp,level )
        !$acc kernels
-       zu_st(iib:iie,ijb:ije,1:nz) = & 
-                         rho*zSutmp_st(iib:iie,ijb:ije,1:nz) & 
-                       + (1.0_rl-rho)*zSut0_st(iib:iie,ijb:ije,1:nz)
+       !$acc loop independent collapse(3)
+       do concurrent ( ix=iib:iie,iy=ijb:ije,iz=1:nz )
+          zu_st(ix,iy,iz) = & 
+               rho*zSutmp_st(ix,iy,iz) & 
+               + (1.0_rl-rho)*zSut0_st(ix,iy,iz)
+       end do ! concurrent
        !$acc end kernels
     end if
 
@@ -1792,7 +1869,7 @@ end subroutine line_Jacobi_mnh
 ! tranpose version all xyz
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   subroutine apply_tridiag_solve_mnh_allT(iib,iie,ijb,ije, &
-                                 Sr,c,b,Su_in,Su_out)
+                                 Sr,c,b,Su_in,Su_out,level)
 
     implicit none
     integer, intent(in) :: iib,iie,ijb,ije
@@ -1801,23 +1878,40 @@ end subroutine line_Jacobi_mnh
     type(scalar3d), intent(in)    :: b
     type(scalar3d), intent(in)    :: Su_in
     type(scalar3d), intent(inout) :: Su_out
+    integer, intent(in) :: level
 
     !local 
-    real(kind=rl), dimension(5) :: alpha_T
+    !real(kind=rl), dimension(5) :: alpha_T
     real(kind=rl) :: Tij
-    real(kind=rl) :: alpha_div_Tij, tmp, b_k_tmp, c_k_tmp
+    real(kind=rl) :: alpha_div_Tij ! b_k_tmp, c_k_tmp
     integer :: iz, nz
 
-    real, dimension(:,:,:) , pointer :: zSr_st , zb_st , zSu_in_st , zSu_out_st
-    real, dimension(:)     , pointer :: za_k, zb_k, zc_k, zd_k , tmp_k
+    real, dimension(:,:,:) , pointer, contiguous :: zSr_st , zb_st , zSu_in_st , zSu_out_st
+    real, dimension(:)     , pointer, contiguous :: za_k, zb_k, zc_k, zd_k 
     integer :: ii,ij
 
-    nz = b%grid_param%nz
+    integer , parameter :: max_lev = 128
+    logical , save                                 :: Lfirst_call_tridiag_mnhallT = .true.
+    logical , save , dimension(max_lev)            :: Lfirst_call_level_tridiag_mnhallT = .true.
 
-    call construct_alpha_T_cst_mnh(b%grid_param,alpha_T,Tij)
-    ! Calculate r_i = b_i - A_{ij} u_i
-    if (LUseT ) then 
+    real, dimension(:), pointer, contiguous :: tmp_k,c_k
+    
+    type Temp_tridiag_mnh
+       real, dimension(:), pointer, contiguous :: tmp_k,c_k
+    end type Temp_tridiag_mnh
+
+    type(Temp_tridiag_mnh) , save , dimension(max_lev) :: Ttridiag_mnh
 
+    if (LUseT ) then
+       
+       nz = b%grid_param%nz
+       
+       !call construct_alpha_T_cst_mnh(b%grid_param,alpha_T,Tij)
+       Tij = ( b%grid_param%L/b%grid_param%n ) ** 2
+       alpha_div_Tij = 4.0_rl / Tij
+       !print*,"level=",level," Tij=",Tij," alpha_div_Tij=",alpha_div_Tij
+       ! Calculate r_i = b_i - A_{ij} u_i
+    
       zSr_st => Sr%st
       zb_st  => b%st
       zSu_in_st => Su_in%st
@@ -1826,8 +1920,38 @@ end subroutine line_Jacobi_mnh
       zb_k  => vert_coeff%b
       zc_k  => vert_coeff%c
       zd_k  => vert_coeff%d
-      allocate(tmp_k(size(zb_k)))
 
+      if ( Lfirst_call_level_tridiag_mnhallT(level) ) then
+         Lfirst_call_level_tridiag_mnhallT(level) = .false.
+         allocate(Ttridiag_mnh(level)%tmp_k(size(zb_k)))
+         allocate(Ttridiag_mnh(level)%c_k(size(zb_k)))
+         
+         tmp_k => Ttridiag_mnh(level)%tmp_k
+         c_k => Ttridiag_mnh(level)%c_k
+         ! Thomas algorithm
+         ! STEP 1: Create modified coefficients
+         iz = 1
+         tmp_k(iz) = (za_k(iz)-zb_k(iz)-zc_k(iz))
+         c_k(iz) = zb_k(iz)/tmp_k(iz)
+         !
+         do iz=2,nz-1
+            tmp_k(iz) = ((za_k(iz)-zb_k(iz)-zc_k(iz))-alpha_div_Tij) &
+                 - c_k(iz-1)*zc_k(iz)
+            c_k(iz) = zb_k(iz) / tmp_k(iz)
+         end do
+         !
+         iz=nz
+         tmp_k(iz) = ((za_k(iz)-zb_k(iz)-zc_k(iz))) &
+              - c_k(iz-1)*zc_k(iz)
+         c_k(iz) = zb_k(iz) / tmp_k(iz)
+
+         !$acc enter data copyin(tmp_k,c_k)
+         
+      endif
+      
+      tmp_k => Ttridiag_mnh(level)%tmp_k
+      c_k => Ttridiag_mnh(level)%c_k
+         
       !$acc kernels
        iz=1 
        zSr_st(iib:iie,ijb:ije,iz) = zb_st(iib:iie,ijb:ije,iz)
@@ -1840,50 +1964,35 @@ end subroutine line_Jacobi_mnh
        end do
        iz=nz
        zSr_st(iib:iie,ijb:ije,iz) = zb_st(iib:iie,ijb:ije,iz)
-       
+       !
        ! Thomas algorithm
-       ! STEP 1: Create modified coefficients
-       iz = 1
-       alpha_div_Tij = alpha_T(5)/Tij
-       tmp = (za_k(iz)-vert_coeff%b(iz)-vert_coeff%c(iz))
-       c(iz) = vert_coeff%b(iz)/tmp
-       zSu_out_st(iib:iie,ijb:ije,iz) = zSr_st(iib:iie,ijb:ije,iz) / (tmp*Tij*zd_k(iz))
+       ! 
+       iz = 1     
+       zSu_out_st(iib:iie,ijb:ije,iz) = zSr_st(iib:iie,ijb:ije,iz) / (tmp_k(iz)*Tij*zd_k(iz))
        !
-       do iz=2,nz-1
-          b_k_tmp = zb_k(iz)
-          c_k_tmp = zc_k(iz)
-          tmp_k(iz) = ((za_k(iz)-b_k_tmp-c_k_tmp)-alpha_div_Tij) &
-               - c(iz-1)*c_k_tmp
-          c(iz) = b_k_tmp / tmp_k(iz)
-       end do
+       !$acc loop seq
        do iz=2,nz-1
           !$acc loop independent collapse(2)
           do ij=ijb,ije
              do ii=iib,iie
                 zSu_out_st(ii,ij,iz) = (zSr_st(ii,ij,iz) / (Tij*zd_k(iz)) & 
-                                     - zSu_out_st(ii,ij,iz-1)*c_k_tmp) / tmp_k(iz)
+                                     - zSu_out_st(ii,ij,iz-1)*zc_k(iz)) / tmp_k(iz)
              end do
           end do          
        end do
        !
        iz=nz
-       b_k_tmp = zb_k(iz)
-       c_k_tmp = zc_k(iz)
-       tmp = ((za_k(iz)-b_k_tmp-c_k_tmp)) &
-            - c(iz-1)*c_k_tmp
-       c(iz) = b_k_tmp / tmp
        zSu_out_st(iib:iie,ijb:ije,iz) = (zSr_st(iib:iie,ijb:ije,iz) / (Tij*zd_k(iz)) &
-                                     -  zSu_out_st(iib:iie,ijb:ije,iz-1)*c_k_tmp) / tmp
+                                     -  zSu_out_st(iib:iie,ijb:ije,iz-1)*zc_k(iz)) / tmp_k(iz)
        
        ! STEP 2: back substitution
+       !$acc loop seq
        do iz=nz-1,1,-1
           zSu_out_st(iib:iie,ijb:ije,iz) = zSu_out_st(iib:iie,ijb:ije,iz) & 
-                                        - c(iz) * zSu_out_st(iib:iie,ijb:ije,iz+1)
+                                        - c_k(iz) * zSu_out_st(iib:iie,ijb:ije,iz+1)
        end do
        !$acc end kernels
-
-       deallocate(tmp_k)
-
+       
     end if
 
   end subroutine apply_tridiag_solve_mnh_allT
-- 
GitLab