From f5613bcccf551bc890241d2b45db0980d46dbd00 Mon Sep 17 00:00:00 2001
From: Juan Escobar <escj@aero.obs-mip.fr>
Date: Tue, 16 Nov 2021 12:27:48 +0100
Subject: [PATCH] Juan 16/11/2021:ZSOLVER, add routine
 zt1d_discretisation_allocate3d , for fast GPU memory allocation in MutiGrid

---
 .../discretisation.f90                        | 212 +++++++++++++-----
 .../mg_main_mnh.f90                           |   4 +-
 2 files changed, 162 insertions(+), 54 deletions(-)

diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90
index 1bd6d51d8..b81b9dc95 100644
--- a/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90
+++ b/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90
@@ -135,6 +135,29 @@ private
   ! Stoarge for vertical coefficients
   type(vertical_coefficients) :: vert_coeff
 
+  type Temp_jacobi
+     real(kind=rl), pointer , contiguous :: r(:)
+     real(kind=rl), pointer , contiguous :: c(:), utmp(:)
+     real(kind=rl), pointer , contiguous :: u0(:,:,:) 
+     real(kind=rl), pointer , contiguous :: ut0(:,:,:)
+     type(scalar3d) , pointer            :: Sr,Sc,Sut0,Sutmp
+  end type Temp_jacobi
+
+  integer , parameter :: max_lev = 128
+  type (Temp_jacobi) , save , dimension(max_lev) :: Tjacobi
+
+  real(kind=rl), pointer , contiguous , dimension(:) :: zt1d_discretisation
+  INTEGER                                            :: zt1d_discretisation_size = 10000000
+  INTEGER , parameter                                :: zt1d_discretisation_size_factor = 2 * 4
+  INTEGER                                            :: nt1d_discretisation_top = 0 , nt1d_discretisation_max = 0
+  
+  INTEGER , ALLOCATABLE, DIMENSION (:)               :: nt1d_discretisation_pointer , nt1d_discretisation_pointer_ksize
+  INTEGER , parameter                                :: zt1d_discretisation_pointer_size = 128
+  INTEGER                                            :: nt1d_discretisation_pointer_top = 0 , nt1d_discretisation_pointer_max = 0
+  
+  logical                                            :: gfirst_call_zt1d_discretisation = .true.
+  
+
 public::discretisation_initialise_mnh
 public::discretisation_initialise
 public::discretisation_finalise
@@ -158,12 +181,91 @@ public::DIRECTION_FORWARD
 public::DIRECTION_BACKWARD
 public::ORDERING_LEX
 public::ORDERING_RB
+public::zt1d_discretisation_init
 
 contains
 
 !==================================================================
 ! Initialise module
 !==================================================================
+
+  subroutine zt1d_discretisation_init(kiu,kju,kku)
+    implicit none
+
+    integer , optional , intent(in) :: kiu,kju,kku ! size of 1 3D MNH array
+
+    if (gfirst_call_zt1d_discretisation) then
+       !
+       gfirst_call_zt1d_discretisation = .false.
+       !
+       if (present(kiu)) then
+          zt1d_discretisation_size = kiu*kju*kku * zt1d_discretisation_size_factor
+       end if
+       allocate(zt1d_discretisation(zt1d_discretisation_size))
+       zt1d_discretisation = 0.0
+       !$acc enter data copyin(zt1d_discretisation)
+       !
+       allocate(nt1d_discretisation_pointer(zt1d_discretisation_pointer_size))
+       nt1d_discretisation_pointer = 0.0
+       allocate(nt1d_discretisation_pointer_ksize(zt1d_discretisation_pointer_size))
+       nt1d_discretisation_pointer_ksize = 0.0
+       !
+    end if
+    
+  end subroutine zt1d_discretisation_init
+
+  function zt1d_discretisation_allocate3d(ptab3d,kib,kie,kjb,kje,kkb,kke) result(kindex)
+    implicit none
+    real(kind=rl), pointer , contiguous , dimension(:,:,:) :: ptab3d
+    !real(kind=rl), pointer , dimension(:,:,:) :: ptab3d
+    integer , intent(in)                                   :: kib,kie,kjb,kje,kkb,kke
+    integer                                                :: kindex
+
+    ! local var
+
+    integer :: ksize, kbeg, kend
+
+    if ( nt1d_discretisation_pointer_top == zt1d_discretisation_pointer_size ) then
+       print*,"ERROR zt1d_discretisation_init:: zt1d_discretisation_pointer_size to small=",zt1d_discretisation_pointer_size
+       print*,"ERROR zt1d_discretisation_init:: Augment it "
+       stop 
+    end if
+
+    ksize = (kie-kib+1)*(kje-kjb+1)*(kke-kkb+1)
+
+    kbeg = nt1d_discretisation_top + 1
+    kend  = nt1d_discretisation_top + ksize
+
+    if ( kend > zt1d_discretisation_size ) then
+       print*,"ERROR zt1d_discretisation_allocate3d:: zt1d_discretisation_size to small=",zt1d_discretisation_size
+       print*,"ERROR zt1d_discretisation_allocate3d:: Augment it "
+       stop       
+    end if
+
+    ptab3d(kib:kie,kjb:kje,kkb:kke) => zt1d_discretisation(kbeg:)
+
+    nt1d_discretisation_pointer_top = nt1d_discretisation_pointer_top + 1
+    nt1d_discretisation_top = kend
+    
+    nt1d_discretisation_pointer(nt1d_discretisation_pointer_top)       = kend
+    nt1d_discretisation_pointer_ksize(nt1d_discretisation_pointer_top) = ksize
+
+    if (  nt1d_discretisation_pointer_top > nt1d_discretisation_pointer_max ) then
+       nt1d_discretisation_pointer_max =  nt1d_discretisation_pointer_top
+       print*,"zt1d_discretisation_allocate3d:: nt1d_discretisation_pointer_max=",nt1d_discretisation_pointer_max
+       call flush(6)
+    endif
+    
+    if (  nt1d_discretisation_top > nt1d_discretisation_max ) then
+       nt1d_discretisation_max =  nt1d_discretisation_top
+       print*,"zt1d_discretisation_allocate3d:: nt1d_discretisation_max=",nt1d_discretisation_max
+       call flush(6)
+    endif
+            
+  end function zt1d_discretisation_allocate3d
+
+  
+  
  subroutine discretisation_initialise_mnh(grid_param_in, &
                                        model_param_in, &
                                        smoother_param_in, &
@@ -1326,9 +1428,8 @@ end subroutine construct_vertical_coeff
 
     integer :: iib,iie,ijb,ije
     
-    real , dimension(:,:,:) , pointer , contiguous :: zSut0_st , zu_st
+    real , dimension(:,:,:) , pointer , contiguous :: zSut0_st , zu_st, zSr_st, zSutmp_st
 
-    integer , parameter :: max_lev = 128
     logical , save                      :: Ljacob_first_call = .true.
     logical , save , dimension(max_lev) :: Ljacob_first_call_level = .true.
 
@@ -1338,15 +1439,9 @@ end subroutine construct_vertical_coeff
     real(kind=rl), pointer , contiguous :: ut0(:,:,:)
     type(scalar3d) , pointer :: Sr,Sc,Sut0,Sutmp
 
-    type Temp_jacobi
-       real(kind=rl), pointer , contiguous :: r(:)
-       real(kind=rl), pointer , contiguous :: c(:), utmp(:)
-       real(kind=rl), pointer , contiguous :: u0(:,:,:) 
-       real(kind=rl), pointer , contiguous :: ut0(:,:,:)
-       type(scalar3d) , pointer            :: Sr,Sc,Sut0,Sutmp
-    end type Temp_jacobi
-
-    type (Temp_jacobi) , save , dimension(max_lev) :: Tjacobi
+    integer :: nib,nie,njb,nje,nkb,nke
+    integer :: ii,ij,ik
+    integer :: iindex
 
     !
     !  init size , param
@@ -1354,7 +1449,14 @@ end subroutine construct_vertical_coeff
     nz = u%grid_param%nz
     nlocal = u%ix_max -u%ix_min + 1
     halo_size = u%halo_size
-
+ 
+    nib = 1-halo_size
+    nie = nlocal+halo_size
+    njb = 1-halo_size
+    nje = nlocal+halo_size
+    nkb = 0
+    nke = nz + 1
+    
     ! Set optimal smoothing parameter on each level
     !rho = 2.0_rl/(2.0_rl+4.0_rl*model_param%omega2*u%grid_param%n**2/(1.0_rl+4.0_rl*model_param%omega2*u%grid_param%n**2))
     rho = smoother_param%rho
@@ -1369,42 +1471,62 @@ end subroutine construct_vertical_coeff
           !$acc enter data create(Tjacobi(level)%u0)
        end if
        if (LUseT) then
-          allocate(Tjacobi(level)%ut0(1-halo_size:nlocal+halo_size,   &
-               1-halo_size:nlocal+halo_size,   &
-               0:u%grid_param%nz+1) )
-          !$acc enter data create(Tjacobi(level)%ut0)
+          print*,"allocate(ut0),nib,nie,njb,nje,nkb,nke,level",nib,nie,njb,nje,nkb,nke, level
+!!$          allocate(Tjacobi(level)%ut0(1-halo_size:nlocal+halo_size,   &
+!!$               1-halo_size:nlocal+halo_size,   &
+!!$               0:u%grid_param%nz+1) )
+!!$          ut0 => Tjacobi(level)%ut0
+
+!!$          allocate(ut0(nib:nie,njb:nje,nkb:nke) )
+!!$          !$acc enter data create(ut0)          
+          iindex = zt1d_discretisation_allocate3d(ut0,nib,nie,njb,nje,nkb,nke)
+          
+          Tjacobi(level)%ut0 => ut0
+          
        end if
        ! Create residual vector       
        allocate(Tjacobi(level)%r(nz))
-       !$acc enter data create(Tjacobi(level)%r)
+       r => Tjacobi(level)%r
+       !$acc enter data create(r)
        ! Allocate memory for auxiliary vectors for Thomas algorithm      
        allocate(Tjacobi(level)%c(nz))
-       !$acc enter data create(Tjacobi(level)%c)
+       c => Tjacobi(level)%c
+       !$acc enter data create(c)
        allocate(Tjacobi(level)%utmp(nz))
-       !$acc enter data create(Tjacobi(level)%utmp)
+       utmp => Tjacobi(level)%utmp
+       !$acc enter data create(utmp)
        if (LUseT) then
           allocate(Tjacobi(level)%Sr)
-          allocate(Tjacobi(level)%Sr%st(1-halo_size:nlocal+halo_size,   &
-               1-halo_size:nlocal+halo_size,   &
-               0:u%grid_param%nz+1) )
-          !$acc enter data create(Tjacobi(level)%Sr%st)
+!!$          allocate(Tjacobi(level)%Sr%st(1-halo_size:nlocal+halo_size,   &
+!!$               1-halo_size:nlocal+halo_size,   &
+!!$               0:u%grid_param%nz+1) )
+!!$          zSr_st => Tjacobi(level)%Sr%st
+!!$          !$acc enter data create(zSr_st)
+          iindex = zt1d_discretisation_allocate3d(zSr_st,nib,nie,njb,nje,nkb,nke)
+          Tjacobi(level)%Sr%st => zSr_st
+          
           allocate(Tjacobi(level)%Sut0)
-          allocate(Tjacobi(level)%Sut0%st(1-halo_size:nlocal+halo_size,   &
-               1-halo_size:nlocal+halo_size,   &
-               0:u%grid_param%nz+1) )
-          !$acc enter data create(Tjacobi(level)%Sut0%st)
+!!$          allocate(Tjacobi(level)%Sut0%st(1-halo_size:nlocal+halo_size,   &
+!!$               1-halo_size:nlocal+halo_size,   &
+!!$               0:u%grid_param%nz+1) )
+!!$          zSut0_st => Tjacobi(level)%Sut0%st
+!!$          !$acc enter data create(zSut0_st)
+          iindex = zt1d_discretisation_allocate3d(zSut0_st,nib,nie,njb,nje,nkb,nke)
+          Tjacobi(level)%Sut0%st => zSut0_st
+          
           allocate(Tjacobi(level)%Sutmp)
-          allocate(Tjacobi(level)%Sutmp%st(1-halo_size:nlocal+halo_size,   &
-                            1-halo_size:nlocal+halo_size,   &
-                            0:u%grid_param%nz+1) )
-          !$acc enter data create(Tjacobi(level)%Sutmp%st)
+!!$          allocate(Tjacobi(level)%Sutmp%st(1-halo_size:nlocal+halo_size,   &
+!!$                            1-halo_size:nlocal+halo_size,   &
+!!$                            0:u%grid_param%nz+1) )
+!!$          zSutmp_st => Tjacobi(level)%Sutmp%st
+!!$          !$acc enter data create(zSutmp_st)
+          iindex = zt1d_discretisation_allocate3d(zSutmp_st,nib,nie,njb,nje,nkb,nke)
+          Tjacobi(level)%Sutmp%st => zSutmp_st
        end if
     end if
 
     call boundary_mnh(u)
 
-
-
 #ifdef OVERLAPCOMMS
     overlap_comms = (nlocal > 2)
 #else
@@ -1448,40 +1570,27 @@ end subroutine construct_vertical_coeff
 
     ! Temporary vector 
     if (LUseO) then
-!!$       allocate(u0(0:u%grid_param%nz+1,            &
-!!$                1-halo_size:nlocal+halo_size,   &
-!!$                1-halo_size:nlocal+halo_size) )
        u0 => Tjacobi(level)%u0
     end if
     if (LUseT) then
-!!$       allocate(ut0(1-halo_size:nlocal+halo_size,   &
-!!$                            1-halo_size:nlocal+halo_size,   &
-!!$                            0:u%grid_param%nz+1) )
        ut0 => Tjacobi(level)%ut0 
     end if
     if (LUseO) u0(:,:,:) = u%s(:,:,:)
     if (LUseT) then
        zu_st =>  u%st
-       !$acc kernels
-       ut0(:,:,:) = zu_st(:,:,:)
+       !$acc kernels loop independent collapse(3)
+       do concurrent (ii=nib:nie,ij=njb:nje,ik=nkb:nke)
+          ut0(ii,ij,ik) = zu_st(ii,ij,ik)
+       end do
        !$acc end kernels
     end if
     ! Create residual vector
-!!$    allocate(r(nz))
     r => Tjacobi(level)%r
     ! Allocate memory for auxiliary vectors for Thomas algorithm
-!!$    allocate(c(nz))
     c => Tjacobi(level)%c    
-!!$    allocate(utmp(nz))
     utmp => Tjacobi(level)%utmp
     if (LUseT) then
-!!$       allocate(Sr%st(1-halo_size:nlocal+halo_size,   &
-!!$                            1-halo_size:nlocal+halo_size,   &
-!!$                            0:u%grid_param%nz+1) )
        Sr => Tjacobi(level)%Sr
-!!$       allocate(Sut0%st(1-halo_size:nlocal+halo_size,   &
-!!$                            1-halo_size:nlocal+halo_size,   &
-!!$                            0:u%grid_param%nz+1) )
        Sut0 => Tjacobi(level)%Sut0
        
        zSut0_st => Sut0%st
@@ -1491,9 +1600,6 @@ end subroutine construct_vertical_coeff
        zSut0_st(:,:,:) = zu_st(:,:,:)
        !$acc end kernels
 
-!!$       allocate(Sutmp%st(1-halo_size:nlocal+halo_size,   &
-!!$                            1-halo_size:nlocal+halo_size,   &
-!!$                            0:u%grid_param%nz+1) )
        Sutmp => Tjacobi(level)%Sutmp
        
     endif
diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/mg_main_mnh.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/mg_main_mnh.f90
index 219993168..e20657cfe 100644
--- a/src/ZSOLVER/tensorproductmultigrid_Source/mg_main_mnh.f90
+++ b/src/ZSOLVER/tensorproductmultigrid_Source/mg_main_mnh.f90
@@ -73,6 +73,7 @@ contains
     real(kind=rl) , optional , intent (in) :: PL,PH
     real(kind=rl) , optional , intent (in) :: PA_K(:),PB_K(:),PC_K(:),PD_K(:)
 
+
     call mg_init_mnh(KN,KNZ,PL,PH,PA_K,PB_K,PC_K,PD_K)
 
     ! Initialise ghosts in initial solution, as mg_solve assumes that they
@@ -113,6 +114,7 @@ implicit none
 integer       , optional , intent(in) :: KIB,KIE,KIU,KJB,KJE,KJU,KKU
 real(kind=rl) , optional , intent(in) :: PY(:,:,:)
 
+ call zt1d_discretisation_init(KIU,KJU,KKU)
  call initialise_rhs_mnh(grid_param,model_param,xb_fine,KIB,KIE,KIU,KJB,KJE,KJU,KKU,PY)
  call haloswap_mnh(mg_param%n_lev,pproc,xb_fine)
 
@@ -179,7 +181,7 @@ end subroutine mg_main_get_u_mnh
        call mg_main_initialise_u_mnh()
 
        if (i_am_master_mpi) then
-          write(STDOUT,*),'************************ IT=',it,' ***************************'
+          write(STDOUT,*) '************************ IT=',it,' ***************************'
           call flush(STDOUT)
        end if
        
-- 
GitLab