discretisation.f90

!=== COPYRIGHT AND LICENSE STATEMENT ===
!
!  This file is part of the TensorProductMultigrid code.
!  
!  (c) The copyright relating to this work is owned jointly by the
!  Crown, Met Office and NERC [2014]. However, it has been created
!  with the help of the GungHo Consortium, whose members are identified
!  at https://puma.nerc.ac.uk/trac/GungHo/wiki .
!  
!  Main Developer: Eike Mueller
!  
!  TensorProductMultigrid is free software: you can redistribute it and/or
!  modify it under the terms of the GNU Lesser General Public License as
!  published by the Free Software Foundation, either version 3 of the
!  License, or (at your option) any later version.
!  
!  TensorProductMultigrid is distributed in the hope that it will be useful,
!  but WITHOUT ANY WARRANTY; without even the implied warranty of
!  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!  GNU Lesser General Public License for more details.
!  
!  You should have received a copy of the GNU Lesser General Public License
!  along with TensorProductMultigrid (see files COPYING and COPYING.LESSER).
!  If not, see <http://www.gnu.org/licenses/>.
!
!=== COPYRIGHT AND LICENSE STATEMENT ===


!==================================================================
!
!  Discretisation module of the model problem
!
!
!  -omega2 * (d^2/dx^2 + d^2/dy^2 + lambda2 * d^2/dz^2 ) u
!                                                + delta u = RHS
!  [Cartesian]
!
!  or
!
!  -omega2 * (laplace_{2d} + lambda2/r^2 d/dr (r^2 d/dr)) u
!                                                + delta u = RHS
!  [Spherical]
!
!  We use a cell centered finite volume discretisation with
!  The equation is discretised either in a unit cube or on 1/6th
!  of a cubed sphere grid.
!
!  The vertical grid spacing is not necessarily uniform and can
!  be chosen by specifying the vertical grid in a vector.
!
!  The following boundary conditions are used:
!
!  * Dirichlet in the horizontal
!  * Neumann in the vertical
!
!  For delta = 0 the operator reduces to the Poisson operator.
!
!    Eike Mueller, University of Bath, Feb 2012
!
!==================================================================

module discretisation

  use parameters
  use messages
  use datatypes
  use communication
  use mpi

  implicit none

private

  type model_parameters
    real(kind=rl) :: omega2   ! omega^2
    real(kind=rl) :: lambda2  ! lambda^2
    real(kind=rl) :: delta    ! delta
  end type model_parameters

! --- Stencil ---
!

! Grid traversal direction in SOR
  integer, parameter :: DIRECTION_FORWARD = 1
  integer, parameter :: DIRECTION_BACKWARD = 2

! Ordering in SOR
  ! Lexicographic ordering
  integer, parameter :: ORDERING_LEX = 1
  ! Red-black ordering
  integer, parameter :: ORDERING_RB = 2

  type smoother_parameters
    ! smoother
    integer :: smoother
    ! relaxation parameter
    real(kind=rl) :: rho
    ! ordering of degrees of freedom
    integer :: ordering
  end type smoother_parameters

  ! Allowed smoothers
  integer, parameter :: SMOOTHER_LINE_SOR = 3
  integer, parameter :: SMOOTHER_LINE_SSOR = 4
  integer, parameter :: SMOOTHER_LINE_JAC = 6

  ! Number of levels
  integer :: nlev

  ! Grid parameters
  type(grid_parameters) :: grid_param

  ! Model parameters
  type(model_parameters) :: model_param

  ! Smoother parameters
  type(smoother_parameters) :: smoother_param

  ! Arrays for measuring the residual reduction
  real(kind=rl), allocatable :: log_resreduction(:)
  integer, allocatable :: nsmooth_total(:)

  ! Data structure for storing the vertical discretisation
  type vertical_coefficients
    real(kind=rl), allocatable :: a(:)
    real(kind=rl), allocatable :: b(:)
    real(kind=rl), allocatable :: c(:)
    real(kind=rl), allocatable :: d(:)
  end type vertical_coefficients

  ! Stoarge for vertical coefficients
  type(vertical_coefficients) :: vert_coeff

public::discretisation_initialise
public::discretisation_finalise
public::smooth
public::line_SOR
public::line_SSOR
public::line_jacobi
public::calculate_residual
public::apply
public::model_parameters
public::smoother_parameters
public::volume_of_element
public::SMOOTHER_LINE_SOR
public::SMOOTHER_LINE_SSOR
public::SMOOTHER_LINE_JAC
public::DIRECTION_FORWARD
public::DIRECTION_BACKWARD
public::ORDERING_LEX
public::ORDERING_RB

contains

!==================================================================
! Initialise module
!==================================================================
  subroutine discretisation_initialise(grid_param_in, &
                                       model_param_in, &
                                       smoother_param_in, &
                                       nlev_in)
    implicit none
    type(grid_parameters), intent(in)   :: grid_param_in
    type(model_parameters), intent(in)   :: model_param_in
    type(smoother_parameters), intent(in)   :: smoother_param_in
    integer, intent(in) :: nlev_in
    integer :: k
    grid_param = grid_param_in
    model_param = model_param_in
    smoother_param = smoother_param_in
    nlev = nlev_in
    allocate(log_resreduction(nlev))
    allocate(nsmooth_total(nlev))
    log_resreduction(:) = 0.0_rl
    nsmooth_total(:) = 0
    allocate(r_grid(grid_param%nz+1))
    if (grid_param%graded) then
      do k=1,grid_param%nz+1
        r_grid(k) = grid_param%H*(1.0_rl*(k-1.0_rl)/grid_param%nz)**2
      end do
    else
      do k=1,grid_param%nz+1
        r_grid(k) = grid_param%H*(1.0_rl*(k-1.0_rl)/grid_param%nz)
      end do
    end if
#ifdef CARTESIANGEOMETRY
#else
    r_grid(:) = 1.0_rl + r_grid(:)
#endif
    ! Allocate arrays for vertical discretisation matrices
    ! and calculate matrix entries
    allocate(vert_coeff%a(grid_param%nz))
    allocate(vert_coeff%b(grid_param%nz))
    allocate(vert_coeff%c(grid_param%nz))
    allocate(vert_coeff%d(grid_param%nz))
    call construct_vertical_coeff()
  end subroutine discretisation_initialise

!==================================================================
! Finalise module
!==================================================================
  subroutine discretisation_finalise()
    implicit none
    integer :: level
    real(kind=rl) :: rho_avg
#ifdef MEASURESMOOTHINGRATE
    if (i_am_master_mpi) then
      write(STDOUT,'("Average smoothing rates:")')
      do level=nlev,1,-1
        if (nsmooth_total(level) > 0) then
          rho_avg = exp(log_resreduction(level)/nsmooth_total(level))
        else
          rho_avg = 1.0_rl
        end if
        write(STDOUT,'("rho_{avg}(",I3,") = ",E10.4," ( ",I5," x )")') &
          level, rho_avg, nsmooth_total(level)
      end do
    end if
#endif
    deallocate(log_resreduction)
    deallocate(nsmooth_total)
    deallocate(r_grid)
    ! Deallocate storage for vertical discretisation matrices
    deallocate(vert_coeff%a)
    deallocate(vert_coeff%b)
    deallocate(vert_coeff%c)
    deallocate(vert_coeff%d)
  end subroutine discretisation_finalise

!==================================================================
! Construct alpha_{i',j'} and |T_{ij}| needed for the
! horizontal stencil
! ( alpha_{i+1,j},
!   alpha_{i-1,j},
!   alpha_{i,j+1},
!   alpha_{i,j-1},
!   alpha_{ij})
! (ix,iy) are LOCAL indices of the grid boxes, which are
! converted to global indices
!==================================================================
  subroutine construct_alpha_T(grid_param,ix,iy,alpha_T,Tij)
    implicit none
    type(grid_parameters), intent(in) :: grid_param
    integer, intent(in) :: ix
    integer, intent(in) :: iy
    real(kind=rl), intent(inout), dimension(5) :: alpha_T
    real(kind=rl), intent(out) :: Tij
    real(kind=rl)                  :: h, rho_i, sigma_j
#ifdef CARTESIANGEOMETRY
    h = grid_param%L/grid_param%n
    ! Cartesian coefficients
    Tij = h**2
    if (ix == grid_param%n) then
      alpha_T(1) = 2.0_rl
    else
      alpha_T(1) = 1.0_rl
    end if
    if (ix == 1) then
      alpha_T(2) = 2.0_rl
    else
      alpha_T(2) = 1.0_rl
    end if
    if (iy == grid_param%n) then
      alpha_T(3) = 2.0_rl
    else
      alpha_T(3) = 1.0_rl
    end if
    if (iy == 1) then
      alpha_T(4) = 2.0_rl
    else
      alpha_T(4) = 1.0_rl
    end if
#else
    ! Coefficients in cubed sphere geometry
    ! (rho_i,sigma_j) \in [-1,1] x [-1,1] are the coordinates of the
    ! cubed sphere segment
    h = 2.0_rl/grid_param%n
    Tij = volume_of_element(ix,iy,grid_param)
    rho_i = 2.0_rl*(1.0_rl*ix-0.5_rl)/grid_param%n-1.0_rl
    sigma_j = 2.0_rl*(1.0_rl*iy-0.5_rl)/grid_param%n-1.0_rl
    ! alpha_{i+1,j}
    if (ix == grid_param%n) then
      alpha_T(1) = 2.0_rl*DSQRT((1.0_rl+(rho_i+0.25_rl*h)**2)/(1.0_rl+sigma_j**2))
    else
      alpha_T(1) = DSQRT((1.0_rl+(rho_i+0.5_rl*h)**2)/(1.0_rl+sigma_j**2))
    end if
    ! alpha_{i-1,j}
    if (ix == 1) then
      alpha_T(2) = 2.0_rl*DSQRT((1.0_rl+(rho_i-0.25_rl*h)**2)/(1.0_rl+sigma_j**2))
    else
      alpha_T(2) = DSQRT((1.0_rl+(rho_i-0.5_rl*h)**2)/(1.0_rl+sigma_j**2))
    end if
    ! alpha_{i,j+1}
    if (iy == grid_param%n) then
      alpha_T(3) = 2.0_rl*DSQRT((1.0_rl+(sigma_j+0.25_rl*h)**2)/(1.0_rl+rho_i**2))
    else
      alpha_T(3) = DSQRT((1.0_rl+(sigma_j+0.5_rl*h)**2)/(1.0_rl+rho_i**2))
    end if
    ! alpha_{i,j-1}
    if (iy == 1) then
      alpha_T(4) = 2.0_rl*DSQRT((1.0_rl+(sigma_j-0.25_rl*h)**2)/(1.0_rl+rho_i**2))
    else
      alpha_T(4) = DSQRT((1.0_rl+(sigma_j-0.5_rl*h)**2)/(1.0_rl+rho_i**2))
    end if
#endif
    alpha_T(5) = alpha_T(1) + alpha_T(2) + alpha_T(3) + alpha_T(4)
  end subroutine construct_alpha_T

!==================================================================
! Construct coefficients of tridiagonal matrix A_T
! describing the coupling in the vertical direction and the
! diagonal matrix diag(d)
!==================================================================
subroutine construct_vertical_coeff()
  implicit none
  real(kind=rl) :: a_k_tmp, b_k_tmp, c_k_tmp, d_k_tmp
  real(kind=rl) :: omega2, lambda2, delta, vol_r, surface_k, surface_kp1
  integer :: k
  omega2 = model_param%omega2
  lambda2 = model_param%lambda2
  delta = model_param%delta
  do k = 1, grid_param%nz
#ifdef  CARTESIANGEOMETRY
    vol_r = r_grid(k+1)-r_grid(k)
    surface_k = 1.0_rl
    surface_kp1 = 1.0_rl
#else
    vol_r = (r_grid(k+1)**3 - r_grid(k)**3)/3.0_rl
    surface_k = r_grid(k)**2
    surface_kp1 = r_grid(k+1)**2
#endif
    ! Diagonal element
    a_k_tmp = delta*vol_r
    ! off diagonal elements
    ! Boundary conditions
    ! Top
    if (k == grid_param%nz) then
      if (grid_param%vertbc == VERTBC_DIRICHLET) then
        b_k_tmp = - 2.0_rl * omega2*lambda2 &
                           * surface_kp1/(r_grid(k+1)-r_grid(k))
      else
        b_k_tmp = 0.0_rl
      end if
    else
      b_k_tmp = - 2.0_rl*omega2*lambda2 &
              * surface_kp1/(r_grid(k+2)-r_grid(k))
    end if
    ! Bottom
    if (k == 1) then
      if (grid_param%vertbc == VERTBC_DIRICHLET) then
        c_k_tmp = - 2.0_rl * omega2*lambda2 &
                           * surface_k/(r_grid(k+1)-r_grid(k))
      else
        c_k_tmp = 0.0_rl
      end if
    else
      c_k_tmp = - 2.0_rl * omega2 * lambda2 &
              * surface_k/(r_grid(k+1)-r_grid(k-1))
    end if
    ! Diagonal matrix d_k
    d_k_tmp = - omega2 * (r_grid(k+1)-r_grid(k))
    vert_coeff%a(k) = a_k_tmp/d_k_tmp
    vert_coeff%b(k) = b_k_tmp/d_k_tmp
    vert_coeff%c(k) = c_k_tmp/d_k_tmp
    vert_coeff%d(k) = d_k_tmp
  end do
end subroutine construct_vertical_coeff

!==================================================================
! Calculate local residual r = b - A.u
!==================================================================
  subroutine calculate_residual(level,m,b,u,r)
    implicit none
    integer, intent(in)                :: level
    integer, intent(in)                :: m
    type(scalar3d), intent(in)         :: b
    type(scalar3d), intent(inout)      :: u
    type(scalar3d), intent(inout)      :: r
    integer :: ix,iy,iz

    ! r <- A.u
    call apply(u,r)
    ! r <- b - r = b - A.u
    do ix=u%icompx_min,u%icompx_max
      do iy=u%icompy_min,u%icompy_max
        do iz=1,u%grid_param%nz
          r%s(iz,iy,ix) = b%s(iz,iy,ix) - r%s(iz,iy,ix)
        end do
      end do
    end do
  end subroutine calculate_residual

!==================================================================
! Apply operator v = A.u
!==================================================================
  subroutine apply(u,v)
    implicit none
    type(scalar3d), intent(in)         :: u
    type(scalar3d), intent(inout)      :: v
    real(kind=rl), dimension(5) :: alpha_T
    real(kind=rl) :: Tij
    real(kind=rl) :: a_k, b_k, c_k, d_k
    integer :: ix,iy,iz
    real(kind=rl) :: tmp

    do ix=u%icompx_min,u%icompx_max
      do iy=u%icompy_min,u%icompy_max
        ! Construct horizontal part of stencil
        call construct_alpha_T(u%grid_param,  &
                               ix+u%ix_min-1, &
                               iy+u%iy_min-1, &
                               alpha_T,Tij)
        do iz=1,u%grid_param%nz
          a_k = vert_coeff%a(iz)
          b_k = vert_coeff%b(iz)
          c_k = vert_coeff%c(iz)
          d_k = vert_coeff%d(iz)
          tmp = ((a_k-b_k-c_k)*Tij - alpha_T(5)) * u%s(iz,iy,ix)
          if (iz < grid_param%nz) then
            tmp = tmp + b_k*Tij * u%s(iz+1,iy,ix)
          end if
          if (iz > 1) then
            tmp = tmp + c_k*Tij * u%s(iz-1,iy,ix)
          end if
          tmp = tmp + alpha_T(1) * u%s(iz,  iy  ,ix+1) &
                    + alpha_T(2) * u%s(iz,  iy  ,ix-1) &
                    + alpha_T(3) * u%s(iz,  iy+1,ix  ) &
                    + alpha_T(4) * u%s(iz,  iy-1,ix  )
          v%s(iz,iy,ix) = d_k*tmp
        end do
      end do
    end do
  end subroutine apply

!==================================================================
!==================================================================
!
!     S M O O T H E R S
!
!==================================================================
!==================================================================

!==================================================================
! Perform nsmooth smoother iterations
!==================================================================
  subroutine smooth(level,m,nsmooth,direction,b,u)
    implicit none
    integer, intent(in) :: level
    integer, intent(in) :: m
    integer, intent(in) :: nsmooth       ! Number of smoothing steps
    integer, intent(in) :: direction     ! Direction
    type(scalar3d), intent(inout) :: b   ! RHS
    type(scalar3d), intent(inout) :: u   ! solution vector
    integer :: i
    real(kind=rl) :: log_res_initial, log_res_final
    type(scalar3d) :: r
    integer :: halo_size
    integer :: nlocal, nz

#ifdef MEASURESMOOTHINGRATE
    r%ix_min = u%ix_min
    r%ix_max = u%ix_max
    r%iy_min = u%iy_min
    r%iy_max = u%iy_max
    r%icompx_min = u%icompx_min
    r%icompx_max = u%icompx_max
    r%icompy_min = u%icompy_min
    r%icompy_max = u%icompy_max
    r%halo_size = u%halo_size
    r%isactive = u%isactive
    r%grid_param = u%grid_param
    nlocal = r%ix_max-r%ix_min+1
    halo_size = r%halo_size
    nz = r%grid_param%nz
    allocate(r%s(0:nz+1,                       &
                 1-halo_size:nlocal+halo_size, &
                 1-halo_size:nlocal+halo_size))
    call calculate_residual(level,m,b,u,r)
    log_res_initial = log(l2norm(r))
#endif
    ! Carry out nsmooth iterations of the smoother
    if (smoother_param%smoother == SMOOTHER_LINE_SOR) then
      do i=1,nsmooth
        call line_SOR(level,m,direction,b,u)
      end do
    else if (smoother_param%smoother == SMOOTHER_LINE_SSOR) then
      do i=1,nsmooth
        call line_SSOR(level,m,direction,b,u)
      end do
    else if (smoother_param%smoother == SMOOTHER_LINE_JAC) then
      do i=1,nsmooth
        call line_jacobi(level,m,b,u)
      end do
    end if
#ifdef MEASURESMOOTHINGRATE
    call calculate_residual(level,m,b,u,r)
    log_res_final = log(l2norm(r))
    log_resreduction(level) = log_resreduction(level) &
                            + (log_res_final - log_res_initial)
    nsmooth_total(level) = nsmooth_total(level) + nsmooth
    deallocate(r%s)
#endif
  end subroutine smooth

!==================================================================
! SOR line smoother
!==================================================================
  subroutine line_SOR(level,m,direction,b,u)

    implicit none

    integer, intent(in)                :: level
    integer, intent(in)                :: m
    integer, intent(in)                :: direction
    type(scalar3d), intent(in)         :: b
    type(scalar3d), intent(inout)      :: u
    real(kind=rl), allocatable :: r(:)
    integer :: nz, nlocal
    real(kind=rl), allocatable :: c(:), utmp(:)
    integer :: ixmin(5), ixmax(5), dix
    integer :: iymin(5), iymax(5), diy
    integer :: color
    integer :: nsweeps, isweep
    integer :: ordering
    real(kind=rl) :: rho
    integer, dimension(4) :: send_requests, recv_requests
    integer :: tmp, ierr
    integer :: iblock
    logical :: overlap_comms

    ordering = smoother_param%ordering
    rho = smoother_param%rho

    nz = u%grid_param%nz

    ! Create residual vector
    allocate(r(nz))
    ! Allocate memory for auxiliary vectors for Thomas algorithm
    allocate(c(nz))
    allocate(utmp(nz))
    nlocal = u%ix_max-u%ix_min+1
#ifdef OVERLAPCOMMS
    overlap_comms = (nlocal > 2)
#else
    overlap_comms = .false.
#endif
    ! Block 1 (N)
    ixmin(1) = 1
    ixmax(1) = nlocal
    iymin(1) = 1
    iymax(1) = 1
    ! Block 2 (S)
    ixmin(2) = 1
    ixmax(2) = nlocal
    iymin(2) = nlocal
    iymax(2) = nlocal
    ! Block 3 (W)
    ixmin(3) = 1
    ixmax(3) = 1
    iymin(3) = 2
    iymax(3) = nlocal-1
    ! Block 4 (E)
    ixmin(4) = nlocal
    ixmax(4) = nlocal
    iymin(4) = 2
    iymax(4) = nlocal-1
    ! Block 5 (INTERIOR)
    if (overlap_comms) then
      ixmin(5) = 2
      ixmax(5) = nlocal-1
      iymin(5) = 2
      iymax(5) = nlocal-1
    else
      ! If there are no interior cells, do not overlap
      ! communications and calculations, just loop over interior cells
      ixmin(5) = 1
      ixmax(5) = nlocal
      iymin(5) = 1
      iymax(5) = nlocal
    end if
    dix = +1
    diy = +1
    color = 1
    ! When iteration backwards over the grid, reverse the direction
    if (direction == DIRECTION_BACKWARD) then
      do iblock = 1, 5
        tmp = ixmax(iblock)
        ixmax(iblock) = ixmin(iblock)
        ixmin(iblock) = tmp
        tmp = iymax(iblock)
        iymax(iblock) = iymin(iblock)
        iymin(iblock) = tmp
      end do
      dix = -1
      diy = -1
      color = 0
    end if
    nsweeps = 1
    if (ordering == ORDERING_LEX) then
      nsweeps = 1
    else if (ordering == ORDERING_RB) then
      nsweeps = 2
    end if
    do isweep = 1, nsweeps
      if (overlap_comms) then
        ! Loop over cells next to boundary (iblock = 1,...,4)
        do iblock = 1, 4
          call loop_over_grid(iblock)
        end do
        ! Initiate halo exchange
        call ihaloswap(level,m,u,send_requests,recv_requests)
      end if
      ! Loop over INTERIOR cells
      iblock = 5
      call loop_over_grid(iblock)
      if (overlap_comms) then
        if (m > 0) then
          call mpi_waitall(4,recv_requests, MPI_STATUSES_IGNORE, ierr)
        end if
      else
        call haloswap(level,m,u)
      end if
      color = 1-color
    end do

    ! Free memory again
    deallocate(r)
    deallocate(c)
    deallocate(utmp)

    contains

    !------------------------------------------------------------------
    ! Loop over grid, for a given block
    !------------------------------------------------------------------
    subroutine loop_over_grid(iblock)
      implicit none
      integer, intent(in) :: iblock
      integer :: ix,iy,iz
      do ix=ixmin(iblock),ixmax(iblock),dix
        do iy=iymin(iblock),iymax(iblock),diy
          if (ordering == ORDERING_RB) then
            if (mod((ix+u%ix_min)+(iy+u%iy_min),2) .ne. color) cycle
          end if
          call apply_tridiag_solve(ix,iy,r,c,b,         &
                                   u%s(1:nz,iy  ,ix+1), &
                                   u%s(1:nz,iy  ,ix-1), &
                                   u%s(1:nz,iy+1,ix  ), &
                                   u%s(1:nz,iy-1,ix  ), &
                                   utmp)
           ! Add to field with overrelaxation-factor
          do iz=1,nz
            u%s(iz,iy,ix) = (1.0_rl-rho)*u%s(iz,iy,ix) + rho*utmp(iz)
          end do
        end do
      end do
    end subroutine loop_over_grid

  end subroutine line_SOR

!==================================================================
! SSOR line smoother
!==================================================================
  subroutine line_SSOR(level,m,direction,b,u)
    implicit none
    integer, intent(in)                :: level
    integer, intent(in)                :: m
    integer, intent(in)                :: direction
    type(scalar3d), intent(in)         :: b
    type(scalar3d), intent(inout)      :: u
    if (direction == DIRECTION_FORWARD) then
      call line_SOR(level,m,DIRECTION_FORWARD,b,u)
      call line_SOR(level,m,DIRECTION_BACKWARD,b,u)
    else
      call line_SOR(level,m,DIRECTION_BACKWARD,b,u)
      call line_SOR(level,m,DIRECTION_FORWARD,b,u)
    end if
  end subroutine line_SSOR

!==================================================================
! Jacobi line smoother
!==================================================================
  subroutine line_Jacobi(level,m,b,u)
    implicit none
    integer, intent(in)                :: level
    integer, intent(in)                :: m
    type(scalar3d), intent(in)         :: b
    type(scalar3d), intent(inout)      :: u
    real(kind=rl), allocatable :: r(:)
    integer :: ix,iy,iz, nz
    real(kind=rl), dimension(5) :: alpha_T
    real(kind=rl), allocatable :: c(:), utmp(:)
    real(kind=rl), allocatable :: u0(:,:,:)
    integer :: nlocal, halo_size
    real(kind=rl) :: rho
    logical :: overlap_comms
    integer, dimension(4) :: send_requests, recv_requests
    integer :: ixmin(5), ixmax(5)
    integer :: iymin(5), iymax(5)
    integer :: iblock, ierr

    ! Set optimal smoothing parameter on each level
    rho = 2.0_rl/(2.0_rl+4.0_rl*model_param%omega2*u%grid_param%n**2/(1.0_rl+4.0_rl*model_param%omega2*u%grid_param%n**2))

    nz = u%grid_param%nz
    nlocal = u%ix_max -u%ix_min + 1
    halo_size = u%halo_size

#ifdef OVERLAPCOMMS
    overlap_comms = (nlocal > 2)
#else
    overlap_comms = .false.
#endif

    ! Block 1 (N)
    ixmin(1) = 1
    ixmax(1) = nlocal
    iymin(1) = 1
    iymax(1) = 1
    ! Block 2 (S)
    ixmin(2) = 1
    ixmax(2) = nlocal
    iymin(2) = nlocal
    iymax(2) = nlocal
    ! Block 3 (W)
    ixmin(3) = 1
    ixmax(3) = 1
    iymin(3) = 2
    iymax(3) = nlocal-1
    ! Block 4 (E)
    ixmin(4) = nlocal
    ixmax(4) = nlocal
    iymin(4) = 2
    iymax(4) = nlocal-1
    ! Block 5 (INTERIOR)
    if (overlap_comms) then
      ixmin(5) = 2
      ixmax(5) = nlocal-1
      iymin(5) = 2
      iymax(5) = nlocal-1
    else
      ! If there are no interior cells, do not overlap
      ! communications and calculations, just loop over interior cells
      ixmin(5) = 1
      ixmax(5) = nlocal
      iymin(5) = 1
      iymax(5) = nlocal
    end if

    ! Temporary vector
    allocate(u0(0:u%grid_param%nz+1,            &
                1-halo_size:nlocal+halo_size,   &
                1-halo_size:nlocal+halo_size) )
    u0(:,:,:) = u%s(:,:,:)
    ! Create residual vector
    allocate(r(nz))
    ! Allocate memory for auxiliary vectors for Thomas algorithm
    allocate(c(nz))
    allocate(utmp(nz))

    ! Loop over grid
    if (overlap_comms) then
    ! Loop over cells next to boundary (iblock = 1,...,4)
      do iblock = 1, 4
        call loop_over_grid(iblock)
      end do
      ! Initiate halo exchange
      call ihaloswap(level,m,u,send_requests,recv_requests)
    end if
    ! Loop over INTERIOR cells
    iblock = 5
    call loop_over_grid(iblock)
    if (overlap_comms) then
      if (m > 0) then
        call mpi_waitall(4,recv_requests, MPI_STATUSES_IGNORE, ierr)
      end if
    else
      call haloswap(level,m,u)
    end if

    ! Free memory again
    deallocate(r)
    deallocate(c)
    deallocate(u0)
    deallocate(utmp)

  contains

  subroutine loop_over_grid(iblock)
    implicit none
    integer, intent(in) :: iblock
    integer :: ix,iy,iz
    do ix=ixmin(iblock),ixmax(iblock)
      do iy=iymin(iblock),iymax(iblock)
        call apply_tridiag_solve(ix,iy,r,c,b,        &
                                 u0(1:nz,iy  ,ix+1), &
                                 u0(1:nz,iy  ,ix-1), &
                                 u0(1:nz,iy+1,ix  ), &
                                 u0(1:nz,iy-1,ix  ), &
                                 utmp)
        ! Add correction
        do iz=1,nz
          u%s(iz,iy,ix) = rho*utmp(iz) + (1.0_rl-rho)*u0(iz,iy,ix)
        end do
      end do
    end do
  end subroutine loop_over_grid

  end subroutine line_Jacobi

!==================================================================
! At a given horizontal position (ix,iy) (local coordinates),
! calculate
!
! u_out = T(ix,iy)^{-1} (b_(ix,iy)
!       - sum_{ix',iy' != ix,iy} A_{(ix,iy),(ix',iy')}*u_in(ix',iy'))
!
!==================================================================
  subroutine apply_tridiag_solve(ix,iy,r,c,b, &
                                 u_in_1,      &
                                 u_in_2,      &
                                 u_in_3,      &
                                 u_in_4,      &
                                 u_out)

    implicit none
    integer, intent(in) :: ix
    integer, intent(in) :: iy
    real(kind=rl), intent(inout), dimension(:) :: r
    real(kind=rl), intent(inout), dimension(:) :: c
    type(scalar3d), intent(in) :: b
    real(kind=rl), intent(in), dimension(:) :: u_in_1
    real(kind=rl), intent(in), dimension(:) :: u_in_2
    real(kind=rl), intent(in), dimension(:) :: u_in_3
    real(kind=rl), intent(in), dimension(:) :: u_in_4
    real(kind=rl), intent(inout), dimension(:) :: u_out
    real(kind=rl), dimension(5) :: alpha_T
    real(kind=rl) :: Tij
    real(kind=rl) :: alpha_div_Tij, tmp, b_k_tmp, c_k_tmp
    integer :: iz, nz

    nz = b%grid_param%nz

    call construct_alpha_T(b%grid_param,  &
                           ix+b%ix_min-1, &
                           iy+b%iy_min-1, &
                           alpha_T,Tij)
    ! Calculate r_i = b_i - A_{ij} u_i
    do iz=1,nz
      r(iz) = b%s(iz,iy,ix) - vert_coeff%d(iz) * ( &
                alpha_T(1) * u_in_1(iz) + &
                alpha_T(2) * u_in_2(iz) + &
                alpha_T(3) * u_in_3(iz) + &
                alpha_T(4) * u_in_4(iz) )
    end do

    ! Thomas algorithm
    ! STEP 1: Create modified coefficients
    iz = 1
    alpha_div_Tij = alpha_T(5)/Tij
    tmp = (vert_coeff%a(iz)-vert_coeff%b(iz)-vert_coeff%c(iz)) &
             - alpha_div_Tij
    c(iz) = vert_coeff%b(iz)/tmp
    u_out(iz) = r(iz) / (tmp*Tij*vert_coeff%d(iz))
    do iz=2,nz
      b_k_tmp = vert_coeff%b(iz)
      c_k_tmp = vert_coeff%c(iz)
      tmp = ((vert_coeff%a(iz)-b_k_tmp-c_k_tmp)-alpha_div_Tij) &
          - c(iz-1)*c_k_tmp
      c(iz) = b_k_tmp / tmp
      u_out(iz) = (r(iz) / (Tij*vert_coeff%d(iz)) - u_out(iz-1)*c_k_tmp) / tmp
    end do
    ! STEP 2: back substitution
    do iz=nz-1,1,-1
      u_out(iz) = u_out(iz) - c(iz) * u_out(iz+1)
    end do
  end subroutine apply_tridiag_solve

end module discretisation