!=== COPYRIGHT AND LICENSE STATEMENT === ! ! This file is part of the TensorProductMultigrid code. ! ! (c) The copyright relating to this work is owned jointly by the ! Crown, Met Office and NERC [2014]. However, it has been created ! with the help of the GungHo Consortium, whose members are identified ! at https://puma.nerc.ac.uk/trac/GungHo/wiki . ! ! Main Developer: Eike Mueller ! ! TensorProductMultigrid is free software: you can redistribute it and/or ! modify it under the terms of the GNU Lesser General Public License as ! published by the Free Software Foundation, either version 3 of the ! License, or (at your option) any later version. ! ! TensorProductMultigrid is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with TensorProductMultigrid (see files COPYING and COPYING.LESSER). ! If not, see <http://www.gnu.org/licenses/>. ! !=== COPYRIGHT AND LICENSE STATEMENT === !================================================================== ! ! MPI communication routines for multigrid code ! ! Eike Mueller, University of Bath, Feb 2012 ! !================================================================== module communication use messages use datatypes use parameters !use mpi use modd_mpif use timer implicit none public::comm_preinitialise public::comm_initialise public::comm_finalise public::scalarprod_mnh public::scalarprod public::boundary_mnh public::haloswap_mnh public::haloswap public::ihaloswap_mnh public::ihaloswap public::collect public::distribute public::i_am_master_mpi public::master_rank public::pproc public::MPI_COMM_HORIZ public::comm_parameters public::comm_measuretime ! Number of processors ! n_proc = 2^(2*pproc), with integer pproc integer :: pproc ! Rank of master process integer, parameter :: master_rank = 0 ! Am I the master process? logical :: i_am_master_mpi integer, parameter :: dim = 3 ! Dimension integer, parameter :: dim_horiz = 2 ! Horizontal dimension integer :: MPI_COMM_HORIZ ! Communicator with horizontal partitioning private ! Data types for halo exchange in both x- and y-direction integer, dimension(:,:,:), allocatable :: halo_type ! MPI vector data types ! Halo for data exchange in north-south direction integer, allocatable, dimension(:,:) :: halo_ns integer, allocatable, dimension(:,:) :: halo_nst integer, allocatable, dimension(:,:) :: halo_wet ! Vector data type for interior of field a(level,m) integer, allocatable, dimension(:,:) :: interior integer, allocatable, dimension(:,:) :: interiorT ! Vector data type for one quarter of interior of field ! at level a(level,m). This has the same size (and can be ! used for communications with) the interior of a(level,m+1) integer, allocatable, dimension(:,:) :: sub_interior integer, allocatable, dimension(:,:) :: sub_interiorT ! Timer for halo swaps type(time), allocatable, dimension(:,:) :: t_haloswap ! Timer for collect and distribute type(time), allocatable, dimension(:) :: t_collect type(time), allocatable, dimension(:) :: t_distribute ! Parallelisation parameters ! Measure communication times? logical :: comm_measuretime ! Parallel communication parameters type comm_parameters ! Size of halos integer :: halo_size end type comm_parameters type(comm_parameters) :: comm_param ! Data layout ! =========== ! ! 
The number of processes has to be of the form nproc = 2^(2*pproc) to
  ! ensure that data can be distributed between processes.
  ! The processes are arranged in a (2^pproc) x (2^pproc) cartesian grid
  ! in the horizontal plane (i.e. vertical columns are always local to one
  ! process), which is implemented via the communicator MPI_COMM_HORIZ.
  ! MPI_cart_rank() and MPI_cart_shift() can then be used to
  ! easily identify neighbouring processes.
  ! The number of data grid cells in each direction has to be a multiple
  ! of 2**(L-1), where L is the number of levels (including the coarse
  ! and fine level), with the coarse level corresponding to level=1.
  ! Also define L_split as the level where we start to pull together
  ! data. For levels > L_split each position in the cartesian grid is
  ! included in the work; below this only a subset of processes is
  ! used.
  !
  ! Each grid a(level,m) is identified by two numbers:
  ! (1) The multigrid level it belongs to (level)
  ! (2) The number of active processes that operate on it (2^(2*m)).
  !
  ! For level > L_split, m=pproc. At L_split we store a(L_split,pproc) and
  ! a(L_split,pproc-1), and only processes with even coordinates in both
  ! horizontal directions use the latter grid.
  ! Below that level, we store a(L_split-1,pproc-1) and a(L_split-1,pproc-2),
  ! where only processes for which both horizontal coordinates are
  ! multiples of four use the latter. This is continued until only one
  ! process is left.
  !
  !
  ! level
  !   L          a(L,pproc)
  !   L-1        a(L-1,pproc)
  !   ...
  !   L_split    a(L_split,pproc)      a(L_split  ,pproc-1)
  !   L_split-1                        a(L_split-1,pproc-1)   a(L_split-1,pproc-2)
  !
  !   ...                                                     a(3,1)
  !                                                           a(2,1)
  !                                                           a(1,1)
  !
  ! When moving from left to right in the above graph the total number of
  ! grid cells does not change, but the number of data points per process
  ! increases by a factor of 4.
  !
  ! Parallel operations
  ! ===================
  !
  !   (*) Halo exchange. Update the halo with data from neighbouring
  !       processes in the cartesian grid on the current (level,m).
  !   (*) Collect data held on all processes at (level,m) onto those
  !       processes that are still active at (level,m-1).
  !   (*) Distribute data at (level,m-1) and duplicate it on all processes
  !       that are active at (level,m).
  !
  ! Note that in the cartesian processor grid the first coordinate
  ! is the North-South (y-) direction, the second coordinate is the
  ! East-West (x-) direction, i.e. the layout is this:
  !
  !     p_0 (0,0)   p_1 (0,1)   p_2  (0,2)   p_3  (0,3)
  !
  !     p_4 (1,0)   p_5 (1,1)   p_6  (1,2)   p_7  (1,3)
  !
  !     p_8 (2,0)   p_9 (2,1)   p_10 (2,2)   p_11 (2,3)
  !
  !     [...]
  !
  !
  ! Normal multigrid restriction and prolongation are used to
  ! move between levels with fixed m.
  !
  !
contains

!==================================================================
! Pre-initialise communication routines
!==================================================================
  subroutine comm_preinitialise()
    implicit none
    integer :: nproc, ierr, rank
    call mpi_comm_size(MPI_COMM_WORLD, nproc, ierr)
    call mpi_comm_rank(MPI_COMM_WORLD, rank, ierr)
    i_am_master_mpi = (rank == master_rank)
    ! Check that nproc = 2^(2*pproc)
    pproc = floor(log(1.0d0*nproc)/log(4.0d0))
    if ( (nproc - 4**pproc) .ne. 0) then
      call fatalerror("Number of processors has to be 2^(2*pproc) with integer pproc.")
    end if
    if (i_am_master_mpi) then
      write(STDOUT,'("PARALLEL RUN")')
      write(STDOUT,'("Number of processors : 2^(2*pproc) = ",I10," with pproc = ",I6)') &
        nproc, pproc
    end if
    ! (The halo data types are created later, in comm_initialise)
  end subroutine comm_preinitialise
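!==================================================================
! The activity rule implied by the layout described above can be stated
! compactly: a process is active on grid a(level,m) if and only if both of
! its horizontal cartesian coordinates are multiples of
! stepsize = 2**(pproc-m). The commented, self-contained sketch below
! illustrates this test. It is illustrative only and not part of this
! module; it assumes an MPI library that provides the "mpi" module
! (this module itself uses modd_mpif) and a process count of the required
! form 4^pproc.
!
!!$program active_process_demo
!!$  use mpi
!!$  implicit none
!!$  integer :: ierr, nproc, rank, pproc, m, stepsize, comm_horiz
!!$  integer, dimension(2) :: p_horiz
!!$  call mpi_init(ierr)
!!$  call mpi_comm_size(MPI_COMM_WORLD, nproc, ierr)
!!$  pproc = nint(log(real(nproc))/log(4.0))        ! nproc = 2^(2*pproc) = 4^pproc
!!$  call mpi_cart_create(MPI_COMM_WORLD, 2, (/2**pproc,2**pproc/), &
!!$                       (/.false.,.false./), .true., comm_horiz, ierr)
!!$  call mpi_comm_rank(comm_horiz, rank, ierr)
!!$  call mpi_cart_coords(comm_horiz, rank, 2, p_horiz, ierr)
!!$  do m = pproc, 0, -1
!!$    stepsize = 2**(pproc-m)
!!$    if ( (mod(p_horiz(1),stepsize) == 0) .and. &
!!$         (mod(p_horiz(2),stepsize) == 0) ) then
!!$      write(*,'("rank ",I6," (",I4,",",I4,") is active for m = ",I3)') &
!!$        rank, p_horiz(1), p_horiz(2), m
!!$    end if
!!$  end do
!!$  call mpi_finalize(ierr)
!!$end program active_process_demo

!==================================================================
!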
Initialise communication routines !================================================================== subroutine comm_initialise(n_lev, & !} multigrid parameters lev_split, & !} grid_param, & ! Grid parameters comm_param_in) ! Parallel communication ! parameters implicit none integer, intent(in) :: n_lev integer, intent(in) :: lev_split type(grid_parameters), intent(inout) :: grid_param type(comm_parameters), intent(in) :: comm_param_in integer :: n integer :: nz integer :: rank, nproc, ierr integer :: count, blocklength, stride integer, dimension(2) :: p_horiz integer :: m, level, nlocal logical :: reduced_m integer :: halo_size character(len=32) :: t_label integer,parameter :: nb_dims=3 integer,dimension(nb_dims) :: profil_tab,profil_sous_tab,coord_debut n = grid_param%n nz = grid_param%nz comm_param = comm_param_in halo_size = comm_param%halo_size call mpi_comm_size(MPI_COMM_WORLD, nproc, ierr) ! Create cartesian topology call mpi_cart_create(MPI_COMM_WORLD, & ! Old communicator name dim_horiz, & ! horizontal dimension (/2**pproc,2**pproc/), & ! extent in each horizontal direction (/.false.,.false./), & ! periodic? .true., & ! reorder? MPI_COMM_HORIZ, & ! Name of new communicator ierr) ! calculate and display rank and corrdinates in cartesian grid call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr) call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr) ! Local size of (horizontal) grid nlocal = n/2**pproc ! === Set up data types === ! Halo for exchange in north-south direction if (LUseO) allocate(halo_ns(n_lev,0:pproc)) if (LUseT) allocate(halo_nst(n_lev,0:pproc)) if (LUseT) allocate(halo_wet(n_lev,0:pproc)) ! Interior data types if (LUseO) allocate(interior(n_lev,0:pproc)) if (LUseO) allocate(sub_interior(n_lev,0:pproc)) if (LUseT) allocate(interiorT(n_lev,0:pproc)) if (LUseT) allocate(sub_interiorT(n_lev,0:pproc)) ! Timer allocate(t_haloswap(n_lev,0:pproc)) allocate(t_collect(0:pproc)) allocate(t_distribute(0:pproc)) do m=0,pproc write(t_label,'("t_collect(",I3,")")') m call initialise_timer(t_collect(m),t_label) write(t_label,'("t_distribute(",I3,")")') m call initialise_timer(t_distribute(m),t_label) end do m = pproc level = n_lev reduced_m = .false. do while (level > 0) ! --- Create halo data types --- if (LUseO) then ! NS- (y-) direction count = nlocal blocklength = (nz+2)*halo_size stride = (nlocal+2*halo_size)*(nz+2) call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION, & halo_ns(level,m),ierr) call mpi_type_commit(halo_ns(level,m),ierr) endif ! tranpose if (LUseT) then ! NS- (y-) transpose direction count = nz+2 ! nlocal blocklength = nlocal*halo_size ! (nz+2)*halo_size stride = (nlocal+2*halo_size) * (nlocal+2*halo_size) ! (nlocal+2*halo_size)*(nz+2) call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION, & halo_nst(level,m),ierr) call mpi_type_commit(halo_nst(level,m),ierr) ! WE- (x-) transpose direction count = (nz+2)*(nlocal+2*halo_size)*halo_size ! nlocal blocklength = 1*halo_size ! (nz+2)*halo_size stride = nlocal+2*halo_size ! (nlocal+2*halo_size)*(nz+2) call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION, & halo_wet(level,m),ierr) call mpi_type_commit(halo_wet(level,m),ierr) endif #ifndef NDEBUG if (ierr .ne. 0) & call fatalerror("Commit halo_ns failed in mpi_type_commit().") #endif ! 
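    !
    ! Worked example for the vector types committed above (illustrative
    ! numbers): for the original (z,y,x)-ordered field with nlocal = 8,
    ! nz = 10 and halo_size = 1, halo_ns(level,m) has
    !   count       = nlocal                      = 8   blocks
    !   blocklength = (nz+2)*halo_size            = 12  reals per block
    !   stride      = (nlocal+2*halo_size)*(nz+2) = 120 reals between block starts
    ! i.e. one vertical column per interior x-index at a fixed y-row; which
    ! y-row is exchanged is selected by the buffer address passed to
    ! mpi_irecv/mpi_isend. The transposed types halo_nst and halo_wet apply
    ! the same construction to the (x,y,z) ordering.
    !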
--- Create interior data types --- if (LUseO) then count = nlocal blocklength = nlocal*(nz+2) stride = (nz+2)*(nlocal+2*halo_size) call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION,interior(level,m),ierr) call mpi_type_commit(interior(level,m),ierr) count = nlocal/2 blocklength = nlocal/2*(nz+2) stride = (nlocal+2*halo_size)*(nz+2) call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION,sub_interior(level,m),ierr) call mpi_type_commit(sub_interior(level,m),ierr) end if if (LUseT) then ! interiorT if ( nlocal /= 0 ) then profil_tab = (/ nlocal+2*halo_size , nlocal+2*halo_size , nz+2 /) profil_sous_tab = (/ nlocal , nlocal , nz+2 /) coord_debut = (/ 0 , 0 , 0 /) call MPI_TYPE_CREATE_SUBARRAY(nb_dims,profil_tab,profil_sous_tab,coord_debut,& MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,interiorT(level,m),ierr) call mpi_type_commit(interiorT(level,m),ierr) end if ! sub_interiorT if ( (nlocal/2) /= 0 ) then profil_tab = (/ nlocal+2*halo_size , nlocal+2*halo_size , nz+2 /) profil_sous_tab = (/ nlocal/2 , nlocal/2 , nz+2 /) coord_debut = (/ 0 , 0 , 0 /) call MPI_TYPE_CREATE_SUBARRAY(nb_dims,profil_tab,profil_sous_tab,coord_debut,& MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,sub_interiorT(level,m),ierr) call mpi_type_commit(sub_interiorT(level,m),ierr) end if end if ! --- Create timers --- write(t_label,'("t_haloswap(",I3,",",I3,")")') level,m call initialise_timer(t_haloswap(level,m),t_label) ! If we are below L_split, split data if ( (level .le. lev_split) .and. (m > 0) .and. (.not. reduced_m)) then reduced_m = .true. m = m-1 nlocal = 2*nlocal cycle end if reduced_m = .false. level = level-1 nlocal = nlocal/2 end do end subroutine comm_initialise !================================================================== ! Finalise communication routines !================================================================== subroutine comm_finalise(n_lev, & ! } lev_split) ! } Multigrid parameters implicit none integer, intent(in) :: n_lev integer, intent(in) :: lev_split logical :: reduced_m integer :: level, m integer :: ierr character(len=80) :: s m = pproc level = n_lev reduced_m = .false. if (i_am_master_mpi) then write(STDOUT,'(" *** Finalising communications ***")') end if call print_timerinfo("--- Communication timing results ---") do while (level > 0) write(s,'("level = ",I3,", m = ",I3)') level, m call print_timerinfo(s) ! --- Print out timer information --- call print_elapsed(t_haloswap(level,m),.True.,1.0_rl) ! --- Free halo data types --- if (LUseO) call mpi_type_free(halo_ns(level,m),ierr) if (LUseT) call mpi_type_free(halo_nst(level,m),ierr) if (LUseT) call mpi_type_free(halo_wet(level,m),ierr) ! --- Free interior data types --- if (LUseO) call mpi_type_free(interior(level,m),ierr) if (LUseO) call mpi_type_free(sub_interior(level,m),ierr) if (LUseT) call mpi_type_free(interiorT(level,m),ierr) if (LUseT) call mpi_type_free(sub_interiorT(level,m),ierr) ! If we are below L_split, split data if ( (level .le. lev_split) .and. (m > 0) .and. (.not. reduced_m)) then reduced_m = .true. m = m-1 cycle end if reduced_m = .false. level = level-1 end do do m=pproc,0,-1 write(s,'("m = ",I3)') m call print_timerinfo(s) ! --- Print out timer information --- call print_elapsed(t_collect(m),.True.,1.0_rl) call print_elapsed(t_distribute(m),.True.,1.0_rl) end do ! 
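  !
  ! The interiorT/sub_interiorT types released in the loop above are MPI
  ! subarray types created in comm_initialise. As a reminder of their
  ! semantics, here is a hypothetical, self-contained sketch of the
  ! corresponding create/commit calls (illustrative sizes only; it assumes
  ! an MPI library providing the "mpi" module):
  !
!!$program subarray_demo
!!$  use mpi
!!$  implicit none
!!$  integer, parameter :: nlocal = 8, nz = 10, halo = 1
!!$  integer :: ierr, interior_type
!!$  integer, dimension(3) :: sizes, subsizes, starts
!!$  call mpi_init(ierr)
!!$  sizes    = (/ nlocal+2*halo, nlocal+2*halo, nz+2 /) ! full local array incl. halos
!!$  subsizes = (/ nlocal, nlocal, nz+2 /)               ! interior block to transfer
!!$  starts   = (/ 0, 0, 0 /)                            ! MPI starts are always 0-based;
!!$                                                      ! the block position in the field
!!$                                                      ! is chosen via the buffer address
!!$  call mpi_type_create_subarray(3, sizes, subsizes, starts, &
!!$       MPI_ORDER_FORTRAN, MPI_DOUBLE_PRECISION, interior_type, ierr)
!!$  call mpi_type_commit(interior_type, ierr)
!!$  call mpi_type_free(interior_type, ierr)
!!$  call mpi_finalize(ierr)
!!$end program subarray_demo
  !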
Deallocate arrays if (LUseO) deallocate(halo_ns) if (LUseT) deallocate(halo_nst,halo_wet) if (LUseO) deallocate(interior) if (LUseO) deallocate(sub_interior) if (LUseT) deallocate(interiorT) if (LUseT) deallocate(sub_interiorT) deallocate(t_haloswap) deallocate(t_collect) deallocate(t_distribute) if (i_am_master_mpi) then write(STDOUT,'("")') end if end subroutine comm_finalise !================================================================== ! Scalar product of two fields !================================================================== subroutine scalarprod_mnh(m, a, b, s) implicit none integer, intent(in) :: m type(scalar3d), intent(in) :: a type(scalar3d), intent(in) :: b real(kind=rl), intent(out) :: s integer :: nprocs, rank, ierr integer :: p_horiz(2) integer :: stepsize integer, parameter :: dim_horiz = 2 real(kind=rl) :: local_sum, global_sum real(kind=rl) :: local_sumt,global_sumt integer :: nlocal, nz, i integer :: ix,iy,iz real(kind=rl) :: ddot nlocal = a%ix_max-a%ix_min+1 nz = a%grid_param%nz ! Work out coordinates of processor call mpi_comm_size(MPI_COMM_HORIZ,nprocs,ierr) call mpi_comm_rank(MPI_COMM_HORIZ,rank,ierr) stepsize = 2**(pproc-m) if (nprocs > 1) then ! Only inlcude local sum if the processor coordinates ! are multiples of stepsize call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr) if ( (stepsize == 1) .or. & ( (stepsize > 1) .and. & (mod(p_horiz(1),stepsize)==0) .and. & (mod(p_horiz(2),stepsize)==0) ) ) then if (LUseO) then local_sum = 0.0_rl do i = 1, nlocal local_sum = local_sum & + ddot((nz+2)*nlocal,a%s(0,1,i),1,b%s(0,1,i),1) end do end if if (LUseT) then local_sumt = 0.0_rl do iz=0,nz+1 do iy=a%icompy_min,a%icompy_max do ix=a%icompx_min,a%icompx_max local_sumt = local_sumt & + a%st(ix,iy,iz)*b%st(ix,iy,iz) end do end do end do end if else if (LUseO) local_sum = 0.0_rl if (LUseT) local_sumt = 0.0_rl end if if (LUseO) call mpi_allreduce(local_sum,global_sum,1,MPI_DOUBLE_PRECISION, & MPI_SUM,MPI_COMM_HORIZ,ierr) if (LUseT) call mpi_allreduce(local_sumt,global_sumt,1,MPI_DOUBLE_PRECISION, & MPI_SUM,MPI_COMM_HORIZ,ierr) else if (LUseO) then global_sum = 0.0_rl do i = 1, nlocal global_sum = global_sum & + ddot((nz+2)*nlocal,a%s(0,1,i),1,b%s(0,1,i),1) end do endif if (LUseT) then global_sumt = 0.0_rl do iz=0,nz+1 do iy=a%iy_min,a%iy_max do ix=a%ix_min,a%ix_max global_sumt = global_sumt & + a%st(ix,iy,iz)*b%st(ix,iy,iz) end do end do end do endif end if if (LUseO) then s = global_sum else s = global_sumt end if end subroutine scalarprod_mnh !------------------------------------------------------------------------------- subroutine scalarprod(m, a, b, s) implicit none integer, intent(in) :: m type(scalar3d), intent(in) :: a type(scalar3d), intent(in) :: b real(kind=rl), intent(out) :: s integer :: nprocs, rank, ierr integer :: p_horiz(2) integer :: stepsize integer, parameter :: dim_horiz = 2 real(kind=rl) :: local_sum, global_sum integer :: nlocal, nz, i real(kind=rl) :: ddot nlocal = a%ix_max-a%ix_min+1 nz = a%grid_param%nz ! Work out coordinates of processor call mpi_comm_size(MPI_COMM_HORIZ,nprocs,ierr) call mpi_comm_rank(MPI_COMM_HORIZ,rank,ierr) stepsize = 2**(pproc-m) if (nprocs > 1) then ! Only inlcude local sum if the processor coordinates ! are multiples of stepsize call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr) if ( (stepsize == 1) .or. & ( (stepsize > 1) .and. & (mod(p_horiz(1),stepsize)==0) .and. 
& (mod(p_horiz(2),stepsize)==0) ) ) then
        local_sum = 0.0_rl
        do i = 1, nlocal
          local_sum = local_sum &
                    + ddot((nz+2)*nlocal,a%s(0,1,i),1,b%s(0,1,i),1)
        end do
      else
        local_sum = 0.0_rl
      end if
      call mpi_allreduce(local_sum,global_sum,1,MPI_DOUBLE_PRECISION, &
                         MPI_SUM,MPI_COMM_HORIZ,ierr)
    else
      global_sum = 0.0_rl
      do i = 1, nlocal
        global_sum = global_sum &
                   + ddot((nz+2)*nlocal,a%s(0,1,i),1,b%s(0,1,i),1)
      end do
    end if
    s = global_sum
  end subroutine scalarprod

!==================================================================
! Boundary Neumann
!==================================================================
  subroutine boundary_mnh(a)   ! data field
    implicit none
    type(scalar3d), intent(inout) :: a
    ! local variables
    integer :: n, ix_min,ix_max,iy_min,iy_max

    ! Update the real boundary for the Neumann case: u(0) = u(1), etc.
    !return
    n      = a%grid_param%n
    ix_min = a%ix_min
    ix_max = a%ix_max
    iy_min = a%iy_min
    iy_max = a%iy_max

    if (LUseO) then
      if ( ix_min == 1 ) then
        a%s(:,:,0) = a%s(:,:,1)
      endif
      if ( ix_max == n ) then
        a%s(:,:,a%icompx_max+1) = a%s(:,:,a%icompx_max)
      endif
      if ( iy_min == 1 ) then
        a%s(:,0,:) = a%s(:,1,:)
      endif
      if ( iy_max == n ) then
        a%s(:,a%icompy_max+1,:) = a%s(:,a%icompy_max,:)
      endif
    endif
    if (LUseT) then ! transposed field
      if ( ix_min == 1 ) then
        a%st(0,:,:) = a%st(1,:,:)
      endif
      if ( ix_max == n ) then
        a%st(a%icompx_max+1,:,:) = a%st(a%icompx_max,:,:)
      endif
      if ( iy_min == 1 ) then
        a%st(:,0,:) = a%st(:,1,:)
      endif
      if ( iy_max == n ) then
        a%st(:,a%icompy_max+1,:) = a%st(:,a%icompy_max,:)
      endif
    endif
    !
    ! Corners (not needed: already covered by the edge updates above)
    !
!!$  if ( ( ix_min == 1 ) .and. ( iy_min == 1 ) ) then
!!$     a%s(:,0,0) = a%s(:,1,1)
!!$  endif
!!$  if ( ( ix_min == 1 ) .and. ( iy_max == n ) ) then
!!$     a%s(:,a%icompy_max+1,0) = a%s(:,a%icompy_max,1)
!!$  end if
!!$  if ( ( ix_max == n ) .and. ( iy_min == 1 ) ) then
!!$     a%s(:,0,a%icompx_max+1) = a%s(:,1,a%icompx_max)
!!$  end if
!!$  if ( ( ix_max == n ) .and. ( iy_max == n ) ) then
!!$     a%s(:,a%icompy_max+1,a%icompx_max+1) = a%s(:,a%icompy_max,a%icompx_max)
!!$  end if

  end subroutine boundary_mnh

!==================================================================
! Initiate asynchronous halo exchange
!
! For all processes with horizontal indices that are multiples
! of 2^(pproc-m), update halos with information held by
! neighbouring processes, e.g. for pproc-m = 1, stepsize = 2:
!
!                      N  (0,2)
!                         ^
!                         !
!                         v
!
!   W (2,0)  <-->       (2,2)       <-->  E (2,4)
!
!                         ^
!                         !
!                         v
!
!                      S  (4,2)
!
!==================================================================
  subroutine ihaloswap_mnh(level,m,        &  ! multigrid- and processor- level
                           a,              &  ! data field
                           send_requests,  &  ! send requests (OUT)
                           recv_requests,  &  ! recv requests (OUT)
                           send_requestsT, &  ! send requests T (OUT)
                           recv_requestsT  &  ! recv requests T (OUT)
                           )
    implicit none
    integer, intent(in) :: level
    integer, intent(in) :: m
    integer, intent(out), dimension(4) :: send_requests
    integer, intent(out), dimension(4) :: recv_requests
    integer, intent(out), dimension(4) :: send_requestsT
    integer, intent(out), dimension(4) :: recv_requestsT
    type(scalar3d), intent(inout) :: a
    integer :: a_n  ! horizontal grid size
    integer :: nz   ! vertical grid size
    integer, dimension(2) :: p_horiz
    integer :: stepsize
    integer :: ierr, rank, sendtag, recvtag
    integer :: stat(MPI_STATUS_SIZE)
    integer :: halo_size
    integer :: neighbour_n_rank
    integer :: neighbour_s_rank
    integer :: neighbour_e_rank
    integer :: neighbour_w_rank
    integer :: yoffset, blocklength

    halo_size = comm_param%halo_size
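    !
    ! The neighbour lookup used below follows the diagram in the header
    ! above: with a displacement of stepsize = 2**(pproc-m), mpi_cart_shift
    ! returns the ranks of the nearest active neighbours in the process
    ! grid, and MPI_PROC_NULL at the domain edges (so the corresponding
    ! sends/receives become no-ops). A hypothetical, self-contained sketch
    ! of that lookup (not part of this module; it assumes the "mpi" module
    ! is available):
    !
!!$program cart_shift_demo
!!$  use mpi
!!$  implicit none
!!$  integer :: ierr, nproc, pproc, m, stepsize, comm_horiz
!!$  integer :: n_rank, s_rank, w_rank, e_rank
!!$  call mpi_init(ierr)
!!$  call mpi_comm_size(MPI_COMM_WORLD, nproc, ierr)
!!$  pproc = nint(log(real(nproc))/log(4.0))
!!$  call mpi_cart_create(MPI_COMM_WORLD, 2, (/2**pproc,2**pproc/), &
!!$                       (/.false.,.false./), .true., comm_horiz, ierr)
!!$  m = max(pproc-1,0)          ! example: one coarsening of the process grid
!!$  stepsize = 2**(pproc-m)
!!$  ! dimension 0 is North-South (y), dimension 1 is East-West (x)
!!$  call mpi_cart_shift(comm_horiz, 0, stepsize, n_rank, s_rank, ierr)
!!$  call mpi_cart_shift(comm_horiz, 1, stepsize, w_rank, e_rank, ierr)
!!$  call mpi_finalize(ierr)
!!$end program cart_shift_demo
    !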
Do nothing if we are only using one processor if (m > 0) then a_n = a%ix_max-a%ix_min+1 nz = a%grid_param%nz stepsize = 2**(pproc-m) ! Work out rank, only execute on relevant processes call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr) call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr) ! Work out ranks of neighbours ! W -> E call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, & neighbour_w_rank,neighbour_e_rank,ierr) ! N -> S call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, & neighbour_n_rank,neighbour_s_rank,ierr) if ( (stepsize == 1) .or. & ( (mod(p_horiz(1),stepsize) == 0) .and. & (mod(p_horiz(2),stepsize) == 0) ) ) then if (halo_size == 1) then ! Do not include corners in send/recv yoffset = 1 blocklength = a_n*(nz+2)*halo_size else yoffset = 1-halo_size blocklength = (a_n+2*halo_size)*(nz+2)*halo_size end if ! Receive from north recvtag = 1002 if (LUseO) call mpi_irecv(a%s(0,0-(halo_size-1),1),1, & halo_ns(level,m),neighbour_n_rank,recvtag, & MPI_COMM_HORIZ, recv_requests(1), ierr) recvtag = 1012 if (LUseT) call mpi_irecv(a%st(1,0-(halo_size-1),0),1, & halo_nst(level,m),neighbour_n_rank,recvtag, & MPI_COMM_HORIZ, recv_requestsT(1), ierr) ! Receive from south recvtag = 1003 if (LUseO) call mpi_irecv(a%s(0,a_n+1,1),1, & halo_ns(level,m),neighbour_s_rank,recvtag, & MPI_COMM_HORIZ, recv_requests(2), ierr) recvtag = 1013 if (LUseT) call mpi_irecv(a%st(1,a_n+1,0),1, & halo_nst(level,m),neighbour_s_rank,recvtag, & MPI_COMM_HORIZ, recv_requestsT(2), ierr) ! Send to south sendtag = 1002 if (LUseO) call mpi_isend(a%s(0,a_n-(halo_size-1),1),1, & halo_ns(level,m),neighbour_s_rank,sendtag, & MPI_COMM_HORIZ, send_requests(1), ierr) sendtag = 1012 if (LUseT) call mpi_isend(a%st(1,a_n-(halo_size-1),0),1, & halo_nst(level,m),neighbour_s_rank,sendtag, & MPI_COMM_HORIZ, send_requestsT(1), ierr) ! Send to north sendtag = 1003 if (LUseO) call mpi_isend(a%s(0,1,1),1, & halo_ns(level,m),neighbour_n_rank,sendtag, & MPI_COMM_HORIZ, send_requests(2), ierr) sendtag = 1013 if (LUseT) call mpi_isend(a%st(1,1,0),1, & halo_nst(level,m),neighbour_n_rank,sendtag, & MPI_COMM_HORIZ, send_requestsT(2), ierr) ! Receive from west recvtag = 1000 if (LUseO) call mpi_irecv(a%s(0,yoffset,0-(halo_size-1)),blocklength, & MPI_DOUBLE_PRECISION,neighbour_w_rank,recvtag, & MPI_COMM_HORIZ, recv_requests(3), ierr) recvtag = 1010 if (LUseT) call mpi_irecv(a%st(0-(halo_size-1),0,0),1, & halo_wet(level,m),neighbour_w_rank,recvtag, & MPI_COMM_HORIZ, recv_requestsT(3), ierr) ! Receive from east sendtag = 1001 if (LUseO) call mpi_irecv(a%s(0,yoffset,a_n+1),blocklength, & MPI_DOUBLE_PRECISION,neighbour_e_rank,recvtag, & MPI_COMM_HORIZ, recv_requests(4), ierr) sendtag = 1011 if (LUseT) call mpi_irecv(a%st(a_n+1,0,0),1, & halo_wet(level,m),neighbour_e_rank,recvtag, & MPI_COMM_HORIZ, recv_requestsT(4), ierr) ! Send to east sendtag = 1000 if (LUseO) call mpi_isend(a%s(0,yoffset,a_n-(halo_size-1)),blocklength, & MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, & MPI_COMM_HORIZ, send_requests(3), ierr) sendtag = 1010 if (LUseT) call mpi_isend(a%st(a_n-(halo_size-1),0,0),1, & halo_wet(level,m),neighbour_e_rank,sendtag, & MPI_COMM_HORIZ, send_requestsT(3), ierr) ! 
Send to west recvtag = 1001 if (LUseO) call mpi_isend(a%s(0,yoffset,1),blocklength, & MPI_DOUBLE_PRECISION,neighbour_w_rank,sendtag, & MPI_COMM_HORIZ, send_requests(4), ierr) recvtag = 1011 if (LUseT) call mpi_isend(a%st(1,0,0),1, & halo_wet(level,m),neighbour_w_rank,sendtag, & MPI_COMM_HORIZ, send_requestsT(4), ierr) end if end if end subroutine ihaloswap_mnh !================================================================== subroutine ihaloswap(level,m, & ! multigrid- and processor- level a, & ! data field send_requests, & ! send requests (OUT) recv_requests & ! recv requests (OUT) ) implicit none integer, intent(in) :: level integer, intent(in) :: m integer, intent(out), dimension(4) :: send_requests integer, intent(out), dimension(4) :: recv_requests type(scalar3d), intent(inout) :: a integer :: a_n ! horizontal grid size integer :: nz ! vertical grid size integer, dimension(2) :: p_horiz integer :: stepsize integer :: ierr, rank, sendtag, recvtag integer :: stat(MPI_STATUS_SIZE) integer :: halo_size integer :: neighbour_n_rank integer :: neighbour_s_rank integer :: neighbour_e_rank integer :: neighbour_w_rank integer :: yoffset, blocklength halo_size = comm_param%halo_size ! Do nothing if we are only using one processor if (m > 0) then a_n = a%ix_max-a%ix_min+1 nz = a%grid_param%nz stepsize = 2**(pproc-m) ! Work out rank, only execute on relevant processes call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr) call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr) ! Work out ranks of neighbours ! W -> E call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, & neighbour_w_rank,neighbour_e_rank,ierr) ! N -> S call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, & neighbour_n_rank,neighbour_s_rank,ierr) if ( (stepsize == 1) .or. & ( (mod(p_horiz(1),stepsize) == 0) .and. & (mod(p_horiz(2),stepsize) == 0) ) ) then if (halo_size == 1) then ! Do not include corners in send/recv yoffset = 1 blocklength = a_n*(nz+2)*halo_size else yoffset = 1-halo_size blocklength = (a_n+2*halo_size)*(nz+2)*halo_size end if ! Receive from north recvtag = 2 call mpi_irecv(a%s(0,0-(halo_size-1),1),1, & halo_ns(level,m),neighbour_n_rank,recvtag, & MPI_COMM_HORIZ, recv_requests(1), ierr) ! Receive from south recvtag = 3 call mpi_irecv(a%s(0,a_n+1,1),1, & halo_ns(level,m),neighbour_s_rank,recvtag, & MPI_COMM_HORIZ, recv_requests(2), ierr) ! Send to south sendtag = 2 call mpi_isend(a%s(0,a_n-(halo_size-1),1),1, & halo_ns(level,m),neighbour_s_rank,sendtag, & MPI_COMM_HORIZ, send_requests(1), ierr) ! Send to north sendtag = 3 call mpi_isend(a%s(0,1,1),1, & halo_ns(level,m),neighbour_n_rank,sendtag, & MPI_COMM_HORIZ, send_requests(2), ierr) ! Receive from west recvtag = 0 call mpi_irecv(a%s(0,yoffset,0-(halo_size-1)),blocklength, & MPI_DOUBLE_PRECISION,neighbour_w_rank,recvtag, & MPI_COMM_HORIZ, recv_requests(3), ierr) ! Receive from east sendtag = 1 call mpi_irecv(a%s(0,yoffset,a_n+1),blocklength, & MPI_DOUBLE_PRECISION,neighbour_e_rank,recvtag, & MPI_COMM_HORIZ, recv_requests(4), ierr) ! Send to east sendtag = 0 call mpi_isend(a%s(0,yoffset,a_n-(halo_size-1)),blocklength, & MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, & MPI_COMM_HORIZ, send_requests(3), ierr) ! Send to west recvtag = 1 call mpi_isend(a%s(0,yoffset,1),blocklength, & MPI_DOUBLE_PRECISION,neighbour_w_rank,sendtag, & MPI_COMM_HORIZ, send_requests(4), ierr) end if end if end subroutine ihaloswap !================================================================== ! Halo exchange ! ! For all processes with horizontal indices that are multiples ! 
of 2^(pproc-m), update halos with information held by ! neighbouring processes, e.g. for pproc-m = 1, stepsize=2 ! ! N (0,2) ! ^ ! ! ! v ! ! W (2,0) <--> (2,2) <--> E (2,4) ! ! ^ ! ! ! v ! S (4,2) ! !================================================================== subroutine haloswap_mnh(level,m, & ! multigrid- and processor- level a) ! data field implicit none integer, intent(in) :: level integer, intent(in) :: m type(scalar3d), intent(inout) :: a !local var integer :: a_n ! horizontal grid size integer :: nz ! vertical grid size integer, dimension(2) :: p_horiz integer :: stepsize integer :: ierr, rank, sendtag, recvtag integer :: stat(MPI_STATUS_SIZE) integer :: halo_size integer :: neighbour_n_rank integer :: neighbour_s_rank integer :: neighbour_e_rank integer :: neighbour_w_rank integer :: yoffset, blocklength integer, dimension(4) :: requests_ns integer, dimension(4) :: requests_ew integer, dimension(4) :: requests_nsT integer, dimension(4) :: requests_ewT halo_size = comm_param%halo_size ! Do nothing if we are only using one processor if (m > 0) then if (comm_measuretime) then call start_timer(t_haloswap(level,m)) end if a_n = a%ix_max-a%ix_min+1 nz = a%grid_param%nz stepsize = 2**(pproc-m) ! Work out rank, only execute on relevant processes call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr) call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr) ! Work out ranks of neighbours ! W -> E call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, & neighbour_w_rank,neighbour_e_rank,ierr) ! N -> S call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, & neighbour_n_rank,neighbour_s_rank,ierr) if ( (stepsize == 1) .or. & ( (mod(p_horiz(1),stepsize) == 0) .and. & (mod(p_horiz(2),stepsize) == 0) ) ) then if (halo_size == 1) then ! Do not include corners in send/recv yoffset = 1 blocklength = a_n*(nz+2)*halo_size else yoffset = 1-halo_size blocklength = (a_n+2*halo_size)*(nz+2)*halo_size end if ! Receive from north recvtag = 1002 if (LUseO) call mpi_irecv(a%s(0,0-(halo_size-1),1),1, & halo_ns(level,m),neighbour_n_rank,recvtag, & MPI_COMM_HORIZ, requests_ns(1), ierr) recvtag = 1012 if (LUseT) call mpi_irecv(a%st(1,0-(halo_size-1),0),1, & halo_nst(level,m),neighbour_n_rank,recvtag, & MPI_COMM_HORIZ, requests_nsT(1), ierr) ! Receive from south recvtag = 1003 if (LUseO) call mpi_irecv(a%s(0,a_n+1,1),1, & halo_ns(level,m),neighbour_s_rank,recvtag, & MPI_COMM_HORIZ, requests_ns(2), ierr) recvtag = 1013 if (LUseT) call mpi_irecv(a%st(1,a_n+1,0),1, & halo_nst(level,m),neighbour_s_rank,recvtag, & MPI_COMM_HORIZ, requests_nsT(2), ierr) ! Send to south sendtag = 1002 if (LUseO) call mpi_isend(a%s(0,a_n-(halo_size-1),1),1, & halo_ns(level,m),neighbour_s_rank,sendtag, & MPI_COMM_HORIZ, requests_ns(3), ierr) sendtag = 1012 if (LUseT) call mpi_isend(a%st(1,a_n-(halo_size-1),0),1, & halo_nst(level,m),neighbour_s_rank,sendtag, & MPI_COMM_HORIZ, requests_nsT(3), ierr) ! Send to north sendtag = 1003 if (LUseO) call mpi_isend(a%s(0,1,1),1, & halo_ns(level,m),neighbour_n_rank,sendtag, & MPI_COMM_HORIZ, requests_ns(4), ierr) sendtag = 1013 if (LUseT) call mpi_isend(a%st(1,1,0),1, & halo_nst(level,m),neighbour_n_rank,sendtag, & MPI_COMM_HORIZ, requests_nsT(4), ierr) if (halo_size > 1) then ! Wait for North <-> South communication to complete if (LUseO) call mpi_waitall(4,requests_ns, MPI_STATUSES_IGNORE, ierr) if (LUseT) call mpi_waitall(4,requests_nsT, MPI_STATUSES_IGNORE, ierr) end if ! 
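      !
      ! Ordering note: for halo_size > 1 the East-West messages below include
      ! the corner columns (yoffset = 1-halo_size), so the North-South
      ! exchange must have completed first for the corners to carry current
      ! data; hence the waitall above. For halo_size == 1 the corners are
      ! excluded (yoffset = 1), the two phases are independent, and the
      ! North-South wait is deferred until after the East-West messages have
      ! been posted.
      !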
Receive from west recvtag = 1000 if (LUseO) call mpi_irecv(a%s(0,yoffset,0-(halo_size-1)),blocklength, & MPI_DOUBLE_PRECISION,neighbour_w_rank,recvtag, & MPI_COMM_HORIZ, requests_ew(1), ierr) recvtag = 1010 if (LUseT) call mpi_irecv(a%st(0-(halo_size-1),0,0),1, & halo_wet(level,m),neighbour_w_rank,recvtag, & MPI_COMM_HORIZ, requests_ewT(1), ierr) ! Receive from east sendtag = 1001 if (LUseO) call mpi_irecv(a%s(0,yoffset,a_n+1),blocklength, & MPI_DOUBLE_PRECISION,neighbour_e_rank,recvtag, & MPI_COMM_HORIZ, requests_ew(2), ierr) sendtag = 1011 if (LUseT) call mpi_irecv(a%st(a_n+1,0,0),1, & halo_wet(level,m),neighbour_e_rank,recvtag, & MPI_COMM_HORIZ, requests_ewT(2), ierr) ! Send to east sendtag = 1000 if (LUseO) call mpi_isend(a%s(0,yoffset,a_n-(halo_size-1)),blocklength, & MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, & MPI_COMM_HORIZ, requests_ew(3), ierr) sendtag = 1010 if (LUseT) call mpi_isend(a%st(a_n-(halo_size-1),0,0),1, & halo_wet(level,m),neighbour_e_rank,sendtag, & MPI_COMM_HORIZ, requests_ewT(3), ierr) ! Send to west recvtag = 1001 if (LUseO) call mpi_isend(a%s(0,yoffset,1),blocklength, & MPI_DOUBLE_PRECISION,neighbour_w_rank,sendtag, & MPI_COMM_HORIZ, requests_ew(4), ierr) recvtag = 1011 if (LUseT) call mpi_isend(a%st(1,0,0),1, & halo_wet(level,m),neighbour_w_rank,sendtag, & MPI_COMM_HORIZ, requests_ewT(4), ierr) ! Wait for East <-> West communication to complete if (halo_size == 1) then ! Wait for North <-> South communication to complete if (LUseO) call mpi_waitall(4,requests_ns, MPI_STATUSES_IGNORE, ierr) if (LUseT) call mpi_waitall(4,requests_nsT, MPI_STATUSES_IGNORE, ierr) end if if (LUseO) call mpi_waitall(4,requests_ew, MPI_STATUSES_IGNORE, ierr) if (LUseT) call mpi_waitall(4,requests_ewT, MPI_STATUSES_IGNORE, ierr) end if if (comm_measuretime) then call finish_timer(t_haloswap(level,m)) end if end if end subroutine haloswap_mnh !================================================================== subroutine haloswap(level,m, & ! multigrid- and processor- level a) ! data field implicit none integer, intent(in) :: level integer, intent(in) :: m type(scalar3d), intent(inout) :: a integer :: a_n ! horizontal grid size integer :: nz ! vertical grid size integer, dimension(2) :: p_horiz integer :: stepsize integer :: ierr, rank, sendtag, recvtag integer :: stat(MPI_STATUS_SIZE) integer :: halo_size integer :: neighbour_n_rank integer :: neighbour_s_rank integer :: neighbour_e_rank integer :: neighbour_w_rank integer :: yoffset, blocklength integer, dimension(4) :: requests_ns integer, dimension(4) :: requests_ew halo_size = comm_param%halo_size ! Do nothing if we are only using one processor if (m > 0) then if (comm_measuretime) then call start_timer(t_haloswap(level,m)) end if a_n = a%ix_max-a%ix_min+1 nz = a%grid_param%nz stepsize = 2**(pproc-m) ! Work out rank, only execute on relevant processes call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr) call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr) ! Work out ranks of neighbours ! W -> E call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, & neighbour_w_rank,neighbour_e_rank,ierr) ! N -> S call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, & neighbour_n_rank,neighbour_s_rank,ierr) if ( (stepsize == 1) .or. & ( (mod(p_horiz(1),stepsize) == 0) .and. & (mod(p_horiz(2),stepsize) == 0) ) ) then if (halo_size == 1) then ! Do not include corners in send/recv yoffset = 1 blocklength = a_n*(nz+2)*halo_size else yoffset = 1-halo_size blocklength = (a_n+2*halo_size)*(nz+2)*halo_size end if ! 
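      !
      ! The North-South halos are exchanged with the derived type
      ! halo_ns(level,m), which is strided in memory, while the East-West
      ! halos are contiguous in the (z,y,x)-ordered field and are therefore
      ! sent as blocklength raw MPI_DOUBLE_PRECISION values starting at row
      ! yoffset. For example, with a_n = 8, nz = 10 and halo_size = 1 this
      ! gives blocklength = 8*12 = 96 reals per East-West message.
      !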
Receive from north recvtag = 2 call mpi_irecv(a%s(0,0-(halo_size-1),1),1, & halo_ns(level,m),neighbour_n_rank,recvtag, & MPI_COMM_HORIZ, requests_ns(1), ierr) ! Receive from south recvtag = 3 call mpi_irecv(a%s(0,a_n+1,1),1, & halo_ns(level,m),neighbour_s_rank,recvtag, & MPI_COMM_HORIZ, requests_ns(2), ierr) ! Send to south sendtag = 2 call mpi_isend(a%s(0,a_n-(halo_size-1),1),1, & halo_ns(level,m),neighbour_s_rank,sendtag, & MPI_COMM_HORIZ, requests_ns(3), ierr) ! Send to north sendtag = 3 call mpi_isend(a%s(0,1,1),1, & halo_ns(level,m),neighbour_n_rank,sendtag, & MPI_COMM_HORIZ, requests_ns(4), ierr) if (halo_size > 1) then ! Wait for North <-> South communication to complete call mpi_waitall(4,requests_ns, MPI_STATUSES_IGNORE, ierr) end if ! Receive from west recvtag = 0 call mpi_irecv(a%s(0,yoffset,0-(halo_size-1)),blocklength, & MPI_DOUBLE_PRECISION,neighbour_w_rank,recvtag, & MPI_COMM_HORIZ, requests_ew(1), ierr) ! Receive from east sendtag = 1 call mpi_irecv(a%s(0,yoffset,a_n+1),blocklength, & MPI_DOUBLE_PRECISION,neighbour_e_rank,recvtag, & MPI_COMM_HORIZ, requests_ew(2), ierr) ! Send to east sendtag = 0 call mpi_isend(a%s(0,yoffset,a_n-(halo_size-1)),blocklength, & MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, & MPI_COMM_HORIZ, requests_ew(3), ierr) ! Send to west recvtag = 1 call mpi_isend(a%s(0,yoffset,1),blocklength, & MPI_DOUBLE_PRECISION,neighbour_w_rank,sendtag, & MPI_COMM_HORIZ, requests_ew(4), ierr) ! Wait for East <-> West communication to complete if (halo_size == 1) then ! Wait for North <-> South communication to complete call mpi_waitall(4,requests_ns, MPI_STATUSES_IGNORE, ierr) end if call mpi_waitall(4,requests_ew, MPI_STATUSES_IGNORE, ierr) end if if (comm_measuretime) then call finish_timer(t_haloswap(level,m)) end if end if end subroutine haloswap !================================================================== ! Collect from a(level,m) and store on less processors ! in b(level,m-1) ! ! Example for pproc-m = 1, i.e. stepsize = 2: ! ! NW (0,0) <-- NE (0,2) ! ! ^ . ! ! . ! . ! SW (2,0) SE (2,2) [send to 0,0] ! !================================================================== subroutine collect(level,m, & ! multigrid and processor level a, & ! IN: data on level (level,m) b) ! OUT: data on level (level,m-1) implicit none integer, intent(in) :: level integer, intent(in) :: m type(scalar3d), intent(in) :: a type(scalar3d), intent(inout) :: b integer :: a_n, b_n ! horizontal grid sizes integer :: nz ! vertical grid size integer, dimension(2) :: p_horiz integer :: stepsize integer :: ierr, source_rank, dest_rank, rank, recv_tag, send_tag, iz logical :: corner_nw, corner_ne, corner_sw, corner_se integer :: recv_request(3) integer :: recv_requestT(3) call start_timer(t_collect(m)) stepsize = 2**(pproc-m) a_n = a%ix_max-a%ix_min+1 b_n = b%ix_max-b%ix_min+1 nz = b%grid_param%nz ! Work out rank, only execute on relevant processes call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr) ! Store position in process grid in in p_horiz ! Note we can NOT use cart_shift as we need diagonal neighburs as well call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr) ! Ignore all processes that do not participate at this level if ( (stepsize .eq. 1) .or. ((mod(p_horiz(1),stepsize) == 0) .and. (mod(p_horiz(2),stepsize)) == 0)) then ! Determine position in local 2x2 block if (stepsize .eq. 1) then corner_nw = ((mod(p_horiz(1),2) == 0) .and. (mod(p_horiz(2),2) == 0)) corner_ne = ((mod(p_horiz(1),2) == 0) .and. (mod(p_horiz(2),2) == 1)) corner_sw = ((mod(p_horiz(1),2) == 1) .and. 
(mod(p_horiz(2),2) == 0)) corner_se = ((mod(p_horiz(1),2) == 1) .and. (mod(p_horiz(2),2) == 1)) else corner_nw = ((mod(p_horiz(1)/stepsize,2) == 0) .and. (mod(p_horiz(2)/stepsize,2) == 0)) corner_ne = ((mod(p_horiz(1)/stepsize,2) == 0) .and. (mod(p_horiz(2)/stepsize,2) == 1)) corner_sw = ((mod(p_horiz(1)/stepsize,2) == 1) .and. (mod(p_horiz(2)/stepsize,2) == 0)) corner_se = ((mod(p_horiz(1)/stepsize,2) == 1) .and. (mod(p_horiz(2)/stepsize,2) == 1)) end if ! NW receives from the other three processes if ( corner_nw ) then ! Receive from NE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1),p_horiz(2)+stepsize/), & source_rank, & ierr) recv_tag = 1000 if (LUseO) call mpi_irecv(b%s(0,1,b_n/2+1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & recv_request(1),ierr) recv_tag = 1010 if (LUseT) call mpi_irecv(b%st(b_n/2+1,1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & recv_requestT(1),ierr) #ifndef NDEBUG if (ierr .ne. 0) & call fatalerror("Collect: receive from NE failed in mpi_irecv().") #endif ! Receive from SW call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)+stepsize,p_horiz(2)/), & source_rank, & ierr) recv_tag = 1001 if (LUseO) call mpi_irecv(b%s(0,b_n/2+1,1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & recv_request(2),ierr) recv_tag = 1011 if (LUseT) call mpi_irecv(b%st(1,b_n/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & recv_requestT(2),ierr) #ifndef NDEBUG if (ierr .ne. 0) & call fatalerror("Collect: receive from SW failed in mpi_irecv().") #endif ! Receive from SE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)+stepsize,p_horiz(2)+stepsize/), & source_rank, & ierr) recv_tag = 1002 if (LUseO) call mpi_irecv(b%s(0,b_n/2+1,b_n/2+1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & recv_request(3),ierr) recv_tag = 1012 if (LUseT) call mpi_irecv(b%st(b_n/2+1,b_n/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & recv_requestT(3),ierr) #ifndef NDEBUG if (ierr .ne. 0) & call fatalerror("Collect: receive from SE failed in mpi_irecv().") #endif ! Copy local data while waiting for data from other processes if (LUseO) b%s(0:nz+1,1:a_n,1:a_n) = a%s(0:nz+1,1:a_n,1:a_n) if (LUseT) b%st(1:a_n,1:a_n,0:nz+1) = a%st(1:a_n,1:a_n,0:nz+1) ! Wait for receives to complete before proceeding if (LUseO) call mpi_waitall(3,recv_request,MPI_STATUSES_IGNORE,ierr) if (LUseT) call mpi_waitall(3,recv_requestT,MPI_STATUSES_IGNORE,ierr) end if if ( corner_ne ) then ! Send to NW call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1),p_horiz(2)-stepsize/), & dest_rank, & ierr) send_tag = 1000 if (LUseO) call mpi_send(a%s(0,1,1),1,interior(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr) send_tag = 1010 if (LUseT) call mpi_send(a%st(1,1,0),1,interiorT(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr) #ifndef NDEBUG if (ierr .ne. 0) & call fatalerror("Collect: send from NE failed in mpi_send().") #endif end if if ( corner_sw ) then ! Send to NW call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)-stepsize,p_horiz(2)/), & dest_rank, & ierr) send_tag = 1001 if (LUseO) call mpi_send(a%s(0,1,1),1,interior(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr) send_tag = 1011 if (LUseT) call mpi_send(a%st(1,1,0),1,interiorT(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr) #ifndef NDEBUG if (ierr .ne. 0) & call fatalerror("Collect: send from SW failed in mpi_send().") #endif end if if ( corner_se ) then ! 
send to NW call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)-stepsize,p_horiz(2)-stepsize/), & dest_rank, & ierr) send_tag = 1002 if (LUseO) call mpi_send(a%s(0,1,1),1,interior(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr) send_tag = 1012 if (LUseT) call mpi_send(a%st(1,1,0),1,interiorT(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr) #ifndef NDEBUG if (ierr .ne. 0) & call fatalerror("Collect: send from SE failed in mpi_send().") #endif end if end if call finish_timer(t_collect(m)) end subroutine collect !================================================================== ! Distribute data in a(level,m-1) and store in b(level,m) ! ! Example for p-m = 1, i.e. stepsize = 2: ! ! NW (0,0) --> NE (2,0) ! ! ! . ! v . ! . ! SW (0,2) SE (2,2) [receive from to 0,0] !================================================================== subroutine distribute(level,m, & ! multigrid and processor level a, & ! IN: Data on level (level,m-1) b) ! OUT: Data on level (level,m) implicit none integer, intent(in) :: level integer, intent(in) :: m type(scalar3d), intent(in) :: a type(scalar3d), intent(inout) :: b integer :: a_n, b_n ! horizontal grid sizes integer :: nz ! vertical grid size integer, dimension(2) :: p_horiz integer :: stepsize integer :: ierr, source_rank, dest_rank, send_tag, recv_tag, rank, iz integer :: stat(MPI_STATUS_SIZE) integer :: send_request(3) integer :: send_requestT(3) logical :: corner_nw, corner_ne, corner_sw, corner_se call start_timer(t_distribute(m)) stepsize = 2**(pproc-m) a_n = a%ix_max-a%ix_min+1 b_n = b%ix_max-b%ix_min+1 nz = a%grid_param%nz ! Work out rank, only execute on relevant processes call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr) call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr) ! Ignore all processes that do not participate at this level if ( (stepsize .eq. 1) .or. ((mod(p_horiz(1),stepsize) == 0) .and. (mod(p_horiz(2),stepsize)) == 0)) then ! Work out coordinates in local 2 x 2 block if (stepsize .eq. 1) then corner_nw = ((mod(p_horiz(1),2) == 0) .and. (mod(p_horiz(2),2) == 0)) corner_ne = ((mod(p_horiz(1),2) == 0) .and. (mod(p_horiz(2),2) == 1)) corner_sw = ((mod(p_horiz(1),2) == 1) .and. (mod(p_horiz(2),2) == 0)) corner_se = ((mod(p_horiz(1),2) == 1) .and. (mod(p_horiz(2),2) == 1)) else corner_nw = ((mod(p_horiz(1)/stepsize,2) == 0) .and. (mod(p_horiz(2)/stepsize,2) == 0)) corner_ne = ((mod(p_horiz(1)/stepsize,2) == 0) .and. (mod(p_horiz(2)/stepsize,2) == 1)) corner_sw = ((mod(p_horiz(1)/stepsize,2) == 1) .and. (mod(p_horiz(2)/stepsize,2) == 0)) corner_se = ((mod(p_horiz(1)/stepsize,2) == 1) .and. (mod(p_horiz(2)/stepsize,2) == 1)) end if if ( corner_nw ) then ! (Asynchronous) send to NE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1),p_horiz(2)+stepsize/), & dest_rank, & ierr) send_tag = 1000 if (LUseO) call mpi_isend(a%s(0,1,a_n/2+1), 1,sub_interior(level,m-1),dest_rank, send_tag, & MPI_COMM_HORIZ,send_request(1),ierr) send_tag = 1010 if (LUseT) call mpi_isend(a%st(a_n/2+1,1,0), 1,sub_interiorT(level,m-1),dest_rank, send_tag, & MPI_COMM_HORIZ,send_requestT(1),ierr) #ifndef NDEBUG if (ierr .ne. 0) & call fatalerror("Distribute: send to NE failed in mpi_isend().") #endif ! 
(Asynchronous) send to SW call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)+stepsize,p_horiz(2)/), & dest_rank, & ierr) send_tag = 1001 if (LUseO) call mpi_isend(a%s(0,a_n/2+1,1),1,sub_interior(level,m-1), dest_rank, send_tag, & MPI_COMM_HORIZ, send_request(2), ierr) send_tag = 1011 if (LUseT) call mpi_isend(a%st(1,a_n/2+1,0),1,sub_interiorT(level,m-1), dest_rank, send_tag, & MPI_COMM_HORIZ, send_requestT(2), ierr) #ifndef NDEBUG if (ierr .ne. 0) & call fatalerror("Distribute: send to SW failed in mpi_isend().") #endif ! (Asynchronous) send to SE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)+stepsize,p_horiz(2)+stepsize/), & dest_rank, & ierr) send_tag = 1002 if (LUseO) call mpi_isend(a%s(0,a_n/2+1,a_n/2+1),1,sub_interior(level,m-1), dest_rank, send_tag, & MPI_COMM_HORIZ, send_request(3), ierr) send_tag = 1012 if (LUseT) call mpi_isend(a%st(a_n/2+1,a_n/2+1,0),1,sub_interiorT(level,m-1), dest_rank, send_tag, & MPI_COMM_HORIZ, send_requestT(3), ierr) #ifndef NDEBUG if (ierr .ne. 0) & call fatalerror("Distribute: send to SE failed in mpi_isend().") #endif ! While sending, copy local data if (LUseO) b%s(0:nz+1,1:b_n,1:b_n) = a%s(0:nz+1,1:b_n,1:b_n) if (LUseT) b%st(1:b_n,1:b_n,0:nz+1) = a%st(1:b_n,1:b_n,0:nz+1) ! Only proceed when async sends to complete if (LUseO) call mpi_waitall(3, send_request, MPI_STATUSES_IGNORE, ierr) if (LUseT) call mpi_waitall(3, send_requestT, MPI_STATUSES_IGNORE, ierr) end if if ( corner_ne ) then ! Receive from NW call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1),p_horiz(2)-stepsize/), & source_rank, & ierr) recv_tag = 1000 if (LUseO) call mpi_recv(b%s(0,1,1),1,interior(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) recv_tag = 1010 if (LUseT) call mpi_recv(b%st(1,1,0),1,interiorT(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) #ifndef NDEBUG if (ierr .ne. 0) & call fatalerror("Distribute: receive on NE failed in mpi_recv().") #endif end if if ( corner_sw ) then ! Receive from NW call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)-stepsize,p_horiz(2)/), & source_rank, & ierr) recv_tag = 1001 if (LUseO) call mpi_recv(b%s(0,1,1),1,interior(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) recv_tag = 1011 if (LUseT) call mpi_recv(b%st(1,1,0),1,interiorT(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) #ifndef NDEBUG if (ierr .ne. 0) & call fatalerror("Distribute: receive on SW failed in mpi_recv().") #endif end if if ( corner_se ) then ! Receive from NW call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)-stepsize,p_horiz(2)-stepsize/), & source_rank, & ierr) recv_tag = 1002 if (LUseO) call mpi_recv(b%s(0,1,1),1,interior(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) recv_tag = 1012 if (LUseT) call mpi_recv(b%st(1,1,0),1,interiorT(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) #ifndef NDEBUG if (ierr .ne. 0) & call fatalerror("Distribute: receive on NW failed in mpi_recv().") #endif end if end if call finish_timer(t_distribute(m)) end subroutine distribute end module communication
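
!==================================================================
! The collect/distribute routines above group the active processes into
! 2x2 blocks of the cartesian grid; only the NW member of each block stays
! active on the next coarser m, receiving (collect) or sending (distribute)
! the interiors of its NE/SW/SE partners. The commented, self-contained
! sketch below illustrates the corner classification for general stepsize
! (for stepsize == 1 it reduces to the mod(p_horiz,2) test used in the
! code). It is illustrative only and assumes the "mpi" module is available.
!
!!$program corner_demo
!!$  use mpi
!!$  implicit none
!!$  integer :: ierr, nproc, rank, pproc, m, stepsize, comm_horiz
!!$  integer, dimension(2) :: p_horiz
!!$  logical :: corner_nw, corner_ne, corner_sw, corner_se
!!$  call mpi_init(ierr)
!!$  call mpi_comm_size(MPI_COMM_WORLD, nproc, ierr)
!!$  pproc = nint(log(real(nproc))/log(4.0))
!!$  call mpi_cart_create(MPI_COMM_WORLD, 2, (/2**pproc,2**pproc/), &
!!$                       (/.false.,.false./), .true., comm_horiz, ierr)
!!$  call mpi_comm_rank(comm_horiz, rank, ierr)
!!$  call mpi_cart_coords(comm_horiz, rank, 2, p_horiz, ierr)
!!$  m = pproc                          ! example: collect from m to m-1
!!$  stepsize = 2**(pproc-m)
!!$  if ( (mod(p_horiz(1),stepsize) == 0) .and. &
!!$       (mod(p_horiz(2),stepsize) == 0) ) then
!!$    corner_nw = (mod(p_horiz(1)/stepsize,2) == 0) .and. (mod(p_horiz(2)/stepsize,2) == 0)
!!$    corner_ne = (mod(p_horiz(1)/stepsize,2) == 0) .and. (mod(p_horiz(2)/stepsize,2) == 1)
!!$    corner_sw = (mod(p_horiz(1)/stepsize,2) == 1) .and. (mod(p_horiz(2)/stepsize,2) == 0)
!!$    corner_se = (mod(p_horiz(1)/stepsize,2) == 1) .and. (mod(p_horiz(2)/stepsize,2) == 1)
!!$    if (corner_nw) write(*,*) 'rank', rank, 'receives from its NE/SW/SE partners'
!!$    if (corner_ne .or. corner_sw .or. corner_se) &
!!$         write(*,*) 'rank', rank, 'sends its interior to the NW corner'
!!$  end if
!!$  call mpi_finalize(ierr)
!!$end program corner_demo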