!=== COPYRIGHT AND LICENSE STATEMENT ===
!
!  This file is part of the TensorProductMultigrid code.
!  
!  (c) The copyright relating to this work is owned jointly by the
!  Crown, Met Office and NERC [2014]. However, it has been created
!  with the help of the GungHo Consortium, whose members are identified
!  at https://puma.nerc.ac.uk/trac/GungHo/wiki .
!  
!  Main Developer: Eike Mueller
!  
!  TensorProductMultigrid is free software: you can redistribute it and/or
!  modify it under the terms of the GNU Lesser General Public License as
!  published by the Free Software Foundation, either version 3 of the
!  License, or (at your option) any later version.
!  
!  TensorProductMultigrid is distributed in the hope that it will be useful,
!  but WITHOUT ANY WARRANTY; without even the implied warranty of
!  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!  GNU Lesser General Public License for more details.
!  
!  You should have received a copy of the GNU Lesser General Public License
!  along with TensorProductMultigrid (see files COPYING and COPYING.LESSER).
!  If not, see <http://www.gnu.org/licenses/>.
!
!=== COPYRIGHT AND LICENSE STATEMENT ===


!==================================================================
!
!  MPI communication routines for multigrid code
!
!    Eike Mueller, University of Bath, Feb 2012
!
!==================================================================

module communication
  use messages
  use datatypes

#ifndef MNH
  use mpi
#else
  use modd_mpif
#endif   
  use timer

  use mode_mnh_allocate_mg_halo

  implicit none

public::comm_preinitialise
public::comm_initialise
public::comm_finalise
public::scalarprod_mnh
public::scalarprod
public::print_scalaprod2
public::boundary_mnh
public::haloswap_mnh
public::haloswap
public::ihaloswap_mnh
public::ihaloswap
public::collect
public::distribute
public::i_am_master_mpi
public::master_rank
public::pproc
public::MPI_COMM_HORIZ
public::comm_parameters
public::comm_measuretime


  ! Number of processors
  ! n_proc = 2^(2*pproc), with integer pproc
  integer :: pproc

! Rank of master process
  integer, parameter :: master_rank = 0
! Am I the master process?
  logical :: i_am_master_mpi

  integer, parameter :: dim = 3  ! Dimension
  integer, parameter :: dim_horiz = 2  ! Horizontal dimension
  integer :: MPI_COMM_HORIZ ! Communicator with horizontal partitioning

private

! Data types for halo exchange in both x- and y-direction
  integer, dimension(:,:,:), allocatable :: halo_type

! Buffers for halo exchange <-> GPU (copy/managed memory)
  type type_halo_T
     real, dimension(:,:,:), pointer, contiguous :: haloTin,haloTout
  end type type_halo_T
! MPI vector data types
  ! Halo for data exchange in north-south direction
  integer, allocatable, dimension(:,:) :: halo_ns
  integer, allocatable, dimension(:,:) :: halo_nst
  type(type_halo_T), allocatable, dimension(:,:) :: tab_halo_nt
  type(type_halo_T), allocatable, dimension(:,:) :: tab_halo_st
  integer, allocatable, dimension(:,:) :: halo_wet
  type(type_halo_T), allocatable, dimension(:,:) :: tab_halo_wt
  type(type_halo_T), allocatable, dimension(:,:) :: tab_halo_et
  ! Vector data type for interior of field a(level,m)
  integer, allocatable, dimension(:,:) :: interior
  integer, allocatable, dimension(:,:) :: interiorT
  type(type_halo_T), allocatable, dimension(:,:) :: tab_interiorT_ne,tab_interiorT_sw,tab_interiorT_se
  ! Vector data type for one quarter of the interior of a field
  ! at level a(level,m). This has the same size as (and can be
  ! used for communications with) the interior of a(level,m+1)
  integer, allocatable, dimension(:,:) :: sub_interior
  integer, allocatable, dimension(:,:) :: sub_interiorT
  type(type_halo_T), allocatable, dimension(:,:) :: tab_sub_interiorT_ne,tab_sub_interiorT_sw,tab_sub_interiorT_se
  ! Timer for halo swaps
  type(time), allocatable, dimension(:,:) :: t_haloswap
  ! Timer for collect and distribute
  type(time), allocatable, dimension(:) :: t_collect
  type(time), allocatable, dimension(:) :: t_distribute
  ! Parallelisation parameters
  ! Measure communication times?
  logical :: comm_measuretime

  ! Parallel communication parameters
  type comm_parameters
    ! Size of halos
    integer :: halo_size
  end type comm_parameters

  type(comm_parameters) :: comm_param

! Data layout
! ===========
!
!  The number of processes has to be of the form nproc = 2^(2*pproc) to
!  ensure that data can be distributed between processes.
!  The processes are arranged in a (2^pproc) x (2^pproc) cartesian grid
!  in the horizontal plane (i.e. vertical columns are always local to one
!  process), which is implemented via the communicator MPI_COMM_HORIZ.
!  MPI_cart_rank() and MPI_cart_shift() can then be used to
!  identify neighbouring processes easily.

!  The number of data grid cells in each direction has to be a multiple
!  of 2**(L-1), where L is the number of levels (including the coarse
!  and fine level), with the coarse level corresponding to level=1.
!  We also define L_split as the level at which we start to pull
!  together data. For levels > L_split every process in the cartesian
!  grid takes part in the work; below this only a subset of processes
!  is used.
!
!  Each grid a(level,m) is identified by two numbers:
!  (1) The multigrid level it belongs to (level)
!  (2) The number of active processes that operate on it (2^(2*m)).
!
!  For level > L_split, m=pproc. At L_split we store a(L_split,pproc) and
!  a(L_split,pproc-1), and only processes with even coordinates in both
!  horizontal directions use the latter grid.
!  Below that level we store a(L_split-1,pproc-1) and a(L_split-1,pproc-2),
!  where only processes for which both horizontal coordinates are
!  multiples of four use the latter. This is continued until only one
!  process is left.
!
!
!  level
!    L          a(L,pproc)
!    L-1        a(L-1,pproc)
!    ...
!    L_split    a(L_split,pproc)    a(L_split  ,pproc-1)
!    L_split-1                      a(L_split-1,pproc-1)  a(L_split-1,pproc-2)
!
!                                                                  ... a(3,1)
!                                                                      a(2,1)
!                                                                      a(1,1)
!
!  When moving from left to right in the above graph the total number of
!  grid cells does not change, but the number of data points per process
!  increases by a factor of 4.
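!
!  Worked example (illustrative numbers only): for pproc = 2 there are
!  nproc = 2^(2*2) = 16 processes arranged as a 4 x 4 grid. With a
!  global horizontal size of n = 64, each process initially holds
!  nlocal = n/2^pproc = 16 cells per direction. After the first split
!  (m: 2 -> 1) only the processes with even coordinates in both
!  directions stay active and their local size doubles to nlocal = 32;
!  after the next split a single process holds the full 64 x 64 grid.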
!
! Parallel operations
! ===================
!
!   (*)  Halo exchange. Update halo with data from neighbouring
!        processors in cartesian grid on current (level,m)
!   (*)  Collect data on all processes at (level,m) on those
!        processes that are still active on (level,m-1).
!   (*)  Distribute data at (level,m-1) and duplicate on all processes
!        that are active at (level,m).
!
!   Note that in the cartesian processor grid the first coordinate
!   is the North-South (y-) direction, the second coordinate is the
!   East-West (x-) direction, i.e. the layout is this:
!
!   p_0 (0,0)   p_1 (0,1)   p_2 (0,2)   p_3 (0,3)
!
!   p_4 (1,0)   p_5 (1,1)   p_6 (1,2)   p_7 (1,3)
!
!   p_8 (2,0)   p_9 (2,1)   p_10 (2,2)  p_11 (2,3)
!
!                       [...]
!
!
!   Normal multigrid restriction and prolongation are used to
!   move between levels with fixed m.
!
!
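!   A minimal sketch (variable names are illustrative) of how the
!   neighbour ranks follow from this convention, as done in the
!   haloswap routines below:
!
!     stepsize = 2**(pproc-m)
!     call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr)
!     call mpi_cart_coords(MPI_COMM_HORIZ, rank, dim_horiz, p_horiz, ierr)
!     ! direction 0 = North-South (y-), direction 1 = East-West (x-)
!     call mpi_cart_shift(MPI_COMM_HORIZ, 0, stepsize, &
!                         neighbour_n_rank, neighbour_s_rank, ierr)
!     call mpi_cart_shift(MPI_COMM_HORIZ, 1, stepsize, &
!                         neighbour_w_rank, neighbour_e_rank, ierr)
!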

contains

!==================================================================
! Pre-initialise communication routines
!==================================================================
  subroutine comm_preinitialise()
    implicit none
    integer :: nproc, ierr, rank
    call mpi_comm_size(MPI_COMM_WORLD, nproc, ierr)
    call mpi_comm_rank(MPI_COMM_WORLD, rank, ierr)
    i_am_master_mpi = (rank == master_rank)
    ! Check that nproc = 2^(2*p)
    !pproc = floor(log(1.0d0*nproc)/log(4.0d0))
    pproc = nint(log(1.0d0*nproc)/log(4.0d0))
    if ( (nproc - 4**pproc) .ne. 0) then
      print*,"Number of processors has to be 2^(2*pproc) with integer nproc,pproc=",nproc,pproc
      call fatalerror("Number of processors has to be 2^(2*pproc) with integer pproc.")
    end if
    if (i_am_master_mpi) then
      write(STDOUT,'("PARALLEL RUN")')
      write(STDOUT,'("Number of processors : 2^(2*pproc) = ",I10," with pproc = ",I6)') &
        nproc, pproc
    end if
    ! (Halo data types are created later, in comm_initialise)

  end subroutine comm_preinitialise
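
! Worked example for the check in comm_preinitialise() (illustrative
! numbers only): nproc = 16 gives pproc = nint(log(16.0)/log(4.0)) = 2
! and 16 - 4**2 = 0, so the run proceeds on a 4 x 4 cartesian grid;
! nproc = 8 gives pproc = nint(1.5) = 2 but 8 - 4**2 /= 0, so
! fatalerror() is called.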

!==================================================================
! Initialise communication routines
!==================================================================
  subroutine comm_initialise(n_lev,        & !} multigrid parameters
                            lev_split,     & !}
                            grid_param,    & ! Grid parameters
                            comm_param_in)   ! Parallel communication
                                             ! parameters
    implicit none
    integer, intent(in) :: n_lev
    integer, intent(in) :: lev_split
    type(grid_parameters), intent(inout) :: grid_param
    type(comm_parameters), intent(in)    :: comm_param_in
    integer :: n
    integer :: nz
    integer :: rank, nproc, ierr
    integer :: count, blocklength, stride
    integer, dimension(2) :: p_horiz
    integer :: m, level, nlocal
    logical :: reduced_m
    integer :: halo_size
    character(len=32) :: t_label

    integer,parameter    :: nb_dims=3
    integer,dimension(nb_dims) :: profil_tab,profil_sous_tab,coord_debut

    n = grid_param%n
    nz = grid_param%nz

    comm_param = comm_param_in
    halo_size = comm_param%halo_size

    call mpi_comm_size(MPI_COMM_WORLD, nproc, ierr)

    ! Create cartesian topology
    call mpi_cart_create(MPI_COMM_WORLD,        & ! Old communicator name
                         dim_horiz,             & ! horizontal dimension
                         (/2**pproc,2**pproc/), & ! extent in each horizontal direction
                         (/.false.,.false./),   & ! periodic?
                         .false.,               & ! reorder?
                         MPI_COMM_HORIZ,        & ! Name of new communicator
                         ierr)
    ! Calculate rank and coordinates in the cartesian grid
    call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr)
    call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr)

    ! Local size of (horizontal) grid
    nlocal = n/2**pproc

    ! === Set up data types ===
    ! Halo for exchange in north-south direction
    if (LUseO) allocate(halo_ns(n_lev,0:pproc))
    if (LUseT) allocate(halo_nst(n_lev,0:pproc))
    if (LUseT) allocate(tab_halo_nt(n_lev,0:pproc))
    if (LUseT) allocate(tab_halo_st(n_lev,0:pproc))
    if (LUseT) allocate(halo_wet(n_lev,0:pproc))
    if (LUseT) allocate(tab_halo_wt(n_lev,0:pproc))
    if (LUseT) allocate(tab_halo_et(n_lev,0:pproc))
    ! Interior data types
    if (LUseO) allocate(interior(n_lev,0:pproc))
    if (LUseO) allocate(sub_interior(n_lev,0:pproc))
    if (LUseT) allocate(interiorT(n_lev,0:pproc))
    if (LUseT) allocate(sub_interiorT(n_lev,0:pproc))
    if (LUseT) allocate(tab_interiorT_ne(n_lev,0:pproc))
    if (LUseT) allocate(tab_interiorT_sw(n_lev,0:pproc))
    if (LUseT) allocate(tab_interiorT_se(n_lev,0:pproc))
    if (LUseT) allocate(tab_sub_interiorT_ne(n_lev,0:pproc))
    if (LUseT) allocate(tab_sub_interiorT_sw(n_lev,0:pproc))
    if (LUseT) allocate(tab_sub_interiorT_se(n_lev,0:pproc))
    ! Timer
    allocate(t_haloswap(n_lev,0:pproc))
    allocate(t_collect(0:pproc))
    allocate(t_distribute(0:pproc))
    do m=0,pproc
      write(t_label,'("t_collect(",I3,")")') m
      call initialise_timer(t_collect(m),t_label)
      write(t_label,'("t_distribute(",I3,")")') m
      call initialise_timer(t_distribute(m),t_label)
    end do

    m = pproc
    level = n_lev
    reduced_m = .false.
    do while (level > 0)
      ! --- Create halo data types ---
      if (LUseO) then
      ! NS- (y-) direction
      count = nlocal
      blocklength = (nz+2)*halo_size
      stride = (nlocal+2*halo_size)*(nz+2)
      call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION, &
                           halo_ns(level,m),ierr)
      call mpi_type_commit(halo_ns(level,m),ierr)
      endif
      ! transpose
      if (LUseT) then
      ! NS- (y-) transpose direction
      count =        nz+2                                        ! nlocal
      blocklength =  nlocal*halo_size                            ! (nz+2)*halo_size
      stride =       (nlocal+2*halo_size) * (nlocal+2*halo_size) ! (nlocal+2*halo_size)*(nz+2)
      call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION, &
                           halo_nst(level,m),ierr)
      call mpi_type_commit(halo_nst(level,m),ierr)
      ! allocate send/recv buffer for GPU copy/managed
      call mnh_allocate_mg_halo(tab_halo_nt(level,m)%haloTin,nlocal,halo_size,nz+2)
      call mnh_allocate_mg_halo(tab_halo_nt(level,m)%haloTout,nlocal,halo_size,nz+2)
      call mnh_allocate_mg_halo(tab_halo_st(level,m)%haloTin,nlocal,halo_size,nz+2)
      call mnh_allocate_mg_halo(tab_halo_st(level,m)%haloTout,nlocal,halo_size,nz+2)      
      
     ! WE- (x-) transpose direction
      count =        (nz+2)*(nlocal+2*halo_size)*halo_size       ! nlocal
      blocklength =  1*halo_size                                 ! (nz+2)*halo_size
      stride =       nlocal+2*halo_size                          ! (nlocal+2*halo_size)*(nz+2)
      call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION, &
                           halo_wet(level,m),ierr)
      call mpi_type_commit(halo_wet(level,m),ierr)
      ! allocate send/recv buffer for GPU copy/managed      !
      call mnh_allocate_mg_halo(tab_halo_wt(level,m)%haloTin,halo_size,nlocal+2*halo_size,nz+2)
      call mnh_allocate_mg_halo(tab_halo_wt(level,m)%haloTout,halo_size,nlocal+2*halo_size,nz+2)
      call mnh_allocate_mg_halo(tab_halo_et(level,m)%haloTin,halo_size,nlocal+2*halo_size,nz+2)
      call mnh_allocate_mg_halo(tab_halo_et(level,m)%haloTout,halo_size,nlocal+2*halo_size,nz+2)      
      endif
#ifndef NDEBUG
  if (ierr .ne. 0) &
    call fatalerror("Commit halo_ns failed in mpi_type_commit().")
#endif
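      ! Illustrative example of the halo_ns layout above (numbers are
      ! examples only): for nlocal = 4, nz = 2, halo_size = 1 the vector
      ! type consists of count = 4 blocks (one per interior x-index),
      ! each block holding blocklength = (nz+2)*halo_size = 4 contiguous
      ! reals (all vertical levels of one halo row), with consecutive
      ! blocks separated by stride = (nlocal+2*halo_size)*(nz+2) = 24
      ! reals, i.e. all (iz,iy) entries for one fixed x-index.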
      ! --- Create interior data types ---
      if (LUseO) then
         count = nlocal
         blocklength = nlocal*(nz+2)
         stride = (nz+2)*(nlocal+2*halo_size)
         call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION,interior(level,m),ierr)
         call mpi_type_commit(interior(level,m),ierr)
         count = nlocal/2
         blocklength = nlocal/2*(nz+2)
         stride = (nlocal+2*halo_size)*(nz+2)
         call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION,sub_interior(level,m),ierr)
         call mpi_type_commit(sub_interior(level,m),ierr)
      end if
      if (LUseT) then
         ! interiorT
         if ( nlocal /= 0 ) then
            profil_tab      = (/ nlocal+2*halo_size , nlocal+2*halo_size , nz+2 /)
            profil_sous_tab = (/ nlocal             , nlocal             , nz+2 /)
            coord_debut     = (/ 0                  , 0                  , 0    /)
            call MPI_TYPE_CREATE_SUBARRAY(nb_dims,profil_tab,profil_sous_tab,coord_debut,&
                 MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,interiorT(level,m),ierr)
            call mpi_type_commit(interiorT(level,m),ierr)
            call mnh_allocate_mg_halo(tab_interiorT_ne(level,m)%haloTin,nlocal,nlocal,nz+2)
            call mnh_allocate_mg_halo(tab_interiorT_ne(level,m)%haloTout,nlocal,nlocal,nz+2)
            call mnh_allocate_mg_halo(tab_interiorT_sw(level,m)%haloTin,nlocal,nlocal,nz+2)
            call mnh_allocate_mg_halo(tab_interiorT_sw(level,m)%haloTout,nlocal,nlocal,nz+2)
            call mnh_allocate_mg_halo(tab_interiorT_se(level,m)%haloTin,nlocal,nlocal,nz+2)
            call mnh_allocate_mg_halo(tab_interiorT_se(level,m)%haloTout,nlocal,nlocal,nz+2)
         end if
         ! sub_interiorT
         if ( (nlocal/2) /= 0 ) then
            profil_tab      = (/ nlocal+2*halo_size , nlocal+2*halo_size , nz+2 /)
            profil_sous_tab = (/ nlocal/2           , nlocal/2           , nz+2 /)
            coord_debut     = (/ 0                  , 0                  , 0    /)
            call MPI_TYPE_CREATE_SUBARRAY(nb_dims,profil_tab,profil_sous_tab,coord_debut,&
                 MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,sub_interiorT(level,m),ierr)
            call mpi_type_commit(sub_interiorT(level,m),ierr)
            call mnh_allocate_mg_halo(tab_sub_interiorT_ne(level,m)%haloTin,nlocal/2,nlocal/2,nz+2)
            call mnh_allocate_mg_halo(tab_sub_interiorT_ne(level,m)%haloTout,nlocal/2,nlocal/2,nz+2)
            call mnh_allocate_mg_halo(tab_sub_interiorT_sw(level,m)%haloTin,nlocal/2,nlocal/2,nz+2)
            call mnh_allocate_mg_halo(tab_sub_interiorT_sw(level,m)%haloTout,nlocal/2,nlocal/2,nz+2)
            call mnh_allocate_mg_halo(tab_sub_interiorT_se(level,m)%haloTin,nlocal/2,nlocal/2,nz+2)
            call mnh_allocate_mg_halo(tab_sub_interiorT_se(level,m)%haloTout,nlocal/2,nlocal/2,nz+2)            
         end if
      end if
      ! --- Create timers ---
      write(t_label,'("t_haloswap(",I3,",",I3,")")') level,m
      call initialise_timer(t_haloswap(level,m),t_label)

      ! If we are at or below L_split, split data
      if ( (level .le. lev_split) .and. (m > 0) .and. (.not. reduced_m)) then
        reduced_m = .true.
        m = m-1
        nlocal = 2*nlocal
        cycle
      end if
      reduced_m = .false.
      level = level-1
      nlocal = nlocal/2
    end do

  end subroutine comm_initialise
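
! Illustrative traversal of the level/m loop in comm_initialise()
! (example numbers only): for n_lev = 4, lev_split = 2, pproc = 1 and
! n = 16, data types are set up in the order
!   (level,m,nlocal) = (4,1,8), (3,1,4), (2,1,2), (2,0,4), (1,0,2),
! i.e. at level = lev_split the grid is stored both for m = pproc and,
! with doubled local size, for m = pproc-1, matching the layout sketch
! in the module header.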

!==================================================================
! Finalise communication routines
!==================================================================
  subroutine comm_finalise(n_lev,   &  ! } Multigrid parameters
                           lev_split,  & !}
                           grid_param )  ! } Grid parameters
    implicit none
    integer, intent(in) :: n_lev
    integer, intent(in) :: lev_split
    type(grid_parameters), intent(in) :: grid_param
    ! local var
    logical :: reduced_m
    integer :: level, m
    integer :: ierr
    integer :: nlocal,n
    character(len=80) :: s

   ! Local size of (horizontal) grid
    n = grid_param%n
    nlocal = n/2**pproc

    m = pproc
    level = n_lev
    reduced_m = .false.
    if (i_am_master_mpi) then
      write(STDOUT,'(" *** Finalising communications ***")')
    end if
    call print_timerinfo("--- Communication timing results ---")
    do while (level > 0)
      write(s,'("level = ",I3,", m = ",I3)') level, m
      call print_timerinfo(s)
      ! --- Print out timer information ---
      call print_elapsed(t_haloswap(level,m),.True.,1.0_rl)
      ! --- Free halo data types ---
      if (LUseO) call mpi_type_free(halo_ns(level,m),ierr)
      if (LUseT) call mpi_type_free(halo_nst(level,m),ierr)
      if (LUseT) call mpi_type_free(halo_wet(level,m),ierr)
      ! --- Free interior data types ---
      if (LUseO) call mpi_type_free(interior(level,m),ierr)
      if (LUseO) call mpi_type_free(sub_interior(level,m),ierr)
      if (LUseT .and. (nlocal /= 0 ) ) call mpi_type_free(interiorT(level,m),ierr)
      if (LUseT .and. ( (nlocal/2) /= 0 ) ) call mpi_type_free(sub_interiorT(level,m),ierr)
      ! If we are at or below L_split, split data
      if ( (level .le. lev_split) .and. (m > 0) .and. (.not. reduced_m)) then
        reduced_m = .true.
        m = m-1
        nlocal = 2*nlocal
        cycle
      end if
      reduced_m = .false.
      level = level-1
      nlocal = nlocal/2
    end do
    do m=pproc,0,-1
      write(s,'("m = ",I3)') m
      call print_timerinfo(s)
      ! --- Print out timer information ---
      call print_elapsed(t_collect(m),.True.,1.0_rl)
      call print_elapsed(t_distribute(m),.True.,1.0_rl)
    end do

    ! Deallocate arrays
    if (LUseO) deallocate(halo_ns)
    if (LUseT) deallocate(halo_nst,halo_wet)
    if (LUseO) deallocate(interior)
    if (LUseO) deallocate(sub_interior)
    if (LUseT) deallocate(interiorT)
    if (LUseT) deallocate(sub_interiorT)
    deallocate(t_haloswap)
    deallocate(t_collect)
    deallocate(t_distribute)
    if (i_am_master_mpi) then
      write(STDOUT,'("")')
    end if

  end subroutine comm_finalise

!==================================================================
!  Scalar product of two fields
!==================================================================
  subroutine scalarprod_mnh(m, a, b, s)
#ifdef MNH_MPPDB
    USE MODE_MPPDB
#endif    
    implicit none
    integer, intent(in) :: m
    type(scalar3d), intent(in) :: a
    type(scalar3d), intent(in) :: b
    real(kind=rl), intent(out) :: s
    !local var
    integer :: nprocs, rank, ierr
    integer :: p_horiz(2)
    integer :: stepsize
    integer, parameter :: dim_horiz = 2
    real(kind=rl) :: local_sum, global_sum 
    real(kind=rl) :: local_sumt,global_sumt
    integer :: nlocal, nz, i
    integer :: ix,iy,iz
    real(kind=rl) :: ddot

    integer :: iy_min,iy_max, ix_min,ix_max
    integer :: icompy_min,icompy_max, icompx_min,icompx_max
    real , dimension(:,:,:) , pointer , contiguous :: za_st,zb_st

    nlocal = a%ix_max-a%ix_min+1
    nz = a%grid_param%nz

    iy_min = a%iy_min
    iy_max = a%iy_max
    ix_min = a%ix_min
    ix_max = a%ix_max

    icompy_min = a%icompy_min
    icompy_max = a%icompy_max
    icompx_min = a%icompx_min
    icompx_max = a%icompx_max

    ! Work out coordinates of processor
    call mpi_comm_size(MPI_COMM_HORIZ,nprocs,ierr)
    call mpi_comm_rank(MPI_COMM_HORIZ,rank,ierr)
    stepsize = 2**(pproc-m)
    if (nprocs > 1) then
      ! Only include the local sum if the processor coordinates
      ! are multiples of stepsize
      call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr)
      if ( (stepsize == 1) .or.                   &
           ( (stepsize > 1) .and.                 &
             (mod(p_horiz(1),stepsize)==0) .and.  &
             (mod(p_horiz(2),stepsize)==0) ) ) then
        if (LUseO) then
           local_sum = 0.0_rl
           do i = 1, nlocal
              local_sum = local_sum &
                   + ddot((nz+2)*nlocal,a%s(0,1,i),1,b%s(0,1,i),1)
           end do
        end if
        if (LUseT) then
           local_sumt = 0.0_rl
           za_st => a%st
           zb_st => b%st
           !$acc kernels
           !$acc loop collapse(3) reduction(+:local_sumt)
           do iz=0,nz+1
              do iy=icompy_min,icompy_max
                 do ix=icompx_min,icompx_max
                    local_sumt = local_sumt &
                         + za_st(ix,iy,iz)*zb_st(ix,iy,iz)
                 end do
              end do
           end do
           !$acc end kernels
        end if
      else
        if (LUseO) local_sum = 0.0_rl
        if (LUseT) local_sumt = 0.0_rl
      end if
      if (LUseO) call mpi_allreduce(local_sum,global_sum,1,MPI_DOUBLE_PRECISION, &
                         MPI_SUM,MPI_COMM_HORIZ,ierr)
      if (LUseT) call mpi_allreduce(local_sumt,global_sumt,1,MPI_DOUBLE_PRECISION, &
                         MPI_SUM,MPI_COMM_HORIZ,ierr)
    else
      if (LUseO) then   
      global_sum = 0.0_rl
      do i = 1, nlocal
        global_sum = global_sum &
                   + ddot((nz+2)*nlocal,a%s(0,1,i),1,b%s(0,1,i),1)
      end do
      endif
      if (LUseT) then
      za_st => a%st
      zb_st => b%st
      global_sumt = 0.0_rl
         !$acc kernels
         do iz=0,nz+1
            do iy=iy_min,iy_max
               do ix=ix_min,ix_max
                  global_sumt = global_sumt &
                       + za_st(ix,iy,iz)*zb_st(ix,iy,iz)
               end do
            end do
         end do
         !$acc end kernels
      endif
    end if
    if (LUseO) then
       s = global_sum
    else
       s = global_sumt
    end if

#ifdef MNH
!!$    CALL MPPDB_CHECK0D_REAL_MG(s,"scalarprod_mnh")
#endif
    
  end subroutine scalarprod_mnh
!-------------------------------------------------------------------------------
  subroutine scalarprod(m, a, b, s)
    implicit none
    integer, intent(in) :: m
    type(scalar3d), intent(in) :: a
    type(scalar3d), intent(in) :: b
    real(kind=rl), intent(out) :: s
    integer :: nprocs, rank, ierr
    integer :: p_horiz(2)
    integer :: stepsize
    integer, parameter :: dim_horiz = 2
    real(kind=rl) :: local_sum, global_sum
    integer :: nlocal, nz, i
    real(kind=rl) :: ddot

    nlocal = a%ix_max-a%ix_min+1
    nz = a%grid_param%nz
    ! Work out coordinates of processor
    call mpi_comm_size(MPI_COMM_HORIZ,nprocs,ierr)
    call mpi_comm_rank(MPI_COMM_HORIZ,rank,ierr)
    stepsize = 2**(pproc-m)
    if (nprocs > 1) then
      ! Only include the local sum if the processor coordinates
      ! are multiples of stepsize
      call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr)
      if ( (stepsize == 1) .or.                   &
           ( (stepsize > 1) .and.                 &
             (mod(p_horiz(1),stepsize)==0) .and.  &
             (mod(p_horiz(2),stepsize)==0) ) ) then
        local_sum = 0.0_rl
        do i = 1, nlocal
          local_sum = local_sum &
                    + ddot((nz+2)*nlocal,a%s(0,1,i),1,b%s(0,1,i),1)
        end do
      else
        local_sum = 0.0_rl
      end if
      call mpi_allreduce(local_sum,global_sum,1,MPI_DOUBLE_PRECISION, &
                         MPI_SUM,MPI_COMM_HORIZ,ierr)
    else
      global_sum = 0.0_rl
      do i = 1, nlocal
        global_sum = global_sum &
                   + ddot((nz+2)*nlocal,a%s(0,1,i),1,b%s(0,1,i),1)
      end do
    end if
    s = global_sum
  end subroutine scalarprod
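!
!  Minimal usage sketch (illustrative only): to obtain the L2 norm of a
!  field r on the finest processor level m = pproc, one would call
!
!     call scalarprod_mnh(pproc, r, r, s)
!     rnorm = sqrt(s)
!
!  which is essentially what print_scalaprod2() below does before
!  printing the result on the master process.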
!==================================================================
!  Print the L2 norm (square root of the scalar product of a field with itself)
!==================================================================
  subroutine print_scalaprod2(l,m, a, message )
    implicit none
    integer, intent(in) :: l,m
    type(scalar3d), intent(in) :: a
    character(len=*) , intent(in) :: message

    !local 
    real(kind=rl) :: s

    call scalarprod_mnh(m, a, a, s)
    s = sqrt(s)
    if (i_am_master_mpi) then
       write(STDOUT,'("Print_norm::",A,2I3,E23.15)') message, l,m,s
       call flush(STDOUT)
    end if

  end subroutine print_scalaprod2
!==================================================================
! Boundary Neumann
!==================================================================
  subroutine boundary_mnh(a)              ! data field

    implicit none
 
    type(scalar3d), intent(inout) :: a

    !local var
    integer :: n, ix_min,ix_max,iy_min,iy_max
    integer :: icompx_max,icompy_max

    real , dimension(:,:,:) , pointer , contiguous :: za_st
    
    ! Update real boundary for the Neumann case:  u(0) = u(1) , etc ...

    !return

    n      = a%grid_param%n
    ix_min = a%ix_min
    ix_max = a%ix_max
    iy_min = a%iy_min
    iy_max = a%iy_max
    if (LUseO) then 
    if ( ix_min == 1 ) then
       a%s(:,:,0) = a%s(:,:,1)
    endif
    if ( ix_max == n ) then
       a%s(:,:,a%icompx_max+1) = a%s(:,:,a%icompx_max)
    endif
    if ( iy_min == 1 ) then
       a%s(:,0,:) = a%s(:,1,:)
    endif
    if ( iy_max == n ) then
       a%s(:,a%icompy_max+1,:) = a%s(:,a%icompy_max,:)
    endif
    endif
    if (LUseT) then
    !  transpose

    za_st => a%st
    icompx_max = a%icompx_max
    icompy_max = a%icompy_max

    !$acc kernels
    if ( ix_min == 1 ) then
       !acc kernels
       za_st(0,:,:) = za_st(1,:,:)
       !acc end kernels
    endif
    if ( ix_max == n ) then
       !acc kernels
       za_st(icompx_max+1,:,:) = za_st(icompx_max,:,:)
       !acc end kernels
    endif
    if ( iy_min == 1 ) then
       !acc kernels
       za_st(:,0,:) = za_st(:,1,:)
       !acc end kernels
    endif
    if ( iy_max == n ) then
       !acc kernels
       za_st(:,icompy_max+1,:) = za_st(:,icompy_max,:)
       !acc end kernels
    endif
    !$acc end kernels
   
    endif

  end subroutine boundary_mnh
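!
!  Note (illustrative): copying the first interior value into the ghost
!  cell, e.g. u(0) = u(1), makes the one-sided difference (u(1)-u(0))/h
!  vanish, i.e. it enforces the homogeneous Neumann (zero normal
!  gradient) condition at the physical boundary; the same holds for the
!  other three boundaries handled above.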
!==================================================================
!  Initiate asynchronous halo exchange
!
!  For all processes with horizontal indices that are multiples
!  of 2^(pproc-m), update halos with information held by
!  neighbouring processes, e.g. for pproc-m = 1, stepsize=2
!
!                      N  (0,2)
!                      ^
!                      !
!                      v
!
!      W (2,0) <-->  (2,2)  <-->  E (2,4)
!
!                      ^
!                      !
!                      v
!                      S (4,2)
!
!==================================================================
  subroutine ihaloswap_mnh(level,m,       &  ! multigrid- and processor- level
                       a,             &  ! data field
                       send_requests, &  ! send requests (OUT)
                       recv_requests,  &  ! recv requests (OUT)
                       send_requestsT, &  ! send requests T (OUT)
                       recv_requestsT  &  ! recv requests T (OUT)
                       )
    implicit none
    integer, intent(in) :: level
    integer, intent(in) :: m
    integer, intent(out), dimension(4) :: send_requests
    integer, intent(out), dimension(4) :: recv_requests
    integer, intent(out), dimension(4) :: send_requestsT
    integer, intent(out), dimension(4) :: recv_requestsT
    type(scalar3d), intent(inout) :: a
    integer :: a_n  ! horizontal grid size
    integer :: nz   ! vertical grid size
    integer, dimension(2) :: p_horiz
    integer :: stepsize
    integer :: ierr, rank, sendtag, recvtag
    integer :: stat(MPI_STATUS_SIZE)
    integer :: halo_size
    integer :: neighbour_n_rank
    integer :: neighbour_s_rank
    integer :: neighbour_e_rank
    integer :: neighbour_w_rank
    integer :: yoffset, blocklength

    halo_size = comm_param%halo_size

    ! Do nothing if we are only using one processor
    if (m > 0) then
      a_n = a%ix_max-a%ix_min+1
      nz = a%grid_param%nz
      stepsize = 2**(pproc-m)

      ! Work out rank, only execute on relevant processes
      call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr)
      call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr)

      ! Work out ranks of neighbours
      ! W -> E
      call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, &
                          neighbour_w_rank,neighbour_e_rank,ierr)
      ! N -> S
      call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, &
                          neighbour_n_rank,neighbour_s_rank,ierr)
      if ( (stepsize == 1) .or.                   &
         (  (mod(p_horiz(1),stepsize) == 0) .and. &
            (mod(p_horiz(2),stepsize) == 0) ) ) then
        if (halo_size == 1) then
          ! Do not include corners in send/recv
          yoffset = 1
          blocklength = a_n*(nz+2)*halo_size
        else
          yoffset = 1-halo_size
          blocklength = (a_n+2*halo_size)*(nz+2)*halo_size
        end if
        ! Receive from north
        recvtag = 1002
        if (LUseO) call mpi_irecv(a%s(0,0-(halo_size-1),1),1,       &
                       halo_ns(level,m),neighbour_n_rank,recvtag,   &
                       MPI_COMM_HORIZ, recv_requests(1), ierr)
        recvtag = 1012
        if (LUseT) then
           call mpi_irecv(a%st(1,0-(halo_size-1),0),1,      &
                halo_nst(level,m),neighbour_n_rank,recvtag,  &
                MPI_COMM_HORIZ, recv_requestsT(1), ierr)
        end if
        ! Receive from south
        recvtag = 1003
        if (LUseO) call mpi_irecv(a%s(0,a_n+1,1),1,                 &
                       halo_ns(level,m),neighbour_s_rank,recvtag,   &
                       MPI_COMM_HORIZ, recv_requests(2), ierr)
        recvtag = 1013
        if (LUseT) then
           call mpi_irecv(a%st(1,a_n+1,0),1,                &
                halo_nst(level,m),neighbour_s_rank,recvtag,  &
                MPI_COMM_HORIZ, recv_requestsT(2), ierr)
        end if
        ! Send to south
        sendtag = 1002
        if (LUseO) call mpi_isend(a%s(0,a_n-(halo_size-1),1),1,     &
                       halo_ns(level,m),neighbour_s_rank,sendtag,   &
                       MPI_COMM_HORIZ, send_requests(1), ierr)
        sendtag = 1012
        if (LUseT) then
           call mpi_isend(a%st(1,a_n-(halo_size-1),0),1,    &
                halo_nst(level,m),neighbour_s_rank,sendtag,  &
                MPI_COMM_HORIZ, send_requestsT(1), ierr)
        end if
        ! Send to north
        sendtag = 1003
        if (LUseO) call mpi_isend(a%s(0,1,1),1,                     &
                       halo_ns(level,m),neighbour_n_rank,sendtag,   &
                       MPI_COMM_HORIZ, send_requests(2), ierr)
        sendtag = 1013
        if (LUseT) then
           call mpi_isend(a%st(1,1,0),1,                    &
                       halo_nst(level,m),neighbour_n_rank,sendtag,  &
                       MPI_COMM_HORIZ, send_requestsT(2), ierr)
        end if
        ! Receive from west
        recvtag = 1000
        if (LUseO) call mpi_irecv(a%s(0,yoffset,0-(halo_size-1)),blocklength,  &
                       MPI_DOUBLE_PRECISION,neighbour_w_rank,recvtag, &
                       MPI_COMM_HORIZ, recv_requests(3), ierr)
        recvtag = 1010
        if (LUseT) then
           call mpi_irecv(a%st(0-(halo_size-1),0,0),1,  &
                halo_wet(level,m),neighbour_w_rank,recvtag, &
                MPI_COMM_HORIZ, recv_requestsT(3), ierr)
        end if
        ! Receive from east
        sendtag = 1001
        if (LUseO) call mpi_irecv(a%s(0,yoffset,a_n+1),blocklength,          &
                       MPI_DOUBLE_PRECISION,neighbour_e_rank,recvtag, &
                       MPI_COMM_HORIZ, recv_requests(4), ierr)
        sendtag = 1011
        if (LUseT) then
           call mpi_irecv(a%st(a_n+1,0,0),1,          &
                halo_wet(level,m),neighbour_e_rank,recvtag, &
                MPI_COMM_HORIZ, recv_requestsT(4), ierr)
        end if
        ! Send to east
        sendtag = 1000
        if (LUseO) call mpi_isend(a%s(0,yoffset,a_n-(halo_size-1)),blocklength,  &
                       MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, &
                       MPI_COMM_HORIZ, send_requests(3), ierr)
        sendtag = 1010
        if (LUseT) then
           call mpi_isend(a%st(a_n-(halo_size-1),0,0),1,  &
                halo_wet(level,m),neighbour_e_rank,sendtag, &
                MPI_COMM_HORIZ, send_requestsT(3), ierr)
        end if
        ! Send to west
        recvtag = 1001
        if (LUseO) call mpi_isend(a%s(0,yoffset,1),blocklength,                &
                       MPI_DOUBLE_PRECISION,neighbour_w_rank,sendtag,   &
                       MPI_COMM_HORIZ, send_requests(4), ierr)
        recvtag = 1011
        if (LUseT) then
           call mpi_isend(a%st(1,0,0),1,                &
                       halo_wet(level,m),neighbour_w_rank,sendtag,   &
                       MPI_COMM_HORIZ, send_requestsT(4), ierr)
        end if
      end if
    end if
  end subroutine ihaloswap_mnh
!==================================================================
  subroutine ihaloswap(level,m,       &  ! multigrid- and processor- level
                       a,             &  ! data field
                       send_requests, &  ! send requests (OUT)
                       recv_requests  &  ! recv requests (OUT)
                       )
    implicit none
    integer, intent(in) :: level
    integer, intent(in) :: m
    integer, intent(out), dimension(4) :: send_requests
    integer, intent(out), dimension(4) :: recv_requests
    type(scalar3d), intent(inout) :: a
    integer :: a_n  ! horizontal grid size
    integer :: nz   ! vertical grid size
    integer, dimension(2) :: p_horiz
    integer :: stepsize
    integer :: ierr, rank, sendtag, recvtag
    integer :: stat(MPI_STATUS_SIZE)
    integer :: halo_size
    integer :: neighbour_n_rank
    integer :: neighbour_s_rank
    integer :: neighbour_e_rank
    integer :: neighbour_w_rank
    integer :: yoffset, blocklength

    halo_size = comm_param%halo_size

    ! Do nothing if we are only using one processor
    if (m > 0) then
      a_n = a%ix_max-a%ix_min+1
      nz = a%grid_param%nz
      stepsize = 2**(pproc-m)

      ! Work out rank, only execute on relevant processes
      call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr)
      call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr)

      ! Work out ranks of neighbours
      ! W -> E
      call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, &
                          neighbour_w_rank,neighbour_e_rank,ierr)
      ! N -> S
      call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, &
                          neighbour_n_rank,neighbour_s_rank,ierr)
      if ( (stepsize == 1) .or.                   &
         (  (mod(p_horiz(1),stepsize) == 0) .and. &
            (mod(p_horiz(2),stepsize) == 0) ) ) then
        if (halo_size == 1) then
          ! Do not include corners in send/recv
          yoffset = 1
          blocklength = a_n*(nz+2)*halo_size
        else
          yoffset = 1-halo_size
          blocklength = (a_n+2*halo_size)*(nz+2)*halo_size
        end if
        ! Receive from north
        recvtag = 2
        call mpi_irecv(a%s(0,0-(halo_size-1),1),1,                  &
                       halo_ns(level,m),neighbour_n_rank,recvtag,   &
                       MPI_COMM_HORIZ, recv_requests(1), ierr)
        ! Receive from south
        recvtag = 3
        call mpi_irecv(a%s(0,a_n+1,1),1,                            &
                       halo_ns(level,m),neighbour_s_rank,recvtag,   &
                       MPI_COMM_HORIZ, recv_requests(2), ierr)
        ! Send to south
        sendtag = 2
        call mpi_isend(a%s(0,a_n-(halo_size-1),1),1,                &
                       halo_ns(level,m),neighbour_s_rank,sendtag,   &
                       MPI_COMM_HORIZ, send_requests(1), ierr)
        ! Send to north
        sendtag = 3
        call mpi_isend(a%s(0,1,1),1,                                &
                       halo_ns(level,m),neighbour_n_rank,sendtag,   &
                       MPI_COMM_HORIZ, send_requests(2), ierr)
        ! Receive from west
        recvtag = 0
        call mpi_irecv(a%s(0,yoffset,0-(halo_size-1)),blocklength,    &
                       MPI_DOUBLE_PRECISION,neighbour_w_rank,recvtag, &
                       MPI_COMM_HORIZ, recv_requests(3), ierr)
        ! Receive from east
        sendtag = 1
        call mpi_irecv(a%s(0,yoffset,a_n+1),blocklength,          &
                       MPI_DOUBLE_PRECISION,neighbour_e_rank,recvtag, &
                       MPI_COMM_HORIZ, recv_requests(4), ierr)
        ! Send to east
        sendtag = 0
        call mpi_isend(a%s(0,yoffset,a_n-(halo_size-1)),blocklength,  &
                       MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, &
                       MPI_COMM_HORIZ, send_requests(3), ierr)
        ! Send to west
        recvtag = 1
        call mpi_isend(a%s(0,yoffset,1),blocklength,                &
                       MPI_DOUBLE_PRECISION,neighbour_w_rank,sendtag,   &
                       MPI_COMM_HORIZ, send_requests(4), ierr)
      end if
    end if
  end subroutine ihaloswap
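!
!  Minimal usage sketch for the asynchronous halo exchange above
!  (illustrative only; array names are examples): on processes that
!  participate at (level,m), the returned requests must be completed
!  with mpi_waitall before the halo data is used, e.g.
!
!     call ihaloswap(level, m, a, send_requests, recv_requests)
!     ! ... overlap independent computation here ...
!     call mpi_waitall(4, recv_requests, MPI_STATUSES_IGNORE, ierr)
!     call mpi_waitall(4, send_requests, MPI_STATUSES_IGNORE, ierr)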

!==================================================================
!  Halo exchange
!
!  For all processes with horizontal indices that are multiples
!  of 2^(pproc-m), update halos with information held by
!  neighbouring processes, e.g. for pproc-m = 1, stepsize=2
!
!                      N  (0,2)
!                      ^
!                      !
!                      v
!
!      W (2,0) <-->  (2,2)  <-->  E (2,4)
!
!                      ^
!                      !
!                      v
!                      S (4,2)
!
!==================================================================
  subroutine haloswap_mnh(level,m, &  ! multigrid- and processor- level
                      a)          ! data field
    implicit none
    integer, intent(in) :: level
    integer, intent(in) :: m
    type(scalar3d), intent(inout) :: a
    !local var
    integer :: a_n  ! horizontal grid size
    integer :: nz   ! vertical grid size
    integer, dimension(2) :: p_horiz
    integer :: stepsize
    integer :: ierr, rank, sendtag, recvtag
    integer :: stat(MPI_STATUS_SIZE)
    integer :: halo_size
    integer :: neighbour_n_rank
    integer :: neighbour_s_rank
    integer :: neighbour_e_rank
    integer :: neighbour_w_rank
    integer :: yoffset, blocklength
    integer, dimension(4) :: requests_ns
    integer, dimension(4) :: requests_ew
    integer, dimension(4) :: requests_nsT
    integer, dimension(4) :: requests_ewT

    integer :: ii,ij,ik
    real , pointer , contiguous , dimension(:,:,:) :: zst
    !
    real , pointer , contiguous , dimension(:,:,:) :: ztab_halo_st_haloTin,ztab_halo_nt_haloTin
    real , pointer , contiguous , dimension(:,:,:) :: ztab_halo_et_haloTin,ztab_halo_wt_haloTin
    !
    real , pointer , contiguous , dimension(:,:,:) :: ztab_halo_nt_haloTout,ztab_halo_st_haloTout
    real , pointer , contiguous , dimension(:,:,:) :: ztab_halo_wt_haloTout,ztab_halo_et_haloTout

    INTEGER,PARAMETER :: IS_WEST=1 , IS_EAST=2, IS_SOUTH=3, IS_NORTH=4

    LOGICAL :: Gneighbour_s,Gneighbour_n,Gneighbour_e,Gneighbour_w

    halo_size = comm_param%halo_size

    ! Do nothing if we are only using one processor
    if (m > 0) then
      if (comm_measuretime) then
        call start_timer(t_haloswap(level,m))
      end if
      a_n = a%ix_max-a%ix_min+1
      nz = a%grid_param%nz
      stepsize = 2**(pproc-m)

      ! Work out rank, only execute on relevant processes
      call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr)
      call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr)

      ! Work out ranks of neighbours
      ! W -> E
      call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, &
                          neighbour_w_rank,neighbour_e_rank,ierr)
      ! N -> S
      call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, &
                          neighbour_n_rank,neighbour_s_rank,ierr)
      if ( (stepsize == 1) .or.                   &
         (  (mod(p_horiz(1),stepsize) == 0) .and. &
            (mod(p_horiz(2),stepsize) == 0) ) ) then
        if (halo_size == 1) then
          ! Do not include corners in send/recv
          yoffset = 1
          blocklength = a_n*(nz+2)*halo_size
        else
          yoffset = 1-halo_size
          blocklength = (a_n+2*halo_size)*(nz+2)*halo_size
        end if
        !
        zst => a%st
        !
        Gneighbour_s = (neighbour_s_rank >= 0)
        Gneighbour_n = (neighbour_n_rank >= 0)
        Gneighbour_e = (neighbour_e_rank >= 0)
        Gneighbour_w = (neighbour_w_rank >= 0)        
        !
        requests_ns(:) = MPI_REQUEST_NULL
        requests_ew(:) = MPI_REQUEST_NULL
        requests_nsT(:) = MPI_REQUEST_NULL
        requests_ewT(:) = MPI_REQUEST_NULL
        !
        !  Init Pointer
        !
#ifdef MNH_GPUDIRECT
        if (LUseT) then
           ! Send to south
           if (Gneighbour_s) then
              ztab_halo_st_haloTin => tab_halo_st(level,m)%haloTin
           end if
           ! Send to north
           if (Gneighbour_n) then
              ztab_halo_nt_haloTin => tab_halo_nt(level,m)%haloTin
           end if
           ! Send to east
           if (Gneighbour_e) then
              ztab_halo_et_haloTin => tab_halo_et(level,m)%haloTin
           end if
           ! Send to west
           if (Gneighbour_w) then
              ztab_halo_wt_haloTin => tab_halo_wt(level,m)%haloTin
           end if
           ! Receive from north
           if (Gneighbour_n) then
              ztab_halo_nt_haloTout => tab_halo_nt(level,m)%haloTout
           end if
           ! Receive from south
           if (Gneighbour_s) then
              ztab_halo_st_haloTout => tab_halo_st(level,m)%haloTout
           end if
           ! Receive from west
           if (Gneighbour_w) then
              ztab_halo_wt_haloTout => tab_halo_wt(level,m)%haloTout
           end if
           ! Receive from east
           if (Gneighbour_e) then
              ztab_halo_et_haloTout => tab_halo_et(level,m)%haloTout
           end if
        end if
#endif
        !
        call haloswap_mnh_dim(ztab_halo_st_haloTin,ztab_halo_nt_haloTin,&
                              ztab_halo_et_haloTin,ztab_halo_wt_haloTin,&
                              ztab_halo_nt_haloTout,ztab_halo_st_haloTout,&
                              ztab_halo_wt_haloTout,ztab_halo_et_haloTout,&
                              zst)
        !
     end if!  (stepsize == 1) ...
     if (comm_measuretime) then
        call finish_timer(t_haloswap(level,m))
     end if
  end if !  (m > 0)

contains
  subroutine haloswap_mnh_dim(pztab_halo_st_haloTin,pztab_halo_nt_haloTin,&
                              pztab_halo_et_haloTin,pztab_halo_wt_haloTin,&
                              pztab_halo_nt_haloTout,pztab_halo_st_haloTout,&
                              pztab_halo_wt_haloTout,pztab_halo_et_haloTout,&
                              pzst)

    implicit none
    real :: pztab_halo_st_haloTin(1:a_n,1:halo_size,1:nz+2), &
            pztab_halo_nt_haloTin(1:a_n,1:halo_size,1:nz+2), &
            pztab_halo_et_haloTin(1:halo_size,1:a_n+2*halo_size,1:nz+2), &
            pztab_halo_wt_haloTin(1:halo_size,1:a_n+2*halo_size,1:nz+2), &
            pztab_halo_nt_haloTout(1:a_n,1:halo_size,1:nz+2), &
            pztab_halo_st_haloTout(1:a_n,1:halo_size,1:nz+2), &
            pztab_halo_wt_haloTout(1:halo_size,1:a_n+2*halo_size,1:nz+2), &
            pztab_halo_et_haloTout(1:halo_size,1:a_n+2*halo_size,1:nz+2), &
            pzst(1-halo_size:a_n+halo_size,1-halo_size:a_n+halo_size,0:nz+1)
        !
        ! Do Comm
        !
#ifdef MNH_GPUDIRECT
        if (LUseT) then
           !
           ! Copy send buffer async to GPU
           !
           ! Send to south
           if (Gneighbour_s) then
!!$           pztab_halo_st_haloTin => tab_halo_st(level,m)%haloTin
           !$acc kernels async(IS_SOUTH) present_cr(pzst,pztab_halo_st_haloTin)
           !$mnh_do_concurrent( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 )
              pztab_halo_st_haloTin(ii,ij,ik) = pzst(ii,ij+a_n-halo_size,ik-1)
           !$mnh_end_do()
           !$acc end kernels
           end if
           ! Send to north
           if (Gneighbour_n) then
!!$           pztab_halo_nt_haloTin => tab_halo_nt(level,m)%haloTin
           !$acc kernels async(IS_NORTH) present_cr(pzst,pztab_halo_nt_haloTin)
           !$mnh_do_concurrent( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 )           
              pztab_halo_nt_haloTin(ii,ij,ik) = pzst(ii,ij,ik-1)
           !$mnh_end_do()
           !$acc end kernels
           end if
           ! Send to east
           if (Gneighbour_e) then
!!$           pztab_halo_et_haloTin => tab_halo_et(level,m)%haloTin
           !$acc kernels async(IS_EAST) present_cr(pzst,pztab_halo_et_haloTin)
           !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) 
              pztab_halo_et_haloTin(ii,ij,ik) = pzst(ii+a_n-halo_size,ij-halo_size,ik-1)
           !$mnh_end_do()
           !$acc end kernels
           end if
           ! Send to west
           if (Gneighbour_w) then
!!$           pztab_halo_wt_haloTin => tab_halo_wt(level,m)%haloTin
           !$acc kernels async(IS_WEST) present_cr(pzst,pztab_halo_wt_haloTin)
           !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) 
              pztab_halo_wt_haloTin(ii,ij,ik) = pzst(ii,ij-halo_size,ik-1)
           !$mnh_end_do()
           !$acc end kernels
           end if
        end if
#endif
        ! Receive from north
        recvtag = 1002
        if (LUseO) call mpi_irecv(a%s(0,0-(halo_size-1),1),1,       &
                       halo_ns(level,m),neighbour_n_rank,recvtag,   &
                       MPI_COMM_HORIZ, requests_ns(1), ierr)
        recvtag = 1012
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           if (Gneighbour_n) then
!!$           pztab_halo_nt_haloTout => tab_halo_nt(level,m)%haloTout
           !$acc host_data use_device(pztab_halo_nt_haloTout)
           call mpi_irecv(pztab_halo_nt_haloTout,size(pztab_halo_nt_haloTout),      &
                       MPI_DOUBLE_PRECISION,neighbour_n_rank,recvtag,  &
                       MPI_COMM_HORIZ, requests_nsT(1), ierr)
           !$acc end host_data
           end if
           !print*,"mpi_irecv(pztab_halo_nt_haloTout,neighbour_n_rank=",neighbour_n_rank
#else
           call mpi_irecv(a%st(1,0-(halo_size-1),0),1,      &
                       halo_nst(level,m),neighbour_n_rank,recvtag,  &
                       MPI_COMM_HORIZ, requests_nsT(1), ierr)
#endif
        end if
        ! Receive from south
        recvtag = 1003
        if (LUseO) call mpi_irecv(a%s(0,a_n+1,1),1,                 &
                       halo_ns(level,m),neighbour_s_rank,recvtag,   &
                       MPI_COMM_HORIZ, requests_ns(2), ierr)
        recvtag = 1013
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           if (Gneighbour_s) then
!!$           pztab_halo_st_haloTout => tab_halo_st(level,m)%haloTout
           !$acc host_data use_device (pztab_halo_st_haloTout)
           call mpi_irecv(pztab_halo_st_haloTout,size(pztab_halo_st_haloTout),  &
                       MPI_DOUBLE_PRECISION,neighbour_s_rank,recvtag,  &
                       MPI_COMM_HORIZ, requests_nsT(2), ierr)
           !$acc end host_data
           end if
           !print*,"mpi_irecv(pztab_halo_st_haloTout,neighbour_s_rank=",neighbour_s_rank
#else
           call mpi_irecv(a%st(1,a_n+1,0),1,                &
                       halo_nst(level,m),neighbour_s_rank,recvtag,  &
                       MPI_COMM_HORIZ, requests_nsT(2), ierr)           
#endif
        end if
#ifdef MNH_GPUDIRECT
        if (LUseT) then
           ! wait for async copy of send buffer to GPU
           call acc_wait_haloswap_mnh()           
        end if
#endif        
        ! Send to south
        sendtag = 1002
        if (LUseO) call mpi_isend(a%s(0,a_n-(halo_size-1),1),1,     &
                       halo_ns(level,m),neighbour_s_rank,sendtag,   &
                       MPI_COMM_HORIZ, requests_ns(3), ierr)
        sendtag = 1012
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           if (Gneighbour_s) then
           !$acc host_data use_device(pztab_halo_st_haloTin)
           call mpi_isend(pztab_halo_st_haloTin,size(pztab_halo_st_haloTin),    &
                       MPI_DOUBLE_PRECISION,neighbour_s_rank,sendtag,  &
                       MPI_COMM_HORIZ, requests_nsT(3), ierr)
           !$acc end host_data
           end if
           !print*,"mpi_isend(pztab_halo_st_haloTin,neighbour_s_rank=",neighbour_s_rank
#else   
           call mpi_isend(a%st(1,a_n-(halo_size-1),0),1,    &
                       halo_nst(level,m),neighbour_s_rank,sendtag,  &
                       MPI_COMM_HORIZ, requests_nsT(3), ierr)
#endif
        end if
        ! Send to north
        sendtag = 1003
        if (LUseO) call mpi_isend(a%s(0,1,1),1,                     &
                       halo_ns(level,m),neighbour_n_rank,sendtag,   &
                       MPI_COMM_HORIZ, requests_ns(4), ierr)
        sendtag = 1013
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           if (Gneighbour_n) then
           !$acc host_data use_device(pztab_halo_nt_haloTin)
           call mpi_isend(pztab_halo_nt_haloTin,size(pztab_halo_nt_haloTin),   &
                       MPI_DOUBLE_PRECISION,neighbour_n_rank,sendtag,  &
                       MPI_COMM_HORIZ, requests_nsT(4), ierr)
           !$acc end host_data
           end if
           !print*,"mpi_isend(pztab_halo_nt_haloTin,neighbour_n_rank=",neighbour_n_rank
#else
           call mpi_isend(a%st(1,1,0),1,                    &
                       halo_nst(level,m),neighbour_n_rank,sendtag,  &
                       MPI_COMM_HORIZ, requests_nsT(4), ierr)           
#endif
        end if
        if (halo_size > 1) then
          ! Wait for North <-> South communication to complete
          if (LUseO) call mpi_waitall(4,requests_ns, MPI_STATUSES_IGNORE, ierr)
          if (LUseT) call mpi_waitall(4,requests_nsT, MPI_STATUSES_IGNORE, ierr)
        end if
        ! Receive from west
        recvtag = 1000
        if (LUseO) call mpi_irecv(a%s(0,yoffset,0-(halo_size-1)),blocklength,    &
                       MPI_DOUBLE_PRECISION,neighbour_w_rank,recvtag, &
                       MPI_COMM_HORIZ, requests_ew(1), ierr)
        recvtag = 1010
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           if (Gneighbour_w) then
!!$           pztab_halo_wt_haloTout => tab_halo_wt(level,m)%haloTout
           !$acc host_data use_device(pztab_halo_wt_haloTout)
           call mpi_irecv(pztab_halo_wt_haloTout,size(pztab_halo_wt_haloTout),  &
                       MPI_DOUBLE_PRECISION,neighbour_w_rank,recvtag, &
                       MPI_COMM_HORIZ, requests_ewT(1), ierr)
           !$acc end host_data
           end if
           !print*,"mpi_irecv(pztab_halo_wt_haloTout,neighbour_w_rank=",neighbour_w_rank
#else
           call mpi_irecv(a%st(0-(halo_size-1),0,0),1,  &
                       halo_wet(level,m),neighbour_w_rank,recvtag, &
                       MPI_COMM_HORIZ, requests_ewT(1), ierr)          
#endif
        end if
        ! Receive from east
        sendtag = 1001
        if (LUseO) call mpi_irecv(a%s(0,yoffset,a_n+1),blocklength,          &
                       MPI_DOUBLE_PRECISION,neighbour_e_rank,recvtag, &
                       MPI_COMM_HORIZ, requests_ew(2), ierr)
        sendtag = 1011
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           if (Gneighbour_e) then
!!$           pztab_halo_et_haloTout => tab_halo_et(level,m)%haloTout
           !$acc host_data use_device(pztab_halo_et_haloTout)
           call mpi_irecv(pztab_halo_et_haloTout,size(pztab_halo_et_haloTout),  &
                       MPI_DOUBLE_PRECISION,neighbour_e_rank,recvtag, &
                       MPI_COMM_HORIZ, requests_ewT(2), ierr)
           !$acc end host_data
           end if
           !print*,"mpi_irecv(pztab_halo_et_haloTout,neighbour_e_rank=",neighbour_e_rank
#else
           call mpi_irecv(a%st(a_n+1,0,0),1,          &
                       halo_wet(level,m),neighbour_e_rank,recvtag, &
                       MPI_COMM_HORIZ, requests_ewT(2), ierr)
           
#endif
        end if
        ! Send to east
        sendtag = 1000
        if (LUseO) call mpi_isend(a%s(0,yoffset,a_n-(halo_size-1)),blocklength,  &
                       MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, &
                       MPI_COMM_HORIZ, requests_ew(3), ierr)
        sendtag = 1010
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           if (Gneighbour_e) then
           !$acc host_data use_device(pztab_halo_et_haloTin)
           call mpi_isend(pztab_halo_et_haloTin,size(pztab_halo_et_haloTin),  &
                       MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, &
                       MPI_COMM_HORIZ, requests_ewT(3), ierr)
           !$acc end host_data
           end if
           !print*,"mpi_isend(pztab_halo_et_haloTin,neighbour_e_rank=",neighbour_e_rank
#else
           call mpi_isend(a%st(a_n-(halo_size-1),0,0),1,  &
                       halo_wet(level,m),neighbour_e_rank,sendtag, &
                       MPI_COMM_HORIZ, requests_ewT(3), ierr)           
#endif
        end if
        ! Send to west
        recvtag = 1001
        if (LUseO) call mpi_isend(a%s(0,yoffset,1),blocklength,                &
                       MPI_DOUBLE_PRECISION,neighbour_w_rank,sendtag,   &
                       MPI_COMM_HORIZ, requests_ew(4), ierr)
        recvtag = 1011
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           if (Gneighbour_w) then
           !$acc host_data use_device(pztab_halo_wt_haloTin)
           call mpi_isend(pztab_halo_wt_haloTin,size(pztab_halo_wt_haloTin),  &
                       MPI_DOUBLE_PRECISION,neighbour_w_rank,sendtag,   &
                       MPI_COMM_HORIZ, requests_ewT(4), ierr)
           !$acc end host_data
           end if
           !print*,"mpi_isend(pztab_halo_wt_haloTin,neighbour_w_rank=",neighbour_w_rank
#else
           call mpi_isend(a%st(1,0,0),1,                &
                       halo_wet(level,m),neighbour_w_rank,sendtag,   &
                       MPI_COMM_HORIZ, requests_ewT(4), ierr)           
#endif
        end if
        ! Wait for East <-> West communication to complete
        if (halo_size == 1) then
          ! Wait for North <-> South communication to complete
          if (LUseO) call mpi_waitall(4,requests_ns, MPI_STATUSES_IGNORE, ierr)
          if (LUseT) call mpi_waitall(4,requests_nsT, MPI_STATUSES_IGNORE, ierr)
        end if
        if (LUseO) call mpi_waitall(4,requests_ew, MPI_STATUSES_IGNORE, ierr)
        if (LUseT) call mpi_waitall(4,requests_ewT, MPI_STATUSES_IGNORE, ierr)
#ifdef MNH_GPUDIRECT
        if (LUseT) then
           if (Gneighbour_n) then
           ! copy north halo for GPU managed
           !$acc kernels async(IS_NORTH) present_cr(pzst,pztab_halo_nt_haloTout)
           !$mnh_do_concurrent( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 )
              pzst(ii,ij-halo_size,ik-1) = pztab_halo_nt_haloTout(ii,ij,ik)
           !$mnh_end_do()
           !$acc end kernels
           end if
           if (Gneighbour_s) then
           ! copy south halo for GPU managed
           !$acc kernels async(IS_SOUTH) present_cr(pzst,pztab_halo_st_haloTout)
           !$mnh_do_concurrent( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 )
              pzst(ii,ij+a_n,ik-1) = pztab_halo_st_haloTout(ii,ij,ik)
           !$mnh_end_do()
           !$acc end kernels
           end if
           if (Gneighbour_w) then
           ! copy west halo for GPU managed
           !$acc kernels async(IS_WEST) present_cr(pzst,pztab_halo_wt_haloTout)
           !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 )
              pzst(ii-halo_size,ij-halo_size,ik-1) = pztab_halo_wt_haloTout(ii,ij,ik)
           !$mnh_end_do()
           !$acc end kernels
           end if
           if (Gneighbour_e) then
           ! copy east halo for GPU managed
           !$acc kernels async(IS_EAST) present_cr(pzst,pztab_halo_et_haloTout)
           !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 )
              pzst(ii+a_n,ij-halo_size,ik-1) = pztab_halo_et_haloTout(ii,ij,ik)
           !$mnh_end_do()
           !$acc end kernels           
           end if 
           ! wait for the asynchronous GPU halo-buffer copies to complete
           call acc_wait_haloswap_mnh()           
        end if 
#endif
      !
      ! End Comm
      !
    end subroutine haloswap_mnh_dim
      
    subroutine acc_wait_haloswap_mnh()
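      ! Wait on the OpenACC asynchronous queues (IS_NORTH, IS_SOUTH,
      ! IS_EAST, IS_WEST) used for the GPU halo copies; only the queues
      ! of neighbours that actually exist are waited on.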
      if (Gneighbour_s) then
         !$acc wait(IS_SOUTH)
      endif
      if (Gneighbour_n) then
         !$acc wait(IS_NORTH)
      endif
      if (Gneighbour_e) then
         !$acc wait(IS_EAST)
      endif
      if (Gneighbour_w) then
         !$acc wait(IS_WEST)
      endif
    end subroutine acc_wait_haloswap_mnh
    
  end subroutine haloswap_mnh
!==================================================================
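!
!  Halo exchange for field a on multigrid level (level) and processor
!  level m: swap halos of width halo_size with the N/S/E/W neighbours
!  in the horizontal Cartesian communicator MPI_COMM_HORIZ.
!
!==================================================================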
  subroutine haloswap(level,m, &  ! multigrid- and processor- level
                      a)          ! data field
    implicit none
    integer, intent(in) :: level
    integer, intent(in) :: m
    type(scalar3d), intent(inout) :: a
    integer :: a_n  ! horizontal grid size
    integer :: nz   ! vertical grid size
    integer, dimension(2) :: p_horiz
    integer :: stepsize
    integer :: ierr, rank, sendtag, recvtag
    integer :: stat(MPI_STATUS_SIZE)
    integer :: halo_size
    integer :: neighbour_n_rank
    integer :: neighbour_s_rank
    integer :: neighbour_e_rank
    integer :: neighbour_w_rank
    integer :: yoffset, blocklength
    integer, dimension(4) :: requests_ns
    integer, dimension(4) :: requests_ew

    halo_size = comm_param%halo_size

    ! Do nothing if we are only using one processor
    if (m > 0) then
      if (comm_measuretime) then
        call start_timer(t_haloswap(level,m))
      end if
      a_n = a%ix_max-a%ix_min+1
      nz = a%grid_param%nz
      stepsize = 2**(pproc-m)
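      ! At processor level m only every stepsize-th process (in each
      ! direction of the process grid) holds data, so neighbours lie
      ! stepsize positions away in the Cartesian grid.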

      ! Work out rank, only execute on relevant processes
      call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr)
      call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr)

      ! Work out ranks of neighbours
      ! W -> E
      call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, &
                          neighbour_w_rank,neighbour_e_rank,ierr)
      ! N -> S
      call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, &
                          neighbour_n_rank,neighbour_s_rank,ierr)
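      ! Only processes whose grid coordinates are divisible by stepsize
      ! are active at this processor level and take part in the exchange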
      if ( (stepsize == 1) .or.                   &
         (  (mod(p_horiz(1),stepsize) == 0) .and. &
            (mod(p_horiz(2),stepsize) == 0) ) ) then
        if (halo_size == 1) then
          ! Do not include corners in send/recv
          yoffset = 1
          blocklength = a_n*(nz+2)*halo_size
        else
          yoffset = 1-halo_size
          blocklength = (a_n+2*halo_size)*(nz+2)*halo_size
        end if
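        ! For halo_size > 1 the E-W messages span the full y-extent,
        ! including the corner rows, so the N-S exchange is completed
        ! first (waitall below); for halo_size == 1 the corners are
        ! excluded and the N-S wait is deferred to the end.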
        ! Receive from north
        recvtag = 2
        call mpi_irecv(a%s(0,0-(halo_size-1),1),1,                  &
                       halo_ns(level,m),neighbour_n_rank,recvtag,   &
                       MPI_COMM_HORIZ, requests_ns(1), ierr)
        ! Receive from south
        recvtag = 3
        call mpi_irecv(a%s(0,a_n+1,1),1,                            &
                       halo_ns(level,m),neighbour_s_rank,recvtag,   &
                       MPI_COMM_HORIZ, requests_ns(2), ierr)
        ! Send to south
        sendtag = 2
        call mpi_isend(a%s(0,a_n-(halo_size-1),1),1,                &
                       halo_ns(level,m),neighbour_s_rank,sendtag,   &
                       MPI_COMM_HORIZ, requests_ns(3), ierr)
        ! Send to north
        sendtag = 3
        call mpi_isend(a%s(0,1,1),1,                                &
                       halo_ns(level,m),neighbour_n_rank,sendtag,   &
                       MPI_COMM_HORIZ, requests_ns(4), ierr)
        if (halo_size > 1) then
          ! Wait for North <-> South communication to complete
          call mpi_waitall(4,requests_ns, MPI_STATUSES_IGNORE, ierr)
        end if
        ! Receive from west
        recvtag = 0
        call mpi_irecv(a%s(0,yoffset,0-(halo_size-1)),blocklength,    &
                       MPI_DOUBLE_PRECISION,neighbour_w_rank,recvtag, &
                       MPI_COMM_HORIZ, requests_ew(1), ierr)
        ! Receive from east
        recvtag = 1
        call mpi_irecv(a%s(0,yoffset,a_n+1),blocklength,          &
                       MPI_DOUBLE_PRECISION,neighbour_e_rank,recvtag, &
                       MPI_COMM_HORIZ, requests_ew(2), ierr)
        ! Send to east
        sendtag = 0
        call mpi_isend(a%s(0,yoffset,a_n-(halo_size-1)),blocklength,  &
                       MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, &
                       MPI_COMM_HORIZ, requests_ew(3), ierr)
        ! Send to west
        sendtag = 1
        call mpi_isend(a%s(0,yoffset,1),blocklength,                &
                       MPI_DOUBLE_PRECISION,neighbour_w_rank,sendtag,   &
                       MPI_COMM_HORIZ, requests_ew(4), ierr)
        ! Wait for East <-> West communication to complete
        if (halo_size == 1) then
          ! Wait for North <-> South communication to complete
          call mpi_waitall(4,requests_ns, MPI_STATUSES_IGNORE, ierr)
        end if
        call mpi_waitall(4,requests_ew, MPI_STATUSES_IGNORE, ierr)
      end if
      if (comm_measuretime) then
        call finish_timer(t_haloswap(level,m))
      end if
    end if
  end subroutine haloswap
!==================================================================
!  Collect from a(level,m) and store on fewer processors
!  in b(level,m-1)
!
!  Example for pproc-m = 1, i.e. stepsize = 2:
!
!   NW (0,0)  <--  NE (0,2)
!
!     ^      .
!     !        .
!                .
!   SW (2,0)       SE (2,2) [send to 0,0]
!
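!  On the NW process the coarse field b is assembled from four
!  quadrants (extents as selected by the sub_interior(level,m-1)
!  datatype; here b_n = 2*a_n), indexed as b(z, y, x):
!
!    b(:, 1:b_n/2    , 1:b_n/2    )  <-  local copy of a   (NW)
!    b(:, 1:b_n/2    , b_n/2+1:b_n)  <-  received from NE
!    b(:, b_n/2+1:b_n, 1:b_n/2    )  <-  received from SW
!    b(:, b_n/2+1:b_n, b_n/2+1:b_n)  <-  received from SE
!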
!==================================================================
  subroutine collect(level,m, &    ! multigrid and processor level
                     a, &          ! IN: data on level (level,m)
                     b)            ! OUT: data on level (level,m-1)
    implicit none
    integer, intent(in) :: level
    integer, intent(in) :: m
    type(scalar3d), intent(in) :: a
    type(scalar3d), intent(inout) :: b
    integer :: a_n, b_n   ! horizontal grid sizes
    integer :: nz ! vertical grid size
    integer, dimension(2) :: p_horiz
    integer :: stepsize
    integer :: ierr, source_rank, dest_rank, rank, recv_tag, send_tag, iz
    logical :: corner_nw, corner_ne, corner_sw, corner_se
    integer :: recv_request(3)
    integer :: recv_requestT(3)

    integer :: ii,ij,ik

    real , pointer , contiguous , dimension(:,:,:) :: za_st,zb_st

    real , pointer , contiguous , dimension(:,:,:) :: ztab_interiorT_ne_m_haloTin
    real , pointer , contiguous , dimension(:,:,:) :: ztab_interiorT_sw_m_haloTin
    real , pointer , contiguous , dimension(:,:,:) :: ztab_interiorT_se_m_haloTin

    real , pointer , contiguous , dimension(:,:,:) :: ztab_sub_interiorT_ne_m_1_haloTout
    real , pointer , contiguous , dimension(:,:,:) :: ztab_sub_interiorT_sw_m_1_haloTout
    real , pointer , contiguous , dimension(:,:,:) :: ztab_sub_interiorT_se_m_1_haloTout

    call start_timer(t_collect(m))

    stepsize = 2**(pproc-m)

    a_n = a%ix_max-a%ix_min+1
    b_n = b%ix_max-b%ix_min+1
    nz = b%grid_param%nz

    ! Work out rank, only execute on relevant processes
    call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr)
    ! Store position in the process grid in p_horiz
    ! Note we can NOT use cart_shift as we need diagonal neighbours as well
    call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr)

    ! Ignore all processes that do not participate at this level
    if ( (stepsize .eq. 1) .or. ((mod(p_horiz(1),stepsize) == 0) .and. (mod(p_horiz(2),stepsize) == 0)) ) then
      ! Determine position in local 2x2 block
      if (stepsize .eq. 1) then
        corner_nw = ((mod(p_horiz(1),2) == 0) .and. (mod(p_horiz(2),2) == 0))
        corner_ne = ((mod(p_horiz(1),2) == 0) .and. (mod(p_horiz(2),2) == 1))
        corner_sw = ((mod(p_horiz(1),2) == 1) .and. (mod(p_horiz(2),2) == 0))
        corner_se = ((mod(p_horiz(1),2) == 1) .and. (mod(p_horiz(2),2) == 1))
      else
        corner_nw = ((mod(p_horiz(1)/stepsize,2) == 0) .and. (mod(p_horiz(2)/stepsize,2) == 0))
        corner_ne = ((mod(p_horiz(1)/stepsize,2) == 0) .and. (mod(p_horiz(2)/stepsize,2) == 1))
        corner_sw = ((mod(p_horiz(1)/stepsize,2) == 1) .and. (mod(p_horiz(2)/stepsize,2) == 0))
        corner_se = ((mod(p_horiz(1)/stepsize,2) == 1) .and. (mod(p_horiz(2)/stepsize,2) == 1))
      end if
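      ! Each 2x2 block of active processes merges its data onto its
      ! NW member, which remains active at processor level m-1.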
      ! NW receives from the other three processes
      if ( corner_nw ) then
        ! Receive from NE
        call mpi_cart_rank(MPI_COMM_HORIZ, &
                           (/p_horiz(1),p_horiz(2)+stepsize/), &
                           source_rank, &
                           ierr)
        recv_tag = 1000
        if (LUseO) call mpi_irecv(b%s(0,1,b_n/2+1),1,sub_interior(level,m-1),source_rank, recv_tag, MPI_COMM_HORIZ, &
                           recv_request(1),ierr)
        recv_tag = 1010
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           ztab_sub_interiorT_ne_m_1_haloTout => tab_sub_interiorT_ne(level,m-1)%haloTout
           !$acc host_data use_device(ztab_sub_interiorT_ne_m_1_haloTout)
           call mpi_irecv(ztab_sub_interiorT_ne_m_1_haloTout,size(ztab_sub_interiorT_ne_m_1_haloTout), &
                MPI_DOUBLE_PRECISION,source_rank, recv_tag, MPI_COMM_HORIZ, &
                recv_requestT(1),ierr)
           !$acc end host_data
#else
           call mpi_irecv(b%st(b_n/2+1,1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
                recv_requestT(1),ierr)
#endif
        end if
#ifndef NDEBUG
  if (ierr .ne. 0) &
    call fatalerror("Collect: receive from NE failed in mpi_irecv().")
#endif
         ! Receive from SW
        call mpi_cart_rank(MPI_COMM_HORIZ, &
                           (/p_horiz(1)+stepsize,p_horiz(2)/), &
                           source_rank, &
                           ierr)
        recv_tag = 1001
        if (LUseO) call mpi_irecv(b%s(0,b_n/2+1,1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
                           recv_request(2),ierr)
        recv_tag = 1011
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           ztab_sub_interiorT_sw_m_1_haloTout => tab_sub_interiorT_sw(level,m-1)%haloTout
           !$acc host_data use_device(ztab_sub_interiorT_sw_m_1_haloTout)
           call mpi_irecv(ztab_sub_interiorT_sw_m_1_haloTout,size(ztab_sub_interiorT_sw_m_1_haloTout), &
                MPI_DOUBLE_PRECISION,source_rank, recv_tag, MPI_COMM_HORIZ, &
                recv_requestT(2),ierr)
           !$acc end host_data
#else
           call mpi_irecv(b%st(1,b_n/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
                recv_requestT(2),ierr)         
#endif           
        endif

#ifndef NDEBUG
  if (ierr .ne. 0) &
    call fatalerror("Collect: receive from SW failed in mpi_irecv().")
#endif
        ! Receive from SE
        call mpi_cart_rank(MPI_COMM_HORIZ, &
                           (/p_horiz(1)+stepsize,p_horiz(2)+stepsize/), &
                           source_rank, &
                           ierr)
        recv_tag = 1002
        if (LUseO) call mpi_irecv(b%s(0,b_n/2+1,b_n/2+1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
                           recv_request(3),ierr)
        recv_tag = 1012
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           ztab_sub_interiorT_se_m_1_haloTout => tab_sub_interiorT_se(level,m-1)%haloTout
           !$acc host_data use_device(ztab_sub_interiorT_se_m_1_haloTout)
           call mpi_irecv(ztab_sub_interiorT_se_m_1_haloTout,size(ztab_sub_interiorT_se_m_1_haloTout), &
                MPI_DOUBLE_PRECISION,source_rank, recv_tag, MPI_COMM_HORIZ, &
                recv_requestT(3),ierr)
           !$acc end host_data
#else
           call mpi_irecv(b%st(b_n/2+1,b_n/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
                recv_requestT(3),ierr)           
#endif
        end if
#ifndef NDEBUG
  if (ierr .ne. 0) &
    call fatalerror("Collect: receive from SE failed in mpi_irecv().")
#endif
        ! Copy local data while waiting for data from other processes
        if (LUseO) b%s(0:nz+1,1:a_n,1:a_n) = a%s(0:nz+1,1:a_n,1:a_n)
        if (LUseT) then
#ifdef MNH_GPUDIRECT           
           zb_st => b%st
           za_st => a%st
           !$acc kernels present_cr(zb_st,za_st)
           !$mnh_do_concurrent(ii=1:a_n,ij=1:a_n,ik=1:nz+2)
              zb_st(ii,ij,ik-1) = za_st(ii,ij,ik-1)
           !$mnh_end_do()
           !$acc end kernels
#else           
           b%st(1:a_n,1:a_n,0:nz+1) = a%st(1:a_n,1:a_n,0:nz+1)
#endif
        end if
        ! Wait for receives to complete before proceeding
        if (LUseO) call mpi_waitall(3,recv_request,MPI_STATUSES_IGNORE,ierr)
        if (LUseT) call mpi_waitall(3,recv_requestT,MPI_STATUSES_IGNORE,ierr)
#ifdef MNH_GPUDIRECT
        if (LUseT) then
           zb_st => b%st
           ! copy from buffer for GPU DIRECT
           ! Receive from NE
           !$acc kernels present_cr(zb_st,ztab_sub_interiorT_ne_m_1_haloTout)
           !$mnh_do_concurrent(ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2)
              zb_st(ii+b_n/2,ij,ik-1) = ztab_sub_interiorT_ne_m_1_haloTout(ii,ij,ik)
           !$mnh_end_do()
           !$acc end kernels           
           ! Receive from SW
           !$acc kernels present_cr(zb_st,ztab_sub_interiorT_sw_m_1_haloTout)
           !$mnh_do_concurrent(ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2)
              zb_st(ii,ij+b_n/2,ik-1) = ztab_sub_interiorT_sw_m_1_haloTout(ii,ij,ik)
           !$mnh_end_do()
           !$acc end kernels
           ! Receive from SE
           !$acc kernels present_cr(zb_st,ztab_sub_interiorT_se_m_1_haloTout)
           !$mnh_do_concurrent(ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2)
               zb_st(ii+b_n/2,ij+b_n/2,ik-1) = ztab_sub_interiorT_se_m_1_haloTout(ii,ij,ik)
           !$mnh_end_do()
           !$acc end kernels
        end if
#endif
      end if
      if ( corner_ne ) then
        ! Send to NW
        call mpi_cart_rank(MPI_COMM_HORIZ, &
                           (/p_horiz(1),p_horiz(2)-stepsize/), &
                           dest_rank, &
                           ierr)

        za_st => a%st
        
        send_tag = 1000
        if (LUseO) call mpi_send(a%s(0,1,1),1,interior(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
        send_tag = 1010
        if (LUseT) then
#ifdef MNH_GPUDIRECT
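           ! Pack the interior of the transposed field into a contiguous
           ! device buffer so mpi_send can transfer it directly from GPU memory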
           ztab_interiorT_ne_m_haloTin => tab_interiorT_ne(level,m)%haloTin
           !$acc kernels present_cr(ztab_interiorT_ne_m_haloTin,za_st)
           !$mnh_do_concurrent(ii=1:a_n,ij=1:a_n,ik=1:nz+2)
              ztab_interiorT_ne_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1)
           !$mnh_end_do()
           !$acc end kernels           
           !$acc host_data use_device(ztab_interiorT_ne_m_haloTin)
           call mpi_send(ztab_interiorT_ne_m_haloTin,size(ztab_interiorT_ne_m_haloTin), &
                MPI_DOUBLE_PRECISION,dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
           !$acc end host_data
#else
           call mpi_send(a%st(1,1,0),1,interiorT(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
#endif
        end if
#ifndef NDEBUG
  if (ierr .ne. 0) &
    call fatalerror("Collect: send from NE failed in mpi_send().")
#endif
      end if
      if ( corner_sw ) then
        ! Send to NW
        call mpi_cart_rank(MPI_COMM_HORIZ, &
                           (/p_horiz(1)-stepsize,p_horiz(2)/), &
                           dest_rank, &
                           ierr)
        
        za_st => a%st
        
        send_tag = 1001
        if (LUseO) call mpi_send(a%s(0,1,1),1,interior(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
        send_tag = 1011
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           ztab_interiorT_sw_m_haloTin => tab_interiorT_sw(level,m)%haloTin
           !$acc kernels present_cr(ztab_interiorT_sw_m_haloTin,za_st)
           !$mnh_do_concurrent(ii=1:a_n,ij=1:a_n,ik=1:nz+2)
              ztab_interiorT_sw_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1)
           !$mnh_end_do()
           !$acc end kernels           
           !$acc host_data use_device(ztab_interiorT_sw_m_haloTin)
           call mpi_send(ztab_interiorT_sw_m_haloTin,size(ztab_interiorT_sw_m_haloTin), &
                MPI_DOUBLE_PRECISION,dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
           !$acc end host_data
#else
           call mpi_send(a%st(1,1,0),1,interiorT(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)        
#endif
        end if
#ifndef NDEBUG
  if (ierr .ne. 0) &
    call fatalerror("Collect: send from SW failed in mpi_send().")
#endif
      end if
      if ( corner_se ) then
        ! Send to NW
        call mpi_cart_rank(MPI_COMM_HORIZ, &
                           (/p_horiz(1)-stepsize,p_horiz(2)-stepsize/), &
                           dest_rank, &
                           ierr)

        za_st => a%st
        
        send_tag = 1002
        if (LUseO) call mpi_send(a%s(0,1,1),1,interior(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
        send_tag = 1012
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           ztab_interiorT_se_m_haloTin => tab_interiorT_se(level,m)%haloTin
           !$acc kernels present_cr(ztab_interiorT_se_m_haloTin,za_st)
           !$mnh_do_concurrent(ii=1:a_n,ij=1:a_n,ik=1:nz+2)
              ztab_interiorT_se_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1)
           !$mnh_end_do()
           !$acc end kernels
           !$acc host_data use_device(ztab_interiorT_se_m_haloTin)
           call mpi_send(ztab_interiorT_se_m_haloTin,size(ztab_interiorT_se_m_haloTin), &
                MPI_DOUBLE_PRECISION,dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
           !$acc end host_data
#else
           call mpi_send(a%st(1,1,0),1,interiorT(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
#endif
        end if
#ifndef NDEBUG
  if (ierr .ne. 0) &
    call fatalerror("Collect: send from SE failed in mpi_send().")
#endif
      end if

    end if
    call finish_timer(t_collect(m))

  end subroutine collect

!==================================================================
!  Distribute data in a(level,m-1) and store in b(level,m)
!
!  Example for pproc-m = 1, i.e. stepsize = 2:
!
!   NW (0,0)  -->  NE (0,2)
!
!     !      .
!     v        .
!                .
!   SW (2,0)       SE (2,2) [receive from 0,0]
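!
!  On the NW process the quadrants of a (extents as selected by the
!  sub_interior(level,m-1) datatype; here b_n = a_n/2), indexed as
!  a(z, y, x), are scattered as:
!
!    a(:, 1:a_n/2    , 1:a_n/2    )  ->  copied locally into b  (NW)
!    a(:, 1:a_n/2    , a_n/2+1:a_n)  ->  sent to NE
!    a(:, a_n/2+1:a_n, 1:a_n/2    )  ->  sent to SW
!    a(:, a_n/2+1:a_n, a_n/2+1:a_n)  ->  sent to SE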
!==================================================================
  subroutine distribute(level,m, &  ! multigrid and processor level
                        a,       &  ! IN: Data on level (level,m-1)
                        b)          ! OUT: Data on level (level,m)
    implicit none
    integer, intent(in) :: level
    integer, intent(in) :: m
    type(scalar3d), intent(in) :: a
    type(scalar3d), intent(inout) :: b
    integer :: a_n, b_n   ! horizontal grid sizes
    integer :: nz ! vertical grid size
    integer, dimension(2) :: p_horiz
    integer :: stepsize
    integer :: ierr, source_rank, dest_rank, send_tag, recv_tag, rank, iz
    integer :: stat(MPI_STATUS_SIZE)
    integer :: send_request(3)
    integer :: send_requestT(3)
    logical :: corner_nw, corner_ne, corner_sw, corner_se

    integer :: ii,ij,ik

    real , pointer , contiguous , dimension(:,:,:) :: za_st,zb_st

    real , pointer , contiguous , dimension(:,:,:) :: ztab_sub_interiorT_ne_m_1_haloTin
    real , pointer , contiguous , dimension(:,:,:) :: ztab_sub_interiorT_sw_m_1_haloTin
    real , pointer , contiguous , dimension(:,:,:) :: ztab_sub_interiorT_se_m_1_haloTin

    real , pointer , contiguous , dimension(:,:,:) :: ztab_interiorT_ne_m_haloTout
    real , pointer , contiguous , dimension(:,:,:) :: ztab_interiorT_sw_m_haloTout
    real , pointer , contiguous , dimension(:,:,:) :: ztab_interiorT_se_m_haloTout

    call start_timer(t_distribute(m))

    stepsize = 2**(pproc-m)

    a_n = a%ix_max-a%ix_min+1
    b_n = b%ix_max-b%ix_min+1
    nz = a%grid_param%nz

    ! Work out rank, only execute on relevant processes
    call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr)
    call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr)

    ! Ignore all processes that do not participate at this level
    if ( (stepsize .eq. 1) .or. ((mod(p_horiz(1),stepsize) == 0) .and. (mod(p_horiz(2),stepsize) == 0)) ) then
      ! Work out coordinates in local 2 x 2 block
      if (stepsize .eq. 1) then
        corner_nw = ((mod(p_horiz(1),2) == 0) .and. (mod(p_horiz(2),2) == 0))
        corner_ne = ((mod(p_horiz(1),2) == 0) .and. (mod(p_horiz(2),2) == 1))
        corner_sw = ((mod(p_horiz(1),2) == 1) .and. (mod(p_horiz(2),2) == 0))
        corner_se = ((mod(p_horiz(1),2) == 1) .and. (mod(p_horiz(2),2) == 1))
      else
        corner_nw = ((mod(p_horiz(1)/stepsize,2) == 0) .and. (mod(p_horiz(2)/stepsize,2) == 0))
        corner_ne = ((mod(p_horiz(1)/stepsize,2) == 0) .and. (mod(p_horiz(2)/stepsize,2) == 1))
        corner_sw = ((mod(p_horiz(1)/stepsize,2) == 1) .and. (mod(p_horiz(2)/stepsize,2) == 0))
        corner_se = ((mod(p_horiz(1)/stepsize,2) == 1) .and. (mod(p_horiz(2)/stepsize,2) == 1))
      end if
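      ! The NW member of each 2x2 block holds the coarse-level data and
      ! scatters one quadrant to each of the other three processes, which
      ! become active again at processor level m.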
      if ( corner_nw ) then
        ! (Asynchronous) send to NE
        call mpi_cart_rank(MPI_COMM_HORIZ, &
                           (/p_horiz(1),p_horiz(2)+stepsize/), &
                           dest_rank, &
                           ierr)
        
        za_st => a%st
        
        send_tag = 1000
        if (LUseO) call mpi_isend(a%s(0,1,a_n/2+1), 1,sub_interior(level,m-1),dest_rank, send_tag, &
                       MPI_COMM_HORIZ,send_request(1),ierr)
        send_tag = 1010
        if (LUseT) then
#ifdef MNH_GPUDIRECT
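           ! Stage the NE quadrant of the transposed field in a contiguous
           ! device buffer and post the isend directly from GPU memory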
           ztab_sub_interiorT_ne_m_1_haloTin => tab_sub_interiorT_ne(level,m-1)%haloTin
           !$acc kernels present_cr(ztab_sub_interiorT_ne_m_1_haloTin,za_st)
           !$mnh_do_concurrent(ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2)
              ztab_sub_interiorT_ne_m_1_haloTin(ii,ij,ik) = za_st(ii+a_n/2,ij,ik-1)
           !$mnh_end_do()
           !$acc end kernels           
           !$acc host_data use_device(ztab_sub_interiorT_ne_m_1_haloTin)
           call mpi_isend(ztab_sub_interiorT_ne_m_1_haloTin,size(ztab_sub_interiorT_ne_m_1_haloTin), &
                MPI_DOUBLE_PRECISION,dest_rank, send_tag, &
                MPI_COMM_HORIZ,send_requestT(1),ierr)
           !$acc end host_data
#else
           call mpi_isend(a%st(a_n/2+1,1,0), 1,sub_interiorT(level,m-1),dest_rank, send_tag, &
                MPI_COMM_HORIZ,send_requestT(1),ierr)          
#endif
        end if
#ifndef NDEBUG
  if (ierr .ne. 0) &
    call fatalerror("Distribute: send to NE failed in mpi_isend().")
#endif
        ! (Asynchronous) send to SW
        call mpi_cart_rank(MPI_COMM_HORIZ, &
                           (/p_horiz(1)+stepsize,p_horiz(2)/), &
                           dest_rank, &
                           ierr)
        send_tag = 1001
        if (LUseO) call mpi_isend(a%s(0,a_n/2+1,1),1,sub_interior(level,m-1), dest_rank, send_tag, &
                       MPI_COMM_HORIZ, send_request(2), ierr)
        send_tag = 1011
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           ztab_sub_interiorT_sw_m_1_haloTin => tab_sub_interiorT_sw(level,m-1)%haloTin
           !$acc kernels present_cr(ztab_sub_interiorT_sw_m_1_haloTin,za_st)
           !$mnh_do_concurrent(ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2)
              ztab_sub_interiorT_sw_m_1_haloTin(ii,ij,ik) = za_st(ii,ij+a_n/2,ik-1)
           !$mnh_end_do()
           !$acc end kernels           
           !$acc host_data use_device(ztab_sub_interiorT_sw_m_1_haloTin)
           call mpi_isend(ztab_sub_interiorT_sw_m_1_haloTin,size(ztab_sub_interiorT_sw_m_1_haloTin), &
                MPI_DOUBLE_PRECISION, dest_rank, send_tag, &
                MPI_COMM_HORIZ, send_requestT(2), ierr)
           !$acc end host_data
#else
           call mpi_isend(a%st(1,a_n/2+1,0),1,sub_interiorT(level,m-1), dest_rank, send_tag, &
                MPI_COMM_HORIZ, send_requestT(2), ierr)           
#endif
        end if

#ifndef NDEBUG
  if (ierr .ne. 0) &
    call fatalerror("Distribute: send to SW failed in mpi_isend().")
#endif
        ! (Asynchronous) send to SE
        call mpi_cart_rank(MPI_COMM_HORIZ, &
                           (/p_horiz(1)+stepsize,p_horiz(2)+stepsize/), &
                           dest_rank, &
                           ierr)
        send_tag = 1002
        if (LUseO) call mpi_isend(a%s(0,a_n/2+1,a_n/2+1),1,sub_interior(level,m-1), dest_rank, send_tag, &
                      MPI_COMM_HORIZ, send_request(3), ierr)
        send_tag = 1012
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           ztab_sub_interiorT_se_m_1_haloTin => tab_sub_interiorT_se(level,m-1)%haloTin
           !$acc kernels present_cr(ztab_sub_interiorT_se_m_1_haloTin,za_st)
           !$mnh_do_concurrent(ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2)
              ztab_sub_interiorT_se_m_1_haloTin(ii,ij,ik) = za_st(ii+a_n/2,ij+a_n/2,ik-1)
           !$mnh_end_do()
           !$acc end kernels           
           !$acc host_data use_device(ztab_sub_interiorT_se_m_1_haloTin)
           call mpi_isend(ztab_sub_interiorT_se_m_1_haloTin,size(ztab_sub_interiorT_se_m_1_haloTin), &
                MPI_DOUBLE_PRECISION, dest_rank, send_tag, &
                MPI_COMM_HORIZ, send_requestT(3), ierr)
           !$acc end host_data
#else
           call mpi_isend(a%st(a_n/2+1,a_n/2+1,0),1,sub_interiorT(level,m-1), dest_rank, send_tag, &
                MPI_COMM_HORIZ, send_requestT(3), ierr)          
#endif
        end if
#ifndef NDEBUG
  if (ierr .ne. 0) &
    call fatalerror("Distribute: send to SE failed in mpi_isend().")
#endif
        ! While sending, copy local data
        if (LUseO) b%s(0:nz+1,1:b_n,1:b_n) = a%s(0:nz+1,1:b_n,1:b_n)
        if (LUseT) then
#ifdef MNH_GPUDIRECT                     
           zb_st => b%st
           za_st => a%st
           !$acc kernels present_cr(zb_st,za_st)
           !$mnh_do_concurrent(ii=1:b_n,ij=1:b_n,ik=1:nz+2)
              zb_st(ii,ij,ik-1) = za_st(ii,ij,ik-1)
           !$mnh_end_do()
           !$acc end kernels
#else
           b%st(1:b_n,1:b_n,0:nz+1) = a%st(1:b_n,1:b_n,0:nz+1)
#endif
        end if
        ! Wait for the asynchronous sends to complete before proceeding
        if (LUseO) call mpi_waitall(3, send_request, MPI_STATUSES_IGNORE, ierr)
        if (LUseT) call mpi_waitall(3, send_requestT, MPI_STATUSES_IGNORE, ierr)
      end if
      if ( corner_ne ) then

        ! Receive from NW
        call mpi_cart_rank(MPI_COMM_HORIZ, &
                           (/p_horiz(1),p_horiz(2)-stepsize/), &
                           source_rank, &
                           ierr)
        
        zb_st => b%st
        
        recv_tag = 1000
        if (LUseO) call mpi_recv(b%s(0,1,1),1,interior(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)
        recv_tag = 1010
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           ztab_interiorT_ne_m_haloTout => tab_interiorT_ne(level,m)%haloTout
           !$acc host_data use_device(ztab_interiorT_ne_m_haloTout)
           call mpi_recv(ztab_interiorT_ne_m_haloTout,size(ztab_interiorT_ne_m_haloTout), &
                MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)
           !$acc end host_data
           !$acc kernels present_cr(zb_st,ztab_interiorT_ne_m_haloTout)
           !$mnh_do_concurrent(ii=1:b_n,ij=1:b_n,ik=1:nz+2)
              zb_st(ii,ij,ik-1) = ztab_interiorT_ne_m_haloTout(ii,ij,ik)
           !$mnh_end_do()
           !$acc end kernels           
#else
           call mpi_recv(b%st(1,1,0),1,interiorT(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)           
#endif
        end if
#ifndef NDEBUG
  if (ierr .ne. 0) &
    call fatalerror("Distribute: receive on NE failed in mpi_recv().")
#endif
      end if
      if ( corner_sw ) then
        ! Receive from NW
        call mpi_cart_rank(MPI_COMM_HORIZ, &
                           (/p_horiz(1)-stepsize,p_horiz(2)/), &
                           source_rank, &
                           ierr)

        zb_st => b%st
        
        recv_tag = 1001
        if (LUseO) call mpi_recv(b%s(0,1,1),1,interior(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)
        recv_tag = 1011
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           ztab_interiorT_sw_m_haloTout => tab_interiorT_sw(level,m)%haloTout
           !$acc host_data use_device(ztab_interiorT_sw_m_haloTout)
           call mpi_recv(ztab_interiorT_sw_m_haloTout,size(ztab_interiorT_sw_m_haloTout), &           
                MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)
           !$acc end host_data
           !$acc kernels present_cr(zb_st,ztab_interiorT_sw_m_haloTout)
           !$mnh_do_concurrent(ii=1:b_n,ij=1:b_n,ik=1:nz+2)
              zb_st(ii,ij,ik-1) = ztab_interiorT_sw_m_haloTout(ii,ij,ik)
           !$mnh_end_do()
           !$acc end kernels
#else
           call mpi_recv(b%st(1,1,0),1,interiorT(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)           
#endif
        end if
#ifndef NDEBUG
  if (ierr .ne. 0) &
    call fatalerror("Distribute: receive on SW failed in mpi_recv().")
#endif
      end if
      if ( corner_se ) then
        ! Receive from NW
        call mpi_cart_rank(MPI_COMM_HORIZ, &
                           (/p_horiz(1)-stepsize,p_horiz(2)-stepsize/), &
                           source_rank, &
                           ierr)
        zb_st => b%st
        
        recv_tag = 1002
        if (LUseO) call mpi_recv(b%s(0,1,1),1,interior(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)
        recv_tag = 1012
        if (LUseT) then
#ifdef MNH_GPUDIRECT
           ztab_interiorT_se_m_haloTout => tab_interiorT_se(level,m)%haloTout
           !$acc host_data use_device(ztab_interiorT_se_m_haloTout)
           call mpi_recv(ztab_interiorT_se_m_haloTout,size(ztab_interiorT_se_m_haloTout), &
                MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)
           !$acc end host_data
           !$acc kernels present_cr(zb_st,ztab_interiorT_se_m_haloTout)
           !$mnh_do_concurrent(ii=1:b_n,ij=1:b_n,ik=1:nz+2)
              zb_st(ii,ij,ik-1) = ztab_interiorT_se_m_haloTout(ii,ij,ik)
           !$mnh_end_do()
           !$acc end kernels
#else
           call mpi_recv(b%st(1,1,0),1,interiorT(level,m),source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)           
#endif
        end if
#ifndef NDEBUG
  if (ierr .ne. 0) &
    call fatalerror("Distribute: receive on NW failed in mpi_recv().")
#endif
      end if

    end if
    call finish_timer(t_distribute(m))

  end subroutine distribute

end module communication