diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90
index 77f0f54eea9d6a6d4963233f27d48f212c42af38..b7ac6348d477ebc9952d9ea235c82414e1856b5a 100644
--- a/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90
+++ b/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90
@@ -64,15 +64,15 @@ public::collect
 public::distribute
 public::i_am_master_mpi
 public::master_rank
-public::pproc
+public::pproc,pprocx,pprocy
 public::MPI_COMM_HORIZ
 public::comm_parameters
 public::comm_measuretime
 
 
   ! Number of processors
-  ! n_proc = 2^(2*pproc), with integer pproc
-  integer :: pproc
+  ! n_proc = 2^pproc = 2^pprocx * 2^pprocy (previously 2^(2*pproc)), with integers pproc, pprocx, pprocy
+  integer :: pproc,pprocx,pprocy
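+  ! e.g. nproc = 8 gives pproc = 3, pprocx = 1, pprocy = 2, i.e. a 2 x 4 (x by y) process grid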
 
 ! Rank of master process
   integer, parameter :: master_rank = 0
@@ -216,14 +216,14 @@ contains
     i_am_master_mpi = (rank == master_rank)
     ! Check that nproc = 2^(2*p)
     !pproc = floor(log(1.0d0*nproc)/log(4.0d0))
-    pproc = nint(log(1.0d0*nproc)/log(4.0d0))
-    if ( (nproc - 4**pproc) .ne. 0) then
-      print*,"Number of processors has to be 2^(2*pproc) with integer nproc,pproc=",nproc,pproc
-      call fatalerror("Number of processors has to be 2^(2*pproc) with integer pproc.")
+    pproc = nint(log(1.0d0*nproc)/log(2.0d0))
+    if ( (nproc - 2**pproc) .ne. 0) then
+      print*,"Number of processors has to be 2^(1*pproc) with integer nproc,pproc=",nproc,pproc
+      call fatalerror("Number of processors has to be 2^(1*pproc) with integer pproc.")
     end if
     if (i_am_master_mpi) then
       write(STDOUT,'("PARALLEL RUN")')
-      write(STDOUT,'("Number of processors : 2^(2*pproc) = ",I10," with pproc = ",I6)') &
+      write(STDOUT,'("Number of processors : 2^(1*pproc) = ",I10," with pproc = ",I6)') &
         nproc, pproc
     end if
     ! Create halo data types
@@ -256,18 +256,27 @@ contains
     integer,parameter    :: nb_dims=3
     integer,dimension(nb_dims) :: profil_tab,profil_sous_tab,coord_debut
 
+    integer :: nlocalx,nlocaly
+
     n = grid_param%n
     nz = grid_param%nz
 
     comm_param = comm_param_in
     halo_size = comm_param%halo_size
 
+#ifndef MPIVIDE
     call mpi_comm_size(MPI_COMM_WORLD, nproc, ierr)
-
+    !
+    !  Handle nproc = 2^pproc (rather than 2^(2*pproc)): split the exponent as
+    !  pproc = pprocx + pprocy, then reset pproc to pprocx for the m = pproc,...,0 loops below.
+    !
+    pprocx = pproc / 2
+    pprocy = pproc - pprocx
+    pproc = pprocx
+    if (i_am_master_mpi) write(STDOUT,*) "pproc=",pproc," pprocx=",pprocx," pprocy=",pprocy
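+    ! Note: the cartesian topology below uses extents (/2**pprocy,2**pprocx/), i.e. (y,x) ordering,
+    ! consistent with p_horiz storing (py,px).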
     ! Create cartesian topology
     call mpi_cart_create(MPI_COMM_WORLD,        & ! Old communicator name
                          dim_horiz,             & ! horizontal dimension
-                         (/2**pproc,2**pproc/), & ! extent in each horizontal direction
+                         (/2**pprocy,2**pprocx/), & ! extent in each horizontal direction
                          (/.false.,.false./),   & ! periodic?
                          .false.,               & ! reorder?
                          MPI_COMM_HORIZ,        & ! Name of new communicator
@@ -275,9 +284,11 @@ contains
    ! calculate and display rank and corrdinates in cartesian grid
     call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr)
     call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr)
-
+#endif
     ! Local size of (horizontal) grid
     nlocal = n/2**pproc
+    nlocaly = n/2**pprocy
+    nlocalx = n/2**pprocx
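+    ! nlocalx/nlocaly: local horizontal grid extents for the 2**pprocx x 2**pprocy decomposition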
 
     ! === Set up data types ===
     ! Halo for exchange in north-south direction
@@ -320,85 +331,101 @@ contains
       count = nlocal
       blocklength = (nz+2)*halo_size
       stride = (nlocal+2*halo_size)*(nz+2)
+#ifndef MPIVIDE
       call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION, &
                            halo_ns(level,m),ierr)
       call mpi_type_commit(halo_ns(level,m),ierr)
+#endif
       endif
       ! tranpose
       if (LUseT) then
       ! NS- (y-) transpose direction
       count =        nz+2                                        ! nlocal
-      blocklength =  nlocal*halo_size                            ! (nz+2)*halo_size
-      stride =       (nlocal+2*halo_size) * (nlocal+2*halo_size) ! (nlocal+2*halo_size)*(nz+2)
+      blocklength =  nlocalx*halo_size                            ! (nz+2)*halo_size
+      stride =       (nlocalx+2*halo_size) * (nlocaly+2*halo_size) ! (nlocal+2*halo_size)*(nz+2)
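+      ! halo_nst: for each of the nz+2 k-planes (count), one contiguous run of nlocalx*halo_size
+      ! values (blocklength), consecutive runs separated by a full padded (x,y) plane (stride)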
+#ifndef MPIVIDE
       call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION, &
                            halo_nst(level,m),ierr)
       call mpi_type_commit(halo_nst(level,m),ierr)
+#endif
       ! allocate send/recv buffer for GPU copy/managed
-      call mnh_allocate_mg_halo(tab_halo_nt(level,m)%haloTin,nlocal,halo_size,nz+2)
-      call mnh_allocate_mg_halo(tab_halo_nt(level,m)%haloTout,nlocal,halo_size,nz+2)
-      call mnh_allocate_mg_halo(tab_halo_st(level,m)%haloTin,nlocal,halo_size,nz+2)
-      call mnh_allocate_mg_halo(tab_halo_st(level,m)%haloTout,nlocal,halo_size,nz+2)      
+      call mnh_allocate_mg_halo(tab_halo_nt(level,m)%haloTin,nlocalx,halo_size,nz+2)
+      call mnh_allocate_mg_halo(tab_halo_nt(level,m)%haloTout,nlocalx,halo_size,nz+2)
+      call mnh_allocate_mg_halo(tab_halo_st(level,m)%haloTin,nlocalx,halo_size,nz+2)
+      call mnh_allocate_mg_halo(tab_halo_st(level,m)%haloTout,nlocalx,halo_size,nz+2)      
       
      ! WE- (x-) transpose direction
-      count =        (nz+2)*(nlocal+2*halo_size)*halo_size       ! nlocal
+      count =        (nz+2)*(nlocaly+2*halo_size)*halo_size       ! nlocal
       blocklength =  1*halo_size                                 ! (nz+2)*halo_size
-      stride =       nlocal+2*halo_size                          ! (nlocal+2*halo_size)*(nz+2)
+      stride =       nlocalx+2*halo_size                          ! (nlocal+2*halo_size)*(nz+2)
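+      ! halo_wet: halo_size-wide blocks (blocklength) repeated across the padded x-rows of every
+      ! k-plane (count), block starts separated by the padded x-row length (stride), i.e. a
+      ! west/east halo column of the transposed field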
+#ifndef MPIVIDE
       call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION, &
                            halo_wet(level,m),ierr)
       call mpi_type_commit(halo_wet(level,m),ierr)
+#endif
       ! allocate send/recv buffer for GPU copy/managed      !
-      call mnh_allocate_mg_halo(tab_halo_wt(level,m)%haloTin,halo_size,nlocal+2*halo_size,nz+2)
-      call mnh_allocate_mg_halo(tab_halo_wt(level,m)%haloTout,halo_size,nlocal+2*halo_size,nz+2)
-      call mnh_allocate_mg_halo(tab_halo_et(level,m)%haloTin,halo_size,nlocal+2*halo_size,nz+2)
-      call mnh_allocate_mg_halo(tab_halo_et(level,m)%haloTout,halo_size,nlocal+2*halo_size,nz+2)      
+      call mnh_allocate_mg_halo(tab_halo_wt(level,m)%haloTin,halo_size,nlocaly+2*halo_size,nz+2)
+      call mnh_allocate_mg_halo(tab_halo_wt(level,m)%haloTout,halo_size,nlocaly+2*halo_size,nz+2)
+      call mnh_allocate_mg_halo(tab_halo_et(level,m)%haloTin,halo_size,nlocaly+2*halo_size,nz+2)
+      call mnh_allocate_mg_halo(tab_halo_et(level,m)%haloTout,halo_size,nlocaly+2*halo_size,nz+2)      
       endif
 #ifndef NDEBUG
+#ifndef MPIVIDE
   if (ierr .ne. 0) &
     call fatalerror("Commit halo_ns failed in mpi_type_commit().")
+#endif
 #endif
       ! --- Create interior data types ---
       if (LUseO) then
          count = nlocal
          blocklength = nlocal*(nz+2)
          stride = (nz+2)*(nlocal+2*halo_size)
+#ifndef MPIVIDE
          call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION,interior(level,m),ierr)
          call mpi_type_commit(interior(level,m),ierr)
+#endif
          count = nlocal/2
          blocklength = nlocal/2*(nz+2)
          stride = (nlocal+2*halo_size)*(nz+2)
+#ifndef MPIVIDE
          call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION,sub_interior(level,m),ierr)
          call mpi_type_commit(sub_interior(level,m),ierr)
+#endif
       end if
       if (LUseT) then
          ! interiorT
-         if ( nlocal /= 0 ) then
-            profil_tab      = (/ nlocal+2*halo_size , nlocal+2*halo_size , nz+2 /)
-            profil_sous_tab = (/ nlocal             , nlocal             , nz+2 /)
+         if ( ( nlocalx /= 0 ) .and. ( nlocaly /= 0 ) ) then
+            profil_tab      = (/ nlocalx+2*halo_size , nlocaly+2*halo_size , nz+2 /)
+            profil_sous_tab = (/ nlocalx             , nlocaly             , nz+2 /)
             coord_debut     = (/ 0                  , 0                  , 0    /)
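+            ! interiorT: the nlocalx x nlocaly x (nz+2) interior block of the halo-padded transposed array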
+#ifndef MPIVIDE
             call MPI_TYPE_CREATE_SUBARRAY(nb_dims,profil_tab,profil_sous_tab,coord_debut,&
                  MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,interiorT(level,m),ierr)
             call mpi_type_commit(interiorT(level,m),ierr)
-            call mnh_allocate_mg_halo(tab_interiorT_ne(level,m)%haloTin,nlocal,nlocal,nz+2)
-            call mnh_allocate_mg_halo(tab_interiorT_ne(level,m)%haloTout,nlocal,nlocal,nz+2)
-            call mnh_allocate_mg_halo(tab_interiorT_sw(level,m)%haloTin,nlocal,nlocal,nz+2)
-            call mnh_allocate_mg_halo(tab_interiorT_sw(level,m)%haloTout,nlocal,nlocal,nz+2)
-            call mnh_allocate_mg_halo(tab_interiorT_se(level,m)%haloTin,nlocal,nlocal,nz+2)
-            call mnh_allocate_mg_halo(tab_interiorT_se(level,m)%haloTout,nlocal,nlocal,nz+2)
+#endif
+            call mnh_allocate_mg_halo(tab_interiorT_ne(level,m)%haloTin,nlocalx,nlocaly,nz+2)
+            call mnh_allocate_mg_halo(tab_interiorT_ne(level,m)%haloTout,nlocalx,nlocaly,nz+2)
+            call mnh_allocate_mg_halo(tab_interiorT_sw(level,m)%haloTin,nlocalx,nlocaly,nz+2)
+            call mnh_allocate_mg_halo(tab_interiorT_sw(level,m)%haloTout,nlocalx,nlocaly,nz+2)
+            call mnh_allocate_mg_halo(tab_interiorT_se(level,m)%haloTin,nlocalx,nlocaly,nz+2)
+            call mnh_allocate_mg_halo(tab_interiorT_se(level,m)%haloTout,nlocalx,nlocaly,nz+2)
          end if
          ! sub_interiorT
-         if ( (nlocal/2) /= 0 ) then
-            profil_tab      = (/ nlocal+2*halo_size , nlocal+2*halo_size , nz+2 /)
-            profil_sous_tab = (/ nlocal/2           , nlocal/2           , nz+2 /)
+         if ( ( (nlocalx/2) /= 0 ) .and. ( (nlocaly/2) /= 0 ) ) then
+            profil_tab      = (/ nlocalx+2*halo_size , nlocaly+2*halo_size , nz+2 /)
+            profil_sous_tab = (/ nlocalx/2           , nlocaly/2           , nz+2 /)
             coord_debut     = (/ 0                  , 0                  , 0    /)
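+            ! sub_interiorT: an nlocalx/2 x nlocaly/2 x (nz+2) quadrant, used by collect()/distribute()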
+#ifndef MPIVIDE
             call MPI_TYPE_CREATE_SUBARRAY(nb_dims,profil_tab,profil_sous_tab,coord_debut,&
                  MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,sub_interiorT(level,m),ierr)
             call mpi_type_commit(sub_interiorT(level,m),ierr)
-            call mnh_allocate_mg_halo(tab_sub_interiorT_ne(level,m)%haloTin,nlocal/2,nlocal/2,nz+2)
-            call mnh_allocate_mg_halo(tab_sub_interiorT_ne(level,m)%haloTout,nlocal/2,nlocal/2,nz+2)
-            call mnh_allocate_mg_halo(tab_sub_interiorT_sw(level,m)%haloTin,nlocal/2,nlocal/2,nz+2)
-            call mnh_allocate_mg_halo(tab_sub_interiorT_sw(level,m)%haloTout,nlocal/2,nlocal/2,nz+2)
-            call mnh_allocate_mg_halo(tab_sub_interiorT_se(level,m)%haloTin,nlocal/2,nlocal/2,nz+2)
-            call mnh_allocate_mg_halo(tab_sub_interiorT_se(level,m)%haloTout,nlocal/2,nlocal/2,nz+2)            
+#endif
+            call mnh_allocate_mg_halo(tab_sub_interiorT_ne(level,m)%haloTin,nlocalx/2,nlocaly/2,nz+2)
+            call mnh_allocate_mg_halo(tab_sub_interiorT_ne(level,m)%haloTout,nlocalx/2,nlocaly/2,nz+2)
+            call mnh_allocate_mg_halo(tab_sub_interiorT_sw(level,m)%haloTin,nlocalx/2,nlocaly/2,nz+2)
+            call mnh_allocate_mg_halo(tab_sub_interiorT_sw(level,m)%haloTout,nlocalx/2,nlocaly/2,nz+2)
+            call mnh_allocate_mg_halo(tab_sub_interiorT_se(level,m)%haloTin,nlocalx/2,nlocaly/2,nz+2)
+            call mnh_allocate_mg_halo(tab_sub_interiorT_se(level,m)%haloTout,nlocalx/2,nlocaly/2,nz+2)            
          end if
       end if
       ! --- Create timers ---
@@ -410,11 +437,15 @@ contains
         reduced_m = .true.
         m = m-1
         nlocal = 2*nlocal
+        nlocalx = 2*nlocalx
+        nlocaly = 2*nlocaly
         cycle
       end if
       reduced_m = .false.
       level = level-1
       nlocal = nlocal/2
+      nlocalx = nlocalx/2
+      nlocaly = nlocaly/2
     end do
 
   end subroutine comm_initialise
@@ -436,9 +467,13 @@ contains
     integer :: nlocal,n
     character(len=80) :: s
 
+    integer :: nlocalx,nlocaly
+
    ! Local size of (horizontal) grid
     n = grid_param%n
     nlocal = n/2**pproc
+    nlocalx = n/2**pprocx
+    nlocaly = n/2**pprocy
 
     m = pproc
     level = n_lev
@@ -453,24 +488,30 @@ contains
       ! --- Print out timer information ---
       call print_elapsed(t_haloswap(level,m),.True.,1.0_rl)
       ! --- Free halo data types ---
+#ifndef MPIVIDE
       if (LUseO) call mpi_type_free(halo_ns(level,m),ierr)
       if (LUseT) call mpi_type_free(halo_nst(level,m),ierr)
       if (LUseT) call mpi_type_free(halo_wet(level,m),ierr)
       ! --- Free interior data types ---
       if (LUseO) call mpi_type_free(interior(level,m),ierr)
       if (LUseO) call mpi_type_free(sub_interior(level,m),ierr)
-      if (LUseT .and. (nlocal /= 0 ) ) call mpi_type_free(interiorT(level,m),ierr)
-      if (LUseT .and. ( (nlocal/2) /= 0 ) ) call mpi_type_free(sub_interiorT(level,m),ierr)
+      if (LUseT .and. (nlocalx /= 0 )  .and. ( nlocaly /= 0 ) ) call mpi_type_free(interiorT(level,m),ierr)
+      if (LUseT .and. ( (nlocalx/2) /= 0 )  .and. ( (nlocaly/2) /= 0 )) call mpi_type_free(sub_interiorT(level,m),ierr)
+#endif
       ! If we are below L_split, split data
       if ( (level .le. lev_split) .and. (m > 0) .and. (.not. reduced_m)) then
         reduced_m = .true.
         m = m-1
         nlocal = 2*nlocal
+        nlocalx = 2*nlocalx
+        nlocaly = 2*nlocaly
         cycle
       end if
       reduced_m = .false.
       level = level-1
       nlocal = nlocal/2
+      nlocalx = nlocalx/2
+      nlocaly = nlocaly/2
     end do
     do m=pproc,0,-1
       write(s,'("m = ",I3)') m
@@ -524,7 +565,12 @@ contains
     integer :: icompy_min,icompy_max, icompx_min,icompx_max
     real , dimension(:,:,:) , pointer , contiguous :: za_st,zb_st
 
+    integer :: nlocalx,nlocaly
+
     nlocal = a%ix_max-a%ix_min+1
+    nlocalx = a%ix_max-a%ix_min+1
+    nlocaly = a%iy_max-a%iy_min+1
+    
     nz = a%grid_param%nz
 
     iy_min = a%iy_min
@@ -678,7 +724,11 @@ contains
     call scalarprod_mnh(m, a, a, s)
     s = sqrt(s)
     if (i_am_master_mpi) then
+#ifdef MG_DEBUG
+       write(STDOUT,'("Print_norm::",A,2I3,E23.10)') message, l,m,s
+#else
        write(STDOUT,'("Print_norm::",A,2I3,E23.15)') message, l,m,s
+#endif
        call flush(STDOUT)
     end if
 
@@ -818,11 +868,15 @@ contains
     integer :: neighbour_w_rank
     integer :: yoffset, blocklength
 
+    integer :: a_nx,a_ny, blocklengthx,blocklengthy
+
     halo_size = comm_param%halo_size
 
     ! Do nothing if we are only using one processor
-    if (m > 0) then
+    if (m >= 0) then
       a_n = a%ix_max-a%ix_min+1
+      a_nx = a%ix_max-a%ix_min+1
+      a_ny = a%iy_max-a%iy_min+1
       nz = a%grid_param%nz
       stepsize = 2**(pproc-m)
 
@@ -832,11 +886,13 @@ contains
 
       ! Work out ranks of neighbours
       ! W -> E
+#ifndef MPIVIDE
       call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, &
                           neighbour_w_rank,neighbour_e_rank,ierr)
       ! N -> S
       call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, &
                           neighbour_n_rank,neighbour_s_rank,ierr)
+#endif
       if ( (stepsize == 1) .or.                   &
          (  (mod(p_horiz(1),stepsize) == 0) .and. &
             (mod(p_horiz(2),stepsize) == 0) ) ) then
@@ -844,9 +900,13 @@ contains
           ! Do not include corners in send/recv
           yoffset = 1
           blocklength = a_n*(nz+2)*halo_size
+          blocklengthx = a_nx*(nz+2)*halo_size
+          blocklengthy = a_ny*(nz+2)*halo_size
         else
           yoffset = 1-halo_size
           blocklength = (a_n+2*halo_size)*(nz+2)*halo_size
+          blocklengthx = (a_nx+2*halo_size)*(nz+2)*halo_size
+          blocklengthy = (a_ny+2*halo_size)*(nz+2)*halo_size
         end if
         ! Receive from north
         recvtag = 1002
@@ -861,23 +921,23 @@ contains
         end if
         ! Receive from south
         recvtag = 1003
-        if (LUseO) call mpi_irecv(a%s(0,a_n+1,1),1,                 &
+        if (LUseO) call mpi_irecv(a%s(0,a_ny+1,1),1,                 &
                        halo_ns(level,m),neighbour_s_rank,recvtag,   &
                        MPI_COMM_HORIZ, recv_requests(2), ierr)
         recvtag = 1013
         if (LUseT) then
-           call mpi_irecv(a%st(1,a_n+1,0),1,                &
+           call mpi_irecv(a%st(1,a_ny+1,0),1,                &
                 halo_nst(level,m),neighbour_s_rank,recvtag,  &
                 MPI_COMM_HORIZ, recv_requestsT(2), ierr)
         end if
         ! Send to south
         sendtag = 1002
-        if (LUseO) call mpi_isend(a%s(0,a_n-(halo_size-1),1),1,     &
+        if (LUseO) call mpi_isend(a%s(0,a_ny-(halo_size-1),1),1,     &
                        halo_ns(level,m),neighbour_s_rank,sendtag,   &
                        MPI_COMM_HORIZ, send_requests(1), ierr)
         sendtag = 1012
         if (LUseT) then
-           call mpi_isend(a%st(1,a_n-(halo_size-1),0),1,    &
+           call mpi_isend(a%st(1,a_ny-(halo_size-1),0),1,    &
                 halo_nst(level,m),neighbour_s_rank,sendtag,  &
                 MPI_COMM_HORIZ, send_requestsT(1), ierr)
         end if
@@ -905,23 +965,23 @@ contains
         end if
         ! Receive from east
         sendtag = 1001
-        if (LUseO) call mpi_irecv(a%s(0,yoffset,a_n+1),blocklength,          &
+        if (LUseO) call mpi_irecv(a%s(0,yoffset,a_nx+1),blocklength,          &
                        MPI_DOUBLE_PRECISION,neighbour_e_rank,recvtag, &
                        MPI_COMM_HORIZ, recv_requests(4), ierr)
         sendtag = 1011
         if (LUseT) then
-           call mpi_irecv(a%st(a_n+1,0,0),1,          &
+           call mpi_irecv(a%st(a_nx+1,0,0),1,          &
                 halo_wet(level,m),neighbour_e_rank,recvtag, &
                 MPI_COMM_HORIZ, recv_requestsT(4), ierr)
         end if
         ! Send to east
         sendtag = 1000
-        if (LUseO) call mpi_isend(a%s(0,yoffset,a_n-(halo_size-1)),blocklength,  &
+        if (LUseO) call mpi_isend(a%s(0,yoffset,a_nx-(halo_size-1)),blocklength,  &
                        MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, &
                        MPI_COMM_HORIZ, send_requests(3), ierr)
         sendtag = 1010
         if (LUseT) then
-           call mpi_isend(a%st(a_n-(halo_size-1),0,0),1,  &
+           call mpi_isend(a%st(a_nx-(halo_size-1),0,0),1,  &
                 halo_wet(level,m),neighbour_e_rank,sendtag, &
                 MPI_COMM_HORIZ, send_requestsT(3), ierr)
         end if
@@ -967,7 +1027,7 @@ contains
     halo_size = comm_param%halo_size
 
     ! Do nothing if we are only using one processor
-    if (m > 0) then
+    if (m >= 0) then
       a_n = a%ix_max-a%ix_min+1
       nz = a%grid_param%nz
       stepsize = 2**(pproc-m)
@@ -978,11 +1038,13 @@ contains
 
       ! Work out ranks of neighbours
       ! W -> E
+#ifndef MPIVIDE
       call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, &
                           neighbour_w_rank,neighbour_e_rank,ierr)
       ! N -> S
       call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, &
                           neighbour_n_rank,neighbour_s_rank,ierr)
+#endif
       if ( (stepsize == 1) .or.                   &
          (  (mod(p_horiz(1),stepsize) == 0) .and. &
             (mod(p_horiz(2),stepsize) == 0) ) ) then
@@ -1095,14 +1157,21 @@ contains
 
     LOGICAL :: Gneighbour_s,Gneighbour_n,Gneighbour_e,Gneighbour_w
 
+    integer :: a_nx,a_ny, blocklengthx,blocklengthy
+
     halo_size = comm_param%halo_size
 
+#ifdef MG_DEBUG
+    call print_scalaprod2(level , m  , a  , "InBeg haloswap_mnh a(level,m  )=" )
+#endif
     ! Do nothing if we are only using one processor
-    if (m > 0) then
+    if (m >= 0) then
       if (comm_measuretime) then
         call start_timer(t_haloswap(level,m))
       end if
       a_n = a%ix_max-a%ix_min+1
+      a_nx = a%ix_max-a%ix_min+1
+      a_ny = a%iy_max-a%iy_min+1
       nz = a%grid_param%nz
       stepsize = 2**(pproc-m)
 
@@ -1112,11 +1181,13 @@ contains
 
       ! Work out ranks of neighbours
       ! W -> E
+#ifndef MPIVIDE
       call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, &
                           neighbour_w_rank,neighbour_e_rank,ierr)
       ! N -> S
       call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, &
                           neighbour_n_rank,neighbour_s_rank,ierr)
+#endif
       if ( (stepsize == 1) .or.                   &
          (  (mod(p_horiz(1),stepsize) == 0) .and. &
             (mod(p_horiz(2),stepsize) == 0) ) ) then
@@ -1124,9 +1195,13 @@ contains
           ! Do not include corners in send/recv
           yoffset = 1
           blocklength = a_n*(nz+2)*halo_size
+          blocklengthx = a_nx*(nz+2)*halo_size
+          blocklengthy = a_ny*(nz+2)*halo_size
         else
           yoffset = 1-halo_size
           blocklength = (a_n+2*halo_size)*(nz+2)*halo_size
+          blocklengthx = (a_nx+2*halo_size)*(nz+2)*halo_size
+          blocklengthy = (a_ny+2*halo_size)*(nz+2)*halo_size         
         end if
         !
         zst => a%st
@@ -1190,7 +1265,11 @@ contains
      if (comm_measuretime) then
         call finish_timer(t_haloswap(level,m))
      end if
-  end if !  (m > 0)
+  end if !  (m >= 0)
+
+#ifdef MG_DEBUG
+    call print_scalaprod2(level , m  , a  , "OuBeg haloswap_mnh a(level,m  )=" )
+#endif
 
 contains
   subroutine haloswap_mnh_dim(pztab_halo_st_haloTin,pztab_halo_nt_haloTin,&
@@ -1200,15 +1279,15 @@ contains
                               pzst)
 
     implicit none
-    real :: pztab_halo_st_haloTin(1:a_n,1:halo_size,1:nz+2), &
-            pztab_halo_nt_haloTin(1:a_n,1:halo_size,1:nz+2), &
-            pztab_halo_et_haloTin(1:halo_size,1:a_n+2*halo_size,1:nz+2), &
-            pztab_halo_wt_haloTin(1:halo_size,1:a_n+2*halo_size,1:nz+2), &
-            pztab_halo_nt_haloTout(1:a_n,1:halo_size,1:nz+2), &
-            pztab_halo_st_haloTout(1:a_n,1:halo_size,1:nz+2), &
-            pztab_halo_wt_haloTout(1:halo_size,1:a_n+2*halo_size,1:nz+2), &
-            pztab_halo_et_haloTout(1:halo_size,1:a_n+2*halo_size,1:nz+2), &
-            pzst(1-halo_size:a_n+halo_size,1-halo_size:a_n+halo_size,0:nz+1)
+    real :: pztab_halo_st_haloTin(1:a_nx,1:halo_size,1:nz+2), &
+            pztab_halo_nt_haloTin(1:a_nx,1:halo_size,1:nz+2), &
+            pztab_halo_et_haloTin(1:halo_size,1:a_ny+2*halo_size,1:nz+2), &
+            pztab_halo_wt_haloTin(1:halo_size,1:a_ny+2*halo_size,1:nz+2), &
+            pztab_halo_nt_haloTout(1:a_nx,1:halo_size,1:nz+2), &
+            pztab_halo_st_haloTout(1:a_nx,1:halo_size,1:nz+2), &
+            pztab_halo_wt_haloTout(1:halo_size,1:a_ny+2*halo_size,1:nz+2), &
+            pztab_halo_et_haloTout(1:halo_size,1:a_ny+2*halo_size,1:nz+2), &
+            pzst(1-halo_size:a_nx+halo_size,1-halo_size:a_ny+halo_size,0:nz+1)
         !
         ! Do Comm
         !
@@ -1221,8 +1300,8 @@ contains
            if (Gneighbour_s) then
 !!$           pztab_halo_st_haloTin => tab_halo_st(level,m)%haloTin
            !$acc kernels async(IS_SOUTH) present_cr(pzst,pztab_halo_st_haloTin)
-           !$mnh_do_concurrent( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 )
-              pztab_halo_st_haloTin(ii,ij,ik) = pzst(ii,ij+a_n-halo_size,ik-1)
+           !$mnh_do_concurrent( ii=1:a_nx,ij=1:halo_size,ik=1:nz+2 )
+              pztab_halo_st_haloTin(ii,ij,ik) = pzst(ii,ij+a_ny-halo_size,ik-1)
            !$mnh_end_do()
            !$acc end kernels
            end if
@@ -1230,7 +1309,7 @@ contains
            if (Gneighbour_n) then
 !!$           pztab_halo_nt_haloTin => tab_halo_nt(level,m)%haloTin
            !$acc kernels async(IS_NORTH) present_cr(pzst,pztab_halo_nt_haloTin)
-           !$mnh_do_concurrent( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 )           
+           !$mnh_do_concurrent( ii=1:a_nx,ij=1:halo_size,ik=1:nz+2 )           
               pztab_halo_nt_haloTin(ii,ij,ik) = pzst(ii,ij,ik-1)
            !$mnh_end_do()
            !$acc end kernels
@@ -1239,8 +1318,8 @@ contains
            if (Gneighbour_e) then
 !!$           pztab_halo_et_haloTin => tab_halo_et(level,m)%haloTin
            !$acc kernels async(IS_EAST) present_cr(pzst,pztab_halo_et_haloTin)
-           !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) 
-              pztab_halo_et_haloTin(ii,ij,ik) = pzst(ii+a_n-halo_size,ij-halo_size,ik-1)
+           !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_ny+2*halo_size,ik=1:nz+2 ) 
+              pztab_halo_et_haloTin(ii,ij,ik) = pzst(ii+a_nx-halo_size,ij-halo_size,ik-1)
            !$mnh_end_do()
            !$acc end kernels
            end if
@@ -1248,7 +1327,7 @@ contains
            if (Gneighbour_w) then
 !!$           pztab_halo_wt_haloTin => tab_halo_wt(level,m)%haloTin
            !$acc kernels async(IS_WEST) present_cr(pzst,pztab_halo_wt_haloTin)
-           !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) 
+           !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_ny+2*halo_size,ik=1:nz+2 ) 
               pztab_halo_wt_haloTin(ii,ij,ik) = pzst(ii,ij-halo_size,ik-1)
            !$mnh_end_do()
            !$acc end kernels
@@ -1280,7 +1359,7 @@ contains
         end if
         ! Receive from south
         recvtag = 1003
-        if (LUseO) call mpi_irecv(a%s(0,a_n+1,1),1,                 &
+        if (LUseO) call mpi_irecv(a%s(0,a_ny+1,1),1,                 &
                        halo_ns(level,m),neighbour_s_rank,recvtag,   &
                        MPI_COMM_HORIZ, requests_ns(2), ierr)
         recvtag = 1013
@@ -1296,7 +1375,7 @@ contains
            end if
            !print*,"mpi_irecv(pztab_halo_st_haloTout,neighbour_s_rank=",neighbour_s_rank
 #else
-           call mpi_irecv(a%st(1,a_n+1,0),1,                &
+           call mpi_irecv(a%st(1,a_ny+1,0),1,                &
                        halo_nst(level,m),neighbour_s_rank,recvtag,  &
                        MPI_COMM_HORIZ, requests_nsT(2), ierr)           
 #endif
@@ -1309,7 +1388,7 @@ contains
 #endif        
         ! Send to south
         sendtag = 1002
-        if (LUseO) call mpi_isend(a%s(0,a_n-(halo_size-1),1),1,     &
+        if (LUseO) call mpi_isend(a%s(0,a_ny-(halo_size-1),1),1,     &
                        halo_ns(level,m),neighbour_s_rank,sendtag,   &
                        MPI_COMM_HORIZ, requests_ns(3), ierr)
         sendtag = 1012
@@ -1324,7 +1403,7 @@ contains
            end if
            !print*,"mpi_isend(pztab_halo_st_haloTin,neighbour_s_rank=",neighbour_s_rank
 #else   
-           call mpi_isend(a%st(1,a_n-(halo_size-1),0),1,    &
+           call mpi_isend(a%st(1,a_ny-(halo_size-1),0),1,    &
                        halo_nst(level,m),neighbour_s_rank,sendtag,  &
                        MPI_COMM_HORIZ, requests_nsT(3), ierr)
 #endif
@@ -1381,7 +1460,7 @@ contains
         end if
         ! Receive from east
         sendtag = 1001
-        if (LUseO) call mpi_irecv(a%s(0,yoffset,a_n+1),blocklength,          &
+        if (LUseO) call mpi_irecv(a%s(0,yoffset,a_nx+1),blocklength,          &
                        MPI_DOUBLE_PRECISION,neighbour_e_rank,recvtag, &
                        MPI_COMM_HORIZ, requests_ew(2), ierr)
         sendtag = 1011
@@ -1397,7 +1476,7 @@ contains
            end if
            !print*,"mpi_irecv(pztab_halo_et_haloTout,neighbour_e_rank=",neighbour_e_rank
 #else
-           call mpi_irecv(a%st(a_n+1,0,0),1,          &
+           call mpi_irecv(a%st(a_nx+1,0,0),1,          &
                        halo_wet(level,m),neighbour_e_rank,recvtag, &
                        MPI_COMM_HORIZ, requests_ewT(2), ierr)
            
@@ -1405,7 +1484,7 @@ contains
         end if
         ! Send to east
         sendtag = 1000
-        if (LUseO) call mpi_isend(a%s(0,yoffset,a_n-(halo_size-1)),blocklength,  &
+        if (LUseO) call mpi_isend(a%s(0,yoffset,a_nx-(halo_size-1)),blocklength,  &
                        MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, &
                        MPI_COMM_HORIZ, requests_ew(3), ierr)
         sendtag = 1010
@@ -1420,7 +1499,7 @@ contains
            end if
            !print*,"mpi_isend(pztab_halo_et_haloTin,neighbour_e_rank=",neighbour_e_rank
 #else
-           call mpi_isend(a%st(a_n-(halo_size-1),0,0),1,  &
+           call mpi_isend(a%st(a_nx-(halo_size-1),0,0),1,  &
                        halo_wet(level,m),neighbour_e_rank,sendtag, &
                        MPI_COMM_HORIZ, requests_ewT(3), ierr)           
 #endif
@@ -1460,7 +1539,7 @@ contains
            if (Gneighbour_n) then
            ! copy north halo for GPU managed
            !$acc kernels async(IS_NORTH) present_cr(pzst,pztab_halo_nt_haloTout)
-           !$mnh_do_concurrent( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 )
+           !$mnh_do_concurrent( ii=1:a_nx,ij=1:halo_size,ik=1:nz+2 )
               pzst(ii,ij-halo_size,ik-1) = pztab_halo_nt_haloTout(ii,ij,ik)
            !$mnh_end_do()
            !$acc end kernels
@@ -1468,15 +1547,15 @@ contains
            if (Gneighbour_s) then
            ! copy south halo for GPU managed
            !$acc kernels async(IS_SOUTH) present_cr(pzst,pztab_halo_st_haloTout)
-           !$mnh_do_concurrent( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 )
-              pzst(ii,ij+a_n,ik-1) = pztab_halo_st_haloTout(ii,ij,ik)
+           !$mnh_do_concurrent( ii=1:a_nx,ij=1:halo_size,ik=1:nz+2 )
+              pzst(ii,ij+a_ny,ik-1) = pztab_halo_st_haloTout(ii,ij,ik)
            !$mnh_end_do()
            !$acc end kernels
            end if
            if (Gneighbour_w) then
            ! copy west halo for GPU managed
            !$acc kernels async(IS_WEST) present_cr(pzst,pztab_halo_wt_haloTout)
-           !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 )
+           !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_ny+2*halo_size,ik=1:nz+2 )
               pzst(ii-halo_size,ij-halo_size,ik-1) = pztab_halo_wt_haloTout(ii,ij,ik)
            !$mnh_end_do()
            !$acc end kernels
@@ -1484,8 +1563,8 @@ contains
            if (Gneighbour_e) then
            ! copy east halo for GPU managed
            !$acc kernels async(IS_EAST) present_cr(pzst,pztab_halo_et_haloTout)
-           !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 )
-              pzst(ii+a_n,ij-halo_size,ik-1) = pztab_halo_et_haloTout(ii,ij,ik)
+           !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_ny+2*halo_size,ik=1:nz+2 )
+              pzst(ii+a_nx,ij-halo_size,ik-1) = pztab_halo_et_haloTout(ii,ij,ik)
            !$mnh_end_do()
            !$acc end kernels           
            end if 
@@ -1539,7 +1618,7 @@ contains
     halo_size = comm_param%halo_size
 
     ! Do nothing if we are only using one processor
-    if (m > 0) then
+    if (m >= 0) then
       if (comm_measuretime) then
         call start_timer(t_haloswap(level,m))
       end if
@@ -1553,11 +1632,13 @@ contains
 
       ! Work out ranks of neighbours
       ! W -> E
+#ifndef MPIVIDE
       call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, &
                           neighbour_w_rank,neighbour_e_rank,ierr)
       ! N -> S
       call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, &
                           neighbour_n_rank,neighbour_s_rank,ierr)
+#endif
       if ( (stepsize == 1) .or.                   &
          (  (mod(p_horiz(1),stepsize) == 0) .and. &
             (mod(p_horiz(2),stepsize) == 0) ) ) then
@@ -1668,12 +1749,20 @@ contains
     real , pointer , contiguous , dimension(:,:,:) :: ztab_sub_interiorT_sw_m_1_haloTout
     real , pointer , contiguous , dimension(:,:,:) :: ztab_sub_interiorT_se_m_1_haloTout
 
+    integer :: a_nx,a_ny, b_nx,b_ny
+
     call start_timer(t_collect(m))
 
     stepsize = 2**(pproc-m)
 
     a_n = a%ix_max-a%ix_min+1
+    a_nx = a%ix_max-a%ix_min+1 
+    a_ny = a%iy_max-a%iy_min+1
+
     b_n = b%ix_max-b%ix_min+1
+    b_nx = b%ix_max-b%ix_min+1
+    b_ny = b%iy_max-b%iy_min+1
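+    ! a_nx/a_ny, b_nx/b_ny: local x/y extents of the field being collected (a) and of the gathered result (b)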
+    
     nz = b%grid_param%nz
 
     ! Work out rank, only execute on relevant processes
@@ -1699,12 +1788,14 @@ contains
       ! NW receives from the other three processes
       if ( corner_nw ) then
         ! Receive from NE
+#ifndef MPIVIDE
         call mpi_cart_rank(MPI_COMM_HORIZ, &
                            (/p_horiz(1),p_horiz(2)+stepsize/), &
                            source_rank, &
                            ierr)
+#endif
         recv_tag = 1000
-        if (LUseO) call mpi_irecv(b%s(0,1,b_n/2+1),1,sub_interior(level,m-1),source_rank, recv_tag, MPI_COMM_HORIZ, &
+        if (LUseO) call mpi_irecv(b%s(0,1,b_nx/2+1),1,sub_interior(level,m-1),source_rank, recv_tag, MPI_COMM_HORIZ, &
                            recv_request(1),ierr)
         recv_tag = 1010
         if (LUseT) then
@@ -1716,7 +1807,7 @@ contains
                 recv_requestT(1),ierr)
            !$acc end host_data
 #else
-           call mpi_irecv(b%st(b_n/2+1,1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
+           call mpi_irecv(b%st(b_nx/2+1,1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
                 recv_requestT(1),ierr)
 #endif
         end if
@@ -1725,12 +1816,14 @@ contains
     call fatalerror("Collect: receive from NE failed in mpi_irecv().")
 #endif
          ! Receive from SW
+#ifndef MPIVIDE
         call mpi_cart_rank(MPI_COMM_HORIZ, &
                            (/p_horiz(1)+stepsize,p_horiz(2)/), &
                            source_rank, &
                            ierr)
+#endif
         recv_tag = 1001
-        if (LUseO) call mpi_irecv(b%s(0,b_n/2+1,1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
+        if (LUseO) call mpi_irecv(b%s(0,b_ny/2+1,1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
                            recv_request(2),ierr)
         recv_tag = 1011
         if (LUseT) then
@@ -1742,7 +1835,7 @@ contains
                 recv_requestT(2),ierr)
            !$acc end host_data
 #else
-           call mpi_irecv(b%st(1,b_n/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
+           call mpi_irecv(b%st(1,b_ny/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
                 recv_requestT(2),ierr)         
 #endif           
         endif
@@ -1752,12 +1845,14 @@ contains
     call fatalerror("Collect: receive from SW failed in mpi_irecv().")
 #endif
         ! Receive from SE
+#ifndef MPIVIDE
         call mpi_cart_rank(MPI_COMM_HORIZ, &
                            (/p_horiz(1)+stepsize,p_horiz(2)+stepsize/), &
                            source_rank, &
                            ierr)
+#endif
         recv_tag = 1002
-        if (LUseO) call mpi_irecv(b%s(0,b_n/2+1,b_n/2+1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
+        if (LUseO) call mpi_irecv(b%s(0,b_ny/2+1,b_nx/2+1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
                            recv_request(3),ierr)
         recv_tag = 1012
         if (LUseT) then
@@ -1769,7 +1864,7 @@ contains
                 recv_requestT(3),ierr)
            !$acc end host_data
 #else
-           call mpi_irecv(b%st(b_n/2+1,b_n/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
+           call mpi_irecv(b%st(b_nx/2+1,b_ny/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
                 recv_requestT(3),ierr)           
 #endif
         end if
@@ -1778,18 +1873,18 @@ contains
     call fatalerror("Collect: receive from SE failed in mpi_irecv().")
 #endif
         ! Copy local data while waiting for data from other processes
-        if (LUseO) b%s(0:nz+1,1:a_n,1:a_n) = a%s(0:nz+1,1:a_n,1:a_n)
+        if (LUseO) b%s(0:nz+1,1:a_ny,1:a_nx) = a%s(0:nz+1,1:a_ny,1:a_nx)
         if (LUseT) then
 #ifdef MNH_GPUDIRECT           
            zb_st => b%st
            za_st => a%st
            !$acc kernels present_cr(zb_st,za_st)
-           !$mnh_do_concurrent(ii=1:a_n,ij=1:a_n,ik=1:nz+2)
+           !$mnh_do_concurrent(ii=1:a_nx,ij=1:a_ny,ik=1:nz+2)
               zb_st(ii,ij,ik-1) = za_st(ii,ij,ik-1)
            !$mnh_end_do()
            !$acc end kernels
 #else           
-           b%st(1:a_n,1:a_n,0:nz+1) = a%st(1:a_n,1:a_n,0:nz+1)
+           b%st(1:a_nx,1:a_ny,0:nz+1) = a%st(1:a_nx,1:a_ny,0:nz+1)
 #endif
         end if
         ! Wait for receives to complete before proceeding
@@ -1801,20 +1896,20 @@ contains
            ! copy from buffer for GPU DIRECT
            ! Receive from NE
            !$acc kernels present_cr(zb_st,ztab_sub_interiorT_ne_m_1_haloTout)
-           !$mnh_do_concurrent(ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2)
-              zb_st(ii+b_n/2,ij,ik-1) = ztab_sub_interiorT_ne_m_1_haloTout(ii,ij,ik)
+           !$mnh_do_concurrent(ii=1:b_nx/2,ij=1:b_ny/2,ik=1:nz+2)
+              zb_st(ii+b_nx/2,ij,ik-1) = ztab_sub_interiorT_ne_m_1_haloTout(ii,ij,ik)
            !$mnh_end_do()
            !$acc end kernels           
            ! Receive from SW
            !$acc kernels present_cr(zb_st,ztab_sub_interiorT_sw_m_1_haloTout)
-           !$mnh_do_concurrent(ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2)
-              zb_st(ii,ij+b_n/2,ik-1) = ztab_sub_interiorT_sw_m_1_haloTout(ii,ij,ik)
+           !$mnh_do_concurrent(ii=1:b_nx/2,ij=1:b_ny/2,ik=1:nz+2)
+              zb_st(ii,ij+b_ny/2,ik-1) = ztab_sub_interiorT_sw_m_1_haloTout(ii,ij,ik)
            !$mnh_end_do()
            !$acc end kernels
            ! Receive from SE
            !$acc kernels present_cr(zb_st,ztab_sub_interiorT_se_m_1_haloTout)
-           !$mnh_do_concurrent(ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2)
-               zb_st(ii+b_n/2,ij+b_n/2,ik-1) = ztab_sub_interiorT_se_m_1_haloTout(ii,ij,ik)
+           !$mnh_do_concurrent(ii=1:b_nx/2,ij=1:b_ny/2,ik=1:nz+2)
+               zb_st(ii+b_nx/2,ij+b_ny/2,ik-1) = ztab_sub_interiorT_se_m_1_haloTout(ii,ij,ik)
            !$mnh_end_do()
            !$acc end kernels
         end if
@@ -1822,11 +1917,12 @@ contains
       end if
       if ( corner_ne ) then
         ! Send to NW
+#ifndef MPIVIDE
         call mpi_cart_rank(MPI_COMM_HORIZ, &
                            (/p_horiz(1),p_horiz(2)-stepsize/), &
                            dest_rank, &
                            ierr)
-
+#endif
         za_st => a%st
         
         send_tag = 1000
@@ -1836,7 +1932,7 @@ contains
 #ifdef MNH_GPUDIRECT
            ztab_interiorT_ne_m_haloTin => tab_interiorT_ne(level,m)%haloTin
            !$acc kernels present_cr(ztab_interiorT_ne_m_haloTin,za_st)
-           !$mnh_do_concurrent(ii=1:a_n,ij=1:a_n,ik=1:nz+2)
+           !$mnh_do_concurrent(ii=1:a_nx,ij=1:a_ny,ik=1:nz+2)
               ztab_interiorT_ne_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1)
            !$mnh_end_do()
            !$acc end kernels           
@@ -1855,11 +1951,12 @@ contains
       end if
       if ( corner_sw ) then
         ! Send to NW
+#ifndef MPIVIDE
         call mpi_cart_rank(MPI_COMM_HORIZ, &
                            (/p_horiz(1)-stepsize,p_horiz(2)/), &
                            dest_rank, &
                            ierr)
-        
+#endif
         za_st => a%st
         
         send_tag = 1001
@@ -1869,7 +1966,7 @@ contains
 #ifdef MNH_GPUDIRECT
            ztab_interiorT_sw_m_haloTin => tab_interiorT_sw(level,m)%haloTin
            !$acc kernels present_cr(ztab_interiorT_sw_m_haloTin,za_st)
-           !$mnh_do_concurrent(ii=1:a_n,ij=1:a_n,ik=1:nz+2)
+           !$mnh_do_concurrent(ii=1:a_nx,ij=1:a_ny,ik=1:nz+2)
               ztab_interiorT_sw_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1)
            !$mnh_end_do()
            !$acc end kernels           
@@ -1888,11 +1985,13 @@ contains
       end if
       if ( corner_se ) then
         ! send to NW
+#ifndef MPIVIDE
         call mpi_cart_rank(MPI_COMM_HORIZ, &
                            (/p_horiz(1)-stepsize,p_horiz(2)-stepsize/), &
                            dest_rank, &
                            ierr)
 
+#endif
         za_st => a%st
         
         send_tag = 1002
@@ -1902,7 +2001,7 @@ contains
 #ifdef MNH_GPUDIRECT
            ztab_interiorT_se_m_haloTin => tab_interiorT_se(level,m)%haloTin
            !$acc kernels present_cr(ztab_interiorT_se_m_haloTin,za_st)
-           !$mnh_do_concurrent(ii=1:a_n,ij=1:a_n,ik=1:nz+2)
+           !$mnh_do_concurrent(ii=1:a_nx,ij=1:a_ny,ik=1:nz+2)
               ztab_interiorT_se_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1)
            !$mnh_end_do()
            !$acc end kernels
@@ -1967,12 +2066,20 @@ contains
     real , pointer , contiguous , dimension(:,:,:) :: ztab_interiorT_sw_m_haloTout
     real , pointer , contiguous , dimension(:,:,:) :: ztab_interiorT_se_m_haloTout
 
+    integer :: a_nx,a_ny, b_nx,b_ny
+
     call start_timer(t_distribute(m))
 
     stepsize = 2**(pproc-m)
 
     a_n = a%ix_max-a%ix_min+1
+    a_nx = a%ix_max-a%ix_min+1 
+    a_ny = a%iy_max-a%iy_min+1
+
     b_n = b%ix_max-b%ix_min+1
+    b_nx = b%ix_max-b%ix_min+1
+    b_ny = b%iy_max-b%iy_min+1
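+    ! a_nx/a_ny, b_nx/b_ny: local x/y extents of the field being distributed (a) and of the local result (b)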
+    
     nz = a%grid_param%nz
 
     ! Work out rank, only execute on relevant processes
@@ -1995,23 +2102,25 @@ contains
       end if
       if ( corner_nw ) then
         ! (Asynchronous) send to NE
+#ifndef MPIVIDE
         call mpi_cart_rank(MPI_COMM_HORIZ, &
                            (/p_horiz(1),p_horiz(2)+stepsize/), &
                            dest_rank, &
                            ierr)
         
+#endif
         za_st => a%st
         
         send_tag = 1000
-        if (LUseO) call mpi_isend(a%s(0,1,a_n/2+1), 1,sub_interior(level,m-1),dest_rank, send_tag, &
+        if (LUseO) call mpi_isend(a%s(0,1,a_nx/2+1), 1,sub_interior(level,m-1),dest_rank, send_tag, &
                        MPI_COMM_HORIZ,send_request(1),ierr)
         send_tag = 1010
         if (LUseT) then
 #ifdef MNH_GPUDIRECT
            ztab_sub_interiorT_ne_m_1_haloTin => tab_sub_interiorT_ne(level,m-1)%haloTin
            !$acc kernels present_cr(ztab_sub_interiorT_ne_m_1_haloTin,za_st)
-           !$mnh_do_concurrent(ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2)
-              ztab_sub_interiorT_ne_m_1_haloTin(ii,ij,ik) = za_st(ii+a_n/2,ij,ik-1)
+           !$mnh_do_concurrent(ii=1:a_nx/2,ij=1:a_ny/2,ik=1:nz+2)
+              ztab_sub_interiorT_ne_m_1_haloTin(ii,ij,ik) = za_st(ii+a_nx/2,ij,ik-1)
            !$mnh_end_do()
            !$acc end kernels           
            !$acc host_data use_device(ztab_sub_interiorT_ne_m_1_haloTin)
@@ -2020,7 +2129,7 @@ contains
                 MPI_COMM_HORIZ,send_requestT(1),ierr)
            !$acc end host_data
 #else
-           call mpi_isend(a%st(a_n/2+1,1,0), 1,sub_interiorT(level,m-1),dest_rank, send_tag, &
+           call mpi_isend(a%st(a_nx/2+1,1,0), 1,sub_interiorT(level,m-1),dest_rank, send_tag, &
                 MPI_COMM_HORIZ,send_requestT(1),ierr)          
 #endif
         end if
@@ -2029,20 +2138,22 @@ contains
     call fatalerror("Distribute: send to NE failed in mpi_isend().")
 #endif
         ! (Asynchronous) send to SW
+#ifndef MPIVIDE
         call mpi_cart_rank(MPI_COMM_HORIZ, &
                            (/p_horiz(1)+stepsize,p_horiz(2)/), &
                            dest_rank, &
                            ierr)
+#endif
         send_tag = 1001
-        if (LUseO) call mpi_isend(a%s(0,a_n/2+1,1),1,sub_interior(level,m-1), dest_rank, send_tag, &
+        if (LUseO) call mpi_isend(a%s(0,a_ny/2+1,1),1,sub_interior(level,m-1), dest_rank, send_tag, &
                        MPI_COMM_HORIZ, send_request(2), ierr)
         send_tag = 1011
         if (LUseT) then
 #ifdef MNH_GPUDIRECT
            ztab_sub_interiorT_sw_m_1_haloTin => tab_sub_interiorT_sw(level,m-1)%haloTin
            !$acc kernels present_cr(ztab_sub_interiorT_sw_m_1_haloTin,za_st)
-           !$mnh_do_concurrent(ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2)
-              ztab_sub_interiorT_sw_m_1_haloTin(ii,ij,ik) = za_st(ii,ij+a_n/2,ik-1)
+           !$mnh_do_concurrent(ii=1:a_nx/2,ij=1:a_ny/2,ik=1:nz+2)
+              ztab_sub_interiorT_sw_m_1_haloTin(ii,ij,ik) = za_st(ii,ij+a_ny/2,ik-1)
            !$mnh_end_do()
            !$acc end kernels           
            !$acc host_data use_device(ztab_sub_interiorT_sw_m_1_haloTin)
@@ -2051,7 +2162,7 @@ contains
                 MPI_COMM_HORIZ, send_requestT(2), ierr)
            !$acc end host_data
 #else
-           call mpi_isend(a%st(1,a_n/2+1,0),1,sub_interiorT(level,m-1), dest_rank, send_tag, &
+           call mpi_isend(a%st(1,a_ny/2+1,0),1,sub_interiorT(level,m-1), dest_rank, send_tag, &
                 MPI_COMM_HORIZ, send_requestT(2), ierr)           
 #endif
         end if
@@ -2061,20 +2172,22 @@ contains
     call fatalerror("Distribute: send to SW failed in mpi_isend().")
 #endif
         ! (Asynchronous) send to SE
+#ifndef MPIVIDE
         call mpi_cart_rank(MPI_COMM_HORIZ, &
                            (/p_horiz(1)+stepsize,p_horiz(2)+stepsize/), &
                            dest_rank, &
                            ierr)
+#endif
         send_tag = 1002
-        if (LUseO) call mpi_isend(a%s(0,a_n/2+1,a_n/2+1),1,sub_interior(level,m-1), dest_rank, send_tag, &
+        if (LUseO) call mpi_isend(a%s(0,a_ny/2+1,a_nx/2+1),1,sub_interior(level,m-1), dest_rank, send_tag, &
                       MPI_COMM_HORIZ, send_request(3), ierr)
         send_tag = 1012
         if (LUseT) then
 #ifdef MNH_GPUDIRECT
            ztab_sub_interiorT_se_m_1_haloTin => tab_sub_interiorT_se(level,m-1)%haloTin
            !$acc kernels present_cr(ztab_sub_interiorT_se_m_1_haloTin,za_st)
-           !$mnh_do_concurrent(ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2)
-              ztab_sub_interiorT_se_m_1_haloTin(ii,ij,ik) = za_st(ii+a_n/2,ij+a_n/2,ik-1)
+           !$mnh_do_concurrent(ii=1:a_nx/2,ij=1:a_ny/2,ik=1:nz+2)
+              ztab_sub_interiorT_se_m_1_haloTin(ii,ij,ik) = za_st(ii+a_nx/2,ij+a_ny/2,ik-1)
            !$mnh_end_do()
            !$acc end kernels           
            !$acc host_data use_device(ztab_sub_interiorT_se_m_1_haloTin)
@@ -2083,7 +2196,7 @@ contains
                 MPI_COMM_HORIZ, send_requestT(3), ierr)
            !$acc end host_data
 #else
-           call mpi_isend(a%st(a_n/2+1,a_n/2+1,0),1,sub_interiorT(level,m-1), dest_rank, send_tag, &
+           call mpi_isend(a%st(a_nx/2+1,a_ny/2+1,0),1,sub_interiorT(level,m-1), dest_rank, send_tag, &
                 MPI_COMM_HORIZ, send_requestT(3), ierr)          
 #endif
         end if
@@ -2092,18 +2205,18 @@ contains
     call fatalerror("Distribute: send to SE failed in mpi_isend().")
 #endif
         ! While sending, copy local data
-        if (LUseO) b%s(0:nz+1,1:b_n,1:b_n) = a%s(0:nz+1,1:b_n,1:b_n)
+        if (LUseO) b%s(0:nz+1,1:b_ny,1:b_nx) = a%s(0:nz+1,1:b_ny,1:b_nx)
         if (LUseT) then
 #ifdef MNH_GPUDIRECT                     
            zb_st => b%st
            za_st => a%st
            !$acc kernels present_cr(zb_st,za_st)
-           !$mnh_do_concurrent(ii=1:b_n,ij=1:b_n,ik=1:nz+2)
+           !$mnh_do_concurrent(ii=1:b_nx,ij=1:b_ny,ik=1:nz+2)
               zb_st(ii,ij,ik-1) = za_st(ii,ij,ik-1)
            !$mnh_end_do()
            !$acc end kernels
 #else
-           b%st(1:b_n,1:b_n,0:nz+1) = a%st(1:b_n,1:b_n,0:nz+1)
+           b%st(1:b_nx,1:b_ny,0:nz+1) = a%st(1:b_nx,1:b_ny,0:nz+1)
 #endif
         end if
         ! Only proceed when async sends to complete
@@ -2113,11 +2226,12 @@ contains
       if ( corner_ne ) then
 
         ! Receive from NW
+#ifndef MPIVIDE
         call mpi_cart_rank(MPI_COMM_HORIZ, &
                            (/p_horiz(1),p_horiz(2)-stepsize/), &
                            source_rank, &
                            ierr)
-        
+#endif
         zb_st => b%st
         
         recv_tag = 1000
@@ -2131,7 +2245,7 @@ contains
                 MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)
            !$acc end host_data
            !$acc kernels present_cr(zb_st,ztab_interiorT_ne_m_haloTout)
-           !$mnh_do_concurrent(ii=1:b_n,ij=1:b_n,ik=1:nz+2)
+           !$mnh_do_concurrent(ii=1:b_nx,ij=1:b_ny,ik=1:nz+2)
               zb_st(ii,ij,ik-1) = ztab_interiorT_ne_m_haloTout(ii,ij,ik)
            !$mnh_end_do()
            !$acc end kernels           
@@ -2146,11 +2260,12 @@ contains
       end if
       if ( corner_sw ) then
         ! Receive from NW
+#ifndef MPIVIDE
         call mpi_cart_rank(MPI_COMM_HORIZ, &
                            (/p_horiz(1)-stepsize,p_horiz(2)/), &
                            source_rank, &
                            ierr)
-
+#endif
         zb_st => b%st
         
         recv_tag = 1001
@@ -2164,7 +2279,7 @@ contains
                 MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)
            !$acc end host_data
            !$acc kernels present_cr(zb_st,ztab_interiorT_sw_m_haloTout)
-           !$mnh_do_concurrent(ii=1:b_n,ij=1:b_n,ik=1:nz+2)
+           !$mnh_do_concurrent(ii=1:b_nx,ij=1:b_ny,ik=1:nz+2)
               zb_st(ii,ij,ik-1) = ztab_interiorT_sw_m_haloTout(ii,ij,ik)
            !$mnh_end_do()
            !$acc end kernels
@@ -2179,11 +2294,12 @@ contains
       end if
       if ( corner_se ) then
         ! Receive from NW
+#ifndef MPIVIDE
         call mpi_cart_rank(MPI_COMM_HORIZ, &
                            (/p_horiz(1)-stepsize,p_horiz(2)-stepsize/), &
                            source_rank, &
                            ierr)
-
+#endif
         zb_st => b%st
         
         recv_tag = 1002
@@ -2197,7 +2313,7 @@ contains
                 MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr)
            !$acc end host_data
            !$acc kernels present_cr(zb_st,ztab_interiorT_se_m_haloTout)
-           !$mnh_do_concurrent(ii=1:b_n,ij=1:b_n,ik=1:nz+2)
+           !$mnh_do_concurrent(ii=1:b_nx,ij=1:b_ny,ik=1:nz+2)
               zb_st(ii,ij,ik-1) = ztab_interiorT_se_m_haloTout(ii,ij,ik)
            !$mnh_end_do()
            !$acc end kernels
diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/datatypes.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/datatypes.f90
index 0a99735ec1f269d2ef824843bf861c55d71b5dc3..fddc325865d6341b19de414d7dffc746c0ca6d53 100644
--- a/src/ZSOLVER/tensorproductmultigrid_Source/datatypes.f90
+++ b/src/ZSOLVER/tensorproductmultigrid_Source/datatypes.f90
@@ -144,9 +144,18 @@ private
 
     real(kind=rl) , dimension(:,:,:), pointer , contiguous :: zphi_st
 
+    integer :: pproc,pprocx,pprocy
+    integer :: nlocalx,nlocaly
+
     phi%grid_param = grid_param
     call mpi_comm_size(comm_horiz, nproc, ierr)
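+    ! Recover the x/y decomposition exponents from the communicator size, as in comm_initialise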
+    pproc = nint(log(1.0d0*nproc)/log(2.0d0))
+    pprocx = pproc / 2
+    pprocy = pproc - pprocx
+    pproc = pprocx
     nlocal = grid_param%n/sqrt(1.0*nproc)
+    nlocaly = grid_param%n/2**pprocy
+    nlocalx = grid_param%n/2**pprocx
 
     ! Work out position in 2d processor grid
     call mpi_comm_rank(comm_horiz, rank, ierr)
@@ -154,10 +163,10 @@ private
     ! Set local data ranges
     ! NB: p_horiz stores (py,px) in that order (see comment in
     ! communication module)
-    phi%iy_min = p_horiz(1)*nlocal + 1
-    phi%iy_max = (p_horiz(1)+1)*nlocal
-    phi%ix_min = p_horiz(2)*nlocal + 1
-    phi%ix_max = (p_horiz(2)+1)*nlocal
+    phi%iy_min = p_horiz(1)*nlocaly + 1
+    phi%iy_max = (p_horiz(1)+1)*nlocaly
+    phi%ix_min = p_horiz(2)*nlocalx + 1
+    phi%ix_max = (p_horiz(2)+1)*nlocalx
     ! Set computational ranges. Note that these are different at
     ! the edges of the domain!
     if (p_horiz(1) == 0) then
@@ -165,20 +174,20 @@ private
     else
       phi%icompy_min = 1 - (halo_size - 1)
     end if
-    if (p_horiz(1) == floor(sqrt(1.0_rl*nproc))-1) then
-      phi%icompy_max = nlocal
+    if (p_horiz(1) == 2**pprocy-1) then
+      phi%icompy_max = nlocaly
     else
-      phi%icompy_max = nlocal + (halo_size - 1)
+      phi%icompy_max = nlocaly + (halo_size - 1)
     end if
     if (p_horiz(2) == 0) then
       phi%icompx_min = 1
     else
       phi%icompx_min = 1 - (halo_size - 1)
     end if
-    if (p_horiz(2) == floor(sqrt(1.0_rl*nproc))-1) then
-      phi%icompx_max = nlocal
+    if (p_horiz(2) == 2**pprocx-1) then
+      phi%icompx_max = nlocalx
     else
-      phi%icompx_max = nlocal + (halo_size - 1)
+      phi%icompx_max = nlocalx + (halo_size - 1)
     end if
     ! Set halo size
     phi%halo_size = halo_size
@@ -192,8 +201,8 @@ private
     phi%s(:,:,:) = 0.0_rl
     end if
     if (LUseT) then
-    allocate(zphi_st(1-halo_size:nlocal+halo_size, &
-                    1-halo_size:nlocal+halo_size, &
+    allocate(zphi_st(1-halo_size:nlocalx+halo_size, &
+                    1-halo_size:nlocaly+halo_size, &
                     0:grid_param%nz+1))
     !$acc enter data create (zphi_st)
     phi%st => zphi_st
@@ -483,7 +492,11 @@ private
     integer :: rank, nproc, ierr
     character(len=21) :: s
 
+    integer :: nlocalx,nlocaly
+
     nlocal = phi%ix_max-phi%ix_min+1
+    nlocalx = phi%ix_max-phi%ix_min+1
+    nlocaly = phi%iy_max-phi%iy_min+1
 
     ! Get number of processes and rank
     call mpi_comm_size(comm_horiz, nproc, ierr)
@@ -523,8 +536,8 @@ private
     if (LUseT) then
        !$acc update host(phi%st)
        do iz=0,phi%grid_param%nz+1
-          do iy=1-phi%halo_size,nlocal+phi%halo_size
-             do ix=1-phi%halo_size,nlocal+phi%halo_size
+          do iy=1-phi%halo_size,nlocaly+phi%halo_size
+             do ix=1-phi%halo_size,nlocalx+phi%halo_size
                 write(file_id,'(E24.15)') phi%st(ix,iy,iz)
              end do
           end do
diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90
index edbe5ebfc57b22b3c2543f9445bb52b0d28a70fe..4929ad8a52862008152fea1c326c35d71b5356b1 100644
--- a/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90
+++ b/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90
@@ -926,6 +926,8 @@ end subroutine construct_vertical_coeff
     integer :: halo_size
     integer :: nlocal, nz
 
+    integer :: nlocalx,nlocaly
+
 #ifdef MEASURESMOOTHINGRATE
     r%ix_min = u%ix_min
     r%ix_max = u%ix_max
@@ -939,6 +941,8 @@ end subroutine construct_vertical_coeff
     r%isactive = u%isactive
     r%grid_param = u%grid_param
     nlocal = r%ix_max-r%ix_min+1
+    nlocalx = r%ix_max-r%ix_min+1
+    nlocaly = r%iy_max-r%iy_min+1
     halo_size = r%halo_size
     nz = r%grid_param%nz
 
@@ -949,8 +953,8 @@ end subroutine construct_vertical_coeff
     end if
 
     if (LUseT) then
-       allocate(r%st(1-halo_size:nlocal+halo_size, &
-                     1-halo_size:nlocal+halo_size, &
+       allocate(r%st(1-halo_size:nlocalx+halo_size, &
+                     1-halo_size:nlocaly+halo_size, &
                      0:nz+1))
        !$acc enter data create (r%st)
     end if
@@ -1087,6 +1091,8 @@ end subroutine construct_vertical_coeff
     integer :: iblock
     logical :: overlap_comms
 
+    integer :: nlocalx,nlocaly
+
     call boundary_mnh(u)
 
     ordering = smoother_param%ordering
@@ -1100,6 +1106,8 @@ end subroutine construct_vertical_coeff
     allocate(c(nz))
     allocate(utmp(nz))
     nlocal = u%ix_max-u%ix_min+1
+    nlocalx = u%ix_max-u%ix_min+1
+    nlocaly = u%iy_max-u%iy_min+1
 #ifdef OVERLAPCOMMS
     overlap_comms = (nlocal > 2)
 #else
@@ -1107,37 +1115,37 @@ end subroutine construct_vertical_coeff
 #endif
     ! Block 1 (N)
     ixmin(1) = 1
-    ixmax(1) = nlocal
+    ixmax(1) = nlocalx
     iymin(1) = 1
     iymax(1) = 1
     ! Block 2 (S)
     ixmin(2) = 1
-    ixmax(2) = nlocal
-    iymin(2) = nlocal
-    iymax(2) = nlocal
+    ixmax(2) = nlocalx
+    iymin(2) = nlocaly
+    iymax(2) = nlocaly
     ! Block 3 (W)
     ixmin(3) = 1
     ixmax(3) = 1
     iymin(3) = 2
-    iymax(3) = nlocal-1
+    iymax(3) = nlocaly-1
     ! Block 4 (E)
-    ixmin(4) = nlocal
-    ixmax(4) = nlocal
+    ixmin(4) = nlocalx
+    ixmax(4) = nlocalx
     iymin(4) = 2
-    iymax(4) = nlocal-1
+    iymax(4) = nlocaly-1
     ! Block 5 (INTERIOR)
     if (overlap_comms) then
       ixmin(5) = 2
-      ixmax(5) = nlocal-1
+      ixmax(5) = nlocalx-1
       iymin(5) = 2
-      iymax(5) = nlocal-1
+      iymax(5) = nlocaly-1
     else
       ! If there are no interior cells, do not overlap
       ! communications and calculations, just loop over interior cells
       ixmin(5) = 1
-      ixmax(5) = nlocal
+      ixmax(5) = nlocalx
       iymin(5) = 1
-      iymax(5) = nlocal
+      iymax(5) = nlocaly
     end if
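+    ! Blocks 1-4 are the one-cell-wide boundary strips (N/S at constant y, W/E at constant x);
+    ! block 5 is the interior, which can be processed while halo communication is in flight
+    ! when overlap_comms is enabled.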
     dix = +1
     diy = +1
@@ -1474,21 +1482,24 @@ end subroutine construct_vertical_coeff
     integer :: nib,nie,njb,nje,nkb,nke
     integer :: ii,ij,ik
     integer :: iindex
-    integer :: n_lin 
+    integer :: n_lin
+
+    integer :: nlocalx,nlocaly
 
     !
     !  init size , param
     !
     nz = u%grid_param%nz
     nlocal = u%ix_max -u%ix_min + 1
+    nlocalx = u%ix_max -u%ix_min + 1
+    nlocaly = u%iy_max -u%iy_min + 1
     halo_size = u%halo_size
-    n_lin = (nlocal+2*halo_size)**2 * (nz+2)
+    n_lin = (nlocalx+2*halo_size) * (nlocaly+2*halo_size) * (nz+2)
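+    ! (all local points, including the halo_size-wide halo in x and y and the k=0 / k=nz+1 levels)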
 
- 
     nib = 1-halo_size
-    nie = nlocal+halo_size
+    nie = nlocalx+halo_size
     njb = 1-halo_size
-    nje = nlocal+halo_size
+    nje = nlocaly+halo_size
     nkb = 0
     nke = nz + 1
     
@@ -1559,9 +1570,16 @@ end subroutine construct_vertical_coeff
           Tjacobi(level)%Sutmp%st => zSutmp_st
        end if
     end if
-
+
+#ifdef MG_DEBUG
+    if (i_am_master_mpi)  write(STDOUT,*) ' ====== begin boundary_mnh ====== level=',level
+    call print_scalaprod2(level , m  , u  , "Befor boundary_mnh u=" )
+#endif
     call boundary_mnh(u)
-
+#ifdef MG_DEBUG
+    call print_scalaprod2(level , m  , u  , "After boundary_mnh u=" )
+#endif
+
 #ifdef OVERLAPCOMMS
     overlap_comms = (nlocal > 2)
 #else
@@ -1570,37 +1588,37 @@ end subroutine construct_vertical_coeff
 
     ! Block 1 (N)
     ixmin(1) = 1
-    ixmax(1) = nlocal
+    ixmax(1) = nlocalx
     iymin(1) = 1
     iymax(1) = 1
     ! Block 2 (S)
     ixmin(2) = 1
-    ixmax(2) = nlocal
-    iymin(2) = nlocal
-    iymax(2) = nlocal
+    ixmax(2) = nlocalx
+    iymin(2) = nlocaly
+    iymax(2) = nlocaly
     ! Block 3 (W)
     ixmin(3) = 1
     ixmax(3) = 1
     iymin(3) = 2
-    iymax(3) = nlocal-1
+    iymax(3) = nlocaly-1
     ! Block 4 (E)
-    ixmin(4) = nlocal
-    ixmax(4) = nlocal
+    ixmin(4) = nlocalx
+    ixmax(4) = nlocalx
     iymin(4) = 2
-    iymax(4) = nlocal-1
+    iymax(4) = nlocaly-1
     ! Block 5 (INTERIOR)
     if (overlap_comms) then
       ixmin(5) = 2
-      ixmax(5) = nlocal-1
+      ixmax(5) = nlocalx-1
       iymin(5) = 2
-      iymax(5) = nlocal-1
+      iymax(5) = nlocaly-1
     else
       ! If there are no interior cells, do not overlap
       ! communications and calculations, just loop over interior cells
       ixmin(5) = 1
-      ixmax(5) = nlocal
+      ixmax(5) = nlocalx
       iymin(5) = 1
-      iymax(5) = nlocal
+      iymax(5) = nlocaly
     end if
 
     ! Temporary vector 
diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/multigrid.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/multigrid.f90
index b72b933725bbe147bc38280112bbe9d0a640f866..217ad17235bfe0de7b88583e7570af4fcb6c6a82 100644
--- a/src/ZSOLVER/tensorproductmultigrid_Source/multigrid.f90
+++ b/src/ZSOLVER/tensorproductmultigrid_Source/multigrid.f90
@@ -174,6 +174,8 @@ contains
 
     real , dimension(:,:,:) , pointer , contiguous :: zxu_mg_st,zxb_mg_st,zxr_mg_st
 
+    integer :: nlocalx,nlocaly
+
     Gmg_initialized = .TRUE.
     
     if (i_am_master_mpi) &
@@ -214,14 +216,16 @@ contains
     allocate(xr_mg(mg_param%n_lev,0:pproc))
     n = grid_param%n
     nlocal = n/(2**pproc)
+    nlocalx = n/(2**pprocx)
+    nlocaly = n/(2**pprocy)
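+    ! nlocalx and nlocaly differ when pprocx /= pprocy, e.g. pprocx=1, pprocy=2 gives nlocalx=n/2, nlocaly=n/4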
     nz = grid_param%nz
     L = grid_param%L
     H = grid_param%H
     level = mg_param%n_lev
-    m = pproc
+    m = pprocx ! was: m = pproc
     reduced_m = .false.
     !
-    call zt1d_discretisation_init(nlocal,nlocal,nz)
+    call zt1d_discretisation_init(nlocalx,nlocaly,nz)
     !
     ! Work out local processor coordinates (this is necessary to identify
     ! global coordinates)
@@ -236,8 +240,8 @@ contains
       if (i_am_master_mpi) &
         write(STDOUT, &
           '(" Local gridsize (x,y,z) on level ",I3," m = ",I4," : ",I8," x ",I8," x ",I8)') &
-          level, m, nlocal, nlocal, nz
-      if (nlocal < 1) then
+          level, m, nlocalx, nlocaly, nz
+      if ( (nlocalx < 1) .or. (nlocaly < 1) ) then
         call fatalerror('Number of grid points < 1')
       end if
 
@@ -254,16 +258,16 @@ contains
         icompx_min = 1 - (halo_size - 1)
       end if
 
-      if (p_horiz(1) == 2**pproc-1) then
-        icompy_max = nlocal
+      if (p_horiz(1) == 2**pprocy-1) then
+        icompy_max = nlocaly
       else
-        icompy_max = nlocal + (halo_size - 1)
+        icompy_max = nlocaly + (halo_size - 1)
       end if
 
-      if (p_horiz(2) == 2**pproc-1) then
-        icompx_max = nlocal
+      if (p_horiz(2) == 2**pprocx-1) then
+        icompx_max = nlocalx
       else
-        icompx_max = nlocal + (halo_size - 1)
+        icompx_max = nlocalx + (halo_size - 1)
       end if
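+      ! the last rank in each direction stops its compute range at nlocaly/nlocalx; other ranks extend it by halo_size-1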
 
       ! Allocate data
@@ -283,20 +287,20 @@ contains
       endif
 
       if (LUseT) then
-         allocate(zxu_mg_st(1-halo_size:nlocal+halo_size, &
-              1-halo_size:nlocal+halo_size, &
+         allocate(zxu_mg_st(1-halo_size:nlocalx+halo_size, &
+              1-halo_size:nlocaly+halo_size, &
               0:nz+1))
          !$acc enter data create (zxu_mg_st)
          xu_mg(level,m)%st => zxu_mg_st
          
-         allocate(zxb_mg_st(1-halo_size:nlocal+halo_size, &
-              1-halo_size:nlocal+halo_size, &
+         allocate(zxb_mg_st(1-halo_size:nlocalx+halo_size, &
+              1-halo_size:nlocaly+halo_size, &
               0:nz+1))
          !$acc enter data create (zxb_mg_st)
          xb_mg(level,m)%st => zxb_mg_st
          
-         allocate(zxr_mg_st(1-halo_size:nlocal+halo_size, &
-              1-halo_size:nlocal+halo_size, &
+         allocate(zxr_mg_st(1-halo_size:nlocalx+halo_size, &
+              1-halo_size:nlocaly+halo_size, &
               0:nz+1))
          !$acc enter data create (zxr_mg_st)
          xr_mg(level,m)%st => zxr_mg_st
@@ -311,10 +315,10 @@ contains
       ! NB: 1st coordinate is in the y-direction of the processor grid,
       ! second coordinate is in the x-direction (see comments in
       ! communication module)
-      iy_min = (p_horiz(1)/2**(pproc-m))*nlocal+1
-      iy_max = (p_horiz(1)/2**(pproc-m)+1)*nlocal
-      ix_min = p_horiz(2)/2**(pproc-m)*nlocal+1
-      ix_max = (p_horiz(2)/2**(pproc-m)+1)*nlocal
+      iy_min = (p_horiz(1)/2**(pproc-m))*nlocaly+1
+      iy_max = (p_horiz(1)/2**(pproc-m)+1)*nlocaly
+      ix_min = p_horiz(2)/2**(pproc-m)*nlocalx+1
+      ix_max = (p_horiz(2)/2**(pproc-m)+1)*nlocalx
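+      ! the integer division by 2**(pproc-m) makes groups of neighbouring ranks share the
+      ! same global index range on the reduced levels (m < pproc)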
       ! Set grid parameters and local data ranges
       ! Note that only n (and possibly nz) change as we
       ! move down the levels
@@ -365,8 +369,8 @@ contains
 
       ! Are these grids active?
       if ( (m == pproc) .or. &
-           ( (mod(p_horiz(1),2**(pproc-m)) == 0) .and. &
-             (mod(p_horiz(2),2**(pproc-m)) == 0) ) ) then
+           ( (mod(p_horiz(1),2**(pprocx-m)) == 0) .and. &
+             (mod(p_horiz(2),2**(pprocx-m)) == 0) ) ) then
         grid_active = .true.
       else
         grid_active = .false.
@@ -395,12 +399,16 @@ contains
         reduced_m = .true.
         m = m-1
         nlocal = 2*nlocal
+        nlocalx = 2*nlocalx
+        nlocaly = 2*nlocaly
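+        ! when the grid is collected onto fewer ranks (m -> m-1) the local extents double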
         cycle
       end if
       reduced_m = .false.
       level = level-1
       n = n/2
       nlocal = nlocal/2
+      nlocalx = nlocalx/2
+      nlocaly = nlocaly/2
     end do
     if (i_am_master_mpi) &
       write(STDOUT,*) ''
@@ -663,6 +671,8 @@ contains
     integer :: ierr
     integer :: iblock
 
+    integer :: nlocalx,nlocaly
+
     ! Needed for interpolation matrix
 #ifdef PIECEWISELINEAR
 #else
@@ -676,6 +686,8 @@ contains
 #endif
 
     nlocal = phicoarse%ix_max-phicoarse%ix_min+1
+    nlocalx = phicoarse%ix_max-phicoarse%ix_min+1
+    nlocaly = phicoarse%iy_max-phicoarse%iy_min+1
     n = phicoarse%grid_param%n
     nz = phicoarse%grid_param%nz
 
@@ -686,37 +698,37 @@ contains
 #endif
     ! Block 1 (N)
     ixmin(1) = 1
-    ixmax(1) = nlocal
+    ixmax(1) = nlocalx
     iymin(1) = 1
     iymax(1) = 1
     ! Block 2 (S)
     ixmin(2) = 1
-    ixmax(2) = nlocal
-    iymin(2) = nlocal
-    iymax(2) = nlocal
+    ixmax(2) = nlocalx
+    iymin(2) = nlocaly
+    iymax(2) = nlocaly
     ! Block 3 (W)
     ixmin(3) = 1
     ixmax(3) = 1
     iymin(3) = 2
-    iymax(3) = nlocal-1
+    iymax(3) = nlocaly-1
     ! Block 4 (E)
-    ixmin(4) = nlocal
-    ixmax(4) = nlocal
+    ixmin(4) = nlocalx
+    ixmax(4) = nlocalx
     iymin(4) = 2
-    iymax(4) = nlocal-1
+    iymax(4) = nlocaly-1
     ! Block 5 (INTERIOR)
     if (overlap_comms) then
       ixmin(5) = 2
-      ixmax(5) = nlocal-1
+      ixmax(5) = nlocalx-1
       iymin(5) = 2
-      iymax(5) = nlocal-1
+      iymax(5) = nlocaly-1
     else
       ! If there are no interior cells, do not overlap
       ! communications and calculations, just loop over interior cells
       ixmin(5) = 1
-      ixmax(5) = nlocal
+      ixmax(5) = nlocalx
       iymin(5) = 1
-      iymax(5) = nlocal
+      iymax(5) = nlocaly
     end if
 
     ! *** Constant prolongation or (tri-) linear prolongation ***
@@ -1252,9 +1264,18 @@ contains
       ! Perform n_presmooth smoothing steps
       call start_timer(t_smooth(level,m))
       call start_timer(t_total(level,m))
+#ifdef MG_DEBUG
+      if (i_am_master_mpi)  write(STDOUT,*) ' ====== begin  smooth_mnh DIRECTION_FORWARD ====== level=',level
+      call print_scalaprod2(level   , m  , b(level  ,m)  , "Befor smooth_mnh b(level  ,m  )=" )
+      call print_scalaprod2(level   , m  , u(level  ,m)  , "Befor smooth_mnh u(level  ,m  )=" )            
+#endif
       call smooth_mnh(level,m,mg_param%n_presmooth, &
                   DIRECTION_FORWARD, &
                   b(level,m),u(level,m))
+#ifdef MG_DEBUG
+      call print_scalaprod2(level   , m  , b(level  ,m)  , "After smooth_mnh b(level  ,m  )=" )
+      call print_scalaprod2(level   , m  , u(level  ,m)  , "After smooth_mnh u(level  ,m  )=" )                  
+#endif
       call finish_timer(t_total(level,m))
       call finish_timer(t_smooth(level,m))
       ! Calculate residual
@@ -1266,15 +1287,27 @@ contains
       ! Restrict residual
       call start_timer(t_restrict(level,m))
       call start_timer(t_total(level,m))
+#ifdef MG_DEBUG
+      if (i_am_master_mpi)  write(STDOUT,*) ' ====== begin  restrict_mnh ====== level=',level
+      call print_scalaprod2(level-1 , m  , b(level-1,m)  , "Befor restrict_mnh b(level-1,m  )=" )
+      call print_scalaprod2(level   , m  , r(level  ,m)  , "Befor restrict_mnh r(level  ,m  )=" )      
+#endif
       call restrict_mnh(r(level,m),b(level-1,m))
+#ifdef MG_DEBUG
+      call print_scalaprod2(level-1 , m  , b(level-1,m)  , "After restrict_mnh b(level-1,m  )=" )
+      call print_scalaprod2(level   , m  , r(level  ,m)  , "After restrict_mnh r(level  ,m  )=" )
+#endif
       call finish_timer(t_total(level,m))
       call finish_timer(t_restrict(level,m))
       if ( ((level-1) .le. splitlevel) .and. (m > 0) ) then
         ! Collect data on coarser level
         call start_timer(t_total(level,m))
         call collect(level-1,m,b(level-1,m),b(level-1,m-1))
-!!$        call print_scalaprod2(level-1, m  , b(level-1,m)  , "After collect b(level-1,m  )=" )
-!!$        call print_scalaprod2(level-1, m-1, b(level-1,m-1), "After collect b(level-1,m-1)=" )
+#ifdef MG_DEBUG
+        if (i_am_master_mpi)  write(STDOUT,*) ' ====== after collect ====== level-1=',level-1
+        call print_scalaprod2(level-1, m  , b(level-1,m)  , "After collect b(level-1,m  )=" )
+        call print_scalaprod2(level-1, m-1, b(level-1,m-1), "After collect b(level-1,m-1)=" )
+#endif
         call finish_timer(t_total(level,m))
         ! Set initial solution on coarser level to zero (incl. halos!)
         if (LUseO) u(level-1,m-1)%s(:,:,:) = 0.0_rl
@@ -1289,8 +1322,11 @@ contains
         ! Distribute data on coarser grid
         call start_timer(t_total(level,m))
         call distribute(level-1,m,u(level-1,m-1),u(level-1,m))
-!!$        call print_scalaprod2(level-1, m  , u(level-1,m)  , "After distribute u(level-1,m  )=" )
-!!$        call print_scalaprod2(level-1, m-1, u(level-1,m-1), "After distribute u(level-1,m-1)=" )       
+#ifdef MG_DEBUG
+        if (i_am_master_mpi)  write(STDOUT,*) ' ====== after distribute ====== level-1=',level-1
+        call print_scalaprod2(level-1, m  , u(level-1,m)  , "After distribute u(level-1,m  )=" )
+        call print_scalaprod2(level-1, m-1, u(level-1,m-1), "After distribute u(level-1,m-1)=" )       
+#endif
         call finish_timer(t_total(level,m))
       else
         ! Set initial solution on coarser level to zero (incl. halos!)
@@ -1320,13 +1356,31 @@ contains
       ! Prolongate error
       call start_timer(t_prolongate(level,m))
       call start_timer(t_total(level,m))
+
+#ifdef MG_DEBUG
+      if (i_am_master_mpi)  write(STDOUT,*) ' ====== begin haloswap_mnh ====== level-1=',level-1
+      call print_scalaprod2(level-1 , m  , u(level-1,m)  , "Befor haloswap_mnh u(level-1,m  )=" )
+#endif
       call haloswap_mnh(level-1,m,u(level-1,m))
+#ifdef MG_DEBUG
+      call print_scalaprod2(level-1 , m  , u(level-1,m)  , "After haloswap_mnh u(level-1,m  )=" )
+
+      if (i_am_master_mpi)  write(STDOUT,*) ' ====== begin boundary_mnh ====== level-1=',level-1
+      call print_scalaprod2(level-1 , m  , u(level-1,m)  , "Befor boundary_mnh u(level-1,m  )=" )
+#endif
       call boundary_mnh(u(level-1,m))
-!!$      call print_scalaprod2(level-1 , m  , u(level-1,m)  , "Befor prolongate_mnh u(level-1,m  )=" )
-!!$      call print_scalaprod2(level   , m  , r(level  ,m)  , "Befor prolongate_mnh r(level  ,m  )=" )
+#ifdef MG_DEBUG
+      call print_scalaprod2(level-1 , m  , u(level-1,m)  , "After boundary_mnh u(level-1,m  )=" )
+
+      if (i_am_master_mpi)  write(STDOUT,*) ' ====== begin prolongate_mnh ====== level=',level
+      call print_scalaprod2(level-1 , m  , u(level-1,m)  , "Befor prolongate_mnh u(level-1,m  )=" )
+      call print_scalaprod2(level   , m  , r(level  ,m)  , "Befor prolongate_mnh r(level  ,m  )=" )
+#endif
       call prolongate_mnh(level,m,u(level-1,m),r(level,m))
-!!$      call print_scalaprod2(level-1 , m  , u(level-1,m)  , "After prolongate_mnh u(level-1,m  )=" )
-!!$      call print_scalaprod2(level   , m  , r(level  ,m)  , "After prolongate_mnh r(level  ,m  )=" )
+#ifdef MG_DEBUG
+      call print_scalaprod2(level-1 , m  , u(level-1,m)  , "After prolongate_mnh u(level-1,m  )=" )
+      call print_scalaprod2(level   , m  , r(level  ,m)  , "After prolongate_mnh r(level  ,m  )=" )
+#endif
       call finish_timer(t_total(level,m))
       call finish_timer(t_prolongate(level,m))
       ! Add error to fine grid solution
@@ -1350,7 +1404,7 @@ contains
       call start_timer(t_total(level,m))
       if (mg_param%coarsegridsolver == COARSEGRIDSOLVER_CG) then
         call cg_solve_mnh(level,m,b(level,m),u(level,m))
-!!$        call print_scalaprod2(level, m , u(level,m), "After cg_solve_mnh u=(level,m)" )
+#ifdef MG_DEBUG
+        call print_scalaprod2(level, m , u(level,m), "After cg_solve_mnh u=(level,m)" )
+#endif
       else if (mg_param%coarsegridsolver == COARSEGRIDSOLVER_SMOOTHER) then
         ! Smooth on coarsest level
         call smooth_mnh(level,m, &
@@ -1415,7 +1469,7 @@ contains
       call restrict(r(level,m),b(level-1,m))
       call finish_timer(t_total(level,m))
       call finish_timer(t_restrict(level,m))
-      if ( ((level-1) .le. splitlevel) .and. (m > 0) ) then
+      if ( ((level-1) .le. splitlevel) .and. (m >= 0) ) then
         ! Collect data on coarser level
         call start_timer(t_total(level,m))
         call collect(level-1,m,b(level-1,m),b(level-1,m-1))
@@ -1500,7 +1554,7 @@ contains
         call haloswap(level,m,u(level,m))
       end do
       call mpi_barrier(MPI_COMM_HORIZ,ierr)
-      if ( ((level-1) .le. splitlevel) .and. (m > 0) ) then
+      if ( ((level-1) .le. splitlevel) .and. (m >= 0) ) then
         call mpi_barrier(MPI_COMM_HORIZ,ierr)
         do i=1,nhaloswap
           call haloswap(level-1,m,u(level-1,m))
@@ -1627,7 +1681,7 @@ contains
     if (mg_param%verbose > 0) then
       if (i_am_master_mpi) then
         write(STDOUT,'(" *** Multigrid solver ***")')
-        write(STDOUT,'(" <MG> Initial residual : ",E10.5)') res_initial
+        write(STDOUT,'(" <MG> Initial residual : ",E23.10)') res_initial
       end if
     end if
     if (mg_param%verbose > 0) then