diff --git a/tensorproductmultigrid_Source/communication.f90 b/tensorproductmultigrid_Source/communication.f90
index ed3b86f9ebfa78bcfdb97b97b64409f1421055b0..8fc5fe85790e5dcf21ee8015d422a4522a19c91d 100644
--- a/tensorproductmultigrid_Source/communication.f90
+++ b/tensorproductmultigrid_Source/communication.f90
@@ -47,6 +47,7 @@ module communication
 public::comm_preinitialise
 public::comm_initialise
 public::comm_finalise
+public::scalarprod_mnh
 public::scalarprod
 public::boundary_mnh
 public::haloswap_mnh
@@ -378,6 +379,54 @@ contains
 !==================================================================
 !  Scalar product of two fields
 !==================================================================
+  subroutine scalarprod_mnh(m, a, b, s)
+    implicit none
+    integer, intent(in) :: m
+    type(scalar3d), intent(in) :: a
+    type(scalar3d), intent(in) :: b
+    real(kind=rl), intent(out) :: s
+    integer :: nprocs, rank, ierr
+    integer :: p_horiz(2)
+    integer :: stepsize
+    integer, parameter :: dim_horiz = 2
+    real(kind=rl) :: local_sum, global_sum
+    integer :: nlocal, nz, i
+    real(kind=rl) :: ddot
+
+    nlocal = a%ix_max-a%ix_min+1
+    nz = a%grid_param%nz
+    ! Work out coordinates of processor
+    call mpi_comm_size(MPI_COMM_HORIZ,nprocs,ierr)
+    call mpi_comm_rank(MPI_COMM_HORIZ,rank,ierr)
+    stepsize = 2**(pproc-m)
+    if (nprocs > 1) then
+      ! Only inlcude local sum if the processor coordinates
+      ! are multiples of stepsize
+      call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr)
+      if ( (stepsize == 1) .or.                   &
+           ( (stepsize > 1) .and.                 &
+             (mod(p_horiz(1),stepsize)==0) .and.  &
+             (mod(p_horiz(2),stepsize)==0) ) ) then
+        local_sum = 0.0_rl
+        do i = 1, nlocal
+          local_sum = local_sum &
+                    + ddot((nz+2)*nlocal,a%s(0,1,i),1,b%s(0,1,i),1)
+        end do
+      else
+        local_sum = 0.0_rl
+      end if
+      call mpi_allreduce(local_sum,global_sum,1,MPI_DOUBLE_PRECISION, &
+                         MPI_SUM,MPI_COMM_HORIZ,ierr)
+    else
+      global_sum = 0.0_rl
+      do i = 1, nlocal
+        global_sum = global_sum &
+                   + ddot((nz+2)*nlocal,a%s(0,1,i),1,b%s(0,1,i),1)
+      end do
+    end if
+    s = global_sum
+  end subroutine scalarprod_mnh
+!-------------------------------------------------------------------------------
   subroutine scalarprod(m, a, b, s)
     implicit none
     integer, intent(in) :: m
diff --git a/tensorproductmultigrid_Source/conjugategradient.f90 b/tensorproductmultigrid_Source/conjugategradient.f90
index d0253bf9a93d5febcb8081da1284312a080c7404..320e272c9ffbd58d44279094ad1edcd57656c184 100644
--- a/tensorproductmultigrid_Source/conjugategradient.f90
+++ b/tensorproductmultigrid_Source/conjugategradient.f90
@@ -225,9 +225,9 @@ contains
     if (LUseO) call dcopy(n_lin,z%s,1,p%s,1)
     if (LUseT) call dcopy(n_lin,z%st,1,p%st,1)
     ! rz_old = <r,z>
-    call scalarprod(m,r,z,rz_old)
+    call scalarprod_mnh(m,r,z,rz_old)
     ! res0 <- ||r||
-    call scalarprod(m,r,r,res0)
+    call scalarprod_mnh(m,r,r,res0)
     res0 = dsqrt(res0)
     if (cg_param%verbose > 0) then
       if (i_am_master_mpi) then
@@ -241,7 +241,7 @@ contains
         call haloswap_mnh(level,m,p)
         call apply_mnh(p,Ap)
         ! alpha <- res_old / <p,A.p>
-        call scalarprod(m,p,Ap,pAp)
+        call scalarprod_mnh(m,p,Ap,pAp)
         alpha = rz_old/pAp
         ! x <- x + alpha*p
         if (LUseO) call daxpy(n_lin,alpha,p%s,1,u%s,1)
@@ -249,7 +249,7 @@ contains
         ! r <- r - alpha*A.p
         if (LUseO) call daxpy(n_lin,-alpha,Ap%s,1,r%s,1)
         if (LUseT) call daxpy(n_lin,-alpha,Ap%st,1,r%st,1)
-        call scalarprod(m,r,r,res)
+        call scalarprod_mnh(m,r,r,res)
         res = dsqrt(res)
         if (cg_param%verbose > 1) then
           if (i_am_master_mpi) then
@@ -272,7 +272,7 @@ contains
           if (LUseO) call dcopy(n_lin,r%s,1,z%s,1)
           if (LUseT) call dcopy(n_lin,r%st,1,z%st,1)
         end if
-        call scalarprod(m,r,z,rz)
+        call scalarprod_mnh(m,r,z,rz)
         ! p <- res/res_old*p
         if (LUseO) call dscal(n_lin,rz/rz_old,p%s,1)
         if (LUseT) call dscal(n_lin,rz/rz_old,p%st,1)
diff --git a/tensorproductmultigrid_Source/multigrid.f90 b/tensorproductmultigrid_Source/multigrid.f90
index 5ec2a7a631fbc233e3153807790fb761a1607006..897ac633f0481030fec97ea9d2cd507879fa3bb6 100644
--- a/tensorproductmultigrid_Source/multigrid.f90
+++ b/tensorproductmultigrid_Source/multigrid.f90
@@ -1494,7 +1494,7 @@ contains
       if (LUseT) call dcopy(n_gridpoints,xu_mg(level,pproc)%st,1,pp%st,1)
       ! Calculate rz_old = <pp,b>
       call start_timer(t_scalprod)
-      call scalarprod(pproc,pp,xb_mg(level,pproc),rz_old)
+      call scalarprod_mnh(pproc,pp,xb_mg(level,pproc),rz_old)
       call finish_timer(t_scalprod)
       do iter = 1, maxiter
         ! Apply matrix q <- A.pp
@@ -1503,7 +1503,7 @@ contains
         call finish_timer(t_apply)
         ! Calculate pq <- <pp,q>
         call start_timer(t_scalprod)
-        call scalarprod(pproc,pp,q,pq)
+        call scalarprod_mnh(pproc,pp,q,pq)
         call finish_timer(t_scalprod)
         alpha = rz_old/pq
         ! x <- x + alpha*p
@@ -1539,7 +1539,7 @@ contains
         if (LUseO) call dcopy(n_gridpoints,xu_mg(level,pproc)%s,1,q%s,1)
         if (LUseT) call dcopy(n_gridpoints,xu_mg(level,pproc)%st,1,q%st,1)
         call start_timer(t_scalprod)
-        call scalarprod(pproc,q,xb_mg(level,pproc),rz)
+        call scalarprod_mnh(pproc,q,xb_mg(level,pproc),rz)
         call finish_timer(t_scalprod)
         beta = rz/rz_old
         ! p <- beta*p