diff --git a/tensorproductmultigrid_Source/communication.f90 b/tensorproductmultigrid_Source/communication.f90
index b5b2091857c91d9c08322e01e9b531dddbd1bc6a..e362a6f2d1b59b90967586d9a93d6b3ba6121e8b 100644
--- a/tensorproductmultigrid_Source/communication.f90
+++ b/tensorproductmultigrid_Source/communication.f90
@@ -372,15 +372,24 @@ contains
 !==================================================================
 ! Finalise communication routines
 !==================================================================
-  subroutine comm_finalise(n_lev,   &  ! }
-                           lev_split)  ! } Multigrid parameters
+  subroutine comm_finalise(n_lev,   &  ! } Multigrid parameters
+                           lev_split,  & !}
+                           grid_param )  ! } Grid parameters
     implicit none
     integer, intent(in) :: n_lev
     integer, intent(in) :: lev_split
+    type(grid_parameters), intent(in) :: grid_param
+    ! local var
     logical :: reduced_m
     integer :: level, m
     integer :: ierr
+    integer :: nlocal,n
     character(len=80) :: s
+
+   ! Local size of (horizontal) grid
+    n = grid_param%n
+    nlocal = n/2**pproc
+
     m = pproc
     level = n_lev
     reduced_m = .false.
@@ -400,16 +409,18 @@ contains
       ! --- Free interior data types ---
       if (LUseO) call mpi_type_free(interior(level,m),ierr)
       if (LUseO) call mpi_type_free(sub_interior(level,m),ierr)
-      if (LUseT) call mpi_type_free(interiorT(level,m),ierr)
-      if (LUseT) call mpi_type_free(sub_interiorT(level,m),ierr)
+      if (LUseT .and. (nlocal /= 0 ) ) call mpi_type_free(interiorT(level,m),ierr)
+      if (LUseT .and. ( (nlocal/2) /= 0 ) ) call mpi_type_free(sub_interiorT(level,m),ierr)
       ! If we are below L_split, split data
       if ( (level .le. lev_split) .and. (m > 0) .and. (.not. reduced_m)) then
         reduced_m = .true.
         m = m-1
+        nlocal = 2*nlocal
         cycle
       end if
       reduced_m = .false.
       level = level-1
+      nlocal = nlocal/2
     end do
     do m=pproc,0,-1
       write(s,'("m = ",I3)') m
@@ -445,6 +456,7 @@ contains
     type(scalar3d), intent(in) :: a
     type(scalar3d), intent(in) :: b
     real(kind=rl), intent(out) :: s
+    !local var
     integer :: nprocs, rank, ierr
     integer :: p_horiz(2)
     integer :: stepsize
@@ -455,8 +467,17 @@ contains
     integer :: ix,iy,iz
     real(kind=rl) :: ddot
 
+    integer :: iy_min,iy_max, ix_min,ix_max
+    real , dimension(:,:,:) , pointer :: za_st,zb_st
+
     nlocal = a%ix_max-a%ix_min+1
     nz = a%grid_param%nz
+
+    iy_min = a%iy_min
+    iy_max = a%iy_max
+    ix_min = a%ix_min
+    ix_max = a%ix_max
+
     ! Work out coordinates of processor
     call mpi_comm_size(MPI_COMM_HORIZ,nprocs,ierr)
     call mpi_comm_rank(MPI_COMM_HORIZ,rank,ierr)
@@ -504,15 +525,19 @@ contains
       end do
       endif
       if (LUseT) then
+      za_st => a%st
+      zb_st => b%st
       global_sumt = 0.0_rl
+         !$acc kernels
          do iz=0,nz+1
-            do iy=a%iy_min,a%iy_max
-               do ix=a%ix_min,a%ix_max
+            do iy=iy_min,iy_max
+               do ix=ix_min,ix_max
                   global_sumt = global_sumt &
-                       + a%st(ix,iy,iz)*b%st(ix,iy,iz)
+                       + za_st(ix,iy,iz)*zb_st(ix,iy,iz)
                end do
             end do
          end do
+         !$acc end kernels
       endif
     end if
     if (LUseO) then
@@ -601,6 +626,9 @@ contains
 
     !local var
     integer :: n, ix_min,ix_max,iy_min,iy_max
+    integer :: icompx_max,icompy_max
+
+    real , dimension(:,:,:) , pointer :: za_st
     
     ! Update Real Boundary for Newman case   u(0) = u(1) , etc ...
 
@@ -627,34 +655,36 @@ contains
     endif
     if (LUseT) then
     !  transpose
+
+    za_st => a%st
+    icompx_max = a%icompx_max
+    icompy_max = a%icompy_max
+
+    !$acc kernels
     if ( ix_min == 1 ) then
-       a%st(0,:,:) = a%st(1,:,:)
+       !acc kernels
+       za_st(0,:,:) = za_st(1,:,:)
+       !acc end kernels
     endif
     if ( ix_max == n ) then
-       a%st(a%icompx_max+1,:,:) = a%st(a%icompx_max,:,:)
+       !acc kernels
+       za_st(icompx_max+1,:,:) = za_st(icompx_max,:,:)
+       !acc end kernels
     endif
     if ( iy_min == 1 ) then
-       a%st(:,0,:) = a%st(:,1,:)
+       !acc kernels
+       za_st(:,0,:) = za_st(:,1,:)
+       !acc end kernels
     endif
     if ( iy_max == n ) then
-       a%st(:,a%icompy_max+1,:) = a%st(:,a%icompy_max,:)
+       !acc kernels
+       za_st(:,icompy_max+1,:) = za_st(:,icompy_max,:)
+       !acc end kernels
     endif
+    !$acc end kernels
+   
     endif
-    !
-    ! corner ( inutile , deja réalisé au dessus )
-    !
-!!$    if ( ( ix_min == 1 ) .and. ( iy_min == 1 ) ) then
-!!$       a%s(:,0,0) = a%s(:,1,1) 
-!!$    endif
-!!$    if ( ( ix_min == 1 ) .and. ( iy_max == n ) ) then
-!!$       a%s(:,a%icompy_max+1,0) = a%s(:,a%icompy_max,1)
-!!$    end if
-!!$    if ( ( ix_max == n ) .and. ( iy_min == 1 ) ) then
-!!$       a%s(:,0,a%icompx_max+1) = a%s(:,1,a%icompx_max)
-!!$    end if
-!!$    if ( ( ix_max == n ) .and. ( iy_max == n ) ) then
-!!$     a%s(:,a%icompy_max+1,a%icompx_max+1) = a%s(:,a%icompy_max,a%icompx_max)
-!!$    end if
+
   end subroutine boundary_mnh
 !==================================================================
 !  Initiate asynchronous halo exchange
diff --git a/tensorproductmultigrid_Source/datatypes.f90 b/tensorproductmultigrid_Source/datatypes.f90
index 76dc6445a323529e638ae1eeba5d073be9a44c53..ef71df25c8e791932abd914914712bbeadf38ce2 100644
--- a/tensorproductmultigrid_Source/datatypes.f90
+++ b/tensorproductmultigrid_Source/datatypes.f90
@@ -76,7 +76,7 @@ module datatypes
     logical :: isactive   ! Is this field active, i.e. used on one of the
                           ! active processes on coarser grids?
     real(kind=rl),allocatable :: s(:,:,:)
-    real(kind=rl),allocatable :: st(:,:,:)
+    real(kind=rl),pointer :: st(:,:,:)
     type(grid_parameters) :: grid_param
   end type scalar3d
 
@@ -309,6 +309,7 @@ private
     implicit none
     type(scalar3d), intent(in) :: phi
     logical, optional :: phi_is_volumeintegral
+    !local var
     integer :: ix, iy, iz
     real(kind=rl) :: tmp, global_tmp
     real(kind=rl) :: tmpt, global_tmpt
@@ -318,6 +319,8 @@ private
     logical :: divide_by_volume
     real(kind=rl) :: volume_factor
 
+    real , dimension(:,:,:) , pointer :: zphi_st
+
     nlocalx = phi%ix_max-phi%ix_min+1
     nlocaly = phi%iy_max-phi%iy_min+1
     nz      = phi%grid_param%nz
@@ -337,13 +340,16 @@ private
        end if
        
        if (LUseT) then
+          zphi_st => phi%st
+          !$acc kernels loop collapse(3)
           do iz=1,nz
              do iy=1,nlocaly
                 do ix=1,nlocalx
-                   tmpt = tmpt + phi%st(ix,iy,iz)**2 ! * volume_factor
+                   tmpt = tmpt + zphi_st(ix,iy,iz)**2 ! * volume_factor
                 end do
              end do
           end do
+          !$acc end kernels
  
        end if
     end if
diff --git a/tensorproductmultigrid_Source/dblas.f90 b/tensorproductmultigrid_Source/dblas.f90
index 487c947c77a9589c637aa12799d6ce7822169610..48d5dcdce4f328566fd6e968cd8ba21311c5dd30 100644
--- a/tensorproductmultigrid_Source/dblas.f90
+++ b/tensorproductmultigrid_Source/dblas.f90
@@ -38,6 +38,7 @@ subroutine dcopy(n,sx,incx,sy,incy)
    30 continue
       if( n .lt. 7 ) return
    40 mp1 = m + 1
+      !$acc kernels
       do 50 i = mp1,n,7
         sy(i) = sx(i)
         sy(i + 1) = sx(i + 1)
@@ -47,6 +48,7 @@ subroutine dcopy(n,sx,incx,sy,incy)
         sy(i + 5) = sx(i + 5)
         sy(i + 6) = sx(i + 6)
        50 continue
+      !$acc end kernels
       return
 
 end subroutine dcopy
@@ -94,12 +96,14 @@ SUBROUTINE DAXPY( N, SA, SX, INCX, SY, INCY )
    20       CONTINUE
 	   ENDIF
 !                              ** UNROLL LOOP FOR SPEED
+    !$acc kernels
 	   DO 30  I = M+1, N, 4
 	      SY(I)   = SY(I)   + SA * SX(I)
 	      SY(I+1) = SY(I+1) + SA * SX(I+1)
 	      SY(I+2) = SY(I+2) + SA * SX(I+2)
 	      SY(I+3) = SY(I+3) + SA * SX(I+3)
    30    CONTINUE
+   !$acc end kernels
 
 	ELSE
 !               ** NONEQUAL OR NONPOSITIVE INCREMENTS.
diff --git a/tensorproductmultigrid_Source/discretisation.f90 b/tensorproductmultigrid_Source/discretisation.f90
index 5717460c3b94aebf8db62a45a56851076b789398..7fca1ce705d2beb19dc7d8732596d8fb2b0aaf00 100644
--- a/tensorproductmultigrid_Source/discretisation.f90
+++ b/tensorproductmultigrid_Source/discretisation.f90
@@ -123,10 +123,10 @@ private
 
   ! Data structure for storing the vertical discretisation
   type vertical_coefficients
-    real(kind=rl), allocatable :: a(:)
-    real(kind=rl), allocatable :: b(:)
-    real(kind=rl), allocatable :: c(:)
-    real(kind=rl), allocatable :: d(:)
+    real(kind=rl), pointer :: a(:)
+    real(kind=rl), pointer :: b(:)
+    real(kind=rl), pointer :: c(:)
+    real(kind=rl), pointer :: d(:)
   end type vertical_coefficients
 
   ! Stoarge for vertical coefficients
@@ -553,6 +553,8 @@ end subroutine construct_vertical_coeff
     integer :: ix,iy,iz
     integer ::  iib,iie,ijb,ije,ikb,ike
 
+    real , dimension(:,:,:) , pointer :: zr_st , zb_st
+
     ! r <- A.u
     !call boundary_mnh(u)
     call apply_mnh(u,r)
@@ -581,7 +583,12 @@ end subroutine construct_vertical_coeff
        ije=u%icompy_max
        ikb=1
        ike=u%grid_param%nz
-       r%st(iib:iie,ijb:ije,ikb:ike) = b%st(iib:iie,ijb:ije,ikb:ike) - r%st(iib:iie,ijb:ije,ikb:ike)
+
+       zr_st => r%st
+       zb_st => b%st
+       !$acc kernels
+       zr_st(iib:iie,ijb:ije,ikb:ike) = zb_st(iib:iie,ijb:ije,ikb:ike) - zr_st(iib:iie,ijb:ije,ikb:ike)
+       !$acc end kernels
     endif
 
   end subroutine calculate_residual_mnh
@@ -624,6 +631,10 @@ end subroutine construct_vertical_coeff
     real(kind=rl) :: tmp
     integer :: iib,iie,ijb,ije
 
+    real, dimension(:,:,:) , pointer :: zv_st , zu_st
+    real, dimension(:)     , pointer :: za_k, zb_k, zc_k, zd_k
+    integer :: ii,ij
+
     call boundary_mnh(u)
 
     if (LUseO) then     
@@ -660,63 +671,42 @@ end subroutine construct_vertical_coeff
     endif
     if (LUseT) then 
        call construct_alpha_T_cst_mnh(u%grid_param,alpha_T,Tij)    
-!!$       do iz=1,u%grid_param%nz    
-!!$          do iy=u%icompy_min,u%icompy_max
-!!$             do ix=u%icompx_min,u%icompx_max
-!!$                ! Construct horizontal part of stencil        
-!!$                b_k = vert_coeff%b(iz)
-!!$                c_k = vert_coeff%c(iz)
-!!$                d_k = vert_coeff%d(iz)
-!!$                tmp = ((-b_k-c_k)*Tij ) * u%st(ix,iy,iz)
-!!$                if (iz < grid_param%nz) then
-!!$                   tmp = tmp + b_k*Tij * u%st(ix,iy,iz+1)
-!!$                end if
-!!$                if (iz > 1) then
-!!$                   tmp = tmp + c_k*Tij * u%st(ix,iy,iz-1)
-!!$                end if
-!!$                if ((iz > 1) .and. (iz < grid_param%nz)) then
-!!$                   tmp = tmp - alpha_T(5) * u%st(ix,  iy  ,iz  ) &
-!!$                        + alpha_T(1) * u%st(ix+1,  iy  ,iz) &
-!!$                        + alpha_T(2) * u%st(ix-1,  iy  ,iz) &
-!!$                        + alpha_T(3) * u%st(ix  ,  iy+1,iz) &
-!!$                        + alpha_T(4) * u%st(ix  ,  iy-1,iz)
-!!$                end if
-!!$                v%st(ix,iy,iz) = d_k*tmp 
-!!$             end do
-!!$          end do
-!!$       end do
        !-----------------------------------------------------------
        iib=u%icompx_min
        iie=u%icompx_max
        ijb=u%icompy_min
        ije=u%icompy_max
        !
+       zv_st => v%st
+       zu_st => u%st
+       zb_k => vert_coeff%b
+       zc_k => vert_coeff%c
+       zd_k => vert_coeff%d
+
+       !$acc kernels
        iz=1
-       b_k = vert_coeff%b(iz)
-       c_k = vert_coeff%c(iz)
-       d_k = vert_coeff%d(iz)
-       v%st(iib:iie,ijb:ije,iz) = d_k* ( (-b_k-c_k)*Tij * u%st(iib:iie,ijb:ije,iz  )  &
-                                          +b_k     *Tij * u%st(iib:iie,ijb:ije,iz+1)  )
+       zv_st(iib:iie,ijb:ije,iz) = zd_k(iz)* ( (-zb_k(iz)-zc_k(iz))*Tij * zu_st(iib:iie,ijb:ije,iz  )  &
+                                               +zb_k(iz)          *Tij * zu_st(iib:iie,ijb:ije,iz+1)  )
        !
        do iz=2,u%grid_param%nz-1    
-          b_k = vert_coeff%b(iz)
-          c_k = vert_coeff%c(iz)
-          d_k = vert_coeff%d(iz)
-               v%st(iib:iie,ijb:ije,iz) = d_k*  ( ((-b_k-c_k)*Tij - 4.0_rl ) * u%st(iib:iie,ijb:ije,iz)    &
-                                                    +b_k     *Tij                * u%st(iib:iie,ijb:ije,iz+1)  &
-                                                        +c_k *Tij                * u%st(iib:iie,ijb:ije,iz-1)  &
-                                                                  +                u%st(iib+1:iie+1,ijb:ije,iz) &
-                                                                  +                u%st(iib-1:iie-1,ijb:ije,iz) &
-                                                                  +                u%st(iib:iie,ijb+1:ije+1,iz) &
-                                                                  +                u%st(iib:iie,ijb-1:ije-1,iz) )
+          do ij=ijb,ije
+             do ii=iib,iie
+                zv_st(ii,ij,iz) = zd_k(iz)*  ( ((-zb_k(iz)-zc_k(iz))*Tij - 4.0_rl ) * zu_st(ii,ij,iz)    &
+                                                +zb_k(iz)          *Tij            * zu_st(ii,ij,iz+1)  &
+                                                         +zc_k(iz) *Tij            * zu_st(ii,ij,iz-1)  &
+                                           +                                         zu_st(ii+1,ij,iz) &
+                                           +                                         zu_st(ii-1,ij,iz) &
+                                           +                                         zu_st(ii,ij+1,iz) &
+                                           +                                         zu_st(ii,ij-1,iz) &
+                                       )
+             end do
+          end do
        end do
        !
        iz=u%grid_param%nz        
-       b_k = vert_coeff%b(iz)
-       c_k = vert_coeff%c(iz)
-       d_k = vert_coeff%d(iz) 
-       v%st(iib:iie,ijb:ije,iz) = d_k*  (  (-b_k-c_k)*Tij  * u%st(iib:iie,ijb:ije,iz)    &
-                                                +c_k *Tij  * u%st(iib:iie,ijb:ije,iz-1)  )
+       zv_st(iib:iie,ijb:ije,iz) = zd_k(iz)*  (  (-zb_k(iz)-zc_k(iz))*Tij  * zu_st(iib:iie,ijb:ije,iz)    &
+                                                          +zc_k(iz) *Tij  * zu_st(iib:iie,ijb:ije,iz-1)  )
+       !$acc end kernels
     endif
     
   end subroutine apply_mnh
@@ -1315,6 +1305,8 @@ end subroutine construct_vertical_coeff
     integer :: iib,iie,ijb,ije
     type(scalar3d) :: Sr,Sc,Sut0,Sutmp
 
+    real , dimension(:,:,:) , pointer :: zSut0_st , zu_st
+
     call boundary_mnh(u)
 
     ! Set optimal smoothing parameter on each level
@@ -1374,7 +1366,12 @@ end subroutine construct_vertical_coeff
                             1-halo_size:nlocal+halo_size,   &
                             0:u%grid_param%nz+1) )
     if (LUseO) u0(:,:,:) = u%s(:,:,:)
-    if (LUseT) ut0(:,:,:) = u%st(:,:,:)
+    if (LUseT) then
+       zu_st =>  u%st
+       !$acc kernels
+       ut0(:,:,:) = zu_st(:,:,:)
+       !$acc end kernels
+    end if
     ! Create residual vector
     allocate(r(nz))
     ! Allocate memory for auxiliary vectors for Thomas algorithm
@@ -1384,16 +1381,21 @@ end subroutine construct_vertical_coeff
        allocate(Sr%st(1-halo_size:nlocal+halo_size,   &
                             1-halo_size:nlocal+halo_size,   &
                             0:u%grid_param%nz+1) )
-!!$       allocate(Sc%st(1-halo_size:nlocal+halo_size,   &
-!!$                            1-halo_size:nlocal+halo_size,   &
-!!$                            0:u%grid_param%nz+1) )
        allocate(Sut0%st(1-halo_size:nlocal+halo_size,   &
                             1-halo_size:nlocal+halo_size,   &
                             0:u%grid_param%nz+1) )
-       Sut0%st(:,:,:) = u%st(:,:,:)
+
+       zSut0_st => Sut0%st
+       zu_st => u%st
+
+       !$acc kernels
+       zSut0_st(:,:,:) = zu_st(:,:,:)
+       !$acc end kernels
+
        allocate(Sutmp%st(1-halo_size:nlocal+halo_size,   &
                             1-halo_size:nlocal+halo_size,   &
                             0:u%grid_param%nz+1) )
+       
     endif
 
     ! Loop over grid
@@ -1433,6 +1435,8 @@ end subroutine construct_vertical_coeff
     implicit none
     integer, intent(in) :: iblock
     integer :: ix,iy,iz
+
+    real , dimension(:,:,:) , pointer :: zu_st , zSutmp_st , zSut0_st
     
     if (LUseO) then
        do ix=ixmin(iblock),ixmax(iblock)
@@ -1451,30 +1455,23 @@ end subroutine construct_vertical_coeff
        end do
     end if
    if (LUseT) then
-!!$       do ix=ixmin(iblock),ixmax(iblock)
-!!$          do iy=iymin(iblock),iymax(iblock)
-!!$             call apply_tridiag_solve_mnhT(ix,iy,r,c,b,        &
-!!$                  ut0(ix+1,iy  ,1:nz), &
-!!$                  ut0(ix-1,iy  ,1:nz), &
-!!$                  ut0(ix  ,iy+1,1:nz), &
-!!$                  ut0(ix  ,iy-1,1:nz), &
-!!$                  utmp)
-!!$             ! Add correction
-!!$             do iz=1,nz
-!!$                u%st(ix,iy,iz) = rho*utmp(iz) + (1.0_rl-rho)*ut0(ix,iy,iz)
-!!$             end do
-!!$          end do
-!!$       end do
        iib=ixmin(iblock)
        iie=ixmax(iblock)
        ijb=iymin(iblock)
        ije=iymax(iblock)
+
+       zu_st => u%st
+       zSutmp_st => Sutmp%st
+       zSut0_st => Sut0%st
+
        call apply_tridiag_solve_mnh_allT(iib,iie,ijb,ije,Sr,c,b,        &
                   Sut0, &
                   Sutmp )
-       u%st(iib:iie,ijb:ije,1:nz) = & 
-                         rho*Sutmp%st(iib:iie,ijb:ije,1:nz) & 
-                       + (1.0_rl-rho)*Sut0%st(iib:iie,ijb:ije,1:nz)
+       !$acc kernels
+       zu_st(iib:iie,ijb:ije,1:nz) = & 
+                         rho*zSutmp_st(iib:iie,ijb:ije,1:nz) & 
+                       + (1.0_rl-rho)*zSut0_st(iib:iie,ijb:ije,1:nz)
+       !$acc end kernels
     end if
 
   end subroutine loop_over_grid_jacobi_mnh
@@ -1794,55 +1791,86 @@ end subroutine line_Jacobi_mnh
     real(kind=rl) :: alpha_div_Tij, tmp, b_k_tmp, c_k_tmp
     integer :: iz, nz
 
+    real, dimension(:,:,:) , pointer :: zSr_st , zb_st , zSu_in_st , zSu_out_st
+    real, dimension(:)     , pointer :: za_k, zb_k, zc_k, zd_k
+    integer :: ii,ij
+
     nz = b%grid_param%nz
 
     call construct_alpha_T_cst_mnh(b%grid_param,alpha_T,Tij)
     ! Calculate r_i = b_i - A_{ij} u_i
     if (LUseT ) then 
+
+      zSr_st => Sr%st
+      zb_st  => b%st
+      zSu_in_st => Su_in%st
+      zSu_out_st => Su_out%st
+      za_k  => vert_coeff%a
+      zb_k  => vert_coeff%b
+      zc_k  => vert_coeff%c
+      zd_k  => vert_coeff%d
+
+       !$acc kernels
        iz=1 
-       Sr%st(iib:iie,ijb:ije,iz) = b%st(iib:iie,ijb:ije,iz)
+       zSr_st(iib:iie,ijb:ije,iz) = zb_st(iib:iie,ijb:ije,iz)
        do iz=2,nz-1
-          Sr%st(iib:iie,ijb:ije,iz) = b%st(iib:iie,ijb:ije,iz) - vert_coeff%d(iz) * ( &
-                            Su_in%st(iib+1:iie+1,ijb:ije,iz) + &
-                            Su_in%st(iib-1:iie-1,ijb:ije,iz) + &
-                            Su_in%st(iib:iie,ijb+1:ije+1,iz) + &
-                            Su_in%st(iib:iie,ijb-1:ije-1,iz) )
+          zSr_st(iib:iie,ijb:ije,iz) = zb_st(iib:iie,ijb:ije,iz) - zd_k(iz) * ( &
+                            zSu_in_st(iib+1:iie+1,ijb:ije,iz) + &
+                            zSu_in_st(iib-1:iie-1,ijb:ije,iz) + &
+                            zSu_in_st(iib:iie,ijb+1:ije+1,iz) + &
+                            zSu_in_st(iib:iie,ijb-1:ije-1,iz) )
        end do
        iz=nz
-       Sr%st(iib:iie,ijb:ije,iz) = b%st(iib:iie,ijb:ije,iz)
+       zSr_st(iib:iie,ijb:ije,iz) = zb_st(iib:iie,ijb:ije,iz)
        
        ! Thomas algorithm
        ! STEP 1: Create modified coefficients
        iz = 1
        alpha_div_Tij = alpha_T(5)/Tij
-       tmp = (vert_coeff%a(iz)-vert_coeff%b(iz)-vert_coeff%c(iz))
+       tmp = (za_k(iz)-vert_coeff%b(iz)-vert_coeff%c(iz))
        c(iz) = vert_coeff%b(iz)/tmp
-       Su_out%st(iib:iie,ijb:ije,iz) = Sr%st(iib:iie,ijb:ije,iz) / (tmp*Tij*vert_coeff%d(iz))
+       zSu_out_st(iib:iie,ijb:ije,iz) = zSr_st(iib:iie,ijb:ije,iz) / (tmp*Tij*zd_k(iz))
        !
+       !acc loop seq
        do iz=2,nz-1
-          b_k_tmp = vert_coeff%b(iz)
-          c_k_tmp = vert_coeff%c(iz)
-          tmp = ((vert_coeff%a(iz)-b_k_tmp-c_k_tmp)-alpha_div_Tij) &
+          b_k_tmp = zb_k(iz)
+          c_k_tmp = zc_k(iz)
+          tmp = ((za_k(iz)-b_k_tmp-c_k_tmp)-alpha_div_Tij) &
                - c(iz-1)*c_k_tmp
           c(iz) = b_k_tmp / tmp
-          Su_out%st(iib:iie,ijb:ije,iz) = (Sr%st(iib:iie,ijb:ije,iz) / (Tij*vert_coeff%d(iz)) & 
-                                         - Su_out%st(iib:iie,ijb:ije,iz-1)*c_k_tmp) / tmp
+
+!!$          zSu_out_st(iib:iie,ijb:ije,iz) = (zSr_st(iib:iie,ijb:ije,iz) / (Tij*zd_k(iz)) & 
+!!$                                         - zSu_out_st(iib:iie,ijb:ije,iz-1)*c_k_tmp) / tmp
+
+          !$acc loop independent collapse(2)
+          do ij=ijb,ije
+             do ii=iib,iie
+                zSu_out_st(ii,ij,iz) = (zSr_st(ii,ij,iz) / (Tij*zd_k(iz)) & 
+                                     - zSu_out_st(ii,ij,iz-1)*c_k_tmp) / tmp
+             end do
+          end do
+!!$          do concurrent ( ij=ijb:ije ,  ii=iib:iie )
+!!$             zSu_out_st(ii,ij,iz) = (zSr_st(ii,ij,iz) / (Tij*zd_k(iz)) & 
+!!$                  - zSu_out_st(ii,ij,iz-1)*c_k_tmp) / tmp
+!!$          end do
+          
        end do
        !
        iz=nz
-       b_k_tmp = vert_coeff%b(iz)
-       c_k_tmp = vert_coeff%c(iz)
-       tmp = ((vert_coeff%a(iz)-b_k_tmp-c_k_tmp)) &
+       b_k_tmp = zb_k(iz)
+       c_k_tmp = zc_k(iz)
+       tmp = ((za_k(iz)-b_k_tmp-c_k_tmp)) &
             - c(iz-1)*c_k_tmp
        c(iz) = b_k_tmp / tmp
-       Su_out%st(iib:iie,ijb:ije,iz) = (Sr%st(iib:iie,ijb:ije,iz) / (Tij*vert_coeff%d(iz)) &
-                                     -  Su_out%st(iib:iie,ijb:ije,iz-1)*c_k_tmp) / tmp
+       zSu_out_st(iib:iie,ijb:ije,iz) = (zSr_st(iib:iie,ijb:ije,iz) / (Tij*zd_k(iz)) &
+                                     -  zSu_out_st(iib:iie,ijb:ije,iz-1)*c_k_tmp) / tmp
        
        ! STEP 2: back substitution
        do iz=nz-1,1,-1
-          Su_out%st(iib:iie,ijb:ije,iz) = Su_out%st(iib:iie,ijb:ije,iz) & 
-                                        - c(iz) * Su_out%st(iib:iie,ijb:ije,iz+1)
+          zSu_out_st(iib:iie,ijb:ije,iz) = zSu_out_st(iib:iie,ijb:ije,iz) & 
+                                        - c(iz) * zSu_out_st(iib:iie,ijb:ije,iz+1)
        end do
+       !$acc end kernels
     end if
 
   end subroutine apply_tridiag_solve_mnh_allT
diff --git a/tensorproductmultigrid_Source/mode_mg.f90 b/tensorproductmultigrid_Source/mode_mg.f90
index c0bf52f57638595a3c2b12c20f7506b71d3186b5..c388cee404dfd606e9a815d13adbd4677cce2ee1 100644
--- a/tensorproductmultigrid_Source/mode_mg.f90
+++ b/tensorproductmultigrid_Source/mode_mg.f90
@@ -310,7 +310,7 @@ implicit none
 
 
   ! Finalise communications ...
-  call comm_finalise(mg_param%n_lev,mg_param%lev_split)
+  call comm_finalise(mg_param%n_lev,mg_param%lev_split,grid_param)
   call finish_timer(t_finalise)
   call print_timerinfo("# --- Main timing results ---")
   call print_elapsed(t_readparam,.true.,1.0_rl)
diff --git a/tensorproductmultigrid_Source/multigrid.f90 b/tensorproductmultigrid_Source/multigrid.f90
index a96e7a1362ee8bce2e62668068d6cf662aee57ab..2a594d28ef2e0d7ab82d2fb10ff8b8e955a1ffa7 100644
--- a/tensorproductmultigrid_Source/multigrid.f90
+++ b/tensorproductmultigrid_Source/multigrid.f90
@@ -452,10 +452,13 @@ contains
     implicit none
     type(scalar3d), intent(in)    :: phifine
     type(scalar3d), intent(inout) :: phicoarse
+    ! local var
     integer :: ix,iy,iz
     integer :: ix_min, ix_max, iy_min, iy_max, n
     real(kind=rl) :: xn,xs,xw,xe
 
+    real , dimension(:,:,:) , pointer :: zphifine_st , zphicoarse_st
+
     n      = phicoarse%grid_param%n
     ix_min = phicoarse%icompx_min
     ix_max = phicoarse%icompx_max
@@ -478,17 +481,21 @@ contains
       end do
       endif
       if (LUseT) then 
+         zphifine_st => phifine%st
+         zphicoarse_st => phicoarse%st
+         !$acc kernels loop independent collapse(3)
          do iz=1,phicoarse%grid_param%nz
             do iy=iy_min,iy_max
                do ix=ix_min,ix_max
-                  phicoarse%st(ix,iy,iz) =  &
-                       phifine%st(2*ix  ,2*iy  ,iz) + &
-                       phifine%st(2*ix  ,2*iy-1,iz) + &
-                       phifine%st(2*ix-1,2*iy  ,iz) + &
-                       phifine%st(2*ix-1,2*iy-1,iz)
+                  zphicoarse_st(ix,iy,iz) =  &
+                       zphifine_st(2*ix  ,2*iy  ,iz) + &
+                       zphifine_st(2*ix  ,2*iy-1,iz) + &
+                       zphifine_st(2*ix-1,2*iy  ,iz) + &
+                       zphifine_st(2*ix-1,2*iy-1,iz)
                end do
             end do
          end do
+         !$acc end kernels
       endif
 
    elseif(mg_param%restriction == REST_KHALIL) then
@@ -769,6 +776,8 @@ contains
       !local var
       integer :: ix,iy,iz
 
+      real , dimension(:,:,:) , pointer :: zphifine_st , zphicoarse_st
+
       ! optimisation for newman MNH case : all coef constant
       rhox = 0.25_rl
       rhoy = 0.25_rl
@@ -794,22 +803,28 @@ contains
       end if
       if (LUseT) then  
          ! Piecewise linear interpolation
+         
+         zphifine_st => phifine%st
+         zphicoarse_st => phicoarse%st
+
+         !$acc kernels loop independent collapse(5)
          do iz=1,phicoarse%grid_param%nz 
             do diy = -1,0
                do dix = -1,0
                   do iy=iymin(iblock),iymax(iblock)
                      do ix=ixmin(iblock),ixmax(iblock)
-                        phifine%st(2*ix+dix,2*iy+diy,iz) =      &
-                             phicoarse%st(ix,iy,iz) +                &
-                             rhox*(phicoarse%st(ix+(2*dix+1),iy,iz)  &
-                             - phicoarse%st(ix,iy,iz)) +       &
-                             rhoy*(phicoarse%st(ix,iy+(2*diy+1),iz)  &
-                             - phicoarse%st(ix,iy,iz))
+                        zphifine_st(2*ix+dix,2*iy+diy,iz) =      &
+                             zphicoarse_st(ix,iy,iz) +                &
+                             rhox*(zphicoarse_st(ix+(2*dix+1),iy,iz)  &
+                             - zphicoarse_st(ix,iy,iz)) +       &
+                             rhoy*(zphicoarse_st(ix,iy+(2*diy+1),iz)  &
+                             - zphicoarse_st(ix,iy,iz))
                      end do
                   end do
                end do
             end do
          end do
+         !$acc end kernels
       end if
 
     end subroutine loop_over_grid_linear_mnh
@@ -1159,10 +1174,13 @@ contains
     integer, intent(in)                                     :: splitlevel
     integer, intent(in)                                     :: level
     integer, intent(in)                                     :: m
+    !local var
     integer                                                 :: n_gridpoints
     integer                                                 :: nlocalx, nlocaly
     integer                                                 :: halo_size
 
+     real , dimension(:,:,:) , pointer ::  zu_level_1_m_st
+
     nlocalx = u(level,m)%ix_max-u(level,m)%ix_min+1
     nlocaly = u(level,m)%iy_max-u(level,m)%iy_min+1
     halo_size = u(level,m)%halo_size
@@ -1212,7 +1230,12 @@ contains
       else
         ! Set initial solution on coarser level to zero (incl. halos!)
         if (LUseO) u(level-1,m)%s(:,:,:) = 0.0_rl
-        if (LUseT) u(level-1,m)%st(:,:,:) = 0.0_rl
+        if (LUseT) then
+           zu_level_1_m_st =>  u(level-1,m)%st(:,:,:)
+           !$acc kernels 
+           zu_level_1_m_st(:,:,:) = 0.0_rl
+           !$acc end kernels
+        end if
         ! solve on coarser grid
         call mg_vcycle_mnh(b,u,r,finelevel,splitlevel,level-1,m)
       end if
@@ -1433,7 +1456,7 @@ contains
     type(time) :: t_prec, t_res, t_apply, t_l2norm, t_scalprod, t_mainloop
     type(scalar3d) :: pp
     type(scalar3d) :: q
-    type(scalar3d) :: zone 
+    type(scalar3d) :: z_one 
     real(kind=rl) :: alpha, beta, pq, rz, rz_old
     integer :: ierr
 
@@ -1460,27 +1483,27 @@ contains
                  * (nlocaly+2*halo_size) &
                  * (usolution%grid_param%nz+2)
     !
-    ! Init 1 vector = zone
-    call create_scalar3d(MPI_COMM_HORIZ,bRHS%grid_param,halo_size,zone)
+    ! Init 1 vector = z_one
+    call create_scalar3d(MPI_COMM_HORIZ,bRHS%grid_param,halo_size,z_one)
     if (LUseO) then
-       zone%s(:,:,:) = 0.0_rl
-       zone%s(1:zone%grid_param%nz,1:zone%icompy_max,1:zone%icompx_max) = 1.0_rl
+       z_one%s(:,:,:) = 0.0_rl
+       z_one%s(1:z_one%grid_param%nz,1:z_one%icompy_max,1:z_one%icompx_max) = 1.0_rl
     end if
     if (LUseT) then
-       zone%st(:,:,:) = 0.0_rl
-       zone%st(1:zone%icompx_max,1:zone%icompy_max,1:zone%grid_param%nz) = 1.0_rl
+       z_one%st(:,:,:) = 0.0_rl
+       z_one%st(1:z_one%icompx_max,1:z_one%icompy_max,1:z_one%grid_param%nz) = 1.0_rl
     end if
     !   Mean / Norm of B
-    call scalarprod_mnh(pproc,zone,zone, mean_initial ) 
-    call scalarprod_mnh(pproc,zone,bRHS, mean_initial ) 
+    call scalarprod_mnh(pproc,z_one,z_one, mean_initial ) 
+    call scalarprod_mnh(pproc,z_one,bRHS, mean_initial ) 
     norm_initial =  l2norm_mnh(bRHS,.true.)
-    mean_initial =  mean_initial / (( zone%grid_param%nz ) * ( zone%grid_param%n )**2)
+    mean_initial =  mean_initial / (( z_one%grid_param%nz ) * ( z_one%grid_param%n )**2)
     norm_initial =  mean_initial / norm_initial
     if (LMean) then
-       ! b <- b -mean_initial * zone
-       if (LUseO) call daxpy(n_gridpoints,-mean_initial,zone%s,1,bRHS%s,1)
-       if (LUseT) call daxpy(n_gridpoints,-mean_initial,zone%st,1,bRHS%st,1)
-       call scalarprod_mnh(pproc,zone,bRHS, mean_initial ) 
+       ! b <- b -mean_initial * z_one
+       if (LUseO) call daxpy(n_gridpoints,-mean_initial,z_one%s,1,bRHS%s,1)
+       if (LUseT) call daxpy(n_gridpoints,-mean_initial,z_one%st,1,bRHS%st,1)
+       call scalarprod_mnh(pproc,z_one,bRHS, mean_initial ) 
     endif
     !
     ! Copy b to b(1)
@@ -1492,7 +1515,7 @@ contains
 
 ! Scale with volume of grid cells
     call volscale_scalar3d_mnh(xb_mg(level,pproc),1)
-    call scalarprod_mnh(pproc,zone,xb_mg(level,pproc), mean_initial )
+    call scalarprod_mnh(pproc,z_one,xb_mg(level,pproc), mean_initial )
 
     call start_timer(t_res)
     call calculate_residual_mnh(level, pproc, &
@@ -1624,6 +1647,9 @@ contains
       if (LUseO) call dcopy(n_gridpoints,xu_mg(level,pproc)%s,1,usolution%s,1)
       if (LUseT) call dcopy(n_gridpoints,xu_mg(level,pproc)%st,1,usolution%st,1)
     end if
+
+    call destroy_scalar3d(z_one)
+
     call finish_timer(t_mainloop)
 
     ! Print out solver information