diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90
index 243365e917d19e5db6f708089de2c86396304274..d3fe6b88fa353daa1d4770af1f10af0b077d330e 100644
--- a/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90
+++ b/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90
@@ -103,12 +103,14 @@ private
   type(type_halo_T), allocatable, dimension(:,:) :: tab_halo_et
   ! Vector data type for interior of field a(level,m)
   integer, allocatable, dimension(:,:) :: interior
-  integer, allocatable, dimension(:,:) :: interiorT 
+  integer, allocatable, dimension(:,:) :: interiorT
+  type(type_halo_T), allocatable, dimension(:,:) :: tab_interiorT_ne,tab_interiorT_sw,tab_interiorT_se
   ! Vector data type for one quarter of interior of field
   ! at level a(level,m). This has the same size (and can be
   ! used for communications with) the interior of a(level,m+1)
   integer, allocatable, dimension(:,:) :: sub_interior
-  integer, allocatable, dimension(:,:) :: sub_interiorT 
+  integer, allocatable, dimension(:,:) :: sub_interiorT
+  type(type_halo_T), allocatable, dimension(:,:) :: tab_sub_interiorT_ne,tab_sub_interiorT_sw,tab_sub_interiorT_se
   ! Timer for halo swaps
   type(time), allocatable, dimension(:,:) :: t_haloswap
   ! Timer for collect and distribute
@@ -289,6 +291,12 @@ contains
     if (LUseO) allocate(sub_interior(n_lev,0:pproc))
     if (LUseT) allocate(interiorT(n_lev,0:pproc))
     if (LUseT) allocate(sub_interiorT(n_lev,0:pproc))
+    if (LUseT) allocate(tab_interiorT_ne(n_lev,0:pproc))
+    if (LUseT) allocate(tab_interiorT_sw(n_lev,0:pproc))
+    if (LUseT) allocate(tab_interiorT_se(n_lev,0:pproc))
+    if (LUseT) allocate(tab_sub_interiorT_ne(n_lev,0:pproc))
+    if (LUseT) allocate(tab_sub_interiorT_sw(n_lev,0:pproc))
+    if (LUseT) allocate(tab_sub_interiorT_se(n_lev,0:pproc))
     ! Timer
     allocate(t_haloswap(n_lev,0:pproc))
     allocate(t_collect(0:pproc))
@@ -368,6 +376,12 @@ contains
             call MPI_TYPE_CREATE_SUBARRAY(nb_dims,profil_tab,profil_sous_tab,coord_debut,&
                  MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,interiorT(level,m),ierr)
             call mpi_type_commit(interiorT(level,m),ierr)
+            call mnh_allocate_mg_halo(tab_interiorT_ne(level,m)%haloTin,nlocal,nlocal,nz+2)
+            call mnh_allocate_mg_halo(tab_interiorT_ne(level,m)%haloTout,nlocal,nlocal,nz+2)
+            call mnh_allocate_mg_halo(tab_interiorT_sw(level,m)%haloTin,nlocal,nlocal,nz+2)
+            call mnh_allocate_mg_halo(tab_interiorT_sw(level,m)%haloTout,nlocal,nlocal,nz+2)
+            call mnh_allocate_mg_halo(tab_interiorT_se(level,m)%haloTin,nlocal,nlocal,nz+2)
+            call mnh_allocate_mg_halo(tab_interiorT_se(level,m)%haloTout,nlocal,nlocal,nz+2)
          end if
          ! sub_interiorT
          if ( (nlocal/2) /= 0 ) then
@@ -377,6 +391,12 @@ contains
             call MPI_TYPE_CREATE_SUBARRAY(nb_dims,profil_tab,profil_sous_tab,coord_debut,&
                  MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,sub_interiorT(level,m),ierr)
             call mpi_type_commit(sub_interiorT(level,m),ierr)
+            call mnh_allocate_mg_halo(tab_sub_interiorT_ne(level,m)%haloTin,nlocal/2,nlocal/2,nz+2)
+            call mnh_allocate_mg_halo(tab_sub_interiorT_ne(level,m)%haloTout,nlocal/2,nlocal/2,nz+2)
+            call mnh_allocate_mg_halo(tab_sub_interiorT_sw(level,m)%haloTin,nlocal/2,nlocal/2,nz+2)
+            call mnh_allocate_mg_halo(tab_sub_interiorT_sw(level,m)%haloTout,nlocal/2,nlocal/2,nz+2)
+            call mnh_allocate_mg_halo(tab_sub_interiorT_se(level,m)%haloTin,nlocal/2,nlocal/2,nz+2)
+            call mnh_allocate_mg_halo(tab_sub_interiorT_se(level,m)%haloTout,nlocal/2,nlocal/2,nz+2)            
          end if
       end if
       ! --- Create timers ---
@@ -1470,11 +1490,19 @@ contains
                            source_rank, &
                            ierr)
         recv_tag = 1000
-        if (LUseO) call mpi_irecv(b%s(0,1,b_n/2+1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
+        if (LUseO) call mpi_irecv(b%s(0,1,b_n/2+1),1,sub_interior(level,m-1),source_rank, recv_tag, MPI_COMM_HORIZ, &
                            recv_request(1),ierr)
         recv_tag = 1010
-        if (LUseT) call mpi_irecv(b%st(b_n/2+1,1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
-                           recv_requestT(1),ierr)
+        if (LUseT) then
+#ifdef MNH_GPUDIRECT           
+           call mpi_irecv(tab_sub_interiorT_ne(level,m-1)%haloTout,size(tab_sub_interiorT_ne(level,m-1)%haloTout), &
+                MPI_DOUBLE_PRECISION,source_rank, recv_tag, MPI_COMM_HORIZ, &
+                recv_requestT(1),ierr)
+#else
+           call mpi_irecv(b%st(b_n/2+1,1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
+                recv_requestT(1),ierr)
+#endif
+        end if
 #ifndef NDEBUG
   if (ierr .ne. 0) &
     call fatalerror("Collect: receive from NE failed in mpi_irecv().")
@@ -1488,8 +1516,17 @@ contains
         if (LUseO) call mpi_irecv(b%s(0,b_n/2+1,1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
                            recv_request(2),ierr)
         recv_tag = 1011
-        if (LUseT) call mpi_irecv(b%st(1,b_n/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
-                           recv_requestT(2),ierr)
+        if (LUseT) then
+#ifdef MNH_GPUDIRECT          
+           call mpi_irecv(tab_sub_interiorT_sw(level,m-1)%haloTout,size(tab_sub_interiorT_sw(level,m-1)%haloTout), &
+                MPI_DOUBLE_PRECISION,source_rank, recv_tag, MPI_COMM_HORIZ, &
+                recv_requestT(2),ierr)
+#else
+           call mpi_irecv(b%st(1,b_n/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
+                recv_requestT(2),ierr)         
+#endif
+           
+        endif
 
 #ifndef NDEBUG
   if (ierr .ne. 0) &
@@ -1504,8 +1541,16 @@ contains
         if (LUseO) call mpi_irecv(b%s(0,b_n/2+1,b_n/2+1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
                            recv_request(3),ierr)
         recv_tag = 1012
-        if (LUseT) call mpi_irecv(b%st(b_n/2+1,b_n/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
-                           recv_requestT(3),ierr)
+        if (LUseT) then
+#ifdef MNH_GPUDIRECT           
+           call mpi_irecv(tab_sub_interiorT_se(level,m-1)%haloTout,size(tab_sub_interiorT_se(level,m-1)%haloTout), &
+                MPI_DOUBLE_PRECISION,source_rank, recv_tag, MPI_COMM_HORIZ, &
+                recv_requestT(3),ierr)
+#else
+           call mpi_irecv(b%st(b_n/2+1,b_n/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, &
+                recv_requestT(3),ierr)           
+#endif
+        end if
 #ifndef NDEBUG
   if (ierr .ne. 0) &
     call fatalerror("Collect: receive from SE failed in mpi_irecv().")
@@ -1516,6 +1561,17 @@ contains
         ! Wait for receives to complete before proceeding
         if (LUseO) call mpi_waitall(3,recv_request,MPI_STATUSES_IGNORE,ierr)
         if (LUseT) call mpi_waitall(3,recv_requestT,MPI_STATUSES_IGNORE,ierr)
+#ifdef MNH_GPUDIRECT
+        if (LUseT) then
+           ! copy from buffer for GPU DIRECT
+           ! Receive from NE 
+           b%st(b_n/2+1:b_n,1:b_n/2,0:nz+1) = tab_sub_interiorT_ne(level,m-1)%haloTout(1:b_n/2,1:b_n/2,1:nz+2)
+           ! Receive from SW 
+           b%st(1:b_n/2,b_n/2+1:b_n,0:nz+1) = tab_sub_interiorT_sw(level,m-1)%haloTout(1:b_n/2,1:b_n/2,1:nz+2)
+           ! Receive from SE
+           b%st(b_n/2+1:b_n,b_n/2+1:b_n,0:nz+1) = tab_sub_interiorT_se(level,m-1)%haloTout(1:b_n/2,1:b_n/2,1:nz+2)
+        end if
+#endif
       end if
       if ( corner_ne ) then
         ! Send to NW
@@ -1526,7 +1582,15 @@ contains
         send_tag = 1000
         if (LUseO) call mpi_send(a%s(0,1,1),1,interior(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
         send_tag = 1010
-        if (LUseT) call mpi_send(a%st(1,1,0),1,interiorT(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
+        if (LUseT) then
+#ifdef MNH_GPUDIRECT
+           tab_interiorT_ne(level,m)%haloTin(1:a_n,1:a_n,1:nz+2) = a%st(1:a_n,1:a_n,0:nz+1)
+           call mpi_send(tab_interiorT_ne(level,m)%haloTin,size(tab_interiorT_ne(level,m)%haloTin), &
+                MPI_DOUBLE_PRECISION,dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
+#else
+           call mpi_send(a%st(1,1,0),1,interiorT(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
+#endif
+        end if
 #ifndef NDEBUG
   if (ierr .ne. 0) &
     call fatalerror("Collect: send from NE failed in mpi_send().")
@@ -1541,7 +1605,15 @@ contains
         send_tag = 1001
         if (LUseO) call mpi_send(a%s(0,1,1),1,interior(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
         send_tag = 1011
-        if (LUseT) call mpi_send(a%st(1,1,0),1,interiorT(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
+        if (LUseT) then
+#ifdef MNH_GPUDIRECT
+           tab_interiorT_sw(level,m)%haloTin(1:a_n,1:a_n,1:nz+2) = a%st(1:a_n,1:a_n,0:nz+1)
+           call mpi_send(tab_interiorT_sw(level,m)%haloTin,size(tab_interiorT_sw(level,m)%haloTin), &
+                MPI_DOUBLE_PRECISION,dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
+#else
+           call mpi_send(a%st(1,1,0),1,interiorT(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)        
+#endif
+        end if
 #ifndef NDEBUG
   if (ierr .ne. 0) &
     call fatalerror("Collect: send from SW failed in mpi_send().")
@@ -1556,7 +1628,15 @@ contains
         send_tag = 1002
         if (LUseO) call mpi_send(a%s(0,1,1),1,interior(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
         send_tag = 1012
-        if (LUseT) call mpi_send(a%st(1,1,0),1,interiorT(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
+        if (LUseT) then
+#ifdef MNH_GPUDIRECT
+           tab_interiorT_se(level,m)%haloTin(1:a_n,1:a_n,1:nz+2) = a%st(1:a_n,1:a_n,0:nz+1)
+           call mpi_send(tab_interiorT_se(level,m)%haloTin,size(tab_interiorT_se(level,m)%haloTin), &
+                MPI_DOUBLE_PRECISION,dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
+#else
+           call mpi_send(a%st(1,1,0),1,interiorT(level,m),dest_rank,send_tag,MPI_COMM_HORIZ,ierr)
+#endif
+        end if
 #ifndef NDEBUG
   if (ierr .ne. 0) &
     call fatalerror("Collect: send from SE failed in mpi_send().")