From 915e7cb287cd53c34655b244f3ec9e8d44880acb Mon Sep 17 00:00:00 2001
From: Juan Escobar <juan.escobar@aero.obs-mip.fr>
Date: Tue, 14 Sep 2021 11:45:41 +0200
Subject: [PATCH] Juan 14/09/2021:add turb_hor_dyn_corr.f90 on ZSOLVER for
 GPU/MANAGED optimisation

---
 src/ZSOLVER/turb_hor_dyn_corr.f90 | 1297 +++++++++++++++++++++++++++++
 1 file changed, 1297 insertions(+)
 create mode 100644 src/ZSOLVER/turb_hor_dyn_corr.f90

diff --git a/src/ZSOLVER/turb_hor_dyn_corr.f90 b/src/ZSOLVER/turb_hor_dyn_corr.f90
new file mode 100644
index 000000000..1373e2597
--- /dev/null
+++ b/src/ZSOLVER/turb_hor_dyn_corr.f90
@@ -0,0 +1,1297 @@
+!MNH_LIC Copyright 1994-2020 CNRS, Meteo-France and Universite Paul Sabatier
+!MNH_LIC This is part of the Meso-NH software governed by the CeCILL-C licence
+!MNH_LIC version 1. See LICENSE, CeCILL-C_V1-en.txt and CeCILL-C_V1-fr.txt
+!MNH_LIC for details. version 1.
+!-----------------------------------------------------------------
+MODULE MODI_TURB_HOR_DYN_CORR
+!
+INTERFACE
+!
+      SUBROUTINE TURB_HOR_DYN_CORR(KSPLT, PTSTEP,                    &
+                      OCLOSE_OUT,OTURB_FLX,KRR,                      &
+                      TPFILE,                                        &
+                      PK,PINV_PDZZ,                                  &
+                      PDXX,PDYY,PDZZ,PDZX,PDZY,PZZ,                  &
+                      PDIRCOSZW,                                     &
+                      PCOSSLOPE,PSINSLOPE,                           &
+                      PRHODJ,                                        &
+                      PCDUEFF,PTAU11M,PTAU12M,PTAU22M,PTAU33M,       &
+                      PUM,PVM,PWM,PUSLOPEM,PVSLOPEM,                 &
+                      PTHLM,PRM,PSVM,                                &
+                      PTKEM,PLM,                                     &
+                      PDP,PTP,                                       &
+                      PRUS,PRVS,PRWS                                 )
+!
+USE MODD_IO, ONLY: TFILEDATA
+!
+INTEGER,                  INTENT(IN)    ::  KSPLT        ! split process index
+REAL,                     INTENT(IN)    ::  PTSTEP       ! timestep
+LOGICAL,                  INTENT(IN)    ::  OCLOSE_OUT   ! switch for syncronous
+                                                         ! file opening       
+LOGICAL,                  INTENT(IN)    ::  OTURB_FLX    ! switch to write the
+                                 ! turbulent fluxes in the syncronous FM-file
+INTEGER,                  INTENT(IN)    ::  KRR          ! number of moist var.
+TYPE(TFILEDATA),          INTENT(IN)    ::  TPFILE       ! Output file
+!
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PK          ! Turbulent diffusion doef.
+                                                        ! PK = PLM * SQRT(PTKEM)
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PINV_PDZZ   ! 1./PDZZ
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PDXX, PDYY, PDZZ, PDZX, PDZY 
+                                                         ! Metric coefficients
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PZZ          ! vertical grid
+REAL, DIMENSION(:,:),     INTENT(IN)    ::  PDIRCOSZW
+! Director Cosinus along z directions at surface w-point
+REAL, DIMENSION(:,:),   INTENT(IN)   ::  PCOSSLOPE       ! cosinus of the angle 
+                                      ! between i and the slope vector
+REAL, DIMENSION(:,:),   INTENT(IN)   ::  PSINSLOPE       ! sinus of the angle 
+                                      ! between i and the slope vector
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PRHODJ       ! density * grid volume
+!
+REAL, DIMENSION(:,:),   INTENT(IN)   ::  PCDUEFF      ! Cd * || u || at time t
+REAL, DIMENSION(:,:),   INTENT(IN)   ::  PTAU11M      ! <uu> in the axes linked 
+       ! to the maximum slope direction and the surface normal and the binormal 
+       ! at time t - dt
+REAL, DIMENSION(:,:),   INTENT(IN)   ::  PTAU12M      ! <uv> in the same axes
+REAL, DIMENSION(:,:),   INTENT(IN)   ::  PTAU22M      ! <vv> in the same axes
+REAL, DIMENSION(:,:),   INTENT(IN)   ::  PTAU33M      ! <ww> in the same axes
+!
+! Variables at t-1
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PUM,PVM,PWM,PTHLM
+REAL, DIMENSION(:,:,:,:), INTENT(IN)    ::  PRM
+REAL, DIMENSION(:,:,:,:), INTENT(IN)    ::  PSVM
+REAL, DIMENSION(:,:),      INTENT(IN)   ::  PUSLOPEM     ! wind component along the 
+                                     ! maximum slope direction
+REAL, DIMENSION(:,:),      INTENT(IN)   ::  PVSLOPEM     ! wind component along the 
+                                     ! direction normal to the maximum slope one
+!
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PTKEM        ! TKE at time t- dt
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PLM          ! Turb. mixing length
+!
+REAL, DIMENSION(:,:,:),   INTENT(INOUT) ::  PRUS, PRVS, PRWS
+REAL, DIMENSION(:,:,:),   INTENT(INOUT) ::  PDP,PTP      ! TKE production terms
+!
+END SUBROUTINE TURB_HOR_DYN_CORR
+!
+END INTERFACE
+!
+END MODULE MODI_TURB_HOR_DYN_CORR
+!     ################################################################
+      SUBROUTINE TURB_HOR_DYN_CORR(KSPLT, PTSTEP,                    &
+                      OCLOSE_OUT,OTURB_FLX,KRR,                      &
+                      TPFILE,                                        &
+                      PK,PINV_PDZZ,                                  &
+                      PDXX,PDYY,PDZZ,PDZX,PDZY,PZZ,                  &
+                      PDIRCOSZW,                                     &
+                      PCOSSLOPE,PSINSLOPE,                           &
+                      PRHODJ,                                        &
+                      PCDUEFF,PTAU11M,PTAU12M,PTAU22M,PTAU33M,       &
+                      PUM,PVM,PWM,PUSLOPEM,PVSLOPEM,                 &
+                      PTHLM,PRM,PSVM,                                &
+                      PTKEM,PLM,                                     &
+                      PDP,PTP,                                       &
+                      PRUS,PRVS,PRWS                                 )
+!     ################################################################
+!
+!!****  *TURB_HOR* -routine to compute the source terms in the meso-NH
+!!               model equations due to the non-vertical turbulent fluxes.
+!!
+!!    PURPOSE
+!!    -------
+!!
+!!     see TURB_HOR
+!!
+!!**  METHOD
+!!    ------
+!!
+!!    EXTERNAL
+!!    --------
+!!
+!!    IMPLICIT ARGUMENTS
+!!    ------------------
+!!
+!!    REFERENCE
+!!    ---------
+!!
+!!    AUTHOR
+!!    ------
+!!
+!!      Joan Cuxart             * INM and Meteo-France *
+!!
+!!    MODIFICATIONS
+!!    -------------
+!!                     Aug    , 1997 (V. Saravane) spliting of TURB_HOR
+!!                     Nov  27, 1997 (V. Masson) clearing of the routine
+!!                     Oct  18, 2000 (V. Masson) LES computations + LFLAT switch
+!!                     Feb  15, 2001 (J. Stein)  remove the use of w=0 at the
+!!                                               ground   
+!!                     Mar  12, 2001 (V. Masson and J. Stein) major bugs 
+!!                                 + change of discretization at the surface
+!!                     Nov  06, 2002 (V. Masson) LES budgets
+!!                     October 2009 (G. Tanguy) add ILENCH=LEN(YCOMMENT) after
+!!                                              change of YCOMMENT
+!!                     July 2012     (V.Masson) Implicitness of W
+!!                     March 2014    (V.Masson) tridiag_w : bug between
+!!                                               mass and flux position
+!!                     J.Escobar : 15/09/2015 : WENO5 & JPHEXT <> 1
+!!                      M.Moge  04/2016 Use openACC directives to port the TURB part of Meso-NH on GPU
+!!  Philippe Wautelet: 05/2016-04/2018: new data structures and calls for I/O
+!  P. Wautelet 20/05/2019: add name argument to ADDnFIELD_ll + new ADD4DFIELD_ll subroutine
+!  J.Escobar 13/08/2020: PGI/NVHPC BUG , extend DO CONCURRENT to 3D indexes        
+!! --------------------------------------------------------------------------
+!
+!*      0. DECLARATIONS
+!          ------------
+!
+USE MODD_ARGSLIST_ll,    ONLY: LIST_ll
+USE MODD_CST
+USE MODD_CONF
+USE MODD_CTURB
+USE MODD_IO,             ONLY: TFILEDATA
+USE MODD_PARAMETERS
+USE MODD_LES
+USE MODD_NSV
+!
+USE MODE_ll
+USE MODE_FIELD,          ONLY: TFIELDDATA, TYPEREAL
+USE MODE_IO_FIELD_WRITE, only: IO_Field_write
+use mode_mppdb
+!
+USE MODI_GRADIENT_M
+USE MODI_GRADIENT_U
+USE MODI_GRADIENT_V
+USE MODI_GRADIENT_W
+#ifndef MNH_OPENACC
+USE MODI_SHUMAN
+#else
+USE MODI_SHUMAN_DEVICE
+#endif
+USE MODI_COEFJ
+USE MODI_LES_MEAN_SUBGRID
+USE MODI_TRIDIAG_W
+!
+USE MODI_SECOND_MNH
+USE MODE_MPPDB
+#ifdef MNH_BITREP
+USE MODI_BITREP
+#endif
+#ifdef MNH_OPENACC
+USE MODE_MNH_ZWORK , ONLY : MNH_ALLOCATE_ZT3D , MNH_REL_ZT3D, MNH_ALLOCATE_ZT3DP , MNH_ALLOCATE_ZT2D
+#endif
+!
+IMPLICIT NONE
+!
+!
+!*       0.1  declaration of arguments
+!
+!
+!
+INTEGER,                  INTENT(IN)    ::  KSPLT        ! split process index
+REAL,                     INTENT(IN)    ::  PTSTEP       ! timestep
+LOGICAL,                  INTENT(IN)    ::  OCLOSE_OUT   ! switch for syncronous
+                                                         ! file opening       
+LOGICAL,                  INTENT(IN)    ::  OTURB_FLX    ! switch to write the
+                                 ! turbulent fluxes in the syncronous FM-file
+INTEGER,                  INTENT(IN)    ::  KRR          ! number of moist var.
+TYPE(TFILEDATA),          INTENT(IN)    ::  TPFILE       ! Output file
+!
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PK          ! Turbulent diffusion doef.
+                                                        ! PK = PLM * SQRT(PTKEM)
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PINV_PDZZ   ! 1./PDZZ
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PDXX, PDYY, PDZZ, PDZX, PDZY 
+                                                         ! Metric coefficients
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PZZ          ! vertical grid
+REAL, DIMENSION(:,:),     INTENT(IN)    ::  PDIRCOSZW
+! Director Cosinus along z directions at surface w-point
+REAL, DIMENSION(:,:),   INTENT(IN)   ::  PCOSSLOPE       ! cosinus of the angle 
+                                      ! between i and the slope vector
+REAL, DIMENSION(:,:),   INTENT(IN)   ::  PSINSLOPE       ! sinus of the angle 
+                                      ! between i and the slope vector
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PRHODJ       ! density * grid volume
+!
+REAL, DIMENSION(:,:),   INTENT(IN)   ::  PCDUEFF      ! Cd * || u || at time t
+REAL, DIMENSION(:,:),   INTENT(IN)   ::  PTAU11M      ! <uu> in the axes linked 
+       ! to the maximum slope direction and the surface normal and the binormal 
+       ! at time t - dt
+REAL, DIMENSION(:,:),   INTENT(IN)   ::  PTAU12M      ! <uv> in the same axes
+REAL, DIMENSION(:,:),   INTENT(IN)   ::  PTAU22M      ! <vv> in the same axes
+REAL, DIMENSION(:,:),   INTENT(IN)   ::  PTAU33M      ! <ww> in the same axes
+!
+! Variables at t-1
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PUM,PVM,PWM,PTHLM
+REAL, DIMENSION(:,:,:,:), INTENT(IN)    ::  PRM
+REAL, DIMENSION(:,:,:,:), INTENT(IN)    ::  PSVM
+REAL, DIMENSION(:,:),      INTENT(IN)   ::  PUSLOPEM     ! wind component along the 
+                                     ! maximum slope direction
+REAL, DIMENSION(:,:),      INTENT(IN)   ::  PVSLOPEM     ! wind component along the 
+                                     ! direction normal to the maximum slope one
+!
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PTKEM        ! TKE at time t- dt
+REAL, DIMENSION(:,:,:),   INTENT(IN)    ::  PLM          ! Turb. mixing length
+!
+REAL, DIMENSION(:,:,:),   INTENT(INOUT) ::  PRUS, PRVS, PRWS
+REAL, DIMENSION(:,:,:),   INTENT(INOUT) ::  PDP,PTP      ! TKE production terms
+!
+!*       0.2  declaration of local variables
+!
+REAL, DIMENSION(:,:,:), pointer , contiguous :: ZFLX,ZWORK ! work arrays, PK is the turb. mixing coef.
+!
+REAL, DIMENSION(:,:),   pointer , contiguous :: ZDIRSINZW
+INTEGER :: IZFLX,IZWORK,IZDIRSINZW
+      ! sinus of the angle between the vertical and the normal to the orography
+INTEGER             :: IKB,IKE
+                                    ! Index values for the Beginning and End
+                                    ! mass points of the domain  
+INTEGER             :: IKU                                   
+INTEGER             :: JSV          ! scalar loop counter
+!
+REAL, DIMENSION(:,:,:), pointer , contiguous :: GX_U_M_PUM
+REAL, DIMENSION(:,:,:), pointer , contiguous :: GY_V_M_PVM
+REAL, DIMENSION(:,:,:), pointer , contiguous :: GZ_W_M_PWM
+REAL, DIMENSION(:,:,:), pointer , contiguous :: GZ_W_M_ZWP
+INTEGER :: IGX_U_M_PUM,IGY_V_M_PVM,IGZ_W_M_PWM,IGZ_W_M_ZWP
+REAL, DIMENSION(:,:,:), pointer , contiguous :: ZMZF_DZZ   ! MZF(PDZZ)
+REAL, DIMENSION(:,:,:), pointer , contiguous :: ZDFDDWDZ   ! formal derivative of the
+!                                                 ! flux (variable: dW/dz)
+REAL, DIMENSION(:,:,:), pointer , contiguous :: ZWP        ! W at future   time-step
+!
+REAL, DIMENSION(:,:,:), pointer , contiguous :: ZDU_DZ_DZS_DX ! du/dz*dzs/dx surf
+REAL, DIMENSION(:,:,:), pointer , contiguous :: ZDV_DZ_DZS_DY ! dv/dz*dzs/dy surf
+REAL, DIMENSION(:,:,:), pointer , contiguous :: ZDU_DX        ! du/dx        surf
+REAL, DIMENSION(:,:,:), pointer , contiguous :: ZDV_DY        ! dv/dy        surf
+REAL, DIMENSION(:,:,:), pointer , contiguous :: ZDW_DZ        ! dw/dz        surf
+INTEGER :: IZMZF_DZZ,IZDFDDWDZ,IZWP,IZDU_DZ_DZS_DX,IZDV_DZ_DZS_DY &
+           ,IZDU_DX,IZDV_DY,IZDW_DZ 
+!
+INTEGER                :: IINFO_ll      ! return code of parallel routine
+TYPE(LIST_ll), POINTER :: TZFIELDS_ll   ! list of fields to exchange
+
+REAL :: ZTIME1, ZTIME2
+
+
+REAL, DIMENSION(:,:,:), pointer , contiguous :: ZCOEFF , ZDZZ
+                                    ! coefficients for the uncentred gradient 
+                                    ! computation near the ground
+INTEGER  :: IZCOEFF , IZDZZ
+!
+#ifdef MNH_OPENACC
+REAL, DIMENSION(:,:,:), pointer , contiguous :: ZTMP1_DEVICE, ZTMP2_DEVICE, ZTMP3_DEVICE, ZTMP4_DEVICE
+INTEGER :: IZTMP1_DEVICE, IZTMP2_DEVICE, IZTMP3_DEVICE, IZTMP4_DEVICE
+#endif
+TYPE(TFIELDDATA) :: TZFIELD
+!
+INTEGER  :: JIU,JJU,JKU
+INTEGER  :: JI,JJ,JK
+! --------------------------------------------------------------------------
+
+!$acc data present( PK, PINV_PDZZ, PDXX, PDYY, PDZZ, PDZX, PDZY, PZZ, PDIRCOSZW,     &
+!$acc &             PCOSSLOPE, PSINSLOPE, PRHODJ, PCDUEFF,                           &
+!$acc &             PTAU11M, PTAU12M, PTAU22M, PTAU33M,                              &
+!$acc &             PUM, PVM, PWM, PTHLM, PRM, PSVM, PUSLOPEM, PVSLOPEM, PTKEM, PLM, &
+!$acc &             PRUS, PRVS, PRWS, PDP, PTP )
+
+if ( mppdb_initialized ) then
+  !Check all in arrays
+  call Mppdb_check( pk,          "Turb_hor_dyn_corr beg:pk"          )
+  call Mppdb_check( pinv_pdzz,   "Turb_hor_dyn_corr beg:pinv_pdzz"   )
+  call Mppdb_check( pdxx,        "Turb_hor_dyn_corr beg:pdxx"        )
+  call Mppdb_check( pdyy,        "Turb_hor_dyn_corr beg:pdyy"        )
+  call Mppdb_check( pdzz,        "Turb_hor_dyn_corr beg:pdzz"        )
+  call Mppdb_check( pdzx,        "Turb_hor_dyn_corr beg:pdzx"        )
+  call Mppdb_check( pdzy,        "Turb_hor_dyn_corr beg:pdzy"        )
+  call Mppdb_check( pzz,         "Turb_hor_dyn_corr beg:pzz"         )
+  call Mppdb_check( pdircoszw,   "Turb_hor_dyn_corr beg:pdircoszw"   )
+  call Mppdb_check( pcosslope,   "Turb_hor_dyn_corr beg:pcosslope"   )
+  call Mppdb_check( psinslope,   "Turb_hor_dyn_corr beg:psinslope"   )
+  call Mppdb_check( prhodj,      "Turb_hor_dyn_corr beg:prhodj"      )
+  call Mppdb_check( pcdueff,     "Turb_hor_dyn_corr beg:pcdueff"     )
+  call Mppdb_check( ptau11m,     "Turb_hor_dyn_corr beg:ptau11m"     )
+  call Mppdb_check( ptau12m,     "Turb_hor_dyn_corr beg:ptau12m"     )
+  call Mppdb_check( ptau22m,     "Turb_hor_dyn_corr beg:ptau22m"     )
+  call Mppdb_check( ptau33m,     "Turb_hor_dyn_corr beg:ptau33m"     )
+  call Mppdb_check( pum,         "Turb_hor_dyn_corr beg:pum"         )
+  call Mppdb_check( pvm,         "Turb_hor_dyn_corr beg:pvm"         )
+  call Mppdb_check( pwm,         "Turb_hor_dyn_corr beg:pwm"         )
+  call Mppdb_check( pthlm,       "Turb_hor_dyn_corr beg:pthlm"       )
+  call Mppdb_check( prm,         "Turb_hor_dyn_corr beg:prm"         )
+  call Mppdb_check( psvm,        "Turb_hor_dyn_corr beg:psvm"        )
+  call Mppdb_check( puslopem,    "Turb_hor_dyn_corr beg:puslopem"    )
+  call Mppdb_check( pvslopem,    "Turb_hor_dyn_corr beg:pvslopem"    )
+  call Mppdb_check( ptkem,       "Turb_hor_dyn_corr beg:ptkem"       )
+  call Mppdb_check( plm,         "Turb_hor_dyn_corr beg:plm"         )
+  !Check all inout arrays
+  call Mppdb_check( prus,   "Turb_hor_dyn_corr beg:prus"   )
+  call Mppdb_check( prvs,   "Turb_hor_dyn_corr beg:prvs"   )
+  call Mppdb_check( prws,   "Turb_hor_dyn_corr beg:prws"   )
+  call Mppdb_check( pdp,    "Turb_hor_dyn_corr beg:pdp"    )
+  call Mppdb_check( ptp,    "Turb_hor_dyn_corr beg:ptp"    )
+end if
+
+JIU =  size(pum, 1 )
+JJU =  size(pum, 2 )
+JKU =  size(pum, 3 )
+
+#ifndef MNH_OPENACC
+allocate( zflx (JIU,JJU,JKU ) )
+allocate( zwork(JIU,JJU,JKU ) )
+
+allocate( zdirsinzw(JIU,JJU ) )
+
+allocate( gx_u_m_pum(JIU,JJU,JKU ) )
+allocate( gy_v_m_pvm(JIU,JJU,JKU ) )
+allocate( gz_w_m_pwm(JIU,JJU,JKU ) )
+allocate( gz_w_m_zwp(JIU,JJU,JKU ) )
+allocate( zmzf_dzz  (JIU,JJU,JKU ) )
+allocate( zdfddwdz  (JIU,JJU,JKU ) )
+allocate( zwp       (JIU,JJU,JKU ) )
+
+allocate( zdu_dz_dzs_dx(JIU,JJU, 1 ) )
+allocate( zdv_dz_dzs_dy(JIU,JJU, 1 ) )
+allocate( zdu_dx       (JIU,JJU, 1 ) )
+allocate( zdv_dy       (JIU,JJU, 1 ) )
+allocate( zdw_dz       (JIU,JJU, 1 ) )
+
+allocate( zcoeff(JIU,JJU, 1 + jpvext : 3 + jpvext ) )
+allocate( zdzz  (JIU,JJU, 1 + jpvext : 3 + jpvext ) )
+#else
+izflx          = MNH_ALLOCATE_ZT3D( zflx ,JIU,JJU,JKU )
+izwork         = MNH_ALLOCATE_ZT3D( zwork,JIU,JJU,JKU )
+
+izdirsinzw     = MNH_ALLOCATE_ZT2D( zdirsinzw,JIU,JJU )
+
+igx_u_m_pum    = MNH_ALLOCATE_ZT3D( gx_u_m_pum,JIU,JJU,JKU )
+igy_v_m_pvm    = MNH_ALLOCATE_ZT3D( gy_v_m_pvm,JIU,JJU,JKU )
+igz_w_m_pwm    = MNH_ALLOCATE_ZT3D( gz_w_m_pwm,JIU,JJU,JKU )
+igz_w_m_zwp    = MNH_ALLOCATE_ZT3D( gz_w_m_zwp,JIU,JJU,JKU )
+izmzf_dzz      = MNH_ALLOCATE_ZT3D( zmzf_dzz  ,JIU,JJU,JKU )
+izdfddwdz      = MNH_ALLOCATE_ZT3D( zdfddwdz  ,JIU,JJU,JKU )
+izwp           = MNH_ALLOCATE_ZT3D( zwp       ,JIU,JJU,JKU )
+
+izdu_dz_dzs_dx = MNH_ALLOCATE_ZT3DP( zdu_dz_dzs_dx,JIU,JJU, 1 , 1 )
+izdv_dz_dzs_dy = MNH_ALLOCATE_ZT3DP( zdv_dz_dzs_dy,JIU,JJU, 1 , 1 )
+izdu_dx        = MNH_ALLOCATE_ZT3DP( zdu_dx       ,JIU,JJU, 1 , 1 )
+izdv_dy        = MNH_ALLOCATE_ZT3DP( zdv_dy       ,JIU,JJU, 1 , 1 )
+izdw_dz        = MNH_ALLOCATE_ZT3DP( zdw_dz       ,JIU,JJU, 1 , 1 )
+
+izcoeff        = MNH_ALLOCATE_ZT3DP( zcoeff,JIU,JJU, 1 + jpvext , 3 + jpvext )
+izdzz          = MNH_ALLOCATE_ZT3DP( zdzz  ,JIU,JJU, 1 + jpvext , 3 + jpvext )
+#endif
+
+#ifdef MNH_OPENACC
+iztmp1_device = MNH_ALLOCATE_ZT3D( ztmp1_device,JIU,JJU,JKU )
+iztmp2_device = MNH_ALLOCATE_ZT3D( ztmp2_device,JIU,JJU,JKU )
+iztmp3_device = MNH_ALLOCATE_ZT3D( ztmp3_device,JIU,JJU,JKU )
+iztmp4_device = MNH_ALLOCATE_ZT3D( ztmp4_device,JIU,JJU,JKU )
+#endif
+
+!$acc data present(ZFLX, ZWORK, ZDIRSINZW, ZCOEFF, ZDZZ,                  &
+!$acc &            GX_U_M_PUM, GY_V_M_PVM, GZ_W_M_PWM, GZ_W_M_ZWP,        &
+!$acc &            ZMZF_DZZ, ZDFDDWDZ, ZWP,                               &
+!$acc &            ZDU_DZ_DZS_DX, ZDV_DZ_DZS_DY, ZDU_DX, ZDV_DY, ZDW_DZ,  &
+!$acc &            ZTMP1_DEVICE, ZTMP2_DEVICE, ZTMP3_DEVICE, ZTMP4_DEVICE )
+
+!
+!*       1.   PRELIMINARY COMPUTATIONS
+!             ------------------------
+NULLIFY(TZFIELDS_ll)
+!
+IKB = 1+JPVEXT               
+IKE = SIZE(PUM,3)-JPVEXT    
+IKU = SIZE(PUM,3)
+!
+!
+!$acc kernels async(1)
+#ifndef MNH_BITREP
+ZDIRSINZW(:,:) = SQRT( 1. - PDIRCOSZW(:,:)**2 )
+#else
+!$acc loop independent collapse(2)
+DO CONCURRENT ( JI=1:JIU,JJ=1:JJU )
+   ZDIRSINZW(JI,JJ) = SQRT( 1. - BR_P2(PDIRCOSZW(JI,JJ)) )
+END DO
+#endif
+!$acc end kernels
+!
+#ifndef MNH_OPENACC
+GX_U_M_PUM = GX_U_M(PUM,PDXX,PDZZ,PDZX)
+IF (.NOT. L2D) THEN
+  GY_V_M_PVM = GY_V_M(PVM,PDYY,PDZZ,PDZY)
+END IF
+GZ_W_M_PWM = GZ_W_M(PWM,PDZZ)
+!
+ZMZF_DZZ = MZF(PDZZ)
+#else
+CALL GX_U_M_DEVICE(1,IKU,1,PUM,PDXX,PDZZ,PDZX,GX_U_M_PUM)
+IF (.NOT. L2D) THEN
+  CALL GY_V_M_DEVICE(1,IKU,1,PVM,PDYY,PDZZ,PDZY,GY_V_M_PVM)
+END IF
+CALL GZ_W_M_DEVICE(1,IKU,1,PWM,PDZZ,GZ_W_M_PWM)
+!
+CALL MZF_DEVICE(1,IKU,1,PDZZ,ZMZF_DZZ)
+#endif
+!
+CALL ADD3DFIELD_ll( TZFIELDS_ll, ZFLX, 'TURB_HOR_DYN_CORR::ZFLX' )
+
+
+!  compute the coefficients for the uncentred gradient computation near the 
+!  ground
+!
+!*       9.   < U'U'>
+!             -------
+!
+! Computes the U variance
+IF (.NOT. L2D) THEN
+   !$acc kernels async(2)
+   !$acc loop independent collapse(3)
+   DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+      ZFLX(JI,JJ,JK)= (2./3.) * PTKEM(JI,JJ,JK)                            &
+           - XCMFS * PK(JI,JJ,JK) *( (4./3.) * GX_U_M_PUM(JI,JJ,JK)        &
+           -(2./3.) * ( GY_V_M_PVM(JI,JJ,JK)                     &
+           +GZ_W_M_PWM(JI,JJ,JK)                ) )
+   END DO !CONCURRENT
+   !$acc end kernels
+   !!  &   to be tested later
+  !!  + XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
+ELSE
+  !$acc kernels async(2)
+  ZFLX(:,:,:)= (2./3.) * PTKEM                                  &
+    - XCMFS * PK *( (4./3.) * GX_U_M_PUM                        &
+                   -(2./3.) * ( GZ_W_M_PWM                ) ) 
+  !$acc end kernels
+  !!  &   to be tested later
+  !!  + XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
+END IF
+!
+!$acc kernels async(2)
+ZFLX(:,:,IKE+1) = ZFLX(:,:,IKE) 
+!$acc end kernels
+!
+!* prescription of du/dz and dv/dz with uncentered gradient at the surface
+!  prescription of dw/dz at Dz/2 above ground using the continuity equation
+!  using a Boussinesq hypothesis to remove the z dependance of rhod_ref
+!  (div u = 0)
+!
+#ifndef MNH_OPENACC
+ZDZZ(:,:,:) = MXM(PDZZ(:,:,IKB:IKB+2))
+#else
+CALL MXM_DEVICE(PDZZ(:,:,IKB:IKB+2),ZDZZ(:,:,:))
+#endif
+!$acc kernels async(3)
+ZCOEFF(:,:,IKB+2)= - ZDZZ(:,:,2) /      &
+       ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,3) )
+ZCOEFF(:,:,IKB+1)=   (ZDZZ(:,:,3)+ZDZZ(:,:,2)) /      &
+       ( ZDZZ(:,:,2) * ZDZZ(:,:,3) )
+ZCOEFF(:,:,IKB)= - (ZDZZ(:,:,3)+2.*ZDZZ(:,:,2)) /      &
+       ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,2) )
+!$acc end kernels
+!
+#ifndef MNH_OPENACC
+ZDU_DZ_DZS_DX(:,:,:)=MXF ((ZCOEFF(:,:,IKB+2:IKB+2)*PUM(:,:,IKB+2:IKB+2)       &
+                          +ZCOEFF(:,:,IKB+1:IKB+1)*PUM(:,:,IKB+1:IKB+1)       &
+                          +ZCOEFF(:,:,IKB  :IKB  )*PUM(:,:,IKB  :IKB  )       &
+                          )* 0.5 * ( PDZX(:,:,IKB+1:IKB+1)+PDZX(:,:,IKB:IKB)) &
+                         )/ MXF(PDXX(:,:,IKB:IKB))
+!
+ZDZZ(:,:,:) = MYM(PDZZ(:,:,IKB:IKB+2))
+#else
+!$acc kernels async(3)
+ZTMP1_DEVICE(:,:,1) = (ZCOEFF(:,:,IKB+2)*PUM(:,:,IKB+2)       &
+                          +ZCOEFF(:,:,IKB+1)*PUM(:,:,IKB+1)       &
+                          +ZCOEFF(:,:,IKB )*PUM(:,:,IKB)       &
+                          )* 0.5 * ( PDZX(:,:,IKB+1)+PDZX(:,:,IKB))
+!$acc end kernels
+!
+!!! wait for the computation of ZCOEFF and ZTMP1_DEVICE
+!$acc wait(3)
+!
+CALL MXF_DEVICE(ZTMP1_DEVICE(:,:,1:1), ZTMP2_DEVICE(:,:,1:1))
+CALL MXF_DEVICE(PDXX(:,:,IKB:IKB), ZTMP1_DEVICE(:,:,1:1))
+!$acc kernels async(3)
+ZDU_DZ_DZS_DX(:,:,1) = ZTMP2_DEVICE(:,:,1) / ZTMP1_DEVICE(:,:,1)
+!$acc end kernels
+!
+CALL MYM_DEVICE(PDZZ(:,:,IKB:IKB+2),ZDZZ(:,:,:))
+#endif
+!$acc kernels async(4)
+ZCOEFF(:,:,IKB+2)= - ZDZZ(:,:,2) /      &
+       ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,3) )
+ZCOEFF(:,:,IKB+1)=   (ZDZZ(:,:,3)+ZDZZ(:,:,2)) /      &
+       ( ZDZZ(:,:,2) * ZDZZ(:,:,3) )
+ZCOEFF(:,:,IKB)= - (ZDZZ(:,:,3)+2.*ZDZZ(:,:,2)) /      &
+       ( (ZDZZ(:,:,3)+ZDZZ(:,:,2)) * ZDZZ(:,:,2) )
+!$acc end kernels
+!
+#ifndef MNH_OPENACC
+ZDV_DZ_DZS_DY(:,:,:)=MYF ((ZCOEFF(:,:,IKB+2:IKB+2)*PVM(:,:,IKB+2:IKB+2)       &
+                          +ZCOEFF(:,:,IKB+1:IKB+1)*PVM(:,:,IKB+1:IKB+1)       &
+                          +ZCOEFF(:,:,IKB  :IKB  )*PVM(:,:,IKB  :IKB  )       &
+                          )* 0.5 * ( PDZY(:,:,IKB+1:IKB+1)+PDZY(:,:,IKB:IKB)) &
+                         )/ MYF(PDYY(:,:,IKB:IKB))
+#else
+!$acc kernels async(4)
+ZTMP3_DEVICE(:,:,1) = (ZCOEFF(:,:,IKB+2)*PVM(:,:,IKB+2)       &
+                          +ZCOEFF(:,:,IKB+1)*PVM(:,:,IKB+1)       &
+                          +ZCOEFF(:,:,IKB)*PVM(:,:,IKB)       &
+                          )* 0.5 * ( PDZY(:,:,IKB+1)+PDZY(:,:,IKB))
+!$acc end kernels
+!
+!!! wait for the computation of ZCOEFF and ZTMP3_DEVICE
+!$acc wait(4)
+#endif
+!
+#ifndef MNH_OPENACC
+ZDU_DX(:,:,:)=  DXF(PUM(:,:,IKB:IKB)) / MXF(PDXX(:,:,IKB:IKB))  &
+              - ZDU_DZ_DZS_DX(:,:,:)
+
+ZDV_DY(:,:,:)=  DYF(PVM(:,:,IKB:IKB)) / MYF(PDYY(:,:,IKB:IKB)) &
+              - ZDV_DZ_DZS_DY(:,:,:)
+#else
+CALL MYF_DEVICE(ZTMP3_DEVICE(:,:,1:1), ZTMP4_DEVICE(:,:,1:1))
+CALL MYF_DEVICE(PDYY(:,:,IKB:IKB), ZTMP3_DEVICE(:,:,1:1))
+!$acc kernels async(4)
+ZDV_DZ_DZS_DY(:,:,1)= ZTMP4_DEVICE(:,:,1) / ZTMP3_DEVICE(:,:,1)
+!$acc end kernels
+!
+!
+!!! wait for the computation of ZDU_DZ_DZS_DX
+!$acc wait(3)
+!
+CALL DXF_DEVICE(PUM(:,:,IKB:IKB),ZTMP1_DEVICE(:,:,1:1))
+CALL MXF_DEVICE(PDXX(:,:,IKB:IKB),ZTMP2_DEVICE(:,:,1:1))
+!$acc kernels async(3)
+ZDU_DX(:,:,1)=  ZTMP1_DEVICE(:,:,1) / ZTMP2_DEVICE(:,:,1) - ZDU_DZ_DZS_DX(:,:,1)
+!$acc end kernels
+
+!!! wait for the computation of ZDV_DZ_DZS_DY
+!$acc wait(4)
+!
+CALL DYF_DEVICE(PVM(:,:,IKB:IKB),ZTMP3_DEVICE(:,:,1:1))
+CALL MYF_DEVICE(PDYY(:,:,IKB:IKB),ZTMP4_DEVICE(:,:,1:1))
+!$acc kernels! async(4)
+ZDV_DY(:,:,1)=  ZTMP3_DEVICE(:,:,1) / ZTMP4_DEVICE(:,:,1) - ZDV_DZ_DZS_DY(:,:,1)
+!$acc end kernels
+!
+!
+!!! wait for the computation of ZDU_DX and ZDV_DY
+!$acc wait(3) async(4)
+#endif
+!
+!$acc kernels async(4)
+ZDW_DZ(:,:,:)=-ZDU_DX(:,:,:)-ZDV_DY(:,:,:)
+!$acc end kernels
+!
+!* computation 
+!
+!!! wait for the computation of ZFLX
+!$acc wait(2) async(4)
+!!! wait for the computation of ZDW_DZ
+!$acc wait(4)
+!
+! ! !!! we can launch the update of ZFLX on the part that has already been computed
+! ! !$acc update self(ZFLX(:,:,IKB+1:)) async(10)
+!attention !!!!! je ne comprends pas pourquoi mais ce update plante à l'execution...
+! du coup je ne peux pas faire de update self asynchrone...
+!
+!$acc kernels async(3)
+ZFLX(:,:,IKB)   = (2./3.) * PTKEM(:,:,IKB)                           &
+  - XCMFS * PK(:,:,IKB) * 2. * ZDU_DX(:,:,1)
+!$acc end kernels
+
+
+!!  &  to be tested later
+!! + XCMFB * PLM(:,:,IKB:IKB) /SQRT(PTKEM(:,:,IKB:IKB)) *        &
+!!   (-2./3.) * PTP(:,:,IKB:IKB)
+!
+! extrapolates this flux under the ground with the surface flux
+!
+!
+!!! wait for the computation of ZDIRSINZW
+!$acc wait(1)
+!
+!$acc kernels async(4)
+#ifndef MNH_BITREP
+ZFLX(:,:,IKB-1) =                                                            &
+        PTAU11M(:,:) * PCOSSLOPE(:,:)**2 * PDIRCOSZW(:,:)**2                 &
+  -2. * PTAU12M(:,:) * PCOSSLOPE(:,:)* PSINSLOPE(:,:) * PDIRCOSZW(:,:)       &
+  +     PTAU22M(:,:) * PSINSLOPE(:,:)**2                                     &
+  +     PTAU33M(:,:) * PCOSSLOPE(:,:)**2 * ZDIRSINZW(:,:)**2                 &
+  +2. * PCDUEFF(:,:) *      (                                                &
+      PVSLOPEM(:,:) * PCOSSLOPE(:,:)    * PSINSLOPE(:,:) * ZDIRSINZW(:,:)    &
+    - PUSLOPEM(:,:) * PCOSSLOPE(:,:)**2 * ZDIRSINZW(:,:) * PDIRCOSZW(:,:)    )
+#else
+!$acc loop independent collapse(2)   
+DO CONCURRENT ( JI=1:JIU,JJ=1:JJU )
+ZFLX(JI,JJ,IKB-1) =                                                             &
+        PTAU11M(JI,JJ) * BR_P2(PCOSSLOPE(JI,JJ)) * BR_P2(PDIRCOSZW(JI,JJ))          &
+  -2. * PTAU12M(JI,JJ) * PCOSSLOPE(JI,JJ)* PSINSLOPE(JI,JJ) * PDIRCOSZW(JI,JJ)        &
+  +     PTAU22M(JI,JJ) * BR_P2(PSINSLOPE(JI,JJ))                                  &
+  +     PTAU33M(JI,JJ) * BR_P2(PCOSSLOPE(JI,JJ)) * BR_P2(ZDIRSINZW(JI,JJ))          &
+  +2. * PCDUEFF(JI,JJ) *      (                                                 &
+      PVSLOPEM(JI,JJ) * PCOSSLOPE(JI,JJ)    * PSINSLOPE(JI,JJ) * ZDIRSINZW(JI,JJ)     &
+      - PUSLOPEM(JI,JJ) * BR_P2(PCOSSLOPE(JI,JJ)) * ZDIRSINZW(JI,JJ) * PDIRCOSZW(JI,JJ) )
+END DO ! CONCURRENT
+#endif
+!$acc end kernels
+! 
+!!! wait for the computation of ZFLX(:,:,IKB) and ZFLX(:,:,IKB-1)
+!$acc wait(3) async(4)
+!
+!$acc kernels async(4)
+ZFLX(:,:,IKB-1) = 2. * ZFLX(:,:,IKB-1) -  ZFLX(:,:,IKB)
+!$acc end kernels
+!
+!
+!!! wait for the computation of ZFLX(:,:,IKB-1)
+!$acc wait(4)
+!
+
+
+! ! !!! we can launch the update of ZFLX on the rest
+! ! !$acc update self(ZFLX(:,:,1:IKB)) async(11)
+! ! !
+! ! !!! and wait for the update self(ZFLX(...)) to complete
+! ! !$acc wait(10)
+! ! !$acc wait(11)
+!attention !!!!! je ne comprends pas pourquoi mais le update self(ZFLX(:,:,IKB+1:)) plante à l'execution...
+! du coup je ne peux pas faire de update self asynchrone...
+
+
+!
+!!! at this point there are no more async operations running
+!!! to be absolutely sure, we do a wait
+!$acc wait
+!
+!$acc update self(ZFLX)
+CALL UPDATE_HALO_ll(TZFIELDS_ll, IINFO_ll)
+!$acc update device(ZFLX) async(10)
+!
+IF ( OCLOSE_OUT .AND. OTURB_FLX ) THEN
+  ! stores <U U>  
+  TZFIELD%CMNHNAME   = 'U_VAR'
+  TZFIELD%CSTDNAME   = ''
+  TZFIELD%CLONGNAME  = 'U_VAR'
+  TZFIELD%CUNITS     = 'm2 s-2'
+  TZFIELD%CDIR       = 'XY'
+  TZFIELD%CCOMMENT   = 'X_Y_Z_U_VAR'
+  TZFIELD%NGRID      = 1
+  TZFIELD%NTYPE      = TYPEREAL
+  TZFIELD%NDIMS      = 3
+  TZFIELD%LTIMEDEP   = .TRUE.
+  CALL IO_Field_write(TPFILE,TZFIELD,ZFLX)
+END IF
+!
+! Complete the U tendency
+#ifndef MNH_OPENACC
+IF (.NOT. LFLAT) THEN
+  PRUS(:,:,:)=PRUS                                            &
+              -DXM(PRHODJ * ZFLX / MXF(PDXX) )                &
+              +DZF( PDZX / MZM(PDXX) * MXM( MZM(PRHODJ*ZFLX) * PINV_PDZZ ) )
+ELSE
+  PRUS(:,:,:)=PRUS -DXM(PRHODJ * ZFLX / MXF(PDXX) )
+END IF
+#else
+CALL MXF_DEVICE(PDXX, ZTMP1_DEVICE)
+!$acc kernels async(10)
+!$acc loop independent collapse(3)
+DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+   ZTMP2_DEVICE(JI,JJ,JK) = PRHODJ(JI,JJ,JK) * ZFLX(JI,JJ,JK) / ZTMP1_DEVICE(JI,JJ,JK)
+END DO !CONCURRENT
+!$acc end kernels
+!
+!!! wait for the computation of ZTMP2_DEVICE and the update of ZFLX
+!$acc wait(10)
+!
+CALL DXM_DEVICE(ZTMP2_DEVICE, ZTMP3_DEVICE)
+IF (.NOT. LFLAT) THEN
+  CALL MZM_DEVICE(PDXX,ZTMP1_DEVICE)
+  !$acc kernels
+  !$acc loop independent collapse(3)
+  DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+     ZTMP2_DEVICE(JI,JJ,JK) = PRHODJ(JI,JJ,JK) * ZFLX(JI,JJ,JK)
+  END DO !CONCURRENT
+  !$acc end kernels
+  CALL MZM_DEVICE(ZTMP2_DEVICE,ZTMP4_DEVICE)
+  !$acc kernels
+  !$acc loop independent collapse(3)
+  DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+     ZTMP2_DEVICE(JI,JJ,JK) = ZTMP4_DEVICE(JI,JJ,JK) * PINV_PDZZ(JI,JJ,JK)
+  END DO !CONCURRENT   
+  !$acc end kernels
+  CALL MXM_DEVICE( ZTMP2_DEVICE, ZTMP4_DEVICE )
+  !$acc kernels
+  !$acc loop independent collapse(3)
+  DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+     ZTMP2_DEVICE(JI,JJ,JK) = PDZX(JI,JJ,JK) / ZTMP1_DEVICE(JI,JJ,JK) * ZTMP4_DEVICE(JI,JJ,JK)
+  END DO !CONCURRENT   
+  !$acc end kernels
+  CALL DZF_DEVICE(1,IKU,1,ZTMP2_DEVICE,ZTMP1_DEVICE)
+  !$acc kernels async(1)
+  PRUS(:,:,:)=PRUS(:,:,:)                                           &
+              -ZTMP3_DEVICE(:,:,:)                &
+              +ZTMP1_DEVICE(:,:,:)
+  !$acc end kernels
+ELSE
+  !$acc kernels async(1)
+  PRUS(:,:,:)=PRUS(:,:,:) - ZTMP3_DEVICE(:,:,:)
+  !$acc end kernels
+END IF
+#endif
+!
+IF (KSPLT==1) THEN
+  ! Contribution to the dynamic production of TKE:
+   !$acc kernels async(2)
+   !$acc loop independent collapse(3)
+   DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+      ZWORK(JI,JJ,JK)     = - ZFLX(JI,JJ,JK) * GX_U_M_PUM(JI,JJ,JK)
+   END DO !CONCURRENT
+  !$acc end kernels
+  !
+  ! evaluate the dynamic production at w(IKB+1) in PDP(IKB)
+  !
+  !$acc kernels async(2)
+  ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDU_DX(:,:,1) + ZWORK(:,:,IKB+1) )
+  !$acc end kernels
+  !
+  !$acc kernels async(2)
+  PDP(:,:,:) = PDP(:,:,:) + ZWORK(:,:,:)
+  !$acc end kernels
+END IF
+!
+! Storage in the LES configuration
+!
+IF (LLES_CALL .AND. KSPLT==1) THEN
+  CALL SECOND_MNH(ZTIME1)
+!$acc data copy(X_LES_SUBGRID_U2,X_LES_RES_ddxa_U_SBG_UaU)
+  CALL LES_MEAN_SUBGRID( ZFLX, X_LES_SUBGRID_U2 ) 
+#ifndef MNH_OPENACC
+  CALL LES_MEAN_SUBGRID( -ZWORK, X_LES_RES_ddxa_U_SBG_UaU , .TRUE.)
+#else
+  !
+  !!! wait for the computation of ZWORK and PDP
+  !$acc wait(2)
+  !
+  !$acc kernels
+  ZTMP1_DEVICE = -ZWORK
+  !$acc end kernels
+  CALL LES_MEAN_SUBGRID( ZTMP1_DEVICE, X_LES_RES_ddxa_U_SBG_UaU , .TRUE.)
+  !
+#endif
+!$acc end data
+  CALL SECOND_MNH(ZTIME2)
+  XTIME_LES = XTIME_LES + ZTIME2 - ZTIME1
+END IF
+
+!
+!*      10.   < V'V'>
+!             -------
+!
+!!! wait for the computation of ZWORK and PDP (that uses ZFLX)
+!$acc wait(2)
+!
+! Computes the V variance
+IF (.NOT. L2D) THEN
+   !$acc kernels async(3)
+   !$acc loop independent collapse(3)
+   DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+      ZFLX(JI,JJ,JK)= (2./3.) * PTKEM(JI,JJ,JK)                                  &
+           - XCMFS * PK(JI,JJ,JK) *( (4./3.) * GY_V_M_PVM(JI,JJ,JK)                        &
+           -(2./3.) * ( GX_U_M_PUM(JI,JJ,JK)                      &
+           +GZ_W_M_PWM(JI,JJ,JK)                ) )
+   END DO !CONCURRENT
+   !$acc end kernels
+  !! &  to be tested
+  !!  + XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
+  !
+ELSE
+  !$acc kernels async(3)
+  ZFLX(:,:,:)= (2./3.) * PTKEM                                  &
+    - XCMFS * PK *(-(2./3.) * ( GX_U_M_PUM                      &
+                               +GZ_W_M_PWM                ) )  
+  !$acc end kernels
+  !! &  to be tested
+  !!  + XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
+  !
+END IF
+!
+!$acc kernels async(3)
+ZFLX(:,:,IKE+1) = ZFLX(:,:,IKE) 
+!$acc end kernels
+!
+! ! !!! wait for the computation of ZFLX to begin the update
+! ! !$acc wait(3)
+! ! !$acc update self(ZFLX(:,:,IKB+1:)) async(10)
+!
+!$acc kernels async(3)
+ZFLX(:,:,IKB)   = (2./3.) * PTKEM(:,:,IKB)                           &
+  - XCMFS * PK(:,:,IKB) * 2. * ZDV_DY(:,:,1)
+!$acc end kernels
+
+!!           & to be tested
+!! + XCMFB * PLM(:,:,IKB:IKB) /SQRT(PTKEM(:,:,IKB:IKB)) *         &
+!!   (-2./3.) * PTP(:,:,IKB:IKB)
+!
+! extrapolates this flux under the ground with the surface flux
+!$acc kernels async(3)
+#ifndef MNH_BITREP
+ZFLX(:,:,IKB-1) =                                                            &
+        PTAU11M(:,:) * PSINSLOPE(:,:)**2 * PDIRCOSZW(:,:)**2                 &         
+  +2. * PTAU12M(:,:) * PCOSSLOPE(:,:)* PSINSLOPE(:,:) * PDIRCOSZW(:,:)       &
+  +     PTAU22M(:,:) * PCOSSLOPE(:,:)**2                                     &
+  +     PTAU33M(:,:) * PSINSLOPE(:,:)**2 * ZDIRSINZW(:,:)**2                 &
+  -2. * PCDUEFF(:,:)*       (                                                &
+      PUSLOPEM(:,:) * PSINSLOPE(:,:)**2 * ZDIRSINZW(:,:) * PDIRCOSZW(:,:)    &
+    + PVSLOPEM(:,:) * PCOSSLOPE(:,:)    * PSINSLOPE(:,:) * ZDIRSINZW(:,:)    )
+#else
+!$acc loop independent collapse(2)   
+DO CONCURRENT ( JI=1:JIU,JJ=1:JJU )
+ZFLX(JI,JJ,IKB-1) =                                                             &
+        PTAU11M(JI,JJ) * BR_P2(PSINSLOPE(JI,JJ)) * BR_P2(PDIRCOSZW(JI,JJ))          &
+  +2. * PTAU12M(JI,JJ) * PCOSSLOPE(JI,JJ)* PSINSLOPE(JI,JJ) * PDIRCOSZW(JI,JJ)        &
+  +     PTAU22M(JI,JJ) * BR_P2(PCOSSLOPE(JI,JJ))                                  &
+  +     PTAU33M(JI,JJ) * BR_P2(PSINSLOPE(JI,JJ)) * BR_P2(ZDIRSINZW(JI,JJ))          &
+  -2. * PCDUEFF(JI,JJ)*       (                                                 &
+      PUSLOPEM(JI,JJ) * BR_P2(PSINSLOPE(JI,JJ)) * ZDIRSINZW(JI,JJ) * PDIRCOSZW(JI,JJ) &
+      + PVSLOPEM(JI,JJ) * PCOSSLOPE(JI,JJ)    * PSINSLOPE(JI,JJ) * ZDIRSINZW(JI,JJ)     )
+END DO ! CONCURRENT
+#endif
+!$acc end kernels
+! 
+!$acc kernels async(3)
+ZFLX(:,:,IKB-1) = 2. * ZFLX(:,:,IKB-1) -  ZFLX(:,:,IKB)
+!$acc end kernels
+!
+!
+! ! !!! wait for the computation of ZFLX(:,:,1:IKB) to begin the update
+! ! !$acc update self(ZFLX(:,:,IKB+1:)) async(3)
+! ! !
+! ! !!! and wait for the update self(ZFLX(...)) to complete
+! ! !$acc wait(10)
+! ! !$acc wait(3)
+!
+!$acc wait(3)
+!$acc update self(ZFLX)
+CALL UPDATE_HALO_ll(TZFIELDS_ll, IINFO_ll)
+!$acc update device(ZFLX) async(10)
+!
+IF ( OCLOSE_OUT .AND. OTURB_FLX ) THEN
+  ! stores <V V>  
+  TZFIELD%CMNHNAME   = 'V_VAR'
+  TZFIELD%CSTDNAME   = ''
+  TZFIELD%CLONGNAME  = 'V_VAR'
+  TZFIELD%CUNITS     = 'm2 s-2'
+  TZFIELD%CDIR       = 'XY'
+  TZFIELD%CCOMMENT   = 'X_Y_Z_V_VAR'
+  TZFIELD%NGRID      = 1
+  TZFIELD%NTYPE      = TYPEREAL
+  TZFIELD%NDIMS      = 3
+  TZFIELD%LTIMEDEP   = .TRUE.
+  CALL IO_Field_write(TPFILE,TZFIELD,ZFLX)
+END IF
+!
+!!! wait for the computation of PRUS (that uses ZTMP1_DEVICE and ZTMP3_DEVICE)
+!$acc wait(1)
+!
+!
+!
+! Complete the V tendency
+IF (.NOT. L2D) THEN
+#ifndef MNH_OPENACC
+  IF (.NOT. LFLAT) THEN
+    PRVS(:,:,:)=PRVS                                          &
+                -DYM(PRHODJ * ZFLX / MYF(PDYY) )              &
+                +DZF( PDZY / MZM(PDYY) *                      &
+                MYM( MZM(PRHODJ*ZFLX) * PINV_PDZZ ) )
+  ELSE
+    PRVS(:,:,:)=PRVS -DYM(PRHODJ * ZFLX / MYF(PDYY) )
+  END IF
+!
+! Contribution to the dynamic production of TKE:
+  IF (KSPLT==1) ZWORK(:,:,:)     = - ZFLX(:,:,:) * GY_V_M_PVM
+#else
+  CALL MYF_DEVICE(PDYY, ZTMP1_DEVICE)
+  !$acc kernels async(10)
+  !$acc loop independent collapse(3)
+  DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+     ZTMP2_DEVICE(JI,JJ,JK) = PRHODJ(JI,JJ,JK) * ZFLX(JI,JJ,JK) / ZTMP1_DEVICE(JI,JJ,JK)
+  END DO !CONCURRENT   
+  !$acc end kernels
+  !
+  !!! wait for the computation of ZTMP2_DEVICE and the update of ZFLX
+  !$acc wait(10)
+  !
+  CALL DYM_DEVICE( ZTMP2_DEVICE,ZTMP3_DEVICE )
+  IF (.NOT. LFLAT) THEN
+    CALL MZM_DEVICE(PDYY,ZTMP1_DEVICE)
+    !$acc kernels
+    !$acc loop independent collapse(3)
+    DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+       ZTMP2_DEVICE(JI,JJ,JK) = PRHODJ(JI,JJ,JK) * ZFLX(JI,JJ,JK)
+    END DO !CONCURRENT   
+    !$acc end kernels
+    CALL MZM_DEVICE(ZTMP2_DEVICE,ZTMP4_DEVICE)
+    !$acc kernels
+    !$acc loop independent collapse(3)
+    DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+       ZTMP2_DEVICE(JI,JJ,JK) = ZTMP4_DEVICE(JI,JJ,JK) * PINV_PDZZ(JI,JJ,JK)
+    END DO !CONCURRENT   
+    !$acc end kernels
+    CALL MYM_DEVICE( ZTMP2_DEVICE,ZTMP4_DEVICE )
+    !$acc kernels
+    !$acc loop independent collapse(3)
+    DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+       ZTMP2_DEVICE(JI,JJ,JK) = PDZY(JI,JJ,JK) / ZTMP1_DEVICE(JI,JJ,JK) * ZTMP4_DEVICE(JI,JJ,JK)
+    END DO !CONCURRENT   
+    !$acc end kernels
+    CALL DZF_DEVICE(1,IKU,1,ZTMP2_DEVICE,ZTMP4_DEVICE )
+    !$acc kernels async(1)
+    PRVS(:,:,:)=PRVS(:,:,:)                                          &
+                -ZTMP3_DEVICE(:,:,:)              &
+                +ZTMP4_DEVICE(:,:,:)
+    !$acc end kernels
+  ELSE
+     !$acc kernels async(1)
+     !$acc loop independent collapse(3)
+     DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+        PRVS(JI,JJ,JK)=PRVS(JI,JJ,JK) - ZTMP3_DEVICE(JI,JJ,JK)
+     END DO !CONCURRENT    
+    !$acc end kernels
+  END IF
+! Contribution to the dynamic production of TKE:
+  IF (KSPLT==1) THEN
+     !$acc kernels async(2)
+     !$acc loop independent collapse(3)
+     DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+        ZWORK(JI,JJ,JK) = - ZFLX(JI,JJ,JK) * GY_V_M_PVM(JI,JJ,JK)
+     END DO !CONCURRENT   
+    !$acc end kernels
+  ENDIF
+#endif
+ELSE
+  !$acc kernels async(2)
+  ZWORK(:,:,:)     = 0.
+  !$acc end kernels
+END IF
+!
+IF (KSPLT==1) THEN
+  !
+  ! evaluate the dynamic production at w(IKB+1) in PDP(IKB)
+  !
+  !$acc kernels async(2)
+  ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDV_DY(:,:,1) + ZWORK(:,:,IKB+1) )
+  !$acc end kernels
+  !
+  !$acc kernels async(2)
+  PDP(:,:,:) = PDP(:,:,:) + ZWORK(:,:,:)
+  !$acc end kernels
+END IF
+!
+! Storage in the LES configuration
+!
+IF (LLES_CALL .AND. KSPLT==1) THEN
+  CALL SECOND_MNH(ZTIME1)
+!$acc data copy(X_LES_SUBGRID_V2,X_LES_RES_ddxa_V_SBG_UaV)
+  CALL LES_MEAN_SUBGRID( ZFLX, X_LES_SUBGRID_V2 ) 
+#ifndef MNH_OPENACC
+  CALL LES_MEAN_SUBGRID( -ZWORK, X_LES_RES_ddxa_V_SBG_UaV , .TRUE.)
+#else
+  !
+  !!! wait for the computation of ZWORK and PDP
+  !$acc wait(2)
+  !
+  !$acc kernels
+  ZTMP1_DEVICE = -ZWORK
+  !$acc end kernels
+  CALL LES_MEAN_SUBGRID( ZTMP1_DEVICE, X_LES_RES_ddxa_V_SBG_UaV , .TRUE.)
+  !
+#endif
+!$acc end data
+  CALL SECOND_MNH(ZTIME2)
+  XTIME_LES = XTIME_LES + ZTIME2 - ZTIME1
+END IF
+!
+!*      11.   < W'W'>
+!             -------
+!
+! Computes the W variance
+IF (.NOT. L2D) THEN
+   !$acc kernels async(2)
+   !$acc loop independent collapse(3)
+   DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+      ZFLX(JI,JJ,JK) = (2./3.) * PTKEM(JI,JJ,JK)                                  &
+           - XCMFS * PK(JI,JJ,JK) *( (4./3.) * GZ_W_M_PWM(JI,JJ,JK)                        &
+           -(2./3.) * ( GX_U_M_PUM(JI,JJ,JK)                      &
+           +GY_V_M_PVM(JI,JJ,JK)                ) )
+   END DO !CONCURRENT
+  !$acc end kernels
+  !!  &  to be tested
+  !!    -2.* XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
+ELSE
+  !$acc kernels async(2)
+  ZFLX(:,:,:)= (2./3.) * PTKEM                                  &
+    - XCMFS * PK *( (4./3.) * GZ_W_M_PWM                        &
+                   -(2./3.) * ( GX_U_M_PUM                ) ) 
+  !$acc end kernels
+  !!  &  to be tested
+  !!    -2.* XCMFB *  PLM / SQRT(PTKEM) * (-2./3.) * PTP 
+END IF
+!
+!$acc kernels async(2)
+ZFLX(:,:,IKE+1)= ZFLX(:,:,IKE)
+!$acc end kernels
+!
+!!! wait for the computation of ZWORK, PDP and ZFLX
+!$acc wait(2)
+!
+!
+!$acc kernels async(2)
+ZFLX(:,:,IKB)   = (2./3.) * PTKEM(:,:,IKB)                           &
+  - XCMFS * PK(:,:,IKB) * 2. * ZDW_DZ(:,:,1)
+!$acc end kernels
+!
+
+!             &  to be tested
+!   - 2.* XCMFB * PLM(:,:,IKB:IKB) /SQRT(PTKEM(:,:,IKB:IKB)) *             &
+!  (-2./3.) * PTP(:,:,IKB:IKB)
+! extrapolates this flux under the ground with the surface flux
+!$acc kernels async(3)
+#ifndef MNH_BITREP
+ZFLX(:,:,IKB-1) = &    
+        PTAU11M(:,:) * ZDIRSINZW(:,:)**2                                &
+  +     PTAU33M(:,:) * PDIRCOSZW(:,:)**2                                &
+  +2. * PCDUEFF(:,:)* PUSLOPEM(:,:)  * ZDIRSINZW(:,:) * PDIRCOSZW(:,:)
+#else
+!$acc loop independent collapse(2)
+DO CONCURRENT ( JI=1:JIU,JJ=1:JJU )        
+ZFLX(JI,JJ,IKB-1) = &
+        PTAU11M(JI,JJ) * BR_P2(ZDIRSINZW(JI,JJ))                                &
+  +     PTAU33M(JI,JJ) * BR_P2(PDIRCOSZW(JI,JJ))                                &
+  +2. * PCDUEFF(JI,JJ)* PUSLOPEM(JI,JJ)  * ZDIRSINZW(JI,JJ) * PDIRCOSZW(JI,JJ)
+END DO ! CONCURRENT        
+#endif
+!$acc end kernels
+  ! 
+!
+!!! wait for the computation of ZFLX(:,:,IKB-1) and ZFLX(:,:,IKB)
+!$acc wait(2) async(3)
+!
+!$acc kernels async(3)
+ZFLX(:,:,IKB-1) = 2. * ZFLX(:,:,IKB-1) - ZFLX(:,:,IKB)
+!$acc end kernels
+!
+IF ( OCLOSE_OUT .AND. OTURB_FLX ) THEN
+  !$acc wait(3)
+  !$acc update self(ZFLX)
+  ! stores <W W>  
+  TZFIELD%CMNHNAME   = 'W_VAR'
+  TZFIELD%CSTDNAME   = ''
+  TZFIELD%CLONGNAME  = 'W_VAR'
+  TZFIELD%CUNITS     = 'm2 s-2'
+  TZFIELD%CDIR       = 'XY'
+  TZFIELD%CCOMMENT   = 'X_Y_Z_W_VAR'
+  TZFIELD%NGRID      = 1
+  TZFIELD%NTYPE      = TYPEREAL
+  TZFIELD%NDIMS      = 3
+  TZFIELD%LTIMEDEP   = .TRUE.
+  CALL IO_Field_write(TPFILE,TZFIELD,ZFLX)
+END IF
+!
+!
+!!! wait for the computation of PRVS (that uses ZTMP1_DEVICE and ZTMP3_DEVICE)
+!$acc wait(1)
+!
+
+!
+! Complete the W tendency
+!
+!PRWS(:,:,:)=PRWS(:,:,:) - DZM( PRHODJ*ZFLX/MZF(PDZZ) )
+!$acc kernels async(2)
+ZDFDDWDZ(:,:,:)    = - XCMFS * PK(:,:,:) * (4./3.)
+!$acc end kernels
+!$acc kernels async(2)
+ZDFDDWDZ(:,:,:IKB) = 0.
+!$acc end kernels
+!
+!!! wait for the computation of ZFLX(:,:,IKB-1) and ZDFDDWDZ
+!$acc wait(3) async(2)
+!$acc wait(2)
+!
+CALL TRIDIAG_W(PWM,ZFLX,ZDFDDWDZ,PTSTEP,ZMZF_DZZ,PRHODJ,ZWP)
+!
+#ifndef MNH_OPENACC
+PRWS = PRWS(:,:,:) + MZM(PRHODJ(:,:,:))*(ZWP(:,:,:)-PWM(:,:,:))/PTSTEP
+#else
+CALL MZM_DEVICE(PRHODJ(:,:,:),ZTMP1_DEVICE)
+!$acc kernels async(1)
+PRWS = PRWS(:,:,:) + ZTMP1_DEVICE *(ZWP(:,:,:)-PWM(:,:,:))/PTSTEP
+!$acc end kernels
+#endif
+!
+!* recomputes flux using guess of W
+!
+#ifndef MNH_OPENACC
+GZ_W_M_ZWP = GZ_W_M(ZWP,PDZZ)
+#else
+CALL GZ_W_M_DEVICE(1,IKU,1,ZWP,PDZZ,GZ_W_M_ZWP)
+#endif
+!$acc kernels async(2)
+!$acc loop independent collapse(3)
+DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=IKB+1:JKU)
+   ZFLX(JI,JJ,JK)=ZFLX(JI,JJ,JK) &
+        - XCMFS * PK(JI,JJ,JK) * (4./3.) * (GZ_W_M_ZWP(JI,JJ,JK) - GZ_W_M_PWM(JI,JJ,JK))
+END DO !CONCURRENT
+!$acc end kernels
+!
+IF (KSPLT==1) THEN
+   !Contribution to the dynamic production of TKE:
+   !$acc kernels async(2)
+   !$acc loop independent collapse(3)
+   DO CONCURRENT ( JI=1:JIU,JJ=1:JJU,JK=1:JKU)
+      ZWORK(JI,JJ,JK) = - ZFLX(JI,JJ,JK) * GZ_W_M_ZWP(JI,JJ,JK)
+   END DO !CONCURRENT   
+   !$acc end kernels
+  !
+  ! evaluate the dynamic production at w(IKB+1) in PDP(IKB)
+  !
+  !$acc kernels async(2)
+  ZWORK(:,:,IKB) = 0.5* ( -ZFLX(:,:,IKB)*ZDW_DZ(:,:,1) + ZWORK(:,:,IKB+1) )
+  !$acc end kernels
+  !
+  !$acc kernels async(2)
+  PDP(:,:,:) = PDP(:,:,:) + ZWORK(:,:,:)
+  !$acc end kernels
+END IF
+!
+! Storage in the LES configuration
+!
+!
+IF (LLES_CALL .AND. KSPLT==1) THEN
+  CALL SECOND_MNH(ZTIME1)
+#ifndef MNH_OPENACC
+  CALL LES_MEAN_SUBGRID( ZFLX, X_LES_SUBGRID_W2 ) 
+  CALL LES_MEAN_SUBGRID( -ZWORK, X_LES_RES_ddxa_W_SBG_UaW , .TRUE.)
+  CALL LES_MEAN_SUBGRID( GZ_M_M(PTHLM,PDZZ)*ZFLX, X_LES_RES_ddxa_Thl_SBG_UaW , .TRUE.)
+  CALL LES_MEAN_SUBGRID(ZFLX*MZF(GZ_M_W(1,IKU,1,PTHLM,PDZZ)),X_LES_RES_ddz_Thl_SBG_W2)
+  IF (KRR>=1) THEN
+    CALL LES_MEAN_SUBGRID( GZ_M_M(PRM(:,:,:,1),PDZZ)*ZFLX, &
+                           X_LES_RES_ddxa_Rt_SBG_UaW , .TRUE.)
+    CALL LES_MEAN_SUBGRID(ZFLX*MZF(GZ_M_W(1,IKU,1,PRM(:,:,:,1),PDZZ)), &
+                           X_LES_RES_ddz_Rt_SBG_W2)
+  END IF
+  DO JSV=1,NSV
+    CALL LES_MEAN_SUBGRID( GZ_M_M(PSVM(:,:,:,JSV),PDZZ)*ZFLX, &
+                           X_LES_RES_ddxa_Sv_SBG_UaW(:,:,:,JSV) , .TRUE.)
+    CALL LES_MEAN_SUBGRID(ZFLX*MZF(GZ_M_W(1,IKU,1,PSVM(:,:,:,JSV),PDZZ)), &
+                           X_LES_RES_ddz_Sv_SBG_W2(:,:,:,JSV))
+  END DO
+#else
+!$acc data copy(X_LES_SUBGRID_W2,X_LES_RES_ddxa_W_SBG_UaW,X_LES_RES_ddxa_Thl_SBG_UaW,X_LES_RES_ddz_Thl_SBG_W2)
+  !
+  CALL LES_MEAN_SUBGRID( ZFLX, X_LES_SUBGRID_W2 ) 
+  !
+  !
+  !!! wait for the computation of ZFLX, ZDP and ZWORK
+  !$acc wait(2)
+  !
+  !$acc kernels
+  ZTMP1_DEVICE = -ZWORK
+  !$acc end kernels
+  CALL LES_MEAN_SUBGRID( ZTMP1_DEVICE, X_LES_RES_ddxa_W_SBG_UaW , .TRUE.)
+  !
+  CALL GZ_M_M_DEVICE(1,IKU,1,PTHLM,PDZZ,ZTMP1_DEVICE)
+  !$acc kernels
+  ZTMP2_DEVICE = ZTMP1_DEVICE * ZFLX
+  !$acc end kernels
+  CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, X_LES_RES_ddxa_Thl_SBG_UaW , .TRUE.)
+  !
+  CALL GZ_M_W_DEVICE(1,IKU,1,PTHLM,PDZZ,ZTMP1_DEVICE)
+  CALL MZF_DEVICE(1,IKU,1,ZTMP1_DEVICE,ZTMP2_DEVICE)
+  !$acc kernels
+  ZTMP3_DEVICE = ZFLX*ZTMP2_DEVICE
+  !$acc end kernels
+  CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE,X_LES_RES_ddz_Thl_SBG_W2)
+  !
+!$acc end data
+  !
+  IF (KRR>=1) THEN
+!$acc data copy(X_LES_RES_ddxa_Rt_SBG_UaW,X_LES_RES_ddz_Rt_SBG_W2)
+    !
+    CALL GZ_M_M_DEVICE(1,IKU,1,PRM(:,:,:,1),PDZZ,ZTMP1_DEVICE)
+    !$acc kernels
+    ZTMP2_DEVICE = ZTMP1_DEVICE*ZFLX
+    !$acc end kernels
+    CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, X_LES_RES_ddxa_Rt_SBG_UaW , .TRUE.)
+    !
+    CALL GZ_M_W_DEVICE(1,IKU,1,PRM(:,:,:,1),PDZZ,ZTMP1_DEVICE)
+    CALL MZF_DEVICE(1,IKU,1,ZTMP1_DEVICE,ZTMP2_DEVICE)
+    !$acc kernels
+    ZTMP3_DEVICE = ZFLX*ZTMP2_DEVICE
+    !$acc end kernels
+    CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE, X_LES_RES_ddz_Rt_SBG_W2)
+    !
+!$acc end data
+  END IF
+!$acc data copy(X_LES_RES_ddxa_Sv_SBG_UaW,X_LES_RES_ddz_Sv_SBG_W2)
+  DO JSV=1,NSV
+    !
+    !
+    CALL GZ_M_M_DEVICE(1,IKU,1,PSVM(:,:,:,JSV),PDZZ,ZTMP1_DEVICE)
+    !$acc kernels
+    ZTMP2_DEVICE = ZTMP1_DEVICE*ZFLX
+    !$acc end kernels
+    CALL LES_MEAN_SUBGRID( ZTMP2_DEVICE, &
+                           X_LES_RES_ddxa_Sv_SBG_UaW(:,:,:,JSV) , .TRUE.)
+    !
+    CALL GZ_M_W_DEVICE(1,IKU,1,PSVM(:,:,:,JSV),PDZZ,ZTMP1_DEVICE)
+    CALL MZF_DEVICE(1,IKU,1,ZTMP1_DEVICE,ZTMP2_DEVICE)
+    !$acc kernels
+    ZTMP3_DEVICE = ZFLX*ZTMP2_DEVICE
+    !$acc end kernels
+    CALL LES_MEAN_SUBGRID(ZTMP3_DEVICE, X_LES_RES_ddz_Sv_SBG_W2(:,:,:,JSV))
+    !
+    !
+  END DO
+!$acc end data
+#endif
+  CALL SECOND_MNH(ZTIME2)
+  XTIME_LES = XTIME_LES + ZTIME2 - ZTIME1
+END IF
+!
+!
+!!! wait for the computation of ZFLX, ZDP and ZWORK
+!$acc wait(2)
+!!! wait for the computation of PRWS
+!$acc wait(1)
+!
+!!! et un dernier wait pour etre sur
+!$acc wait
+!
+CALL CLEANLIST_ll(TZFIELDS_ll)
+
+if ( mppdb_initialized ) then
+  !Check all inout arrays
+  call Mppdb_check( prus,   "Turb_hor_dyn_corr end:prus"   )
+  call Mppdb_check( prvs,   "Turb_hor_dyn_corr end:prvs"   )
+  call Mppdb_check( prws,   "Turb_hor_dyn_corr end:prws"   )
+  call Mppdb_check( pdp,    "Turb_hor_dyn_corr end:pdp"    )
+  call Mppdb_check( ptp,    "Turb_hor_dyn_corr end:ptp"    )
+end if
+
+!$acc end data
+
+#ifndef MNH_OPENACC
+DEALLOCATE (ZFLX, ZWORK, ZDIRSINZW, ZCOEFF, ZDZZ,                  &
+            GX_U_M_PUM, GY_V_M_PVM, GZ_W_M_PWM, GZ_W_M_ZWP,        &
+            ZMZF_DZZ, ZDFDDWDZ, ZWP,                               &
+            ZDU_DZ_DZS_DX, ZDV_DZ_DZS_DY, ZDU_DX, ZDV_DY, ZDW_DZ   )
+#else
+CALL MNH_REL_ZT3D(IZTMP1_DEVICE, IZTMP2_DEVICE, IZTMP3_DEVICE, IZTMP4_DEVICE )
+CALL MNH_REL_ZT3D(IZFLX, IZWORK, IZDIRSINZW,          &
+            IGX_U_M_PUM, IGY_V_M_PVM, IGZ_W_M_PWM, IGZ_W_M_ZWP,        &
+            IZMZF_DZZ, IZDFDDWDZ, IZWP,                                &
+            IZDU_DZ_DZS_DX, IZDV_DZ_DZS_DY, IZDU_DX, IZDV_DY, IZDW_DZ, &
+            IZCOEFF, IZDZZ             )
+
+#endif
+     
+
+!$acc end data
+
+END SUBROUTINE TURB_HOR_DYN_CORR
-- 
GitLab