diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 index 77f0f54eea9d6a6d4963233f27d48f212c42af38..b7ac6348d477ebc9952d9ea235c82414e1856b5a 100644 --- a/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 +++ b/src/ZSOLVER/tensorproductmultigrid_Source/communication.f90 @@ -64,15 +64,15 @@ public::collect public::distribute public::i_am_master_mpi public::master_rank -public::pproc +public::pproc,pprocx,pprocy public::MPI_COMM_HORIZ public::comm_parameters public::comm_measuretime ! Number of processors - ! n_proc = 2^(2*pproc), with integer pproc - integer :: pproc + ! n_proc = 2^(1*pproc) = 2^pprocx*2^pprocy ! 2^(2*pproc), with integer pproc + integer :: pproc,pprocx,pprocy ! Rank of master process integer, parameter :: master_rank = 0 @@ -216,14 +216,14 @@ contains i_am_master_mpi = (rank == master_rank) ! Check that nproc = 2^(2*p) !pproc = floor(log(1.0d0*nproc)/log(4.0d0)) - pproc = nint(log(1.0d0*nproc)/log(4.0d0)) - if ( (nproc - 4**pproc) .ne. 0) then - print*,"Number of processors has to be 2^(2*pproc) with integer nproc,pproc=",nproc,pproc - call fatalerror("Number of processors has to be 2^(2*pproc) with integer pproc.") + pproc = nint(log(1.0d0*nproc)/log(2.0d0)) + if ( (nproc - 2**pproc) .ne. 0) then + print*,"Number of processors has to be 2^(1*pproc) with integer nproc,pproc=",nproc,pproc + call fatalerror("Number of processors has to be 2^(1*pproc) with integer pproc.") end if if (i_am_master_mpi) then write(STDOUT,'("PARALLEL RUN")') - write(STDOUT,'("Number of processors : 2^(2*pproc) = ",I10," with pproc = ",I6)') & + write(STDOUT,'("Number of processors : 2^(1*pproc) = ",I10," with pproc = ",I6)') & nproc, pproc end if ! Create halo data types @@ -256,18 +256,27 @@ contains integer,parameter :: nb_dims=3 integer,dimension(nb_dims) :: profil_tab,profil_sous_tab,coord_debut + integer :: nlocalx,nlocaly + n = grid_param%n nz = grid_param%nz comm_param = comm_param_in halo_size = comm_param%halo_size +#ifndef MPIVIDE call mpi_comm_size(MPI_COMM_WORLD, nproc, ierr) - + ! + ! Manage nproc=2^pproc ( and not 2^2pproc ) + ! + pprocx = pproc / 2 + pprocy = pproc - pprocx + pproc = pprocx + if (i_am_master_mpi) write(STDOUT,*) "pproc=",pproc," pprocx=",pprocx," pprocy=",pprocy ! Create cartesian topology call mpi_cart_create(MPI_COMM_WORLD, & ! Old communicator name dim_horiz, & ! horizontal dimension - (/2**pproc,2**pproc/), & ! extent in each horizontal direction + (/2**pprocy,2**pprocx/), & ! extent in each horizontal direction (/.false.,.false./), & ! periodic? .false., & ! reorder? MPI_COMM_HORIZ, & ! Name of new communicator @@ -275,9 +284,11 @@ contains ! calculate and display rank and corrdinates in cartesian grid call mpi_comm_rank(MPI_COMM_HORIZ, rank, ierr) call mpi_cart_coords(MPI_COMM_HORIZ,rank,dim_horiz,p_horiz,ierr) - +#endif ! Local size of (horizontal) grid nlocal = n/2**pproc + nlocaly = n/2**pprocy + nlocalx = n/2**pprocx ! === Set up data types === ! Halo for exchange in north-south direction @@ -320,85 +331,101 @@ contains count = nlocal blocklength = (nz+2)*halo_size stride = (nlocal+2*halo_size)*(nz+2) +#ifndef MPIVIDE call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION, & halo_ns(level,m),ierr) call mpi_type_commit(halo_ns(level,m),ierr) +#endif endif ! tranpose if (LUseT) then ! NS- (y-) transpose direction count = nz+2 ! nlocal - blocklength = nlocal*halo_size ! (nz+2)*halo_size - stride = (nlocal+2*halo_size) * (nlocal+2*halo_size) ! (nlocal+2*halo_size)*(nz+2) + blocklength = nlocalx*halo_size ! (nz+2)*halo_size + stride = (nlocalx+2*halo_size) * (nlocaly+2*halo_size) ! (nlocal+2*halo_size)*(nz+2) +#ifndef MPIVIDE call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION, & halo_nst(level,m),ierr) call mpi_type_commit(halo_nst(level,m),ierr) +#endif ! allocate send/recv buffer for GPU copy/managed - call mnh_allocate_mg_halo(tab_halo_nt(level,m)%haloTin,nlocal,halo_size,nz+2) - call mnh_allocate_mg_halo(tab_halo_nt(level,m)%haloTout,nlocal,halo_size,nz+2) - call mnh_allocate_mg_halo(tab_halo_st(level,m)%haloTin,nlocal,halo_size,nz+2) - call mnh_allocate_mg_halo(tab_halo_st(level,m)%haloTout,nlocal,halo_size,nz+2) + call mnh_allocate_mg_halo(tab_halo_nt(level,m)%haloTin,nlocalx,halo_size,nz+2) + call mnh_allocate_mg_halo(tab_halo_nt(level,m)%haloTout,nlocalx,halo_size,nz+2) + call mnh_allocate_mg_halo(tab_halo_st(level,m)%haloTin,nlocalx,halo_size,nz+2) + call mnh_allocate_mg_halo(tab_halo_st(level,m)%haloTout,nlocalx,halo_size,nz+2) ! WE- (x-) transpose direction - count = (nz+2)*(nlocal+2*halo_size)*halo_size ! nlocal + count = (nz+2)*(nlocaly+2*halo_size)*halo_size ! nlocal blocklength = 1*halo_size ! (nz+2)*halo_size - stride = nlocal+2*halo_size ! (nlocal+2*halo_size)*(nz+2) + stride = nlocalx+2*halo_size ! (nlocal+2*halo_size)*(nz+2) +#ifndef MPIVIDE call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION, & halo_wet(level,m),ierr) call mpi_type_commit(halo_wet(level,m),ierr) +#endif ! allocate send/recv buffer for GPU copy/managed ! - call mnh_allocate_mg_halo(tab_halo_wt(level,m)%haloTin,halo_size,nlocal+2*halo_size,nz+2) - call mnh_allocate_mg_halo(tab_halo_wt(level,m)%haloTout,halo_size,nlocal+2*halo_size,nz+2) - call mnh_allocate_mg_halo(tab_halo_et(level,m)%haloTin,halo_size,nlocal+2*halo_size,nz+2) - call mnh_allocate_mg_halo(tab_halo_et(level,m)%haloTout,halo_size,nlocal+2*halo_size,nz+2) + call mnh_allocate_mg_halo(tab_halo_wt(level,m)%haloTin,halo_size,nlocaly+2*halo_size,nz+2) + call mnh_allocate_mg_halo(tab_halo_wt(level,m)%haloTout,halo_size,nlocaly+2*halo_size,nz+2) + call mnh_allocate_mg_halo(tab_halo_et(level,m)%haloTin,halo_size,nlocaly+2*halo_size,nz+2) + call mnh_allocate_mg_halo(tab_halo_et(level,m)%haloTout,halo_size,nlocaly+2*halo_size,nz+2) endif #ifndef NDEBUG +#ifndef MPIVIDE if (ierr .ne. 0) & call fatalerror("Commit halo_ns failed in mpi_type_commit().") +#endif #endif ! --- Create interior data types --- if (LUseO) then count = nlocal blocklength = nlocal*(nz+2) stride = (nz+2)*(nlocal+2*halo_size) +#ifndef MPIVIDE call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION,interior(level,m),ierr) call mpi_type_commit(interior(level,m),ierr) +#endif count = nlocal/2 blocklength = nlocal/2*(nz+2) stride = (nlocal+2*halo_size)*(nz+2) +#ifndef MPIVIDE call mpi_type_vector(count,blocklength,stride,MPI_DOUBLE_PRECISION,sub_interior(level,m),ierr) call mpi_type_commit(sub_interior(level,m),ierr) +#endif end if if (LUseT) then ! interiorT - if ( nlocal /= 0 ) then - profil_tab = (/ nlocal+2*halo_size , nlocal+2*halo_size , nz+2 /) - profil_sous_tab = (/ nlocal , nlocal , nz+2 /) + if ( ( nlocalx /= 0 ) .and. ( nlocaly /= 0 ) ) then + profil_tab = (/ nlocalx+2*halo_size , nlocaly+2*halo_size , nz+2 /) + profil_sous_tab = (/ nlocalx , nlocaly , nz+2 /) coord_debut = (/ 0 , 0 , 0 /) +#ifndef MPIVIDE call MPI_TYPE_CREATE_SUBARRAY(nb_dims,profil_tab,profil_sous_tab,coord_debut,& MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,interiorT(level,m),ierr) call mpi_type_commit(interiorT(level,m),ierr) - call mnh_allocate_mg_halo(tab_interiorT_ne(level,m)%haloTin,nlocal,nlocal,nz+2) - call mnh_allocate_mg_halo(tab_interiorT_ne(level,m)%haloTout,nlocal,nlocal,nz+2) - call mnh_allocate_mg_halo(tab_interiorT_sw(level,m)%haloTin,nlocal,nlocal,nz+2) - call mnh_allocate_mg_halo(tab_interiorT_sw(level,m)%haloTout,nlocal,nlocal,nz+2) - call mnh_allocate_mg_halo(tab_interiorT_se(level,m)%haloTin,nlocal,nlocal,nz+2) - call mnh_allocate_mg_halo(tab_interiorT_se(level,m)%haloTout,nlocal,nlocal,nz+2) +#endif + call mnh_allocate_mg_halo(tab_interiorT_ne(level,m)%haloTin,nlocalx,nlocaly,nz+2) + call mnh_allocate_mg_halo(tab_interiorT_ne(level,m)%haloTout,nlocalx,nlocaly,nz+2) + call mnh_allocate_mg_halo(tab_interiorT_sw(level,m)%haloTin,nlocalx,nlocaly,nz+2) + call mnh_allocate_mg_halo(tab_interiorT_sw(level,m)%haloTout,nlocalx,nlocaly,nz+2) + call mnh_allocate_mg_halo(tab_interiorT_se(level,m)%haloTin,nlocalx,nlocaly,nz+2) + call mnh_allocate_mg_halo(tab_interiorT_se(level,m)%haloTout,nlocalx,nlocaly,nz+2) end if ! sub_interiorT - if ( (nlocal/2) /= 0 ) then - profil_tab = (/ nlocal+2*halo_size , nlocal+2*halo_size , nz+2 /) - profil_sous_tab = (/ nlocal/2 , nlocal/2 , nz+2 /) + if ( ( (nlocalx/2) /= 0 ) .and. ( (nlocaly/2) /= 0 ) ) then + profil_tab = (/ nlocalx+2*halo_size , nlocaly+2*halo_size , nz+2 /) + profil_sous_tab = (/ nlocalx/2 , nlocaly/2 , nz+2 /) coord_debut = (/ 0 , 0 , 0 /) +#ifndef MPIVIDE call MPI_TYPE_CREATE_SUBARRAY(nb_dims,profil_tab,profil_sous_tab,coord_debut,& MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,sub_interiorT(level,m),ierr) call mpi_type_commit(sub_interiorT(level,m),ierr) - call mnh_allocate_mg_halo(tab_sub_interiorT_ne(level,m)%haloTin,nlocal/2,nlocal/2,nz+2) - call mnh_allocate_mg_halo(tab_sub_interiorT_ne(level,m)%haloTout,nlocal/2,nlocal/2,nz+2) - call mnh_allocate_mg_halo(tab_sub_interiorT_sw(level,m)%haloTin,nlocal/2,nlocal/2,nz+2) - call mnh_allocate_mg_halo(tab_sub_interiorT_sw(level,m)%haloTout,nlocal/2,nlocal/2,nz+2) - call mnh_allocate_mg_halo(tab_sub_interiorT_se(level,m)%haloTin,nlocal/2,nlocal/2,nz+2) - call mnh_allocate_mg_halo(tab_sub_interiorT_se(level,m)%haloTout,nlocal/2,nlocal/2,nz+2) +#endif + call mnh_allocate_mg_halo(tab_sub_interiorT_ne(level,m)%haloTin,nlocalx/2,nlocaly/2,nz+2) + call mnh_allocate_mg_halo(tab_sub_interiorT_ne(level,m)%haloTout,nlocalx/2,nlocaly/2,nz+2) + call mnh_allocate_mg_halo(tab_sub_interiorT_sw(level,m)%haloTin,nlocalx/2,nlocaly/2,nz+2) + call mnh_allocate_mg_halo(tab_sub_interiorT_sw(level,m)%haloTout,nlocalx/2,nlocaly/2,nz+2) + call mnh_allocate_mg_halo(tab_sub_interiorT_se(level,m)%haloTin,nlocalx/2,nlocaly/2,nz+2) + call mnh_allocate_mg_halo(tab_sub_interiorT_se(level,m)%haloTout,nlocalx/2,nlocaly/2,nz+2) end if end if ! --- Create timers --- @@ -410,11 +437,15 @@ contains reduced_m = .true. m = m-1 nlocal = 2*nlocal + nlocalx = 2*nlocalx + nlocaly = 2*nlocaly cycle end if reduced_m = .false. level = level-1 nlocal = nlocal/2 + nlocalx = nlocalx/2 + nlocaly = nlocaly/2 end do end subroutine comm_initialise @@ -436,9 +467,13 @@ contains integer :: nlocal,n character(len=80) :: s + integer :: nlocalx,nlocaly + ! Local size of (horizontal) grid n = grid_param%n nlocal = n/2**pproc + nlocalx = n/2**pprocx + nlocaly = n/2**pprocy m = pproc level = n_lev @@ -453,24 +488,30 @@ contains ! --- Print out timer information --- call print_elapsed(t_haloswap(level,m),.True.,1.0_rl) ! --- Free halo data types --- +#ifndef MPIVIDE if (LUseO) call mpi_type_free(halo_ns(level,m),ierr) if (LUseT) call mpi_type_free(halo_nst(level,m),ierr) if (LUseT) call mpi_type_free(halo_wet(level,m),ierr) ! --- Free interior data types --- if (LUseO) call mpi_type_free(interior(level,m),ierr) if (LUseO) call mpi_type_free(sub_interior(level,m),ierr) - if (LUseT .and. (nlocal /= 0 ) ) call mpi_type_free(interiorT(level,m),ierr) - if (LUseT .and. ( (nlocal/2) /= 0 ) ) call mpi_type_free(sub_interiorT(level,m),ierr) + if (LUseT .and. (nlocalx /= 0 ) .and. ( nlocaly /= 0 ) ) call mpi_type_free(interiorT(level,m),ierr) + if (LUseT .and. ( (nlocalx/2) /= 0 ) .and. ( (nlocaly/2) /= 0 )) call mpi_type_free(sub_interiorT(level,m),ierr) +#endif ! If we are below L_split, split data if ( (level .le. lev_split) .and. (m > 0) .and. (.not. reduced_m)) then reduced_m = .true. m = m-1 nlocal = 2*nlocal + nlocalx = 2*nlocalx + nlocaly = 2*nlocaly cycle end if reduced_m = .false. level = level-1 nlocal = nlocal/2 + nlocalx = nlocalx/2 + nlocaly = nlocaly/2 end do do m=pproc,0,-1 write(s,'("m = ",I3)') m @@ -524,7 +565,12 @@ contains integer :: icompy_min,icompy_max, icompx_min,icompx_max real , dimension(:,:,:) , pointer , contiguous :: za_st,zb_st + integer :: nlocalx,nlocaly + nlocal = a%ix_max-a%ix_min+1 + nlocalx = a%ix_max-a%ix_min+1 + nlocaly = a%iy_max-a%iy_min+1 + nz = a%grid_param%nz iy_min = a%iy_min @@ -678,7 +724,11 @@ contains call scalarprod_mnh(m, a, a, s) s = sqrt(s) if (i_am_master_mpi) then +#ifdef MG_DEBUG + write(STDOUT,'("Print_norm::",A,2I3,E23.10)') message, l,m,s +#else write(STDOUT,'("Print_norm::",A,2I3,E23.15)') message, l,m,s +#endif call flush(STDOUT) end if @@ -818,11 +868,15 @@ contains integer :: neighbour_w_rank integer :: yoffset, blocklength + integer :: a_nx,a_ny, blocklengthx,blocklengthy + halo_size = comm_param%halo_size ! Do nothing if we are only using one processor - if (m > 0) then + if (m >= 0) then a_n = a%ix_max-a%ix_min+1 + a_nx = a%ix_max-a%ix_min+1 + a_ny = a%iy_max-a%iy_min+1 nz = a%grid_param%nz stepsize = 2**(pproc-m) @@ -832,11 +886,13 @@ contains ! Work out ranks of neighbours ! W -> E +#ifndef MPIVIDE call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, & neighbour_w_rank,neighbour_e_rank,ierr) ! N -> S call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, & neighbour_n_rank,neighbour_s_rank,ierr) +#endif if ( (stepsize == 1) .or. & ( (mod(p_horiz(1),stepsize) == 0) .and. & (mod(p_horiz(2),stepsize) == 0) ) ) then @@ -844,9 +900,13 @@ contains ! Do not include corners in send/recv yoffset = 1 blocklength = a_n*(nz+2)*halo_size + blocklengthx = a_nx*(nz+2)*halo_size + blocklengthy = a_ny*(nz+2)*halo_size else yoffset = 1-halo_size blocklength = (a_n+2*halo_size)*(nz+2)*halo_size + blocklengthx = (a_nx+2*halo_size)*(nz+2)*halo_size + blocklengthy = (a_ny+2*halo_size)*(nz+2)*halo_size end if ! Receive from north recvtag = 1002 @@ -861,23 +921,23 @@ contains end if ! Receive from south recvtag = 1003 - if (LUseO) call mpi_irecv(a%s(0,a_n+1,1),1, & + if (LUseO) call mpi_irecv(a%s(0,a_ny+1,1),1, & halo_ns(level,m),neighbour_s_rank,recvtag, & MPI_COMM_HORIZ, recv_requests(2), ierr) recvtag = 1013 if (LUseT) then - call mpi_irecv(a%st(1,a_n+1,0),1, & + call mpi_irecv(a%st(1,a_ny+1,0),1, & halo_nst(level,m),neighbour_s_rank,recvtag, & MPI_COMM_HORIZ, recv_requestsT(2), ierr) end if ! Send to south sendtag = 1002 - if (LUseO) call mpi_isend(a%s(0,a_n-(halo_size-1),1),1, & + if (LUseO) call mpi_isend(a%s(0,a_ny-(halo_size-1),1),1, & halo_ns(level,m),neighbour_s_rank,sendtag, & MPI_COMM_HORIZ, send_requests(1), ierr) sendtag = 1012 if (LUseT) then - call mpi_isend(a%st(1,a_n-(halo_size-1),0),1, & + call mpi_isend(a%st(1,a_ny-(halo_size-1),0),1, & halo_nst(level,m),neighbour_s_rank,sendtag, & MPI_COMM_HORIZ, send_requestsT(1), ierr) end if @@ -905,23 +965,23 @@ contains end if ! Receive from east sendtag = 1001 - if (LUseO) call mpi_irecv(a%s(0,yoffset,a_n+1),blocklength, & + if (LUseO) call mpi_irecv(a%s(0,yoffset,a_nx+1),blocklength, & MPI_DOUBLE_PRECISION,neighbour_e_rank,recvtag, & MPI_COMM_HORIZ, recv_requests(4), ierr) sendtag = 1011 if (LUseT) then - call mpi_irecv(a%st(a_n+1,0,0),1, & + call mpi_irecv(a%st(a_nx+1,0,0),1, & halo_wet(level,m),neighbour_e_rank,recvtag, & MPI_COMM_HORIZ, recv_requestsT(4), ierr) end if ! Send to east sendtag = 1000 - if (LUseO) call mpi_isend(a%s(0,yoffset,a_n-(halo_size-1)),blocklength, & + if (LUseO) call mpi_isend(a%s(0,yoffset,a_nx-(halo_size-1)),blocklength, & MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, & MPI_COMM_HORIZ, send_requests(3), ierr) sendtag = 1010 if (LUseT) then - call mpi_isend(a%st(a_n-(halo_size-1),0,0),1, & + call mpi_isend(a%st(a_nx-(halo_size-1),0,0),1, & halo_wet(level,m),neighbour_e_rank,sendtag, & MPI_COMM_HORIZ, send_requestsT(3), ierr) end if @@ -967,7 +1027,7 @@ contains halo_size = comm_param%halo_size ! Do nothing if we are only using one processor - if (m > 0) then + if (m >= 0) then a_n = a%ix_max-a%ix_min+1 nz = a%grid_param%nz stepsize = 2**(pproc-m) @@ -978,11 +1038,13 @@ contains ! Work out ranks of neighbours ! W -> E +#ifndef MPIVIDE call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, & neighbour_w_rank,neighbour_e_rank,ierr) ! N -> S call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, & neighbour_n_rank,neighbour_s_rank,ierr) +#endif if ( (stepsize == 1) .or. & ( (mod(p_horiz(1),stepsize) == 0) .and. & (mod(p_horiz(2),stepsize) == 0) ) ) then @@ -1095,14 +1157,21 @@ contains LOGICAL :: Gneighbour_s,Gneighbour_n,Gneighbour_e,Gneighbour_w + integer :: a_nx,a_ny, blocklengthx,blocklengthy + halo_size = comm_param%halo_size +#ifdef MG_DEBUG + call print_scalaprod2(level , m , a , "InBeg haloswap_mnh a(level,m )=" ) +#endif ! Do nothing if we are only using one processor - if (m > 0) then + if (m >= 0) then if (comm_measuretime) then call start_timer(t_haloswap(level,m)) end if a_n = a%ix_max-a%ix_min+1 + a_nx = a%ix_max-a%ix_min+1 + a_ny = a%iy_max-a%iy_min+1 nz = a%grid_param%nz stepsize = 2**(pproc-m) @@ -1112,11 +1181,13 @@ contains ! Work out ranks of neighbours ! W -> E +#ifndef MPIVIDE call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, & neighbour_w_rank,neighbour_e_rank,ierr) ! N -> S call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, & neighbour_n_rank,neighbour_s_rank,ierr) +#endif if ( (stepsize == 1) .or. & ( (mod(p_horiz(1),stepsize) == 0) .and. & (mod(p_horiz(2),stepsize) == 0) ) ) then @@ -1124,9 +1195,13 @@ contains ! Do not include corners in send/recv yoffset = 1 blocklength = a_n*(nz+2)*halo_size + blocklengthx = a_nx*(nz+2)*halo_size + blocklengthy = a_ny*(nz+2)*halo_size else yoffset = 1-halo_size blocklength = (a_n+2*halo_size)*(nz+2)*halo_size + blocklengthx = (a_nx+2*halo_size)*(nz+2)*halo_size + blocklengthy = (a_ny+2*halo_size)*(nz+2)*halo_size end if ! zst => a%st @@ -1190,7 +1265,11 @@ contains if (comm_measuretime) then call finish_timer(t_haloswap(level,m)) end if - end if ! (m > 0) + end if ! (m >= 0) + +#ifdef MG_DEBUG + call print_scalaprod2(level , m , a , "OuBeg haloswap_mnh a(level,m )=" ) +#endif contains subroutine haloswap_mnh_dim(pztab_halo_st_haloTin,pztab_halo_nt_haloTin,& @@ -1200,15 +1279,15 @@ contains pzst) implicit none - real :: pztab_halo_st_haloTin(1:a_n,1:halo_size,1:nz+2), & - pztab_halo_nt_haloTin(1:a_n,1:halo_size,1:nz+2), & - pztab_halo_et_haloTin(1:halo_size,1:a_n+2*halo_size,1:nz+2), & - pztab_halo_wt_haloTin(1:halo_size,1:a_n+2*halo_size,1:nz+2), & - pztab_halo_nt_haloTout(1:a_n,1:halo_size,1:nz+2), & - pztab_halo_st_haloTout(1:a_n,1:halo_size,1:nz+2), & - pztab_halo_wt_haloTout(1:halo_size,1:a_n+2*halo_size,1:nz+2), & - pztab_halo_et_haloTout(1:halo_size,1:a_n+2*halo_size,1:nz+2), & - pzst(1-halo_size:a_n+halo_size,1-halo_size:a_n+halo_size,0:nz+1) + real :: pztab_halo_st_haloTin(1:a_nx,1:halo_size,1:nz+2), & + pztab_halo_nt_haloTin(1:a_nx,1:halo_size,1:nz+2), & + pztab_halo_et_haloTin(1:halo_size,1:a_ny+2*halo_size,1:nz+2), & + pztab_halo_wt_haloTin(1:halo_size,1:a_ny+2*halo_size,1:nz+2), & + pztab_halo_nt_haloTout(1:a_nx,1:halo_size,1:nz+2), & + pztab_halo_st_haloTout(1:a_nx,1:halo_size,1:nz+2), & + pztab_halo_wt_haloTout(1:halo_size,1:a_ny+2*halo_size,1:nz+2), & + pztab_halo_et_haloTout(1:halo_size,1:a_ny+2*halo_size,1:nz+2), & + pzst(1-halo_size:a_nx+halo_size,1-halo_size:a_ny+halo_size,0:nz+1) ! ! Do Comm ! @@ -1221,8 +1300,8 @@ contains if (Gneighbour_s) then !!$ pztab_halo_st_haloTin => tab_halo_st(level,m)%haloTin !$acc kernels async(IS_SOUTH) present_cr(pzst,pztab_halo_st_haloTin) - !$mnh_do_concurrent( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) - pztab_halo_st_haloTin(ii,ij,ik) = pzst(ii,ij+a_n-halo_size,ik-1) + !$mnh_do_concurrent( ii=1:a_nx,ij=1:halo_size,ik=1:nz+2 ) + pztab_halo_st_haloTin(ii,ij,ik) = pzst(ii,ij+a_ny-halo_size,ik-1) !$mnh_end_do() !$acc end kernels end if @@ -1230,7 +1309,7 @@ contains if (Gneighbour_n) then !!$ pztab_halo_nt_haloTin => tab_halo_nt(level,m)%haloTin !$acc kernels async(IS_NORTH) present_cr(pzst,pztab_halo_nt_haloTin) - !$mnh_do_concurrent( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) + !$mnh_do_concurrent( ii=1:a_nx,ij=1:halo_size,ik=1:nz+2 ) pztab_halo_nt_haloTin(ii,ij,ik) = pzst(ii,ij,ik-1) !$mnh_end_do() !$acc end kernels @@ -1239,8 +1318,8 @@ contains if (Gneighbour_e) then !!$ pztab_halo_et_haloTin => tab_halo_et(level,m)%haloTin !$acc kernels async(IS_EAST) present_cr(pzst,pztab_halo_et_haloTin) - !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) - pztab_halo_et_haloTin(ii,ij,ik) = pzst(ii+a_n-halo_size,ij-halo_size,ik-1) + !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_ny+2*halo_size,ik=1:nz+2 ) + pztab_halo_et_haloTin(ii,ij,ik) = pzst(ii+a_nx-halo_size,ij-halo_size,ik-1) !$mnh_end_do() !$acc end kernels end if @@ -1248,7 +1327,7 @@ contains if (Gneighbour_w) then !!$ pztab_halo_wt_haloTin => tab_halo_wt(level,m)%haloTin !$acc kernels async(IS_WEST) present_cr(pzst,pztab_halo_wt_haloTin) - !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) + !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_ny+2*halo_size,ik=1:nz+2 ) pztab_halo_wt_haloTin(ii,ij,ik) = pzst(ii,ij-halo_size,ik-1) !$mnh_end_do() !$acc end kernels @@ -1280,7 +1359,7 @@ contains end if ! Receive from south recvtag = 1003 - if (LUseO) call mpi_irecv(a%s(0,a_n+1,1),1, & + if (LUseO) call mpi_irecv(a%s(0,a_ny+1,1),1, & halo_ns(level,m),neighbour_s_rank,recvtag, & MPI_COMM_HORIZ, requests_ns(2), ierr) recvtag = 1013 @@ -1296,7 +1375,7 @@ contains end if !print*,"mpi_irecv(pztab_halo_st_haloTout,neighbour_s_rank=",neighbour_s_rank #else - call mpi_irecv(a%st(1,a_n+1,0),1, & + call mpi_irecv(a%st(1,a_ny+1,0),1, & halo_nst(level,m),neighbour_s_rank,recvtag, & MPI_COMM_HORIZ, requests_nsT(2), ierr) #endif @@ -1309,7 +1388,7 @@ contains #endif ! Send to south sendtag = 1002 - if (LUseO) call mpi_isend(a%s(0,a_n-(halo_size-1),1),1, & + if (LUseO) call mpi_isend(a%s(0,a_ny-(halo_size-1),1),1, & halo_ns(level,m),neighbour_s_rank,sendtag, & MPI_COMM_HORIZ, requests_ns(3), ierr) sendtag = 1012 @@ -1324,7 +1403,7 @@ contains end if !print*,"mpi_isend(pztab_halo_st_haloTin,neighbour_s_rank=",neighbour_s_rank #else - call mpi_isend(a%st(1,a_n-(halo_size-1),0),1, & + call mpi_isend(a%st(1,a_ny-(halo_size-1),0),1, & halo_nst(level,m),neighbour_s_rank,sendtag, & MPI_COMM_HORIZ, requests_nsT(3), ierr) #endif @@ -1381,7 +1460,7 @@ contains end if ! Receive from east sendtag = 1001 - if (LUseO) call mpi_irecv(a%s(0,yoffset,a_n+1),blocklength, & + if (LUseO) call mpi_irecv(a%s(0,yoffset,a_nx+1),blocklength, & MPI_DOUBLE_PRECISION,neighbour_e_rank,recvtag, & MPI_COMM_HORIZ, requests_ew(2), ierr) sendtag = 1011 @@ -1397,7 +1476,7 @@ contains end if !print*,"mpi_irecv(pztab_halo_et_haloTout,neighbour_e_rank=",neighbour_e_rank #else - call mpi_irecv(a%st(a_n+1,0,0),1, & + call mpi_irecv(a%st(a_nx+1,0,0),1, & halo_wet(level,m),neighbour_e_rank,recvtag, & MPI_COMM_HORIZ, requests_ewT(2), ierr) @@ -1405,7 +1484,7 @@ contains end if ! Send to east sendtag = 1000 - if (LUseO) call mpi_isend(a%s(0,yoffset,a_n-(halo_size-1)),blocklength, & + if (LUseO) call mpi_isend(a%s(0,yoffset,a_nx-(halo_size-1)),blocklength, & MPI_DOUBLE_PRECISION,neighbour_e_rank,sendtag, & MPI_COMM_HORIZ, requests_ew(3), ierr) sendtag = 1010 @@ -1420,7 +1499,7 @@ contains end if !print*,"mpi_isend(pztab_halo_et_haloTin,neighbour_e_rank=",neighbour_e_rank #else - call mpi_isend(a%st(a_n-(halo_size-1),0,0),1, & + call mpi_isend(a%st(a_nx-(halo_size-1),0,0),1, & halo_wet(level,m),neighbour_e_rank,sendtag, & MPI_COMM_HORIZ, requests_ewT(3), ierr) #endif @@ -1460,7 +1539,7 @@ contains if (Gneighbour_n) then ! copy north halo for GPU managed !$acc kernels async(IS_NORTH) present_cr(pzst,pztab_halo_nt_haloTout) - !$mnh_do_concurrent( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) + !$mnh_do_concurrent( ii=1:a_nx,ij=1:halo_size,ik=1:nz+2 ) pzst(ii,ij-halo_size,ik-1) = pztab_halo_nt_haloTout(ii,ij,ik) !$mnh_end_do() !$acc end kernels @@ -1468,15 +1547,15 @@ contains if (Gneighbour_s) then ! copy south halo for GPU managed !$acc kernels async(IS_SOUTH) present_cr(pzst,pztab_halo_st_haloTout) - !$mnh_do_concurrent( ii=1:a_n,ij=1:halo_size,ik=1:nz+2 ) - pzst(ii,ij+a_n,ik-1) = pztab_halo_st_haloTout(ii,ij,ik) + !$mnh_do_concurrent( ii=1:a_nx,ij=1:halo_size,ik=1:nz+2 ) + pzst(ii,ij+a_ny,ik-1) = pztab_halo_st_haloTout(ii,ij,ik) !$mnh_end_do() !$acc end kernels end if if (Gneighbour_w) then ! copy west halo for GPU managed !$acc kernels async(IS_WEST) present_cr(pzst,pztab_halo_wt_haloTout) - !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) + !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_ny+2*halo_size,ik=1:nz+2 ) pzst(ii-halo_size,ij-halo_size,ik-1) = pztab_halo_wt_haloTout(ii,ij,ik) !$mnh_end_do() !$acc end kernels @@ -1484,8 +1563,8 @@ contains if (Gneighbour_e) then ! copy east halo for GPU managed !$acc kernels async(IS_EAST) present_cr(pzst,pztab_halo_et_haloTout) - !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_n+2*halo_size,ik=1:nz+2 ) - pzst(ii+a_n,ij-halo_size,ik-1) = pztab_halo_et_haloTout(ii,ij,ik) + !$mnh_do_concurrent( ii=1:halo_size,ij=1:a_ny+2*halo_size,ik=1:nz+2 ) + pzst(ii+a_nx,ij-halo_size,ik-1) = pztab_halo_et_haloTout(ii,ij,ik) !$mnh_end_do() !$acc end kernels end if @@ -1539,7 +1618,7 @@ contains halo_size = comm_param%halo_size ! Do nothing if we are only using one processor - if (m > 0) then + if (m >= 0) then if (comm_measuretime) then call start_timer(t_haloswap(level,m)) end if @@ -1553,11 +1632,13 @@ contains ! Work out ranks of neighbours ! W -> E +#ifndef MPIVIDE call mpi_cart_shift(MPI_COMM_HORIZ,1, stepsize, & neighbour_w_rank,neighbour_e_rank,ierr) ! N -> S call mpi_cart_shift(MPI_COMM_HORIZ,0, stepsize, & neighbour_n_rank,neighbour_s_rank,ierr) +#endif if ( (stepsize == 1) .or. & ( (mod(p_horiz(1),stepsize) == 0) .and. & (mod(p_horiz(2),stepsize) == 0) ) ) then @@ -1668,12 +1749,20 @@ contains real , pointer , contiguous , dimension(:,:,:) :: ztab_sub_interiorT_sw_m_1_haloTout real , pointer , contiguous , dimension(:,:,:) :: ztab_sub_interiorT_se_m_1_haloTout + integer :: a_nx,a_ny, b_nx,b_ny + call start_timer(t_collect(m)) stepsize = 2**(pproc-m) a_n = a%ix_max-a%ix_min+1 + a_nx = a%ix_max-a%ix_min+1 + a_ny = a%iy_max-a%iy_min+1 + b_n = b%ix_max-b%ix_min+1 + b_nx = b%ix_max-b%ix_min+1 + b_ny = b%iy_max-b%iy_min+1 + nz = b%grid_param%nz ! Work out rank, only execute on relevant processes @@ -1699,12 +1788,14 @@ contains ! NW receives from the other three processes if ( corner_nw ) then ! Receive from NE +#ifndef MPIVIDE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1),p_horiz(2)+stepsize/), & source_rank, & ierr) +#endif recv_tag = 1000 - if (LUseO) call mpi_irecv(b%s(0,1,b_n/2+1),1,sub_interior(level,m-1),source_rank, recv_tag, MPI_COMM_HORIZ, & + if (LUseO) call mpi_irecv(b%s(0,1,b_nx/2+1),1,sub_interior(level,m-1),source_rank, recv_tag, MPI_COMM_HORIZ, & recv_request(1),ierr) recv_tag = 1010 if (LUseT) then @@ -1716,7 +1807,7 @@ contains recv_requestT(1),ierr) !$acc end host_data #else - call mpi_irecv(b%st(b_n/2+1,1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & + call mpi_irecv(b%st(b_nx/2+1,1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & recv_requestT(1),ierr) #endif end if @@ -1725,12 +1816,14 @@ contains call fatalerror("Collect: receive from NE failed in mpi_irecv().") #endif ! Receive from SW +#ifndef MPIVIDE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)+stepsize,p_horiz(2)/), & source_rank, & ierr) +#endif recv_tag = 1001 - if (LUseO) call mpi_irecv(b%s(0,b_n/2+1,1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & + if (LUseO) call mpi_irecv(b%s(0,b_ny/2+1,1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & recv_request(2),ierr) recv_tag = 1011 if (LUseT) then @@ -1742,7 +1835,7 @@ contains recv_requestT(2),ierr) !$acc end host_data #else - call mpi_irecv(b%st(1,b_n/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & + call mpi_irecv(b%st(1,b_ny/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & recv_requestT(2),ierr) #endif endif @@ -1752,12 +1845,14 @@ contains call fatalerror("Collect: receive from SW failed in mpi_irecv().") #endif ! Receive from SE +#ifndef MPIVIDE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)+stepsize,p_horiz(2)+stepsize/), & source_rank, & ierr) +#endif recv_tag = 1002 - if (LUseO) call mpi_irecv(b%s(0,b_n/2+1,b_n/2+1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & + if (LUseO) call mpi_irecv(b%s(0,b_ny/2+1,b_nx/2+1),1,sub_interior(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & recv_request(3),ierr) recv_tag = 1012 if (LUseT) then @@ -1769,7 +1864,7 @@ contains recv_requestT(3),ierr) !$acc end host_data #else - call mpi_irecv(b%st(b_n/2+1,b_n/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & + call mpi_irecv(b%st(b_nx/2+1,b_ny/2+1,0),1,sub_interiorT(level,m-1), source_rank, recv_tag, MPI_COMM_HORIZ, & recv_requestT(3),ierr) #endif end if @@ -1778,18 +1873,18 @@ contains call fatalerror("Collect: receive from SE failed in mpi_irecv().") #endif ! Copy local data while waiting for data from other processes - if (LUseO) b%s(0:nz+1,1:a_n,1:a_n) = a%s(0:nz+1,1:a_n,1:a_n) + if (LUseO) b%s(0:nz+1,1:a_ny,1:a_nx) = a%s(0:nz+1,1:a_ny,1:a_nx) if (LUseT) then #ifdef MNH_GPUDIRECT zb_st => b%st za_st => a%st !$acc kernels present_cr(zb_st,za_st) - !$mnh_do_concurrent(ii=1:a_n,ij=1:a_n,ik=1:nz+2) + !$mnh_do_concurrent(ii=1:a_nx,ij=1:a_ny,ik=1:nz+2) zb_st(ii,ij,ik-1) = za_st(ii,ij,ik-1) !$mnh_end_do() !$acc end kernels #else - b%st(1:a_n,1:a_n,0:nz+1) = a%st(1:a_n,1:a_n,0:nz+1) + b%st(1:a_nx,1:a_ny,0:nz+1) = a%st(1:a_nx,1:a_ny,0:nz+1) #endif end if ! Wait for receives to complete before proceeding @@ -1801,20 +1896,20 @@ contains ! copy from buffer for GPU DIRECT ! Receive from NE !$acc kernels present_cr(zb_st,ztab_sub_interiorT_ne_m_1_haloTout) - !$mnh_do_concurrent(ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2) - zb_st(ii+b_n/2,ij,ik-1) = ztab_sub_interiorT_ne_m_1_haloTout(ii,ij,ik) + !$mnh_do_concurrent(ii=1:b_nx/2,ij=1:b_ny/2,ik=1:nz+2) + zb_st(ii+b_nx/2,ij,ik-1) = ztab_sub_interiorT_ne_m_1_haloTout(ii,ij,ik) !$mnh_end_do() !$acc end kernels ! Receive from SW !$acc kernels present_cr(zb_st,ztab_sub_interiorT_sw_m_1_haloTout) - !$mnh_do_concurrent(ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2) - zb_st(ii,ij+b_n/2,ik-1) = ztab_sub_interiorT_sw_m_1_haloTout(ii,ij,ik) + !$mnh_do_concurrent(ii=1:b_nx/2,ij=1:b_ny/2,ik=1:nz+2) + zb_st(ii,ij+b_ny/2,ik-1) = ztab_sub_interiorT_sw_m_1_haloTout(ii,ij,ik) !$mnh_end_do() !$acc end kernels ! Receive from SE !$acc kernels present_cr(zb_st,ztab_sub_interiorT_se_m_1_haloTout) - !$mnh_do_concurrent(ii=1:b_n/2,ij=1:b_n/2,ik=1:nz+2) - zb_st(ii+b_n/2,ij+b_n/2,ik-1) = ztab_sub_interiorT_se_m_1_haloTout(ii,ij,ik) + !$mnh_do_concurrent(ii=1:b_nx/2,ij=1:b_ny/2,ik=1:nz+2) + zb_st(ii+b_nx/2,ij+b_ny/2,ik-1) = ztab_sub_interiorT_se_m_1_haloTout(ii,ij,ik) !$mnh_end_do() !$acc end kernels end if @@ -1822,11 +1917,12 @@ contains end if if ( corner_ne ) then ! Send to NW +#ifndef MPIVIDE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1),p_horiz(2)-stepsize/), & dest_rank, & ierr) - +#endif za_st => a%st send_tag = 1000 @@ -1836,7 +1932,7 @@ contains #ifdef MNH_GPUDIRECT ztab_interiorT_ne_m_haloTin => tab_interiorT_ne(level,m)%haloTin !$acc kernels present_cr(ztab_interiorT_ne_m_haloTin,za_st) - !$mnh_do_concurrent(ii=1:a_n,ij=1:a_n,ik=1:nz+2) + !$mnh_do_concurrent(ii=1:a_nx,ij=1:a_ny,ik=1:nz+2) ztab_interiorT_ne_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1) !$mnh_end_do() !$acc end kernels @@ -1855,11 +1951,12 @@ contains end if if ( corner_sw ) then ! Send to NW +#ifndef MPIVIDE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)-stepsize,p_horiz(2)/), & dest_rank, & ierr) - +#endif za_st => a%st send_tag = 1001 @@ -1869,7 +1966,7 @@ contains #ifdef MNH_GPUDIRECT ztab_interiorT_sw_m_haloTin => tab_interiorT_sw(level,m)%haloTin !$acc kernels present_cr(ztab_interiorT_sw_m_haloTin,za_st) - !$mnh_do_concurrent(ii=1:a_n,ij=1:a_n,ik=1:nz+2) + !$mnh_do_concurrent(ii=1:a_nx,ij=1:a_ny,ik=1:nz+2) ztab_interiorT_sw_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1) !$mnh_end_do() !$acc end kernels @@ -1888,11 +1985,13 @@ contains end if if ( corner_se ) then ! send to NW +#ifndef MPIVIDE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)-stepsize,p_horiz(2)-stepsize/), & dest_rank, & ierr) +#endif za_st => a%st send_tag = 1002 @@ -1902,7 +2001,7 @@ contains #ifdef MNH_GPUDIRECT ztab_interiorT_se_m_haloTin => tab_interiorT_se(level,m)%haloTin !$acc kernels present_cr(ztab_interiorT_se_m_haloTin,za_st) - !$mnh_do_concurrent(ii=1:a_n,ij=1:a_n,ik=1:nz+2) + !$mnh_do_concurrent(ii=1:a_nx,ij=1:a_ny,ik=1:nz+2) ztab_interiorT_se_m_haloTin(ii,ij,ik) = za_st(ii,ij,ik-1) !$mnh_end_do() !$acc end kernels @@ -1967,12 +2066,20 @@ contains real , pointer , contiguous , dimension(:,:,:) :: ztab_interiorT_sw_m_haloTout real , pointer , contiguous , dimension(:,:,:) :: ztab_interiorT_se_m_haloTout + integer :: a_nx,a_ny, b_nx,b_ny + call start_timer(t_distribute(m)) stepsize = 2**(pproc-m) a_n = a%ix_max-a%ix_min+1 + a_nx = a%ix_max-a%ix_min+1 + a_ny = a%iy_max-a%iy_min+1 + b_n = b%ix_max-b%ix_min+1 + b_nx = b%ix_max-b%ix_min+1 + b_ny = b%iy_max-b%iy_min+1 + nz = a%grid_param%nz ! Work out rank, only execute on relevant processes @@ -1995,23 +2102,25 @@ contains end if if ( corner_nw ) then ! (Asynchronous) send to NE +#ifndef MPIVIDE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1),p_horiz(2)+stepsize/), & dest_rank, & ierr) +#endif za_st => a%st send_tag = 1000 - if (LUseO) call mpi_isend(a%s(0,1,a_n/2+1), 1,sub_interior(level,m-1),dest_rank, send_tag, & + if (LUseO) call mpi_isend(a%s(0,1,a_nx/2+1), 1,sub_interior(level,m-1),dest_rank, send_tag, & MPI_COMM_HORIZ,send_request(1),ierr) send_tag = 1010 if (LUseT) then #ifdef MNH_GPUDIRECT ztab_sub_interiorT_ne_m_1_haloTin => tab_sub_interiorT_ne(level,m-1)%haloTin !$acc kernels present_cr(ztab_sub_interiorT_ne_m_1_haloTin,za_st) - !$mnh_do_concurrent(ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2) - ztab_sub_interiorT_ne_m_1_haloTin(ii,ij,ik) = za_st(ii+a_n/2,ij,ik-1) + !$mnh_do_concurrent(ii=1:a_nx/2,ij=1:a_ny/2,ik=1:nz+2) + ztab_sub_interiorT_ne_m_1_haloTin(ii,ij,ik) = za_st(ii+a_nx/2,ij,ik-1) !$mnh_end_do() !$acc end kernels !$acc host_data use_device(ztab_sub_interiorT_ne_m_1_haloTin) @@ -2020,7 +2129,7 @@ contains MPI_COMM_HORIZ,send_requestT(1),ierr) !$acc end host_data #else - call mpi_isend(a%st(a_n/2+1,1,0), 1,sub_interiorT(level,m-1),dest_rank, send_tag, & + call mpi_isend(a%st(a_nx/2+1,1,0), 1,sub_interiorT(level,m-1),dest_rank, send_tag, & MPI_COMM_HORIZ,send_requestT(1),ierr) #endif end if @@ -2029,20 +2138,22 @@ contains call fatalerror("Distribute: send to NE failed in mpi_isend().") #endif ! (Asynchronous) send to SW +#ifndef MPIVIDE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)+stepsize,p_horiz(2)/), & dest_rank, & ierr) +#endif send_tag = 1001 - if (LUseO) call mpi_isend(a%s(0,a_n/2+1,1),1,sub_interior(level,m-1), dest_rank, send_tag, & + if (LUseO) call mpi_isend(a%s(0,a_ny/2+1,1),1,sub_interior(level,m-1), dest_rank, send_tag, & MPI_COMM_HORIZ, send_request(2), ierr) send_tag = 1011 if (LUseT) then #ifdef MNH_GPUDIRECT ztab_sub_interiorT_sw_m_1_haloTin => tab_sub_interiorT_sw(level,m-1)%haloTin !$acc kernels present_cr(ztab_sub_interiorT_sw_m_1_haloTin,za_st) - !$mnh_do_concurrent(ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2) - ztab_sub_interiorT_sw_m_1_haloTin(ii,ij,ik) = za_st(ii,ij+a_n/2,ik-1) + !$mnh_do_concurrent(ii=1:a_nx/2,ij=1:a_ny/2,ik=1:nz+2) + ztab_sub_interiorT_sw_m_1_haloTin(ii,ij,ik) = za_st(ii,ij+a_ny/2,ik-1) !$mnh_end_do() !$acc end kernels !$acc host_data use_device(ztab_sub_interiorT_sw_m_1_haloTin) @@ -2051,7 +2162,7 @@ contains MPI_COMM_HORIZ, send_requestT(2), ierr) !$acc end host_data #else - call mpi_isend(a%st(1,a_n/2+1,0),1,sub_interiorT(level,m-1), dest_rank, send_tag, & + call mpi_isend(a%st(1,a_ny/2+1,0),1,sub_interiorT(level,m-1), dest_rank, send_tag, & MPI_COMM_HORIZ, send_requestT(2), ierr) #endif end if @@ -2061,20 +2172,22 @@ contains call fatalerror("Distribute: send to SW failed in mpi_isend().") #endif ! (Asynchronous) send to SE +#ifndef MPIVIDE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)+stepsize,p_horiz(2)+stepsize/), & dest_rank, & ierr) +#endif send_tag = 1002 - if (LUseO) call mpi_isend(a%s(0,a_n/2+1,a_n/2+1),1,sub_interior(level,m-1), dest_rank, send_tag, & + if (LUseO) call mpi_isend(a%s(0,a_ny/2+1,a_nx/2+1),1,sub_interior(level,m-1), dest_rank, send_tag, & MPI_COMM_HORIZ, send_request(3), ierr) send_tag = 1012 if (LUseT) then #ifdef MNH_GPUDIRECT ztab_sub_interiorT_se_m_1_haloTin => tab_sub_interiorT_se(level,m-1)%haloTin !$acc kernels present_cr(ztab_sub_interiorT_se_m_1_haloTin,za_st) - !$mnh_do_concurrent(ii=1:a_n/2,ij=1:a_n/2,ik=1:nz+2) - ztab_sub_interiorT_se_m_1_haloTin(ii,ij,ik) = za_st(ii+a_n/2,ij+a_n/2,ik-1) + !$mnh_do_concurrent(ii=1:a_nx/2,ij=1:a_ny/2,ik=1:nz+2) + ztab_sub_interiorT_se_m_1_haloTin(ii,ij,ik) = za_st(ii+a_nx/2,ij+a_ny/2,ik-1) !$mnh_end_do() !$acc end kernels !$acc host_data use_device(ztab_sub_interiorT_se_m_1_haloTin) @@ -2083,7 +2196,7 @@ contains MPI_COMM_HORIZ, send_requestT(3), ierr) !$acc end host_data #else - call mpi_isend(a%st(a_n/2+1,a_n/2+1,0),1,sub_interiorT(level,m-1), dest_rank, send_tag, & + call mpi_isend(a%st(a_nx/2+1,a_ny/2+1,0),1,sub_interiorT(level,m-1), dest_rank, send_tag, & MPI_COMM_HORIZ, send_requestT(3), ierr) #endif end if @@ -2092,18 +2205,18 @@ contains call fatalerror("Distribute: send to SE failed in mpi_isend().") #endif ! While sending, copy local data - if (LUseO) b%s(0:nz+1,1:b_n,1:b_n) = a%s(0:nz+1,1:b_n,1:b_n) + if (LUseO) b%s(0:nz+1,1:b_ny,1:b_nx) = a%s(0:nz+1,1:b_ny,1:b_nx) if (LUseT) then #ifdef MNH_GPUDIRECT zb_st => b%st za_st => a%st !$acc kernels present_cr(zb_st,za_st) - !$mnh_do_concurrent(ii=1:b_n,ij=1:b_n,ik=1:nz+2) + !$mnh_do_concurrent(ii=1:b_nx,ij=1:b_ny,ik=1:nz+2) zb_st(ii,ij,ik-1) = za_st(ii,ij,ik-1) !$mnh_end_do() !$acc end kernels #else - b%st(1:b_n,1:b_n,0:nz+1) = a%st(1:b_n,1:b_n,0:nz+1) + b%st(1:b_nx,1:b_ny,0:nz+1) = a%st(1:b_nx,1:b_ny,0:nz+1) #endif end if ! Only proceed when async sends to complete @@ -2113,11 +2226,12 @@ contains if ( corner_ne ) then ! Receive from NW +#ifndef MPIVIDE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1),p_horiz(2)-stepsize/), & source_rank, & ierr) - +#endif zb_st => b%st recv_tag = 1000 @@ -2131,7 +2245,7 @@ contains MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) !$acc end host_data !$acc kernels present_cr(zb_st,ztab_interiorT_ne_m_haloTout) - !$mnh_do_concurrent(ii=1:b_n,ij=1:b_n,ik=1:nz+2) + !$mnh_do_concurrent(ii=1:b_nx,ij=1:b_ny,ik=1:nz+2) zb_st(ii,ij,ik-1) = ztab_interiorT_ne_m_haloTout(ii,ij,ik) !$mnh_end_do() !$acc end kernels @@ -2146,11 +2260,12 @@ contains end if if ( corner_sw ) then ! Receive from NW +#ifndef MPIVIDE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)-stepsize,p_horiz(2)/), & source_rank, & ierr) - +#endif zb_st => b%st recv_tag = 1001 @@ -2164,7 +2279,7 @@ contains MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) !$acc end host_data !$acc kernels present_cr(zb_st,ztab_interiorT_sw_m_haloTout) - !$mnh_do_concurrent(ii=1:b_n,ij=1:b_n,ik=1:nz+2) + !$mnh_do_concurrent(ii=1:b_nx,ij=1:b_ny,ik=1:nz+2) zb_st(ii,ij,ik-1) = ztab_interiorT_sw_m_haloTout(ii,ij,ik) !$mnh_end_do() !$acc end kernels @@ -2179,11 +2294,12 @@ contains end if if ( corner_se ) then ! Receive from NW +#ifndef MPIVIDE call mpi_cart_rank(MPI_COMM_HORIZ, & (/p_horiz(1)-stepsize,p_horiz(2)-stepsize/), & source_rank, & ierr) - +#endif zb_st => b%st recv_tag = 1002 @@ -2197,7 +2313,7 @@ contains MPI_DOUBLE_PRECISION,source_rank,recv_tag,MPI_COMM_HORIZ,stat,ierr) !$acc end host_data !$acc kernels present_cr(zb_st,ztab_interiorT_se_m_haloTout) - !$mnh_do_concurrent(ii=1:b_n,ij=1:b_n,ik=1:nz+2) + !$mnh_do_concurrent(ii=1:b_nx,ij=1:b_ny,ik=1:nz+2) zb_st(ii,ij,ik-1) = ztab_interiorT_se_m_haloTout(ii,ij,ik) !$mnh_end_do() !$acc end kernels diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/datatypes.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/datatypes.f90 index 0a99735ec1f269d2ef824843bf861c55d71b5dc3..fddc325865d6341b19de414d7dffc746c0ca6d53 100644 --- a/src/ZSOLVER/tensorproductmultigrid_Source/datatypes.f90 +++ b/src/ZSOLVER/tensorproductmultigrid_Source/datatypes.f90 @@ -144,9 +144,18 @@ private real(kind=rl) , dimension(:,:,:), pointer , contiguous :: zphi_st + integer :: pproc,pprocx,pprocy + integer :: nlocalx,nlocaly + phi%grid_param = grid_param call mpi_comm_size(comm_horiz, nproc, ierr) + pproc = nint(log(1.0d0*nproc)/log(2.0d0)) + pprocx = pproc / 2 + pprocy = pproc - pprocx + pproc = pprocx nlocal = grid_param%n/sqrt(1.0*nproc) + nlocaly = grid_param%n/2**pprocy + nlocalx = grid_param%n/2**pprocx ! Work out position in 2d processor grid call mpi_comm_rank(comm_horiz, rank, ierr) @@ -154,10 +163,10 @@ private ! Set local data ranges ! NB: p_horiz stores (py,px) in that order (see comment in ! communication module) - phi%iy_min = p_horiz(1)*nlocal + 1 - phi%iy_max = (p_horiz(1)+1)*nlocal - phi%ix_min = p_horiz(2)*nlocal + 1 - phi%ix_max = (p_horiz(2)+1)*nlocal + phi%iy_min = p_horiz(1)*nlocaly + 1 + phi%iy_max = (p_horiz(1)+1)*nlocaly + phi%ix_min = p_horiz(2)*nlocalx + 1 + phi%ix_max = (p_horiz(2)+1)*nlocalx ! Set computational ranges. Note that these are different at ! the edges of the domain! if (p_horiz(1) == 0) then @@ -165,20 +174,20 @@ private else phi%icompy_min = 1 - (halo_size - 1) end if - if (p_horiz(1) == floor(sqrt(1.0_rl*nproc))-1) then - phi%icompy_max = nlocal + if (p_horiz(1) == pprocy-1) then + phi%icompy_max = nlocaly else - phi%icompy_max = nlocal + (halo_size - 1) + phi%icompy_max = nlocaly + (halo_size - 1) end if if (p_horiz(2) == 0) then phi%icompx_min = 1 else phi%icompx_min = 1 - (halo_size - 1) end if - if (p_horiz(2) == floor(sqrt(1.0_rl*nproc))-1) then - phi%icompx_max = nlocal + if (p_horiz(2) == pprocx-1) then + phi%icompx_max = nlocalx else - phi%icompx_max = nlocal + (halo_size - 1) + phi%icompx_max = nlocalx + (halo_size - 1) end if ! Set halo size phi%halo_size = halo_size @@ -192,8 +201,8 @@ private phi%s(:,:,:) = 0.0_rl end if if (LUseT) then - allocate(zphi_st(1-halo_size:nlocal+halo_size, & - 1-halo_size:nlocal+halo_size, & + allocate(zphi_st(1-halo_size:nlocalx+halo_size, & + 1-halo_size:nlocaly+halo_size, & 0:grid_param%nz+1)) !$acc enter data create (zphi_st) phi%st => zphi_st @@ -483,7 +492,11 @@ private integer :: rank, nproc, ierr character(len=21) :: s + integer :: nlocalx,nlocaly + nlocal = phi%ix_max-phi%ix_min+1 + nlocalx = phi%ix_max-phi%ix_min+1 + nlocaly = phi%iy_max-phi%iy_min+1 ! Get number of processes and rank call mpi_comm_size(comm_horiz, nproc, ierr) @@ -523,8 +536,8 @@ private if (LUseT) then !$acc update host(phi%st) do iz=0,phi%grid_param%nz+1 - do iy=1-phi%halo_size,nlocal+phi%halo_size - do ix=1-phi%halo_size,nlocal+phi%halo_size + do iy=1-phi%halo_size,nlocaly+phi%halo_size + do ix=1-phi%halo_size,nlocalx+phi%halo_size write(file_id,'(E24.15)') phi%st(ix,iy,iz) end do end do diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90 index edbe5ebfc57b22b3c2543f9445bb52b0d28a70fe..4929ad8a52862008152fea1c326c35d71b5356b1 100644 --- a/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90 +++ b/src/ZSOLVER/tensorproductmultigrid_Source/discretisation.f90 @@ -926,6 +926,8 @@ end subroutine construct_vertical_coeff integer :: halo_size integer :: nlocal, nz + integer :: nlocalx,nlocaly + #ifdef MEASURESMOOTHINGRATE r%ix_min = u%ix_min r%ix_max = u%ix_max @@ -939,6 +941,8 @@ end subroutine construct_vertical_coeff r%isactive = u%isactive r%grid_param = u%grid_param nlocal = r%ix_max-r%ix_min+1 + nlocalx = r%ix_max-r%ix_min+1 + nlocaly = r%iy_max-r%iy_min+1 halo_size = r%halo_size nz = r%grid_param%nz @@ -949,8 +953,8 @@ end subroutine construct_vertical_coeff end if if (LUseT) then - allocate(r%st(1-halo_size:nlocal+halo_size, & - 1-halo_size:nlocal+halo_size, & + allocate(r%st(1-halo_size:nlocalx+halo_size, & + 1-halo_size:nlocaly+halo_size, & 0:nz+1)) !$acc enter data create (r%st) end if @@ -1087,6 +1091,8 @@ end subroutine construct_vertical_coeff integer :: iblock logical :: overlap_comms + integer :: nlocalx,nlocaly + call boundary_mnh(u) ordering = smoother_param%ordering @@ -1100,6 +1106,8 @@ end subroutine construct_vertical_coeff allocate(c(nz)) allocate(utmp(nz)) nlocal = u%ix_max-u%ix_min+1 + nlocalx = u%ix_max-u%ix_min+1 + nlocaly = u%iy_max-u%iy_min+1 #ifdef OVERLAPCOMMS overlap_comms = (nlocal > 2) #else @@ -1107,37 +1115,37 @@ end subroutine construct_vertical_coeff #endif ! Block 1 (N) ixmin(1) = 1 - ixmax(1) = nlocal + ixmax(1) = nlocalx iymin(1) = 1 iymax(1) = 1 ! Block 2 (S) ixmin(2) = 1 - ixmax(2) = nlocal - iymin(2) = nlocal - iymax(2) = nlocal + ixmax(2) = nlocalx + iymin(2) = nlocaly + iymax(2) = nlocaly ! Block 3 (W) ixmin(3) = 1 ixmax(3) = 1 iymin(3) = 2 - iymax(3) = nlocal-1 + iymax(3) = nlocaly-1 ! Block 4 (E) - ixmin(4) = nlocal - ixmax(4) = nlocal + ixmin(4) = nlocalx + ixmax(4) = nlocalx iymin(4) = 2 - iymax(4) = nlocal-1 + iymax(4) = nlocaly-1 ! Block 5 (INTERIOR) if (overlap_comms) then ixmin(5) = 2 - ixmax(5) = nlocal-1 + ixmax(5) = nlocalx-1 iymin(5) = 2 - iymax(5) = nlocal-1 + iymax(5) = nlocaly-1 else ! If there are no interior cells, do not overlap ! communications and calculations, just loop over interior cells ixmin(5) = 1 - ixmax(5) = nlocal + ixmax(5) = nlocalx iymin(5) = 1 - iymax(5) = nlocal + iymax(5) = nlocaly end if dix = +1 diy = +1 @@ -1474,21 +1482,24 @@ end subroutine construct_vertical_coeff integer :: nib,nie,njb,nje,nkb,nke integer :: ii,ij,ik integer :: iindex - integer :: n_lin + integer :: n_lin + + integer :: nlocalx,nlocaly ! ! init size , param ! nz = u%grid_param%nz nlocal = u%ix_max -u%ix_min + 1 + nlocalx = u%ix_max -u%ix_min + 1 + nlocaly = u%iy_max -u%iy_min + 1 halo_size = u%halo_size - n_lin = (nlocal+2*halo_size)**2 * (nz+2) + n_lin = (nlocalx+2*halo_size) * (nlocaly+2*halo_size) * (nz+2) - nib = 1-halo_size - nie = nlocal+halo_size + nie = nlocalx+halo_size njb = 1-halo_size - nje = nlocal+halo_size + nje = nlocaly+halo_size nkb = 0 nke = nz + 1 @@ -1559,9 +1570,16 @@ end subroutine construct_vertical_coeff Tjacobi(level)%Sutmp%st => zSutmp_st end if end if - + +#ifdef MG_DEBUG + if (i_am_master_mpi) write(STDOUT,*) ' ====== begin boundary_mnh_mnh ====== level=',level + call print_scalaprod2(level , m , u , "Befor boundary_mnh u=" ) +#endif call boundary_mnh(u) - +#ifdef MG_DEBUG + call print_scalaprod2(level , m , u , "After boundary_mnh u=" ) +#endif + #ifdef OVERLAPCOMMS overlap_comms = (nlocal > 2) #else @@ -1570,37 +1588,37 @@ end subroutine construct_vertical_coeff ! Block 1 (N) ixmin(1) = 1 - ixmax(1) = nlocal + ixmax(1) = nlocalx iymin(1) = 1 iymax(1) = 1 ! Block 2 (S) ixmin(2) = 1 - ixmax(2) = nlocal - iymin(2) = nlocal - iymax(2) = nlocal + ixmax(2) = nlocalx + iymin(2) = nlocaly + iymax(2) = nlocaly ! Block 3 (W) ixmin(3) = 1 ixmax(3) = 1 iymin(3) = 2 - iymax(3) = nlocal-1 + iymax(3) = nlocaly-1 ! Block 4 (E) - ixmin(4) = nlocal - ixmax(4) = nlocal + ixmin(4) = nlocalx + ixmax(4) = nlocalx iymin(4) = 2 - iymax(4) = nlocal-1 + iymax(4) = nlocaly-1 ! Block 5 (INTERIOR) if (overlap_comms) then ixmin(5) = 2 - ixmax(5) = nlocal-1 + ixmax(5) = nlocalx-1 iymin(5) = 2 - iymax(5) = nlocal-1 + iymax(5) = nlocaly-1 else ! If there are no interior cells, do not overlap ! communications and calculations, just loop over interior cells ixmin(5) = 1 - ixmax(5) = nlocal + ixmax(5) = nlocalx iymin(5) = 1 - iymax(5) = nlocal + iymax(5) = nlocaly end if ! Temporary vector diff --git a/src/ZSOLVER/tensorproductmultigrid_Source/multigrid.f90 b/src/ZSOLVER/tensorproductmultigrid_Source/multigrid.f90 index b72b933725bbe147bc38280112bbe9d0a640f866..217ad17235bfe0de7b88583e7570af4fcb6c6a82 100644 --- a/src/ZSOLVER/tensorproductmultigrid_Source/multigrid.f90 +++ b/src/ZSOLVER/tensorproductmultigrid_Source/multigrid.f90 @@ -174,6 +174,8 @@ contains real , dimension(:,:,:) , pointer , contiguous :: zxu_mg_st,zxb_mg_st,zxr_mg_st + integer :: nlocalx,nlocaly + Gmg_initialized = .TRUE. if (i_am_master_mpi) & @@ -214,14 +216,16 @@ contains allocate(xr_mg(mg_param%n_lev,0:pproc)) n = grid_param%n nlocal = n/(2**pproc) + nlocalx = n/(2**pprocx) + nlocaly = n/(2**pprocy) nz = grid_param%nz L = grid_param%L H = grid_param%H level = mg_param%n_lev - m = pproc + m = pprocx ! pproc reduced_m = .false. ! - call zt1d_discretisation_init(nlocal,nlocal,nz) + call zt1d_discretisation_init(nlocalx,nlocaly,nz) ! ! Work out local processor coordinates (this is necessary to identify ! global coordinates) @@ -236,8 +240,8 @@ contains if (i_am_master_mpi) & write(STDOUT, & '(" Local gridsize (x,y,z) on level ",I3," m = ",I4," : ",I8," x ",I8," x ",I8)') & - level, m, nlocal, nlocal, nz - if (nlocal < 1) then + level, m, nlocalx, nlocaly, nz + if ( (nlocalx < 1) .or. (nlocaly < 1) ) then call fatalerror('Number of grid points < 1') end if @@ -254,16 +258,16 @@ contains icompx_min = 1 - (halo_size - 1) end if - if (p_horiz(1) == 2**pproc-1) then - icompy_max = nlocal + if (p_horiz(1) == 2**pprocy-1) then + icompy_max = nlocaly else - icompy_max = nlocal + (halo_size - 1) + icompy_max = nlocaly + (halo_size - 1) end if - if (p_horiz(2) == 2**pproc-1) then - icompx_max = nlocal + if (p_horiz(2) == 2**pprocx-1) then + icompx_max = nlocalx else - icompx_max = nlocal + (halo_size - 1) + icompx_max = nlocalx + (halo_size - 1) end if ! Allocate data @@ -283,20 +287,20 @@ contains endif if (LUseT) then - allocate(zxu_mg_st(1-halo_size:nlocal+halo_size, & - 1-halo_size:nlocal+halo_size, & + allocate(zxu_mg_st(1-halo_size:nlocalx+halo_size, & + 1-halo_size:nlocaly+halo_size, & 0:nz+1)) !$acc enter data create (zxu_mg_st) xu_mg(level,m)%st => zxu_mg_st - allocate(zxb_mg_st(1-halo_size:nlocal+halo_size, & - 1-halo_size:nlocal+halo_size, & + allocate(zxb_mg_st(1-halo_size:nlocalx+halo_size, & + 1-halo_size:nlocaly+halo_size, & 0:nz+1)) !$acc enter data create (zxb_mg_st) xb_mg(level,m)%st => zxb_mg_st - allocate(zxr_mg_st(1-halo_size:nlocal+halo_size, & - 1-halo_size:nlocal+halo_size, & + allocate(zxr_mg_st(1-halo_size:nlocalx+halo_size, & + 1-halo_size:nlocaly+halo_size, & 0:nz+1)) !$acc enter data create (zxr_mg_st) xr_mg(level,m)%st => zxr_mg_st @@ -311,10 +315,10 @@ contains ! NB: 1st coordinate is in the y-direction of the processor grid, ! second coordinate is in the x-direction (see comments in ! communication module) - iy_min = (p_horiz(1)/2**(pproc-m))*nlocal+1 - iy_max = (p_horiz(1)/2**(pproc-m)+1)*nlocal - ix_min = p_horiz(2)/2**(pproc-m)*nlocal+1 - ix_max = (p_horiz(2)/2**(pproc-m)+1)*nlocal + iy_min = (p_horiz(1)/2**(pproc-m))*nlocaly+1 + iy_max = (p_horiz(1)/2**(pproc-m)+1)*nlocaly + ix_min = p_horiz(2)/2**(pproc-m)*nlocalx+1 + ix_max = (p_horiz(2)/2**(pproc-m)+1)*nlocalx ! Set grid parameters and local data ranges ! Note that only n (and possibly nz) change as we ! move down the levels @@ -365,8 +369,8 @@ contains ! Are these grids active? if ( (m == pproc) .or. & - ( (mod(p_horiz(1),2**(pproc-m)) == 0) .and. & - (mod(p_horiz(2),2**(pproc-m)) == 0) ) ) then + ( (mod(p_horiz(1),2**(pprocx-m)) == 0) .and. & + (mod(p_horiz(2),2**(pprocx-m)) == 0) ) ) then grid_active = .true. else grid_active = .false. @@ -395,12 +399,16 @@ contains reduced_m = .true. m = m-1 nlocal = 2*nlocal + nlocalx = 2*nlocalx + nlocaly = 2*nlocaly cycle end if reduced_m = .false. level = level-1 n = n/2 nlocal = nlocal/2 + nlocalx = nlocalx/2 + nlocaly = nlocaly/2 end do if (i_am_master_mpi) & write(STDOUT,*) '' @@ -663,6 +671,8 @@ contains integer :: ierr integer :: iblock + integer :: nlocalx,nlocaly + ! Needed for interpolation matrix #ifdef PIECEWISELINEAR #else @@ -676,6 +686,8 @@ contains #endif nlocal = phicoarse%ix_max-phicoarse%ix_min+1 + nlocalx = phicoarse%ix_max-phicoarse%ix_min+1 + nlocaly = phicoarse%iy_max-phicoarse%iy_min+1 n = phicoarse%grid_param%n nz = phicoarse%grid_param%nz @@ -686,37 +698,37 @@ contains #endif ! Block 1 (N) ixmin(1) = 1 - ixmax(1) = nlocal + ixmax(1) = nlocalx iymin(1) = 1 iymax(1) = 1 ! Block 2 (S) ixmin(2) = 1 - ixmax(2) = nlocal - iymin(2) = nlocal - iymax(2) = nlocal + ixmax(2) = nlocalx + iymin(2) = nlocaly + iymax(2) = nlocaly ! Block 3 (W) ixmin(3) = 1 ixmax(3) = 1 iymin(3) = 2 - iymax(3) = nlocal-1 + iymax(3) = nlocaly-1 ! Block 4 (E) - ixmin(4) = nlocal - ixmax(4) = nlocal + ixmin(4) = nlocalx + ixmax(4) = nlocalx iymin(4) = 2 - iymax(4) = nlocal-1 + iymax(4) = nlocaly-1 ! Block 5 (INTERIOR) if (overlap_comms) then ixmin(5) = 2 - ixmax(5) = nlocal-1 + ixmax(5) = nlocalx-1 iymin(5) = 2 - iymax(5) = nlocal-1 + iymax(5) = nlocaly-1 else ! If there are no interior cells, do not overlap ! communications and calculations, just loop over interior cells ixmin(5) = 1 - ixmax(5) = nlocal + ixmax(5) = nlocalx iymin(5) = 1 - iymax(5) = nlocal + iymax(5) = nlocaly end if ! *** Constant prolongation or (tri-) linear prolongation *** @@ -1252,9 +1264,18 @@ contains ! Perform n_presmooth smoothing steps call start_timer(t_smooth(level,m)) call start_timer(t_total(level,m)) +#ifdef MG_DEBUG + if (i_am_master_mpi) write(STDOUT,*) ' ====== begin smooth_mnh DIRECTION_FORWARD ====== level=',level + call print_scalaprod2(level , m , b(level ,m) , "Befor smooth_mnh b(level ,m )=" ) + call print_scalaprod2(level , m , u(level ,m) , "Befor smooth_mnh u(level ,m )=" ) +#endif call smooth_mnh(level,m,mg_param%n_presmooth, & DIRECTION_FORWARD, & b(level,m),u(level,m)) +#ifdef MG_DEBUG + call print_scalaprod2(level , m , b(level ,m) , "After smooth_mnh b(level ,m )=" ) + call print_scalaprod2(level , m , u(level ,m) , "After smooth_mnh u(level ,m )=" ) +#endif call finish_timer(t_total(level,m)) call finish_timer(t_smooth(level,m)) ! Calculate residual @@ -1266,15 +1287,27 @@ contains ! Restrict residual call start_timer(t_restrict(level,m)) call start_timer(t_total(level,m)) +#ifdef MG_DEBUG + if (i_am_master_mpi) write(STDOUT,*) ' ====== begin restrict_mnh ====== level=',level + call print_scalaprod2(level-1 , m , b(level-1,m) , "Befor restrict_mnh b(level-1,m )=" ) + call print_scalaprod2(level , m , r(level ,m) , "Befor restrict_mnh r(level ,m )=" ) +#endif call restrict_mnh(r(level,m),b(level-1,m)) +#ifdef MG_DEBUG + call print_scalaprod2(level-1 , m , b(level-1,m) , "After restrict_mnh b(level-1,m )=" ) + call print_scalaprod2(level , m , r(level ,m) , "After restrict_mnh r(level ,m )=" ) +#endif call finish_timer(t_total(level,m)) call finish_timer(t_restrict(level,m)) if ( ((level-1) .le. splitlevel) .and. (m > 0) ) then ! Collect data on coarser level call start_timer(t_total(level,m)) call collect(level-1,m,b(level-1,m),b(level-1,m-1)) -!!$ call print_scalaprod2(level-1, m , b(level-1,m) , "After collect b(level-1,m )=" ) -!!$ call print_scalaprod2(level-1, m-1, b(level-1,m-1), "After collect b(level-1,m-1)=" ) +#ifdef MG_DEBUG + if (i_am_master_mpi) write(STDOUT,*) ' ====== after collect ====== level-1=',level-1 + call print_scalaprod2(level-1, m , b(level-1,m) , "After collect b(level-1,m )=" ) + call print_scalaprod2(level-1, m-1, b(level-1,m-1), "After collect b(level-1,m-1)=" ) +#endif call finish_timer(t_total(level,m)) ! Set initial solution on coarser level to zero (incl. halos!) if (LUseO) u(level-1,m-1)%s(:,:,:) = 0.0_rl @@ -1289,8 +1322,11 @@ contains ! Distribute data on coarser grid call start_timer(t_total(level,m)) call distribute(level-1,m,u(level-1,m-1),u(level-1,m)) -!!$ call print_scalaprod2(level-1, m , u(level-1,m) , "After distribute u(level-1,m )=" ) -!!$ call print_scalaprod2(level-1, m-1, u(level-1,m-1), "After distribute u(level-1,m-1)=" ) +#ifdef MG_DEBUG + if (i_am_master_mpi) write(STDOUT,*) ' ====== after distribute ====== level-1=',level-1 + call print_scalaprod2(level-1, m , u(level-1,m) , "After distribute u(level-1,m )=" ) + call print_scalaprod2(level-1, m-1, u(level-1,m-1), "After distribute u(level-1,m-1)=" ) +#endif call finish_timer(t_total(level,m)) else ! Set initial solution on coarser level to zero (incl. halos!) @@ -1320,13 +1356,31 @@ contains ! Prolongate error call start_timer(t_prolongate(level,m)) call start_timer(t_total(level,m)) + +#ifdef MG_DEBUG + if (i_am_master_mpi) write(STDOUT,*) ' ====== begin haloswap_mnh_mnh ====== level-1=',level-1 + call print_scalaprod2(level-1 , m , u(level-1,m) , "Befor haloswap_mnh u(level-1,m )=" ) +#endif call haloswap_mnh(level-1,m,u(level-1,m)) +#ifdef MG_DEBUG + call print_scalaprod2(level-1 , m , u(level-1,m) , "After haloswap_mnh u(level-1,m )=" ) + + if (i_am_master_mpi) write(STDOUT,*) ' ====== begin boundary_mnh_mnh ====== level-1=',level-1 + call print_scalaprod2(level-1 , m , u(level-1,m) , "Befor boundary_mnh u(level-1,m )=" ) +#endif call boundary_mnh(u(level-1,m)) -!!$ call print_scalaprod2(level-1 , m , u(level-1,m) , "Befor prolongate_mnh u(level-1,m )=" ) -!!$ call print_scalaprod2(level , m , r(level ,m) , "Befor prolongate_mnh r(level ,m )=" ) +#ifdef MG_DEBUG + call print_scalaprod2(level-1 , m , u(level-1,m) , "After boundary_mnh u(level-1,m )=" ) + + if (i_am_master_mpi) write(STDOUT,*) ' ====== begin prolongate_mnh ====== level=',level + call print_scalaprod2(level-1 , m , u(level-1,m) , "Befor prolongate_mnh u(level-1,m )=" ) + call print_scalaprod2(level , m , r(level ,m) , "Befor prolongate_mnh r(level ,m )=" ) +#endif call prolongate_mnh(level,m,u(level-1,m),r(level,m)) -!!$ call print_scalaprod2(level-1 , m , u(level-1,m) , "After prolongate_mnh u(level-1,m )=" ) -!!$ call print_scalaprod2(level , m , r(level ,m) , "After prolongate_mnh r(level ,m )=" ) +#ifdef MG_DEBUG + call print_scalaprod2(level-1 , m , u(level-1,m) , "After prolongate_mnh u(level-1,m )=" ) + call print_scalaprod2(level , m , r(level ,m) , "After prolongate_mnh r(level ,m )=" ) +#endif call finish_timer(t_total(level,m)) call finish_timer(t_prolongate(level,m)) ! Add error to fine grid solution @@ -1350,7 +1404,7 @@ contains call start_timer(t_total(level,m)) if (mg_param%coarsegridsolver == COARSEGRIDSOLVER_CG) then call cg_solve_mnh(level,m,b(level,m),u(level,m)) -!!$ call print_scalaprod2(level, m , u(level,m), "After cg_solve_mnh u=(level,m)" ) + call print_scalaprod2(level, m , u(level,m), "After cg_solve_mnh u=(level,m)" ) else if (mg_param%coarsegridsolver == COARSEGRIDSOLVER_SMOOTHER) then ! Smooth on coarsest level call smooth_mnh(level,m, & @@ -1415,7 +1469,7 @@ contains call restrict(r(level,m),b(level-1,m)) call finish_timer(t_total(level,m)) call finish_timer(t_restrict(level,m)) - if ( ((level-1) .le. splitlevel) .and. (m > 0) ) then + if ( ((level-1) .le. splitlevel) .and. (m >= 0) ) then ! Collect data on coarser level call start_timer(t_total(level,m)) call collect(level-1,m,b(level-1,m),b(level-1,m-1)) @@ -1500,7 +1554,7 @@ contains call haloswap(level,m,u(level,m)) end do call mpi_barrier(MPI_COMM_HORIZ,ierr) - if ( ((level-1) .le. splitlevel) .and. (m > 0) ) then + if ( ((level-1) .le. splitlevel) .and. (m >= 0) ) then call mpi_barrier(MPI_COMM_HORIZ,ierr) do i=1,nhaloswap call haloswap(level-1,m,u(level-1,m)) @@ -1627,7 +1681,7 @@ contains if (mg_param%verbose > 0) then if (i_am_master_mpi) then write(STDOUT,'(" *** Multigrid solver ***")') - write(STDOUT,'(" <MG> Initial residual : ",E10.5)') res_initial + write(STDOUT,'(" <MG> Initial residual : ",E23.10)') res_initial end if end if if (mg_param%verbose > 0) then