  IMPLICIT NONE
  ! receive and displacement counts for gatherv
  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_y, dsp_y, rcv_zy, dsp_zy
  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_zp_e,  dsp_zp_e
  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_yp_e,  dsp_yp_e
  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_zyp_e, dsp_zyp_e
  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_zp_i,  dsp_zp_i
  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_yp_i,  dsp_yp_i
  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_zyp_i, dsp_zyp_i
  ! Various buffers used
  COMPLEX(dp), ALLOCATABLE, DIMENSION(:,:,:)     :: buff_xy_zBC
  COMPLEX(dp), ALLOCATABLE, DIMENSION(:,:,:,:,:) :: buff_pjxy_zBC_e
  COMPLEX(dp), ALLOCATABLE, DIMENSION(:,:,:,:,:) :: buff_pjxy_zBC_i

  PUBLIC :: manual_0D_bcast, manual_3D_bcast, init_parallel_var, &
            gather_xyz, gather_pjz_i, gather_pjxyz_e, gather_pjxyz_i

CONTAINS

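  !!!!! Build the receive counts and displacements used by the MPI_GATHERV calls !!!!!
  ! For every communicator involved in a gather, rcv_*(i) stores the number of
  ! points owned by rank i and dsp_*(i) the offset at which they are placed in the
  ! gathered buffer, i.e. the cumulative sum dsp(0)=0, dsp(i)=dsp(i-1)+rcv(i-1).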
  SUBROUTINE init_parallel_var
    INTEGER :: i_
    !!!!!! XYZ gather variables
    !! Y reduction at constant x and z
    ! number of points received and displacement for the y reduction
    ALLOCATE(rcv_y(0:num_procs_ky-1),dsp_y(0:num_procs_ky-1)) !Displacement sizes for balance diagnostic
    ! all processes share their local number of points
    CALL MPI_ALLGATHER(local_nky,1,MPI_INTEGER,rcv_y,1,MPI_INTEGER,comm_ky,ierr)
    ! the displacement array can be built from the local counts as
    dsp_y(0)=0
    DO i_=1,num_procs_ky-1
       dsp_y(i_) =dsp_y(i_-1) + rcv_y(i_-1)
    END DO
    !! Z reduction for full slices of y data but constant x
    ! number of points received and displacement for the z reduction
    ALLOCATE(rcv_zy(0:num_procs_z-1),dsp_zy(0:num_procs_z-1)) !Displacement sizes for balance diagnostic
    ! all processes share their local number of points
    CALL MPI_ALLGATHER(local_nz*Nky,1,MPI_INTEGER,rcv_zy,1,MPI_INTEGER,comm_z,ierr)
    ! the displacement array can be built from the local counts as
    dsp_zy(0)=0
    DO i_=1,num_procs_z-1
       dsp_zy(i_) =dsp_zy(i_-1) + rcv_zy(i_-1)
    END DO

    !!!!! PJZ gather variables
    !! P reduction at constant j and z is already done in module GRID
    !! Z reduction for full slices of p data but constant j
    ! number of points received and displacement for the z reduction
    ALLOCATE(rcv_zp_i(0:num_procs_z-1),dsp_zp_i(0:num_procs_z-1)) !Displacement sizes for balance diagnostic
    ! all processes share their local number of points
    CALL MPI_ALLGATHER(local_nz*Np_i,1,MPI_INTEGER,rcv_zp_i,1,MPI_INTEGER,comm_z,ierr)
    ! the displacement array can be built from the local counts as
    dsp_zp_i(0)=0
    DO i_=1,num_procs_z-1
       dsp_zp_i(i_) =dsp_zp_i(i_-1) + rcv_zp_i(i_-1)
    END DO

    !!!!! PJXYZ gather variables
    !! Y reduction for full slices of p data but constant j
    ! number of points received and displacement for the y reduction
    ALLOCATE(rcv_yp_i(0:num_procs_ky-1),dsp_yp_i(0:num_procs_ky-1)) !Displacement sizes for balance diagnostic
    ! all processes share their local number of points
    CALL MPI_ALLGATHER(local_nky*Np_i,1,MPI_INTEGER,rcv_yp_i,1,MPI_INTEGER,comm_ky,ierr)
    ! the displacement array can be built from the local counts as
    dsp_yp_i(0)=0
    DO i_=1,num_procs_ky-1
       dsp_yp_i(i_) =dsp_yp_i(i_-1) + rcv_yp_i(i_-1)
    END DO
    !! Z reduction for full slices of py data but constant j
    ! number of points received and displacement for the z reduction
    ALLOCATE(rcv_zyp_i(0:num_procs_z-1),dsp_zyp_i(0:num_procs_z-1)) !Displacement sizes for balance diagnostic
    ! all processes share their local number of points
    CALL MPI_ALLGATHER(local_nz*Np_i*Nky,1,MPI_INTEGER,rcv_zyp_i,1,MPI_INTEGER,comm_z,ierr)
    ! the displacement array can be built from the local counts as
    dsp_zyp_i(0)=0
    DO i_=1,num_procs_z-1
       dsp_zyp_i(i_) =dsp_zyp_i(i_-1) + rcv_zyp_i(i_-1)
    END DO

    ! ELECTRONS
    !! Z reduction for full slices of p data but constant j
    ! number of points received and displacement for the z reduction
    ALLOCATE(rcv_zp_e(0:num_procs_z-1),dsp_zp_e(0:num_procs_z-1)) !Displacement sizes for balance diagnostic
    ! all processes share their local number of points
    CALL MPI_ALLGATHER(local_nz*Np_e,1,MPI_INTEGER,rcv_zp_e,1,MPI_INTEGER,comm_z,ierr)
    ! the displacement array can be built from the local counts as
    dsp_zp_e(0)=0
    DO i_=1,num_procs_z-1
       dsp_zp_e(i_) =dsp_zp_e(i_-1) + rcv_zp_e(i_-1)
    END DO

    !!!!! PJXYZ gather variables
    !! Y reduction for full slices of p data but constant j
    ! number of points received and displacement for the y reduction
    ALLOCATE(rcv_yp_e(0:num_procs_ky-1),dsp_yp_e(0:num_procs_ky-1)) !Displacement sizes for balance diagnostic
    ! all processes share their local number of points
    CALL MPI_ALLGATHER(local_nky*Np_e,1,MPI_INTEGER,rcv_yp_e,1,MPI_INTEGER,comm_ky,ierr)
    ! the displacement array can be built from the local counts as
    dsp_yp_e(0)=0
    DO i_=1,num_procs_ky-1
       dsp_yp_e(i_) =dsp_yp_e(i_-1) + rcv_yp_e(i_-1)
    END DO
    !! Z reduction for full slices of py data but constant j
    ! number of points received and displacement for the z reduction
    ALLOCATE(rcv_zyp_e(0:num_procs_z-1),dsp_zyp_e(0:num_procs_z-1)) !Displacement sizes for balance diagnostic
    ! all processes share their local number of points
    CALL MPI_ALLGATHER(local_nz*Np_e*Nky,1,MPI_INTEGER,rcv_zyp_e,1,MPI_INTEGER,comm_z,ierr)
    ! the displacement array can be built from the local counts as
    dsp_zyp_e(0)=0
    DO i_=1,num_procs_z-1
       dsp_zyp_e(i_) =dsp_zyp_e(i_-1) + rcv_zyp_e(i_-1)
    END DO

    !! Allocate the communication buffers used for the parallel z boundary conditions
    ALLOCATE(buff_xy_zBC(ikys:ikye,ikxs:ikxe,-2:2))
    IF(KIN_E) &
    ALLOCATE(buff_pjxy_zBC_e(ipgs_e:ipge_e,ijgs_e:ijge_e,ikys:ikye,ikxs:ikxe,-2:2))
    ALLOCATE(buff_pjxy_zBC_i(ipgs_i:ipge_i,ijgs_i:ijge_i,ikys:ikye,ikxs:ikxe,-2:2))
  END SUBROUTINE init_parallel_var

  !!!!! Gather a field in spatial coordinates on rank 0 !!!!!
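  ! Two-stage gather: the local ky chunks are first collected on root_ky inside
  ! comm_ky (for each kx and local z), then the full-ky z slices are collected on
  ! root_z inside comm_z, so that global rank 0 ends up with the full (ky,kx,z) array.
  ! Illustrative call, with hypothetical caller arrays field_loc/field_glob:
  !   CALL gather_xyz(field_loc, field_glob)  ! field_glob is meaningful on my_id=0 only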
  SUBROUTINE gather_xyz(field_sub,field_full)
    COMPLEX(dp), DIMENSION(ikys:ikye, 1:Nkx, izs:ize), INTENT(IN)  :: field_sub
    COMPLEX(dp), DIMENSION(   1:Nky,  1:Nkx,   1:Nz),  INTENT(OUT) :: field_full
    COMPLEX(dp), DIMENSION(ikys:ikye)           :: buffer_ly_cz !local y, constant z
    COMPLEX(dp), DIMENSION(   1:Nky )           :: buffer_fy_cz !full  y, constant z
    COMPLEX(dp), DIMENSION(   1:Nky,  izs:ize ) :: buffer_fy_lz !full  y, local z
    COMPLEX(dp), DIMENSION(   1:Nky,    1:Nz  ) :: buffer_fy_fz !full  y, full  z
    INTEGER :: snd_y, snd_z, root_p, root_z, root_ky, ix, iz

    snd_y  = local_nky    ! Number of points to send along y (per z)
    snd_z  = Nky*local_nz ! Number of points to send along z (full y)

    root_p = 0; root_z = 0; root_ky = 0
    IF(rank_p .EQ. root_p) THEN
      DO ix = 1,Nkx
        DO iz = izs,ize
          ! fill a buffer to contain a slice of data at constant kx and z
          buffer_ly_cz(ikys:ikye) = field_sub(ikys:ikye,ix,iz)
          CALL MPI_GATHERV(buffer_ly_cz, snd_y,        MPI_DOUBLE_COMPLEX, &
                           buffer_fy_cz, rcv_y, dsp_y, MPI_DOUBLE_COMPLEX, &
                           root_ky, comm_ky, ierr)
          buffer_fy_lz(1:Nky,iz) = buffer_fy_cz(1:Nky)
        ENDDO

        ! send the full y line, now held by root_ky, along z
        IF(rank_ky .EQ. 0) THEN
          CALL MPI_GATHERV(buffer_fy_lz, snd_z,          MPI_DOUBLE_COMPLEX, &
                           buffer_fy_fz, rcv_zy, dsp_zy, MPI_DOUBLE_COMPLEX, &
                           root_z, comm_z, ierr)
        ENDIF
        ! Rank 0 (the one performing the output) rebuilds the whole array
        IF(my_id .EQ. 0) &
          field_full(1:Nky,ix,1:Nz) = buffer_fy_fz(1:Nky,1:Nz)
      ENDDO
    ENDIF
  END SUBROUTINE gather_xyz

  !!!!! Gather a field in kinetic + z coordinates on rank 0 !!!!!
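  ! Same two-stage pattern as gather_xyz, but the first reduction runs over the
  ! p index (comm_p) and the second over z (comm_z), repeated for each j.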
  SUBROUTINE gather_pjz_i(field_sub,field_full)
    REAL(dp), DIMENSION(ips_i:ipe_i, 1:Nj_i, izs:ize), INTENT(IN)  :: field_sub
    REAL(dp), DIMENSION(   1:Np_i,   1:Nj_i,   1:Nz),  INTENT(OUT) :: field_full
    REAL(dp), DIMENSION(ips_i:ipe_i)          :: buffer_lp_cz !local p, constant z
    REAL(dp), DIMENSION(   1:Np_i )           :: buffer_fp_cz !full  p, constant z
    REAL(dp), DIMENSION(   1:Np_i,  izs:ize ) :: buffer_fp_lz !full  p, local z
    REAL(dp), DIMENSION(   1:Np_i,    1:Nz  ) :: buffer_fp_fz !full  p, full  z
    INTEGER :: snd_p, snd_z, root_p, root_z, root_ky, ij, iz
    snd_p  = local_np_i    ! Number of points to send along p (per z)
    snd_z  = Np_i*local_nz ! Number of points to send along z (full p)

    root_p = 0; root_z = 0; root_ky = 0
    IF(rank_ky .EQ. root_ky) THEN
      DO ij = 1,Nj_i
        DO iz = izs,ize
          ! fill a buffer to contain a slice of data at constant j and z
          buffer_lp_cz(ips_i:ipe_i) = field_sub(ips_i:ipe_i,ij,iz)
          CALL MPI_GATHERV(buffer_lp_cz, snd_p,            MPI_DOUBLE_PRECISION, &
                           buffer_fp_cz, rcv_p_i, dsp_p_i, MPI_DOUBLE_PRECISION, &
                           root_p, comm_p, ierr)
          buffer_fp_lz(1:Np_i,iz) = buffer_fp_cz(1:Np_i)
        ENDDO
        ! send the full p line, now held by root_p, along z
        IF(rank_p .EQ. 0) THEN
          CALL MPI_GATHERV(buffer_fp_lz, snd_z,              MPI_DOUBLE_PRECISION, &
                           buffer_fp_fz, rcv_zp_i, dsp_zp_i, MPI_DOUBLE_PRECISION, &
                           root_z, comm_z, ierr)
        ENDIF
        ! Rank 0 (the one performing the output) rebuilds the whole array
        IF(my_id .EQ. 0) &
          field_full(1:Np_i,ij,1:Nz) = buffer_fp_fz(1:Np_i,1:Nz)
      ENDDO
    ENDIF
  END SUBROUTINE gather_pjz_i

  SUBROUTINE gather_pjz_e(field_sub,field_full)
    REAL(dp), DIMENSION(ips_e:ipe_e,   1:jmaxe+1, izs:ize), INTENT(IN)    :: field_sub
    REAL(dp), DIMENSION(   1:pmaxe+1,  1:jmaxe+1,   1:Nz),  INTENT(OUT) :: field_full
    REAL(dp), DIMENSION(ips_e:ipe_e)         :: buffer_lp_cz !local p, constant z
    REAL(dp), DIMENSION(   1:pmaxe+1 )       :: buffer_fp_cz !full  p, constant z
    REAL(dp), DIMENSION(   1:pmaxe+1,  izs:ize ) :: buffer_fp_lz !full  p, local z
    REAL(dp), DIMENSION(   1:pmaxe+1,    1:Nz  ) :: buffer_fp_fz !full  p, full  z
    INTEGER :: snd_p, snd_z, root_p, root_z, root_ky, ij, iz
    snd_p  = local_np_e    ! Number of points to send along p (per z)
    snd_z  = Np_e*local_nz ! Number of points to send along z (full p)

    root_p = 0; root_z = 0; root_ky = 0
    IF(rank_ky .EQ. root_ky) THEN
      DO ij = 1,jmaxe+1
        DO iz = izs,ize
          ! fill a buffer to contain a slice of data at constant j and z
          buffer_lp_cz(ips_e:ipe_e) = field_sub(ips_e:ipe_e,ij,iz)
          CALL MPI_GATHERV(buffer_lp_cz, snd_p,            MPI_DOUBLE_PRECISION, &
                           buffer_fp_cz, rcv_p_e, dsp_p_e, MPI_DOUBLE_PRECISION, &
                           root_p, comm_p, ierr)
          buffer_fp_lz(1:Np_e,iz) = buffer_fp_cz(1:Np_e)
        ENDDO
        ! send the full p line, now held by root_p, along z
        IF(rank_p .EQ. 0) THEN
          CALL MPI_GATHERV(buffer_fp_lz, snd_z,              MPI_DOUBLE_PRECISION, &
                           buffer_fp_fz, rcv_zp_e, dsp_zp_e, MPI_DOUBLE_PRECISION, &
                           root_z, comm_z, ierr)
        ENDIF
        ! Rank 0 (the one performing the output) rebuilds the whole array
        IF(my_id .EQ. 0) &
          field_full(1:Np_e,ij,1:Nz) = buffer_fp_fz(1:Np_e,1:Nz)
      ENDDO
    ENDIF
  END SUBROUTINE gather_pjz_e

  !!!!! Gather a field in kinetic + spatial coordinates on rank 0 !!!!!
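  ! Three-stage gather: p chunks are collected on root_p (comm_p), then the full-p
  ! ky slices on root_ky (comm_ky), and finally the full-(p,ky) z slices on root_z
  ! (comm_z), looping over j and kx; global rank 0 ends up with the full 5D array.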
  SUBROUTINE gather_pjxyz_i(field_sub,field_full)
    COMPLEX(dp), DIMENSION( ips_i:ipe_i, 1:Nj_i, ikys:ikye, 1:Nkx, izs:ize), INTENT(IN)  :: field_sub
    COMPLEX(dp), DIMENSION(     1:Np_i,  1:Nj_i,    1:Nky,  1:Nkx,   1:Nz),  INTENT(OUT) :: field_full
    COMPLEX(dp), DIMENSION(ips_i:ipe_i)       :: buffer_lp_cy_cz     !local p, constant y, constant z
    COMPLEX(dp), DIMENSION(1:Np_i)            :: buffer_fp_cy_cz     ! full p, constant y, constant z
    COMPLEX(dp), DIMENSION(1:Np_i, ikys:ikye)    :: buffer_fp_ly_cz     ! full p,    local y, constant z
    COMPLEX(dp), DIMENSION(1:Np_i,    1:Nky )    :: buffer_fp_fy_cz     ! full p,     full y, constant z
    COMPLEX(dp), DIMENSION(1:Np_i,    1:Nky, izs:ize ) :: buffer_fp_fy_lz ! full p,     full y, local    z
    COMPLEX(dp), DIMENSION(1:Np_i,    1:Nky,   1:Nz  ) :: buffer_fp_fy_fz ! full p,     full y, full     z
    INTEGER :: snd_p, snd_y, snd_z, root_p, root_z, root_ky, iy, ix, iz, ij

    snd_p  =     local_np_i    ! Number of points to send along p (per z,y)
    snd_y  = Np_i*local_nky    ! Number of points to send along y (per z, full p)
    snd_z  = Np_i*Nky*local_nz ! Number of points to send along z (full y, full p)

    root_p = 0; root_z = 0; root_ky = 0

    j: DO ij = 1,Nj_i
      x: DO ix = 1,Nkx
        z: DO iz = izs,ize
          y: DO iy = ikys,ikye
            ! fill a buffer to contain a slice of p data at constant j, ky, kx and z
            buffer_lp_cy_cz(ips_i:ipe_i) = field_sub(ips_i:ipe_i,ij,iy,ix,iz)
            CALL MPI_GATHERV(buffer_lp_cy_cz, snd_p,            MPI_DOUBLE_COMPLEX, &
                             buffer_fp_cy_cz, rcv_p_i, dsp_p_i, MPI_DOUBLE_COMPLEX, &
                             root_p, comm_p, ierr)
            buffer_fp_ly_cz(1:Np_i,iy) = buffer_fp_cy_cz(1:Np_i)
          ENDDO y
          ! send the full line on p contained by root_p
          IF(rank_p .EQ. 0) THEN
            CALL MPI_GATHERV(buffer_fp_ly_cz, snd_y,        MPI_DOUBLE_COMPLEX, &
                             buffer_fp_fy_cz, rcv_yp_i, dsp_yp_i, MPI_DOUBLE_COMPLEX, &
                             root_ky, comm_ky, ierr)
            buffer_fp_fy_lz(1:Np_i,1:Nky,iz) = buffer_fp_fy_cz(1:Np_i,1:Nky)
          ENDIF
        ENDDO z
        ! send the full (p,ky) slice, now held by root_ky, along z
        IF(rank_ky .EQ. 0) THEN
          CALL MPI_GATHERV(buffer_fp_fy_lz, snd_z,                MPI_DOUBLE_COMPLEX, &
                           buffer_fp_fy_fz, rcv_zyp_i, dsp_zyp_i, MPI_DOUBLE_COMPLEX, &
                           root_z, comm_z, ierr)
        ENDIF
        ! Rank 0 (the one performing the output) rebuilds the whole array
        IF(my_id .EQ. 0) &
          field_full(1:Np_i,ij,1:Nky,ix,1:Nz) = buffer_fp_fy_fz(1:Np_i,1:Nky,1:Nz)
      ENDDO x
    ENDDO j

  END SUBROUTINE gather_pjxyz_i

  SUBROUTINE gather_pjxyz_e(field_sub,field_full)
    COMPLEX(dp), DIMENSION( ips_e:ipe_e, 1:Nj_e, ikys:ikye, 1:Nkx, izs:ize), INTENT(IN)    :: field_sub
    COMPLEX(dp), DIMENSION(     1:Np_e,  1:Nj_e,    1:Nky,  1:Nkx,   1:Nz),  INTENT(OUT) :: field_full
    COMPLEX(dp), DIMENSION(ips_e:ipe_e)       :: buffer_lp_cy_cz     !local p, constant y, constant z
    COMPLEX(dp), DIMENSION(1:Np_e)            :: buffer_fp_cy_cz     ! full p, constant y, constant z
    COMPLEX(dp), DIMENSION(1:Np_e, ikys:ikye)    :: buffer_fp_ly_cz     ! full p,    local y, constant z
    COMPLEX(dp), DIMENSION(1:Np_e,    1:Nky )    :: buffer_fp_fy_cz     ! full p,     full y, constant z
    COMPLEX(dp), DIMENSION(1:Np_e,    1:Nky, izs:ize ) :: buffer_fp_fy_lz ! full p,     full y, local    z
    COMPLEX(dp), DIMENSION(1:Np_e,    1:Nky,   1:Nz  ) :: buffer_fp_fy_fz ! full p,     full y, full     z
    INTEGER :: snd_p, snd_y, snd_z, root_p, root_z, root_ky, iy, ix, iz, ij
    snd_p  =     local_np_e    ! Number of points to send along p (per z,y)
    snd_y  = Np_e*local_nky    ! Number of points to send along y (per z, full p)
    snd_z  = Np_e*Nky*local_nz ! Number of points to send along z (full y, full p)

    root_p = 0; root_z = 0; root_ky = 0

    j: DO ij = 1,Nj_e
      x: DO ix = 1,Nkx
        z: DO iz = izs,ize
          y: DO iy = ikys,ikye
            ! fill a buffer to contain a slice of p data at constant j, ky, kx and z
            buffer_lp_cy_cz(ips_e:ipe_e) = field_sub(ips_e:ipe_e,ij,iy,ix,iz)
            CALL MPI_GATHERV(buffer_lp_cy_cz, snd_p,            MPI_DOUBLE_COMPLEX, &
                             buffer_fp_cy_cz, rcv_p_e, dsp_p_e, MPI_DOUBLE_COMPLEX, &
                             root_p, comm_p, ierr)
            buffer_fp_ly_cz(1:Np_e,iy) = buffer_fp_cy_cz(1:Np_e)
          ENDDO y
          ! send the full line on p contained by root_p
          IF(rank_p .EQ. 0) THEN
            CALL MPI_GATHERV(buffer_fp_ly_cz, snd_y,        MPI_DOUBLE_COMPLEX, &
                             buffer_fp_fy_cz, rcv_yp_e, dsp_yp_e, MPI_DOUBLE_COMPLEX, &
                             root_ky, comm_ky, ierr)
            buffer_fp_fy_lz(1:Np_e,1:Nky,iz) = buffer_fp_fy_cz(1:Np_e,1:Nky)
          ENDIF
        ENDDO z
        ! send the full (p,ky) slice, now held by root_ky, along z
        IF(rank_ky .EQ. 0) THEN
          CALL MPI_GATHERV(buffer_fp_fy_lz, snd_z,                MPI_DOUBLE_COMPLEX, &
                           buffer_fp_fy_fz, rcv_zyp_e, dsp_zyp_e, MPI_DOUBLE_COMPLEX, &
                           root_z, comm_z, ierr)
        ENDIF
        ! Rank 0 (the one performing the output) rebuilds the whole array
        IF(my_id .EQ. 0) &
          field_full(1:Np_e,ij,1:Nky,ix,1:Nz) = buffer_fp_fy_fz(1:Np_e,1:Nky,1:Nz)
      ENDDO x
    ENDDO j

  END SUBROUTINE gather_pjxyz_e

  !!!!! This is a manual way to do MPI_BCAST !!!!!!!!!!!
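  ! The send/receive loop below emulates a collective broadcast over comm_p; as a
  ! sketch, a single equivalent call (same root, count and communicator) would be
  !   CALL MPI_BCAST(field_, count, MPI_DOUBLE_COMPLEX, root, comm_p, ierr)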
  SUBROUTINE manual_3D_bcast(field_)
    USE grid
    IMPLICIT NONE
    COMPLEX(dp), INTENT(INOUT) :: field_(ikys:ikye,ikxs:ikxe,izs:ize)
    COMPLEX(dp) :: buffer(ikys:ikye,ikxs:ikxe,izs:ize)
    INTEGER     :: i_, root, world_rank, world_size, count
    root = 0;
    count = (ikye-ikys+1) * (ikxe-ikxs+1) * (ize-izs+1);

    CALL MPI_COMM_RANK(comm_p,world_rank,ierr)
    CALL MPI_COMM_SIZE(comm_p,world_size,ierr)
    IF (world_size .GT. 1) THEN
      !! Broadcast the field to the other processes sharing the same ky, kx and z range (communicator along p)
      IF (world_rank .EQ. root) THEN
        ! Fill the buffer
        DO iz = izs,ize
          DO ikx = ikxs,ikxe
            DO iky = ikys,ikye
              buffer(iky,ikx,iz) = field_(iky,ikx,iz)
            ENDDO
          ENDDO
        ENDDO
        ! Send it to all the other processes
        DO i_ = 0,num_procs_p-1
          IF (i_ .NE. world_rank) &
          CALL MPI_SEND(buffer, count, MPI_DOUBLE_COMPLEX, i_, 0, comm_p, ierr)
        ENDDO
      ELSE
        ! Receive the buffer from root
        CALL MPI_RECV(buffer, count, MPI_DOUBLE_COMPLEX, root, 0, comm_p, MPI_STATUS_IGNORE, ierr)
        ! Write it into the field
        DO iz = izs,ize
          DO ikx = ikxs,ikxe
            DO iky = ikys,ikye
              field_(iky,ikx,iz) = buffer(iky,ikx,iz)
            ENDDO
          ENDDO
        ENDDO
      ENDIF
    ENDIF
  END SUBROUTINE manual_3D_bcast

  !!!!! This is a manual way to do MPI_BCAST !!!!!!!!!!!
  SUBROUTINE manual_0D_bcast(v)
    USE grid
    IMPLICIT NONE
    COMPLEX(dp), INTENT(INOUT) :: v
    COMPLEX(dp) :: buffer
    INTEGER     :: i_, root, world_rank, world_size, count
    root = 0;
    count = 1;

    CALL MPI_COMM_RANK(comm_z,world_rank,ierr)
    CALL MPI_COMM_SIZE(comm_z,world_size,ierr)
    IF (world_size .GT. 1) THEN
      !! Broadcast the scalar to the other processes of the z communicator
      IF (world_rank .EQ. root) THEN
        ! Fill the buffer
        buffer = v
        ! Send it to all the other processes
        DO i_ = 0,num_procs_z-1
          IF (i_ .NE. world_rank) &
          CALL MPI_SEND(buffer, count, MPI_DOUBLE_COMPLEX, i_, 0, comm_z, ierr)
        ENDDO
      ELSE
        ! Receive the buffer from root
        CALL MPI_RECV(buffer, count, MPI_DOUBLE_COMPLEX, root, 0, comm_z, MPI_STATUS_IGNORE, ierr)
        ! Write it into the scalar
        v = buffer
      ENDIF
    ENDIF
  END SUBROUTINE manual_0D_bcast


END MODULE parallel