diff --git a/src/array_mod.F90 b/src/array_mod.F90
index ef3d3a4075c0998fed3dac64e6b9b92270be8061..3dca4e309433da22a41cbeae65d4477232297d87 100644
--- a/src/array_mod.F90
+++ b/src/array_mod.F90
@@ -63,18 +63,9 @@ MODULE array
   ! Poisson operator (ikx,iky,iz)
   REAL(dp), DIMENSION(:,:,:), ALLOCATABLE :: inv_poisson_op
 
-  !! Diagnostics (full arrays to gather on process 0 for output)
-  ! moments
-  COMPLEX(dp), DIMENSION(:,:,:,:,:), ALLOCATABLE :: moments_e_full
-  COMPLEX(dp), DIMENSION(:,:,:,:,:), ALLOCATABLE :: moments_i_full
-
-  ! ES potential
-  COMPLEX(dp), DIMENSION(:,:,:), ALLOCATABLE :: phi_full
-
   ! Gyrocenter density for electron and ions (ikx,iky,iz)
   COMPLEX(dp), DIMENSION(:,:,:), ALLOCATABLE :: Ne00
   COMPLEX(dp), DIMENSION(:,:,:), ALLOCATABLE :: Ni00
-  COMPLEX(dp), DIMENSION(:,:,:), ALLOCATABLE :: Ni00_full
 
   ! Kinetic spectrum sum_kx,ky(|Napj(z)|^2), (ip,ij,iz)
   REAL(dp), DIMENSION(:,:,:), ALLOCATABLE :: Nepjz
diff --git a/src/basic_mod.F90 b/src/basic_mod.F90
index 786901f2cc3bf257d516b1f299f047192ae50776..e86e0357f00671f5fafe47a003dd30ed1205e786 100644
--- a/src/basic_mod.F90
+++ b/src/basic_mod.F90
@@ -62,6 +62,8 @@ MODULE basic
               tc_step, tc_clos, tc_ghost, tc_coll, tc_process
   real(dp) :: maxruntime = 1e9 ! Maximum simulation CPU time
 
+  LOGICAL :: GATHERV_OUTPUT = .true.
+
   INTERFACE allocate_array
     MODULE PROCEDURE allocate_array_dp1,allocate_array_dp2,allocate_array_dp3,allocate_array_dp4, allocate_array_dp5, allocate_array_dp6
     MODULE PROCEDURE allocate_array_dc1,allocate_array_dc2,allocate_array_dc3,allocate_array_dc4, allocate_array_dc5, allocate_array_dc6
diff --git a/src/diagnose.F90 b/src/diagnose.F90
index 729b0e50ad26f204531a6643b4870a045f1dbd66..1451b5e95c6fc454e9b604deb7d74203d4ef0080 100644
--- a/src/diagnose.F90
+++ b/src/diagnose.F90
@@ -6,7 +6,6 @@ SUBROUTINE diagnose(kstep)
   IMPLICIT NONE
 
   INTEGER, INTENT(in) :: kstep
-
   CALL cpu_time(t0_diag) ! Measuring time
 
   !! Basic diagnose loop for reading input file, displaying advancement and ending
@@ -67,12 +66,6 @@ SUBROUTINE init_outfile(comm,file0,file,fid)
   CALL creatg(fid, "/files", "files")
   CALL attach(fid, "/files",  "jobnum", jobnum)
 
-  !  Add input namelist variables as attributes of /data/input, defined in srcinfo.h
-  ! IF (my_id .EQ. 0) WRITE(*,*) 'VERSION=', VERSION
-  ! IF (my_id .EQ. 0) WRITE(*,*)  'BRANCH=', BRANCH
-  ! IF (my_id .EQ. 0) WRITE(*,*)  'AUTHOR=', AUTHOR
-  ! IF (my_id .EQ. 0) WRITE(*,*)    'HOST=', HOST
-
   ! Add the code info and parameters to the file
   WRITE(str,'(a,i2.2)') "/data/input"
   CALL creatd(fid, 0,(/0/),TRIM(str),'Input parameters')
@@ -101,237 +94,232 @@ SUBROUTINE init_outfile(comm,file0,file,fid)
 END SUBROUTINE init_outfile
 
 SUBROUTINE diagnose_full(kstep)
-USE basic
-USE grid
-USE diagnostics_par
-USE futils, ONLY: creatf, creatg, creatd, closef, putarr, putfile, attach, openf, putarrnd
-USE array
-USE model
-USE initial_par
-USE fields
-USE time_integration
-USE parallel
-USE prec_const
-USE collision, ONLY: coll_outputinputs
-USE geometry
-IMPLICIT NONE
-
-INTEGER, INTENT(in) :: kstep
-INTEGER, parameter  :: BUFSIZE = 2
-INTEGER :: rank = 0
-INTEGER :: dims(1) = (/0/)
-INTEGER :: cp_counter = 0
-CHARACTER(len=256) :: str,test_
-!____________________________________________________________________________
-!                   1.   Initial diagnostics
-
-IF ((kstep .EQ. 0)) THEN
-  CALL init_outfile(comm0,   resfile0,resfile,fidres)
-
-  ! Profiler time measurement
-  CALL creatg(fidres, "/profiler", "performance analysis")
-  CALL creatd(fidres, 0, dims, "/profiler/Tc_rhs",        "cumulative rhs computation time")
-  CALL creatd(fidres, 0, dims, "/profiler/Tc_adv_field",  "cumulative adv. fields computation time")
-  CALL creatd(fidres, 0, dims, "/profiler/Tc_clos",       "cumulative closure computation time")
-  CALL creatd(fidres, 0, dims, "/profiler/Tc_ghost",       "cumulative communication time")
-  CALL creatd(fidres, 0, dims, "/profiler/Tc_coll",       "cumulative collision computation time")
-  CALL creatd(fidres, 0, dims, "/profiler/Tc_poisson",    "cumulative poisson computation time")
-  CALL creatd(fidres, 0, dims, "/profiler/Tc_Sapj",       "cumulative Sapj computation time")
-  CALL creatd(fidres, 0, dims, "/profiler/Tc_checkfield", "cumulative checkfield computation time")
-  CALL creatd(fidres, 0, dims, "/profiler/Tc_diag",       "cumulative sym computation time")
-  CALL creatd(fidres, 0, dims, "/profiler/Tc_process",    "cumulative process computation time")
-  CALL creatd(fidres, 0, dims, "/profiler/Tc_step",       "cumulative total step computation time")
-  CALL creatd(fidres, 0, dims, "/profiler/time",          "current simulation time")
-
-  ! Grid info
-  CALL creatg(fidres, "/data/grid", "Grid data")
-  CALL putarr(fidres, "/data/grid/coordkx",   kxarray_full,  "kx*rho_s0", ionode=0)
-  CALL putarr(fidres, "/data/grid/coordky",   kyarray_full,  "ky*rho_s0", ionode=0)
-  CALL putarr(fidres, "/data/grid/coordz",    zarray_full,   "z/R", ionode=0)
-  CALL putarr(fidres, "/data/grid/coordp_e" , parray_e_full, "p_e", ionode=0)
-  CALL putarr(fidres, "/data/grid/coordj_e" , jarray_e_full, "j_e", ionode=0)
-  CALL putarr(fidres, "/data/grid/coordp_i" , parray_i_full, "p_i", ionode=0)
-  CALL putarr(fidres, "/data/grid/coordj_i" , jarray_i_full, "j_i", ionode=0)
-
-  ! Metric info
-  CALL   creatg(fidres, "/data/metric", "Metric data")
-  CALL putarrnd(fidres, "/data/metric/gxx",            gxx(izs:ize,0:1), (/1, 1, 1/))
-  CALL putarrnd(fidres, "/data/metric/gxy",            gxy(izs:ize,0:1), (/1, 1, 1/))
-  CALL putarrnd(fidres, "/data/metric/gyy",            gyy(izs:ize,0:1), (/1, 1, 1/))
-  CALL putarrnd(fidres, "/data/metric/gyz",            gyz(izs:ize,0:1), (/1, 1, 1/))
-  CALL putarrnd(fidres, "/data/metric/gzz",            gzz(izs:ize,0:1), (/1, 1, 1/))
-  CALL putarrnd(fidres, "/data/metric/hatR",          hatR(izs:ize,0:1), (/1, 1, 1/))
-  CALL putarrnd(fidres, "/data/metric/hatZ",          hatZ(izs:ize,0:1), (/1, 1, 1/))
-  CALL putarrnd(fidres, "/data/metric/hatB",          hatB(izs:ize,0:1), (/1, 1, 1/))
-  CALL putarrnd(fidres, "/data/metric/gradxB",      gradxB(izs:ize,0:1), (/1, 1, 1/))
-  CALL putarrnd(fidres, "/data/metric/gradyB",      gradyB(izs:ize,0:1), (/1, 1, 1/))
-  CALL putarrnd(fidres, "/data/metric/gradzB",      gradzB(izs:ize,0:1), (/1, 1, 1/))
-  CALL putarrnd(fidres, "/data/metric/Jacobian",    Jacobian(izs:ize,0:1), (/1, 1, 1/))
-  CALL putarrnd(fidres, "/data/metric/gradz_coeff", gradz_coeff(izs:ize,0:1), (/1, 1, 1/))
-  CALL putarrnd(fidres, "/data/metric/Ckxky",       Ckxky(ikys:ikye,ikxs:ikxe,izs:ize,0:1), (/1, 1, 3/))
-  CALL putarrnd(fidres, "/data/metric/kernel_i",    kernel_i(ijs_i:ije_i,ikys:ikye,ikxs:ikxe,izs:ize,0:1), (/ 1, 2, 4/))
-
-  !  var0d group (gyro transport)
-  IF (nsave_0d .GT. 0) THEN
-   CALL creatg(fidres, "/data/var0d", "0d profiles")
-   CALL creatd(fidres, rank, dims,  "/data/var0d/time",     "Time t*c_s/R")
-   CALL creatd(fidres, rank, dims, "/data/var0d/cstep", "iteration number")
-
-   IF (write_gamma) THEN
-     CALL creatd(fidres, rank, dims, "/data/var0d/gflux_ri", "Radial gyro ion transport")
-     CALL creatd(fidres, rank, dims, "/data/var0d/pflux_ri", "Radial part ion transport")
-   ENDIF
-   IF (write_hf) THEN
-     CALL creatd(fidres, rank, dims, "/data/var0d/hflux_x", "Radial part ion heat flux")
-   ENDIF
-   IF (cstep==0) THEN
-     iframe0d=0
-   ENDIF
-   CALL attach(fidres,"/data/var0d/" , "frames", iframe0d)
-  END IF
+  USE basic
+  USE grid
+  USE diagnostics_par
+  USE futils, ONLY: creatf, creatg, creatd, closef, putarr, putfile, attach, openf, putarrnd
+  USE array
+  USE model
+  USE initial_par
+  USE fields
+  USE time_integration
+  USE parallel
+  USE prec_const
+  USE collision, ONLY: coll_outputinputs
+  USE geometry
+  IMPLICIT NONE
 
+  INTEGER, INTENT(in) :: kstep
+  INTEGER, parameter  :: BUFSIZE = 2
+  INTEGER :: rank = 0
+  INTEGER :: dims(1) = (/0/)
+  INTEGER :: cp_counter = 0
+  CHARACTER(len=256) :: str,test_
+  !____________________________________________________________________________
+  !                   1.   Initial diagnostics
 
-  !  var2d group (??)
-  IF (nsave_2d .GT. 0) THEN
-   CALL creatg(fidres, "/data/var2d", "2d profiles")
-   CALL creatd(fidres, rank, dims,  "/data/var2d/time",     "Time t*c_s/R")
-   CALL creatd(fidres, rank, dims, "/data/var2d/cstep", "iteration number")
-   IF (cstep==0) THEN
-     iframe2d=0
-   ENDIF
-   CALL attach(fidres,"/data/var2d/" , "frames", iframe2d)
-  END IF
+  IF ((kstep .EQ. 0)) THEN
+    CALL init_outfile(comm0,   resfile0,resfile,fidres)
+
+    ! Profiler time measurement
+    CALL creatg(fidres, "/profiler", "performance analysis")
+    CALL creatd(fidres, 0, dims, "/profiler/Tc_rhs",        "cumulative rhs computation time")
+    CALL creatd(fidres, 0, dims, "/profiler/Tc_adv_field",  "cumulative adv. fields computation time")
+    CALL creatd(fidres, 0, dims, "/profiler/Tc_clos",       "cumulative closure computation time")
+    CALL creatd(fidres, 0, dims, "/profiler/Tc_ghost",       "cumulative communication time")
+    CALL creatd(fidres, 0, dims, "/profiler/Tc_coll",       "cumulative collision computation time")
+    CALL creatd(fidres, 0, dims, "/profiler/Tc_poisson",    "cumulative poisson computation time")
+    CALL creatd(fidres, 0, dims, "/profiler/Tc_Sapj",       "cumulative Sapj computation time")
+    CALL creatd(fidres, 0, dims, "/profiler/Tc_checkfield", "cumulative checkfield computation time")
+    CALL creatd(fidres, 0, dims, "/profiler/Tc_diag",       "cumulative sym computation time")
+    CALL creatd(fidres, 0, dims, "/profiler/Tc_process",    "cumulative process computation time")
+    CALL creatd(fidres, 0, dims, "/profiler/Tc_step",       "cumulative total step computation time")
+    CALL creatd(fidres, 0, dims, "/profiler/time",          "current simulation time")
+
+    ! Grid info
+    CALL creatg(fidres, "/data/grid", "Grid data")
+    CALL putarr(fidres, "/data/grid/coordkx",   kxarray_full,  "kx*rho_s0", ionode=0)
+    CALL putarr(fidres, "/data/grid/coordky",   kyarray_full,  "ky*rho_s0", ionode=0)
+    CALL putarr(fidres, "/data/grid/coordz",    zarray_full,   "z/R", ionode=0)
+    CALL putarr(fidres, "/data/grid/coordp_e" , parray_e_full, "p_e", ionode=0)
+    CALL putarr(fidres, "/data/grid/coordj_e" , jarray_e_full, "j_e", ionode=0)
+    CALL putarr(fidres, "/data/grid/coordp_i" , parray_i_full, "p_i", ionode=0)
+    CALL putarr(fidres, "/data/grid/coordj_i" , jarray_i_full, "j_i", ionode=0)
+
+    ! Metric info
+    CALL   creatg(fidres, "/data/metric", "Metric data")
+    CALL putarrnd(fidres, "/data/metric/gxx",            gxx(izs:ize,0:1), (/1, 1, 1/))
+    CALL putarrnd(fidres, "/data/metric/gxy",            gxy(izs:ize,0:1), (/1, 1, 1/))
+    CALL putarrnd(fidres, "/data/metric/gyy",            gyy(izs:ize,0:1), (/1, 1, 1/))
+    CALL putarrnd(fidres, "/data/metric/gyz",            gyz(izs:ize,0:1), (/1, 1, 1/))
+    CALL putarrnd(fidres, "/data/metric/gzz",            gzz(izs:ize,0:1), (/1, 1, 1/))
+    CALL putarrnd(fidres, "/data/metric/hatR",          hatR(izs:ize,0:1), (/1, 1, 1/))
+    CALL putarrnd(fidres, "/data/metric/hatZ",          hatZ(izs:ize,0:1), (/1, 1, 1/))
+    CALL putarrnd(fidres, "/data/metric/hatB",          hatB(izs:ize,0:1), (/1, 1, 1/))
+    CALL putarrnd(fidres, "/data/metric/gradxB",      gradxB(izs:ize,0:1), (/1, 1, 1/))
+    CALL putarrnd(fidres, "/data/metric/gradyB",      gradyB(izs:ize,0:1), (/1, 1, 1/))
+    CALL putarrnd(fidres, "/data/metric/gradzB",      gradzB(izs:ize,0:1), (/1, 1, 1/))
+    CALL putarrnd(fidres, "/data/metric/Jacobian",    Jacobian(izs:ize,0:1), (/1, 1, 1/))
+    CALL putarrnd(fidres, "/data/metric/gradz_coeff", gradz_coeff(izs:ize,0:1), (/1, 1, 1/))
+    CALL putarrnd(fidres, "/data/metric/Ckxky",       Ckxky(ikys:ikye,ikxs:ikxe,izs:ize,0:1), (/1, 1, 3/))
+    CALL putarrnd(fidres, "/data/metric/kernel_i",    kernel_i(ijs_i:ije_i,ikys:ikye,ikxs:ikxe,izs:ize,0:1), (/ 1, 2, 4/))
+
+    !  var0d group (gyro transport)
+    IF (nsave_0d .GT. 0) THEN
+     CALL creatg(fidres, "/data/var0d", "0d profiles")
+     CALL creatd(fidres, rank, dims,  "/data/var0d/time",     "Time t*c_s/R")
+     CALL creatd(fidres, rank, dims, "/data/var0d/cstep", "iteration number")
+
+     IF (write_gamma) THEN
+       CALL creatd(fidres, rank, dims, "/data/var0d/gflux_ri", "Radial gyro ion transport")
+       CALL creatd(fidres, rank, dims, "/data/var0d/pflux_ri", "Radial part ion transport")
+     ENDIF
+     IF (write_hf) THEN
+       CALL creatd(fidres, rank, dims, "/data/var0d/hflux_x", "Radial part ion heat flux")
+     ENDIF
+     IF (cstep==0) THEN
+       iframe0d=0
+     ENDIF
+     CALL attach(fidres,"/data/var0d/" , "frames", iframe0d)
+    END IF
 
-  !  var3d group (electro. pot., Ni00 moment)
-  IF (nsave_3d .GT. 0) THEN
-   CALL creatg(fidres, "/data/var3d", "3d profiles")
-   CALL creatd(fidres, rank, dims,  "/data/var3d/time",     "Time t*c_s/R")
-   CALL creatd(fidres, rank, dims, "/data/var3d/cstep", "iteration number")
 
-   IF (write_phi) CALL creatg(fidres, "/data/var3d/phi", "phi")
-   IF (write_phi) THEN
-    CALL creatg(fidres, "/data/var3d/phi_gatherv", "phi_gatherv")
-   ENDIF
+    !  var2d group (??)
+    IF (nsave_2d .GT. 0) THEN
+     CALL creatg(fidres, "/data/var2d", "2d profiles")
+     CALL creatd(fidres, rank, dims,  "/data/var2d/time",     "Time t*c_s/R")
+     CALL creatd(fidres, rank, dims, "/data/var2d/cstep", "iteration number")
+     IF (cstep==0) THEN
+       iframe2d=0
+     ENDIF
+     CALL attach(fidres,"/data/var2d/" , "frames", iframe2d)
+    END IF
 
-   IF (write_Na00) THEN
-    IF(KIN_E)&
-    CALL creatg(fidres, "/data/var3d/Ne00", "Ne00")
-    CALL creatg(fidres, "/data/var3d/Ni00", "Ni00")
-    CALL creatg(fidres, "/data/var3d/Ni00_gatherv", "Ni00_gatherv")
-    IF(KIN_E)&
-    CALL creatg(fidres, "/data/var3d/Nepjz", "Nepjz")
-    CALL creatg(fidres, "/data/var3d/Nipjz", "Nipjz")
-    CALL creatg(fidres, "/data/var3d/Nipjz_gatherv", "Nipjz_gatherv")
-   ENDIF
-
-   IF (write_dens) THEN
-     IF(KIN_E)&
-    CALL creatg(fidres, "/data/var3d/dens_e", "dens_e")
-    CALL creatg(fidres, "/data/var3d/dens_i", "dens_i")
-   ENDIF
-
-   IF (write_fvel) THEN
-     IF(KIN_E) THEN
-     CALL creatg(fidres, "/data/var3d/upar_e", "upar_e")
-     CALL creatg(fidres, "/data/var3d/uper_e", "uper_e")
+    !  var3d group (electro. pot., Ni00 moment)
+    IF (nsave_3d .GT. 0) THEN
+     CALL creatg(fidres, "/data/var3d", "3d profiles")
+     CALL creatd(fidres, rank, dims,  "/data/var3d/time",     "Time t*c_s/R")
+     CALL creatd(fidres, rank, dims, "/data/var3d/cstep", "iteration number")
+
+     IF (write_phi) CALL creatg(fidres, "/data/var3d/phi", "phi")
+
+     IF (write_Na00) THEN
+      IF(KIN_E)&
+      CALL creatg(fidres, "/data/var3d/Ne00", "Ne00")
+      CALL creatg(fidres, "/data/var3d/Ni00", "Ni00")
+      IF(KIN_E)&
+      CALL creatg(fidres, "/data/var3d/Nepjz", "Nepjz")
+      CALL creatg(fidres, "/data/var3d/Nipjz", "Nipjz")
      ENDIF
-     CALL creatg(fidres, "/data/var3d/upar_i", "upar_i")
-     CALL creatg(fidres, "/data/var3d/uper_i", "uper_i")
-   ENDIF
-
-   IF (write_temp) THEN
-     IF(KIN_E) THEN
-     CALL creatg(fidres, "/data/var3d/Tper_e", "Tper_e")
-     CALL creatg(fidres, "/data/var3d/Tpar_e", "Tpar_e")
-     CALL creatg(fidres, "/data/var3d/temp_e", "temp_e")
+
+     IF (write_dens) THEN
+       IF(KIN_E)&
+      CALL creatg(fidres, "/data/var3d/dens_e", "dens_e")
+      CALL creatg(fidres, "/data/var3d/dens_i", "dens_i")
      ENDIF
-     CALL creatg(fidres, "/data/var3d/Tper_i", "Tper_i")
-     CALL creatg(fidres, "/data/var3d/Tpar_i", "Tpar_i")
-     CALL creatg(fidres, "/data/var3d/temp_i", "temp_i")
-   ENDIF
-
-   IF (cstep==0) THEN
-     iframe3d=0
-   ENDIF
-   CALL attach(fidres,"/data/var3d/" , "frames", iframe3d)
-  END IF
 
-  !  var5d group (moments)
-  IF (nsave_5d .GT. 0) THEN
-    CALL creatg(fidres, "/data/var5d", "5d profiles")
-    CALL creatd(fidres, rank, dims,  "/data/var5d/time",     "Time t*c_s/R")
-    CALL creatd(fidres, rank, dims, "/data/var5d/cstep", "iteration number")
+     IF (write_fvel) THEN
+       IF(KIN_E) THEN
+       CALL creatg(fidres, "/data/var3d/upar_e", "upar_e")
+       CALL creatg(fidres, "/data/var3d/uper_e", "uper_e")
+       ENDIF
+       CALL creatg(fidres, "/data/var3d/upar_i", "upar_i")
+       CALL creatg(fidres, "/data/var3d/uper_i", "uper_i")
+     ENDIF
 
-    IF (write_Napj) THEN
-      IF(KIN_E)&
-     CALL creatg(fidres, "/data/var5d/moments_e", "moments_e")
-     CALL creatg(fidres, "/data/var5d/moments_i", "moments_i")
-    ENDIF
+     IF (write_temp) THEN
+       IF(KIN_E) THEN
+       CALL creatg(fidres, "/data/var3d/Tper_e", "Tper_e")
+       CALL creatg(fidres, "/data/var3d/Tpar_e", "Tpar_e")
+       CALL creatg(fidres, "/data/var3d/temp_e", "temp_e")
+       ENDIF
+       CALL creatg(fidres, "/data/var3d/Tper_i", "Tper_i")
+       CALL creatg(fidres, "/data/var3d/Tpar_i", "Tpar_i")
+       CALL creatg(fidres, "/data/var3d/temp_i", "temp_i")
+     ENDIF
 
-    IF (write_Sapj) THEN
-      IF(KIN_E)&
-      CALL creatg(fidres, "/data/var5d/Sepj", "Sepj")
-      CALL creatg(fidres, "/data/var5d/Sipj", "Sipj")
-    ENDIF
+     IF (cstep==0) THEN
+       iframe3d=0
+     ENDIF
+     CALL attach(fidres,"/data/var3d/" , "frames", iframe3d)
+    END IF
 
-    IF (cstep==0) THEN
-     iframe5d=0
+    !  var5d group (moments)
+    IF (nsave_5d .GT. 0) THEN
+      CALL creatg(fidres, "/data/var5d", "5d profiles")
+      CALL creatd(fidres, rank, dims,  "/data/var5d/time",     "Time t*c_s/R")
+      CALL creatd(fidres, rank, dims, "/data/var5d/cstep", "iteration number")
+
+      IF (write_Napj) THEN
+        IF(KIN_E)&
+       CALL creatg(fidres, "/data/var5d/moments_e", "moments_e")
+       CALL creatg(fidres, "/data/var5d/moments_i", "moments_i")
+      ENDIF
+
+      IF (write_Sapj) THEN
+        IF(KIN_E)&
+        CALL creatg(fidres, "/data/var5d/Sepj", "Sepj")
+        CALL creatg(fidres, "/data/var5d/Sipj", "Sipj")
+      ENDIF
+
+      IF (cstep==0) THEN
+       iframe5d=0
+      END IF
+      CALL attach(fidres,"/data/var5d/" , "frames", iframe5d)
     END IF
-    CALL attach(fidres,"/data/var5d/" , "frames", iframe5d)
-  END IF
-ENDIF
+  ENDIF
 
-!_____________________________________________________________________________
-!                   2.   Periodic diagnostics
-!
-IF (kstep .GE. 0) THEN
+  !_____________________________________________________________________________
+  !                   2.   Periodic diagnostics
+  !
+  IF (kstep .GE. 0) THEN
 
-   !                       2.1   0d history arrays
-   IF (nsave_0d .GT. 0) THEN
-      IF ( MOD(cstep, nsave_0d) == 0 ) THEN
-         CALL diagnose_0d
-      END IF
-   END IF
+     !                       2.1   0d history arrays
+     IF (nsave_0d .GT. 0) THEN
+        IF ( MOD(cstep, nsave_0d) == 0 ) THEN
+           CALL diagnose_0d
+        END IF
+     END IF
 
-   !                       2.2   1d profiles
-   ! empty in our case
+     !                       2.2   1d profiles
+     ! empty in our case
 
-   !                       2.3   2d profiles
-   ! empty in our case
+     !                       2.3   2d profiles
+     ! empty in our case
 
 
-   !                       2.3   2d profiles
-   IF (nsave_3d .GT. 0) THEN
-      IF (MOD(cstep, nsave_3d) == 0) THEN
-        CALL diagnose_3d
-      END IF
-   END IF
+     !                       2.3   2d profiles
+     IF (nsave_3d .GT. 0) THEN
+        IF (MOD(cstep, nsave_3d) == 0) THEN
+          CALL diagnose_3d
+        END IF
+     END IF
 
-   !                       2.4   3d profiles
-   IF (nsave_5d .GT. 0 .AND. cstep .GT. 0) THEN
-      IF (MOD(cstep, nsave_5d) == 0) THEN
-         CALL diagnose_5d
-      END IF
-   END IF
+     !                       2.4   3d profiles
+     IF (nsave_5d .GT. 0 .AND. cstep .GT. 0) THEN
+        IF (MOD(cstep, nsave_5d) == 0) THEN
+           CALL diagnose_5d
+        END IF
+     END IF
 
-!_____________________________________________________________________________
-!                   3.   Final diagnostics
+  !_____________________________________________________________________________
+  !                   3.   Final diagnostics
 
-ELSEIF (kstep .EQ. -1) THEN
-   CALL attach(fidres, "/data/input","cpu_time",finish-start)
+  ELSEIF (kstep .EQ. -1) THEN
+     CALL attach(fidres, "/data/input","cpu_time",finish-start)
 
-   ! make a checkpoint at last timestep if not crashed
-   IF(.NOT. crashed) THEN
-     IF(my_id .EQ. 0) write(*,*) 'Saving last state'
-     IF (nsave_5d .GT. 0) &
-     CALL diagnose_5d
-   ENDIF
+     ! make a checkpoint at last timestep if not crashed
+     IF(.NOT. crashed) THEN
+       IF(my_id .EQ. 0) write(*,*) 'Saving last state'
+       IF (nsave_5d .GT. 0) &
+       CALL diagnose_5d
+     ENDIF
 
-   !   Close all diagnostic files
-   CALL mpi_barrier(MPI_COMM_WORLD, ierr)
-   CALL closef(fidres)
+     !   Close all diagnostic files
+     CALL mpi_barrier(MPI_COMM_WORLD, ierr)
+     CALL closef(fidres)
 
-END IF
+  END IF
 
 END SUBROUTINE diagnose_full
 
@@ -388,7 +376,6 @@ SUBROUTINE diagnose_3d
   USE diagnostics_par
   USE prec_const
   USE processing
-  USE parallel, ONLY : gather_xyz
   USE model, ONLY: KIN_E
 
   IMPLICIT NONE
@@ -402,12 +389,6 @@ SUBROUTINE diagnose_3d
   CALL attach(fidres,"/data/var3d/" , "frames", iframe3d)
 
   IF (write_phi) CALL write_field3d_kykxz(phi (ikys:ikye,ikxs:ikxe,izs:ize), 'phi')
-  IF (write_phi) THEN
-    CALL gather_xyz(phi(ikys:ikye,1:Nkx,izs:ize),phi_full(1:Nky,1:Nkx,1:Nz))
-    WRITE(dset_name, "(A, '/', i6.6)") "/data/var3d/phi_gatherv", iframe3d
-    CALL putarr(fidres, dset_name, phi_full(1:Nky,1:Nkx,1:Nz), ionode=0)
-    CALL attach(fidres, dset_name, "time", time)
-  ENDIF
 
   IF (write_Na00) THEN
     IF(KIN_E)THEN
@@ -419,12 +400,6 @@ SUBROUTINE diagnose_3d
       Ni00(ikys:ikye,ikxs:ikxe,izs:ize) = moments_i(ips_i,1,ikys:ikye,ikxs:ikxe,izs:ize,updatetlevel)
     CALL write_field3d_kykxz(Ni00(ikys:ikye,ikxs:ikxe,izs:ize), 'Ni00')
 
-    IF (ips_i .EQ. 1) &
-      CALL gather_xyz(Ni00(ikys:ikye,1:Nkx,izs:ize),Ni00_full(1:Nky,1:Nkx,1:Nz))
-    WRITE(dset_name, "(A, '/', i6.6)") "/data/var3d/Ni00_gatherv", iframe3d
-    CALL putarr(fidres, dset_name, Ni00_full(1:Nky,1:Nkx,1:Nz), ionode=0)
-    CALL attach(fidres, dset_name, "time", time)
-
     ! CALL compute_Napjz_spectrum
     ! IF(KIN_E) &
     ! CALL write_field3d_pjz_e(Nepjz(ips_e:ipe_e,ijs_e:ije_e,izs:ize), 'Nepjz')
@@ -465,14 +440,21 @@ SUBROUTINE diagnose_3d
   CONTAINS
 
   SUBROUTINE write_field3d_kykxz(field, text)
+    USE parallel, ONLY : gather_xyz
     IMPLICIT NONE
     COMPLEX(dp), DIMENSION(ikys:ikye,ikxs:ikxe, izs:ize), INTENT(IN) :: field
     CHARACTER(*), INTENT(IN) :: text
+    COMPLEX(dp), DIMENSION(1:Nky,1:Nkx,1:Nz) :: field_full
     CHARACTER(256) :: dset_name
     WRITE(dset_name, "(A, '/', A, '/', i6.6)") "/data/var3d", TRIM(text), iframe3d
+
     IF (num_procs .EQ. 1) THEN ! no data distribution
       CALL putarr(fidres, dset_name, field(ikys:ikye,ikxs:ikxe, izs:ize), ionode=0)
-    ELSE
+
+    ELSEIF(GATHERV_OUTPUT) THEN ! output using one node (gatherv)
+      CALL gather_xyz(field(ikys:ikye,1:Nkx,izs:ize),field_full(1:Nky,1:Nkx,1:Nz))
+      CALL putarr(fidres, dset_name, field_full(1:Nky,1:Nkx,1:Nz), ionode=0)
+    ELSE ! output using putarrnd (very slow on marconi)
       CALL putarrnd(fidres, dset_name, field(ikys:ikye,ikxs:ikxe, izs:ize),  (/1, 1, 3/))
     ENDIF
     CALL attach(fidres, dset_name, "time", time)
@@ -511,17 +493,19 @@ END SUBROUTINE diagnose_3d
 SUBROUTINE diagnose_5d
 
   USE basic
-  USE futils, ONLY: append, getatt, attach, putarrnd
+  USE futils, ONLY: append, getatt, attach, putarrnd, putarr
   USE fields
   USE array!, ONLY: Sepj, Sipj
   USE grid, ONLY: ips_e,ipe_e, ips_i, ipe_i, &
                  ijs_e,ije_e, ijs_i, ije_i, &
+                 Np_i, Nj_i, Np_e, Nj_e, Nky, Nkx, Nz, &
                  ikxs,ikxe,ikys,ikye,izs,ize
   USE time_integration
   USE diagnostics_par
   USE prec_const
   USE model, ONLY: KIN_E
   IMPLICIT NONE
+  CHARACTER(LEN=50) :: dset_name
 
   CALL append(fidres,  "/data/var5d/time",           time,ionode=0)
   CALL append(fidres, "/data/var5d/cstep", real(cstep,dp),ionode=0)
@@ -545,6 +529,7 @@ SUBROUTINE diagnose_5d
 
   SUBROUTINE write_field5d_e(field, text)
     USE futils, ONLY: attach, putarr, putarrnd
+    USE parallel, ONLY: gather_pjxyz_e
     USE grid,   ONLY: ips_e,ipe_e, ijs_e,ije_e, ikxs,ikxe, ikys,ikye, izs,ize
     USE prec_const
     IMPLICIT NONE
@@ -552,12 +537,17 @@ SUBROUTINE diagnose_5d
     COMPLEX(dp), DIMENSION(ips_e:ipe_e,ijs_e:ije_e,ikys:ikye,ikxs:ikxe,izs:ize), INTENT(IN) :: field
     CHARACTER(*), INTENT(IN) :: text
 
+    COMPLEX(dp), DIMENSION(1:Np_e,1:Nj_e,1:Nky,1:Nkx,1:Nz) :: field_full
     CHARACTER(LEN=50) :: dset_name
 
     WRITE(dset_name, "(A, '/', A, '/', i6.6)") "/data/var5d", TRIM(text), iframe5d
     IF (num_procs .EQ. 1) THEN
      CALL putarr(fidres, dset_name, field(ips_e:ipe_e,ijs_e:ije_e,ikys:ikye,ikxs:ikxe,izs:ize), ionode=0)
-    ELSE
+   ELSEIF(GATHERV_OUTPUT) THEN ! output using one node (gatherv)
+     CALL gather_pjxyz_e(field(ips_e:ipe_e,ijs_e:ije_e,ikys:ikye,ikxs:ikxe,izs:ize),&
+                         field_full(1:Np_e,1:Nj_e,1:Nky,1:Nkx,1:Nz))
+     CALL putarr(fidres, dset_name, field_full(1:Np_i,1:Nj_i,1:Nky,1:Nkx,1:Nz), ionode=0)
+   ELSE
      CALL putarrnd(fidres, dset_name, field(ips_e:ipe_e,ijs_e:ije_e,ikys:ikye,ikxs:ikxe,izs:ize),  (/1,3,5/))
     ENDIF
     CALL attach(fidres, dset_name, 'cstep', cstep)
@@ -571,6 +561,7 @@ SUBROUTINE diagnose_5d
 
   SUBROUTINE write_field5d_i(field, text)
     USE futils, ONLY: attach, putarr, putarrnd
+    USE parallel, ONLY: gather_pjxyz_i
     USE grid, ONLY: ips_i,ipe_i, ijs_i,ije_i, ikxs,ikxe, ikys,ikye, izs,ize
     USE prec_const
     IMPLICIT NONE
@@ -578,11 +569,16 @@ SUBROUTINE diagnose_5d
     COMPLEX(dp), DIMENSION(ips_i:ipe_i,ijs_i:ije_i,ikys:ikye,ikxs:ikxe,izs:ize), INTENT(IN) :: field
     CHARACTER(*), INTENT(IN) :: text
 
+    COMPLEX(dp), DIMENSION(1:Np_i,1:Nj_i,1:Nky,1:Nkx,1:Nz) :: field_full
     CHARACTER(LEN=50) :: dset_name
 
     WRITE(dset_name, "(A, '/', A, '/', i6.6)") "/data/var5d", TRIM(text), iframe5d
     IF (num_procs .EQ. 1) THEN
       CALL putarr(fidres, dset_name, field(ips_i:ipe_i,ijs_i:ije_i,ikys:ikye,ikxs:ikxe,izs:ize), ionode=0)
+    ELSEIF(GATHERV_OUTPUT) THEN ! output using one node (gatherv)
+      CALL gather_pjxyz_i(field(ips_i:ipe_i,ijs_i:ije_i,ikys:ikye,ikxs:ikxe,izs:ize),&
+                        field_full(1:Np_i,1:Nj_i,1:Nky,1:Nkx,1:Nz))
+      CALL putarr(fidres, dset_name, field_full(1:Np_i,1:Nj_i,1:Nky,1:Nkx,1:Nz), ionode=0)
     ELSE
       CALL putarrnd(fidres, dset_name, field(ips_i:ipe_i,ijs_i:ije_i,ikys:ikye,ikxs:ikxe,izs:ize),  (/1,3,5/))
     ENDIF
diff --git a/src/grid_mod.F90 b/src/grid_mod.F90
index 62154f9db49ee8960da2c38eafd673824c3c1f5e..77ea54e5313f6d0f3be9f0a5ac11efd1be01a220 100644
--- a/src/grid_mod.F90
+++ b/src/grid_mod.F90
@@ -56,7 +56,7 @@ MODULE grid
   INTEGER, DIMENSION(:), ALLOCATABLE, PUBLIC :: displs_nkx, displs_nky
   ! "" for p
   INTEGER,             PUBLIC :: local_np_e, local_np_i
-  INTEGER,             PUBLIC :: total_np_e, total_np_i
+  INTEGER,             PUBLIC :: total_np_e, total_np_i, Np_e, Np_i
   integer(C_INTPTR_T), PUBLIC :: local_np_e_offset, local_np_i_offset
   INTEGER, DIMENSION(:), ALLOCATABLE, PUBLIC :: rcv_p_e, rcv_p_i
   INTEGER, DIMENSION(:), ALLOCATABLE, PUBLIC :: dsp_p_e, dsp_p_i
@@ -67,7 +67,7 @@ MODULE grid
   INTEGER, DIMENSION(:), ALLOCATABLE, PUBLIC :: counts_nz
   INTEGER, DIMENSION(:), ALLOCATABLE, PUBLIC :: displs_nz
   ! "" for j (not parallelized)
-  INTEGER,             PUBLIC :: local_nj_e, local_nj_i
+  INTEGER,             PUBLIC :: local_nj_e, local_nj_i, Nj_e, Nj_i
   ! Grids containing position in fourier space
   REAL(dp), DIMENSION(:),     ALLOCATABLE, PUBLIC :: kxarray, kxarray_full
   REAL(dp), DIMENSION(:),     ALLOCATABLE, PUBLIC :: kyarray, kyarray_full
@@ -163,6 +163,8 @@ CONTAINS
     ! Total number of Hermite polynomials we will evolve
     total_np_e = (Pmaxe/deltape) + 1
     total_np_i = (Pmaxi/deltapi) + 1
+    Np_e = total_np_e ! Reduced names (redundant)
+    Np_i = total_np_i
     ! Build the full grids on process 0 to diagnose it without comm
     ALLOCATE(parray_e_full(1:total_np_e))
     ALLOCATE(parray_i_full(1:total_np_i))
@@ -246,7 +248,9 @@ CONTAINS
     USE prec_const
     IMPLICIT NONE
     INTEGER :: ij
-
+    ! Total number of J degrees
+    Nj_e = jmaxe+1
+    Nj_i = jmaxi+1
     ! Build the full grids on process 0 to diagnose it without comm
     ALLOCATE(jarray_e_full(1:jmaxe+1))
     ALLOCATE(jarray_i_full(1:jmaxi+1))
diff --git a/src/memory.F90 b/src/memory.F90
index 3c313f5394e3c4e996f7140ef8e53677fccfc9fb..c9509278094ed98800c478c2e1469bf47a0728db 100644
--- a/src/memory.F90
+++ b/src/memory.F90
@@ -14,7 +14,6 @@ SUBROUTINE memory
 
   ! Electrostatic potential
   CALL allocate_array(           phi, ikys,ikye, ikxs,ikxe, izgs,izge)
-  CALL allocate_array(      phi_full,     1,Nky,     1,Nkx,      1,Nz)
   CALL allocate_array(        phi_ZF, ikxs,ikxe, izs,ize)
   CALL allocate_array(        phi_EM, ikys,ikye, izs,ize)
   CALL allocate_array(inv_poisson_op, ikys,ikye, ikxs,ikxe, izs,ize)
@@ -58,7 +57,6 @@ SUBROUTINE memory
 
   !Ions arrays
   CALL allocate_array(             Ni00, ikys,ikye, ikxs,ikxe, izs,ize)
-  CALL allocate_array(        Ni00_full,     1,Nky,     1,Nkx,    1,Nz)
   CALL allocate_array(           dens_i, ikys,ikye, ikxs,ikxe, izs,ize)
   CALL allocate_array(           upar_i, ikys,ikye, ikxs,ikxe, izs,ize)
   CALL allocate_array(           uper_i, ikys,ikye, ikxs,ikxe, izs,ize)
@@ -127,6 +125,4 @@ SUBROUTINE memory
       CALL allocate_array(  Ciipj, 1,(pmaxi+1)*(jmaxi+1), 1,(pmaxi+1)*(jmaxi+1), 1,1, 1,1, 1,1)
  ENDIF
 
- !____________DIAGNOSTIC PURPOSE ONLY ARRAYS_______________
-
 END SUBROUTINE memory
diff --git a/src/parallel_mod.F90 b/src/parallel_mod.F90
index 7a268243636ae3146ca207a169ad615bb643cd3a..3efe15f5fc29da6ac97b2dfb6d5f47679edc5a14 100644
--- a/src/parallel_mod.F90
+++ b/src/parallel_mod.F90
@@ -1,7 +1,7 @@
 MODULE parallel
   USE basic
   USE grid, ONLY: Nkx,Nky,Nz, ikys,ikye, izs,ize, local_nky, local_nz, &
-                  local_np_e, local_np_i, total_np_e, total_np_i, &
+                  local_np_e, local_np_i, Np_e, Np_i, Nj_e, Nj_i, &
                   pmaxi, pmaxe, ips_e, ipe_e, ips_i, ipe_i, &
                   jmaxi, jmaxe, ijs_e, ije_e, ijs_i, ije_i, &
                   rcv_p_e, rcv_p_i, dsp_p_e, dsp_p_i
@@ -9,16 +9,21 @@ MODULE parallel
   IMPLICIT NONE
   ! recieve and displacement counts for gatherv
   INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_y, dsp_y, rcv_zy, dsp_zy
-  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_zp_e, dsp_zp_e
-  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_zp_i, dsp_zp_i
+  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_zp_e,  dsp_zp_e
+  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_yp_e,  dsp_yp_e
+  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_zyp_e, dsp_zyp_e
+  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_zp_i,  dsp_zp_i
+  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_yp_i,  dsp_yp_i
+  INTEGER, DIMENSION(:), ALLOCATABLE :: rcv_zyp_i, dsp_zyp_i
 
   PUBLIC :: manual_0D_bcast, manual_3D_bcast, init_parallel_var, &
-            gather_xyz, gather_pjz_e, gather_pjz_i!, gather_pjxyz
+            gather_xyz, gather_pjz_i, gather_pjxyz_e, gather_pjxyz_i
 
 CONTAINS
 
   SUBROUTINE init_parallel_var
     INTEGER :: i_
+
     !!!!!! XYZ gather variables
     !! Y reduction at constant x and z
     ! number of points recieved and displacement for the y reduction
@@ -43,29 +48,72 @@ CONTAINS
     print*, rcv_zy, dsp_zy
 
     !!!!! PJZ gather variables
-    ! ELECTRONS
-    ! P variables are already defined in grid module (see rcv_p_a, dsp_p_a)
+    ! IONS
+    !! Z reduction for full slices of p data but constant j
+    ! number of points recieved and displacement for the z reduction
+    ALLOCATE(rcv_zp_i(0:num_procs_z-1),dsp_zp_i(0:num_procs_z-1)) !Displacement sizes for balance diagnostic
+    ! all processes share their local number of points
+    CALL MPI_ALLGATHER(local_nz*Np_i,1,MPI_INTEGER,rcv_zp_i,1,MPI_INTEGER,comm_z,ierr)
+    ! the displacement array can be build from each local_np as
+    dsp_zp_i(0)=0
+    DO i_=1,num_procs_z-1
+       dsp_zp_i(i_) =dsp_zp_i(i_-1) + rcv_zp_i(i_-1)
+    END DO
+
+    !!!!! PJXYZ gather variables
+    !! Y reduction for full slices of p data but constant j
+    ! number of points recieved and displacement for the y reduction
+    ALLOCATE(rcv_yp_i(0:num_procs_ky-1),dsp_yp_i(0:num_procs_ky-1)) !Displacement sizes for balance diagnostic
+    ! all processes share their local number of points
+    CALL MPI_ALLGATHER(local_nky*Np_i,1,MPI_INTEGER,rcv_yp_i,1,MPI_INTEGER,comm_ky,ierr)
+    ! the displacement array can be build from each local_np as
+    dsp_yp_i(0)=0
+    DO i_=1,num_procs_ky-1
+       dsp_yp_i(i_) =dsp_yp_i(i_-1) + rcv_yp_i(i_-1)
+    END DO
+    !! Z reduction for full slices of py data but constant j
+    ! number of points recieved and displacement for the z reduction
+    ALLOCATE(rcv_zyp_i(0:num_procs_z-1),dsp_zyp_i(0:num_procs_z-1)) !Displacement sizes for balance diagnostic
+    ! all processes share their local number of points
+    CALL MPI_ALLGATHER(local_nz*Np_i*Nky,1,MPI_INTEGER,rcv_zyp_i,1,MPI_INTEGER,comm_z,ierr)
+    ! the displacement array can be build from each local_np as
+    dsp_zyp_i(0)=0
+    DO i_=1,num_procs_z-1
+       dsp_zyp_i(i_) =dsp_zyp_i(i_-1) + rcv_zyp_i(i_-1)
+    END DO
+
+    ! ELECTONS
     !! Z reduction for full slices of p data but constant j
     ! number of points recieved and displacement for the z reduction
     ALLOCATE(rcv_zp_e(0:num_procs_z-1),dsp_zp_e(0:num_procs_z-1)) !Displacement sizes for balance diagnostic
     ! all processes share their local number of points
-    CALL MPI_ALLGATHER(local_nz*total_np_e,1,MPI_INTEGER,rcv_zp_e,1,MPI_INTEGER,comm_z,ierr)
+    CALL MPI_ALLGATHER(local_nz*Np_e,1,MPI_INTEGER,rcv_zp_e,1,MPI_INTEGER,comm_z,ierr)
     ! the displacement array can be build from each local_np as
     dsp_zp_e(0)=0
     DO i_=1,num_procs_z-1
-       dsp_zp_e(i_) =dsp_zp_e(i_-1) + dsp_zp_e(i_-1)
+       dsp_zp_e(i_) =dsp_zp_e(i_-1) + rcv_zp_e(i_-1)
     END DO
-    ! IONS
-    ! P variables are already defined in grid module (see rcv_p_a, dsp_p_a)
-    !! Z reduction for full slices of p data but constant j
+
+    !!!!! PJXYZ gather variables
+    !! Y reduction for full slices of p data but constant j
+    ! number of points recieved and displacement for the y reduction
+    ALLOCATE(rcv_yp_e(0:num_procs_ky-1),dsp_yp_e(0:num_procs_ky-1)) !Displacement sizes for balance diagnostic
+    ! all processes share their local number of points
+    CALL MPI_ALLGATHER(local_nky*Np_e,1,MPI_INTEGER,rcv_yp_e,1,MPI_INTEGER,comm_ky,ierr)
+    ! the displacement array can be build from each local_np as
+    dsp_yp_e(0)=0
+    DO i_=1,num_procs_ky-1
+       dsp_yp_e(i_) =dsp_yp_e(i_-1) + rcv_yp_e(i_-1)
+    END DO
+    !! Z reduction for full slices of py data but constant j
     ! number of points recieved and displacement for the z reduction
-    ALLOCATE(rcv_zp_i(0:num_procs_z-1),dsp_zp_i(0:num_procs_z-1)) !Displacement sizes for balance diagnostic
+    ALLOCATE(rcv_zyp_e(0:num_procs_z-1),dsp_zyp_e(0:num_procs_z-1)) !Displacement sizes for balance diagnostic
     ! all processes share their local number of points
-    CALL MPI_ALLGATHER(local_nz*total_np_i,1,MPI_INTEGER,rcv_zp_i,1,MPI_INTEGER,comm_z,ierr)
+    CALL MPI_ALLGATHER(local_nz*Np_e*Nky,1,MPI_INTEGER,rcv_zyp_e,1,MPI_INTEGER,comm_z,ierr)
     ! the displacement array can be build from each local_np as
-    dsp_zp_i(0)=0
+    dsp_zyp_e(0)=0
     DO i_=1,num_procs_z-1
-       dsp_zp_i(i_) =dsp_zp_i(i_-1) + dsp_zp_i(i_-1)
+       dsp_zyp_e(i_) =dsp_zyp_e(i_-1) + rcv_zyp_e(i_-1)
     END DO
 
   END SUBROUTINE init_parallel_var
@@ -109,122 +157,181 @@ CONTAINS
   END SUBROUTINE gather_xyz
 
   !!!!! Gather a field in kinetic + z coordinates on rank 0 !!!!!
-  SUBROUTINE gather_pjz_e(field_sub,field_full)
-    COMPLEX(dp), DIMENSION(ips_e:ipe_e, ijs_e:ije_e, izs:ize), INTENT(IN)    :: field_sub
-    COMPLEX(dp), DIMENSION(   1:pmaxe+1,  1:jmaxe+1,   1:Nz),  INTENT(INOUT) :: field_full
-    COMPLEX(dp), DIMENSION(ips_e:ipe_e)         :: buffer_lp_cz !local p, constant z
-    COMPLEX(dp), DIMENSION(   1:pmaxe+1 )       :: buffer_fp_cz !full  p, constant z
-    COMPLEX(dp), DIMENSION(   1:pmaxe+1,  izs:ize ) :: buffer_fp_lz !full  p, local z
-    COMPLEX(dp), DIMENSION(   1:pmaxe+1,    1:Nz  ) :: buffer_fp_fz !full  p, full  z
-    INTEGER :: snd_p, snd_z, root_p, root_z, root_ky, ij, iz, Npe
+  SUBROUTINE gather_pjz_i(field_sub,field_full)
+    COMPLEX(dp), DIMENSION(ips_i:ipe_i, ijs_i:ije_i, izs:ize), INTENT(IN)    :: field_sub
+    COMPLEX(dp), DIMENSION(   1:pmaxi+1,  1:jmaxi+1,   1:Nz),  INTENT(INOUT) :: field_full
+    COMPLEX(dp), DIMENSION(ips_i:ipe_i)         :: buffer_lp_cz !local p, constant z
+    COMPLEX(dp), DIMENSION(   1:pmaxi+1 )       :: buffer_fp_cz !full  p, constant z
+    COMPLEX(dp), DIMENSION(   1:pmaxi+1,  izs:ize ) :: buffer_fp_lz !full  p, local z
+    COMPLEX(dp), DIMENSION(   1:pmaxi+1,    1:Nz  ) :: buffer_fp_fz !full  p, full  z
+    INTEGER :: snd_p, snd_z, root_p, root_z, root_ky, ij, iz
 
-    Npe    = pmaxe+1      ! total number of hermite moments
-    snd_p  = local_np_e   ! Number of points to send along y (per z)
-    snd_z  = Npe*local_nz ! Number of points to send along z (full y)
+    snd_p  = local_np_i   ! Number of points to send along y (per z)
+    snd_z  = Np_i*local_nz ! Number of points to send along z (full y)
 
     root_p = 0; root_z = 0; root_ky = 0
     IF(rank_ky .EQ. root_ky) THEN
-      DO ij = 1,jmaxe+1
+      DO ij = 1,jmaxi+1
         DO iz = izs,ize
           ! fill a buffer to contain a slice of data at constant kx and z
-          buffer_lp_cz(ips_e:ipe_e) = field_sub(ips_e:ipe_e,ij,iz)
-          CALL MPI_GATHERV(buffer_lp_cz, snd_p,           MPI_DOUBLE_COMPLEX, &
-                           buffer_fp_cz, rcv_p_e, dsp_p_e, MPI_DOUBLE_COMPLEX, &
+          buffer_lp_cz(ips_i:ipe_i) = field_sub(ips_i:ipe_i,ij,iz)
+          CALL MPI_GATHERV(buffer_lp_cz, snd_p,            MPI_DOUBLE_COMPLEX, &
+                           buffer_fp_cz, rcv_p_i, dsp_p_i, MPI_DOUBLE_COMPLEX, &
                            root_p, comm_p, ierr)
-          buffer_fp_lz(1:Npe,iz) = buffer_fp_cz(1:Npe)
+          buffer_fp_lz(1:Np_i,iz) = buffer_fp_cz(1:Np_i)
         ENDDO
 
         ! send the full line on y contained by root_kyas
         IF(rank_p .EQ. 0) THEN
           CALL MPI_GATHERV(buffer_fp_lz, snd_z,              MPI_DOUBLE_COMPLEX, &
-                           buffer_fp_fz, rcv_zp_e, dsp_zp_e, MPI_DOUBLE_COMPLEX, &
+                           buffer_fp_fz, rcv_zp_i, dsp_zp_i, MPI_DOUBLE_COMPLEX, &
                            root_z, comm_z, ierr)
         ENDIF
         ! ID 0 (the one who output) rebuild the whole array
         IF(my_id .EQ. 0) &
-          field_full(1:Npe,ij,1:Nz) = buffer_fp_fz(1:Npe,1:Nz)
+          field_full(1:Np_i,ij,1:Nz) = buffer_fp_fz(1:Np_i,1:Nz)
       ENDDO
     ENDIF
-  END SUBROUTINE gather_pjz_e
+  END SUBROUTINE gather_pjz_i
 
-  SUBROUTINE gather_pjz_i(field_sub,field_full)
-    COMPLEX(dp), DIMENSION(ips_i:ipe_i, ijs_i:ije_i, izs:ize), INTENT(IN)    :: field_sub
+  SUBROUTINE gather_pjz_e(field_sub,field_full)
+    COMPLEX(dp), DIMENSION(ips_e:ipe_e, ijs_e:ije_e, izs:ize), INTENT(IN)    :: field_sub
     COMPLEX(dp), DIMENSION(   1:pmaxi+1,  1:jmaxi+1,   1:Nz),  INTENT(INOUT) :: field_full
-    COMPLEX(dp), DIMENSION(ips_i:ipe_i)         :: buffer_lp_cz !local p, constant z
+    COMPLEX(dp), DIMENSION(ips_e:ipe_e)         :: buffer_lp_cz !local p, constant z
     COMPLEX(dp), DIMENSION(   1:pmaxi+1 )       :: buffer_fp_cz !full  p, constant z
     COMPLEX(dp), DIMENSION(   1:pmaxi+1,  izs:ize ) :: buffer_fp_lz !full  p, local z
     COMPLEX(dp), DIMENSION(   1:pmaxi+1,    1:Nz  ) :: buffer_fp_fz !full  p, full  z
-    INTEGER :: snd_p, snd_z, root_p, root_z, root_ky, ij, iz, Npi
+    INTEGER :: snd_p, snd_z, root_p, root_z, root_ky, ij, iz
 
-    Npi    = pmaxi+1      ! total number of hermite moments
-    snd_p  = local_np_i   ! Number of points to send along y (per z)
-    snd_z  = Npi*local_nz ! Number of points to send along z (full y)
+    snd_p  = local_np_e   ! Number of points to send along y (per z)
+    snd_z  = Np_e*local_nz ! Number of points to send along z (full y)
 
     root_p = 0; root_z = 0; root_ky = 0
     IF(rank_ky .EQ. root_ky) THEN
       DO ij = 1,jmaxi+1
         DO iz = izs,ize
           ! fill a buffer to contain a slice of data at constant kx and z
-          buffer_lp_cz(ips_i:ipe_i) = field_sub(ips_i:ipe_i,ij,iz)
+          buffer_lp_cz(ips_e:ipe_e) = field_sub(ips_e:ipe_e,ij,iz)
           CALL MPI_GATHERV(buffer_lp_cz, snd_p,            MPI_DOUBLE_COMPLEX, &
-                           buffer_fp_cz, rcv_p_i, dsp_p_i, MPI_DOUBLE_COMPLEX, &
+                           buffer_fp_cz, rcv_p_e, dsp_p_e, MPI_DOUBLE_COMPLEX, &
                            root_p, comm_p, ierr)
-          buffer_fp_lz(1:Npi,iz) = buffer_fp_cz(1:Npi)
+          buffer_fp_lz(1:Np_e,iz) = buffer_fp_cz(1:Np_e)
         ENDDO
 
         ! send the full line on y contained by root_kyas
         IF(rank_p .EQ. 0) THEN
           CALL MPI_GATHERV(buffer_fp_lz, snd_z,              MPI_DOUBLE_COMPLEX, &
-                           buffer_fp_fz, rcv_zp_i, dsp_zp_i, MPI_DOUBLE_COMPLEX, &
+                           buffer_fp_fz, rcv_zp_e, dsp_zp_e, MPI_DOUBLE_COMPLEX, &
                            root_z, comm_z, ierr)
         ENDIF
         ! ID 0 (the one who output) rebuild the whole array
         IF(my_id .EQ. 0) &
-          field_full(1:Npi,ij,1:Nz) = buffer_fp_fz(1:Npi,1:Nz)
+          field_full(1:Np_e,ij,1:Nz) = buffer_fp_fz(1:Np_e,1:Nz)
       ENDDO
     ENDIF
-  END SUBROUTINE gather_pjz_i
+  END SUBROUTINE gather_pjz_e
 
   !!!!! Gather a field in kinetic + spatial coordinates on rank 0 !!!!!
   !!!!! Gather a field in spatial coordinates on rank 0 !!!!!
+  SUBROUTINE gather_pjxyz_i(field_sub,field_full)
+    COMPLEX(dp), DIMENSION( ips_i:ipe_i, 1:Nj_i, ikys:ikye, 1:Nkx, izs:ize), INTENT(IN)    :: field_sub
+    COMPLEX(dp), DIMENSION(     1:Np_i,  1:Nj_i,    1:Nky,  1:Nkx,   1:Nz),  INTENT(INOUT) :: field_full
+    COMPLEX(dp), DIMENSION(ips_i:ipe_i)       :: buffer_lp_cy_cz     !local p, constant y, constant z
+    COMPLEX(dp), DIMENSION(1:Np_i)            :: buffer_fp_cy_cz     ! full p, constant y, constant z
+    COMPLEX(dp), DIMENSION(1:Np_i, ikys:ikye)    :: buffer_fp_ly_cz     ! full p,    local y, constant z
+    COMPLEX(dp), DIMENSION(1:Np_i,    1:Nky )    :: buffer_fp_fy_cz     ! full p,     full y, constant z
+    COMPLEX(dp), DIMENSION(1:Np_i,    1:Nky, izs:ize ) :: buffer_fp_fy_lz ! full p,     full y, local    z
+    COMPLEX(dp), DIMENSION(1:Np_i,    1:Nky,   1:Nz  ) :: buffer_fp_fy_fz ! full p,     full y, full     z
+    INTEGER :: snd_p, snd_y, snd_z, root_p, root_z, root_ky, iy, ix, iz, ij
+
+    snd_p  =     local_np_i    ! Number of points to send along p (per z,y)
+    snd_y  = Np_i*local_nky    ! Number of points to send along y (per z, full p)
+    snd_z  = Np_i*Nky*local_nz ! Number of points to send along z (full y, full p)
+
+    root_p = 0; root_z = 0; root_ky = 0
+
+    j: DO ij = 1,Nj_i
+      x: DO ix = 1,Nkx
+        z: DO iz = izs,ize
+          y: DO iy = ikys,ikye
+            ! fill a buffer to contain a slice of p data at constant j, ky, kx and z
+            buffer_lp_cy_cz(ips_i:ipe_i) = field_sub(ips_i:ipe_i,ij,iy,ix,iz)
+            CALL MPI_GATHERV(buffer_lp_cy_cz, snd_p,            MPI_DOUBLE_COMPLEX, &
+                             buffer_fp_cy_cz, rcv_p_i, dsp_p_i, MPI_DOUBLE_COMPLEX, &
+                             root_p, comm_p, ierr)
+            buffer_fp_ly_cz(1:Np_i,iy) = buffer_fp_cy_cz(1:Np_i)
+          ENDDO y
+          ! send the full line on p contained by root_p
+          IF(rank_p .EQ. 0) THEN
+            CALL MPI_GATHERV(buffer_fp_ly_cz, snd_y,        MPI_DOUBLE_COMPLEX, &
+                             buffer_fp_fy_cz, rcv_yp_i, dsp_yp_i, MPI_DOUBLE_COMPLEX, &
+                             root_ky, comm_ky, ierr)
+            buffer_fp_fy_lz(1:Np_i,1:Nky,iz) = buffer_fp_fy_cz(1:Np_i,1:Nky)
+          ENDIF
+        ENDDO z
+        ! send the full line on y contained by root_kyas
+        IF(rank_ky .EQ. 0) THEN
+          CALL MPI_GATHERV(buffer_fp_fy_lz, snd_z,                MPI_DOUBLE_COMPLEX, &
+                           buffer_fp_fy_fz, rcv_zyp_i, dsp_zyp_i, MPI_DOUBLE_COMPLEX, &
+                           root_z, comm_z, ierr)
+        ENDIF
+        ! ID 0 (the one who output) rebuild the whole array
+        IF(my_id .EQ. 0) &
+          field_full(1:Np_i,ij,1:Nky,ix,1:Nz) = buffer_fp_fy_fz(1:Np_i,1:Nky,1:Nz)
+      ENDDO x
+    ENDDO j
+
+  END SUBROUTINE gather_pjxyz_i
+
   SUBROUTINE gather_pjxyz_e(field_sub,field_full)
-    COMPLEX(dp), DIMENSION( ips_e:ipe_e, ijs_e:ije_e, ikys:ikye, 1:Nkx, izs:ize), INTENT(IN)    :: field_sub
-    COMPLEX(dp), DIMENSION(1:total_np_e, ijs_e:ije_e,   1:Nky,  1:Nkx,   1:Nz),  INTENT(INOUT) :: field_full
-    COMPLEX(dp), DIMENSION(ips_e:ipe_e)         :: buffer_lp_cy_cz              !local p, constant y, constant z
-    COMPLEX(dp), DIMENSION(1:total_np_e)        :: buffer_fp_cy_cz              ! full p, constant y, constant z
-    COMPLEX(dp), DIMENSION(1:total_np_e, ikys:ikye) :: buffer_fp_ly_cz          ! full p,    local y, constant z
-    COMPLEX(dp), DIMENSION(1:total_np_e,    1:Nky ) :: buffer_fp_fy_cz          ! full p,     full y, constant z
-    COMPLEX(dp), DIMENSION(1:total_np_e,    1:Nky, izs:ize ) :: buffer_fp_fy_lz ! full p,     full y, local    z
-    COMPLEX(dp), DIMENSION(1:total_np_e,    1:Nky,   1:Nz  ) :: buffer_fp_fy_fz ! full p,     full y, full     z
-    INTEGER :: snd_y, snd_z, root_p, root_z, root_ky, ix, iz
+    COMPLEX(dp), DIMENSION( ips_e:ipe_e, 1:Nj_e, ikys:ikye, 1:Nkx, izs:ize), INTENT(IN)    :: field_sub
+    COMPLEX(dp), DIMENSION(     1:Np_e,  1:Nj_e,    1:Nky,  1:Nkx,   1:Nz),  INTENT(INOUT) :: field_full
+    COMPLEX(dp), DIMENSION(ips_e:ipe_e)       :: buffer_lp_cy_cz     !local p, constant y, constant z
+    COMPLEX(dp), DIMENSION(1:Np_e)            :: buffer_fp_cy_cz     ! full p, constant y, constant z
+    COMPLEX(dp), DIMENSION(1:Np_e, ikys:ikye)    :: buffer_fp_ly_cz     ! full p,    local y, constant z
+    COMPLEX(dp), DIMENSION(1:Np_e,    1:Nky )    :: buffer_fp_fy_cz     ! full p,     full y, constant z
+    COMPLEX(dp), DIMENSION(1:Np_e,    1:Nky, izs:ize ) :: buffer_fp_fy_lz ! full p,     full y, local    z
+    COMPLEX(dp), DIMENSION(1:Np_e,    1:Nky,   1:Nz  ) :: buffer_fp_fy_fz ! full p,     full y, full     z
+    INTEGER :: snd_p, snd_y, snd_z, root_p, root_z, root_ky, iy, ix, iz, ij
 
-    snd_y  = local_nky    ! Number of points to send along y (per z)
-    snd_z  = Nky*local_nz ! Number of points to send along z (full y)
+    snd_p  =     local_np_e    ! Number of points to send along p (per z,y)
+    snd_y  = Np_e*local_nky    ! Number of points to send along y (per z, full p)
+    snd_z  = Np_e*Nky*local_nz ! Number of points to send along z (full y, full p)
 
     root_p = 0; root_z = 0; root_ky = 0
-    IF(rank_p .EQ. root_p) THEN
-      DO ix = 1,Nkx
-        DO iz = izs,ize
-          ! fill a buffer to contain a slice of data at constant kx and z
-          buffer_ly_cz(ikys:ikye) = field_sub(ikys:ikye,ix,iz)
-          CALL MPI_GATHERV(buffer_ly_cz, snd_y,        MPI_DOUBLE_COMPLEX, &
-                           buffer_fy_cz, rcv_y, dsp_y, MPI_DOUBLE_COMPLEX, &
-                           root_ky, comm_ky, ierr)
-          buffer_fy_lz(1:Nky,iz) = buffer_fy_cz(1:Nky)
-        ENDDO
 
+    j: DO ij = 1,Nj_e
+      x: DO ix = 1,Nkx
+        z: DO iz = izs,ize
+          y: DO iy = ikys,ikye
+            ! fill a buffer to contain a slice of p data at constant j, ky, kx and z
+            buffer_lp_cy_cz(ips_e:ipe_e) = field_sub(ips_e:ipe_e,ij,iy,ix,iz)
+            CALL MPI_GATHERV(buffer_lp_cy_cz, snd_p,            MPI_DOUBLE_COMPLEX, &
+                             buffer_fp_cy_cz, rcv_p_e, dsp_p_e, MPI_DOUBLE_COMPLEX, &
+                             root_p, comm_p, ierr)
+            buffer_fp_ly_cz(1:Np_e,iy) = buffer_fp_cy_cz(1:Np_e)
+          ENDDO y
+          ! send the full line on p contained by root_p
+          IF(rank_p .EQ. 0) THEN
+            CALL MPI_GATHERV(buffer_fp_ly_cz, snd_y,        MPI_DOUBLE_COMPLEX, &
+                             buffer_fp_fy_cz, rcv_yp_e, dsp_yp_e, MPI_DOUBLE_COMPLEX, &
+                             root_ky, comm_ky, ierr)
+            buffer_fp_fy_lz(1:Np_e,1:Nky,iz) = buffer_fp_fy_cz(1:Np_e,1:Nky)
+          ENDIF
+        ENDDO z
         ! send the full line on y contained by root_kyas
         IF(rank_ky .EQ. 0) THEN
-          CALL MPI_GATHERV(buffer_fy_lz, snd_z,        MPI_DOUBLE_COMPLEX, &
-                           buffer_fy_fz, rcv_zy, dsp_zy, MPI_DOUBLE_COMPLEX, &
+          CALL MPI_GATHERV(buffer_fp_fy_lz, snd_z,                MPI_DOUBLE_COMPLEX, &
+                           buffer_fp_fy_fz, rcv_zyp_e, dsp_zyp_e, MPI_DOUBLE_COMPLEX, &
                            root_z, comm_z, ierr)
         ENDIF
         ! ID 0 (the one who output) rebuild the whole array
         IF(my_id .EQ. 0) &
-          field_full(1:Nky,ix,1:Nz) = buffer_fy_fz(1:Nky,1:Nz)
-      ENDDO
-    ENDIF
-  END SUBROUTINE gather_pjxyz
+          field_full(1:Np_e,ij,1:Nky,ix,1:Nz) = buffer_fp_fy_fz(1:Np_e,1:Nky,1:Nz)
+      ENDDO x
+    ENDDO j
+
+  END SUBROUTINE gather_pjxyz_e
 
   !!!!! This is a manual way to do MPI_BCAST !!!!!!!!!!!
   SUBROUTINE manual_3D_bcast(field_)