From 72ce6017aa73239b11a3511cf9d10b3cd9ef9514 Mon Sep 17 00:00:00 2001
From: Antoine Cyril David Hoffmann <ahoffman@spcpc606.epfl.ch>
Date: Fri, 29 Apr 2022 15:14:34 +0200
Subject: [PATCH] Small loops optimization

---
 src/advance_field.F90      |  29 ++---
 src/array_mod.F90          |  10 +-
 src/collision_mod.F90      |  48 +++----
 src/moments_eq_rhs_mod.F90 | 253 +++++++++++++++++--------------------
 src/stepon.F90             |  51 ++++----
 5 files changed, 184 insertions(+), 207 deletions(-)

diff --git a/src/advance_field.F90 b/src/advance_field.F90
index 9ebdba45..a26bda44 100644
--- a/src/advance_field.F90
+++ b/src/advance_field.F90
@@ -66,27 +66,16 @@ CONTAINS
 
     SELECT CASE (updatetlevel)
     CASE(1)
-      DO iky=ikys,ikye
-          DO ikx=ikxs,ikxe
-            DO iz=izs,ize
-              DO istage=1,ntimelevel
-                f(ikx,iky,iz,1) = f(ikx,iky,iz,1) + dt*b_E(istage)*f_rhs(ikx,iky,iz,istage)
-              END DO
-            END DO
-          END DO
-      END DO
+        DO istage=1,ntimelevel
+          f(ikxs:ikxe,ikys:ikye,izs:ize,1) = f(ikxs:ikxe,ikys:ikye,izs:ize,1) &
+                   + dt*b_E(istage)*f_rhs(ikxs:ikxe,ikys:ikye,izs:ize,istage)
+        END DO
     CASE DEFAULT
-      DO iky=ikys,ikye
-          DO ikx=ikxs,ikxe
-            DO iz=izs,ize
-              f(ikx,iky,iz,updatetlevel) = f(ikx,iky,iz,1);
-              DO istage=1,updatetlevel-1
-                f(ikx,iky,iz,updatetlevel) = f(ikx,iky,iz,updatetlevel) + &
-                                      dt*A_E(updatetlevel,istage)*f_rhs(ikx,iky,iz,istage)
-              END DO
-            END DO
-          END DO
-      END DO
+        f(ikxs:ikxe,ikys:ikye,izs:ize,updatetlevel) = f(ikxs:ikxe,ikys:ikye,izs:ize,1);
+        DO istage=1,updatetlevel-1
+          f(ikxs:ikxe,ikys:ikye,izs:ize,updatetlevel) = f(ikxs:ikxe,ikys:ikye,izs:ize,updatetlevel) + &
+                            dt*A_E(updatetlevel,istage)*f_rhs(ikxs:ikxe,ikys:ikye,izs:ize,istage)
+        END DO
     END SELECT
   END SUBROUTINE advance_field
 
diff --git a/src/array_mod.F90 b/src/array_mod.F90
index 5ac91e6e..419d3c6c 100644
--- a/src/array_mod.F90
+++ b/src/array_mod.F90
@@ -11,6 +11,14 @@ MODULE array
   COMPLEX(dp), DIMENSION(:,:,:,:,:), ALLOCATABLE :: nadiab_moments_e
   COMPLEX(dp), DIMENSION(:,:,:,:,:), ALLOCATABLE :: nadiab_moments_i
 
+  ! Derivatives and interpolated moments
+  COMPLEX(dp), DIMENSION(:,:,:,:,:), ALLOCATABLE :: ddz_nepj
+  COMPLEX(dp), DIMENSION(:,:,:,:,:), ALLOCATABLE :: interp_nepj
+  COMPLEX(dp), DIMENSION(:,:,:,:,:), ALLOCATABLE :: ddz2_Nepj
+  COMPLEX(dp), DIMENSION(:,:,:,:,:), ALLOCATABLE :: ddz_nipj
+  COMPLEX(dp), DIMENSION(:,:,:,:,:), ALLOCATABLE :: interp_nipj
+  COMPLEX(dp), DIMENSION(:,:,:,:,:), ALLOCATABLE :: ddz2_Nipj
+
   ! Arrays to store special initial modes (semi linear simulation)
   ! Zonal ones (ky=0)
   COMPLEX(dp), DIMENSION(:,:,:,:), ALLOCATABLE :: moments_e_ZF
@@ -48,7 +56,7 @@ MODULE array
   REAL(dp), DIMENSION(:,:), ALLOCATABLE :: zNipm1j, zNipm1jp1, zNipm1jm1            ! mirror lin coeff for adiab mom
   REAL(dp), DIMENSION(:,:), ALLOCATABLE :: xphij_e, xphijp1_e, xphijm1_e
   REAL(dp), DIMENSION(:,:), ALLOCATABLE :: xphij_i, xphijp1_i, xphijm1_i
-  
+
   ! Kernel function evaluation (ij,ikx,iky,iz,odd/even p)
   REAL(dp), DIMENSION(:,:,:,:,:), ALLOCATABLE :: kernel_e
   REAL(dp), DIMENSION(:,:,:,:,:), ALLOCATABLE :: kernel_i
diff --git a/src/collision_mod.F90 b/src/collision_mod.F90
index eed589b7..33954f78 100644
--- a/src/collision_mod.F90
+++ b/src/collision_mod.F90
@@ -478,32 +478,32 @@ CONTAINS
     COMPLEX(dp), DIMENSION(ips_i:ipe_i) :: TColl_distr_i
     COMPLEX(dp) :: TColl
     INTEGER :: ikxs_C, ikxe_C, ikys_C, ikye_C
-    DO ikx = ikxs,ikxe
+    DO iz = izs,ize
       DO iky = ikys,ikye
-        DO iz = izs,ize
+        DO ikx = ikxs,ikxe
           IF(KIN_E) THEN
-          ! Electrons
-          DO ij = 1,Jmaxe+1
-            ! Loop over all p to compute sub collision term
-            DO ip = 1,total_np_e
-              CALL apply_COSOlver_mat_e(ip,ij,ikx,iky,iz,TColl)
-              local_sum_e(ip) = TColl
-            ENDDO
-            IF (num_procs_p .GT. 1) THEN
-              ! Sum up all the sub collision terms on root 0
-              CALL MPI_REDUCE(local_sum_e, buffer_e, total_np_e, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, comm_p, ierr)
-              ! distribute the sum over the process among p
-              CALL MPI_SCATTERV(buffer_e, counts_np_e, displs_np_e, MPI_DOUBLE_COMPLEX,&
-                                TColl_distr_e, local_np_e, MPI_DOUBLE_COMPLEX,&
-                                0, comm_p, ierr)
-            ELSE
-              TColl_distr_e = local_sum_e
-            ENDIF
-            ! Write in output variable
-            DO ip = ips_e,ipe_e
-              TColl_e(ip,ij,ikx,iky,iz) = TColl_distr_e(ip)
+            DO ij = 1,Jmaxe+1
+              ! Electrons
+                ! Loop over all p to compute sub collision term
+                DO ip = 1,total_np_e
+                  CALL apply_COSOlver_mat_e(ip,ij,ikx,iky,iz,TColl)
+                  local_sum_e(ip) = TColl
+                ENDDO
+                IF (num_procs_p .GT. 1) THEN
+                  ! Sum up all the sub collision terms on root 0
+                  CALL MPI_REDUCE(local_sum_e, buffer_e, total_np_e, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, comm_p, ierr)
+                  ! distribute the sum over the process among p
+                  CALL MPI_SCATTERV(buffer_e, counts_np_e, displs_np_e, MPI_DOUBLE_COMPLEX,&
+                                    TColl_distr_e, local_np_e, MPI_DOUBLE_COMPLEX,&
+                                    0, comm_p, ierr)
+                ELSE
+                  TColl_distr_e = local_sum_e
+                ENDIF
+                ! Write in output variable
+                DO ip = ips_e,ipe_e
+                  TColl_e(ip,ij,ikx,iky,iz) = TColl_distr_e(ip)
+                ENDDO
             ENDDO
-          ENDDO
           ENDIF
           ! Ions
           DO ij = 1,Jmaxi+1
@@ -533,7 +533,7 @@ CONTAINS
   END SUBROUTINE compute_cosolver_coll
 
   !******************************************************************************!
-  !!!!!!! Compute ion collision term
+  !!!!!!! Compute electron collision term
   !******************************************************************************!
   SUBROUTINE apply_COSOlver_mat_e(ip_,ij_,ikx_,iky_,iz_,TColl_)
     IMPLICIT NONE
diff --git a/src/moments_eq_rhs_mod.F90 b/src/moments_eq_rhs_mod.F90
index ac081a88..27012952 100644
--- a/src/moments_eq_rhs_mod.F90
+++ b/src/moments_eq_rhs_mod.F90
@@ -35,41 +35,30 @@ SUBROUTINE moments_eq_rhs_e
   COMPLEX(dp) :: Tnepp1j, Tnepm1j, Tnepp1jm1, Tnepm1jm1 ! Terms from mirror force with non adiab moments
   COMPLEX(dp) :: Tperp, Tpar, Tmir, Tphi
   COMPLEX(dp) :: Unepm1j, Unepm1jp1, Unepm1jm1 ! Terms from mirror force with adiab moments
-  COMPLEX(dp) :: i_ky
-  ! To store derivatives and odd-even z grid interpolations
-  COMPLEX(dp), DIMENSION(izs:ize) :: ddznepp1j, ddznepm1j, &
-              nepp1j, nepp1jm1, nepm1j, nepm1jm1, nepm1jp1, ddz2Nepj
+  COMPLEX(dp) :: i_ky,phikxkyz
 
    ! Measuring execution time
   CALL cpu_time(t0_rhs)
 
-  ! Kinetic loops
-  ploope : DO ip = ips_e, ipe_e ! loop over Hermite degree indices
-    p_int = parray_e(ip)    ! Hermite polynom. degree
-    eo    = MODULO(p_int,2) ! Indicates if we are on even or odd z grid
-    jloope : DO ij = ijs_e, ije_e ! loop over Laguerre degree indices
-    j_int = jarray_e(ij)
-      ! Spatial loops
-      kxloope : DO ikx = ikxs,ikxe
-      kx     = kxarray(ikx)   ! radial wavevector
-      kyloope : DO iky = ikys,ikye
+  ! Spatial loops
+  zloope : DO  iz = izs,ize
+    kyloope : DO iky = ikys,ikye
       ky     = kyarray(iky)   ! toroidal wavevector
-        i_ky   = imagu * ky     ! toroidal derivative
-        IF (Nky .EQ. 1) i_ky = imagu * kxarray(ikx) ! If 1D simulation we put kx as ky
-        ! Compute z derivatives and odd-even z interpolations
-        CALL   grad_z(eo,nadiab_moments_e(ip+1,ij  ,ikx,iky,izgs:izge), ddznepp1j(izs:ize))
-        CALL   grad_z(eo,nadiab_moments_e(ip-1,ij  ,ikx,iky,izgs:izge), ddznepm1j(izs:ize))
-        CALL interp_z(eo,nadiab_moments_e(ip+1,ij  ,ikx,iky,izgs:izge),  nepp1j  (izs:ize))
-        CALL interp_z(eo,nadiab_moments_e(ip+1,ij-1,ikx,iky,izgs:izge),  nepp1jm1(izs:ize))
-        CALL interp_z(eo,nadiab_moments_e(ip-1,ij  ,ikx,iky,izgs:izge),  nepm1j  (izs:ize))
-        CALL interp_z(eo,nadiab_moments_e(ip-1,ij+1,ikx,iky,izgs:izge),  nepm1jp1(izs:ize))
-        CALL interp_z(eo,nadiab_moments_e(ip-1,ij-1,ikx,iky,izgs:izge),  nepm1jm1(izs:ize))
-        ! Parallel hyperdiffusion
-        CALL  grad_z2(moments_e(ip,ij,ikx,iky,izgs:izge,updatetlevel),   ddz2Nepj(izs:ize))
+      i_ky   = imagu * ky     ! toroidal derivative
+
+      kxloope : DO ikx = ikxs,ikxe
+        kx       = kxarray(ikx)   ! radial wavevector
+        phikxkyz = phi(ikx,iky,iz)! tmp phi value
+
+        ! Kinetic loops
+        jloope : DO ij = ijs_e, ije_e  ! This loop is from 1 to jmaxi+1
+          j_int = jarray_e(ij)
+
+          ploope : DO ip = ips_e, ipe_e  ! Hermite loop
+            p_int = parray_e(ip)    ! Hermite degree
+            eo    = MODULO(p_int,2) ! Indicates if we are on odd or even z grid
+            kperp2= kparray(ikx,iky,iz,eo)**2
 
-        zloope : DO  iz = izs,ize
-          ! kperp
-          kperp2= kparray(ikx,iky,iz,eo)**2
 
           !! Compute moments mixing terms
           Tperp = 0._dp; Tpar = 0._dp; Tmir = 0._dp
@@ -84,24 +73,26 @@ SUBROUTINE moments_eq_rhs_e
           Tnepjp1 = xnepjp1(ij) * nadiab_moments_e(ip,ij+1,ikx,iky,iz)
           ! term propto n_e^{p,j-1}
           Tnepjm1 = xnepjm1(ij) * nadiab_moments_e(ip,ij-1,ikx,iky,iz)
+          ! Parallel dynamic
           ! ddz derivative for Landau damping term
-          Tpar = xnepp1j(ip) * ddznepp1j(iz) + xnepm1j(ip) * ddznepm1j(iz)
-          ! Mirror terms (trapping)
-          Tnepp1j   = ynepp1j  (ip,ij) * nepp1j  (iz)
-          Tnepp1jm1 = ynepp1jm1(ip,ij) * nepp1jm1(iz)
-          Tnepm1j   = ynepm1j  (ip,ij) * nepm1j  (iz)
-          Tnepm1jm1 = ynepm1jm1(ip,ij) * nepm1jm1(iz)
+          Tpar      = xnepp1j(ip) * ddz_nepj(ip+1,ij,ikx,iky,iz) &
+                    + xnepm1j(ip) * ddz_nepj(ip-1,ij,ikx,iky,iz)
+          ! Mirror terms
+          Tnepp1j   = ynepp1j  (ip,ij) * interp_nepj(ip+1,ij  ,ikx,iky,iz)
+          Tnepp1jm1 = ynepp1jm1(ip,ij) * interp_nepj(ip+1,ij-1,ikx,iky,iz)
+          Tnepm1j   = ynepm1j  (ip,ij) * interp_nepj(ip-1,ij  ,ikx,iky,iz)
+          Tnepm1jm1 = ynepm1jm1(ip,ij) * interp_nepj(ip-1,ij-1,ikx,iky,iz)
           ! Trapping terms
-          Unepm1j   = znepm1j  (ip,ij) * nepm1j  (iz)
-          Unepm1jp1 = znepm1jp1(ip,ij) * nepm1jp1(iz)
-          Unepm1jm1 = znepm1jm1(ip,ij) * nepm1jm1(iz)
+          Unepm1j   = znepm1j  (ip,ij) * interp_nepj(ip-1,ij  ,ikx,iky,iz)
+          Unepm1jp1 = znepm1jp1(ip,ij) * interp_nepj(ip-1,ij+1,ikx,iky,iz)
+          Unepm1jm1 = znepm1jm1(ip,ij) * interp_nepj(ip-1,ij-1,ikx,iky,iz)
 
           Tmir = Tnepp1j + Tnepp1jm1 + Tnepm1j + Tnepm1jm1 + Unepm1j + Unepm1jp1 + Unepm1jm1
           !! Electrical potential term
           IF ( p_int .LE. 2 ) THEN ! kronecker p0 p1 p2
             Tphi = (xphij_i  (ip,ij)*kernel_e(ij  ,ikx,iky,iz,eo) &
                   + xphijp1_i(ip,ij)*kernel_e(ij+1,ikx,iky,iz,eo) &
-                  + xphijm1_i(ip,ij)*kernel_e(ij-1,ikx,iky,iz,eo))*phi(ikx,iky,iz)
+                  + xphijm1_i(ip,ij)*kernel_e(ij-1,ikx,iky,iz,eo))*phikxkyz
           ELSE
             Tphi = 0._dp
           ENDIF
@@ -119,17 +110,17 @@ SUBROUTINE moments_eq_rhs_e
               ! Numerical perpendicular hyperdiffusion (totally artificial, for stability purpose)
               - (mu_x*kx**4 + mu_y*ky**4)*moments_e(ip,ij,ikx,iky,iz,updatetlevel) &
               ! Numerical parallel hyperdiffusion "+ (mu_z*kz**4)"
-              + mu_z * diff_dz_coeff * ddz2Nepj(iz) &
+              + mu_z * diff_dz_coeff * ddz2_Nepj(ip,ij,ikx,iky,iz) &
               ! Collision term
               + TColl_e(ip,ij,ikx,iky,iz) &
               ! Nonlinear term
               - Sepj(ip,ij,ikx,iky,iz)
 
-         END DO zloope
-        END DO kyloope
+          END DO ploope
+        END DO jloope
       END DO kxloope
-    END DO jloope
-  END DO ploope
+    END DO kyloope
+  END DO zloope
 
   ! Execution time end
   CALL cpu_time(t1_rhs)
@@ -162,103 +153,95 @@ SUBROUTINE moments_eq_rhs_i
   COMPLEX(dp) :: Tnipp1j, Tnipm1j, Tnipp1jm1, Tnipm1jm1 ! Terms from mirror force with non adiab moments
   COMPLEX(dp) :: Unipm1j, Unipm1jp1, Unipm1jm1 ! Terms from mirror force with adiab moments
   COMPLEX(dp) :: Tperp, Tpar, Tmir, Tphi
-  COMPLEX(dp) :: i_ky
-  ! To store derivatives and odd-even z grid interpolations
-  COMPLEX(dp), DIMENSION(izs:ize) :: ddznipp1j, ddznipm1j, &
-              nipp1j, nipp1jm1, nipm1j, nipm1jm1, nipm1jp1, ddz2Nipj
+  COMPLEX(dp) :: i_ky, phikxkyz
+
   ! Measuring execution time
   CALL cpu_time(t0_rhs)
 
-  ! Kinetic loops
-  ploopi : DO ip = ips_i, ipe_i  ! Hermite loop
-    p_int = parray_i(ip)    ! Hermite degree
-    eo    = MODULO(p_int,2) ! Indicates if we are on odd or even z grid
-    jloopi : DO ij = ijs_i, ije_i  ! This loop is from 1 to jmaxi+1
-      j_int = jarray_i(ij)
-      ! Spatial loops
-      kxloopi : DO ikx = ikxs,ikxe
-      kx     = kxarray(ikx)   ! radial wavevector
-      kyloopi : DO iky = ikys,ikye
-      ky     = kyarray(iky)   ! toroidal wavevector
-        i_ky   = imagu * ky     ! toroidal derivative
-        IF (Nky .EQ. 1) i_ky = imagu * kxarray(ikx) ! If 1D simulation we put kx as ky
-        ! Compute z derivatives and odd-even z interpolations
-        CALL   grad_z(eo,nadiab_moments_i(ip+1,ij  ,ikx,iky,izgs:izge),ddznipp1j(izs:ize))
-        CALL   grad_z(eo,nadiab_moments_i(ip-1,ij  ,ikx,iky,izgs:izge),ddznipm1j(izs:ize))
-        CALL interp_z(eo,nadiab_moments_i(ip+1,ij  ,ikx,iky,izgs:izge), nipp1j  (izs:ize))
-        CALL interp_z(eo,nadiab_moments_i(ip+1,ij-1,ikx,iky,izgs:izge), nipp1jm1(izs:ize))
-        CALL interp_z(eo,nadiab_moments_i(ip-1,ij  ,ikx,iky,izgs:izge), nipm1j  (izs:ize))
-        CALL interp_z(eo,nadiab_moments_i(ip-1,ij+1,ikx,iky,izgs:izge), nipm1jp1(izs:ize))
-        CALL interp_z(eo,nadiab_moments_i(ip-1,ij-1,ikx,iky,izgs:izge), nipm1jm1(izs:ize))
-        ! for hyperdiffusion
-        CALL  grad_z2(moments_i(ip,ij,ikx,iky,izgs:izge,updatetlevel),  ddz2Nipj(:))
-
-        zloopi : DO  iz = izs,ize
-          kperp2= kparray(ikx,iky,iz,eo)**2
-
-          !! Compute moments mixing terms
-          Tperp = 0._dp; Tpar = 0._dp; Tmir = 0._dp
-          ! Perpendicular dynamic
-          ! term propto n_i^{p,j}
-          Tnipj   = xnipj(ip,ij) * nadiab_moments_i(ip    ,ij  ,ikx,iky,iz)
-          ! term propto n_i^{p+2,j}
-          Tnipp2j = xnipp2j(ip)  * nadiab_moments_i(ip+pp2,ij  ,ikx,iky,iz)
-          ! term propto n_i^{p-2,j}
-          Tnipm2j = xnipm2j(ip)  * nadiab_moments_i(ip-pp2,ij  ,ikx,iky,iz)
-          ! term propto n_i^{p,j+1}
-          Tnipjp1 = xnipjp1(ij)  * nadiab_moments_i(ip    ,ij+1,ikx,iky,iz)
-          ! term propto n_i^{p,j-1}
-          Tnipjm1 = xnipjm1(ij)  * nadiab_moments_i(ip    ,ij-1,ikx,iky,iz)
-          ! Tperp
-          Tperp = Tnipj + Tnipp2j + Tnipm2j + Tnipjp1 + Tnipjm1
-          ! Parallel dynamic
-          ! ddz derivative for Landau damping term
-          Tpar = xnipp1j(ip) * ddznipp1j(iz) + xnipm1j(ip) * ddznipm1j(iz)
-          ! Mirror terms
-          Tnipp1j   = ynipp1j  (ip,ij) * nipp1j  (iz)
-          Tnipp1jm1 = ynipp1jm1(ip,ij) * nipp1jm1(iz)
-          Tnipm1j   = ynipm1j  (ip,ij) * nipm1j  (iz)
-          Tnipm1jm1 = ynipm1jm1(ip,ij) * nipm1jm1(iz)
-          ! Trapping terms
-          Unipm1j   = znipm1j  (ip,ij) * nipm1j  (iz)
-          Unipm1jp1 = znipm1jp1(ip,ij) * nipm1jp1(iz)
-          Unipm1jm1 = znipm1jm1(ip,ij) * nipm1jm1(iz)
-
-          Tmir = Tnipp1j + Tnipp1jm1 + Tnipm1j + Tnipm1jm1 + Unipm1j + Unipm1jp1 + Unipm1jm1
 
-          !! Electrical potential term
-          IF ( p_int .LE. 2 ) THEN ! kronecker p0 p1 p2
-            Tphi = (xphij_i  (ip,ij)*kernel_i(ij  ,ikx,iky,iz,eo) &
-                  + xphijp1_i(ip,ij)*kernel_i(ij+1,ikx,iky,iz,eo) &
-                  + xphijm1_i(ip,ij)*kernel_i(ij-1,ikx,iky,iz,eo))*phi(ikx,iky,iz)
-          ELSE
-            Tphi = 0._dp
-          ENDIF
-
-          !! Sum of all RHS terms
-          moments_rhs_i(ip,ij,ikx,iky,iz,updatetlevel) = &
-              ! Perpendicular magnetic gradient/curvature effects
-              - imagu*Ckxky(ikx,iky,iz,eo)*hatR(iz,eo) * Tperp &
-              ! Parallel coupling (Landau damping)
-              - gradz_coeff(iz,eo) * Tpar &
-              ! Mirror term (parallel magnetic gradient)
-              - gradzB(iz,eo) * gradz_coeff(iz,eo) * Tmir &
-              ! Drives (density + temperature gradients)
-              - i_ky * Tphi &
-              ! Numerical hyperdiffusion (totally artificial, for stability purpose)
-              - (mu_x*kx**4 + mu_y*ky**4)*moments_i(ip,ij,ikx,iky,iz,updatetlevel) &
-              ! Numerical parallel hyperdiffusion "+ (mu_z*kz**4)"
-              + mu_z * diff_dz_coeff * ddz2Nipj(iz) &
-              ! Collision term
-              + TColl_i(ip,ij,ikx,iky,iz)&
-              ! Nonlinear term
-              - Sipj(ip,ij,ikx,iky,iz)
+  ! Spatial loops
+  zloopi : DO  iz = izs,ize
+    kyloopi : DO iky = ikys,ikye
+      ky     = kyarray(iky)   ! toroidal wavevector
+      i_ky   = imagu * ky     ! toroidal derivative
 
-          END DO zloopi
-        END DO kyloopi
+      kxloopi : DO ikx = ikxs,ikxe
+        kx       = kxarray(ikx)   ! radial wavevector
+        phikxkyz = phi(ikx,iky,iz)! tmp phi value
+
+        ! Kinetic loops
+        jloopi : DO ij = ijs_i, ije_i  ! This loop is from 1 to jmaxi+1
+          j_int = jarray_i(ij)
+
+          ploopi : DO ip = ips_i, ipe_i  ! Hermite loop
+            p_int = parray_i(ip)    ! Hermite degree
+            eo    = MODULO(p_int,2) ! Indicates if we are on odd or even z grid
+            kperp2= kparray(ikx,iky,iz,eo)**2
+
+            !! Compute moments mixing terms
+            Tperp = 0._dp; Tpar = 0._dp; Tmir = 0._dp
+            ! Perpendicular dynamic
+            ! term propto n_i^{p,j}
+            Tnipj   = xnipj(ip,ij) * nadiab_moments_i(ip    ,ij  ,ikx,iky,iz)
+            ! term propto n_i^{p+2,j}
+            Tnipp2j = xnipp2j(ip)  * nadiab_moments_i(ip+pp2,ij  ,ikx,iky,iz)
+            ! term propto n_i^{p-2,j}
+            Tnipm2j = xnipm2j(ip)  * nadiab_moments_i(ip-pp2,ij  ,ikx,iky,iz)
+            ! term propto n_i^{p,j+1}
+            Tnipjp1 = xnipjp1(ij)  * nadiab_moments_i(ip    ,ij+1,ikx,iky,iz)
+            ! term propto n_i^{p,j-1}
+            Tnipjm1 = xnipjm1(ij)  * nadiab_moments_i(ip    ,ij-1,ikx,iky,iz)
+            ! Tperp
+            Tperp   = Tnipj + Tnipp2j + Tnipm2j + Tnipjp1 + Tnipjm1
+            ! Parallel dynamic
+            ! ddz derivative for Landau damping term
+            Tpar      = xnipp1j(ip) * ddz_nipj(ip+1,ij,ikx,iky,iz) &
+                      + xnipm1j(ip) * ddz_nipj(ip-1,ij,ikx,iky,iz)
+            ! Mirror terms
+            Tnipp1j   = ynipp1j  (ip,ij) * interp_nipj(ip+1,ij  ,ikx,iky,iz)
+            Tnipp1jm1 = ynipp1jm1(ip,ij) * interp_nipj(ip+1,ij-1,ikx,iky,iz)
+            Tnipm1j   = ynipm1j  (ip,ij) * interp_nipj(ip-1,ij  ,ikx,iky,iz)
+            Tnipm1jm1 = ynipm1jm1(ip,ij) * interp_nipj(ip-1,ij-1,ikx,iky,iz)
+            ! Trapping terms
+            Unipm1j   = znipm1j  (ip,ij) * interp_nipj(ip-1,ij  ,ikx,iky,iz)
+            Unipm1jp1 = znipm1jp1(ip,ij) * interp_nipj(ip-1,ij+1,ikx,iky,iz)
+            Unipm1jm1 = znipm1jm1(ip,ij) * interp_nipj(ip-1,ij-1,ikx,iky,iz)
+
+            Tmir = Tnipp1j + Tnipp1jm1 + Tnipm1j + Tnipm1jm1 + Unipm1j + Unipm1jp1 + Unipm1jm1
+
+            !! Electrical potential term
+            IF ( p_int .LE. 2 ) THEN ! kronecker p0 p1 p2
+              Tphi = (xphij_i  (ip,ij)*kernel_i(ij  ,ikx,iky,iz,eo) &
+                    + xphijp1_i(ip,ij)*kernel_i(ij+1,ikx,iky,iz,eo) &
+                    + xphijm1_i(ip,ij)*kernel_i(ij-1,ikx,iky,iz,eo))*phikxkyz
+            ELSE
+              Tphi = 0._dp
+            ENDIF
+
+            !! Sum of all RHS terms
+            moments_rhs_i(ip,ij,ikx,iky,iz,updatetlevel) = &
+                ! Perpendicular magnetic gradient/curvature effects
+                - imagu*Ckxky(ikx,iky,iz,eo)*hatR(iz,eo) * Tperp &
+                ! Parallel coupling (Landau damping)
+                - gradz_coeff(iz,eo) * Tpar &
+                ! Mirror term (parallel magnetic gradient)
+                - gradzB(iz,eo) * gradz_coeff(iz,eo) * Tmir &
+                ! Drives (density + temperature gradients)
+                - i_ky * Tphi &
+                ! Numerical hyperdiffusion (totally artificial, for stability purpose)
+                - (mu_x*kx**4 + mu_y*ky**4)*moments_i(ip,ij,ikx,iky,iz,updatetlevel) &
+                ! Numerical parallel hyperdiffusion "+ (mu_z*kz**4)"
+                + mu_z * diff_dz_coeff * ddz2_Nipj(ip,ij,ikx,iky,iz) &
+                ! Collision term
+                + TColl_i(ip,ij,ikx,iky,iz)&
+                ! Nonlinear term
+                - Sipj(ip,ij,ikx,iky,iz)
+
+          END DO ploopi
+        END DO jloopi
       END DO kxloopi
-    END DO jloopi
-  END DO ploopi
+    END DO kyloopi
+  END DO zloopi
 
   ! Execution time end
   CALL cpu_time(t1_rhs)
diff --git a/src/stepon.F90 b/src/stepon.F90
index c518a430..d928a4b8 100644
--- a/src/stepon.F90
+++ b/src/stepon.F90
@@ -1,18 +1,14 @@
 SUBROUTINE stepon
   !   Advance one time step, (num_step=4 for Runge Kutta 4 scheme)
   USE advance_field_routine, ONLY: advance_time_level, advance_field, advance_moments
-  USE array , ONLY: moments_rhs_e, moments_rhs_i, Sepj, Sipj
   USE basic
   USE closure
-  USE fields, ONLY: moments_e, moments_i, phi
-  USE initial_par, ONLY: ACT_ON_MODES
   USE ghosts
   USE grid
   USE model, ONLY : LINEARITY, KIN_E
   use prec_const
   USE time_integration
   USE numerics, ONLY: play_with_modes
-  USE processing, ONLY: compute_nadiab_moments, compute_fluid_moments
   USE utility, ONLY: checkfield
   IMPLICIT NONE
 
@@ -20,7 +16,7 @@ SUBROUTINE stepon
   LOGICAL :: mlend
 
    DO num_step=1,ntimelevel ! eg RK4 compute successively k1, k2, k3, k4
-   !----- BEFORE: All fields are updated for step = n
+   !----- BEFORE: All fields+ghosts are updated for step = n
       ! Compute right hand side from current fields
       ! N_rhs(N_n, nadia_n, phi_n, S_n, Tcoll_n)
       CALL assemble_RHS
@@ -61,9 +57,10 @@ SUBROUTINE stepon
        USE moments_eq_rhs, ONLY: compute_moments_eq_rhs
        USE collision,      ONLY: compute_TColl
        USE nonlinear,      ONLY: compute_Sapj
+       USE processing,     ONLY: compute_nadiab_moments_z_gradients_and_interp
        IMPLICIT NONE
-         ! compute auxiliary non adiabatic moments arrays
-         CALL compute_nadiab_moments
+         ! compute auxiliary non adiabatic moments array, gradients and interp
+         CALL compute_nadiab_moments_z_gradients_and_interp
          ! compute nonlinear term ("if linear" is included inside)
          CALL compute_Sapj
          ! compute collision term ("if coll, if nu >0" is included inside)
@@ -83,14 +80,14 @@ SUBROUTINE stepon
         IF(.NOT.nlend) THEN
            mlend=mlend .or. checkfield(phi,' phi')
            IF(KIN_E) THEN
-           DO ip=ipgs_e,ipge_e
-             DO ij=ijgs_e,ijge_e
+           DO ij=ijgs_e,ijge_e
+             DO ip=ipgs_e,ipge_e
               mlend=mlend .or. checkfield(moments_e(ip,ij,:,:,:,updatetlevel),' moments_e')
              ENDDO
            ENDDO
            ENDIF
-           DO ip=ipgs_i,ipge_i
-             DO ij=ijgs_i,ijge_i
+           DO ij=ijgs_i,ijge_i
+             DO ip=ipgs_i,ipge_i
               mlend=mlend .or. checkfield(moments_i(ip,ij,:,:,:,updatetlevel),' moments_i')
              ENDDO
            ENDDO
@@ -104,11 +101,11 @@ SUBROUTINE stepon
 
       SUBROUTINE anti_aliasing
         IF(KIN_E)THEN
-        DO ip=ipgs_e,ipge_e
-          DO ij=ijgs_e,ijge_e
+        DO iz=izgs,izge
+          DO iky=ikys,ikye
             DO ikx=ikxs,ikxe
-              DO iky=ikys,ikye
-                DO iz=izgs,izge
+              DO ij=ijgs_e,ijge_e
+                DO ip=ipgs_e,ipge_e
                   moments_e( ip,ij,ikx,iky,iz,:) = AA_x(ikx)* AA_y(iky) * moments_e( ip,ij,ikx,iky,iz,:)
                 END DO
               END DO
@@ -116,11 +113,11 @@ SUBROUTINE stepon
           END DO
         END DO
         ENDIF
-        DO ip=ipgs_i,ipge_i
-          DO ij=ijs_i,ije_i
+        DO iz=izgs,izge
+          DO iky=ikys,ikye
             DO ikx=ikxs,ikxe
-              DO iky=ikys,ikye
-                DO iz=izgs,izge
+              DO ij=ijs_i,ije_i
+                DO ip=ipgs_i,ipge_i
                   moments_i( ip,ij,ikx,iky,iz,:) = AA_x(ikx)* AA_y(iky) * moments_i( ip,ij,ikx,iky,iz,:)
                 END DO
               END DO
@@ -133,12 +130,12 @@ SUBROUTINE stepon
         IF ( contains_kx0 ) THEN
           ! Electron moments
           IF(KIN_E) THEN
-          DO ip=ipgs_e,ipge_e
-            DO ij=ijgs_e,ijge_e
-              DO iz=izs,ize
-                DO iky=2,Nky/2 !symmetry at kx = 0
-                  moments_e( ip,ij,ikx_0,iky,iz, :) = CONJG(moments_e( ip,ij,ikx_0,Nky+2-iky,iz, :))
-                END DO
+            DO iz=izs,ize
+              DO ij=ijgs_e,ijge_e
+                DO ip=ipgs_e,ipge_e
+                  DO iky=2,Nky/2 !symmetry at kx = 0
+                    moments_e( ip,ij,ikx_0,iky,iz, :) = CONJG(moments_e( ip,ij,ikx_0,Nky+2-iky,iz, :))
+                  END DO
                 ! must be real at origin
                 moments_e(ip,ij, ikx_0,iky_0,iz, :) = REAL(moments_e(ip,ij, ikx_0,iky_0,iz, :))
               END DO
@@ -146,9 +143,9 @@ SUBROUTINE stepon
           END DO
           ENDIF
           ! Ion moments
-          DO ip=ipgs_i,ipge_i
+          DO iz=izs,ize
             DO ij=ijgs_i,ijge_i
-              DO iz=izs,ize
+              DO ip=ipgs_i,ipge_i
                 DO iky=2,Nky/2 !symmetry at kx = 0
                   moments_i( ip,ij,ikx_0,iky,iz, :) = CONJG(moments_i( ip,ij,ikx_0,Nky+2-iky,iz, :))
                 END DO
-- 
GitLab