From 6640dd9ffb1473694b86c3a569ab93515015f2a7 Mon Sep 17 00:00:00 2001
From: Antoine Cyril David Hoffmann <ahoffman@spcpc606.epfl.ch>
Date: Tue, 8 Dec 2020 17:07:33 +0100
Subject: [PATCH] Optimized version of nonlinear term computation

---
 src/compute_Sapj.F90     | 396 +++++++++++++++++++--------------------
 src/compute_Sapj_old.F90 | 207 ++++++++++++++++++++
 src/compute_Sapj_opt.F90 | 189 +++++++++++++++++++
 3 files changed, 585 insertions(+), 207 deletions(-)
 create mode 100644 src/compute_Sapj_old.F90
 create mode 100644 src/compute_Sapj_opt.F90

diff --git a/src/compute_Sapj.F90 b/src/compute_Sapj.F90
index b4d02513..a6cefadb 100644
--- a/src/compute_Sapj.F90
+++ b/src/compute_Sapj.F90
@@ -1,207 +1,189 @@
-SUBROUTINE compute_Sapj
-  ! This routine is meant to compute the non linear term for each specie and degree
-  !! In real space Sapj ~ b*(grad(phi) x grad(g)) which in moments in fourier becomes
-  !! Sapj = Sum_n (ikr Kn phi)#(ikz Sum_s d_njs Naps) - (ikz Kn phi)#(ikr Sum_s d_njs Naps)
-  !! where # denotes the convolution.
-  USE array, ONLY : dnjs, Sepj, Sipj
-  USE basic
-  USE fourier
-  USE fields!, ONLY : phi, moments_e, moments_i
-  USE grid
-  USE model
-  USE prec_const
-  USE time_integration!, ONLY : updatetlevel
-  IMPLICIT NONE
-
-  COMPLEX(dp), DIMENSION(ikrs:ikre,ikzs:ikze) :: F_, G_, CONV
-  INTEGER :: in, is
-  REAL(dp):: kr, kz, kerneln, be_2, bi_2, factn
-  REAL(dp):: sigmae2_taue_o2, sigmai2_taui_o2
-
-  ! Execution time start
-  CALL cpu_time(t0_Sapj)
-
-  !!!!!!!!!!!!!!!!!!!! ELECTRON non linear term computation (Sepj)!!!!!!!!!!
-  sigmae2_taue_o2 = sigma_e**2 * tau_e/2._dp ! factor of the kerneln argument
-
-  ploope: DO ip = ips_e,ipe_e ! Loop over Hermite moments
-    jloope: DO ij = ijs_e, ije_e ! Loop over Laguerre moments
-      Sepj(ip,ij,:,:)  = 0._dp
-      factn = 1
-
-      nloope: DO in = 1,jmaxe+1 ! Loop over laguerre for the sum
-
-        krloope1: DO ikr = ikrs,ikre ! Loop over kr
-          kzloope1: DO ikz = ikzs,ikze ! Loop over kz
-            kr     = krarray(ikr)
-            kz     = kzarray(ikz)
-            be_2    = sigmae2_taue_o2 * (kr**2 + kz**2)
-            kerneln = be_2**(in-1)/factn * EXP(-be_2)
-            ! First convolution term
-            IF ( NON_LIN ) THEN
-              F_(ikr,ikz) = imagu*kr* phi(ikr,ikz) * kerneln
-            ELSE
-              F_(ikr,ikz) = 0._dp
-            ENDIF
-            ! Background sinusoidal electrostatic potential phi_0 for KH inst.
-            IF ( q_e .NE. 0._dp ) THEN ! If electron are not removed
-              IF ( kz .EQ. 0._dp ) THEN ! Kronecker kz=0
-                IF ( ABS(kr) .EQ. ABS(kr0KH) ) THEN ! kronecker kr=kr0
-                  F_(ikr,ikz) = -A0KH/2._dp + F_(ikr,ikz)
-                ENDIF
-              ENDIF
-            ENDIF
-            ! Second convolution term
-            G_(ikr,ikz) = 0._dp ! initialization of the sum
-            DO is = 1, MIN( in+ij-1, jmaxe+1 ) ! sum truncation on number of moments
-              G_(ikr,ikz) = G_(ikr,ikz) + &
-               dnjs(in,ij,is) * moments_e(ip,is,ikr,ikz,updatetlevel)
-            ENDDO
-            G_(ikr,ikz) = imagu*kz*G_(ikr,ikz)
-
-          ENDDO kzloope1
-        ENDDO krloope1
-
-        CALL convolve_2D_F2F( F_, G_, CONV ) ! Convolve Fourier to Fourier
-
-        DO ikr = ikrs,ikre ! Loop over kr
-          DO ikz = ikzs,ikze ! Loop over kz
-            Sepj(ip,ij,ikr,ikz) = Sepj(ip,ij,ikr,ikz) + CONV(ikr,ikz) ! Add it to Sepj
-          ENDDO
-        ENDDO
-
-        IF ( NON_LIN ) THEN ! Fully non linear term ikz phi * ikr Napj
-          krloope2: DO ikr = ikrs,ikre ! Loop over kr
-            kzloope2: DO ikz = ikzs,ikze ! Loop over kz
-              kr     = krarray(ikr)
-              kz     = kzarray(ikz)
-              be_2    = sigmae2_taue_o2 * (kr**2 + kz**2)
-              kerneln = be_2**(in-1)/factn * EXP(-be_2)
-              ! First convolution term
-              F_(ikr,ikz) = imagu*kz* phi(ikr,ikz) * kerneln
-              ! Second convolution term
-              G_(ikr,ikz) = 0._dp ! initialization of the sum
-              DO is = 1, MIN( in+ij-1, jmaxe+1 ) ! sum truncation on number of moments
-                G_(ikr,ikz) = G_(ikr,ikz) + &
-                 dnjs(in,ij,is) * moments_e(ip,is,ikr,ikz,updatetlevel)
-              ENDDO
-              G_(ikr,ikz) = imagu*kr*G_(ikr,ikz)
-
-            ENDDO kzloope2
-          ENDDO krloope2
-
-          CALL convolve_2D_F2F( F_, G_, CONV )
-
-          DO ikr = ikrs,ikre
-            DO ikz = ikzs,ikze
-              Sepj(ip,ij,ikr,ikz) = Sepj(ip,ij,ikr,ikz) - CONV(ikr,ikz) ! substract it to Sepj
-            ENDDO
-          ENDDO
-
-        ENDIF
-
-        IF ( in + 1 .LE. jmaxe+1 ) THEN
-          factn = real(in,dp) * factn ! compute (n+1)!
-        ENDIF
-      ENDDO nloope
-
-    ENDDO jloope
-  ENDDO ploope
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-  !!!!!!!!!!!!!!!!!!!! ION non linear term computation (Sipj)!!!!!!!!!!
-  sigmai2_taui_o2 = sigma_i**2 * tau_i/2._dp
-
-  ploopi: DO ip = ips_i,ipe_i ! Loop over Hermite moments
-    jloopi: DO ij = ijs_i, ije_i ! Loop over Laguerre moments
-      Sipj(ip,ij,:,:)  = 0._dp
-      factn = 1
-
-      nloopi: DO in = 1,jmaxi+1 ! Loop over laguerre for the sum
-
-        krloopi1: DO ikr = ikrs,ikre ! Loop over kr
-          kzloopi1: DO ikz = ikzs,ikze ! Loop over kz
-            kr      = krarray(ikr)
-            kz      = kzarray(ikz)
-            bi_2    = sigmai2_taui_o2 * (kr**2 + kz**2)
-            kerneln = bi_2**(in-1)/factn * EXP(-bi_2)
-
-            ! First convolution term
-            IF ( NON_LIN ) THEN
-              F_(ikr,ikz) = imagu*kr* phi(ikr,ikz) * kerneln
-            ELSE
-              F_(ikr,ikz) = 0._dp
-            ENDIF
-            ! Background sinusoidal electrostatic potential phi_0 for KH inst.
-            IF ( kz .EQ. 0._dp ) THEN ! Kronecker kz=0
-              IF ( ABS(kr) .EQ. ABS(kr0KH) ) THEN ! kronecker kr=kr0
-                F_(ikr,ikz) = -A0KH/2._dp + F_(ikr,ikz)
-              ENDIF
-            ENDIF
-            ! Second convolution term
-            G_(ikr,ikz) = 0._dp ! initialization of the sum
-            DO is = 1, MIN( in+ij-1, jmaxi+1 )
-              G_(ikr,ikz) = G_(ikr,ikz) + &
-               dnjs(in,ij,is) * moments_i(ip,is,ikr,ikz,updatetlevel)
-            ENDDO
-            G_(ikr,ikz) = imagu*kz*G_(ikr,ikz)
-
-          ENDDO kzloopi1
-        ENDDO krloopi1
-
-        CALL convolve_2D_F2F( F_, G_, CONV ) ! Convolve Fourier to Fourier
-        DO ikr = ikrs,ikre
-          DO ikz = ikzs,ikze
-            Sipj(ip,ij,ikr,ikz) = Sipj(ip,ij,ikr,ikz) + CONV(ikr,ikz) ! add it to Sipj
-          ENDDO
-        ENDDO
-
-        krloopi2: DO ikr = ikrs,ikre ! Loop over kr
-          kzloopi2: DO ikz = ikzs,ikze ! Loop over kz
-            kr      = krarray(ikr)
-            kz      = kzarray(ikz)
-            bi_2    = sigmai2_taui_o2 * (kr**2 + kz**2)
-            kerneln = bi_2**(in-1)/factn * EXP(-bi_2)
-            ! First convolution term
-            F_(ikr,ikz) = imagu*kz*phi(ikr,ikz) * kerneln
-            ! Second convolution term
-            G_(ikr,ikz) = 0._dp ! initialization of the sum
-            IF ( NON_LIN ) THEN
-              DO is = 1, MIN( in+ij-1, jmaxi+1 )
-                G_(ikr,ikz) = G_(ikr,ikz) + &
-                 dnjs(in,ij,is) * moments_i(ip,is,ikr,ikz,updatetlevel)
-              ENDDO
-              G_(ikr,ikz) = imagu*kr*G_(ikr,ikz)
-            ENDIF
-            ! Background sinusoidal ion denstiy n_i0 for KH inst.
-            IF ( kz .EQ. 0._dp ) THEN ! Kronecker kz=0
-              IF ( ABS(kr) .EQ. ABS(kr0KH) ) THEN ! kronecker kr=+-kr0
-                G_(ikr,ikz) = -A0KH*kr0KH**2/2._dp + G_(ikr,ikz)
-              ENDIF
-            ENDIF
-
-          ENDDO kzloopi2
-        ENDDO krloopi2
-
-        CALL convolve_2D_F2F( F_, G_, CONV ) ! Convolve and back to Fourier
-        DO ikr = ikrs,ikre
-          DO ikz = ikzs,ikze
-            Sipj(ip,ij,ikr,ikz) = Sipj(ip,ij,ikr,ikz) - CONV(ikr,ikz) ! substract it to Sepj
-          ENDDO
-        ENDDO
-
-        IF ( in + 1 .LE. jmaxi+1 ) THEN
-          factn = real(in,dp) * factn ! factorial(n+1)
-        ENDIF
-
-      ENDDO nloopi
-
-    ENDDO jloopi
-  ENDDO ploopi
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-  ! Execution time end
-  CALL cpu_time(t1_Sapj)
-  tc_Sapj = tc_Sapj + (t1_Sapj - t0_Sapj)
-
-END SUBROUTINE compute_Sapj
+SUBROUTINE compute_Sapj
+  ! This routine is meant to compute the non linear term for each specie and degree
+  !! In real space Sapj ~ b*(grad(phi) x grad(g)) which in moments in fourier becomes
+  !! Sapj = Sum_n (ikr Kn phi)#(ikz Sum_s d_njs Naps) - (ikz Kn phi)#(ikr Sum_s d_njs Naps)
+  !! where # denotes the convolution.
+  USE array, ONLY : dnjs, Sepj, Sipj
+  USE basic
+  USE fourier
+  USE fields!, ONLY : phi, moments_e, moments_i
+  USE grid
+  USE model
+  USE prec_const
+  USE time_integration!, ONLY : updatetlevel
+  IMPLICIT NONE
+  INCLUDE 'fftw3-mpi.f03'
+
+  COMPLEX(dp), DIMENSION(ikrs:ikre,ikzs:ikze) :: Fr_cmpx, Gz_cmpx
+  COMPLEX(dp), DIMENSION(ikrs:ikre,ikzs:ikze) :: Fz_cmpx, Gr_cmpx, F_conv_G
+  REAL(dp),    DIMENSION(irs:ire,izs:ize)     :: fr_real, gz_real
+  REAL(dp),    DIMENSION(irs:ire,izs:ize)     :: fz_real, gr_real, f_times_g
+
+  INTEGER :: in, is
+  REAL(dp):: kr, kz, kerneln, be_2, bi_2, factn
+  REAL(dp):: sigmae2_taue_o2, sigmai2_taui_o2
+
+  ! Execution time start
+  CALL cpu_time(t0_Sapj)
+
+  !!!!!!!!!!!!!!!!!!!! ELECTRON non linear term computation (Sepj)!!!!!!!!!!
+  sigmae2_taue_o2 = sigma_e**2 * tau_e/2._dp ! factor of the kerneln argument
+
+  ploope: DO ip = ips_e,ipe_e ! Loop over Hermite moments
+    jloope: DO ij = ijs_e, ije_e ! Loop over Laguerre moments
+      real_data_c = 0._dp ! initialize sum over real nonlinear term
+      factn = 1
+
+      nloope: DO in = 1,jmaxe+1 ! Loop over laguerre for the sum
+
+        krloope: DO ikr = ikrs,ikre ! Loop over kr
+          kzloope: DO ikz = ikzs,ikze ! Loop over kz
+            kr     = krarray(ikr)
+            kz     = kzarray(ikz)
+            be_2    = sigmae2_taue_o2 * (kr**2 + kz**2)
+            kerneln = be_2**(in-1)/factn * EXP(-be_2)
+            ! First convolution terms
+            Fr_cmpx(ikr,ikz) = imagu*kr* phi(ikr,ikz) * kerneln
+            Fz_cmpx(ikr,ikz) = imagu*kz* phi(ikr,ikz) * kerneln
+            ! Second convolution terms
+            Gz_cmpx(ikr,ikz) = 0._dp ! initialization of the sum
+            Gr_cmpx(ikr,ikz) = 0._dp ! initialization of the sum
+            DO is = 1, MIN( in+ij-1, jmaxe+1 ) ! sum truncation on number of moments
+              Gz_cmpx(ikr,ikz) = Gz_cmpx(ikr,ikz) + &
+               dnjs(in,ij,is) * moments_e(ip,is,ikr,ikz,updatetlevel)
+              Gr_cmpx(ikr,ikz) = Gr_cmpx(ikr,ikz) + &
+               dnjs(in,ij,is) * moments_e(ip,is,ikr,ikz,updatetlevel)
+            ENDDO
+            Gz_cmpx(ikr,ikz) = imagu*kz*Gz_cmpx(ikr,ikz)
+            Gr_cmpx(ikr,ikz) = imagu*kr*Gr_cmpx(ikr,ikz)
+          ENDDO kzloope
+        ENDDO krloope
+
+        ! First term drphi x dzf
+        DO ikr = ikrs, ikre
+          DO ikz = ikzs, ikze
+            cmpx_data_f(ikz,ikr-local_nkr_offset) = Fr_cmpx(ikr,ikz)
+            cmpx_data_g(ikz,ikr-local_nkr_offset) = Gz_cmpx(ikr,ikz)
+          ENDDO
+        ENDDO
+
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_f, real_data_f)
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_g, real_data_g)
+
+        real_data_c = real_data_c + real_data_f/Nz/Nr  * real_data_g/Nz/Nr
+
+        ! Second term -dzphi x drf
+        DO ikr = ikrs, ikre
+          DO ikz = ikzs, ikze
+            cmpx_data_f(ikz,ikr-local_nkr_offset) = Fz_cmpx(ikr,ikz)
+            cmpx_data_g(ikz,ikr-local_nkr_offset) = Gr_cmpx(ikr,ikz)
+          ENDDO
+        ENDDO
+
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_f, real_data_f)
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_g, real_data_g)
+
+        real_data_c = real_data_c - real_data_f/Nz/Nr  * real_data_g/Nz/Nr
+
+        IF ( in + 1 .LE. jmaxe+1 ) THEN
+          factn = real(in,dp) * factn ! compute (n+1)!
+        ENDIF
+      ENDDO nloope
+
+      ! Put the real nonlinear product into k-space
+      call fftw_mpi_execute_dft_r2c(planf, real_data_c, cmpx_data_c)
+
+      ! Retrieve convolution in input format
+      DO ikr = ikrs, ikre
+        DO ikz = ikzs, ikze
+          Sepj(ip,ij,ikr,ikz) = cmpx_data_c(ikz,ikr-local_nkr_offset)*AA_r(ikr)*AA_z(ikz)
+        ENDDO
+      ENDDO
+
+    ENDDO jloope
+  ENDDO ploope
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+  !!!!!!!!!!!!!!!!!!!! ION non linear term computation (Sipj)!!!!!!!!!!
+  sigmai2_taui_o2 = sigma_i**2 * tau_i/2._dp ! factor of the kerneln argument
+
+  ploopi: DO ip = ips_e,ipe_e ! Loop over Hermite moments
+    jloopi: DO ij = ijs_e, ije_e ! Loop over Laguerre moments
+      real_data_c = 0._dp ! initialize sum over real nonlinear term
+      factn = 1
+
+      nloopi: DO in = 1,jmaxi+1 ! Loop over laguerre for the sum
+
+        krloopi: DO ikr = ikrs,ikre ! Loop over kr
+          kzloopi: DO ikz = ikzs,ikze ! Loop over kz
+            kr      = krarray(ikr)
+            kz      = kzarray(ikz)
+            bi_2    = sigmai2_taui_o2 * (kr**2 + kz**2)
+            kerneln = bi_2**(in-1)/factn * EXP(-bi_2)
+            ! First convolution terms
+            Fr_cmpx(ikr,ikz) = imagu*kr* phi(ikr,ikz) * kerneln
+            Fz_cmpx(ikr,ikz) = imagu*kz* phi(ikr,ikz) * kerneln
+            ! Second convolution terms
+            Gz_cmpx(ikr,ikz) = 0._dp ! initialization of the sum
+            Gr_cmpx(ikr,ikz) = 0._dp ! initialization of the sum
+            DO is = 1, MIN( in+ij-1, jmaxi+1 ) ! sum truncation on number of moments
+              Gz_cmpx(ikr,ikz) = Gz_cmpx(ikr,ikz) + &
+               dnjs(in,ij,is) * moments_i(ip,is,ikr,ikz,updatetlevel)
+              Gr_cmpx(ikr,ikz) = Gr_cmpx(ikr,ikz) + &
+               dnjs(in,ij,is) * moments_i(ip,is,ikr,ikz,updatetlevel)
+            ENDDO
+            Gz_cmpx(ikr,ikz) = imagu*kz*Gz_cmpx(ikr,ikz)
+            Gr_cmpx(ikr,ikz) = imagu*kr*Gr_cmpx(ikr,ikz)
+          ENDDO kzloopi
+        ENDDO krloopi
+
+        ! First term drphi x dzf
+        DO ikr = ikrs, ikre
+          DO ikz = ikzs, ikze
+            cmpx_data_f(ikz,ikr-local_nkr_offset) = Fr_cmpx(ikr,ikz)
+            cmpx_data_g(ikz,ikr-local_nkr_offset) = Gz_cmpx(ikr,ikz)
+          ENDDO
+        ENDDO
+
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_f, real_data_f)
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_g, real_data_g)
+
+        real_data_c = real_data_c + real_data_f/Nz/Nr  * real_data_g/Nz/Nr
+
+        ! Second term -dzphi x drf
+        DO ikr = ikrs, ikre
+          DO ikz = ikzs, ikze
+            cmpx_data_f(ikz,ikr-local_nkr_offset) = Fz_cmpx(ikr,ikz)
+            cmpx_data_g(ikz,ikr-local_nkr_offset) = Gr_cmpx(ikr,ikz)
+          ENDDO
+        ENDDO
+
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_f, real_data_f)
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_g, real_data_g)
+
+        real_data_c = real_data_c - real_data_f/Nz/Nr  * real_data_g/Nz/Nr
+
+        IF ( in + 1 .LE. jmaxe+1 ) THEN
+          factn = real(in,dp) * factn ! compute (n+1)!
+        ENDIF
+      ENDDO nloopi
+
+      ! Put the real nonlinear product into k-space
+      call fftw_mpi_execute_dft_r2c(planf, real_data_c, cmpx_data_c)
+
+      ! Retrieve convolution in input format
+      DO ikr = ikrs, ikre
+        DO ikz = ikzs, ikze
+          Sipj(ip,ij,ikr,ikz) = cmpx_data_c(ikz,ikr-local_nkr_offset)*AA_r(ikr)*AA_z(ikz)
+        ENDDO
+      ENDDO
+
+    ENDDO jloopi
+  ENDDO ploopi
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+  ! Execution time END
+  CALL cpu_time(t1_Sapj)
+  tc_Sapj = tc_Sapj + (t1_Sapj - t0_Sapj)
+
+END SUBROUTINE compute_Sapj
diff --git a/src/compute_Sapj_old.F90 b/src/compute_Sapj_old.F90
new file mode 100644
index 00000000..b4d02513
--- /dev/null
+++ b/src/compute_Sapj_old.F90
@@ -0,0 +1,207 @@
+SUBROUTINE compute_Sapj
+  ! This routine is meant to compute the non linear term for each specie and degree
+  !! In real space Sapj ~ b*(grad(phi) x grad(g)) which in moments in fourier becomes
+  !! Sapj = Sum_n (ikr Kn phi)#(ikz Sum_s d_njs Naps) - (ikz Kn phi)#(ikr Sum_s d_njs Naps)
+  !! where # denotes the convolution.
+  USE array, ONLY : dnjs, Sepj, Sipj
+  USE basic
+  USE fourier
+  USE fields!, ONLY : phi, moments_e, moments_i
+  USE grid
+  USE model
+  USE prec_const
+  USE time_integration!, ONLY : updatetlevel
+  IMPLICIT NONE
+
+  COMPLEX(dp), DIMENSION(ikrs:ikre,ikzs:ikze) :: F_, G_, CONV
+  INTEGER :: in, is
+  REAL(dp):: kr, kz, kerneln, be_2, bi_2, factn
+  REAL(dp):: sigmae2_taue_o2, sigmai2_taui_o2
+
+  ! Execution time start
+  CALL cpu_time(t0_Sapj)
+
+  !!!!!!!!!!!!!!!!!!!! ELECTRON non linear term computation (Sepj)!!!!!!!!!!
+  sigmae2_taue_o2 = sigma_e**2 * tau_e/2._dp ! factor of the kerneln argument
+
+  ploope: DO ip = ips_e,ipe_e ! Loop over Hermite moments
+    jloope: DO ij = ijs_e, ije_e ! Loop over Laguerre moments
+      Sepj(ip,ij,:,:)  = 0._dp
+      factn = 1
+
+      nloope: DO in = 1,jmaxe+1 ! Loop over laguerre for the sum
+
+        krloope1: DO ikr = ikrs,ikre ! Loop over kr
+          kzloope1: DO ikz = ikzs,ikze ! Loop over kz
+            kr     = krarray(ikr)
+            kz     = kzarray(ikz)
+            be_2    = sigmae2_taue_o2 * (kr**2 + kz**2)
+            kerneln = be_2**(in-1)/factn * EXP(-be_2)
+            ! First convolution term
+            IF ( NON_LIN ) THEN
+              F_(ikr,ikz) = imagu*kr* phi(ikr,ikz) * kerneln
+            ELSE
+              F_(ikr,ikz) = 0._dp
+            ENDIF
+            ! Background sinusoidal electrostatic potential phi_0 for KH inst.
+            IF ( q_e .NE. 0._dp ) THEN ! If electron are not removed
+              IF ( kz .EQ. 0._dp ) THEN ! Kronecker kz=0
+                IF ( ABS(kr) .EQ. ABS(kr0KH) ) THEN ! kronecker kr=kr0
+                  F_(ikr,ikz) = -A0KH/2._dp + F_(ikr,ikz)
+                ENDIF
+              ENDIF
+            ENDIF
+            ! Second convolution term
+            G_(ikr,ikz) = 0._dp ! initialization of the sum
+            DO is = 1, MIN( in+ij-1, jmaxe+1 ) ! sum truncation on number of moments
+              G_(ikr,ikz) = G_(ikr,ikz) + &
+               dnjs(in,ij,is) * moments_e(ip,is,ikr,ikz,updatetlevel)
+            ENDDO
+            G_(ikr,ikz) = imagu*kz*G_(ikr,ikz)
+
+          ENDDO kzloope1
+        ENDDO krloope1
+
+        CALL convolve_2D_F2F( F_, G_, CONV ) ! Convolve Fourier to Fourier
+
+        DO ikr = ikrs,ikre ! Loop over kr
+          DO ikz = ikzs,ikze ! Loop over kz
+            Sepj(ip,ij,ikr,ikz) = Sepj(ip,ij,ikr,ikz) + CONV(ikr,ikz) ! Add it to Sepj
+          ENDDO
+        ENDDO
+
+        IF ( NON_LIN ) THEN ! Fully non linear term ikz phi * ikr Napj
+          krloope2: DO ikr = ikrs,ikre ! Loop over kr
+            kzloope2: DO ikz = ikzs,ikze ! Loop over kz
+              kr     = krarray(ikr)
+              kz     = kzarray(ikz)
+              be_2    = sigmae2_taue_o2 * (kr**2 + kz**2)
+              kerneln = be_2**(in-1)/factn * EXP(-be_2)
+              ! First convolution term
+              F_(ikr,ikz) = imagu*kz* phi(ikr,ikz) * kerneln
+              ! Second convolution term
+              G_(ikr,ikz) = 0._dp ! initialization of the sum
+              DO is = 1, MIN( in+ij-1, jmaxe+1 ) ! sum truncation on number of moments
+                G_(ikr,ikz) = G_(ikr,ikz) + &
+                 dnjs(in,ij,is) * moments_e(ip,is,ikr,ikz,updatetlevel)
+              ENDDO
+              G_(ikr,ikz) = imagu*kr*G_(ikr,ikz)
+
+            ENDDO kzloope2
+          ENDDO krloope2
+
+          CALL convolve_2D_F2F( F_, G_, CONV )
+
+          DO ikr = ikrs,ikre
+            DO ikz = ikzs,ikze
+              Sepj(ip,ij,ikr,ikz) = Sepj(ip,ij,ikr,ikz) - CONV(ikr,ikz) ! substract it to Sepj
+            ENDDO
+          ENDDO
+
+        ENDIF
+
+        IF ( in + 1 .LE. jmaxe+1 ) THEN
+          factn = real(in,dp) * factn ! compute (n+1)!
+        ENDIF
+      ENDDO nloope
+
+    ENDDO jloope
+  ENDDO ploope
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+  !!!!!!!!!!!!!!!!!!!! ION non linear term computation (Sipj)!!!!!!!!!!
+  sigmai2_taui_o2 = sigma_i**2 * tau_i/2._dp
+
+  ploopi: DO ip = ips_i,ipe_i ! Loop over Hermite moments
+    jloopi: DO ij = ijs_i, ije_i ! Loop over Laguerre moments
+      Sipj(ip,ij,:,:)  = 0._dp
+      factn = 1
+
+      nloopi: DO in = 1,jmaxi+1 ! Loop over laguerre for the sum
+
+        krloopi1: DO ikr = ikrs,ikre ! Loop over kr
+          kzloopi1: DO ikz = ikzs,ikze ! Loop over kz
+            kr      = krarray(ikr)
+            kz      = kzarray(ikz)
+            bi_2    = sigmai2_taui_o2 * (kr**2 + kz**2)
+            kerneln = bi_2**(in-1)/factn * EXP(-bi_2)
+
+            ! First convolution term
+            IF ( NON_LIN ) THEN
+              F_(ikr,ikz) = imagu*kr* phi(ikr,ikz) * kerneln
+            ELSE
+              F_(ikr,ikz) = 0._dp
+            ENDIF
+            ! Background sinusoidal electrostatic potential phi_0 for KH inst.
+            IF ( kz .EQ. 0._dp ) THEN ! Kronecker kz=0
+              IF ( ABS(kr) .EQ. ABS(kr0KH) ) THEN ! kronecker kr=kr0
+                F_(ikr,ikz) = -A0KH/2._dp + F_(ikr,ikz)
+              ENDIF
+            ENDIF
+            ! Second convolution term
+            G_(ikr,ikz) = 0._dp ! initialization of the sum
+            DO is = 1, MIN( in+ij-1, jmaxi+1 )
+              G_(ikr,ikz) = G_(ikr,ikz) + &
+               dnjs(in,ij,is) * moments_i(ip,is,ikr,ikz,updatetlevel)
+            ENDDO
+            G_(ikr,ikz) = imagu*kz*G_(ikr,ikz)
+
+          ENDDO kzloopi1
+        ENDDO krloopi1
+
+        CALL convolve_2D_F2F( F_, G_, CONV ) ! Convolve Fourier to Fourier
+        DO ikr = ikrs,ikre
+          DO ikz = ikzs,ikze
+            Sipj(ip,ij,ikr,ikz) = Sipj(ip,ij,ikr,ikz) + CONV(ikr,ikz) ! add it to Sipj
+          ENDDO
+        ENDDO
+
+        krloopi2: DO ikr = ikrs,ikre ! Loop over kr
+          kzloopi2: DO ikz = ikzs,ikze ! Loop over kz
+            kr      = krarray(ikr)
+            kz      = kzarray(ikz)
+            bi_2    = sigmai2_taui_o2 * (kr**2 + kz**2)
+            kerneln = bi_2**(in-1)/factn * EXP(-bi_2)
+            ! First convolution term
+            F_(ikr,ikz) = imagu*kz*phi(ikr,ikz) * kerneln
+            ! Second convolution term
+            G_(ikr,ikz) = 0._dp ! initialization of the sum
+            IF ( NON_LIN ) THEN
+              DO is = 1, MIN( in+ij-1, jmaxi+1 )
+                G_(ikr,ikz) = G_(ikr,ikz) + &
+                 dnjs(in,ij,is) * moments_i(ip,is,ikr,ikz,updatetlevel)
+              ENDDO
+              G_(ikr,ikz) = imagu*kr*G_(ikr,ikz)
+            ENDIF
+            ! Background sinusoidal ion denstiy n_i0 for KH inst.
+            IF ( kz .EQ. 0._dp ) THEN ! Kronecker kz=0
+              IF ( ABS(kr) .EQ. ABS(kr0KH) ) THEN ! kronecker kr=+-kr0
+                G_(ikr,ikz) = -A0KH*kr0KH**2/2._dp + G_(ikr,ikz)
+              ENDIF
+            ENDIF
+
+          ENDDO kzloopi2
+        ENDDO krloopi2
+
+        CALL convolve_2D_F2F( F_, G_, CONV ) ! Convolve and back to Fourier
+        DO ikr = ikrs,ikre
+          DO ikz = ikzs,ikze
+            Sipj(ip,ij,ikr,ikz) = Sipj(ip,ij,ikr,ikz) - CONV(ikr,ikz) ! substract it to Sepj
+          ENDDO
+        ENDDO
+
+        IF ( in + 1 .LE. jmaxi+1 ) THEN
+          factn = real(in,dp) * factn ! factorial(n+1)
+        ENDIF
+
+      ENDDO nloopi
+
+    ENDDO jloopi
+  ENDDO ploopi
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+  ! Execution time end
+  CALL cpu_time(t1_Sapj)
+  tc_Sapj = tc_Sapj + (t1_Sapj - t0_Sapj)
+
+END SUBROUTINE compute_Sapj
diff --git a/src/compute_Sapj_opt.F90 b/src/compute_Sapj_opt.F90
new file mode 100644
index 00000000..3fed2287
--- /dev/null
+++ b/src/compute_Sapj_opt.F90
@@ -0,0 +1,189 @@
+SUBROUTINE compute_Sapj
+  ! This routine is meant to compute the non linear term for each specie and degree
+  !! In real space Sapj ~ b*(grad(phi) x grad(g)) which in moments in fourier becomes
+  !! Sapj = Sum_n (ikr Kn phi)#(ikz Sum_s d_njs Naps) - (ikz Kn phi)#(ikr Sum_s d_njs Naps)
+  !! where # denotes the convolution.
+  USE array, ONLY : dnjs, Sepj, Sipj
+  USE basic
+  USE fourier
+  USE fields!, ONLY : phi, moments_e, moments_i
+  USE grid
+  USE model
+  USE prec_const
+  USE time_integration!, ONLY : updatetlevel
+  IMPLICIT NONE
+  INCLUDE 'fftw3-mpi.f03'
+
+  COMPLEX(dp), DIMENSION(ikrs:ikre,ikzs:ikze) :: Fr_cmpx, Gz_cmpx
+  COMPLEX(dp), DIMENSION(ikrs:ikre,ikzs:ikze) :: Fz_cmpx, Gr_cmpx, F_conv_G
+  REAL(dp),    DIMENSION(irs:ire,izs:ize)     :: fr_real, gz_real
+  REAL(dp),    DIMENSION(irs:ire,izs:ize)     :: fz_real, gr_real, f_times_g
+
+  INTEGER :: in, is
+  REAL(dp):: kr, kz, kerneln, be_2, bi_2, factn
+  REAL(dp):: sigmae2_taue_o2, sigmai2_taui_o2
+
+  ! Execution time start
+  CALL cpu_time(t0_Sapj)
+
+  !!!!!!!!!!!!!!!!!!!! ELECTRON non linear term computation (Sepj)!!!!!!!!!!
+  sigmae2_taue_o2 = sigma_e**2 * tau_e/2._dp ! factor of the kerneln argument
+
+  ploope: DO ip = ips_e,ipe_e ! Loop over Hermite moments
+    jloope: DO ij = ijs_e, ije_e ! Loop over Laguerre moments
+      real_data_c = 0._dp ! initialize sum over real nonlinear term
+      factn = 1
+
+      nloope: DO in = 1,jmaxe+1 ! Loop over laguerre for the sum
+
+        krloope: DO ikr = ikrs,ikre ! Loop over kr
+          kzloope: DO ikz = ikzs,ikze ! Loop over kz
+            kr     = krarray(ikr)
+            kz     = kzarray(ikz)
+            be_2    = sigmae2_taue_o2 * (kr**2 + kz**2)
+            kerneln = be_2**(in-1)/factn * EXP(-be_2)
+            ! First convolution terms
+            Fr_cmpx(ikr,ikz) = imagu*kr* phi(ikr,ikz) * kerneln
+            Fz_cmpx(ikr,ikz) = imagu*kz* phi(ikr,ikz) * kerneln
+            ! Second convolution terms
+            Gz_cmpx(ikr,ikz) = 0._dp ! initialization of the sum
+            Gr_cmpx(ikr,ikz) = 0._dp ! initialization of the sum
+            DO is = 1, MIN( in+ij-1, jmaxe+1 ) ! sum truncation on number of moments
+              Gz_cmpx(ikr,ikz) = Gz_cmpx(ikr,ikz) + &
+               dnjs(in,ij,is) * moments_e(ip,is,ikr,ikz,updatetlevel)
+              Gr_cmpx(ikr,ikz) = Gr_cmpx(ikr,ikz) + &
+               dnjs(in,ij,is) * moments_e(ip,is,ikr,ikz,updatetlevel)
+            ENDDO
+            Gz_cmpx(ikr,ikz) = imagu*kz*Gz_cmpx(ikr,ikz)
+            Gr_cmpx(ikr,ikz) = imagu*kr*Gr_cmpx(ikr,ikz)
+          ENDDO kzloope
+        ENDDO krloope
+
+        ! First term drphi x dzf
+        DO ikr = ikrs, ikre
+          DO ikz = ikzs, ikze
+            cmpx_data_f(ikz,ikr-local_nkr_offset) = Fr_cmpx(ikr,ikz)
+            cmpx_data_g(ikz,ikr-local_nkr_offset) = Gz_cmpx(ikr,ikz)
+          ENDDO
+        ENDDO
+
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_f, real_data_f)
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_g, real_data_g)
+
+        real_data_c = real_data_c + real_data_f/Nz/Nr  * real_data_g/Nz/Nr
+
+        ! Second term -dzphi x drf
+        DO ikr = ikrs, ikre
+          DO ikz = ikzs, ikze
+            cmpx_data_f(ikz,ikr-local_nkr_offset) = Fz_cmpx(ikr,ikz)
+            cmpx_data_g(ikz,ikr-local_nkr_offset) = Gr_cmpx(ikr,ikz)
+          ENDDO
+        ENDDO
+
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_f, real_data_f)
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_g, real_data_g)
+
+        real_data_c = real_data_c - real_data_f/Nz/Nr  * real_data_g/Nz/Nr
+
+        IF ( in + 1 .LE. jmaxe+1 ) THEN
+          factn = real(in,dp) * factn ! compute (n+1)!
+        ENDIF
+      ENDDO nloope
+
+      ! Put the real nonlinear product into k-space
+      call fftw_mpi_execute_dft_r2c(planf, real_data_c, cmpx_data_c)
+
+      ! Retrieve convolution in input format
+      DO ikr = ikrs, ikre
+        DO ikz = ikzs, ikze
+          Sepj(ip,ij,ikr,ikz) = cmpx_data_c(ikz,ikr-local_nkr_offset)*AA_r(ikr)*AA_z(ikz)
+        ENDDO
+      ENDDO
+
+    ENDDO jloope
+  ENDDO ploope
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+  !!!!!!!!!!!!!!!!!!!! ION non linear term computation (Sipj)!!!!!!!!!!
+  sigmai2_taui_o2 = sigma_i**2 * tau_i/2._dp ! factor of the kerneln argument
+
+  ploopi: DO ip = ips_e,ipe_e ! Loop over Hermite moments
+    jloopi: DO ij = ijs_e, ije_e ! Loop over Laguerre moments
+      real_data_c = 0._dp ! initialize sum over real nonlinear term
+      factn = 1
+
+      nloopi: DO in = 1,jmaxi+1 ! Loop over laguerre for the sum
+
+        krloopi: DO ikr = ikrs,ikre ! Loop over kr
+          kzloopi: DO ikz = ikzs,ikze ! Loop over kz
+            kr      = krarray(ikr)
+            kz      = kzarray(ikz)
+            bi_2    = sigmai2_taui_o2 * (kr**2 + kz**2)
+            kerneln = bi_2**(in-1)/factn * EXP(-bi_2)
+            ! First convolution terms
+            Fr_cmpx(ikr,ikz) = imagu*kr* phi(ikr,ikz) * kerneln
+            Fz_cmpx(ikr,ikz) = imagu*kz* phi(ikr,ikz) * kerneln
+            ! Second convolution terms
+            Gz_cmpx(ikr,ikz) = 0._dp ! initialization of the sum
+            Gr_cmpx(ikr,ikz) = 0._dp ! initialization of the sum
+            DO is = 1, MIN( in+ij-1, jmaxi+1 ) ! sum truncation on number of moments
+              Gz_cmpx(ikr,ikz) = Gz_cmpx(ikr,ikz) + &
+               dnjs(in,ij,is) * moments_i(ip,is,ikr,ikz,updatetlevel)
+              Gr_cmpx(ikr,ikz) = Gr_cmpx(ikr,ikz) + &
+               dnjs(in,ij,is) * moments_i(ip,is,ikr,ikz,updatetlevel)
+            ENDDO
+            Gz_cmpx(ikr,ikz) = imagu*kz*Gz_cmpx(ikr,ikz)
+            Gr_cmpx(ikr,ikz) = imagu*kr*Gr_cmpx(ikr,ikz)
+          ENDDO kzloopi
+        ENDDO krloopi
+
+        ! First term drphi x dzf
+        DO ikr = ikrs, ikre
+          DO ikz = ikzs, ikze
+            cmpx_data_f(ikz,ikr-local_nkr_offset) = Fr_cmpx(ikr,ikz)
+            cmpx_data_g(ikz,ikr-local_nkr_offset) = Gz_cmpx(ikr,ikz)
+          ENDDO
+        ENDDO
+
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_f, real_data_f)
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_g, real_data_g)
+
+        real_data_c = real_data_c + real_data_f/Nz/Nr  * real_data_g/Nz/Nr
+
+        ! Second term -dzphi x drf
+        DO ikr = ikrs, ikre
+          DO ikz = ikzs, ikze
+            cmpx_data_f(ikz,ikr-local_nkr_offset) = Fz_cmpx(ikr,ikz)
+            cmpx_data_g(ikz,ikr-local_nkr_offset) = Gr_cmpx(ikr,ikz)
+          ENDDO
+        ENDDO
+
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_f, real_data_f)
+        call fftw_mpi_execute_dft_c2r(planb, cmpx_data_g, real_data_g)
+
+        real_data_c = real_data_c - real_data_f/Nz/Nr  * real_data_g/Nz/Nr
+
+        IF ( in + 1 .LE. jmaxe+1 ) THEN
+          factn = real(in,dp) * factn ! compute (n+1)!
+        ENDIF
+      ENDDO nloopi
+
+      ! Put the real nonlinear product into k-space
+      call fftw_mpi_execute_dft_r2c(planf, real_data_c, cmpx_data_c)
+
+      ! Retrieve convolution in input format
+      DO ikr = ikrs, ikre
+        DO ikz = ikzs, ikze
+          Sipj(ip,ij,ikr,ikz) = cmpx_data_c(ikz,ikr-local_nkr_offset)*AA_r(ikr)*AA_z(ikz)
+        ENDDO
+      ENDDO
+
+    ENDDO jloopi
+  ENDDO ploopi
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+  ! Execution time END
+  CALL cpu_time(t1_Sapj)
+  tc_Sapj = tc_Sapj + (t1_Sapj - t0_Sapj)
+
+END SUBROUTINE compute_Sapj
-- 
GitLab