diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91e072bf8..db77c210f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,8 +23,12 @@ add_compile_options (-Wunused)
 add_compile_options (-Wextra)
 add_compile_options (-Werror)
 
+
 # allow deprecated functions
 add_compile_options (-Wno-deprecated-declarations)
+add_compile_options (-Wno-stringop-overflow)
+add_compile_options (-Wno-array-bounds)
+add_compile_options (-Wno-restrict)
 
 option (USE_STATIC_LIBRARIES "Link with static libraries if available" ON)
 
@@ -62,6 +66,13 @@ if (ENABLE_FFT)
     message (STATUS "Found Heffte_DIR: ${Heffte_DIR}")
 endif ()
 
+option (ENABLE_NUFFT "Enable NUFFT transform" OFF)
+if (ENABLE_NUFFT)
+    add_definitions (-DENABLE_NUFFT)
+    find_package(CUFINUFFT REQUIRED)
+    message (STATUS "Found CUFINUFFT_DIR: ${CUFINUFFT_DIR}")
+endif ()
+
 option (ENABLE_SOLVERS "Enable IPPL solvers" OFF)
 
 add_subdirectory (src)
diff --git a/CMakeModules/FindCUFINUFFT.cmake b/CMakeModules/FindCUFINUFFT.cmake
new file mode 100644
index 000000000..202a044a3
--- /dev/null
+++ b/CMakeModules/FindCUFINUFFT.cmake
@@ -0,0 +1,32 @@
+#
+# Find CUFINUFFT includes and library
+#
+# CUFINUFFT_INCLUDE_DIR - where to find cufinufft.h
+# CUFINUFFT_LIBRARY     - libcufinufft.so path
+# CUFINUFFT_FOUND       - do not attempt to use if "no" or undefined.
+
+FIND_PATH(CUFINUFFT_INCLUDE_DIR cufinufft.h
+    HINTS $ENV{CUFINUFFT_INCLUDE_PATH} $ENV{CUFINUFFT_INCLUDE_DIR} $ENV{CUFINUFFT_PREFIX}/include $ENV{CUFINUFFT_DIR}/include ${PROJECT_SOURCE_DIR}/include
+    PATHS ENV CPP_INCLUDE_PATH
+)
+#Static library has some issues and gives a cuda error at the end of compilation
+FIND_LIBRARY(CUFINUFFT_LIBRARY_DIR libcufinufft.so
+    HINTS $ENV{CUFINUFFT_LIBRARY_PATH} $ENV{CUFINUFFT_LIBRARY_DIR} $ENV{CUFINUFFT_PREFIX}/lib $ENV{CUFINUFFT_DIR}/lib $ENV{CUFINUFFT}/lib ${PROJECT_SOURCE_DIR}/lib
+    PATHS ENV LIBRARY_PATH
+)
+
+IF(CUFINUFFT_INCLUDE_DIR AND CUFINUFFT_LIBRARY_DIR)
+    SET( CUFINUFFT_FOUND "YES" )
+    SET( CUFINUFFT_DIR $ENV{CUFINUFFT_DIR} )
+ENDIF()
+
+IF (CUFINUFFT_FOUND)
+   IF (NOT CUFINUFFT_FIND_QUIETLY)
+       MESSAGE(STATUS "Found cufinufft library dir: ${CUFINUFFT_LIBRARY_DIR}")
+       MESSAGE(STATUS "Found cufinufft include dir: ${CUFINUFFT_INCLUDE_DIR}")
+   ENDIF (NOT CUFINUFFT_FIND_QUIETLY)
+ELSE (CUFINUFFT_FOUND)
+    IF (CUFINUFFT_FIND_REQUIRED)
+      MESSAGE(FATAL_ERROR "Could not find CUFINUFFT!")
+  ENDIF (CUFINUFFT_FIND_REQUIRED)
+ENDIF (CUFINUFFT_FOUND)
diff --git a/alpine/CMakeLists.txt b/alpine/CMakeLists.txt
index 68dc41f4b..a3884ca14 100644
--- a/alpine/CMakeLists.txt
+++ b/alpine/CMakeLists.txt
@@ -1,29 +1,22 @@
-file (RELATIVE_PATH _relPath "${CMAKE_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}")
-message (STATUS "Adding index test found in ${_relPath}")
-
-include_directories (
-    ${CMAKE_SOURCE_DIR}/src
-)
-
-link_directories (
-    ${CMAKE_CURRENT_SOURCE_DIR}
-    ${Kokkos_DIR}/..
-)
-
-set (IPPL_LIBS ippl ${MPI_CXX_LIBRARIES})
-set (COMPILE_FLAGS ${OPAL_CXX_FLAGS})
-
-add_executable (PenningTrap PenningTrap.cpp)
-target_link_libraries (PenningTrap ${IPPL_LIBS})
-
-add_executable (UniformPlasmaTest UniformPlasmaTest.cpp)
-target_link_libraries (UniformPlasmaTest ${IPPL_LIBS})
-
-add_executable (LandauDamping LandauDamping.cpp)
-target_link_libraries (LandauDamping ${IPPL_LIBS})
-
-add_executable (BumponTailInstability BumponTailInstability.cpp)
-target_link_libraries (BumponTailInstability ${IPPL_LIBS})
+macro(list_subdirectories retval curdir)
+  file(GLOB sub-dir RELATIVE ${curdir} *)
+  set(list_of_dirs "")
+  foreach(dir ${sub-dir})
+    if(IS_DIRECTORY ${curdir}/${dir})
+      set(list_of_dirs ${list_of_dirs} ${dir})
+    endif()
+  endforeach()
+  set(${retval} ${list_of_dirs})
+endmacro()
+
+#list_subdirectories("TESTS" ${CMAKE_CURRENT_SOURCE_DIR})
+#foreach (test ${TESTS})
+#  add_subdirectory (${test})
+#endforeach()
+
+add_subdirectory (ElectrostaticPIC)
+add_subdirectory (ElectrostaticPIF)
+add_subdirectory (PinT)
 
 # vi: set et ts=4 sw=4 sts=4:
 
diff --git a/alpine/BumponTailInstability.cpp b/alpine/ElectrostaticPIC/BumponTailInstability.cpp
similarity index 99%
rename from alpine/BumponTailInstability.cpp
rename to alpine/ElectrostaticPIC/BumponTailInstability.cpp
index c15cf60aa..07e595cbc 100644
--- a/alpine/BumponTailInstability.cpp
+++ b/alpine/ElectrostaticPIC/BumponTailInstability.cpp
@@ -252,7 +252,7 @@ int main(int argc, char *argv[]){
 
     Vector_t hr = {dx, dy, dz};
     Vector_t origin = {rmin[0], rmin[1], rmin[2]};
-    const double dt = 0.5*dx;//0.05
+    const double dt = std::atof(argv[9]);;//0.5*dx;
 
     const bool isAllPeriodic=true;
     Mesh_t mesh(domain, hr, origin);
@@ -383,6 +383,7 @@ int main(int argc, char *argv[]){
 
     IpplTimings::startTimer(dumpDataTimer);
     P->dumpBumponTail();
+    P->dumpEnergy(totalP);
     P->gatherStatistics(totalP);
     //P->dumpLocalDomains(FL, 0);
     IpplTimings::stopTimer(dumpDataTimer);
@@ -442,6 +443,7 @@ int main(int argc, char *argv[]){
         P->time_m += dt;
         IpplTimings::startTimer(dumpDataTimer);
         P->dumpBumponTail();
+        P->dumpEnergy(totalP);
         P->gatherStatistics(totalP);
         IpplTimings::stopTimer(dumpDataTimer);
         msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl;
diff --git a/alpine/ElectrostaticPIC/CMakeLists.txt b/alpine/ElectrostaticPIC/CMakeLists.txt
new file mode 100644
index 000000000..68dc41f4b
--- /dev/null
+++ b/alpine/ElectrostaticPIC/CMakeLists.txt
@@ -0,0 +1,35 @@
+file (RELATIVE_PATH _relPath "${CMAKE_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}")
+message (STATUS "Adding index test found in ${_relPath}")
+
+include_directories (
+    ${CMAKE_SOURCE_DIR}/src
+)
+
+link_directories (
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${Kokkos_DIR}/..
+)
+
+set (IPPL_LIBS ippl ${MPI_CXX_LIBRARIES})
+set (COMPILE_FLAGS ${OPAL_CXX_FLAGS})
+
+add_executable (PenningTrap PenningTrap.cpp)
+target_link_libraries (PenningTrap ${IPPL_LIBS})
+
+add_executable (UniformPlasmaTest UniformPlasmaTest.cpp)
+target_link_libraries (UniformPlasmaTest ${IPPL_LIBS})
+
+add_executable (LandauDamping LandauDamping.cpp)
+target_link_libraries (LandauDamping ${IPPL_LIBS})
+
+add_executable (BumponTailInstability BumponTailInstability.cpp)
+target_link_libraries (BumponTailInstability ${IPPL_LIBS})
+
+# vi: set et ts=4 sw=4 sts=4:
+
+# Local Variables:
+# mode: cmake
+# cmake-tab-width: 4
+# indent-tabs-mode: nil
+# require-final-newline: nil
+# End:
diff --git a/alpine/ChargedParticles.hpp b/alpine/ElectrostaticPIC/ChargedParticles.hpp
similarity index 83%
rename from alpine/ChargedParticles.hpp
rename to alpine/ElectrostaticPIC/ChargedParticles.hpp
index e64417e19..61730648d 100644
--- a/alpine/ChargedParticles.hpp
+++ b/alpine/ElectrostaticPIC/ChargedParticles.hpp
@@ -508,7 +508,126 @@ class ChargedParticles : public ippl::ParticleBase<PLayout> {
         
         Ippl::Comm->barrier();
      }
-     
+    
+     void dumpLandauParticle(size_type totalP) {
+        
+        auto Eview = E.getView();
+
+        double fieldEnergy, ExAmp;
+        double temp = 0.0;
+
+        Kokkos::parallel_reduce("Ex energy", this->getLocalNum(),
+                                KOKKOS_LAMBDA(const int i, double& valL){
+                                    double myVal = Eview(i)[0] * Eview(i)[0];
+                                    valL += myVal;
+                                }, Kokkos::Sum<double>(temp));
+
+        double globaltemp = 0.0;
+        MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm());
+        double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]);
+        fieldEnergy = globaltemp * volume / totalP ;
+
+        double tempMax = 0.0;
+        Kokkos::parallel_reduce("Ex max norm", this->getLocalNum(),
+                                KOKKOS_LAMBDA(const size_t i, double& valL)
+                                {
+                                    double myVal = std::fabs(Eview(i)[0]);
+                                    if(myVal > valL) valL = myVal;
+                                }, Kokkos::Max<double>(tempMax));
+        ExAmp = 0.0;
+        MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm());
+
+
+        if (Ippl::Comm->rank() == 0) {
+            std::stringstream fname;
+            fname << "data/FieldLandau_";
+            fname << Ippl::Comm->size();
+            fname << ".csv";
+
+
+            Inform csvout(NULL, fname.str().c_str(), Inform::APPEND);
+            csvout.precision(10);
+            csvout.setf(std::ios::scientific, std::ios::floatfield);
+
+            if(time_m == 0.0) {
+                csvout << "time, Ex_field_energy, Ex_max_norm" << endl;
+            }
+
+            csvout << time_m << " "
+                   << fieldEnergy << " "
+                   << ExAmp << endl;
+
+        }
+        
+        Ippl::Comm->barrier();
+     }
+
+
+     void dumpEnergy(size_type /*totalP*/) {
+        
+
+        double potentialEnergy, kineticEnergy;
+        //auto Eview = E.getView();
+        double temp = 0.0;
+
+        //Kokkos::parallel_reduce("Potential energy", this->getLocalNum(),
+        //                        KOKKOS_LAMBDA(const int i, double& valL){
+        //                            double myVal = dot(Eview(i), Eview(i)).apply();
+        //                            valL += myVal;
+        //                        }, Kokkos::Sum<double>(temp));
+
+        double globaltemp = 0.0;
+        //MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm());
+        //double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]);
+        //potentialEnergy = 0.5 * globaltemp * volume / totalP ;
+
+        rho_m = dot(E_m, E_m);
+        potentialEnergy = 0.5 * hr_m[0] * hr_m[1] * hr_m[2] * rho_m.sum();
+
+        auto Pview = P.getView();
+        auto qView = q.getView();
+
+        temp = 0.0;
+
+        Kokkos::parallel_reduce("Kinetic Energy", this->getLocalNum(),
+                                KOKKOS_LAMBDA(const int i, double& valL){
+                                    double myVal = dot(Pview(i), Pview(i)).apply();
+                                    myVal *= -qView(i);
+                                    valL += myVal;
+                                }, Kokkos::Sum<double>(temp));
+
+        temp *= 0.5;
+        globaltemp = 0.0;
+        MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm());
+
+        kineticEnergy = globaltemp;
+
+        if (Ippl::Comm->rank() == 0) {
+            std::stringstream fname;
+            fname << "data/Energy_";
+            fname << Ippl::Comm->size();
+            fname << ".csv";
+
+
+            Inform csvout(NULL, fname.str().c_str(), Inform::APPEND);
+            csvout.precision(10);
+            csvout.setf(std::ios::scientific, std::ios::floatfield);
+
+            if(time_m == 0.0) {
+                csvout << "time, Potential energy, Kinetic energy, Total energy" << endl;
+            }
+
+            csvout << time_m << " "
+                   << potentialEnergy << " "
+                   << kineticEnergy << " "
+                   << potentialEnergy + kineticEnergy << endl;
+
+        }
+        
+        Ippl::Comm->barrier();
+     }
+
+
      void dumpBumponTail() {
 
         const int nghostE = E_m.getNghost();
diff --git a/alpine/LandauDamping.cpp b/alpine/ElectrostaticPIC/LandauDamping.cpp
similarity index 98%
rename from alpine/LandauDamping.cpp
rename to alpine/ElectrostaticPIC/LandauDamping.cpp
index 26f8a3a88..0aed5ebc8 100644
--- a/alpine/LandauDamping.cpp
+++ b/alpine/ElectrostaticPIC/LandauDamping.cpp
@@ -1,6 +1,6 @@
 // Landau Damping Test
 //   Usage:
-//     srun ./LandauDamping <nx> <ny> <nz> <Np> <Nt> <stype> <lbthres> <ovfactor> --info 10
+//     srun ./LandauDamping <nx> <ny> <nz> <Np> <Nt> <stype> <lbthres> <ovfactor> <dt> --info 10
 //     nx       = No. cell-centered points in the x-direction
 //     ny       = No. cell-centered points in the y-direction
 //     nz       = No. cell-centered points in the z-direction
@@ -13,6 +13,7 @@
 //                simulations.
 //     ovfactor = Over-allocation factor for the buffers used in the communication. Typical
 //                values are 1.0, 2.0. Value 1.0 means no over-allocation.
+//     dt       = Time stepize
 //     Example:
 //     srun ./LandauDamping 128 128 128 10000 10 FFT 0.01 2.0 --info 10
 //
@@ -177,6 +178,7 @@ int main(int argc, char *argv[]){
 
     const size_type totalP = std::atoll(argv[4]);
     const unsigned int nt     = std::atoi(argv[5]);
+    const double dt = std::atof(argv[9]);;//0.5*dx;
 
     msg << "Landau damping"
         << endl
@@ -200,7 +202,7 @@ int main(int argc, char *argv[]){
 
     // create mesh and layout objects for this problem domain
     Vector_t kw = {0.5, 0.5, 0.5};
-    double alpha = 0.05;
+    double alpha = 0.5;
     Vector_t rmin(0.0);
     Vector_t rmax = 2 * pi / kw ;
     double dx = rmax[0] / nr[0];
@@ -209,7 +211,6 @@ int main(int argc, char *argv[]){
 
     Vector_t hr = {dx, dy, dz};
     Vector_t origin = {rmin[0], rmin[1], rmin[2]};
-    const double dt = 0.5*dx;
 
     const bool isAllPeriodic=true;
     Mesh_t mesh(domain, hr, origin);
@@ -331,7 +332,8 @@ int main(int argc, char *argv[]){
     P->gatherCIC();
 
     IpplTimings::startTimer(dumpDataTimer);
-    P->dumpLandau();
+    P->dumpLandauParticle(totalP);
+    P->dumpEnergy(totalP);
     P->gatherStatistics(totalP);
     //P->dumpLocalDomains(FL, 0);
     IpplTimings::stopTimer(dumpDataTimer);
@@ -390,7 +392,8 @@ int main(int argc, char *argv[]){
 
         P->time_m += dt;
         IpplTimings::startTimer(dumpDataTimer);
-        P->dumpLandau();
+        P->dumpLandauParticle(totalP);
+        P->dumpEnergy(totalP);
         P->gatherStatistics(totalP);
         IpplTimings::stopTimer(dumpDataTimer);
         msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl;
diff --git a/alpine/PenningTrap.cpp b/alpine/ElectrostaticPIC/PenningTrap.cpp
similarity index 93%
rename from alpine/PenningTrap.cpp
rename to alpine/ElectrostaticPIC/PenningTrap.cpp
index 87a9a6534..4cb27474c 100644
--- a/alpine/PenningTrap.cpp
+++ b/alpine/ElectrostaticPIC/PenningTrap.cpp
@@ -205,17 +205,18 @@ int main(int argc, char *argv[]){
     }
 
     // create mesh and layout objects for this problem domain
-    Vector_t rmin(0.0);
-    Vector_t rmax(20.0);
+    Vector_t rmin = {0.0, 0.0, 0.0};
+    //Vector_t rmax = {20.0, 20.0, 20.0};
+    Vector_t rmax = {25.0, 25.0, 25.0};
     double dx = rmax[0] / nr[0];
     double dy = rmax[1] / nr[1];
     double dz = rmax[2] / nr[2];
 
     Vector_t hr = {dx, dy, dz};
     Vector_t origin = {rmin[0], rmin[1], rmin[2]};
-    unsigned int nrMax = 2048;// Max grid size in our studies
-    double dxFinest = rmax[0] / nrMax;  
-    const double dt = 0.5 * dxFinest;//size of timestep
+    //unsigned int nrMax = 2048;// Max grid size in our studies
+    //double dxFinest = rmax[0] / nrMax;  
+    const double dt = std::atof(argv[9]);;//0.5*dx;
 
     const bool isAllPeriodic=true;
     Mesh_t mesh(domain, hr, origin);
@@ -236,9 +237,12 @@ int main(int argc, char *argv[]){
     for (unsigned d = 0; d<Dim; d++) {
         mu[d] = 0.5 * length[d];
     }
-    sd[0] = 0.15*length[0];
-    sd[1] = 0.05*length[1];
-    sd[2] = 0.20*length[2];
+    //sd[0] = 0.15*length[0];
+    //sd[1] = 0.05*length[1];
+    //sd[2] = 0.20*length[2];
+    sd[0] = 0.10*20.0;//length[0];
+    sd[1] = 0.05*20.0;//length[1];
+    sd[2] = 0.15*20.0;//length[2];
 
     P->E_m.initialize(mesh, FL);
     P->rho_m.initialize(mesh, FL);
@@ -352,6 +356,7 @@ int main(int argc, char *argv[]){
 
     IpplTimings::startTimer(dumpDataTimer);
     P->dumpData();
+    P->dumpEnergy(totalP);
     P->gatherStatistics(totalP);
     //P->dumpLocalDomains(FL, 0);
     IpplTimings::stopTimer(dumpDataTimer);
@@ -381,13 +386,13 @@ int main(int argc, char *argv[]){
             double Eext_y = -(Rview(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2)));
             double Eext_z =  (Rview(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2)));
 
-            Eview(j)[0] += Eext_x;
-            Eview(j)[1] += Eext_y;
-            Eview(j)[2] += Eext_z;
+            Eext_x += Eview(j)[0];
+            Eext_y += Eview(j)[1];
+            Eext_z += Eview(j)[2];
             
-            Pview(j)[0] += alpha * (Eview(j)[0]  + Pview(j)[1] * Bext);
-            Pview(j)[1] += alpha * (Eview(j)[1]  - Pview(j)[0] * Bext);
-            Pview(j)[2] += alpha * Eview(j)[2];
+            Pview(j)[0] += alpha * (Eext_x  + Pview(j)[1] * Bext);
+            Pview(j)[1] += alpha * (Eext_y  - Pview(j)[0] * Bext);
+            Pview(j)[2] += alpha * Eext_z;
         });
         IpplTimings::stopTimer(PTimer);
 
@@ -434,20 +439,22 @@ int main(int argc, char *argv[]){
             double Eext_y = -(R2view(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2)));
             double Eext_z =  (R2view(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2)));
 
-            E2view(j)[0] += Eext_x;
-            E2view(j)[1] += Eext_y;
-            E2view(j)[2] += Eext_z;
-            P2view(j)[0]  = DrInv * ( P2view(j)[0] + alpha * (E2view(j)[0] 
-                            + P2view(j)[1] * Bext + alpha * Bext * E2view(j)[1]) );
-            P2view(j)[1]  = DrInv * ( P2view(j)[1] + alpha * (E2view(j)[1] 
-                            - P2view(j)[0] * Bext - alpha * Bext * E2view(j)[0]) );
-            P2view(j)[2] += alpha * E2view(j)[2];
+            Eext_x += E2view(j)[0];
+            Eext_y += E2view(j)[1];
+            Eext_z += E2view(j)[2];
+            
+            P2view(j)[0]  = DrInv * ( P2view(j)[0] + alpha * (Eext_x 
+                            + P2view(j)[1] * Bext + alpha * Bext * Eext_y) );
+            P2view(j)[1]  = DrInv * ( P2view(j)[1] + alpha * (Eext_y 
+                            - P2view(j)[0] * Bext - alpha * Bext * Eext_x) );
+            P2view(j)[2] += alpha * Eext_z;
         });
         IpplTimings::stopTimer(PTimer);
 
         P->time_m += dt;
         IpplTimings::startTimer(dumpDataTimer);
         P->dumpData();
+        P->dumpEnergy(totalP);
         P->gatherStatistics(totalP);
         IpplTimings::stopTimer(dumpDataTimer);
         msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl;
diff --git a/alpine/UniformPlasmaTest.cpp b/alpine/ElectrostaticPIC/UniformPlasmaTest.cpp
similarity index 100%
rename from alpine/UniformPlasmaTest.cpp
rename to alpine/ElectrostaticPIC/UniformPlasmaTest.cpp
diff --git a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp
new file mode 100644
index 000000000..3ef320c57
--- /dev/null
+++ b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp
@@ -0,0 +1,401 @@
+// Electrostatic Two-stream/Bump-on-tail instability test with Particle-in-Fourier schemes
+//   Usage:
+//     srun ./BumponTailInstabilityPIF <nx> <ny> <nz> <Np> <Nt> <dt> <ShapeType> <degree> <tol> --info 5
+//     nx       = No. of Fourier modes in the x-direction
+//     ny       = No. of Fourier modes in the y-direction
+//     nz       = No. of Fourier modes in the z-direction
+//     Np       = Total no. of macro-particles in the simulation
+//     Nt       = Number of time steps
+//     dt       = Time stepsize
+//     ShapeType = Shape function type B-spline only for the moment
+//     degree = B-spline degree (-1 for delta function)
+//     tol = tolerance of NUFFT
+//     Example:
+//     srun ./BumponTailInstabilityPIF 32 32 32 655360 20 0.05 B-spline 1 1e-4 --info 5
+//
+// Copyright (c) 2023, Sriramkrishnan Muralikrishnan,
+// Jülich Supercomputing Centre, Jülich, Germany.
+// All rights reserved
+//
+// This file is part of IPPL.
+//
+// IPPL is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// You should have received a copy of the GNU General Public License
+// along with IPPL. If not, see <https://www.gnu.org/licenses/>.
+//
+
+#include "ChargedParticlesPIF.hpp"
+#include <string>
+#include <vector>
+#include <iostream>
+#include <cmath>
+#include <set>
+#include <chrono>
+
+#include<Kokkos_Random.hpp>
+
+#include <random>
+#include "Utility/IpplTimings.h"
+
+template <typename T>
+struct Newton1D {
+
+  double tol = 1e-12;
+  int max_iter = 20;
+  double pi = std::acos(-1.0);
+  
+  T k, delta, u;
+
+  KOKKOS_INLINE_FUNCTION
+  Newton1D() {}
+
+  KOKKOS_INLINE_FUNCTION
+  Newton1D(const T& k_, const T& delta_, 
+           const T& u_) 
+  : k(k_), delta(delta_), u(u_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~Newton1D() {}
+
+  KOKKOS_INLINE_FUNCTION
+  T f(T& x) {
+      T F;
+      F = x  + (delta  * (std::sin(k * x) / k)) - u;
+      return F;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  T fprime(T& x) {
+      T Fprime;
+      Fprime = 1  + (delta  * std::cos(k * x));
+      return Fprime;
+  }
+
+  KOKKOS_FUNCTION
+  void solve(T& x) {
+      int iterations = 0;
+      while (iterations < max_iter && std::fabs(f(x)) > tol) {
+          x = x - (f(x)/fprime(x));
+          iterations += 1;
+      }
+  }
+};
+
+
+template <typename T, class GeneratorPool, unsigned Dim>
+struct generate_random {
+
+  using view_type = typename ippl::detail::ViewType<T, 1>::view_type;
+  using value_type  = typename T::value_type;
+  // Output View for the random numbers
+  view_type x, v;
+
+  // The GeneratorPool
+  GeneratorPool rand_pool;
+
+  value_type delta, sigma, muBulk, muBeam;
+  size_type nlocBulk; 
+
+  T k, minU, maxU;
+
+  // Initialize all members
+  generate_random(view_type x_, view_type v_, GeneratorPool rand_pool_, 
+                  value_type& delta_, T& k_, value_type& sigma_, 
+                  value_type& muBulk_, value_type& muBeam_, 
+                  size_type& nlocBulk_, T& minU_, T& maxU_)
+      : x(x_), v(v_), rand_pool(rand_pool_), 
+        delta(delta_), sigma(sigma_), muBulk(muBulk_), muBeam(muBeam_),
+        nlocBulk(nlocBulk_), k(k_), minU(minU_), maxU(maxU_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t i) const {
+    // Get a random number state from the pool for the active thread
+    typename GeneratorPool::generator_type rand_gen = rand_pool.get_state();
+
+    bool isBeam = (i >= nlocBulk);
+    
+    value_type muZ = (value_type)(((!isBeam) * muBulk) + (isBeam * muBeam));
+    
+    for (unsigned d = 0; d < Dim-1; ++d) {
+        
+        x(i)[d] = rand_gen.drand(minU[d], maxU[d]); 
+        v(i)[d] = rand_gen.normal(0.0, sigma);
+    }
+    v(i)[Dim-1] = rand_gen.normal(muZ, sigma);
+    
+    value_type u = rand_gen.drand(minU[Dim-1], maxU[Dim-1]);
+    x(i)[Dim-1] = u / (1 + delta);
+    Newton1D<value_type> solver(k[Dim-1], delta, u);
+    solver.solve(x(i)[Dim-1]);
+    
+
+    // Give the state back, which will allow another thread to acquire it
+    rand_pool.free_state(rand_gen);
+  }
+};
+
+double CDF(const double& x, const double& delta, const double& k,
+           const unsigned& dim) {
+
+   bool isDimZ = (dim == (Dim-1)); 
+   double cdf = x + (double)(isDimZ * ((delta / k) * std::sin(k * x)));
+   return cdf;
+}
+
+const char* TestName = "TwoStreamInstabilityPIF";
+
+int main(int argc, char *argv[]){
+    Ippl ippl(argc, argv);
+    
+    Inform msg(TestName);
+    Inform msg2all(TestName,INFORM_ALL_NODES);
+
+    ippl::Vector<int,Dim> nr = {
+        std::atoi(argv[1]),
+        std::atoi(argv[2]),
+        std::atoi(argv[3])
+    };
+
+    static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("mainTimer");
+    static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation");
+    static IpplTimings::TimerRef dumpDataTimer = IpplTimings::getTimer("dumpData");
+    static IpplTimings::TimerRef PTimer = IpplTimings::getTimer("kick");
+    static IpplTimings::TimerRef RTimer = IpplTimings::getTimer("drift");
+    static IpplTimings::TimerRef BCTimer = IpplTimings::getTimer("particleBC");
+    static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF");
+
+    IpplTimings::startTimer(mainTimer);
+
+    const size_type totalP = std::atoll(argv[4]);
+    const unsigned int nt     = std::atoi(argv[5]);
+    const double dt = std::atof(argv[6]);
+
+
+    using bunch_type = ChargedParticlesPIF<PLayout_t>;
+
+    std::unique_ptr<bunch_type>  P;
+
+    ippl::NDIndex<Dim> domain;
+    for (unsigned i = 0; i< Dim; i++) {
+        domain[i] = ippl::Index(nr[i]);
+    }
+
+    ippl::e_dim_tag decomp[Dim];
+    for (unsigned d = 0; d < Dim; ++d) {
+        decomp[d] = ippl::SERIAL;
+    }
+
+    // create mesh and layout objects for this problem domain
+    Vector_t kw;
+    double sigma, muBulk, muBeam, epsilon, delta;
+    
+    if(std::strcmp(TestName,"TwoStreamInstabilityPIF") == 0) {
+        // Parameters for two stream instability as in 
+        //  https://www.frontiersin.org/articles/10.3389/fphy.2018.00105/full
+        kw = {0.5, 0.5, 0.5};
+        sigma = 0.1;
+        epsilon = 0.5;
+        muBulk = -pi / 2.0;
+        muBeam = pi / 2.0;
+        delta = 0.01;
+    }
+    else if(std::strcmp(TestName,"BumponTailInstabilityPIF") == 0) {
+        kw = {0.21, 0.21, 0.21};
+        sigma = 1.0 / std::sqrt(2.0);
+        epsilon = 0.1;
+        muBulk = 0.0;
+        muBeam = 4.0;
+        delta = 0.01;
+    }
+    else {
+        //Default value is two stream instability
+        kw = {0.5, 0.5, 0.5};
+        sigma = 0.1;
+        epsilon = 0.5;
+        muBulk = -pi / 2.0;
+        muBeam = pi / 2.0;
+        delta = 0.01;
+    }
+    
+    
+    Vector_t rmin(0.0);
+    Vector_t rmax = 2 * pi / kw ;
+    double dx = rmax[0] / nr[0];
+    double dy = rmax[1] / nr[1];
+    double dz = rmax[2] / nr[2];
+
+    Vector_t hr = {dx, dy, dz};
+    Vector_t origin = {rmin[0], rmin[1], rmin[2]};
+
+    const bool isAllPeriodic=true;
+    Mesh_t mesh(domain, hr, origin);
+    FieldLayout_t FL(domain, decomp, isAllPeriodic);
+    PLayout_t PL(FL, mesh);
+
+    double factorConf = 1.0/Ippl::Comm->size();
+    double factorVelBulk = 1.0 - epsilon;
+    double factorVelBeam = 1.0 - factorVelBulk;
+    size_type nlocBulk = (size_type)(factorConf * factorVelBulk * totalP);
+    size_type nlocBeam = (size_type)(factorConf * factorVelBeam * totalP);
+    size_type nloc = nlocBulk + nlocBeam;
+    size_type Total_particles = 0;
+
+    MPI_Allreduce(&nloc, &Total_particles, 1,
+                MPI_UNSIGNED_LONG, MPI_SUM, Ippl::getComm());
+    
+    msg << TestName 
+        << endl
+        << "nt " << nt << " Np= "
+        << Total_particles << " Fourier modes = " << nr
+        << endl;
+    
+    
+    //Q = -\int\int f dx dv
+    double Q = -rmax[0] * rmax[1] * rmax[2];
+    P = std::make_unique<bunch_type>(PL,hr,rmin,rmax,decomp,Q,Total_particles);
+
+    P->nr_m = nr;
+
+    P->rho_m.initialize(mesh, FL);
+    P->Sk_m.initialize(mesh, FL);
+
+    ////////////////////////////////////////////////////////////
+    //Initialize an FFT object for getting rho in real space and 
+    //doing charge conservation check
+    
+    ippl::ParameterList fftParams;
+    fftParams.add("use_heffte_defaults", false);  
+    fftParams.add("use_pencils", true);  
+    fftParams.add("use_reorder", false);  
+    fftParams.add("use_gpu_aware", true);  
+    fftParams.add("comm", ippl::p2p_pl);  
+    fftParams.add("r2c_direction", 0);  
+
+    ippl::NDIndex<Dim> domainPIFhalf;
+
+    for(unsigned d = 0; d < Dim; ++d) {
+        if(fftParams.template get<int>("r2c_direction") == (int)d)
+            domainPIFhalf[d] = ippl::Index(domain[d].length()/2 + 1);
+        else
+            domainPIFhalf[d] = ippl::Index(domain[d].length());
+    }
+    
+
+    FieldLayout_t FLPIFhalf(domainPIFhalf, decomp);
+
+    ippl::Vector<double, 3> hDummy = {1.0, 1.0, 1.0};
+    ippl::Vector<double, 3> originDummy = {0.0, 0.0, 0.0};
+    Mesh_t meshPIFhalf(domainPIFhalf, hDummy, originDummy);
+
+    P->rhoPIFreal_m.initialize(mesh, FL);
+    P->rhoPIFhalf_m.initialize(meshPIFhalf, FLPIFhalf);
+
+    P->fft_mp = std::make_shared<FFT_t>(FL, FLPIFhalf, fftParams);
+   
+    ////////////////////////////////////////////////////////////
+
+
+    P->time_m = 0.0;
+
+    P->shapetype_m = argv[7]; 
+    P->shapedegree_m = std::atoi(argv[8]); 
+
+    IpplTimings::startTimer(particleCreation);
+
+    Vector_t minU, maxU;
+    for (unsigned d = 0; d <Dim; ++d) {
+        minU[d] = CDF(rmin[d], delta, kw[d], d);
+        maxU[d]   = CDF(rmax[d], delta, kw[d], d);
+    }
+
+
+    //int rest = (int) (totalP - Total_particles);
+
+    //if ( Ippl::Comm->rank() < rest )
+    //    ++nloc;
+
+    P->create(nloc);
+    Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank()));
+    Kokkos::parallel_for(nloc,
+                         generate_random<Vector_t, Kokkos::Random_XorShift64_Pool<>, Dim>(
+                         P->R.getView(), P->P.getView(), rand_pool64, delta, kw, sigma, muBulk, 
+                         muBeam, nlocBulk, minU, maxU));
+
+    Kokkos::fence();
+    Ippl::Comm->barrier();
+    IpplTimings::stopTimer(particleCreation);                                                    
+    
+    P->q = P->Q_m/Total_particles;
+    msg << "particles created and initial conditions assigned " << endl;
+
+    IpplTimings::startTimer(initializeShapeFunctionPIF);
+    P->initializeShapeFunctionPIF();
+    IpplTimings::stopTimer(initializeShapeFunctionPIF);
+
+    
+    double tol   = std::atof(argv[9]);
+    P->initNUFFT(FL,tol);
+
+    P->scatter();
+
+    P->gather();
+
+    IpplTimings::startTimer(dumpDataTimer);
+    P->dumpBumponTail();
+    P->dumpEnergy();
+    IpplTimings::stopTimer(dumpDataTimer);
+
+    // begin main timestep loop
+    msg << "Starting iterations ..." << endl;
+    for (unsigned int it=0; it<nt; it++) {
+
+        // LeapFrog time stepping https://en.wikipedia.org/wiki/Leapfrog_integration
+        // Here, we assume a constant charge-to-mass ratio of -1 for
+        // all the particles hence eliminating the need to store mass as
+        // an attribute
+        // kick
+
+        IpplTimings::startTimer(PTimer);
+        P->P = P->P - 0.5 * dt * P->E;
+        IpplTimings::stopTimer(PTimer);
+
+        //drift
+        IpplTimings::startTimer(RTimer);
+        P->R = P->R + dt * P->P;
+        IpplTimings::stopTimer(RTimer);
+
+        //Apply particle BC
+	    IpplTimings::startTimer(BCTimer);
+        PL.applyBC(P->R, PL.getRegionLayout().getDomain());
+        IpplTimings::stopTimer(BCTimer);
+
+        //scatter the charge onto the underlying grid
+        P->scatter();
+
+        // Solve for and gather E field
+        P->gather();
+
+        //kick
+        IpplTimings::startTimer(PTimer);
+        P->P = P->P - 0.5 * dt * P->E;
+        IpplTimings::stopTimer(PTimer);
+
+        P->time_m += dt;
+        IpplTimings::startTimer(dumpDataTimer);
+        P->dumpBumponTail();
+        P->dumpEnergy();
+        IpplTimings::stopTimer(dumpDataTimer);
+        msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl;
+    }
+
+    msg << "BumponTailInstability: End." << endl;
+    IpplTimings::stopTimer(mainTimer);
+    IpplTimings::print();
+    IpplTimings::print(std::string("timing.dat"));
+
+    return 0;
+}
diff --git a/alpine/ElectrostaticPIF/CMakeLists.txt b/alpine/ElectrostaticPIF/CMakeLists.txt
new file mode 100644
index 000000000..969202fa2
--- /dev/null
+++ b/alpine/ElectrostaticPIF/CMakeLists.txt
@@ -0,0 +1,32 @@
+file (RELATIVE_PATH _relPath "${CMAKE_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}")
+message (STATUS "Adding index test found in ${_relPath}")
+
+include_directories (
+    ${CMAKE_SOURCE_DIR}/src
+)
+
+link_directories (
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${Kokkos_DIR}/..
+)
+
+set (IPPL_LIBS ippl ${MPI_CXX_LIBRARIES})
+set (COMPILE_FLAGS ${OPAL_CXX_FLAGS})
+
+add_executable (LandauDampingPIF LandauDampingPIF.cpp)
+target_link_libraries (LandauDampingPIF ${IPPL_LIBS})
+
+add_executable (BumponTailInstabilityPIF BumponTailInstabilityPIF.cpp)
+target_link_libraries (BumponTailInstabilityPIF ${IPPL_LIBS})
+
+add_executable (PenningTrapPIF PenningTrapPIF.cpp)
+target_link_libraries (PenningTrapPIF ${IPPL_LIBS})
+
+# vi: set et ts=4 sw=4 sts=4:
+
+# Local Variables:
+# mode: cmake
+# cmake-tab-width: 4
+# indent-tabs-mode: nil
+# require-final-newline: nil
+# End:
diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp
new file mode 100644
index 000000000..ecc061798
--- /dev/null
+++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp
@@ -0,0 +1,703 @@
+// ChargedParticlesPIF header file
+//   Defines a particle attribute for charged particles to be used in
+//   test programs
+//
+// Copyright (c) 2021 Paul Scherrer Institut, Villigen PSI, Switzerland
+// All rights reserved
+//
+// This file is part of IPPL.
+//
+// IPPL is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// You should have received a copy of the GNU General Public License
+// along with IPPL. If not, see <https://www.gnu.org/licenses/>.
+//
+
+#include "Ippl.h"
+
+// dimension of our positions
+constexpr unsigned Dim = 3;
+
+// some typedefs
+typedef ippl::ParticleSpatialLayout<double,Dim>   PLayout_t;
+typedef ippl::UniformCartesian<double, Dim>        Mesh_t;
+typedef ippl::FieldLayout<Dim> FieldLayout_t;
+
+using size_type = ippl::detail::size_type;
+
+template<typename T, unsigned Dim>
+using Vector = ippl::Vector<T, Dim>;
+
+template<typename T, unsigned Dim>
+using Field = ippl::Field<T, Dim>;
+
+template<typename T>
+using ParticleAttrib = ippl::ParticleAttrib<T>;
+
+typedef Vector<double, Dim>  Vector_t;
+typedef Field<double, Dim>   Field_t;
+typedef Field<Kokkos::complex<double>, Dim>   CxField_t;
+typedef Field<Vector_t, Dim> VField_t;
+
+typedef ippl::FFT<ippl::RCTransform, Dim, double> FFT_t;
+
+const double pi = std::acos(-1.0);
+
+// Test programs have to define this variable for VTK dump purposes
+extern const char* TestName;
+
+void dumpVTK(Field_t& rho, int nx, int ny, int nz, int iteration,
+             double dx, double dy, double dz) {
+
+    typename Field_t::view_type::host_mirror_type host_view = rho.getHostMirror();
+
+    std::stringstream fname;
+    fname << "data/scalar_";
+    fname << std::setw(4) << std::setfill('0') << iteration;
+    fname << ".vtk";
+
+    Kokkos::deep_copy(host_view, rho.getView());
+
+    Inform vtkout(NULL, fname.str().c_str(), Inform::OVERWRITE);
+    vtkout.precision(10);
+    vtkout.setf(std::ios::scientific, std::ios::floatfield);
+
+    // start with header
+    vtkout << "# vtk DataFile Version 2.0" << endl;
+    vtkout << TestName << endl;
+    vtkout << "ASCII" << endl;
+    vtkout << "DATASET STRUCTURED_POINTS" << endl;
+    vtkout << "DIMENSIONS " << nx+3 << " " << ny+3 << " " << nz+3 << endl;
+    vtkout << "ORIGIN " << -dx << " " << -dy << " " << -dz << endl;
+    vtkout << "SPACING " << dx << " " << dy << " " << dz << endl;
+    vtkout << "CELL_DATA " << (nx+2)*(ny+2)*(nz+2) << endl;
+
+    vtkout << "SCALARS Rho float" << endl;
+    vtkout << "LOOKUP_TABLE default" << endl;
+    for (int z=0; z<nz+2; z++) {
+        for (int y=0; y<ny+2; y++) {
+            for (int x=0; x<nx+2; x++) {
+
+                vtkout << host_view(x,y,z) << endl;
+            }
+        }
+    }
+}
+
+
+template<class PLayout>
+class ChargedParticlesPIF : public ippl::ParticleBase<PLayout> {
+public:
+    CxField_t rho_m;
+    CxField_t rhoPIFhalf_m;
+    Field_t rhoPIFreal_m;
+    CxField_t rhoDFT_m;
+    Field_t Sk_m;
+
+    Vector<int, Dim> nr_m;
+
+    ippl::e_dim_tag decomp_m[Dim];
+
+    Vector_t hr_m;
+    Vector_t rmin_m;
+    Vector_t rmax_m;
+
+    double Q_m;
+
+    size_type Np_m;
+
+    double time_m;
+
+    double rhoNorm_m;
+
+    std::string shapetype_m;
+
+    int shapedegree_m;
+    std::shared_ptr<FFT_t> fft_mp;
+
+    std::shared_ptr<ippl::FFT<ippl::NUFFTransform, 3, double>> nufftType1_mp,nufftType2_mp;
+
+public:
+    ParticleAttrib<double>     q; // charge
+    typename ippl::ParticleBase<PLayout>::particle_position_type P;  // particle velocity
+    typename ippl::ParticleBase<PLayout>::particle_position_type E;  // electric field at particle position
+
+
+    /*
+      This constructor is mandatory for all derived classes from
+      ParticleBase as the bunch buffer uses this
+    */
+    ChargedParticlesPIF(PLayout& pl)
+    : ippl::ParticleBase<PLayout>(pl)
+    {
+        // register the particle attributes
+        this->addAttribute(q);
+        this->addAttribute(P);
+        this->addAttribute(E);
+    }
+
+    ChargedParticlesPIF(PLayout& pl,
+                     Vector_t hr,
+                     Vector_t rmin,
+                     Vector_t rmax,
+                     ippl::e_dim_tag decomp[Dim],
+                     double Q,
+                     size_type Np)
+    : ippl::ParticleBase<PLayout>(pl)
+    , hr_m(hr)
+    , rmin_m(rmin)
+    , rmax_m(rmax)
+    , Q_m(Q)
+    , Np_m(Np)  
+    {
+        // register the particle attributes
+        this->addAttribute(q);
+        this->addAttribute(P);
+        this->addAttribute(E);
+        setupBCs();
+        for (unsigned int i = 0; i < Dim; i++)
+            decomp_m[i]=decomp[i];
+    }
+
+    ~ChargedParticlesPIF(){ }
+
+    void setupBCs() {
+        setBCAllPeriodic();
+    }
+
+    void initNUFFT(FieldLayout_t& FL, double& tol) {
+        ippl::ParameterList fftParams;
+
+        fftParams.add("gpu_method", 1);
+        fftParams.add("gpu_sort", 0);
+        fftParams.add("gpu_kerevalmeth", 1);
+        fftParams.add("tolerance", tol);
+
+        fftParams.add("use_cufinufft_defaults", false);
+        //fftParams.add("use_cufinufft_defaults", true);
+
+        nufftType1_mp = std::make_shared<ippl::FFT<ippl::NUFFTransform, 3, double>>(FL, this->getLocalNum(), 1, fftParams);
+        nufftType2_mp = std::make_shared<ippl::FFT<ippl::NUFFTransform, 3, double>>(FL, this->getLocalNum(), 2, fftParams);
+    }
+
+    void gather() {
+
+        gatherPIFNUFFT(this->E, rho_m, Sk_m, this->R, nufftType2_mp.get(), q);
+        //gatherPIFNUDFT(this->E, rho_m, Sk_m, this->R);
+
+        //Set the charge back to original as we used this view as a 
+        //temporary buffer during gather
+        q = Q_m / Np_m; 
+
+    }
+
+    void scatter() {
+        
+        Inform m("scatter ");
+        rho_m = {0.0, 0.0};
+        scatterPIFNUFFT(q, rho_m, Sk_m, this->R, nufftType1_mp.get());
+        //rhoDFT_m = {0.0, 0.0};
+        //scatterPIFNUDFT(q, rho_m, Sk_m, this->R);
+
+        //dumpFieldData();
+
+        rho_m = rho_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]));
+    }
+
+
+    void dumpLandau() {
+       
+       
+       double fieldEnergy = 0.0; 
+       double ExAmp = 0.0;
+
+       auto rhoview = rho_m.getView();
+       const int nghost = rho_m.getNghost();
+       using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<Dim>>;
+     
+       const FieldLayout_t& layout = rho_m.getLayout(); 
+       const Mesh_t& mesh = rho_m.get_mesh();
+       const Vector<double, Dim>& dx = mesh.getMeshSpacing();
+       const auto& domain = layout.getDomain();
+       Vector<double, Dim> Len;
+       Vector<int, Dim> N;
+
+       for (unsigned d=0; d < Dim; ++d) {
+           N[d] = domain[d].length();
+           Len[d] = dx[d] * N[d];
+       }
+
+
+       Kokkos::complex<double> imag = {0.0, 1.0};
+       double pi = std::acos(-1.0);
+       Kokkos::parallel_reduce("Ex energy and Max",
+                             mdrange_type({0, 0, 0},
+                                          {N[0],
+                                           N[1],
+                                           N[2]}),
+                             KOKKOS_LAMBDA(const int i,
+                                           const int j,
+                                           const int k,
+                                           double& tlSum,
+                                           double& tlMax)
+       {
+       
+           Vector<int, 3> iVec = {i, j, k};
+           Vector<double, 3> kVec;
+           double Dr = 0.0;
+           for(size_t d = 0; d < Dim; ++d) {
+               kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2));
+               Dr += kVec[d] * kVec[d];
+           }
+
+           Kokkos::complex<double> Ek = {0.0, 0.0}; 
+           auto rho = rhoview(i+nghost,j+nghost,k+nghost);
+           bool isNotZero = (Dr != 0.0);
+           double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); 
+           Ek = -(imag * kVec[0] * rho * factor);
+           double myVal = Ek.real() * Ek.real() + Ek.imag() * Ek.imag();
+
+           tlSum += myVal;
+
+           double myValMax = std::sqrt(myVal);
+
+           if(myValMax > tlMax) tlMax = myValMax;
+
+       }, Kokkos::Sum<double>(fieldEnergy), Kokkos::Max<double>(ExAmp));
+       
+
+       Kokkos::fence();
+       double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]);
+       fieldEnergy *= volume;
+        
+
+       if (Ippl::Comm->rank() == 0) {
+           std::stringstream fname;
+           fname << "data/FieldLandau_";
+           fname << Ippl::Comm->size();
+           fname << ".csv";
+
+
+           Inform csvout(NULL, fname.str().c_str(), Inform::APPEND);
+           csvout.precision(10);
+           csvout.setf(std::ios::scientific, std::ios::floatfield);
+
+           if(time_m == 0.0) {
+               csvout << "time, Ex_field_energy, Ex_max_norm" << endl;
+           }
+
+           csvout << time_m << " "
+                  << fieldEnergy << " "
+                  << ExAmp << endl;
+
+       }
+       
+       Ippl::Comm->barrier();
+    }
+
+
+    void dumpBumponTail() {
+       
+       
+       double fieldEnergy = 0.0; 
+       double EzAmp = 0.0;
+
+       auto rhoview = rho_m.getView();
+       const int nghost = rho_m.getNghost();
+       using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<Dim>>;
+     
+       const FieldLayout_t& layout = rho_m.getLayout(); 
+       const Mesh_t& mesh = rho_m.get_mesh();
+       const Vector<double, Dim>& dx = mesh.getMeshSpacing();
+       const auto& domain = layout.getDomain();
+       Vector<double, Dim> Len;
+       Vector<int, Dim> N;
+
+       for (unsigned d=0; d < Dim; ++d) {
+           N[d] = domain[d].length();
+           Len[d] = dx[d] * N[d];
+       }
+
+
+       Kokkos::complex<double> imag = {0.0, 1.0};
+       double pi = std::acos(-1.0);
+       Kokkos::parallel_reduce("Ez energy and Max",
+                             mdrange_type({0, 0, 0},
+                                          {N[0],
+                                           N[1],
+                                           N[2]}),
+                             KOKKOS_LAMBDA(const int i,
+                                           const int j,
+                                           const int k,
+                                           double& tlSum,
+                                           double& tlMax)
+       {
+       
+           Vector<int, 3> iVec = {i, j, k};
+           Vector<double, 3> kVec;
+           double Dr = 0.0;
+           for(size_t d = 0; d < Dim; ++d) {
+               kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2));
+               Dr += kVec[d] * kVec[d];
+           }
+
+           Kokkos::complex<double> Ek = {0.0, 0.0}; 
+           auto rho = rhoview(i+nghost,j+nghost,k+nghost);
+           bool isNotZero = (Dr != 0.0);
+           double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); 
+           Ek = -(imag * kVec[2] * rho * factor);
+           double myVal = Ek.real() * Ek.real() + Ek.imag() * Ek.imag();
+
+           tlSum += myVal;
+
+           double myValMax = std::sqrt(myVal);
+
+           if(myValMax > tlMax) tlMax = myValMax;
+
+       }, Kokkos::Sum<double>(fieldEnergy), Kokkos::Max<double>(EzAmp));
+       
+
+       Kokkos::fence();
+       double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]);
+       fieldEnergy *= volume;
+
+       if (Ippl::Comm->rank() == 0) {
+           std::stringstream fname;
+           fname << "data/FieldBumponTail_";
+           fname << Ippl::Comm->size();
+           fname << ".csv";
+
+
+           Inform csvout(NULL, fname.str().c_str(), Inform::APPEND);
+           csvout.precision(10);
+           csvout.setf(std::ios::scientific, std::ios::floatfield);
+
+           if(time_m == 0.0) {
+               csvout << "time, Ez_field_energy, Ez_max_norm" << endl;
+           }
+
+           csvout << time_m << " "
+                  << fieldEnergy << " "
+                  << EzAmp << endl;
+
+       }
+       
+       Ippl::Comm->barrier();
+    }
+
+
+    void dumpEnergy() {
+       
+
+       double potentialEnergy, kineticEnergy;
+       double temp = 0.0;
+       
+       auto rhoview = rho_m.getView();
+       const int nghost = rho_m.getNghost();
+       using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<Dim>>;
+      
+       const FieldLayout_t& layout = rho_m.getLayout(); 
+       const Mesh_t& mesh = rho_m.get_mesh();
+       const Vector<double, Dim>& dx = mesh.getMeshSpacing();
+       const auto& domain = layout.getDomain();
+       Vector<double, Dim> Len;
+       Vector<int, Dim> N;
+
+       for (unsigned d=0; d < Dim; ++d) {
+           N[d] = domain[d].length();
+           Len[d] = dx[d] * N[d];
+       }
+
+
+       Kokkos::complex<double> imag = {0.0, 1.0};
+       double pi = std::acos(-1.0);
+       Kokkos::parallel_reduce("Potential energy",
+                             mdrange_type({0, 0, 0},
+                                          {N[0],
+                                           N[1],
+                                           N[2]}),
+                             KOKKOS_LAMBDA(const int i,
+                                           const int j,
+                                           const int k,
+                                           double& valL)
+       {
+       
+           Vector<int, 3> iVec = {i, j, k};
+           Vector<double, 3> kVec;
+           double Dr = 0.0;
+           for(size_t d = 0; d < Dim; ++d) {
+               kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2));
+               Dr += kVec[d] * kVec[d];
+           }
+
+           Kokkos::complex<double> Ek = {0.0, 0.0}; 
+           double myVal = 0.0;
+           auto rho = rhoview(i+nghost,j+nghost,k+nghost);
+           for(size_t d = 0; d < Dim; ++d) {
+               bool isNotZero = (Dr != 0.0);
+               double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); 
+               Ek = -(imag * kVec[d] * rho * factor);
+               myVal += Ek.real() * Ek.real() + Ek.imag() * Ek.imag();
+           }
+
+           valL += myVal;
+
+       }, Kokkos::Sum<double>(temp));
+       
+
+       double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]);
+       potentialEnergy = 0.5 * temp * volume;
+
+       auto Pview = P.getView();
+       auto qView = q.getView();
+
+       temp = 0.0;
+
+       Kokkos::parallel_reduce("Kinetic Energy", this->getLocalNum(),
+                               KOKKOS_LAMBDA(const int i, double& valL){
+                                   double myVal = dot(Pview(i), Pview(i)).apply();
+                                   myVal *= -qView(i);
+                                   valL += myVal;
+                               }, Kokkos::Sum<double>(temp));
+
+       temp *= 0.5;
+       double globaltemp = 0.0;
+       MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm());
+
+       kineticEnergy = globaltemp;
+
+       auto rhoPIFhalfview = rhoPIFhalf_m.getView();
+       const int nghostHalf = rhoPIFhalf_m.getNghost();
+      
+       const FieldLayout_t& layoutHalf = rhoPIFhalf_m.getLayout(); 
+       const auto& domainHalf = layoutHalf.getDomain();
+
+       Vector<int, Dim> Nhalf;
+       for (unsigned d=0; d < Dim; ++d) {
+           Nhalf[d] = domainHalf[d].length();
+       }
+
+       //Heffte needs FFTshifted field whereas the field from cuFINUFFT
+       //is not shifted. Hence, here we do the shift. 
+       Kokkos::parallel_for("Transfer complex rho to half domain",
+                             mdrange_type({0, 0, 0},
+                                          {Nhalf[0],
+                                           Nhalf[1],
+                                           Nhalf[2]}),
+                             KOKKOS_LAMBDA(const int i,
+                                           const int j,
+                                           const int k)
+       {
+           Vector<int, 3> iVec = {i, j, k};
+           int shift;
+           for(size_t d = 0; d < Dim; ++d) {
+               bool isLessThanHalf = (iVec[d] < (Nhalf[d]/2));
+               shift = ((int)isLessThanHalf * 2) - 1;
+               iVec[d] = (iVec[d] + shift * (Nhalf[d]/2)) + nghostHalf;
+           }
+           rhoPIFhalfview(Nhalf[0]-1-i+nghostHalf, iVec[1], iVec[2]) = rhoview(i+nghostHalf,j+nghostHalf,k+nghostHalf);
+       });
+
+
+       rhoPIFreal_m = 0.0;
+       fft_mp->transform(-1, rhoPIFreal_m, rhoPIFhalf_m);
+
+
+       rhoPIFreal_m = (1.0/(nr_m[0]*nr_m[1]*nr_m[2])) * volume * rhoPIFreal_m;
+       auto rhoPIFrealview = rhoPIFreal_m.getView();
+       temp = 0.0;
+       Kokkos::parallel_reduce("Rho real sum",
+                             mdrange_type({0, 0, 0},
+                                          {N[0],
+                                           N[1],
+                                           N[2]}),
+                             KOKKOS_LAMBDA(const int i,
+                                           const int j,
+                                           const int k,
+                                           double& valL)
+       {
+         
+           valL += rhoPIFrealview(i+nghost, j+nghost, k+nghost);
+       }, Kokkos::Sum<double>(temp));
+
+       double charge = temp;
+
+       Vector_t totalMomentum = 0.0;
+       
+       for(size_t d = 0; d < Dim; ++d) {
+            double tempD = 0.0;
+            Kokkos::parallel_reduce("Total Momentum", this->getLocalNum(),
+                               KOKKOS_LAMBDA(const int i, double& valL){
+                                   valL  += (-qView(i)) * Pview(i)[d];
+                               }, Kokkos::Sum<double>(tempD));
+            totalMomentum[d] = tempD;
+       }
+       
+       Vector_t globalMom;
+
+       double magMomentum = 0.0;
+       for(size_t d = 0; d < Dim; ++d) {
+           MPI_Allreduce(&totalMomentum[d], &globalMom[d], 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm());
+           magMomentum += globalMom[d] * globalMom[d];
+       }
+
+       magMomentum  = std::sqrt(magMomentum);
+
+       if (Ippl::Comm->rank() == 0) {
+           std::stringstream fname;
+           fname << "data/Energy_";
+           fname << Ippl::Comm->size();
+           fname << ".csv";
+
+
+           Inform csvout(NULL, fname.str().c_str(), Inform::APPEND);
+           csvout.precision(17);
+           csvout.setf(std::ios::scientific, std::ios::floatfield);
+
+           if(time_m == 0.0) {
+               csvout << "time, Potential energy, Kinetic energy, Total energy Total charge Total Momentum" << endl;
+           }
+
+           csvout << time_m << " "
+                  << potentialEnergy << " "
+                  << kineticEnergy << " "
+                  << potentialEnergy + kineticEnergy << " " 
+                  << charge << " "
+                  << magMomentum << endl;
+
+       }
+       
+       Ippl::Comm->barrier();
+    }
+
+
+    void initializeShapeFunctionPIF() {
+
+        using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
+        auto Skview = Sk_m.getView();
+        auto N = nr_m;
+        const int nghost = Sk_m.getNghost();
+        const Mesh_t& mesh = rho_m.get_mesh();
+        const Vector_t& dx = mesh.getMeshSpacing();
+        const Vector_t& Len = rmax_m - rmin_m;
+        const double pi = std::acos(-1.0);
+        int order = shapedegree_m + 1;
+        if(shapetype_m == "Gaussian") {
+
+            throw IpplException("initializeShapeFunctionPIF",
+                                "Gaussian shape function not implemented yet");
+
+        }
+        else if(shapetype_m == "B-spline") {
+
+            Kokkos::parallel_for("B-spline shape functions",
+                                mdrange_type({0, 0, 0},
+                                             {N[0], N[1], N[2]}),
+                                KOKKOS_LAMBDA(const int i,
+                                              const int j,
+                                              const int k)
+            {
+                
+                Vector<int, 3> iVec = {i, j, k};
+                Vector<double, 3> kVec;
+                double Sk = 1.0;
+                for(size_t d = 0; d < Dim; ++d) {
+                    kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2));
+                    double khbytwo = kVec[d] * dx[d] / 2;
+                    bool isNotZero = (khbytwo != 0.0);
+                    double factor = (1.0 / (khbytwo + ((!isNotZero) * 1.0)));
+                    double arg = isNotZero * (Kokkos::sin(khbytwo) * factor) + 
+                                 (!isNotZero) * 1.0;
+                    //Fourier transform of CIC
+                    Sk *= std::pow(arg, order);
+                }
+                    Skview(i+nghost, j+nghost, k+nghost) = Sk;
+            });
+
+        }
+        else {
+            throw IpplException("initializeShapeFunctionPIF",
+                                "Unrecognized shape function type");
+        }
+
+    }
+    
+
+    void dumpFieldData() {
+
+       typename CxField_t::HostMirror rhoNUFFT_host = rho_m.getHostMirror();
+       typename Field_t::HostMirror rhoNUFFT_real = rhoPIFreal_m.getHostMirror();
+       //typename CxField_t::HostMirror rhoNUDFT_host = rhoDFT_m.getHostMirror();
+       Kokkos::deep_copy(rhoNUFFT_host, rho_m.getView());
+       Kokkos::deep_copy(rhoNUFFT_real, rhoPIFreal_m.getView());
+       //Kokkos::deep_copy(rhoNUDFT_host, rhoDFT_m.getView());
+       const int nghost = rho_m.getNghost();
+       std::stringstream pname;
+       pname << "data/FieldFFT_";
+       pname << Ippl::Comm->rank();
+       pname << ".csv";
+       Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank());
+       pcsvout.precision(10);
+       pcsvout.setf(std::ios::scientific, std::ios::floatfield);
+       pcsvout << "rho" << endl;
+       for (int i = 0; i< nr_m[0]; i++) {
+            for (int j = 0; j< nr_m[1]; j++) {
+                for (int k = 0; k< nr_m[2]; k++) {
+                    pcsvout << rhoNUFFT_host(i+nghost,j+nghost, k+nghost) << endl;
+                }
+            }
+       }
+       std::stringstream pname2;
+       pname2 << "data/Fieldreal_";
+       pname2 << Ippl::Comm->rank();
+       pname2 << ".csv";
+       Inform pcsvout2(NULL, pname2.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank());
+       pcsvout2.precision(10);
+       pcsvout2.setf(std::ios::scientific, std::ios::floatfield);
+       pcsvout2 << "rho" << endl;
+       for (int i = 0; i< nr_m[0]; i++) {
+            for (int j = 0; j< nr_m[1]; j++) {
+                for (int k = 0; k< nr_m[2]; k++) {
+                    pcsvout2 << rhoNUFFT_real(i+nghost,j+nghost, k+nghost) << endl;
+                }
+            }
+       }
+       Ippl::Comm->barrier();
+    }
+
+
+    //void dumpParticleData() {
+
+    //   typename ParticleAttrib<Vector_t>::HostMirror R_host = this->R.getHostMirror();
+    //   typename ParticleAttrib<Vector_t>::HostMirror P_host = this->P.getHostMirror();
+    //   Kokkos::deep_copy(R_host, this->R.getView());
+    //   Kokkos::deep_copy(P_host, P.getView());
+    //   std::stringstream pname;
+    //   pname << "data/ParticleIC_";
+    //   pname << Ippl::Comm->rank();
+    //   pname << ".csv";
+    //   Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank());
+    //   pcsvout.precision(10);
+    //   pcsvout.setf(std::ios::scientific, std::ios::floatfield);
+    //   pcsvout << "R_x, R_y, R_z, V_x, V_y, V_z" << endl;
+    //   for (size_type i = 0; i< this->getLocalNum(); i++) {
+    //       pcsvout << R_host(i)[0] << " "
+    //               << R_host(i)[1] << " "
+    //               << R_host(i)[2] << " "
+    //               << P_host(i)[0] << " "
+    //               << P_host(i)[1] << " "
+    //               << P_host(i)[2] << endl;
+    //   }
+    //   Ippl::Comm->barrier();
+    //}
+
+private:
+    void setBCAllPeriodic() {
+
+        this->setParticleBC(ippl::BC::PERIODIC);
+    }
+
+};
diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp
new file mode 100644
index 000000000..e07e249fc
--- /dev/null
+++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp
@@ -0,0 +1,363 @@
+// Electrostatic Landau damping test with Particle-in-Fourier schemes
+//   Usage:
+//     srun ./LandauDampingPIF <nx> <ny> <nz> <Np> <Nt> <dt> <ShapeType> <degree> <tol> --info 5
+//     nx       = No. of Fourier modes in the x-direction
+//     ny       = No. of Fourier modes in the y-direction
+//     nz       = No. of Fourier modes in the z-direction
+//     Np       = Total no. of macro-particles in the simulation
+//     Nt       = Number of time steps
+//     dt       = Time stepsize
+//     ShapeType = Shape function type B-spline only for the moment
+//     degree = B-spline degree (-1 for delta function)
+//     tol = tolerance of NUFFT
+//     Example:
+//     srun ./LandauDampingPIF 32 32 32 655360 20 0.05 B-spline 1 1e-4 --info 5
+//
+// Copyright (c) 2022, Sriramkrishnan Muralikrishnan,
+// Jülich Supercomputing Centre, Jülich, Germany.
+// All rights reserved
+//
+// This file is part of IPPL.
+//
+// IPPL is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// You should have received a copy of the GNU General Public License
+// along with IPPL. If not, see <https://www.gnu.org/licenses/>.
+//
+
+#include "ChargedParticlesPIF.hpp"
+#include <string>
+#include <vector>
+#include <iostream>
+#include <cmath>
+#include <set>
+#include <chrono>
+
+#include<Kokkos_Random.hpp>
+
+#include <random>
+#include "Utility/IpplTimings.h"
+
+template <typename T>
+struct Newton1D {
+
+  double tol = 1e-12;
+  int max_iter = 20;
+  double pi = std::acos(-1.0);
+  
+  T k, alpha, u;
+
+  KOKKOS_INLINE_FUNCTION
+  Newton1D() {}
+
+  KOKKOS_INLINE_FUNCTION
+  Newton1D(const T& k_, const T& alpha_, 
+           const T& u_) 
+  : k(k_), alpha(alpha_), u(u_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~Newton1D() {}
+
+  KOKKOS_INLINE_FUNCTION
+  T f(T& x) {
+      T F;
+      F = x  + (alpha  * (std::sin(k * x) / k)) - u;
+      return F;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  T fprime(T& x) {
+      T Fprime;
+      Fprime = 1  + (alpha  * std::cos(k * x));
+      return Fprime;
+  }
+
+  KOKKOS_FUNCTION
+  void solve(T& x) {
+      int iterations = 0;
+      while (iterations < max_iter && std::fabs(f(x)) > tol) {
+          x = x - (f(x)/fprime(x));
+          iterations += 1;
+      }
+  }
+};
+
+
+template <typename T, class GeneratorPool, unsigned Dim>
+struct generate_random {
+
+  using view_type = typename ippl::detail::ViewType<T, 1>::view_type;
+  using value_type  = typename T::value_type;
+  // Output View for the random numbers
+  view_type x, v;
+
+  // The GeneratorPool
+  GeneratorPool rand_pool;
+
+  value_type alpha;
+
+  T k, minU, maxU;
+
+  // Initialize all members
+  generate_random(view_type x_, view_type v_, GeneratorPool rand_pool_, 
+                  value_type& alpha_, T& k_, T& minU_, T& maxU_)
+      : x(x_), v(v_), rand_pool(rand_pool_), 
+        alpha(alpha_), k(k_), minU(minU_), maxU(maxU_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t i) const {
+    // Get a random number state from the pool for the active thread
+    typename GeneratorPool::generator_type rand_gen = rand_pool.get_state();
+
+    value_type u;
+    for (unsigned d = 0; d < Dim; ++d) {
+
+        u = rand_gen.drand(minU[d], maxU[d]);
+        x(i)[d] = u / (1 + alpha);
+        Newton1D<value_type> solver(k[d], alpha, u);
+        solver.solve(x(i)[d]);
+        v(i)[d] = rand_gen.normal(0.0, 1.0);
+    }
+
+    // Give the state back, which will allow another thread to acquire it
+    rand_pool.free_state(rand_gen);
+  }
+};
+
+double CDF(const double& x, const double& alpha, const double& k) {
+   double cdf = x + (alpha / k) * std::sin(k * x);
+   return cdf;
+}
+
+KOKKOS_FUNCTION
+double PDF(const Vector_t& xvec, const double& alpha, 
+             const Vector_t& kw, const unsigned Dim) {
+    double pdf = 1.0;
+
+    for (unsigned d = 0; d < Dim; ++d) {
+        pdf *= (1.0 + alpha * std::cos(kw[d] * xvec[d]));
+    }
+    return pdf;
+}
+
+const char* TestName = "LandauDampingPIF";
+
+int main(int argc, char *argv[]){
+    Ippl ippl(argc, argv);
+    
+    Inform msg("LandauDampingPIF");
+    Inform msg2all("LandauDampingPIF",INFORM_ALL_NODES);
+
+    ippl::Vector<int,Dim> nr = {
+        std::atoi(argv[1]),
+        std::atoi(argv[2]),
+        std::atoi(argv[3])
+    };
+
+    static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("mainTimer");
+    static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation");
+    static IpplTimings::TimerRef dumpDataTimer = IpplTimings::getTimer("dumpData");
+    static IpplTimings::TimerRef PTimer = IpplTimings::getTimer("kick");
+    static IpplTimings::TimerRef RTimer = IpplTimings::getTimer("drift");
+    static IpplTimings::TimerRef BCTimer = IpplTimings::getTimer("particleBC");
+    static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF");
+
+    IpplTimings::startTimer(mainTimer);
+
+    const size_type totalP = std::atoll(argv[4]);
+    const unsigned int nt     = std::atoi(argv[5]);
+    const double dt = std::atof(argv[6]);
+
+    double factor = 1.0/Ippl::Comm->size();
+    size_type nloc = (size_type)(factor * totalP);
+    size_type Total_particles = 0;
+
+    MPI_Allreduce(&nloc, &Total_particles, 1,
+                MPI_UNSIGNED_LONG, MPI_SUM, Ippl::getComm());
+
+
+    msg << "Landau damping"
+        << endl
+        << "nt " << nt << " Np= "
+        << Total_particles << " Fourier modes = " << nr
+        << endl;
+
+    using bunch_type = ChargedParticlesPIF<PLayout_t>;
+
+    std::unique_ptr<bunch_type>  P;
+
+    ippl::NDIndex<Dim> domain;
+    for (unsigned i = 0; i< Dim; i++) {
+        domain[i] = ippl::Index(nr[i]);
+    }
+
+    ippl::e_dim_tag decomp[Dim];
+    for (unsigned d = 0; d < Dim; ++d) {
+        decomp[d] = ippl::SERIAL;
+    }
+
+    // create mesh and layout objects for this problem domain
+    Vector_t kw = {0.5, 0.5, 0.5};
+    double alpha = 0.05;
+    Vector_t rmin(0.0);
+    Vector_t rmax = 2 * pi / kw;
+    Vector_t length = rmax - rmin;
+    double dx = length[0] / nr[0];
+    double dy = length[1] / nr[1];
+    double dz = length[2] / nr[2];
+
+    Vector_t hr = {dx, dy, dz};
+    Vector_t origin = {rmin[0], rmin[1], rmin[2]};
+
+    const bool isAllPeriodic=true;
+    Mesh_t mesh(domain, hr, origin);
+    FieldLayout_t FL(domain, decomp, isAllPeriodic);
+    PLayout_t PL(FL, mesh);
+
+    //Q = -\int\int f dx dv
+    double Q = -length[0] * length[1] * length[2];
+    P = std::make_unique<bunch_type>(PL,hr,rmin,rmax,decomp,Q,Total_particles);
+
+    P->nr_m = nr;
+
+    P->rho_m.initialize(mesh, FL);
+    P->rhoDFT_m.initialize(mesh, FL);
+    P->Sk_m.initialize(mesh, FL);
+
+    ////////////////////////////////////////////////////////////
+    //Initialize an FFT object for getting rho in real space and 
+    //doing charge conservation check
+    
+    ippl::ParameterList fftParams;
+    fftParams.add("use_heffte_defaults", false);  
+    fftParams.add("use_pencils", true);  
+    fftParams.add("use_reorder", false);  
+    fftParams.add("use_gpu_aware", true);  
+    fftParams.add("comm", ippl::p2p_pl);  
+    fftParams.add("r2c_direction", 0);  
+
+    ippl::NDIndex<Dim> domainPIFhalf;
+
+    for(unsigned d = 0; d < Dim; ++d) {
+        if(fftParams.template get<int>("r2c_direction") == (int)d)
+            domainPIFhalf[d] = ippl::Index(domain[d].length()/2 + 1);
+        else
+            domainPIFhalf[d] = ippl::Index(domain[d].length());
+    }
+    
+
+    FieldLayout_t FLPIFhalf(domainPIFhalf, decomp);
+
+    ippl::Vector<double, 3> hDummy = {1.0, 1.0, 1.0};
+    ippl::Vector<double, 3> originDummy = {0.0, 0.0, 0.0};
+    Mesh_t meshPIFhalf(domainPIFhalf, hDummy, originDummy);
+
+    P->rhoPIFreal_m.initialize(mesh, FL);
+    P->rhoPIFhalf_m.initialize(meshPIFhalf, FLPIFhalf);
+
+    P->fft_mp = std::make_shared<FFT_t>(FL, FLPIFhalf, fftParams);
+   
+    ////////////////////////////////////////////////////////////
+
+
+    P->time_m = 0.0;
+
+    P->shapetype_m = argv[7]; 
+    P->shapedegree_m = std::atoi(argv[8]); 
+
+    IpplTimings::startTimer(particleCreation);
+
+    Vector_t minU, maxU;
+    for (unsigned d = 0; d <Dim; ++d) {
+        minU[d] = CDF(rmin[d], alpha, kw[d]);
+        maxU[d]   = CDF(rmax[d], alpha, kw[d]);
+    }
+
+
+    //int rest = (int) (totalP - Total_particles);
+
+    //if ( Ippl::Comm->rank() < rest )
+    //    ++nloc;
+
+    P->create(nloc);
+    Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank()));
+    Kokkos::parallel_for(nloc,
+                         generate_random<Vector_t, Kokkos::Random_XorShift64_Pool<>, Dim>(
+                         P->R.getView(), P->P.getView(), rand_pool64, alpha, kw, minU, maxU));
+
+    Kokkos::fence();
+    Ippl::Comm->barrier();
+    IpplTimings::stopTimer(particleCreation);                                                    
+    
+    P->q = P->Q_m/Total_particles;
+    msg << "particles created and initial conditions assigned " << endl;
+
+    IpplTimings::startTimer(initializeShapeFunctionPIF);
+    P->initializeShapeFunctionPIF();
+    IpplTimings::stopTimer(initializeShapeFunctionPIF);
+
+    double tol   = std::atof(argv[9]);
+    P->initNUFFT(FL,tol);
+
+    P->scatter();
+
+    P->gather();
+
+    IpplTimings::startTimer(dumpDataTimer);
+    P->dumpBumponTail();
+    P->dumpEnergy();
+    IpplTimings::stopTimer(dumpDataTimer);
+
+    // begin main timestep loop
+    msg << "Starting iterations ..." << endl;
+    for (unsigned int it=0; it<nt; it++) {
+
+        // LeapFrog time stepping https://en.wikipedia.org/wiki/Leapfrog_integration
+        // Here, we assume a constant charge-to-mass ratio of -1 for
+        // all the particles hence eliminating the need to store mass as
+        // an attribute
+        // kick
+
+        IpplTimings::startTimer(PTimer);
+        P->P = P->P - 0.5 * dt * P->E;
+        IpplTimings::stopTimer(PTimer);
+
+        //drift
+        IpplTimings::startTimer(RTimer);
+        P->R = P->R + dt * P->P;
+        IpplTimings::stopTimer(RTimer);
+
+        //Apply particle BC
+	    IpplTimings::startTimer(BCTimer);
+        PL.applyBC(P->R, PL.getRegionLayout().getDomain());
+        IpplTimings::stopTimer(BCTimer);
+
+        //scatter the charge onto the underlying grid
+        P->scatter();
+
+        // Solve for and gather E field
+        P->gather();
+
+        //kick
+        IpplTimings::startTimer(PTimer);
+        P->P = P->P - 0.5 * dt * P->E;
+        IpplTimings::stopTimer(PTimer);
+
+        P->time_m += dt;
+        IpplTimings::startTimer(dumpDataTimer);
+        P->dumpBumponTail();
+        P->dumpEnergy();
+        IpplTimings::stopTimer(dumpDataTimer);
+        msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl;
+    }
+
+    msg << "LandauDamping: End." << endl;
+    IpplTimings::stopTimer(mainTimer);
+    IpplTimings::print();
+    IpplTimings::print(std::string("timing.dat"));
+
+    return 0;
+}
diff --git a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp
new file mode 100644
index 000000000..54984352e
--- /dev/null
+++ b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp
@@ -0,0 +1,401 @@
+// Electrostatic Penning trap test with Particle-in-Fourier schemes
+//   Usage:
+//     srun ./PenningTrapPIF <nx> <ny> <nz> <Np> <Nt> <dt> <ShapeType> <degree> <tol> --info 5
+//     nx       = No. of Fourier modes in the x-direction
+//     ny       = No. of Fourier modes in the y-direction
+//     nz       = No. of Fourier modes in the z-direction
+//     Np       = Total no. of macro-particles in the simulation
+//     Nt       = Number of time steps
+//     dt       = Time stepsize
+//     ShapeType = Shape function type B-spline only for the moment
+//     degree = B-spline degree (-1 for delta function)
+//     tol = tolerance of NUFFT
+//     Example:
+//     srun ./PenningTrapPIF 32 32 32 655360 20 0.05 B-spline 1 1e-4 --info 5
+//
+// Copyright (c) 2023, Sriramkrishnan Muralikrishnan,
+// Jülich Supercomputing Centre, Jülich, Germany.
+// All rights reserved
+//
+// This file is part of IPPL.
+//
+// IPPL is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// You should have received a copy of the GNU General Public License
+// along with IPPL. If not, see <https://www.gnu.org/licenses/>.
+//
+
+#include "ChargedParticlesPIF.hpp"
+#include <string>
+#include <vector>
+#include <iostream>
+#include <cmath>
+#include <set>
+#include <chrono>
+
+#include<Kokkos_Random.hpp>
+
+#include <random>
+#include "Utility/IpplTimings.h"
+
+template <typename T>
+struct Newton1D {
+
+  double tol = 1e-12;
+  int max_iter = 20;
+  double pi = std::acos(-1.0);
+  
+  T mu, sigma, u;
+
+  KOKKOS_INLINE_FUNCTION
+  Newton1D() {}
+
+  KOKKOS_INLINE_FUNCTION
+  Newton1D(const T& mu_, const T& sigma_, 
+           const T& u_) 
+  : mu(mu_), sigma(sigma_), u(u_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~Newton1D() {}
+
+  KOKKOS_INLINE_FUNCTION
+  T f(T& x) {
+      T F;
+      F = std::erf((x - mu)/(sigma * std::sqrt(2.0))) 
+          - 2 * u + 1;
+      return F;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  T fprime(T& x) {
+      T Fprime;
+      Fprime = (1 / sigma) * std::sqrt(2 / pi) * 
+               std::exp(-0.5 * (std::pow(((x - mu) / sigma),2)));
+      return Fprime;
+  }
+
+  KOKKOS_FUNCTION
+  void solve(T& x) {
+      int iterations = 0;
+      while ((iterations < max_iter) && (std::fabs(f(x)) > tol)) {
+          x = x - (f(x)/fprime(x));
+          iterations += 1;
+      }
+  }
+};
+
+
+template <typename T, class GeneratorPool, unsigned Dim>
+struct generate_random {
+
+  using view_type = typename ippl::detail::ViewType<T, 1>::view_type;
+  using value_type  = typename T::value_type;
+  // Output View for the random numbers
+  view_type x, v;
+
+  // The GeneratorPool
+  GeneratorPool rand_pool;
+
+  T mu, sigma, minU, maxU;
+
+  double pi = std::acos(-1.0);
+
+  // Initialize all members
+  generate_random(view_type x_, view_type v_, GeneratorPool rand_pool_,
+                  T& mu_, T& sigma_, T& minU_, T& maxU_)
+      : x(x_), v(v_), rand_pool(rand_pool_), 
+        mu(mu_), sigma(sigma_), minU(minU_), maxU(maxU_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t i) const {
+    // Get a random number state from the pool for the active thread
+    typename GeneratorPool::generator_type rand_gen = rand_pool.get_state();
+
+    value_type u;
+    for (unsigned d = 0; d < Dim; ++d) {
+        u = rand_gen.drand(minU[d], maxU[d]);
+        x(i)[d] = (std::sqrt(pi / 2) * (2 * u - 1)) * 
+                  sigma[d] + mu[d];
+        Newton1D<value_type> solver(mu[d], sigma[d], u);
+        solver.solve(x(i)[d]);
+        v(i)[d] = rand_gen.normal(0.0, 1.0);
+    }
+
+    // Give the state back, which will allow another thread to acquire it
+    rand_pool.free_state(rand_gen);
+  }
+};
+
+double CDF(const double& x, const double& mu, const double& sigma) {
+   double cdf = 0.5 * (1.0 + std::erf((x - mu)/(sigma * std::sqrt(2))));
+   return cdf;
+}
+
+const char* TestName = "PenningTrapPIF";
+
+int main(int argc, char *argv[]){
+    Ippl ippl(argc, argv);
+    
+    Inform msg(TestName);
+    Inform msg2all(TestName,INFORM_ALL_NODES);
+
+    ippl::Vector<int,Dim> nr = {
+        std::atoi(argv[1]),
+        std::atoi(argv[2]),
+        std::atoi(argv[3])
+    };
+
+    static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("mainTimer");
+    static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation");
+    static IpplTimings::TimerRef dumpDataTimer = IpplTimings::getTimer("dumpData");
+    static IpplTimings::TimerRef PTimer = IpplTimings::getTimer("kick");
+    static IpplTimings::TimerRef RTimer = IpplTimings::getTimer("drift");
+    static IpplTimings::TimerRef BCTimer = IpplTimings::getTimer("particleBC");
+    static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF");
+
+    IpplTimings::startTimer(mainTimer);
+
+    const size_type totalP = std::atoll(argv[4]);
+    const unsigned int nt     = std::atoi(argv[5]);
+    const double dt = std::atof(argv[6]);
+
+    double factor = 1.0/Ippl::Comm->size();
+    size_type nloc = (size_type)(factor * totalP);
+    size_type Total_particles = 0;
+
+    MPI_Allreduce(&nloc, &Total_particles, 1,
+                MPI_UNSIGNED_LONG, MPI_SUM, Ippl::getComm());
+
+
+    msg << TestName
+        << endl
+        << "nt " << nt << " Np= "
+        << Total_particles << " Fourier modes = " << nr
+        << endl;
+
+    using bunch_type = ChargedParticlesPIF<PLayout_t>;
+
+    std::unique_ptr<bunch_type>  P;
+
+    ippl::NDIndex<Dim> domain;
+    for (unsigned i = 0; i< Dim; i++) {
+        domain[i] = ippl::Index(nr[i]);
+    }
+
+    ippl::e_dim_tag decomp[Dim];
+    for (unsigned d = 0; d < Dim; ++d) {
+        decomp[d] = ippl::SERIAL;
+    }
+
+    // create mesh and layout objects for this problem domain
+    Vector_t rmin(0.0);
+    Vector_t rmax(25.0);
+    double dx = rmax[0] / nr[0];
+    double dy = rmax[1] / nr[1];
+    double dz = rmax[2] / nr[2];
+
+    Vector_t length = rmax - rmin;
+
+    Vector_t mu, sd;
+
+    for (unsigned d = 0; d<Dim; d++) {
+        mu[d] = 0.5 * length[d];
+    }
+    //sd[0] = 0.15*length[0];
+    //sd[1] = 0.05*length[1];
+    //sd[2] = 0.20*length[2];
+    sd[0] = 0.10*20.0;//length[0];
+    sd[1] = 0.05*20.0;//length[1];
+    sd[2] = 0.15*20.0;//length[2];
+
+    Vector_t hr = {dx, dy, dz};
+    Vector_t origin = {rmin[0], rmin[1], rmin[2]};
+
+    const bool isAllPeriodic=true;
+    Mesh_t mesh(domain, hr, origin);
+    FieldLayout_t FL(domain, decomp, isAllPeriodic);
+    PLayout_t PL(FL, mesh);
+
+    double Q = -1562.5;
+    double Bext = 5.0;
+    P = std::make_unique<bunch_type>(PL,hr,rmin,rmax,decomp,Q,Total_particles);
+
+    P->nr_m = nr;
+
+    P->rho_m.initialize(mesh, FL);
+    P->Sk_m.initialize(mesh, FL);
+
+    ////////////////////////////////////////////////////////////
+    //Initialize an FFT object for getting rho in real space and 
+    //doing charge conservation check
+    
+    ippl::ParameterList fftParams;
+    fftParams.add("use_heffte_defaults", false);  
+    fftParams.add("use_pencils", true);  
+    fftParams.add("use_reorder", false);  
+    fftParams.add("use_gpu_aware", true);  
+    fftParams.add("comm", ippl::p2p_pl);  
+    fftParams.add("r2c_direction", 0);  
+
+    ippl::NDIndex<Dim> domainPIFhalf;
+
+    for(unsigned d = 0; d < Dim; ++d) {
+        if(fftParams.template get<int>("r2c_direction") == (int)d)
+            domainPIFhalf[d] = ippl::Index(domain[d].length()/2 + 1);
+        else
+            domainPIFhalf[d] = ippl::Index(domain[d].length());
+    }
+    
+
+    FieldLayout_t FLPIFhalf(domainPIFhalf, decomp);
+
+    ippl::Vector<double, 3> hDummy = {1.0, 1.0, 1.0};
+    ippl::Vector<double, 3> originDummy = {0.0, 0.0, 0.0};
+    Mesh_t meshPIFhalf(domainPIFhalf, hDummy, originDummy);
+
+    P->rhoPIFreal_m.initialize(mesh, FL);
+    P->rhoPIFhalf_m.initialize(meshPIFhalf, FLPIFhalf);
+
+    P->fft_mp = std::make_shared<FFT_t>(FL, FLPIFhalf, fftParams);
+   
+    ////////////////////////////////////////////////////////////
+
+
+    P->time_m = 0.0;
+
+    P->shapetype_m = argv[7]; 
+    P->shapedegree_m = std::atoi(argv[8]); 
+
+    IpplTimings::startTimer(particleCreation);
+
+    Vector_t minU, maxU;
+    for (unsigned d = 0; d <Dim; ++d) {
+        minU[d] = CDF(rmin[d], mu[d], sd[d]);
+        maxU[d] = CDF(rmax[d], mu[d], sd[d]);
+    }
+
+
+    //int rest = (int) (totalP - Total_particles);
+
+    //if ( Ippl::Comm->rank() < rest )
+    //    ++nloc;
+
+    P->create(nloc);
+    Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank()));
+    Kokkos::parallel_for(nloc,
+                         generate_random<Vector_t, Kokkos::Random_XorShift64_Pool<>, Dim>(
+                         P->R.getView(), P->P.getView(), rand_pool64, mu, sd, minU, maxU));
+
+    Kokkos::fence();
+    Ippl::Comm->barrier();
+    IpplTimings::stopTimer(particleCreation);                                                    
+    
+    P->q = P->Q_m/Total_particles;
+    msg << "particles created and initial conditions assigned " << endl;
+
+    IpplTimings::startTimer(initializeShapeFunctionPIF);
+    P->initializeShapeFunctionPIF();
+    IpplTimings::stopTimer(initializeShapeFunctionPIF);
+
+    double tol   = std::atof(argv[9]);
+    P->initNUFFT(FL,tol);
+
+    P->scatter();
+
+    P->gather();
+
+    IpplTimings::startTimer(dumpDataTimer);
+    //P->dumpEnergy();
+    IpplTimings::stopTimer(dumpDataTimer);
+
+    double alpha = -0.5 * dt;
+    double DrInv = 1.0 / (1 + (std::pow((alpha * Bext), 2))); 
+    // begin main timestep loop
+    msg << "Starting iterations ..." << endl;
+    for (unsigned int it=0; it<nt; it++) {
+
+        // Staggered Leap frog or Boris algorithm as per 
+        // https://www.sciencedirect.com/science/article/pii/S2590055219300526
+        // eqns 4(a)-4(c). Note we don't use the Boris trick here and do
+        // the analytical matrix inversion which is not complex in this case.
+        // Here, we assume a constant charge-to-mass ratio of -1 for
+        // all the particles hence eliminating the need to store mass as
+        // an attribute
+        // kick
+        IpplTimings::startTimer(PTimer);
+        auto Rview = P->R.getView();
+        auto Pview = P->P.getView();
+        auto Eview = P->E.getView();
+        double V0 = 30*rmax[2];
+        Kokkos::parallel_for("Kick1", P->getLocalNum(),
+                              KOKKOS_LAMBDA(const size_t j){
+            double Eext_x = -(Rview(j)[0] - 0.5*rmax[0]) * (V0/(2*std::pow(rmax[2],2)));
+            double Eext_y = -(Rview(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2)));
+            double Eext_z =  (Rview(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2)));
+
+            Eext_x += Eview(j)[0];
+            Eext_y += Eview(j)[1];
+            Eext_z += Eview(j)[2];
+            
+            Pview(j)[0] += alpha * (Eext_x  + Pview(j)[1] * Bext);
+            Pview(j)[1] += alpha * (Eext_y  - Pview(j)[0] * Bext);
+            Pview(j)[2] += alpha * Eext_z;
+        });
+        IpplTimings::stopTimer(PTimer);
+
+        //drift
+        IpplTimings::startTimer(RTimer);
+        P->R = P->R + dt * P->P;
+        IpplTimings::stopTimer(RTimer);
+
+        //Apply particle BC
+	    IpplTimings::startTimer(BCTimer);
+        PL.applyBC(P->R, PL.getRegionLayout().getDomain());
+        IpplTimings::stopTimer(BCTimer);
+
+        //scatter the charge onto the underlying grid
+        P->scatter();
+
+        // Solve for and gather E field
+        P->gather();
+
+        //kick
+        IpplTimings::startTimer(PTimer);
+        auto R2view = P->R.getView();
+        auto P2view = P->P.getView();
+        auto E2view = P->E.getView();
+        Kokkos::parallel_for("Kick2", P->getLocalNum(),
+                              KOKKOS_LAMBDA(const size_t j){
+            double Eext_x = -(R2view(j)[0] - 0.5*rmax[0]) * (V0/(2*std::pow(rmax[2],2)));
+            double Eext_y = -(R2view(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2)));
+            double Eext_z =  (R2view(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2)));
+
+            Eext_x += E2view(j)[0];
+            Eext_y += E2view(j)[1];
+            Eext_z += E2view(j)[2];
+            
+            P2view(j)[0]  = DrInv * ( P2view(j)[0] + alpha * (Eext_x 
+                            + P2view(j)[1] * Bext + alpha * Bext * Eext_y) );
+            P2view(j)[1]  = DrInv * ( P2view(j)[1] + alpha * (Eext_y 
+                            - P2view(j)[0] * Bext - alpha * Bext * Eext_x) );
+            P2view(j)[2] += alpha * Eext_z;
+        });
+        IpplTimings::stopTimer(PTimer);
+
+        P->time_m += dt;
+        IpplTimings::startTimer(dumpDataTimer);
+        //P->dumpEnergy();
+        IpplTimings::stopTimer(dumpDataTimer);
+        msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl;
+    }
+
+    msg << TestName << " End." << endl;
+    IpplTimings::stopTimer(mainTimer);
+    IpplTimings::print();
+    IpplTimings::print(std::string("timing.dat"));
+
+    return 0;
+}
diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp
new file mode 100644
index 000000000..94fd4dd8a
--- /dev/null
+++ b/alpine/PinT/BumponTailInstabilityPinT.cpp
@@ -0,0 +1,770 @@
+// Parallel-in-time (PinT) method Parareal combined with Particle-in-cell
+// and Particle-in-Fourier schemes. The example is electrostatic Landau 
+// damping. The implementation of Parareal follows the open source implementation
+// https://github.com/Parallel-in-Time/PararealF90 by Daniel Ruprecht. The corresponding
+// publication is Ruprecht, Daniel. "Shared memory pipelined parareal." 
+// European Conference on Parallel Processing. Springer, Cham, 2017.
+// 
+//  Usage:
+//  Usage:
+//     srun ./BumponTailInstabilityPinT <nmx> <nmy> <nmz> <nx> <ny> <nz> <Np> <Tend> <dtfine> <dtcoarse> <tolParareal> 
+//          <nCycles> <ShapeType> <degree> <No. of space procs> <No. of time procs> 
+//          <coarseTol> <fineTol> <coarseType> --info 5
+//     nmx       = No. of Fourier modes in the x-direction
+//     nmy       = No. of Fourier modes in the y-direction
+//     nmz       = No. of Fourier modes in the z-direction
+//     nx       = No. of grid points in the x-direction (not used if PIF is also used as coarse propagator)
+//     ny       = No. of grid points in the y-direction (not used if PIF is also used as coarse propagator)
+//     nz       = No. of grid points in the z-direction (not used if PIF is also used as coarse propagator)
+//     Np       = Total no. of macro-particles in the simulation
+//     tolParareal      = Parareal tolerance
+//     nCycles = No. of Parareal blocks/cycles
+//     ShapeType = Shape function type B-spline only for the moment
+//     degree = B-spline degree (-1 for delta function)
+//     No. of space procs = Number of MPI ranks to be used in the spatial parallelization
+//     No. of time procs = Number of MPI ranks to be used in the time parallelization
+//     coarseTol = Coarse tolerance for PIF if we use PIF as a coarse propagator (will not be used when PIC is used)
+//     fineTol = fine tolerance for PIF
+//     coarseType = Type of coarse propagator (PIF or PIC)
+//     Example:
+//     srun ./BumponTailInstabilityPinT 32 32 32 16 16 16 655360 19.2 0.05 0.05 1e-5 1 B-spline 1 4 16 1e-2 1e-4 PIC --info 5
+//
+// Copyright (c) 2022, Sriramkrishnan Muralikrishnan,
+// Jülich Supercomputing Centre, Jülich, Germany.
+// All rights reserved
+//
+// This file is part of IPPL.
+//
+// IPPL is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// You should have received a copy of the GNU General Public License
+// along with IPPL. If not, see <https://www.gnu.org/licenses/>.
+//
+
+#include "ChargedParticlesPinT.hpp"
+#include "StatesSlice.hpp"
+#include <string>
+#include <vector>
+#include <iostream>
+#include <cmath>
+#include <set>
+#include <chrono>
+
+#include<Kokkos_Random.hpp>
+
+#include <random>
+#include "Utility/IpplTimings.h"
+
+
+template <typename T>
+struct Newton1D {
+
+  double tol = 1e-12;
+  int max_iter = 20;
+  double pi = std::acos(-1.0);
+  
+  T k, delta, u;
+
+  KOKKOS_INLINE_FUNCTION
+  Newton1D() {}
+
+  KOKKOS_INLINE_FUNCTION
+  Newton1D(const T& k_, const T& delta_, 
+           const T& u_) 
+  : k(k_), delta(delta_), u(u_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~Newton1D() {}
+
+  KOKKOS_INLINE_FUNCTION
+  T f(T& x) {
+      T F;
+      F = x  + (delta  * (std::sin(k * x) / k)) - u;
+      return F;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  T fprime(T& x) {
+      T Fprime;
+      Fprime = 1  + (delta  * std::cos(k * x));
+      return Fprime;
+  }
+
+  KOKKOS_FUNCTION
+  void solve(T& x) {
+      int iterations = 0;
+      while (iterations < max_iter && std::fabs(f(x)) > tol) {
+          x = x - (f(x)/fprime(x));
+          iterations += 1;
+      }
+  }
+};
+
+
+template <typename T, class GeneratorPool, unsigned Dim>
+struct generate_random {
+
+  using view_type = typename ippl::detail::ViewType<T, 1>::view_type;
+  using value_type  = typename T::value_type;
+  // Output View for the random numbers
+  view_type x, v;
+
+  // The GeneratorPool
+  GeneratorPool rand_pool;
+
+  value_type delta, sigma, muBulk, muBeam;
+  size_type nlocBulk; 
+
+  T k, minU, maxU;
+
+  // Initialize all members
+  generate_random(view_type x_, view_type v_, GeneratorPool rand_pool_, 
+                  value_type& delta_, T& k_, value_type& sigma_, 
+                  value_type& muBulk_, value_type& muBeam_, 
+                  size_type& nlocBulk_, T& minU_, T& maxU_)
+      : x(x_), v(v_), rand_pool(rand_pool_), 
+        delta(delta_), sigma(sigma_), muBulk(muBulk_), muBeam(muBeam_),
+        nlocBulk(nlocBulk_), k(k_), minU(minU_), maxU(maxU_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t i) const {
+    // Get a random number state from the pool for the active thread
+    typename GeneratorPool::generator_type rand_gen = rand_pool.get_state();
+
+    bool isBeam = (i >= nlocBulk);
+    
+    value_type muZ = (value_type)(((!isBeam) * muBulk) + (isBeam * muBeam));
+    
+    for (unsigned d = 0; d < Dim-1; ++d) {
+        
+        x(i)[d] = rand_gen.drand(minU[d], maxU[d]); 
+        v(i)[d] = rand_gen.normal(0.0, sigma);
+    }
+    v(i)[Dim-1] = rand_gen.normal(muZ, sigma);
+    
+    value_type u = rand_gen.drand(minU[Dim-1], maxU[Dim-1]);
+    x(i)[Dim-1] = u / (1 + delta);
+    Newton1D<value_type> solver(k[Dim-1], delta, u);
+    solver.solve(x(i)[Dim-1]);
+    
+
+    // Give the state back, which will allow another thread to acquire it
+    rand_pool.free_state(rand_gen);
+  }
+};
+
+double CDF(const double& x, const double& delta, const double& k,
+           const unsigned& dim) {
+
+   bool isDimZ = (dim == (Dim-1)); 
+   double cdf = x + (double)(isDimZ * ((delta / k) * std::sin(k * x)));
+   return cdf;
+}
+
+double computeRL2Error(ParticleAttrib<Vector_t>& Q, ParticleAttrib<Vector_t>& QprevIter, 
+                      Vector_t& length, MPI_Comm& spaceComm) {
+    
+    auto Qview = Q.getView();
+    auto QprevIterView = QprevIter.getView();
+    double localError = 0.0;
+    double localNorm = 0.0;
+
+    Kokkos::parallel_reduce("Abs. error and norm", Q.size(),
+                            KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){
+                                Vector_t diff = Qview(i) - QprevIterView(i);
+
+                                //This is just to undo the effect of periodic BCs during the 
+                                //error calculation. Otherwise even though the actual error is 
+                                //small the computed error might be very large. 
+                                //The values (e.g. 10) mentioned here are just an adhoc
+                                //value depending on the domain length. 
+                                for (unsigned d = 0; d < 3; ++d) {
+                                    bool isLeft = (diff[d] <= -10.0);
+                                    bool isRight = (diff[d] >= 10.0);
+                                    bool isInside = ((diff[d] > -10.0) && (diff[d] < 10.0));
+                                    diff[d] = (isInside * diff[d]) + (isLeft * (diff[d] + length[d]))
+                                              +(isRight * (diff[d] - length[d]));
+                                }
+
+                                double myValError = dot(diff, diff).apply();
+                                valLError += myValError;
+                                double myValnorm = dot(Qview(i), Qview(i)).apply();
+                                valLnorm += myValnorm;
+                            }, Kokkos::Sum<double>(localError), Kokkos::Sum<double>(localNorm));
+
+    Kokkos::fence();
+    double globalError = 0.0;
+    MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, spaceComm);
+    double globalNorm = 0.0;
+    MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, spaceComm);
+
+    double relError = std::sqrt(globalError) / std::sqrt(globalNorm);
+    
+    return relError;
+
+}
+
+double computePL2Error(ParticleAttrib<Vector_t>& Q, ParticleAttrib<Vector_t>& QprevIter, MPI_Comm& spaceComm) {
+    
+    auto Qview = Q.getView();
+    auto QprevIterView = QprevIter.getView();
+    double localError = 0.0;
+    double localNorm = 0.0;
+
+    Kokkos::parallel_reduce("Abs. error and norm", Q.size(),
+                            KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){
+                                Vector_t diff = Qview(i) - QprevIterView(i);
+                                double myValError = dot(diff, diff).apply();
+                                valLError += myValError;
+                                double myValnorm = dot(Qview(i), Qview(i)).apply();
+                                valLnorm += myValnorm;
+                            }, Kokkos::Sum<double>(localError), Kokkos::Sum<double>(localNorm));
+
+    Kokkos::fence();
+    double globalError = 0.0;
+    MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, spaceComm);
+    double globalNorm = 0.0;
+    MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, spaceComm);
+
+    double relError = std::sqrt(globalError) / std::sqrt(globalNorm);
+    
+    return relError;
+
+}
+
+const char* TestName = "TwoStreamInstability";
+//const char* TestName = "BumponTailInstability";
+
+int main(int argc, char *argv[]){
+    Ippl ippl(argc, argv);
+
+    int spaceColor, timeColor;
+    MPI_Comm spaceComm, timeComm;
+
+    int spaceProcs = std::atoi(argv[15]);
+    int timeProcs = std::atoi(argv[16]);
+    spaceColor = Ippl::Comm->rank() / spaceProcs; 
+    timeColor = Ippl::Comm->rank() % spaceProcs;
+
+    MPI_Comm_split(Ippl::getComm(), spaceColor, Ippl::Comm->rank(), &spaceComm);
+    MPI_Comm_split(Ippl::getComm(), timeColor, Ippl::Comm->rank(), &timeComm);
+
+    int rankSpace, sizeSpace, rankTime, sizeTime;
+    MPI_Comm_rank(spaceComm, &rankSpace);
+    MPI_Comm_size(spaceComm, &sizeSpace);
+
+    MPI_Comm_rank(timeComm, &rankTime);
+    MPI_Comm_size(timeComm, &sizeTime);
+
+
+    Inform msg(TestName, Ippl::Comm->size()-1);
+    Inform msg2all(TestName,INFORM_ALL_NODES);
+
+    ippl::Vector<int,Dim> nmPIF = {
+        std::atoi(argv[1]),
+        std::atoi(argv[2]),
+        std::atoi(argv[3])
+    };
+
+    ippl::Vector<int,Dim> nrPIC = {
+        std::atoi(argv[4]),
+        std::atoi(argv[5]),
+        std::atoi(argv[6])
+    };
+
+    static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("mainTimer");
+    static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation");
+    static IpplTimings::TimerRef timeCommunication = IpplTimings::getTimer("timeCommunication");
+    static IpplTimings::TimerRef deepCopy = IpplTimings::getTimer("deepCopy");
+    static IpplTimings::TimerRef finePropagator = IpplTimings::getTimer("finePropagator");
+    static IpplTimings::TimerRef coarsePropagator = IpplTimings::getTimer("coarsePropagator");
+    static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData");
+    static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors");
+    static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF");
+
+    IpplTimings::startTimer(mainTimer);
+
+    const size_type totalP = std::atoll(argv[7]);
+    const double tEnd = std::atof(argv[8]);
+    const unsigned int nCycles = std::atoi(argv[12]);
+    double tEndCycle = tEnd / nCycles;
+    const double dtSlice = tEndCycle / sizeTime;
+    const double dtFine = std::atof(argv[9]);
+    const double dtCoarse = std::atof(argv[10]);
+    const unsigned int ntFine = std::ceil(dtSlice / dtFine);
+    const unsigned int ntCoarse = std::ceil(dtSlice / dtCoarse);
+    const double tol = std::atof(argv[11]);
+
+    using bunch_type = ChargedParticlesPinT<PLayout_t>;
+    using states_type = StatesSlice<PLayout_t>;
+
+    std::unique_ptr<bunch_type>  Pcoarse;
+    std::unique_ptr<states_type>  Pbegin;
+    std::unique_ptr<states_type>  Pend;
+
+    ippl::NDIndex<Dim> domainPIC;
+    ippl::NDIndex<Dim> domainPIF;
+    for (unsigned i = 0; i< Dim; i++) {
+        domainPIC[i] = ippl::Index(nrPIC[i]);
+        domainPIF[i] = ippl::Index(nmPIF[i]);
+    }
+
+    ippl::e_dim_tag decomp[Dim];
+    for (unsigned d = 0; d < Dim; ++d) {
+        decomp[d] = ippl::SERIAL;
+    }
+
+    // create mesh and layout objects for this problem domain
+    Vector_t kw;
+    double sigma, muBulk, muBeam, epsilon, delta;
+    
+   
+    if(std::strcmp(TestName,"TwoStreamInstability") == 0) {
+        // Parameters for two stream instability as in 
+        //  https://www.frontiersin.org/articles/10.3389/fphy.2018.00105/full
+        kw = {0.5, 0.5, 0.5};
+        sigma = 0.1;
+        epsilon = 0.5;
+        muBulk = -pi / 2.0;
+        muBeam = pi / 2.0;
+        delta = 0.01;
+    }
+    else if(std::strcmp(TestName,"BumponTailInstability") == 0) {
+        kw = {0.21, 0.21, 0.21};
+        sigma = 1.0 / std::sqrt(2.0);
+        epsilon = 0.1;
+        muBulk = 0.0;
+        muBeam = 4.0;
+        delta = 0.01;
+    }
+    else {
+        //Default value is two stream instability
+        kw = {0.5, 0.5, 0.5};
+        sigma = 0.1;
+        epsilon = 0.5;
+        muBulk = -pi / 2.0;
+        muBeam = pi / 2.0;
+        delta = 0.01;
+    }
+    Vector_t rmin(0.0);
+    Vector_t rmax = (2 * pi / kw);
+    Vector_t length = rmax - rmin;
+    double dxPIC = length[0] / nrPIC[0];
+    double dyPIC = length[1] / nrPIC[1];
+    double dzPIC = length[2] / nrPIC[2];
+
+
+    double dxPIF = length[0] / nmPIF[0];
+    double dyPIF = length[1] / nmPIF[1];
+    double dzPIF = length[2] / nmPIF[2];
+    Vector_t hrPIC = {dxPIC, dyPIC, dzPIC};
+    Vector_t hrPIF = {dxPIF, dyPIF, dzPIF};
+    Vector_t origin = {rmin[0], rmin[1], rmin[2]};
+
+    const bool isAllPeriodic=true;
+    Mesh_t meshPIC(domainPIC, hrPIC, origin);
+    Mesh_t meshPIF(domainPIF, hrPIF, origin);
+    FieldLayout_t FLPIC(domainPIC, decomp, isAllPeriodic);
+    FieldLayout_t FLPIF(domainPIF, decomp, isAllPeriodic);
+    PLayout_t PL(FLPIC, meshPIC);
+
+
+    double factorVelBulk = 1.0 - epsilon;
+    double factorVelBeam = 1.0 - factorVelBulk;
+    double factorConf = 1.0 / sizeSpace;
+    size_type nlocBulk = (size_type)(factorConf * factorVelBulk * totalP);
+    size_type nlocBeam = (size_type)(factorConf * factorVelBeam * totalP);
+    size_type nloc = nlocBulk + nlocBeam;
+   
+    size_type Total_particles = 0;
+
+    MPI_Allreduce(&nloc, &Total_particles, 1,
+                MPI_UNSIGNED_LONG, MPI_SUM, spaceComm);
+    
+    //Q = -\int\int f dx dv
+    double Q = -length[0] * length[1] * length[2];
+    Pcoarse = std::make_unique<bunch_type>(PL,hrPIC,rmin,rmax,decomp,Q,Total_particles);
+    Pbegin = std::make_unique<states_type>(PL);
+    Pend = std::make_unique<states_type>(PL);
+
+    Pcoarse->nr_m = nrPIC;
+    Pcoarse->nm_m = nmPIF;
+
+    Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF);
+    Pcoarse->Sk_m.initialize(meshPIF, FLPIF);
+
+    Pcoarse->coarsetype_m = argv[19];
+    
+    if(Pcoarse->coarsetype_m == "PIC") {
+        Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC);
+        Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC);
+        Pcoarse->initFFTSolver();
+    }
+   
+    ////////////////////////////////////////////////////////////
+    //Initialize an FFT object for getting rho in real space and 
+    //doing charge conservation check
+    
+    ippl::ParameterList fftParams;
+    fftParams.add("use_heffte_defaults", false);  
+    fftParams.add("use_pencils", true);  
+    fftParams.add("use_reorder", false);  
+    fftParams.add("use_gpu_aware", true);  
+    fftParams.add("comm", ippl::p2p_pl);  
+    fftParams.add("r2c_direction", 0);  
+
+    ippl::NDIndex<Dim> domainPIFhalf;
+
+    for(unsigned d = 0; d < Dim; ++d) {
+        if(fftParams.template get<int>("r2c_direction") == (int)d)
+            domainPIFhalf[d] = ippl::Index(domainPIF[d].length()/2 + 1);
+        else
+            domainPIFhalf[d] = ippl::Index(domainPIF[d].length());
+    }
+    
+
+    FieldLayout_t FLPIFhalf(domainPIFhalf, decomp);
+
+    ippl::Vector<double, 3> hDummy = {1.0, 1.0, 1.0};
+    ippl::Vector<double, 3> originDummy = {0.0, 0.0, 0.0};
+    Mesh_t meshPIFhalf(domainPIFhalf, hDummy, originDummy);
+
+    Pcoarse->rhoPIFreal_m.initialize(meshPIF, FLPIF);
+    Pcoarse->rhoPIFhalf_m.initialize(meshPIFhalf, FLPIFhalf);
+
+    Pcoarse->fft_mp = std::make_shared<FFT_t>(FLPIF, FLPIFhalf, fftParams);
+   
+    ////////////////////////////////////////////////////////////
+
+    Vector_t minU, maxU;
+    for (unsigned d = 0; d <Dim; ++d) {
+        minU[d] = CDF(rmin[d], delta, kw[d], d);
+        maxU[d]   = CDF(rmax[d], delta, kw[d], d);
+    }
+
+    using buffer_type = ippl::Communicate::buffer_type;
+    int tag;
+
+    Pcoarse->shapetype_m = argv[13];
+    Pcoarse->shapedegree_m = std::atoi(argv[14]); 
+    IpplTimings::startTimer(initializeShapeFunctionPIF);
+    Pcoarse->initializeShapeFunctionPIF();
+    IpplTimings::stopTimer(initializeShapeFunctionPIF);
+    
+    IpplTimings::startTimer(particleCreation);
+
+    Pcoarse->create(nloc);
+    Pbegin->create(nloc);
+    Pend->create(nloc);
+
+    Pcoarse->q = Pcoarse->Q_m/Total_particles;
+   
+    IpplTimings::stopTimer(particleCreation);
+
+    
+    double coarseTol = std::atof(argv[17]);
+    double fineTol   = std::atof(argv[18]);
+    Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol);
+    std::string coarse = "Coarse";
+    std::string fine = "Fine";
+    
+    IpplTimings::startTimer(particleCreation);
+
+#ifdef KOKKOS_ENABLE_CUDA
+
+    //If we don't do the following even with the same seed the initial 
+    //condition is not the same on different GPUs
+
+    IpplTimings::startTimer(timeCommunication);
+    //For some reason using the next_tag with multiple cycles is not 
+    //working so we use static tags here
+    tag = 500;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE);
+
+    if(rankTime == 0) {
+        Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace));
+        Kokkos::parallel_for(nloc,
+                             generate_random<Vector_t, Kokkos::Random_XorShift64_Pool<>, Dim>(
+                             Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, delta, kw, 
+                             sigma, muBulk, muBeam, nlocBulk, minU, maxU));
+
+
+        Kokkos::fence();
+    }
+    else {
+        size_type bufSize = Pbegin->packedSize(nloc);
+        buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize);
+        Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm);
+        buf->resetReadPos();
+    }
+    IpplTimings::stopTimer(timeCommunication);
+
+    IpplTimings::startTimer(deepCopy);
+    Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView());
+    Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView());
+    Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView());
+    Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView());
+    IpplTimings::stopTimer(deepCopy);
+
+    IpplTimings::startTimer(coarsePropagator);
+    if(Pcoarse->coarsetype_m == "PIC") {
+        Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm);
+    }
+    else {
+        //PIF with coarse tolerance as coarse propagator
+        Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm);
+    }
+    IpplTimings::stopTimer(coarsePropagator);
+
+    IpplTimings::startTimer(deepCopy);
+    Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView());
+    Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView());
+    IpplTimings::stopTimer(deepCopy);
+
+    IpplTimings::startTimer(timeCommunication);
+    if(rankTime < sizeTime-1) {
+        size_type bufSize = Pend->packedSize(nloc);
+        buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize);
+        MPI_Request request;
+        Ippl::Comm->isend(rankTime+1, tag, *Pend, *buf, request, nloc, timeComm);
+        buf->resetWritePos();
+        MPI_Wait(&request, MPI_STATUS_IGNORE);
+    }
+    IpplTimings::stopTimer(timeCommunication);
+
+#else
+    //Note the CPU version has not been tested.
+    Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0));
+    Kokkos::parallel_for(nloc,
+                         generate_random<Vector_t, Kokkos::Random_XorShift64_Pool<>, Dim>(
+                         Pcoarse->R.getView(), Pcoarse->P.getView(), rand_pool64, delta, kw, 
+                         sigma, muBulk, muBeam, nlocBulk, minU, maxU));
+
+
+    Kokkos::fence();
+    Ippl::Comm->barrier();
+#endif
+
+    msg << "Parareal "
+        << TestName
+        << endl
+        << "Slice dT: " << dtSlice
+        << endl
+        << "No. of fine time steps: " << ntFine 
+        << endl
+        << "No. of coarse time steps: " << ntCoarse
+        << endl
+        << "Tolerance: " << tol
+        << " No. of cycles: " << nCycles
+        << endl
+        << "Np= " << Total_particles 
+        << " Fourier modes = " << nmPIF
+        << " Grid points = " << nrPIC
+        << endl;
+    
+    IpplTimings::stopTimer(particleCreation);                                                    
+    
+    msg << "particles created and initial conditions assigned " << endl;
+
+    int sign = 1;
+    for (unsigned int nc=0; nc < nCycles; nc++) {
+        
+        double tStartMySlice; 
+        bool sendCriteria, recvCriteria;
+        bool isConverged = false;
+        bool isPreviousDomainConverged = false;
+        
+        //even cycles
+        if(nc % 2 == 0) {
+            sendCriteria = (rankTime < (sizeTime-1));
+            recvCriteria = (rankTime > 0);
+            if(rankTime == 0) {
+                isPreviousDomainConverged = true;
+            }
+            tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice);
+            msg.setPrintNode(Ippl::Comm->size()-1);
+        }
+        //odd cycles
+        else {
+            recvCriteria = (rankTime < (sizeTime-1));
+            sendCriteria = (rankTime > 0);
+            if(rankTime == (sizeTime - 1)) {
+                isPreviousDomainConverged = true;
+            }
+            tStartMySlice = (nc * tEndCycle) + (((sizeTime - 1) - rankTime) * dtSlice);
+            msg.setPrintNode(0);
+        }
+        
+        unsigned int it = 0;
+        
+        while (!isConverged) { 
+            //Run fine integrator in parallel
+            IpplTimings::startTimer(finePropagator);
+            Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, rankTime, rankSpace, fine, spaceComm);
+            IpplTimings::stopTimer(finePropagator);
+    
+
+            //Difference = Fine - Coarse
+            Pend->R = Pbegin->R - Pcoarse->R;
+            Pend->P = Pbegin->P - Pcoarse->P;
+
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView());
+            Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+            
+            IpplTimings::startTimer(timeCommunication);
+            tag = 1100;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE);
+            int tagbool = 1300;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE);
+            
+            if(recvCriteria && (!isPreviousDomainConverged)) {
+                size_type bufSize = Pbegin->packedSize(nloc);
+                buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize);
+                Ippl::Comm->recv(rankTime-sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm);
+                buf->resetReadPos();
+                MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, rankTime-sign, tagbool, 
+                        timeComm, MPI_STATUS_IGNORE);
+                IpplTimings::startTimer(deepCopy);
+                Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView());
+                Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView());
+                IpplTimings::stopTimer(deepCopy);
+            }
+            IpplTimings::stopTimer(timeCommunication);
+
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView());
+            Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView());
+            Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView());
+            Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+
+            IpplTimings::startTimer(coarsePropagator);
+            if(Pcoarse->coarsetype_m == "PIC") {
+                Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm);
+            }
+            else {
+                Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm);
+            }
+            IpplTimings::stopTimer(coarsePropagator);
+
+            Pend->R = Pend->R + Pcoarse->R;
+            Pend->P = Pend->P + Pcoarse->P;
+
+            PL.applyBC(Pend->R, PL.getRegionLayout().getDomain());
+            
+            IpplTimings::startTimer(computeErrors);
+            double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, length, spaceComm);
+            double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, spaceComm);
+            IpplTimings::stopTimer(computeErrors);
+
+            if((Rerror <= tol) && (Perror <= tol) && isPreviousDomainConverged) {
+                isConverged = true;
+            }
+
+            IpplTimings::startTimer(timeCommunication);
+            if(sendCriteria) {
+                size_type bufSize = Pend->packedSize(nloc);
+                buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize);
+                MPI_Request request;
+                Ippl::Comm->isend(rankTime+sign, tag, *Pend, *buf, request, nloc, timeComm);
+                buf->resetWritePos();
+                MPI_Wait(&request, MPI_STATUS_IGNORE);
+                MPI_Send(&isConverged, 1, MPI_C_BOOL, rankTime+sign, tagbool, timeComm);
+            }
+            IpplTimings::stopTimer(timeCommunication);
+
+            
+            msg << "Finished iteration: " << it+1
+                << " in cycle: " << nc+1
+                << " Rerror: " << Rerror 
+                << " Perror: " << Perror
+                << endl;
+
+            IpplTimings::startTimer(dumpData);
+            Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace);
+            IpplTimings::stopTimer(dumpData);
+
+            MPI_Barrier(spaceComm);
+            
+            it += 1;
+        }
+
+        MPI_Barrier(MPI_COMM_WORLD);
+        if((nCycles > 1) && (nc < (nCycles - 1))) {  
+            tag = 1000;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE);
+           
+            //send, receive criteria and tStartMySlice are reversed at the end of the cycle
+            if(nc % 2 == 0) {
+                recvCriteria = (rankTime < (sizeTime-1));
+                sendCriteria = (rankTime > 0);
+                tStartMySlice = (nc * tEndCycle) + (((sizeTime - 1) - rankTime) * dtSlice);
+            }
+            //odd cycles
+            else {
+                sendCriteria = (rankTime < (sizeTime-1));
+                recvCriteria = (rankTime > 0);
+                tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice);
+            }
+            
+            
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pbegin->R.getView(), Pend->R.getView());
+            Kokkos::deep_copy(Pbegin->P.getView(), Pend->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+            
+            IpplTimings::startTimer(timeCommunication);
+            if(recvCriteria) {
+                size_type bufSize = Pbegin->packedSize(nloc);
+                buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize);
+                Ippl::Comm->recv(rankTime+sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm);
+                buf->resetReadPos();
+            }
+            IpplTimings::stopTimer(timeCommunication);
+
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView());
+            Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView());
+            Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView());
+            Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+            
+            IpplTimings::startTimer(coarsePropagator);
+            if(Pcoarse->coarsetype_m == "PIC") {
+                Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm);
+            }
+            else {
+                Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); 
+            }
+            IpplTimings::stopTimer(coarsePropagator);
+            
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView());
+            Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+
+            IpplTimings::startTimer(timeCommunication);
+            if(sendCriteria) {
+                size_type bufSize = Pend->packedSize(nloc);
+                buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize);
+                MPI_Request request;
+                Ippl::Comm->isend(rankTime-sign, tag, *Pend, *buf, request, nloc, timeComm);
+                buf->resetWritePos();
+                MPI_Wait(&request, MPI_STATUS_IGNORE);
+            }
+            IpplTimings::stopTimer(timeCommunication);
+            sign *= -1;
+        }
+    }
+    msg << TestName << " Parareal: End." << endl;
+    IpplTimings::stopTimer(mainTimer);
+    IpplTimings::print();
+    IpplTimings::print(std::string("timing.dat"));
+
+    MPI_Comm_free(&spaceComm);
+    MPI_Comm_free(&timeComm);
+
+
+    return 0;
+}
diff --git a/alpine/PinT/CMakeLists.txt b/alpine/PinT/CMakeLists.txt
new file mode 100644
index 000000000..73976aa27
--- /dev/null
+++ b/alpine/PinT/CMakeLists.txt
@@ -0,0 +1,32 @@
+file (RELATIVE_PATH _relPath "${CMAKE_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}")
+message (STATUS "Adding index test found in ${_relPath}")
+
+include_directories (
+    ${CMAKE_SOURCE_DIR}/src
+)
+
+link_directories (
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${Kokkos_DIR}/..
+)
+
+set (IPPL_LIBS ippl ${MPI_CXX_LIBRARIES})
+set (COMPILE_FLAGS ${OPAL_CXX_FLAGS})
+
+add_executable (LandauDampingPinT LandauDampingPinT.cpp)
+target_link_libraries (LandauDampingPinT ${IPPL_LIBS})
+
+add_executable (BumponTailInstabilityPinT BumponTailInstabilityPinT.cpp)
+target_link_libraries (BumponTailInstabilityPinT ${IPPL_LIBS})
+
+add_executable (PenningTrapPinT PenningTrapPinT.cpp)
+target_link_libraries (PenningTrapPinT ${IPPL_LIBS})
+
+# vi: set et ts=4 sw=4 sts=4:
+
+# Local Variables:
+# mode: cmake
+# cmake-tab-width: 4
+# indent-tabs-mode: nil
+# require-final-newline: nil
+# End:
diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp
new file mode 100644
index 000000000..7790c8f26
--- /dev/null
+++ b/alpine/PinT/ChargedParticlesPinT.hpp
@@ -0,0 +1,913 @@
+// ChargedParticlesPinT header file
+//   Defines a particle attribute for charged particles to be used in
+//   test programs
+//
+// Copyright (c) 2021 Paul Scherrer Institut, Villigen PSI, Switzerland
+// All rights reserved
+//
+// This file is part of IPPL.
+//
+// IPPL is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// You should have received a copy of the GNU General Public License
+// along with IPPL. If not, see <https://www.gnu.org/licenses/>.
+//
+
+#include "Ippl.h"
+#include "Solver/FFTPeriodicPoissonSolver.h"
+
+// dimension of our positions
+constexpr unsigned Dim = 3;
+
+// some typedefs
+typedef ippl::ParticleSpatialLayout<double,Dim>   PLayout_t;
+typedef ippl::UniformCartesian<double, Dim>        Mesh_t;
+typedef ippl::FieldLayout<Dim> FieldLayout_t;
+
+using size_type = ippl::detail::size_type;
+
+template<typename T, unsigned Dim>
+using Vector = ippl::Vector<T, Dim>;
+
+template<typename T, unsigned Dim>
+using Field = ippl::Field<T, Dim>;
+
+template<typename T>
+using ParticleAttrib = ippl::ParticleAttrib<T>;
+
+typedef Vector<double, Dim>  Vector_t;
+typedef Field<double, Dim>   Field_t;
+typedef Field<Kokkos::complex<double>, Dim>   CxField_t;
+typedef Field<Vector_t, Dim> VField_t;
+typedef ippl::FFTPeriodicPoissonSolver<Vector_t, double, Dim> Solver_t;
+
+typedef ippl::FFT<ippl::RCTransform, Dim, double> FFT_t;
+
+const double pi = std::acos(-1.0);
+
+// Test programs have to define this variable for VTK dump purposes
+extern const char* TestName;
+
+template<class PLayout>
+class ChargedParticlesPinT : public ippl::ParticleBase<PLayout> {
+public:
+
+    CxField_t rhoPIF_m;
+    CxField_t rhoPIFhalf_m;
+    Field_t rhoPIFreal_m;
+    Field_t Sk_m;
+    Field_t rhoPIC_m;
+    VField_t EfieldPIC_m;
+
+    Vector<int, Dim> nr_m;
+    Vector<int, Dim> nm_m;
+
+    ippl::e_dim_tag decomp_m[Dim];
+
+    Vector_t hr_m;
+    Vector_t rmin_m;
+    Vector_t rmax_m;
+
+    double Q_m;
+
+    size_type Np_m;
+    
+    std::shared_ptr<Solver_t> solver_mp;
+    std::shared_ptr<FFT_t> fft_mp;
+    
+    double time_m;
+
+    std::string shapetype_m;
+
+    std::string coarsetype_m;
+
+    int shapedegree_m;
+
+    std::shared_ptr<ippl::FFT<ippl::NUFFTransform, 3, double>> nufftType1Fine_mp,nufftType2Fine_mp,
+                                                               nufftType1Coarse_mp,nufftType2Coarse_mp;
+
+public:
+    ParticleAttrib<double>     q; // charge
+    typename ippl::ParticleBase<PLayout>::particle_position_type P;  // G(P^(k)_n)
+    typename ippl::ParticleBase<PLayout>::particle_position_type E;  // electric field at particle position
+
+    typename ippl::ParticleBase<PLayout>::particle_position_type R0;  // Initial particle positions at t=0
+    typename ippl::ParticleBase<PLayout>::particle_position_type P0;  // Initial particle velocities at t=0
+
+    typename ippl::ParticleBase<PLayout>::particle_position_type RprevIter;  // G(R^(k-1)_n)
+    typename ippl::ParticleBase<PLayout>::particle_position_type PprevIter;  // G(P^(k-1)_n)
+
+    ///*
+    //  This constructor is mandatory for all derived classes from
+    //  ParticleBase as the bunch buffer uses this
+    //*/
+    //ChargedParticlesPinT(PLayout& pl)
+    //: ippl::ParticleBase<PLayout>(pl)
+    //{
+    //    // register the particle attributes
+    //    this->addAttribute(q);
+    //    this->addAttribute(P);
+    //    this->addAttribute(E);
+    //    this->addAttribute(R0);
+    //    this->addAttribute(P0);
+    //    this->addAttribute(RprevIter);
+    //    this->addAttribute(PprevIter);
+    //    //this->addAttribute(Rfine);
+    //    //this->addAttribute(Pfine);
+    //}
+    
+    ChargedParticlesPinT(PLayout& pl,
+                     Vector_t hr,
+                     Vector_t rmin,
+                     Vector_t rmax,
+                     ippl::e_dim_tag decomp[Dim],
+                     double Q,
+                     size_type Np)
+    : ippl::ParticleBase<PLayout>(pl)
+    , hr_m(hr)
+    , rmin_m(rmin)
+    , rmax_m(rmax)
+    , Q_m(Q)
+    , Np_m(Np)
+    {
+        // register the particle attributes
+        this->addAttribute(q);
+        this->addAttribute(P);
+        this->addAttribute(E);
+        this->addAttribute(R0);
+        this->addAttribute(P0);
+        this->addAttribute(RprevIter);
+        this->addAttribute(PprevIter);
+        setupBCs();
+        for (unsigned int i = 0; i < Dim; i++)
+            decomp_m[i]=decomp[i];
+    }
+
+    ~ChargedParticlesPinT(){ }
+
+    void setupBCs() {
+        setBCAllPeriodic();
+    }
+
+
+    void initFFTSolver() {
+        ippl::ParameterList sp;
+        sp.add("output_type", Solver_t::GRAD);
+        sp.add("use_heffte_defaults", false);  
+        sp.add("use_pencils", true);  
+        sp.add("use_reorder", false);  
+        sp.add("use_gpu_aware", true);  
+        sp.add("comm", ippl::p2p_pl);  
+        sp.add("r2c_direction", 0);  
+
+        solver_mp = std::make_shared<Solver_t>();
+
+        solver_mp->mergeParameters(sp);
+
+        solver_mp->setRhs(rhoPIC_m);
+
+        solver_mp->setLhs(EfieldPIC_m);
+    }
+
+
+    void initNUFFTs(FieldLayout_t& FLPIF, double& coarseTol,
+                    double& fineTol) {
+        
+        ippl::ParameterList fftCoarseParams,fftFineParams;
+
+        fftFineParams.add("gpu_method", 1);
+        fftFineParams.add("gpu_sort", 0);
+        fftFineParams.add("gpu_kerevalmeth", 1);
+        fftFineParams.add("tolerance", fineTol);
+
+        fftCoarseParams.add("gpu_method", 1);
+        fftCoarseParams.add("gpu_sort", 0);
+        fftCoarseParams.add("gpu_kerevalmeth", 1);
+        fftCoarseParams.add("tolerance", coarseTol);
+
+        fftFineParams.add("use_cufinufft_defaults", false);
+        fftCoarseParams.add("use_cufinufft_defaults", false);
+        
+        nufftType1Fine_mp = std::make_shared<ippl::FFT<ippl::NUFFTransform, 3, double>>(FLPIF, this->getLocalNum(), 1, fftFineParams);
+        nufftType2Fine_mp = std::make_shared<ippl::FFT<ippl::NUFFTransform, 3, double>>(FLPIF, this->getLocalNum(), 2, fftFineParams);
+
+        nufftType1Coarse_mp = std::make_shared<ippl::FFT<ippl::NUFFTransform, 3, double>>(FLPIF, this->getLocalNum(), 1, fftCoarseParams);
+        nufftType2Coarse_mp = std::make_shared<ippl::FFT<ippl::NUFFTransform, 3, double>>(FLPIF, this->getLocalNum(), 2, fftCoarseParams);
+    }
+    
+    void dumpFieldEnergy(const unsigned int& nc, const unsigned int& iter, int rankTime, int rankSpace) {
+       
+        double fieldEnergy = 0.0; 
+        double EzAmp = 0.0;
+
+        auto rhoview = rhoPIF_m.getView();
+        const int nghost = rhoPIF_m.getNghost();
+        using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<Dim>>;
+      
+        const FieldLayout_t& layout = rhoPIF_m.getLayout(); 
+        const Mesh_t& mesh = rhoPIF_m.get_mesh();
+        const Vector<double, Dim>& dx = mesh.getMeshSpacing();
+        const auto& domain = layout.getDomain();
+        Vector<double, Dim> Len;
+        Vector<int, Dim> N;
+
+        for (unsigned d=0; d < Dim; ++d) {
+            N[d] = domain[d].length();
+            Len[d] = dx[d] * N[d];
+        }
+
+
+        Kokkos::complex<double> imag = {0.0, 1.0};
+        double pi = std::acos(-1.0);
+        Kokkos::parallel_reduce("Ez energy and Max",
+                              mdrange_type({0, 0, 0},
+                                           {N[0],
+                                            N[1],
+                                            N[2]}),
+                              KOKKOS_LAMBDA(const int i,
+                                            const int j,
+                                            const int k,
+                                            double& tlSum,
+                                            double& tlMax)
+        {
+        
+            Vector<int, 3> iVec = {i, j, k};
+            Vector<double, 3> kVec;
+            double Dr = 0.0;
+            for(size_t d = 0; d < Dim; ++d) {
+                kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2));
+                Dr += kVec[d] * kVec[d];
+            }
+
+            Kokkos::complex<double> Ek = {0.0, 0.0}; 
+            bool isNotZero = (Dr != 0.0);
+            double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); 
+            Ek = -(imag * kVec[2] * rhoview(i+nghost,j+nghost,k+nghost) * factor);
+            double myVal = Ek.real() * Ek.real() + Ek.imag() * Ek.imag();
+
+            tlSum += myVal;
+
+            double myValMax = std::sqrt(myVal);
+
+            if(myValMax > tlMax) tlMax = myValMax;
+
+        }, Kokkos::Sum<double>(fieldEnergy), Kokkos::Max<double>(EzAmp));
+        
+
+        Kokkos::fence();
+        double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]);
+        fieldEnergy *= volume;
+
+
+        if(rankSpace == 0) {
+            std::stringstream fname;
+            fname << "data/FieldBumponTail_rank_";
+            fname << rankTime;
+            fname << "_nc_";
+            fname << nc;
+            fname << "_iter_";
+            fname << iter;
+            fname << ".csv";
+
+
+            Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank());
+            csvout.precision(10);
+            csvout.setf(std::ios::scientific, std::ios::floatfield);
+
+
+            csvout << time_m << " "
+                   << fieldEnergy << " "
+                   << EzAmp << endl;
+        }
+    }
+
+    void dumpEnergy(const unsigned int& nc, const unsigned int& iter, ParticleAttrib<Vector_t>& Ptemp,
+                    int rankTime, int rankSpace, const MPI_Comm& spaceComm = MPI_COMM_WORLD) {
+
+        double potentialEnergy, kineticEnergy;
+        double temp = 0.0;
+
+        auto rhoview = rhoPIF_m.getView();
+        const int nghost = rhoPIF_m.getNghost();
+        using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<Dim>>;
+      
+        const FieldLayout_t& layout = rhoPIF_m.getLayout(); 
+        const Mesh_t& mesh = rhoPIF_m.get_mesh();
+        const Vector<double, Dim>& dx = mesh.getMeshSpacing();
+        const auto& domain = layout.getDomain();
+        Vector<double, Dim> Len;
+        Vector<int, Dim> N;
+
+        for (unsigned d=0; d < Dim; ++d) {
+            N[d] = domain[d].length();
+            Len[d] = dx[d] * N[d];
+        }
+
+
+        Kokkos::complex<double> imag = {0.0, 1.0};
+        double pi = std::acos(-1.0);
+        Kokkos::parallel_reduce("Potential energy",
+                              mdrange_type({0, 0, 0},
+                                           {N[0],
+                                            N[1],
+                                            N[2]}),
+                              KOKKOS_LAMBDA(const int i,
+                                            const int j,
+                                            const int k,
+                                            double& valL)
+        {
+        
+            Vector<int, 3> iVec = {i, j, k};
+            Vector<double, 3> kVec;
+            double Dr = 0.0;
+            for(size_t d = 0; d < Dim; ++d) {
+                kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2));
+                Dr += kVec[d] * kVec[d];
+            }
+
+            Kokkos::complex<double> Ek = {0.0, 0.0}; 
+            double myVal = 0.0;
+            auto rho = rhoview(i+nghost,j+nghost,k+nghost);
+            for(size_t d = 0; d < Dim; ++d) {
+                bool isNotZero = (Dr != 0.0);
+                double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); 
+                Ek = -(imag * kVec[d] * rho * factor);
+                myVal += Ek.real() * Ek.real() + Ek.imag() * Ek.imag();
+            }
+
+            valL += myVal;
+
+        }, Kokkos::Sum<double>(temp));
+
+        double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]);
+        potentialEnergy = 0.5 * temp * volume;
+
+        auto Pview = Ptemp.getView();
+        auto qView = q.getView();
+
+        temp = 0.0;
+
+        Kokkos::parallel_reduce("Kinetic Energy", this->getLocalNum(),
+                                KOKKOS_LAMBDA(const int i, double& valL){
+                                    double myVal = dot(Pview(i), Pview(i)).apply();
+                                    myVal *= -qView(i); //q/(q/m) where q/m=-1
+                                    valL += myVal;
+                                }, Kokkos::Sum<double>(temp));
+
+        temp *= 0.5;
+        double globaltemp = 0.0;
+        MPI_Allreduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, spaceComm);
+
+        kineticEnergy = globaltemp;
+
+        auto rhoPIFhalfview = rhoPIFhalf_m.getView();
+        const int nghostHalf = rhoPIFhalf_m.getNghost();
+      
+        const FieldLayout_t& layoutHalf = rhoPIFhalf_m.getLayout(); 
+        const auto& domainHalf = layoutHalf.getDomain();
+
+        Vector<int, Dim> Nhalf;
+        for (unsigned d=0; d < Dim; ++d) {
+            Nhalf[d] = domainHalf[d].length();
+        }
+
+        //Heffte needs FFTshifted field whereas the field from cuFINUFFT
+        //is not shifted. Hence, here we do the shift. 
+        Kokkos::parallel_for("Transfer complex rho to half domain",
+                              mdrange_type({0, 0, 0},
+                                           {Nhalf[0],
+                                            Nhalf[1],
+                                            Nhalf[2]}),
+                              KOKKOS_LAMBDA(const int i,
+                                            const int j,
+                                            const int k)
+        {
+            Vector<int, 3> iVec = {i, j, k};
+            int shift;
+            for(size_t d = 0; d < Dim; ++d) {
+                bool isLessThanHalf = (iVec[d] < (Nhalf[d]/2));
+                shift = ((int)isLessThanHalf * 2) - 1;
+                iVec[d] = (iVec[d] + shift * (Nhalf[d]/2)) + nghostHalf;
+            }
+            rhoPIFhalfview(Nhalf[0]-1-i+nghostHalf, iVec[1], iVec[2]) = 
+            rhoview(i+nghostHalf,j+nghostHalf,k+nghostHalf);
+        });
+
+
+        rhoPIFreal_m = 0.0;
+        fft_mp->transform(-1, rhoPIFreal_m, rhoPIFhalf_m);
+
+        rhoPIFreal_m = (1.0/(N[0]*N[1]*N[2])) * volume * rhoPIFreal_m;
+        auto rhoPIFrealview = rhoPIFreal_m.getView();
+        temp = 0.0;
+        Kokkos::parallel_reduce("Rho real sum",
+                              mdrange_type({0, 0, 0},
+                                           {N[0],
+                                            N[1],
+                                            N[2]}),
+                              KOKKOS_LAMBDA(const int i,
+                                            const int j,
+                                            const int k,
+                                            double& valL)
+        {
+            valL += rhoPIFrealview(i+nghost, j+nghost, k+nghost);
+        }, Kokkos::Sum<double>(temp));
+
+        double chargeTotal = temp;
+
+        Vector_t totalMomentum = 0.0;
+        
+        for(size_t d = 0; d < Dim; ++d) {
+             double tempD = 0.0;
+             Kokkos::parallel_reduce("Total Momentum", this->getLocalNum(),
+                                KOKKOS_LAMBDA(const int i, double& valL){
+                                    valL  += (-qView(i)) * Pview(i)[d];
+                                }, Kokkos::Sum<double>(tempD));
+             totalMomentum[d] = tempD;
+        }
+        
+        Vector_t globalMom;
+
+        double magMomentum = 0.0;
+        for(size_t d = 0; d < Dim; ++d) {
+            MPI_Allreduce(&totalMomentum[d], &globalMom[d], 1, MPI_DOUBLE, MPI_SUM, spaceComm);
+            magMomentum += globalMom[d] * globalMom[d];
+        }
+
+        magMomentum  = std::sqrt(magMomentum);
+
+        if(rankSpace == 0) {
+            std::stringstream fname;
+            fname << "data/Energy_rank_";
+            fname << rankTime;
+            fname << "_nc_";
+            fname << nc;
+            fname << "_iter_";
+            fname << iter;
+            fname << ".csv";
+
+
+            Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank());
+            csvout.precision(17);
+            csvout.setf(std::ios::scientific, std::ios::floatfield);
+
+            //csvout << "time, Potential energy, Kinetic energy, Total energy" << endl;
+
+            csvout << time_m << " "
+                   << potentialEnergy << " "
+                   << kineticEnergy << " "
+                   << potentialEnergy + kineticEnergy << " " 
+                   << chargeTotal << " " 
+                   << magMomentum << endl;
+        }
+
+    }
+    
+    void writelocalError(double Rerror, double Perror, unsigned int nc, unsigned int iter, int rankTime, int rankSpace) {
+        
+        if(rankSpace == 0) {
+            std::stringstream fname;
+            fname << "data/localError_rank_";
+            fname << rankTime;
+            fname << "_nc_";
+            fname << nc;
+            fname << ".csv";
+
+            Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank());
+            csvout.precision(17);
+            csvout.setf(std::ios::scientific, std::ios::floatfield);
+
+            if(iter == 1) {
+                csvout << "Iter, Rerror, Perror" << endl;
+            }
+
+            csvout << iter << " "
+                   << Rerror << " "
+                   << Perror << endl;
+        }
+
+    }
+
+    void initializeShapeFunctionPIF() {
+
+        using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
+        auto Skview = Sk_m.getView();
+        auto N = nm_m;
+        const int nghost = Sk_m.getNghost();
+        const Mesh_t& mesh = rhoPIF_m.get_mesh();
+        const Vector_t& dx = mesh.getMeshSpacing();
+        const Vector_t& Len = rmax_m - rmin_m;
+        const double pi = std::acos(-1.0);
+        int order = shapedegree_m + 1;
+        
+        if(shapetype_m == "Gaussian") {
+
+            throw IpplException("initializeShapeFunctionPIF",
+                                "Gaussian shape function not implemented yet");
+
+        }
+        else if(shapetype_m == "B-spline") {
+
+            Kokkos::parallel_for("B-spline shape functions",
+                                mdrange_type({0, 0, 0},
+                                             {N[0], N[1], N[2]}),
+                                KOKKOS_LAMBDA(const int i,
+                                              const int j,
+                                              const int k)
+            {
+                
+                Vector<int, 3> iVec = {i, j, k};
+                Vector<double, 3> kVec;
+                double Sk = 1.0;
+                for(size_t d = 0; d < Dim; ++d) {
+                    kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2));
+                    double khbytwo = kVec[d] * dx[d] / 2;
+                    bool isNotZero = (khbytwo != 0.0);
+                    double factor = (1.0 / (khbytwo + ((!isNotZero) * 1.0)));
+                    double arg = isNotZero * (Kokkos::sin(khbytwo) * factor) + 
+                                 (!isNotZero) * 1.0;
+                    //Fourier transform of CIC
+                    Sk *= std::pow(arg, order);
+                }
+                    Skview(i+nghost, j+nghost, k+nghost) = Sk;
+            });
+        }
+        else {
+            throw IpplException("initializeShapeFunctionPIF",
+                                "Unrecognized shape function type");
+        }
+
+    }
+
+    void LeapFrogPIC(ParticleAttrib<Vector_t>& Rtemp, 
+                     ParticleAttrib<Vector_t>& Ptemp, const unsigned int nt, 
+                     const double dt, const double& tStartMySlice, MPI_Comm& spaceComm) {
+    
+        static IpplTimings::TimerRef fieldSolvePIC = IpplTimings::getTimer("fieldSolvePIC");
+        PLayout& PL = this->getLayout();
+        rhoPIC_m = 0.0;
+        scatter(q, rhoPIC_m, Rtemp, spaceComm);
+    
+        rhoPIC_m = rhoPIC_m / (hr_m[0] * hr_m[1] * hr_m[2]);
+        rhoPIC_m = rhoPIC_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])));
+    
+        //Field solve
+        solver_mp->solve();
+    
+        // gather E field
+        gather(E, EfieldPIC_m, Rtemp);
+    
+        time_m = tStartMySlice;
+
+        for (unsigned int it=0; it<nt; it++) {
+            
+            // kick
+            Ptemp = Ptemp - 0.5 * dt * E;
+    
+            //drift
+            Rtemp = Rtemp + dt * Ptemp;
+    
+            //Apply particle BC
+            PL.applyBC(Rtemp, PL.getRegionLayout().getDomain());
+    
+            //scatter the charge onto the underlying grid
+            rhoPIC_m = 0.0;
+            scatter(q, rhoPIC_m, Rtemp, spaceComm);
+    
+            rhoPIC_m = rhoPIC_m / (hr_m[0] * hr_m[1] * hr_m[2]);
+            rhoPIC_m = rhoPIC_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])));
+    
+            //Field solve
+            IpplTimings::startTimer(fieldSolvePIC);
+            solver_mp->solve();
+            IpplTimings::stopTimer(fieldSolvePIC);
+    
+            // gather E field
+            gather(E, EfieldPIC_m, Rtemp);
+    
+            //kick
+            Ptemp = Ptemp - 0.5 * dt * E;
+            
+            time_m += dt;
+        }
+    
+    }
+
+    void BorisPIC(ParticleAttrib<Vector_t>& Rtemp, ParticleAttrib<Vector_t>& Ptemp, const unsigned int nt, 
+                  const double dt, const double& tStartMySlice, const double& Bext, MPI_Comm& spaceComm) {
+    
+        static IpplTimings::TimerRef fieldSolvePIC = IpplTimings::getTimer("fieldSolvePIC");
+        PLayout& PL = this->getLayout();
+        rhoPIC_m = 0.0;
+        scatter(q, rhoPIC_m, Rtemp, spaceComm);
+    
+        rhoPIC_m = rhoPIC_m / (hr_m[0] * hr_m[1] * hr_m[2]);
+        rhoPIC_m = rhoPIC_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])));
+    
+        //Field solve
+        solver_mp->solve();
+    
+        // gather E field
+        gather(E, EfieldPIC_m, Rtemp);
+    
+        time_m = tStartMySlice;
+
+        double alpha = -0.5 * dt;
+        double DrInv = 1.0 / (1 + (std::pow((alpha * Bext), 2)));
+        Vector_t rmax = rmax_m;
+
+        for (unsigned int it=0; it<nt; it++) {
+            
+            // Staggered Leap frog or Boris algorithm as per 
+            // https://www.sciencedirect.com/science/article/pii/S2590055219300526
+            // eqns 4(a)-4(c). Note we don't use the Boris trick here and do
+            // the analytical matrix inversion which is not complex in this case.
+            // Here, we assume a constant charge-to-mass ratio of -1 for
+            // all the particles hence eliminating the need to store mass as
+            // an attribute
+            // kick
+            auto Rview = Rtemp.getView();
+            auto Pview = Ptemp.getView();
+            auto Eview = E.getView();
+            double V0 = 30*rmax[2];
+            Kokkos::parallel_for("Kick1", this->getLocalNum(),
+                                  KOKKOS_LAMBDA(const size_t j){
+                double Eext_x = -(Rview(j)[0] - 0.5*rmax[0]) * (V0/(2*std::pow(rmax[2],2)));
+                double Eext_y = -(Rview(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2)));
+                double Eext_z =  (Rview(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2)));
+
+                Eext_x += Eview(j)[0];
+                Eext_y += Eview(j)[1];
+                Eext_z += Eview(j)[2];
+                
+                Pview(j)[0] += alpha * (Eext_x  + Pview(j)[1] * Bext);
+                Pview(j)[1] += alpha * (Eext_y  - Pview(j)[0] * Bext);
+                Pview(j)[2] += alpha * Eext_z;
+            });
+    
+            //drift
+            Rtemp = Rtemp + dt * Ptemp;
+    
+            //Apply particle BC
+            PL.applyBC(Rtemp, PL.getRegionLayout().getDomain());
+    
+            //scatter the charge onto the underlying grid
+            rhoPIC_m = 0.0;
+            scatter(q, rhoPIC_m, Rtemp, spaceComm);
+    
+            rhoPIC_m = rhoPIC_m / (hr_m[0] * hr_m[1] * hr_m[2]);
+            rhoPIC_m = rhoPIC_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])));
+    
+            //Field solve
+            IpplTimings::startTimer(fieldSolvePIC);
+            solver_mp->solve();
+            IpplTimings::stopTimer(fieldSolvePIC);
+    
+            // gather E field
+            gather(E, EfieldPIC_m, Rtemp);
+    
+            //kick
+            auto R2view = Rtemp.getView();
+            auto P2view = Ptemp.getView();
+            auto E2view = E.getView();
+            Kokkos::parallel_for("Kick2", this->getLocalNum(),
+                                  KOKKOS_LAMBDA(const size_t j){
+                double Eext_x = -(R2view(j)[0] - 0.5*rmax[0]) * (V0/(2*std::pow(rmax[2],2)));
+                double Eext_y = -(R2view(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2)));
+                double Eext_z =  (R2view(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2)));
+         
+                Eext_x += E2view(j)[0];
+                Eext_y += E2view(j)[1];
+                Eext_z += E2view(j)[2];
+                
+                P2view(j)[0]  = DrInv * ( P2view(j)[0] + alpha * (Eext_x 
+                                + P2view(j)[1] * Bext + alpha * Bext * Eext_y) );
+                P2view(j)[1]  = DrInv * ( P2view(j)[1] + alpha * (Eext_y 
+                                - P2view(j)[0] * Bext - alpha * Bext * Eext_x) );
+                P2view(j)[2] += alpha * Eext_z;
+            });
+            
+            time_m += dt;
+        }
+    
+    }
+
+    void LeapFrogPIF(ParticleAttrib<Vector_t>& Rtemp,
+                     ParticleAttrib<Vector_t>& Ptemp, const unsigned int& nt, 
+                     const double& dt, const double& tStartMySlice, const unsigned& nc, 
+                     const unsigned int& iter, int rankTime, int rankSpace,
+                     const std::string& propagator, MPI_Comm& spaceComm) {
+    
+        static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData");
+        PLayout& PL = this->getLayout();
+        rhoPIF_m = {0.0, 0.0};
+        if(propagator == "Coarse") {
+            scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Coarse_mp.get(), spaceComm);
+        }
+        else if(propagator == "Fine") {
+            scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Fine_mp.get(), spaceComm);
+        }
+    
+        rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]));
+    
+        // Solve for and gather E field
+        if(propagator == "Coarse") {
+            gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Coarse_mp.get(), q);
+        }
+        else if(propagator == "Fine") {
+            gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Fine_mp.get(), q);
+        }
+
+        //Reset the value of q here as we used it as a temporary object in gather to 
+        //save memory
+        q = Q_m / Np_m;
+    
+        time_m = tStartMySlice;
+
+        if((time_m == 0.0) && (propagator == "Fine")) {
+            IpplTimings::startTimer(dumpData);
+            dumpFieldEnergy(nc, iter, rankTime, rankSpace);         
+            dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm);
+            IpplTimings::stopTimer(dumpData);
+        }
+        for (unsigned int it=0; it<nt; it++) {
+    
+            // kick
+            Ptemp = Ptemp - 0.5 * dt * E;
+    
+            //drift
+            Rtemp = Rtemp + dt * Ptemp;
+    
+            //Apply particle BC
+            PL.applyBC(Rtemp, PL.getRegionLayout().getDomain());
+    
+            //scatter the charge onto the underlying grid
+            rhoPIF_m = {0.0, 0.0};
+            if(propagator == "Coarse") {
+                scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Coarse_mp.get(), spaceComm);
+            }
+            else if(propagator == "Fine") {
+                scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Fine_mp.get(), spaceComm);
+            }
+    
+            rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]));
+    
+            // Solve for and gather E field
+            if(propagator == "Coarse") {
+                gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Coarse_mp.get(), q);
+            }
+            else if(propagator == "Fine") {
+                gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Fine_mp.get(), q);
+            }
+
+            q = Q_m / Np_m;
+
+            //kick
+            Ptemp = Ptemp - 0.5 * dt * E;
+    
+            time_m += dt;
+            
+            if(propagator == "Fine") {
+                IpplTimings::startTimer(dumpData);
+                dumpFieldEnergy(nc, iter, rankTime, rankSpace);         
+                dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm);         
+                IpplTimings::stopTimer(dumpData);
+            }
+        }
+    }
+
+
+    void BorisPIF(ParticleAttrib<Vector_t>& Rtemp,
+                     ParticleAttrib<Vector_t>& Ptemp, const unsigned int& nt, 
+                     const double& dt, const double& tStartMySlice, const unsigned& nc, 
+                     const unsigned int& iter, const double& Bext,
+                     int rankTime, int rankSpace,
+                     const std::string& propagator, MPI_Comm& spaceComm) {
+    
+        static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData");
+        PLayout& PL = this->getLayout();
+        rhoPIF_m = {0.0, 0.0};
+        if(propagator == "Coarse") {
+            scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Coarse_mp.get(), spaceComm);
+        }
+        else if(propagator == "Fine") {
+            scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Fine_mp.get(), spaceComm);
+        }
+    
+        rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]));
+    
+        // Solve for and gather E field
+        if(propagator == "Coarse") {
+            gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Coarse_mp.get(), q);
+        }
+        else if(propagator == "Fine") {
+            gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Fine_mp.get(), q);
+        }
+
+        q = Q_m / Np_m;
+
+        time_m = tStartMySlice;
+
+        if((time_m == 0.0) && (propagator == "Fine")) {
+            IpplTimings::startTimer(dumpData);
+            dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm);
+            IpplTimings::stopTimer(dumpData);
+        }
+        double alpha = -0.5 * dt;
+        double DrInv = 1.0 / (1 + (std::pow((alpha * Bext), 2)));
+        Vector_t rmax = rmax_m;
+        for (unsigned int it=0; it<nt; it++) {
+    
+            // Staggered Leap frog or Boris algorithm as per 
+            // https://www.sciencedirect.com/science/article/pii/S2590055219300526
+            // eqns 4(a)-4(c). Note we don't use the Boris trick here and do
+            // the analytical matrix inversion which is not complex in this case.
+            // Here, we assume a constant charge-to-mass ratio of -1 for
+            // all the particles hence eliminating the need to store mass as
+            // an attribute
+            // kick
+            auto Rview = Rtemp.getView();
+            auto Pview = Ptemp.getView();
+            auto Eview = E.getView();
+            double V0 = 30*rmax[2];
+            Kokkos::parallel_for("Kick1", this->getLocalNum(),
+                                  KOKKOS_LAMBDA(const size_t j){
+                double Eext_x = -(Rview(j)[0] - 0.5*rmax[0]) * (V0/(2*std::pow(rmax[2],2)));
+                double Eext_y = -(Rview(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2)));
+                double Eext_z =  (Rview(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2)));
+
+                Eext_x += Eview(j)[0];
+                Eext_y += Eview(j)[1];
+                Eext_z += Eview(j)[2];
+                
+                Pview(j)[0] += alpha * (Eext_x  + Pview(j)[1] * Bext);
+                Pview(j)[1] += alpha * (Eext_y  - Pview(j)[0] * Bext);
+                Pview(j)[2] += alpha * Eext_z;
+            });
+    
+            //drift
+            Rtemp = Rtemp + dt * Ptemp;
+    
+            //Apply particle BC
+            PL.applyBC(Rtemp, PL.getRegionLayout().getDomain());
+    
+            //scatter the charge onto the underlying grid
+            rhoPIF_m = {0.0, 0.0};
+            if(propagator == "Coarse") {
+                scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Coarse_mp.get(), spaceComm);
+            }
+            else if(propagator == "Fine") {
+                scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Fine_mp.get(), spaceComm);
+            }
+    
+            rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]));
+    
+            // Solve for and gather E field
+            if(propagator == "Coarse") {
+                gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Coarse_mp.get(), q);
+            }
+            else if(propagator == "Fine") {
+                gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Fine_mp.get(), q);
+            }
+    
+            q = Q_m / Np_m;
+            //kick
+            auto R2view = Rtemp.getView();
+            auto P2view = Ptemp.getView();
+            auto E2view = E.getView();
+            Kokkos::parallel_for("Kick2", this->getLocalNum(),
+                                  KOKKOS_LAMBDA(const size_t j){
+                double Eext_x = -(R2view(j)[0] - 0.5*rmax[0]) * (V0/(2*std::pow(rmax[2],2)));
+                double Eext_y = -(R2view(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2)));
+                double Eext_z =  (R2view(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2)));
+
+                Eext_x += E2view(j)[0];
+                Eext_y += E2view(j)[1];
+                Eext_z += E2view(j)[2];
+                
+                P2view(j)[0]  = DrInv * ( P2view(j)[0] + alpha * (Eext_x 
+                                + P2view(j)[1] * Bext + alpha * Bext * Eext_y) );
+                P2view(j)[1]  = DrInv * ( P2view(j)[1] + alpha * (Eext_y 
+                                - P2view(j)[0] * Bext - alpha * Bext * Eext_x) );
+                P2view(j)[2] += alpha * Eext_z;
+            });
+
+            time_m += dt;
+            
+            if(propagator == "Fine") {
+                IpplTimings::startTimer(dumpData);
+                dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm);
+                IpplTimings::stopTimer(dumpData);
+            }
+        }
+    }
+
+private:
+    void setBCAllPeriodic() {
+        this->setParticleBC(ippl::BC::PERIODIC);
+    }
+
+};
diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp
new file mode 100644
index 000000000..99f625eb9
--- /dev/null
+++ b/alpine/PinT/LandauDampingPinT.cpp
@@ -0,0 +1,721 @@
+// Parallel-in-time (PinT) method Parareal combined with Particle-in-cell
+// and Particle-in-Fourier schemes. The example is electrostatic Landau 
+// damping. The implementation of Parareal follows the open source implementation
+// https://github.com/Parallel-in-Time/PararealF90 by Daniel Ruprecht. The corresponding
+// publication is Ruprecht, Daniel. "Shared memory pipelined parareal." 
+// European Conference on Parallel Processing. Springer, Cham, 2017.
+// 
+//  Usage:
+//     srun ./LandauDampingPinT <nmx> <nmy> <nmz> <nx> <ny> <nz> <Np> <Tend> <dtfine> <dtcoarse> <tolParareal> 
+//          <nCycles> <ShapeType> <degree> <No. of space procs> <No. of time procs> 
+//          <coarseTol> <fineTol> <coarseType> --info 5
+//     nmx       = No. of Fourier modes in the x-direction
+//     nmy       = No. of Fourier modes in the y-direction
+//     nmz       = No. of Fourier modes in the z-direction
+//     nx       = No. of grid points in the x-direction (not used if PIF is also used as coarse propagator)
+//     ny       = No. of grid points in the y-direction (not used if PIF is also used as coarse propagator)
+//     nz       = No. of grid points in the z-direction (not used if PIF is also used as coarse propagator)
+//     Np       = Total no. of macro-particles in the simulation
+//     tolParareal      = Parareal tolerance
+//     nCycles = No. of Parareal blocks/cycles
+//     ShapeType = Shape function type B-spline only for the moment
+//     degree = B-spline degree (-1 for delta function)
+//     No. of space procs = Number of MPI ranks to be used in the spatial parallelization
+//     No. of time procs = Number of MPI ranks to be used in the time parallelization
+//     coarseTol = Coarse tolerance for PIF if we use PIF as a coarse propagator (will not be used when PIC is used)
+//     fineTol = fine tolerance for PIF
+//     coarseType = Type of coarse propagator (PIF or PIC)
+//     Example:
+//     srun ./LandauDampingPinT 32 32 32 16 16 16 655360 19.2 0.05 0.05 1e-5 1 B-spline 1 4 16 1e-2 1e-4 PIC --info 5
+//
+// Copyright (c) 2022, Sriramkrishnan Muralikrishnan,
+// Jülich Supercomputing Centre, Jülich, Germany.
+// All rights reserved
+//
+// This file is part of IPPL.
+//
+// IPPL is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// You should have received a copy of the GNU General Public License
+// along with IPPL. If not, see <https://www.gnu.org/licenses/>.
+//
+
+#include "ChargedParticlesPinT.hpp"
+#include "StatesSlice.hpp"
+#include <string>
+#include <vector>
+#include <iostream>
+#include <cmath>
+#include <set>
+#include <chrono>
+
+#include<Kokkos_Random.hpp>
+
+#include <random>
+#include "Utility/IpplTimings.h"
+
+template <typename T>
+struct Newton1D {
+
+  double tol = 1e-12;
+  int max_iter = 20;
+  double pi = std::acos(-1.0);
+  
+  T k, alpha, u;
+
+  KOKKOS_INLINE_FUNCTION
+  Newton1D() {}
+
+  KOKKOS_INLINE_FUNCTION
+  Newton1D(const T& k_, const T& alpha_, 
+           const T& u_) 
+  : k(k_), alpha(alpha_), u(u_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~Newton1D() {}
+
+  KOKKOS_INLINE_FUNCTION
+  T f(T& x) {
+      T F;
+      F = x  + (alpha  * (std::sin(k * x) / k)) - u;
+      return F;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  T fprime(T& x) {
+      T Fprime;
+      Fprime = 1  + (alpha  * std::cos(k * x));
+      return Fprime;
+  }
+
+  KOKKOS_FUNCTION
+  void solve(T& x) {
+      int iterations = 0;
+      while (iterations < max_iter && std::fabs(f(x)) > tol) {
+          x = x - (f(x)/fprime(x));
+          iterations += 1;
+      }
+  }
+};
+
+
+template <typename T, class GeneratorPool, unsigned Dim>
+struct generate_random {
+
+  using view_type = typename ippl::detail::ViewType<T, 1>::view_type;
+  using value_type  = typename T::value_type;
+  // Output View for the random numbers
+  view_type x, v;
+
+  // The GeneratorPool
+  GeneratorPool rand_pool;
+
+  //value_type alpha;
+
+  T alpha, k, minU, maxU;
+
+  // Initialize all members
+  generate_random(view_type x_, view_type v_, GeneratorPool rand_pool_, 
+                  T& alpha_, T& k_, T& minU_, T& maxU_)
+      : x(x_), v(v_), rand_pool(rand_pool_), 
+        alpha(alpha_), k(k_), minU(minU_), maxU(maxU_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t i) const {
+    // Get a random number state from the pool for the active thread
+    typename GeneratorPool::generator_type rand_gen = rand_pool.get_state();
+
+    value_type u;
+    for (unsigned d = 0; d < Dim; ++d) {
+
+        u = rand_gen.drand(minU[d], maxU[d]);
+        x(i)[d] = u / (1 + alpha[d]);
+        Newton1D<value_type> solver(k[d], alpha[d], u);
+        solver.solve(x(i)[d]);
+        v(i)[d] = rand_gen.normal(0.0, 1.0);
+    }
+
+    // Give the state back, which will allow another thread to acquire it
+    rand_pool.free_state(rand_gen);
+  }
+};
+
+double CDF(const double& x, const double& alpha, const double& k) {
+   double cdf = x + (alpha / k) * std::sin(k * x);
+   return cdf;
+}
+
+double computeRL2Error(ParticleAttrib<Vector_t>& Q, ParticleAttrib<Vector_t>& QprevIter, 
+                      Vector_t& length, MPI_Comm& spaceComm) {
+    
+    auto Qview = Q.getView();
+    auto QprevIterView = QprevIter.getView();
+    double localError = 0.0;
+    double localNorm = 0.0;
+
+    Kokkos::parallel_reduce("Abs. error and norm", Q.size(),
+                            KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){
+                                Vector_t diff = Qview(i) - QprevIterView(i);
+
+                                //This is just to undo the effect of periodic BCs during the 
+                                //error calculation. Otherwise even though the actual error is 
+                                //small the computed error might be very large. 
+                                //The values (e.g. 10) mentioned here are just an adhoc
+                                //value depending on the domain length. 
+                                for (unsigned d = 0; d < 3; ++d) {
+                                    bool isLeft = (diff[d] <= -10.0);
+                                    bool isRight = (diff[d] >= 10.0);
+                                    bool isInside = ((diff[d] > -10.0) && (diff[d] < 10.0));
+                                    diff[d] = (isInside * diff[d]) + (isLeft * (diff[d] + length[d]))
+                                              +(isRight * (diff[d] - length[d]));
+                                }
+
+                                double myValError = dot(diff, diff).apply();
+                                valLError += myValError;
+                                double myValnorm = dot(Qview(i), Qview(i)).apply();
+                                valLnorm += myValnorm;
+                            }, Kokkos::Sum<double>(localError), Kokkos::Sum<double>(localNorm));
+
+    Kokkos::fence();
+    double globalError = 0.0;
+    MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, spaceComm);
+    double globalNorm = 0.0;
+    MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, spaceComm);
+
+    double relError = std::sqrt(globalError) / std::sqrt(globalNorm);
+    
+    return relError;
+
+}
+
+double computePL2Error(ParticleAttrib<Vector_t>& Q, ParticleAttrib<Vector_t>& QprevIter, MPI_Comm& spaceComm) {
+    
+    auto Qview = Q.getView();
+    auto QprevIterView = QprevIter.getView();
+    double localError = 0.0;
+    double localNorm = 0.0;
+
+    Kokkos::parallel_reduce("Abs. error and norm", Q.size(),
+                            KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){
+                                Vector_t diff = Qview(i) - QprevIterView(i);
+                                double myValError = dot(diff, diff).apply();
+                                valLError += myValError;
+                                double myValnorm = dot(Qview(i), Qview(i)).apply();
+                                valLnorm += myValnorm;
+                            }, Kokkos::Sum<double>(localError), Kokkos::Sum<double>(localNorm));
+
+    Kokkos::fence();
+    double globalError = 0.0;
+    MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, spaceComm);
+    double globalNorm = 0.0;
+    MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, spaceComm);
+
+    double relError = std::sqrt(globalError) / std::sqrt(globalNorm);
+    
+    return relError;
+
+}
+
+const char* TestName = "LandauDampingPinT";
+
+int main(int argc, char *argv[]){
+    Ippl ippl(argc, argv);
+    
+    int spaceColor, timeColor;
+    MPI_Comm spaceComm, timeComm;
+
+    int spaceProcs = std::atoi(argv[15]);
+    int timeProcs = std::atoi(argv[16]);
+    spaceColor = Ippl::Comm->rank() / spaceProcs; 
+    timeColor = Ippl::Comm->rank() % spaceProcs;
+
+    MPI_Comm_split(Ippl::getComm(), spaceColor, Ippl::Comm->rank(), &spaceComm);
+    MPI_Comm_split(Ippl::getComm(), timeColor, Ippl::Comm->rank(), &timeComm);
+
+    int rankSpace, sizeSpace, rankTime, sizeTime;
+    MPI_Comm_rank(spaceComm, &rankSpace);
+    MPI_Comm_size(spaceComm, &sizeSpace);
+
+    MPI_Comm_rank(timeComm, &rankTime);
+    MPI_Comm_size(timeComm, &sizeTime);
+    
+    Inform msg(TestName, Ippl::Comm->size()-1);
+    Inform msg2all(TestName,INFORM_ALL_NODES);
+
+    ippl::Vector<int,Dim> nmPIF = {
+        std::atoi(argv[1]),
+        std::atoi(argv[2]),
+        std::atoi(argv[3])
+    };
+
+    ippl::Vector<int,Dim> nrPIC = {
+        std::atoi(argv[4]),
+        std::atoi(argv[5]),
+        std::atoi(argv[6])
+    };
+
+    static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("mainTimer");
+    static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation");
+    static IpplTimings::TimerRef timeCommunication = IpplTimings::getTimer("timeCommunication");
+    static IpplTimings::TimerRef deepCopy = IpplTimings::getTimer("deepCopy");
+    static IpplTimings::TimerRef finePropagator = IpplTimings::getTimer("finePropagator");
+    static IpplTimings::TimerRef coarsePropagator = IpplTimings::getTimer("coarsePropagator");
+    static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData");
+    static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors");
+    static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF");
+
+    IpplTimings::startTimer(mainTimer);
+
+    const size_type totalP = std::atoll(argv[7]);
+    const double tEnd = std::atof(argv[8]);
+    const unsigned int nCycles = std::atoi(argv[12]);
+    double tEndCycle = tEnd / nCycles;
+    const double dtSlice = tEndCycle / sizeTime;
+    const double dtFine = std::atof(argv[9]);
+    const double dtCoarse = std::atof(argv[10]);
+    const unsigned int ntFine = std::ceil(dtSlice / dtFine);
+    const unsigned int ntCoarse = std::ceil(dtSlice / dtCoarse);
+    const double tol = std::atof(argv[11]);
+
+
+    using bunch_type = ChargedParticlesPinT<PLayout_t>;
+    using states_type = StatesSlice<PLayout_t>;
+
+    std::unique_ptr<bunch_type>  Pcoarse;
+    std::unique_ptr<states_type>  Pbegin;
+    std::unique_ptr<states_type>  Pend;
+
+    ippl::NDIndex<Dim> domainPIC;
+    ippl::NDIndex<Dim> domainPIF;
+    for (unsigned i = 0; i< Dim; i++) {
+        domainPIC[i] = ippl::Index(nrPIC[i]);
+        domainPIF[i] = ippl::Index(nmPIF[i]);
+    }
+
+    ippl::e_dim_tag decomp[Dim];
+    for (unsigned d = 0; d < Dim; ++d) {
+        decomp[d] = ippl::SERIAL;
+    }
+
+    // create mesh and layout objects for this problem domain
+    Vector_t kw = {0.5, 0.5, 0.5};
+    //Vector_t kw = {1.0, 1.0, 1.0};
+    Vector_t alpha = {0.05, 0.05, 0.05};
+    //Vector_t alpha = {0.5, 0.5, 0.5};
+    Vector_t rmin(0.0);
+    Vector_t rmax = 2 * pi / kw ;
+    Vector_t length = rmax - rmin;
+    double dxPIC = length[0] / nrPIC[0];
+    double dyPIC = length[1] / nrPIC[1];
+    double dzPIC = length[2] / nrPIC[2];
+
+
+    double dxPIF = length[0] / nmPIF[0];
+    double dyPIF = length[1] / nmPIF[1];
+    double dzPIF = length[2] / nmPIF[2];
+    Vector_t hrPIC = {dxPIC, dyPIC, dzPIC};
+    Vector_t hrPIF = {dxPIF, dyPIF, dzPIF};
+    Vector_t origin = {rmin[0], rmin[1], rmin[2]};
+
+    const bool isAllPeriodic=true;
+    Mesh_t meshPIC(domainPIC, hrPIC, origin);
+    Mesh_t meshPIF(domainPIF, hrPIF, origin);
+    FieldLayout_t FLPIC(domainPIC, decomp, isAllPeriodic);
+    FieldLayout_t FLPIF(domainPIF, decomp, isAllPeriodic);
+    PLayout_t PL(FLPIC, meshPIC);
+
+
+    double factor = 1.0 / sizeSpace;
+    size_type nloc = (size_type)(factor * totalP);
+
+    size_type Total_particles = 0;
+
+    MPI_Allreduce(&nloc, &Total_particles, 1,
+                MPI_UNSIGNED_LONG, MPI_SUM, spaceComm);
+
+    //Q = -\int\int f dx dv
+    double Q = -length[0] * length[1] * length[2];
+    Pcoarse = std::make_unique<bunch_type>(PL,hrPIC,rmin,rmax,decomp,Q,Total_particles);
+    Pbegin = std::make_unique<states_type>(PL);
+    Pend = std::make_unique<states_type>(PL);
+
+    Pcoarse->nr_m = nrPIC;
+    Pcoarse->nm_m = nmPIF;
+
+    Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF);
+    Pcoarse->Sk_m.initialize(meshPIF, FLPIF);
+    
+    Pcoarse->coarsetype_m = argv[19];
+   
+    if(Pcoarse->coarsetype_m == "PIC") {
+        Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC);
+        Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC);
+        Pcoarse->initFFTSolver();
+    }
+
+    ////////////////////////////////////////////////////////////
+    //Initialize an FFT object for getting rho in real space and 
+    //doing charge conservation check
+    
+    ippl::ParameterList fftParams;
+    fftParams.add("use_heffte_defaults", false);  
+    fftParams.add("use_pencils", true);  
+    fftParams.add("use_reorder", false);  
+    fftParams.add("use_gpu_aware", true);  
+    fftParams.add("comm", ippl::p2p_pl);  
+    fftParams.add("r2c_direction", 0);  
+
+    ippl::NDIndex<Dim> domainPIFhalf;
+
+    for(unsigned d = 0; d < Dim; ++d) {
+        if(fftParams.template get<int>("r2c_direction") == (int)d)
+            domainPIFhalf[d] = ippl::Index(domainPIF[d].length()/2 + 1);
+        else
+            domainPIFhalf[d] = ippl::Index(domainPIF[d].length());
+    }
+    
+
+    FieldLayout_t FLPIFhalf(domainPIFhalf, decomp);
+
+    ippl::Vector<double, 3> hDummy = {1.0, 1.0, 1.0};
+    ippl::Vector<double, 3> originDummy = {0.0, 0.0, 0.0};
+    Mesh_t meshPIFhalf(domainPIFhalf, hDummy, originDummy);
+
+    Pcoarse->rhoPIFreal_m.initialize(meshPIF, FLPIF);
+    Pcoarse->rhoPIFhalf_m.initialize(meshPIFhalf, FLPIFhalf);
+
+    Pcoarse->fft_mp = std::make_shared<FFT_t>(FLPIF, FLPIFhalf, fftParams);
+   
+    ////////////////////////////////////////////////////////////
+
+    
+    Vector_t minU, maxU;
+    for (unsigned d = 0; d <Dim; ++d) {
+        minU[d] = CDF(rmin[d], alpha[d], kw[d]);
+        maxU[d]   = CDF(rmax[d], alpha[d], kw[d]);
+    }
+
+    using buffer_type = ippl::Communicate::buffer_type;
+    int tag;
+
+    Pcoarse->shapetype_m = argv[13];
+    Pcoarse->shapedegree_m = std::atoi(argv[14]); 
+    IpplTimings::startTimer(initializeShapeFunctionPIF);
+    Pcoarse->initializeShapeFunctionPIF();
+    IpplTimings::stopTimer(initializeShapeFunctionPIF);
+
+    IpplTimings::startTimer(particleCreation);
+
+    Pcoarse->create(nloc);
+    Pbegin->create(nloc);
+    Pend->create(nloc);
+
+    Pcoarse->q = Pcoarse->Q_m/Total_particles;
+   
+    IpplTimings::stopTimer(particleCreation);
+    
+    double coarseTol = std::atof(argv[17]);
+    double fineTol   = std::atof(argv[18]);
+    Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol);
+    std::string coarse = "Coarse";
+    std::string fine = "Fine";
+
+    IpplTimings::startTimer(particleCreation);
+
+#ifdef KOKKOS_ENABLE_CUDA
+    //If we don't do the following even with the same seed the initial 
+    //condition is not the same on different GPUs
+
+    
+    IpplTimings::startTimer(timeCommunication);
+    //For some reason using the next_tag with multiple cycles is not 
+    //working so we use static tags here
+    tag = 500;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE);
+
+    if(rankTime == 0) {
+        Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace));
+        //Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(79 + 100*rankSpace));
+        Kokkos::parallel_for(nloc,
+                             generate_random<Vector_t, Kokkos::Random_XorShift64_Pool<>, Dim>(
+                             Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, alpha, kw, minU, maxU));
+
+
+        Kokkos::fence();
+    }
+    else {
+        size_type bufSize = Pbegin->packedSize(nloc);
+        buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize);
+        Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm);
+        buf->resetReadPos();
+    }
+    IpplTimings::stopTimer(timeCommunication);
+    
+    IpplTimings::startTimer(deepCopy);
+    Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView());
+    Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView());
+    Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView());
+    Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView());
+    IpplTimings::stopTimer(deepCopy);
+
+    IpplTimings::startTimer(coarsePropagator);
+    if(Pcoarse->coarsetype_m == "PIC") {
+        Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm);
+    }
+    else {
+        //PIF with coarse tolerance as coarse propagator
+        Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm);
+    }
+    IpplTimings::stopTimer(coarsePropagator);
+
+    IpplTimings::startTimer(deepCopy);
+    Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView());
+    Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView());
+    IpplTimings::stopTimer(deepCopy);
+
+    IpplTimings::startTimer(timeCommunication);
+    if(rankTime < sizeTime-1) {
+        size_type bufSize = Pend->packedSize(nloc);
+        buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize);
+        MPI_Request request;
+        Ippl::Comm->isend(rankTime+1, tag, *Pend, *buf, request, nloc, timeComm);
+        buf->resetWritePos();
+        MPI_Wait(&request, MPI_STATUS_IGNORE);
+    }
+    IpplTimings::stopTimer(timeCommunication);
+#else
+    //Note the CPU version has not been tested.
+    Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0));
+    Kokkos::parallel_for(nloc,
+                         generate_random<Vector_t, Kokkos::Random_XorShift64_Pool<>, Dim>(
+                         Pcoarse->R.getView(), Pcoarse->P.getView(), rand_pool64, alpha, kw, minU, maxU));
+
+    Kokkos::fence();
+    Ippl::Comm->barrier();
+#endif
+
+    msg << "Parareal "
+        << TestName
+        << endl
+        << "Slice dT: " << dtSlice
+        << endl
+        << "No. of fine time steps: " << ntFine 
+        << endl
+        << "No. of coarse time steps: " << ntCoarse
+        << endl
+        << "Tolerance: " << tol
+        << " No. of cycles: " << nCycles
+        << endl
+        << "Np= " << Total_particles 
+        << " Fourier modes = " << nmPIF
+        << " Grid points = " << nrPIC
+        << endl;
+
+
+    IpplTimings::stopTimer(particleCreation);                                                    
+    
+    msg << "particles created and initial conditions assigned " << endl;
+
+    int sign = 1;
+    for (unsigned int nc=0; nc < nCycles; nc++) {
+        double tStartMySlice; 
+        bool sendCriteria, recvCriteria;
+        bool isConverged = false;
+        bool isPreviousDomainConverged = false;
+        
+        //even cycles
+        if(nc % 2 == 0) {
+            sendCriteria = (rankTime < (sizeTime-1));
+            recvCriteria = (rankTime > 0);
+            if(rankTime == 0) {
+                isPreviousDomainConverged = true;
+            }
+            tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice);
+            msg.setPrintNode(Ippl::Comm->size()-1);
+        }
+        //odd cycles
+        else {
+            recvCriteria = (rankTime < (sizeTime-1));
+            sendCriteria = (rankTime > 0);
+            if(rankTime == (sizeTime - 1)) {
+                isPreviousDomainConverged = true;
+            }
+            tStartMySlice = (nc * tEndCycle) + (((sizeTime - 1) - rankTime) * dtSlice);
+            msg.setPrintNode(0);
+        }
+        
+        unsigned int it = 0;
+        while (!isConverged) { 
+            //Run fine integrator in parallel
+            IpplTimings::startTimer(finePropagator);
+            Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, rankTime, rankSpace, fine, spaceComm);
+            IpplTimings::stopTimer(finePropagator);
+    
+
+            //Difference = Fine - Coarse
+            Pend->R = Pbegin->R - Pcoarse->R;
+            Pend->P = Pbegin->P - Pcoarse->P;
+
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView());
+            Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+            
+            IpplTimings::startTimer(timeCommunication);
+            tag = 1100;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE);
+            int tagbool = 1300;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE);
+            
+            if(recvCriteria && (!isPreviousDomainConverged)) {
+                size_type bufSize = Pbegin->packedSize(nloc);
+                buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize);
+                Ippl::Comm->recv(rankTime-sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm);
+                buf->resetReadPos();
+                MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, rankTime-sign, tagbool, 
+                        timeComm, MPI_STATUS_IGNORE);
+                IpplTimings::startTimer(deepCopy);
+                Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView());
+                Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView());
+                IpplTimings::stopTimer(deepCopy);
+            }
+            IpplTimings::stopTimer(timeCommunication);
+
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView());
+            Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView());
+            Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView());
+            Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+
+            IpplTimings::startTimer(coarsePropagator);
+            if(Pcoarse->coarsetype_m == "PIC") {
+                Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm);
+            }
+            else {
+                Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm);
+            }
+            IpplTimings::stopTimer(coarsePropagator);
+
+            Pend->R = Pend->R + Pcoarse->R;
+            Pend->P = Pend->P + Pcoarse->P;
+
+            PL.applyBC(Pend->R, PL.getRegionLayout().getDomain());
+            
+            IpplTimings::startTimer(computeErrors);
+            double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, length, spaceComm);
+            double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, spaceComm);
+            IpplTimings::stopTimer(computeErrors);
+
+            
+            if((Rerror <= tol) && (Perror <= tol) && isPreviousDomainConverged) {
+                isConverged = true;
+            }
+            
+            IpplTimings::startTimer(timeCommunication);
+            if(sendCriteria) {
+                size_type bufSize = Pend->packedSize(nloc);
+                buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize);
+                MPI_Request request;
+                Ippl::Comm->isend(rankTime+sign, tag, *Pend, *buf, request, nloc, timeComm);
+                buf->resetWritePos();
+                MPI_Wait(&request, MPI_STATUS_IGNORE);
+                MPI_Send(&isConverged, 1, MPI_C_BOOL, rankTime+sign, tagbool, timeComm);
+            }
+            IpplTimings::stopTimer(timeCommunication);
+            
+            
+            msg << "Finished iteration: " << it+1 
+                << " in cycle: " << nc+1
+                << " Rerror: " << Rerror 
+                << " Perror: " << Perror
+                << endl;
+
+            IpplTimings::startTimer(dumpData);
+            Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace);
+            IpplTimings::stopTimer(dumpData);
+
+            MPI_Barrier(spaceComm);
+            
+            it += 1;
+        }
+        
+        MPI_Barrier(MPI_COMM_WORLD);
+        if((nCycles > 1) && (nc < (nCycles - 1))) {  
+            tag = 1000;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE);
+           
+            //send, receive criteria and tStartMySlice are reversed at the end of the cycle
+            if(nc % 2 == 0) {
+                recvCriteria = (rankTime < (sizeTime-1));
+                sendCriteria = (rankTime > 0);
+                tStartMySlice = (nc * tEndCycle) + (((sizeTime - 1) - rankTime) * dtSlice);
+            }
+            //odd cycles
+            else {
+                sendCriteria = (rankTime < (sizeTime-1));
+                recvCriteria = (rankTime > 0);
+                tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice);
+            }
+
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pbegin->R.getView(), Pend->R.getView());
+            Kokkos::deep_copy(Pbegin->P.getView(), Pend->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+
+
+            IpplTimings::startTimer(timeCommunication);
+            if(recvCriteria) {
+                size_type bufSize = Pbegin->packedSize(nloc);
+                buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize);
+                Ippl::Comm->recv(rankTime+sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm);
+                buf->resetReadPos();
+            }
+            IpplTimings::stopTimer(timeCommunication);
+
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView());
+            Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView());
+            Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView());
+            Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+            
+            IpplTimings::startTimer(coarsePropagator);
+            if(Pcoarse->coarsetype_m == "PIC") {
+                Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm);
+            }
+            else {
+                Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); 
+            }
+            IpplTimings::stopTimer(coarsePropagator);
+            
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView());
+            Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+
+
+            IpplTimings::startTimer(timeCommunication);
+            if(sendCriteria) {
+                size_type bufSize = Pend->packedSize(nloc);
+                buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize);
+                MPI_Request request;
+                Ippl::Comm->isend(rankTime-sign, tag, *Pend, *buf, request, nloc, timeComm);
+                buf->resetWritePos();
+                MPI_Wait(&request, MPI_STATUS_IGNORE);
+            }
+            IpplTimings::stopTimer(timeCommunication);
+            sign *= -1;
+        }
+    }
+
+    msg << TestName << " Parareal: End." << endl;
+    IpplTimings::stopTimer(mainTimer);
+    IpplTimings::print();
+    IpplTimings::print(std::string("timing.dat"));
+
+    MPI_Comm_free(&spaceComm);
+    MPI_Comm_free(&timeComm);
+
+
+    return 0;
+}
diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp
new file mode 100644
index 000000000..270fba8c2
--- /dev/null
+++ b/alpine/PinT/PenningTrapPinT.cpp
@@ -0,0 +1,717 @@
+// Parallel-in-time (PinT) method Parareal combined with Particle-in-cell
+// and Particle-in-Fourier schemes. The example is electrostatic Landau 
+// damping. The implementation of Parareal follows the open source implementation
+// https://github.com/Parallel-in-Time/PararealF90 by Daniel Ruprecht. The corresponding
+// publication is Ruprecht, Daniel. "Shared memory pipelined parareal." 
+// European Conference on Parallel Processing. Springer, Cham, 2017.
+// 
+//  Usage:
+//     srun ./PenningTrapPinT <nmx> <nmy> <nmz> <nx> <ny> <nz> <Np> <Tend> <dtfine> <dtcoarse> <tolParareal> 
+//          <nCycles> <ShapeType> <degree> <No. of space procs> <No. of time procs> 
+//          <coarseTol> <fineTol> <coarseType> --info 5
+//     nmx       = No. of Fourier modes in the x-direction
+//     nmy       = No. of Fourier modes in the y-direction
+//     nmz       = No. of Fourier modes in the z-direction
+//     nx       = No. of grid points in the x-direction (not used if PIF is also used as coarse propagator)
+//     ny       = No. of grid points in the y-direction (not used if PIF is also used as coarse propagator)
+//     nz       = No. of grid points in the z-direction (not used if PIF is also used as coarse propagator)
+//     Np       = Total no. of macro-particles in the simulation
+//     tolParareal      = Parareal tolerance
+//     nCycles = No. of Parareal blocks/cycles
+//     ShapeType = Shape function type B-spline only for the moment
+//     degree = B-spline degree (-1 for delta function)
+//     No. of space procs = Number of MPI ranks to be used in the spatial parallelization
+//     No. of time procs = Number of MPI ranks to be used in the time parallelization
+//     coarseTol = Coarse tolerance for PIF if we use PIF as a coarse propagator (will not be used when PIC is used)
+//     fineTol = fine tolerance for PIF
+//     coarseType = Type of coarse propagator (PIF or PIC)
+//     Example:
+//     srun ./PenningTrapPinT 32 32 32 16 16 16 655360 19.2 0.05 0.05 1e-5 1 B-spline 1 4 16 1e-2 1e-4 PIC --info 5
+//
+// Copyright (c) 2022, Sriramkrishnan Muralikrishnan,
+// Jülich Supercomputing Centre, Jülich, Germany.
+// All rights reserved
+//
+// This file is part of IPPL.
+//
+// IPPL is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// You should have received a copy of the GNU General Public License
+// along with IPPL. If not, see <https://www.gnu.org/licenses/>.
+//
+
+#include "ChargedParticlesPinT.hpp"
+#include "StatesSlice.hpp"
+#include <string>
+#include <vector>
+#include <iostream>
+#include <cmath>
+#include <set>
+#include <chrono>
+
+#include<Kokkos_Random.hpp>
+
+#include <random>
+#include "Utility/IpplTimings.h"
+
+template <typename T>
+struct Newton1D {
+
+  double tol = 1e-12;
+  int max_iter = 20;
+  double pi = std::acos(-1.0);
+  
+  T mu, sigma, u;
+
+  KOKKOS_INLINE_FUNCTION
+  Newton1D() {}
+
+  KOKKOS_INLINE_FUNCTION
+  Newton1D(const T& mu_, const T& sigma_, 
+           const T& u_) 
+  : mu(mu_), sigma(sigma_), u(u_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~Newton1D() {}
+
+  KOKKOS_INLINE_FUNCTION
+  T f(T& x) {
+      T F;
+      F = std::erf((x - mu)/(sigma * std::sqrt(2.0))) 
+          - 2 * u + 1;
+      return F;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  T fprime(T& x) {
+      T Fprime;
+      Fprime = (1 / sigma) * std::sqrt(2 / pi) * 
+               std::exp(-0.5 * (std::pow(((x - mu) / sigma),2)));
+      return Fprime;
+  }
+
+  KOKKOS_FUNCTION
+  void solve(T& x) {
+      int iterations = 0;
+      while ((iterations < max_iter) && (std::fabs(f(x)) > tol)) {
+          x = x - (f(x)/fprime(x));
+          iterations += 1;
+      }
+  }
+};
+
+
+template <typename T, class GeneratorPool, unsigned Dim>
+struct generate_random {
+
+  using view_type = typename ippl::detail::ViewType<T, 1>::view_type;
+  using value_type  = typename T::value_type;
+  // Output View for the random numbers
+  view_type x, v;
+
+  // The GeneratorPool
+  GeneratorPool rand_pool;
+
+  T mu, sigma, minU, maxU;
+
+  double pi = std::acos(-1.0);
+
+  // Initialize all members
+  generate_random(view_type x_, view_type v_, GeneratorPool rand_pool_,
+                  T& mu_, T& sigma_, T& minU_, T& maxU_)
+      : x(x_), v(v_), rand_pool(rand_pool_), 
+        mu(mu_), sigma(sigma_), minU(minU_), maxU(maxU_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t i) const {
+    // Get a random number state from the pool for the active thread
+    typename GeneratorPool::generator_type rand_gen = rand_pool.get_state();
+
+    value_type u;
+    for (unsigned d = 0; d < Dim; ++d) {
+        u = rand_gen.drand(minU[d], maxU[d]);
+        x(i)[d] = (std::sqrt(pi / 2) * (2 * u - 1)) * 
+                  sigma[d] + mu[d];
+        Newton1D<value_type> solver(mu[d], sigma[d], u);
+        solver.solve(x(i)[d]);
+        v(i)[d] = rand_gen.normal(0.0, 1.0);
+    }
+
+    // Give the state back, which will allow another thread to acquire it
+    rand_pool.free_state(rand_gen);
+  }
+};
+
+double CDF(const double& x, const double& mu, const double& sigma) {
+   double cdf = 0.5 * (1.0 + std::erf((x - mu)/(sigma * std::sqrt(2))));
+   return cdf;
+}
+
+double computeRL2Error(ParticleAttrib<Vector_t>& Q, ParticleAttrib<Vector_t>& QprevIter, 
+                      Vector_t& length, MPI_Comm& spaceComm) {
+    
+    auto Qview = Q.getView();
+    auto QprevIterView = QprevIter.getView();
+    double localError = 0.0;
+    double localNorm = 0.0;
+
+    Kokkos::parallel_reduce("Abs. error and norm", Q.size(),
+                            KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){
+                                Vector_t diff = Qview(i) - QprevIterView(i);
+
+                                //This is just to undo the effect of periodic BCs during the 
+                                //error calculation. Otherwise even though the actual error is 
+                                //small the computed error might be very large. 
+                                //The values (e.g. 22) mentioned here are just an adhoc
+                                //value depending on the domain length. 
+                                for (unsigned d = 0; d < 3; ++d) {
+                                    bool isLeft = (diff[d] <= -22.0);
+                                    bool isRight = (diff[d] >= 22.0);
+                                    bool isInside = ((diff[d] > -22.0) && (diff[d] < 22.0));
+                                    diff[d] = (isInside * diff[d]) + (isLeft * (diff[d] + length[d]))
+                                              +(isRight * (diff[d] - length[d]));
+                                }
+
+                                double myValError = dot(diff, diff).apply();
+                                valLError += myValError;
+                                double myValnorm = dot(Qview(i), Qview(i)).apply();
+                                valLnorm += myValnorm;
+                            }, Kokkos::Sum<double>(localError), Kokkos::Sum<double>(localNorm));
+
+    Kokkos::fence();
+    double globalError = 0.0;
+    MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, spaceComm);
+    double globalNorm = 0.0;
+    MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, spaceComm);
+
+    double relError = std::sqrt(globalError) / std::sqrt(globalNorm);
+    
+    return relError;
+
+}
+
+double computePL2Error(ParticleAttrib<Vector_t>& Q, ParticleAttrib<Vector_t>& QprevIter, MPI_Comm& spaceComm) {
+    
+    auto Qview = Q.getView();
+    auto QprevIterView = QprevIter.getView();
+    double localError = 0.0;
+    double localNorm = 0.0;
+
+    Kokkos::parallel_reduce("Abs. error and norm", Q.size(),
+                            KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){
+                                Vector_t diff = Qview(i) - QprevIterView(i);
+                                double myValError = dot(diff, diff).apply();
+                                valLError += myValError;
+                                double myValnorm = dot(Qview(i), Qview(i)).apply();
+                                valLnorm += myValnorm;
+                            }, Kokkos::Sum<double>(localError), Kokkos::Sum<double>(localNorm));
+
+    Kokkos::fence();
+    double globalError = 0.0;
+    MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, spaceComm);
+    double globalNorm = 0.0;
+    MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, spaceComm);
+
+    double relError = std::sqrt(globalError) / std::sqrt(globalNorm);
+    
+    return relError;
+
+}
+
+const char* TestName = "PenningTrapPinT";
+
+int main(int argc, char *argv[]){
+    Ippl ippl(argc, argv);
+
+    int spaceColor, timeColor;
+    MPI_Comm spaceComm, timeComm;
+
+    int spaceProcs = std::atoi(argv[15]);
+    int timeProcs = std::atoi(argv[16]);
+    spaceColor = Ippl::Comm->rank() / spaceProcs; 
+    timeColor = Ippl::Comm->rank() % spaceProcs;
+
+    MPI_Comm_split(Ippl::getComm(), spaceColor, Ippl::Comm->rank(), &spaceComm);
+    MPI_Comm_split(Ippl::getComm(), timeColor, Ippl::Comm->rank(), &timeComm);
+
+    int rankSpace, sizeSpace, rankTime, sizeTime;
+    MPI_Comm_rank(spaceComm, &rankSpace);
+    MPI_Comm_size(spaceComm, &sizeSpace);
+
+    MPI_Comm_rank(timeComm, &rankTime);
+    MPI_Comm_size(timeComm, &sizeTime);
+
+    Inform msg(TestName, Ippl::Comm->size()-1);
+    Inform msg2all(TestName,INFORM_ALL_NODES);
+
+    ippl::Vector<int,Dim> nmPIF = {
+        std::atoi(argv[1]),
+        std::atoi(argv[2]),
+        std::atoi(argv[3])
+    };
+
+    ippl::Vector<int,Dim> nrPIC = {
+        std::atoi(argv[4]),
+        std::atoi(argv[5]),
+        std::atoi(argv[6])
+    };
+
+    static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("mainTimer");
+    static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation");
+    static IpplTimings::TimerRef timeCommunication = IpplTimings::getTimer("timeCommunication");
+    static IpplTimings::TimerRef deepCopy = IpplTimings::getTimer("deepCopy");
+    static IpplTimings::TimerRef finePropagator = IpplTimings::getTimer("finePropagator");
+    static IpplTimings::TimerRef coarsePropagator = IpplTimings::getTimer("coarsePropagator");
+    static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData");
+    static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors");
+    static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF");
+
+    IpplTimings::startTimer(mainTimer);
+
+    const size_type totalP = std::atoll(argv[7]);
+    const double tEnd = std::atof(argv[8]);
+    const unsigned int nCycles = std::atoi(argv[12]);
+    double tEndCycle = tEnd / nCycles;
+    const double dtSlice = tEndCycle / sizeTime;
+    const double dtFine = std::atof(argv[9]);
+    const double dtCoarse = std::atof(argv[10]);
+    const unsigned int ntFine = std::ceil(dtSlice / dtFine);
+    const unsigned int ntCoarse = std::ceil(dtSlice / dtCoarse);
+    const double tol = std::atof(argv[11]);
+
+    using bunch_type = ChargedParticlesPinT<PLayout_t>;
+    using states_type = StatesSlice<PLayout_t>;
+
+    std::unique_ptr<bunch_type>  Pcoarse;
+    std::unique_ptr<states_type>  Pbegin;
+    std::unique_ptr<states_type>  Pend;
+
+    ippl::NDIndex<Dim> domainPIC;
+    ippl::NDIndex<Dim> domainPIF;
+    for (unsigned i = 0; i< Dim; i++) {
+        domainPIC[i] = ippl::Index(nrPIC[i]);
+        domainPIF[i] = ippl::Index(nmPIF[i]);
+    }
+
+    ippl::e_dim_tag decomp[Dim];
+    for (unsigned d = 0; d < Dim; ++d) {
+        decomp[d] = ippl::SERIAL;
+    }
+
+    // create mesh and layout objects for this problem domain
+    Vector_t rmin(0.0);
+    Vector_t rmax(25.0);
+    Vector_t length = rmax - rmin;
+    double dxPIC = length[0] / nrPIC[0];
+    double dyPIC = length[1] / nrPIC[1];
+    double dzPIC = length[2] / nrPIC[2];
+
+    Vector_t mu, sd;
+
+    for (unsigned d = 0; d<Dim; d++) {
+        mu[d] = 0.5 * length[d];
+    }
+    sd[0] = 0.10*20.0;//length[0];
+    sd[1] = 0.05*20.0;//length[1];
+    sd[2] = 0.15*20.0;//length[2];
+
+    double dxPIF = length[0] / nmPIF[0];
+    double dyPIF = length[1] / nmPIF[1];
+    double dzPIF = length[2] / nmPIF[2];
+    Vector_t hrPIC = {dxPIC, dyPIC, dzPIC};
+    Vector_t hrPIF = {dxPIF, dyPIF, dzPIF};
+    Vector_t origin = {rmin[0], rmin[1], rmin[2]};
+
+    const bool isAllPeriodic=true;
+    Mesh_t meshPIC(domainPIC, hrPIC, origin);
+    Mesh_t meshPIF(domainPIF, hrPIF, origin);
+    FieldLayout_t FLPIC(domainPIC, decomp, isAllPeriodic);
+    FieldLayout_t FLPIF(domainPIF, decomp, isAllPeriodic);
+    PLayout_t PL(FLPIC, meshPIC);
+
+    size_type nloc = (size_type)(totalP / sizeSpace);
+
+    size_type Total_particles = 0;
+
+    MPI_Allreduce(&nloc, &Total_particles, 1,
+                MPI_UNSIGNED_LONG, MPI_SUM, spaceComm);
+
+    double Q = -1562.5;
+    double Bext = 5.0;
+    Pcoarse = std::make_unique<bunch_type>(PL,hrPIC,rmin,rmax,decomp,Q,Total_particles);
+    Pbegin = std::make_unique<states_type>(PL);
+    Pend = std::make_unique<states_type>(PL);
+
+    Pcoarse->nr_m = nrPIC;
+    Pcoarse->nm_m = nmPIF;
+
+    Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF);
+    Pcoarse->Sk_m.initialize(meshPIF, FLPIF);
+
+    Pcoarse->coarsetype_m = argv[19];
+
+    if(Pcoarse->coarsetype_m == "PIC") {
+        Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC);
+        Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC);
+        Pcoarse->initFFTSolver();
+    }
+
+    ////////////////////////////////////////////////////////////
+    //Initialize an FFT object for getting rho in real space and 
+    //doing charge conservation check
+    
+    ippl::ParameterList fftParams;
+    fftParams.add("use_heffte_defaults", false);  
+    fftParams.add("use_pencils", true);  
+    fftParams.add("use_reorder", false);  
+    fftParams.add("use_gpu_aware", true);  
+    fftParams.add("comm", ippl::p2p_pl);  
+    fftParams.add("r2c_direction", 0);  
+
+    ippl::NDIndex<Dim> domainPIFhalf;
+
+    for(unsigned d = 0; d < Dim; ++d) {
+        if(fftParams.template get<int>("r2c_direction") == (int)d)
+            domainPIFhalf[d] = ippl::Index(domainPIF[d].length()/2 + 1);
+        else
+            domainPIFhalf[d] = ippl::Index(domainPIF[d].length());
+    }
+    
+
+    FieldLayout_t FLPIFhalf(domainPIFhalf, decomp);
+
+    ippl::Vector<double, 3> hDummy = {1.0, 1.0, 1.0};
+    ippl::Vector<double, 3> originDummy = {0.0, 0.0, 0.0};
+    Mesh_t meshPIFhalf(domainPIFhalf, hDummy, originDummy);
+
+    Pcoarse->rhoPIFreal_m.initialize(meshPIF, FLPIF);
+    Pcoarse->rhoPIFhalf_m.initialize(meshPIFhalf, FLPIFhalf);
+
+    Pcoarse->fft_mp = std::make_shared<FFT_t>(FLPIF, FLPIFhalf, fftParams);
+   
+    ////////////////////////////////////////////////////////////
+    
+    Vector_t minU, maxU;
+    for (unsigned d = 0; d <Dim; ++d) {
+        minU[d] = CDF(rmin[d], mu[d], sd[d]);
+        maxU[d] = CDF(rmax[d], mu[d], sd[d]);
+    }
+
+
+    using buffer_type = ippl::Communicate::buffer_type;
+    int tag;
+
+    Pcoarse->shapetype_m = argv[13];
+    Pcoarse->shapedegree_m = std::atoi(argv[14]); 
+    IpplTimings::startTimer(initializeShapeFunctionPIF);
+    Pcoarse->initializeShapeFunctionPIF();
+    IpplTimings::stopTimer(initializeShapeFunctionPIF);
+    
+    IpplTimings::startTimer(particleCreation);
+
+    Pcoarse->create(nloc);
+    Pbegin->create(nloc);
+    Pend->create(nloc);
+
+    Pcoarse->q = Pcoarse->Q_m/Total_particles;
+   
+    IpplTimings::stopTimer(particleCreation);
+    
+    double coarseTol = std::atof(argv[17]);
+    double fineTol   = std::atof(argv[18]);
+    Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol);
+    std::string coarse = "Coarse";
+    std::string fine = "Fine";
+
+    IpplTimings::startTimer(particleCreation);
+#ifdef KOKKOS_ENABLE_CUDA
+    //If we don't do the following even with the same seed the initial 
+    //condition is not the same on different GPUs
+    
+    IpplTimings::startTimer(timeCommunication);
+    //For some reason using the next_tag with multiple cycles is not 
+    //working so we use static tags here
+    tag = 500;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE);
+    
+    if(rankTime == 0) {
+        Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace));
+        Kokkos::parallel_for(nloc,
+                             generate_random<Vector_t, Kokkos::Random_XorShift64_Pool<>, Dim>(
+                             Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, mu, sd, 
+                             minU, maxU));
+
+
+        Kokkos::fence();
+    }
+    else {
+        size_type bufSize = Pbegin->packedSize(nloc);
+        buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize);
+        Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm);
+        buf->resetReadPos();
+    }
+    IpplTimings::stopTimer(timeCommunication);
+
+    IpplTimings::startTimer(deepCopy);
+    Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView());
+    Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView());
+    Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView());
+    Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView());
+    IpplTimings::stopTimer(deepCopy);
+
+    IpplTimings::startTimer(coarsePropagator);
+    if(Pcoarse->coarsetype_m == "PIC") {
+        Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm);
+    }
+    else {
+        Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, Bext, 0, 0, coarse, spaceComm); 
+    }
+    IpplTimings::stopTimer(coarsePropagator);
+    
+    IpplTimings::startTimer(deepCopy);
+    Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView());
+    Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView());
+    IpplTimings::stopTimer(deepCopy);
+    
+    IpplTimings::startTimer(timeCommunication);
+    if(rankTime < sizeTime-1) {
+        size_type bufSize = Pend->packedSize(nloc);
+        buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize);
+        MPI_Request request;
+        Ippl::Comm->isend(rankTime+1, tag, *Pend, *buf, request, nloc, timeComm);
+        buf->resetWritePos();
+        MPI_Wait(&request, MPI_STATUS_IGNORE);
+    }
+    IpplTimings::stopTimer(timeCommunication);
+#else
+    //Note the CPU version has not been tested.
+    Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0));
+    Kokkos::parallel_for(nloc,
+                         generate_random<Vector_t, Kokkos::Random_XorShift64_Pool<>, Dim>(
+                         Pcoarse->R.getView(), Pcoarse->P.getView(), rand_pool64, mu, sd, 
+                         minU, maxU));
+
+    Kokkos::fence();
+    Ippl::Comm->barrier();
+#endif
+
+    msg << "Parareal "
+        << TestName
+        << endl
+        << "Slice dT: " << dtSlice
+        << endl
+        << "No. of fine time steps: " << ntFine 
+        << endl
+        << "No. of coarse time steps: " << ntCoarse
+        << endl
+        << "Tolerance: " << tol
+        << " No. of cycles: " << nCycles
+        << endl
+        << "Np= " << Total_particles 
+        << " Fourier modes = " << nmPIF
+        << " Grid points = " << nrPIC
+        << endl;
+    
+    IpplTimings::stopTimer(particleCreation);                                                    
+    
+    msg << "particles created and initial conditions assigned " << endl;
+    
+    int sign = 1;
+    for (unsigned int nc=0; nc < nCycles; nc++) {
+        double tStartMySlice; 
+        bool sendCriteria, recvCriteria;
+        bool isConverged = false;
+        bool isPreviousDomainConverged = false;
+        
+        //even cycles
+        if(nc % 2 == 0) {
+            sendCriteria = (rankTime < (sizeTime-1));
+            recvCriteria = (rankTime > 0);
+            if(rankTime == 0) {
+                isPreviousDomainConverged = true;
+            }
+            tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice);
+            msg.setPrintNode(Ippl::Comm->size()-1);
+        }
+        //odd cycles
+        else {
+            recvCriteria = (rankTime < (sizeTime-1));
+            sendCriteria = (rankTime > 0);
+            if(rankTime == (sizeTime - 1)) {
+                isPreviousDomainConverged = true;
+            }
+            tStartMySlice = (nc * tEndCycle) + (((sizeTime - 1) - rankTime) * dtSlice);
+            msg.setPrintNode(0);
+        }
+        
+        unsigned int it = 0;
+        while (!isConverged) { 
+            //Run fine integrator in parallel
+            IpplTimings::startTimer(finePropagator);
+            Pcoarse->BorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, 
+                              Bext, rankTime, rankSpace, fine, spaceComm);
+            IpplTimings::stopTimer(finePropagator);
+        
+
+            //Difference = Fine - Coarse
+            Pend->R = Pbegin->R - Pcoarse->R;
+            Pend->P = Pbegin->P - Pcoarse->P;
+
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView());
+            Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+            
+            IpplTimings::startTimer(timeCommunication);
+            tag = 1100;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE);
+            int tagbool = 1300;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE);
+            
+            if(recvCriteria && (!isPreviousDomainConverged)) {
+                size_type bufSize = Pbegin->packedSize(nloc);
+                buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize);
+                Ippl::Comm->recv(rankTime-sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm);
+                buf->resetReadPos();
+                MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, rankTime-sign, tagbool, 
+                        timeComm, MPI_STATUS_IGNORE);
+                IpplTimings::startTimer(deepCopy);
+                Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView());
+                Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView());
+                IpplTimings::stopTimer(deepCopy);
+            }
+            IpplTimings::stopTimer(timeCommunication);
+
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView());
+            Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView());
+            Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView());
+            Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+
+            IpplTimings::startTimer(coarsePropagator);
+            if(Pcoarse->coarsetype_m == "PIC") {
+                Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm);
+            }
+            else {
+                Pcoarse->BorisPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm); 
+            }
+            IpplTimings::stopTimer(coarsePropagator);
+
+            Pend->R = Pend->R + Pcoarse->R;
+            Pend->P = Pend->P + Pcoarse->P;
+
+            PL.applyBC(Pend->R, PL.getRegionLayout().getDomain());
+            IpplTimings::startTimer(computeErrors);
+            double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, length, spaceComm);
+            double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, spaceComm);
+            IpplTimings::stopTimer(computeErrors);
+
+            if((Rerror <= tol) && (Perror <= tol) && isPreviousDomainConverged) {
+                isConverged = true;
+            }
+
+            IpplTimings::startTimer(timeCommunication);
+            if(sendCriteria) {
+                size_type bufSize = Pend->packedSize(nloc);
+                buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize);
+                MPI_Request request;
+                Ippl::Comm->isend(rankTime+sign, tag, *Pend, *buf, request, nloc, timeComm);
+                buf->resetWritePos();
+                MPI_Wait(&request, MPI_STATUS_IGNORE);
+                MPI_Send(&isConverged, 1, MPI_C_BOOL, rankTime+sign, tagbool, timeComm);
+            }
+            IpplTimings::stopTimer(timeCommunication);
+
+            
+            msg << "Finished iteration: " << it+1
+                << " in cycle: " << nc+1
+                << " Rerror: " << Rerror 
+                << " Perror: " << Perror
+                << endl;
+
+            IpplTimings::startTimer(dumpData);
+            Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace);
+            IpplTimings::stopTimer(dumpData);
+
+            MPI_Barrier(spaceComm);
+            
+            it += 1;
+        }
+        
+        MPI_Barrier(MPI_COMM_WORLD);
+        if((nCycles > 1) && (nc < (nCycles - 1))) {  
+            tag = 1000;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE);
+           
+            //send, receive criteria and tStartMySlice are reversed at the end of the cycle
+            if(nc % 2 == 0) {
+                recvCriteria = (rankTime < (sizeTime-1));
+                sendCriteria = (rankTime > 0);
+                tStartMySlice = (nc * tEndCycle) + (((sizeTime - 1) - rankTime) * dtSlice);
+            }
+            //odd cycles
+            else {
+                sendCriteria = (rankTime < (sizeTime-1));
+                recvCriteria = (rankTime > 0);
+                tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice);
+            }
+
+
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pbegin->R.getView(), Pend->R.getView());
+            Kokkos::deep_copy(Pbegin->P.getView(), Pend->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+
+            IpplTimings::startTimer(timeCommunication);
+            if(recvCriteria) {
+                size_type bufSize = Pbegin->packedSize(nloc);
+                buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize);
+                Ippl::Comm->recv(rankTime+sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm);
+                buf->resetReadPos();
+            }
+            IpplTimings::stopTimer(timeCommunication);
+
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView());
+            Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView());
+            Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView());
+            Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+            
+            IpplTimings::startTimer(coarsePropagator);
+            if(Pcoarse->coarsetype_m == "PIC") {
+                Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm);
+            }
+            else {
+                Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm);
+            }
+            IpplTimings::stopTimer(coarsePropagator);
+            
+            IpplTimings::startTimer(deepCopy);
+            Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView());
+            Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView());
+            IpplTimings::stopTimer(deepCopy);
+
+            IpplTimings::startTimer(timeCommunication);
+            if(sendCriteria) {
+                size_type bufSize = Pend->packedSize(nloc);
+                buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize);
+                MPI_Request request;
+                Ippl::Comm->isend(rankTime-sign, tag, *Pend, *buf, request, nloc, timeComm);
+                buf->resetWritePos();
+                MPI_Wait(&request, MPI_STATUS_IGNORE);
+            }
+            IpplTimings::stopTimer(timeCommunication);
+            sign *= -1;
+        }
+    }
+    msg << TestName << " Parareal: End." << endl;
+    IpplTimings::stopTimer(mainTimer);
+    IpplTimings::print();
+    IpplTimings::print(std::string("timing.dat"));
+
+    MPI_Comm_free(&spaceComm);
+    MPI_Comm_free(&timeComm);
+
+    return 0;
+}
diff --git a/alpine/PinT/StatesSlice.hpp b/alpine/PinT/StatesSlice.hpp
new file mode 100644
index 000000000..206f8746c
--- /dev/null
+++ b/alpine/PinT/StatesSlice.hpp
@@ -0,0 +1,31 @@
+// Copyright (c) 2021 Paul Scherrer Institut, Villigen PSI, Switzerland
+// All rights reserved
+//
+// This file is part of IPPL.
+//
+// IPPL is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// You should have received a copy of the GNU General Public License
+// along with IPPL. If not, see <https://www.gnu.org/licenses/>.
+//
+
+
+template<class PLayout>
+class StatesSlice : public ippl::ParticleBase<PLayout> {
+
+public:
+    typename ippl::ParticleBase<PLayout>::particle_position_type P;
+
+    StatesSlice(PLayout& pl)
+    : ippl::ParticleBase<PLayout>(pl)
+    {
+        // register the particle attributes
+        this->addAttribute(P);
+    }
+
+    ~StatesSlice(){ }
+
+};
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ad4b1f186..bd6a2205b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -94,7 +94,14 @@ include_directories (
 
 add_library ( ippl ${IPPL_SRCS} ${IPPL_SRCS_FORT} )
 
-target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY})
+
+if (ENABLE_NUFFT)
+    target_include_directories(ippl PUBLIC ${CUFINUFFT_INCLUDE_DIR})
+    target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY} ${CUFINUFFT_LIBRARY_DIR})
+else()
+    target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY})
+endif()
+
 
 install (TARGETS ippl DESTINATION lib)
 install (FILES ${IPPL_BASEDIR_HDRS} DESTINATION include)
diff --git a/src/Communicate/Communicate.cpp b/src/Communicate/Communicate.cpp
index 78bf9bb82..63314a1ef 100644
--- a/src/Communicate/Communicate.cpp
+++ b/src/Communicate/Communicate.cpp
@@ -25,7 +25,13 @@ namespace ippl {
     Communicate::Communicate(int& argc, char**& argv, const MPI_Comm& comm)
     : comm_m(comm)
     {
-        MPI_Init(&argc, &argv);
+        int isInitialized;
+        MPI_Initialized(&isInitialized);
+        
+        if (!isInitialized) {
+            MPI_Init(&argc, &argv);
+        }
+        
         MPI_Comm_rank(comm_m, &rank_m);
         MPI_Comm_size(comm_m, &size_m);
     }
diff --git a/src/Communicate/Communicate.h b/src/Communicate/Communicate.h
index 7024982e2..423bd08b1 100644
--- a/src/Communicate/Communicate.h
+++ b/src/Communicate/Communicate.h
@@ -123,14 +123,14 @@ namespace ippl {
          */
         template <class Buffer>
         void recv(int src, int tag, Buffer& buffer, archive_type& ar,
-                  size_type msize, size_type nrecvs);
+                  size_type msize, size_type nrecvs, const MPI_Comm& comm = MPI_COMM_WORLD);
 
         /*!
          * \warning Only works with default spaces!
          */
         template <class Buffer>
         void isend(int dest, int tag, Buffer& buffer, archive_type&,
-                   MPI_Request&, size_type nsends);
+                   MPI_Request&, size_type nsends, const MPI_Comm& comm = MPI_COMM_WORLD);
 
         /*!
          * \warning Only works with default spaces!
@@ -158,7 +158,7 @@ namespace ippl {
 
     template <class Buffer>
     void Communicate::recv(int src, int tag, Buffer& buffer, archive_type& ar,
-                           size_type msize, size_type nrecvs)
+                           size_type msize, size_type nrecvs, const MPI_Comm& comm)
     {
         // Temporary fix. MPI communication seems to have problems when the
         // count argument exceeds the range of int, so large messages should
@@ -169,14 +169,15 @@ namespace ippl {
         }
         MPI_Status status;
         MPI_Recv(ar.getBuffer(), msize,
-                MPI_BYTE, src, tag, comm_m, &status);
+                MPI_BYTE, src, tag, comm, &status);
 
         buffer.deserialize(ar, nrecvs);
     }
 
     template <class Buffer>
     void Communicate::isend(int dest, int tag, Buffer& buffer,
-                            archive_type& ar, MPI_Request& request, size_type nsends)
+                            archive_type& ar, MPI_Request& request, size_type nsends, 
+                            const MPI_Comm& comm)
     {
         if (ar.getSize() > INT_MAX) {
             std::cerr << "Message size exceeds range of int" << std::endl;
@@ -184,7 +185,7 @@ namespace ippl {
         }
         buffer.serialize(ar, nsends);
         MPI_Isend(ar.getBuffer(), ar.getSize(),
-                  MPI_BYTE, dest, tag, comm_m, &request);
+                  MPI_BYTE, dest, tag, comm, &request);
     }
 }
 
diff --git a/src/Communicate/Tags.h b/src/Communicate/Tags.h
index 8d6db8bcd..1e07ed717 100644
--- a/src/Communicate/Tags.h
+++ b/src/Communicate/Tags.h
@@ -26,13 +26,6 @@
 #define IPPL_EXIT_TAG             6    // program should exit()
 
 
-// tags for reduction
-#define COMM_REDUCE_SEND_TAG    10000
-#define COMM_REDUCE_RECV_TAG    11000
-#define COMM_REDUCE_SCATTER_TAG 12000
-#define COMM_REDUCE_CYCLE        1000
-
-
 // tag for applying parallel periodic boundary condition.
 
 #define BC_PARALLEL_PERIODIC_TAG 15000
@@ -48,28 +41,6 @@ namespace ippl {
     }
 }
 
-#define F_GUARD_CELLS_TAG       20000 // Field::fillGuardCells()
-#define F_WRITE_TAG             21000 // Field::write()
-#define F_READ_TAG              22000 // Field::read()
-#define F_GEN_ASSIGN_TAG        23000 // assign(BareField,BareField)
-#define F_REPARTITION_BCAST_TAG 24000 // broadcast in FieldLayout::repartion.
-#define F_REDUCE_PERP_TAG       25000 // reduction in binary load balance.
-#define F_GETSINGLE_TAG         26000 // IndexedBareField::getsingle()
-#define F_REDUCE_TAG            27000 // Reduction in minloc/maxloc
-#define F_LAYOUT_IO_TAG         28000 // Reduction in minloc/maxloc
-#define F_TAG_CYCLE              1000
-
-// // Tags for FieldView and FieldBlock
-// #define FV_2D_TAG               30000 // FieldView::update_2D_data()
-// #define FV_3D_TAG               31000 // FieldView::update_2D_data()
-// #define FV_TAG_CYCLE             1000
-//
-// #define FB_WRITE_TAG            32000 // FieldBlock::write()
-// #define FB_READ_TAG             33000 // FieldBlock::read()
-// #define FB_TAG_CYCLE             1000
-//
-// #define FP_GATHER_TAG           34000 // FieldPrint::print()
-// #define FP_TAG_CYCLE             1000
 
 // Special tags used by Particle classes for communication.
 #define P_WEIGHTED_LAYOUT_TAG   50000
@@ -88,7 +59,7 @@ namespace ippl {
 #define IPPL_TAG_CYCLE             1000
 
 // Tags for Ippl application codes
-#define IPPL_APP_TAG0    90000
+#define IPPL_PARAREAL_APP    90000
 #define IPPL_APP_TAG1    91000
 #define IPPL_APP_TAG2    92000
 #define IPPL_APP_TAG3    93000
@@ -128,4 +99,7 @@ namespace ippl {
 #define OPEN_SOLVER_TAG 18000
 #define VICO_SOLVER_TAG 70000
 
+#define IPPL_PARAREAL_SEND 19000
+#define IPPL_PARAREAL_RECV 21000
+
 #endif // TAGS_H
diff --git a/src/FFT/FFT.h b/src/FFT/FFT.h
index 8a13e3b45..6807e8ba3 100644
--- a/src/FFT/FFT.h
+++ b/src/FFT/FFT.h
@@ -30,10 +30,13 @@
 
 #include <heffte_fft3d.h>
 #include <heffte_fft3d_r2c.h>
+#include <cufinufft.h>
 #include <array>
 #include <memory>
+#include <functional>
 #include <type_traits>
 
+#include "Types/IpplTypes.h"
 #include "FieldLayout/FieldLayout.h"
 #include "Field/Field.h"
 #include "Utility/ParameterList.h"
@@ -48,6 +51,8 @@ namespace heffte {
 
 namespace ippl {
 
+    template <typename T, class... Properties> class ParticleAttrib;
+
     /**
        Tag classes for CC type of Fourier transforms
     */
@@ -64,6 +69,12 @@ namespace ippl {
        Tag classes for Cosine transforms
     */
     class CosTransform {};
+#ifdef KOKKOS_ENABLE_CUDA
+    /**
+       Tag classes for Non-uniform type of Fourier transforms
+    */
+    class NUFFTransform {};
+#endif
 
     enum FFTComm {
         a2av = 0,
@@ -110,6 +121,37 @@ namespace ippl {
             using backendCos = heffte::backend::stock_cos;
         };
 #endif
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+        template <class T>
+        struct CufinufftType {};
+
+        template <>
+        struct CufinufftType<float> {
+            std::function<int(int, int, int64_t*, int, int, 
+                              float, cufinufftf_plan*, cufinufft_opts*)> makeplan = cufinufftf_makeplan; 
+            std::function<int(cufinufftf_plan, int, float*, float*, float*, 
+                              int, float*, float*, float*)> setpts = cufinufftf_setpts; 
+            std::function<int(cufinufftf_plan, cuFloatComplex*, cuFloatComplex*)> execute = cufinufftf_execute; 
+            std::function<int(cufinufftf_plan)> destroy = cufinufftf_destroy;
+            
+            using complexType = cuFloatComplex;
+            using plan_t      = cufinufftf_plan;
+        };
+
+        template <>
+        struct CufinufftType<double> {
+            std::function<int(int, int, int64_t*, int, int, 
+                              double, cufinufft_plan*, cufinufft_opts*)> makeplan = cufinufft_makeplan; 
+            std::function<int(cufinufft_plan, int, double*, double*, double*, 
+                              int, double*, double*, double*)> setpts = cufinufft_setpts; 
+            std::function<int(cufinufft_plan, cuDoubleComplex*, cuDoubleComplex*)> execute = cufinufft_execute; 
+            std::function<int(cufinufft_plan)> destroy = cufinufft_destroy; 
+            
+            using complexType = cuDoubleComplex;
+            using plan_t      = cufinufft_plan;
+        };
 #endif
     }
 
@@ -133,6 +175,7 @@ namespace ippl {
 
         using heffteBackend = typename detail::HeffteBackendType::backend;
         using workspace_t = typename heffte::fft3d<heffteBackend>::template buffer_container<Complex_t>;
+        using view_type = typename detail::ViewType<Complex_t, 3, Kokkos::LayoutLeft>::view_type;
 
         /** Create a new FFT object with the layout for the input Field and
          * parameters for heffte.
@@ -160,6 +203,7 @@ namespace ippl {
 
         std::shared_ptr<heffte::fft3d<heffteBackend, long long>> heffte_m;
         workspace_t workspace_m;
+        view_type tempField_m;
 
     };
 
@@ -178,6 +222,8 @@ namespace ippl {
         using heffteBackend = typename detail::HeffteBackendType::backend;
         typedef Kokkos::complex<T> Complex_t;
         using workspace_t = typename heffte::fft3d_r2c<heffteBackend>::template buffer_container<Complex_t>;
+        using view_real_type = typename detail::ViewType<T, 3, Kokkos::LayoutLeft>::view_type;
+        using view_complex_type = typename detail::ViewType<Complex_t, 3, Kokkos::LayoutLeft>::view_type;
 
         typedef Field<Complex_t,Dim> ComplexField_t;
 
@@ -211,6 +257,8 @@ namespace ippl {
 
         std::shared_ptr<heffte::fft3d_r2c<heffteBackend, long long>> heffte_m;
         workspace_t workspace_m;
+        view_real_type tempFieldf_m;
+        view_complex_type tempFieldg_m;
 
     };
 
@@ -227,6 +275,7 @@ namespace ippl {
 
         using heffteBackend = typename detail::HeffteBackendType::backendSine;
         using workspace_t = typename heffte::fft3d<heffteBackend>::template buffer_container<T>;
+        using view_type = typename detail::ViewType<T, 3, Kokkos::LayoutLeft>::view_type;
 
         /** Create a new FFT object with the layout for the input Field and
          * parameters for heffte.
@@ -252,6 +301,7 @@ namespace ippl {
 
         std::shared_ptr<heffte::fft3d<heffteBackend, long long>> heffte_m;
         workspace_t workspace_m;
+        view_type tempField_m;
 
     };
     /**
@@ -267,6 +317,7 @@ namespace ippl {
 
         using heffteBackend = typename detail::HeffteBackendType::backendCos;
         using workspace_t = typename heffte::fft3d<heffteBackend>::template buffer_container<T>;
+        using view_type = typename detail::ViewType<T, 3, Kokkos::LayoutLeft>::view_type;
 
         /** Create a new FFT object with the layout for the input Field and
          * parameters for heffte.
@@ -292,11 +343,71 @@ namespace ippl {
 
         std::shared_ptr<heffte::fft3d<heffteBackend, long long>> heffte_m;
         workspace_t workspace_m;
+        view_type tempField_m;
+
+    };
+
+
+#ifdef KOKKOS_ENABLE_CUDA
+    /**
+       Non-uniform FFT class
+    */
+    template <size_t Dim, class T>
+    class FFT<NUFFTransform,Dim,T> {
+
+    public:
+
+        typedef FieldLayout<Dim> Layout_t;
+        typedef Kokkos::complex<T> KokkosComplex_t;
+        typedef Field<KokkosComplex_t,Dim> ComplexField_t;
+
+        using complexType = typename detail::CufinufftType<T>::complexType;
+        using plan_t = typename detail::CufinufftType<T>::plan_t;
+        using view_field_type = typename detail::ViewType<complexType, 3, Kokkos::LayoutLeft>::view_type;
+        using view_particle_real_type = typename detail::ViewType<T, 1, Kokkos::LayoutLeft>::view_type;
+        using view_particle_complex_type = typename detail::ViewType<complexType, 1, Kokkos::LayoutLeft>::view_type;
+
+
+        FFT() = default;
+
+        /** Create a new FFT object with the layout for the input Field, type 
+         * (1 or 2) for the NUFFT and parameters for cuFINUFFT.
+        */
+        FFT(const Layout_t& layout, const detail::size_type& localNp, int type, const ParameterList& params);
+
+        // Destructor
+        ~FFT();
+
+        /** Do the NUFFT.
+        */
+        template<class... Properties>
+        void transform(const ParticleAttrib< Vector<T, Dim>, Properties... >& R, 
+                       ParticleAttrib<T, Properties... >& Q, ComplexField_t& f);
+
+
+    private:
+
+        /**
+           setup performs the initialization necessary.
+        */
+        void setup(std::array<int64_t, 3>& nmodes,
+                   const ParameterList& params);
+
+        detail::CufinufftType<T> nufft_m;
+        plan_t plan_m;
+        int ier_m;
+        T tol_m;
+        int type_m;
+        view_field_type tempField_m;
+        view_particle_real_type tempR_m[3] = {};
+        view_particle_complex_type tempQ_m;
+
 
     };
 
 
 }
+#endif
 #include "FFT/FFT.hpp"
 #endif // IPPL_FFT_FFT_H
 
diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp
index 853651858..07f36cee9 100644
--- a/src/FFT/FFT.hpp
+++ b/src/FFT/FFT.hpp
@@ -57,8 +57,8 @@ namespace ippl {
          * 1D FFTs we just have to make the length in other
          * dimensions to be 1.
          */
-        std::array<long long, Dim> low;
-        std::array<long long, Dim> high;
+        std::array<long long, 3> low;
+        std::array<long long, 3> high;
 
         const NDIndex<Dim>& lDom = layout.getLocalNDIndex();
 
@@ -74,6 +74,9 @@ namespace ippl {
             high[d] = static_cast<long long>(lDom[d].length() + lDom[d].first() - 1);
         }
 
+        if(tempField_m.size() < lDom.size()) {
+            Kokkos::realloc(tempField_m, lDom[0].length(), lDom[1].length(), lDom[2].length());
+        }
         setup(low, high, params);
     }
 
@@ -88,45 +91,45 @@ namespace ippl {
                                   const ParameterList& params)
     {
 
-         heffte::box3d<long long> inbox  = {low, high};
-         heffte::box3d<long long> outbox = {low, high};
+        heffte::box3d<long long> inbox  = {low, high};
+        heffte::box3d<long long> outbox = {low, high};
 
-         heffte::plan_options heffteOptions =
-             heffte::default_options<heffteBackend>();
+        heffte::plan_options heffteOptions =
+            heffte::default_options<heffteBackend>();
 
-         if(!params.get<bool>("use_heffte_defaults")) {
-            heffteOptions.use_pencils = params.get<bool>("use_pencils");
-            heffteOptions.use_reorder = params.get<bool>("use_reorder");
+        if(!params.get<bool>("use_heffte_defaults")) {
+           heffteOptions.use_pencils = params.get<bool>("use_pencils");
+           heffteOptions.use_reorder = params.get<bool>("use_reorder");
 #ifdef Heffte_ENABLE_GPU
-            heffteOptions.use_gpu_aware = params.get<bool>("use_gpu_aware");
+           heffteOptions.use_gpu_aware = params.get<bool>("use_gpu_aware");
 #endif
 
-            switch (params.get<int>("comm")) {
-            
-               case a2a:
-                   heffteOptions.algorithm = heffte::reshape_algorithm::alltoall;
-                   break;
-               case a2av:
-                   heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv;
-                   break;
-               case p2p:
-                   heffteOptions.algorithm = heffte::reshape_algorithm::p2p;
-                   break;
-               case p2p_pl:
-                   heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined;
-                   break;
-               default:
-                   throw IpplException("FFT::setup",
-                                       "Unrecognized heffte communication type");
-            }
-         }
+           switch (params.get<int>("comm")) {
+           
+              case a2a:
+                  heffteOptions.algorithm = heffte::reshape_algorithm::alltoall;
+                  break;
+              case a2av:
+                  heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv;
+                  break;
+              case p2p:
+                  heffteOptions.algorithm = heffte::reshape_algorithm::p2p;
+                  break;
+              case p2p_pl:
+                  heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined;
+                  break;
+              default:
+                  throw IpplException("FFT::setup",
+                                      "Unrecognized heffte communication type");
+           }
+        }
 
-         heffte_m = std::make_shared<heffte::fft3d<heffteBackend, long long>>
-                    (inbox, outbox, Ippl::getComm(), heffteOptions);
+        heffte_m = std::make_shared<heffte::fft3d<heffteBackend, long long>>
+                  (inbox, outbox, Ippl::getComm(), heffteOptions);
 
-         //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count());
-         if(workspace_m.size() < heffte_m->size_workspace())
-            workspace_m = workspace_t(heffte_m->size_workspace());
+        //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count());
+        if(workspace_m.size() < heffte_m->size_workspace())
+           workspace_m = workspace_t(heffte_m->size_workspace());
 
     }
 
@@ -138,74 +141,75 @@ namespace ippl {
         int direction,
         typename FFT<CCTransform,Dim,T>::ComplexField_t& f)
     {
-       auto fview = f.getView();
-       const int nghost = f.getNghost();
-
-       /**
-        *This copy to a temporary Kokkos view is needed because of following
-        *reasons:
-        *1) heffte wants the input and output fields without ghost layers
-        *2) heffte accepts data in layout left (by default) eventhough this
-        *can be changed during heffte box creation
-       */
-       Kokkos::View<Complex_t***,Kokkos::LayoutLeft>
-           tempField("tempField", fview.extent(0) - 2*nghost,
-                                  fview.extent(1) - 2*nghost,
-                                  fview.extent(2) - 2*nghost);
-
-       using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
-
-       Kokkos::parallel_for("copy from Kokkos FFT",
-                            mdrange_type({nghost, nghost, nghost},
-                                         {fview.extent(0) - nghost,
-                                          fview.extent(1) - nghost,
-                                          fview.extent(2) - nghost
-                                         }),
-                            KOKKOS_LAMBDA(const size_t i,
-                                          const size_t j,
-                                          const size_t k)
-                            {
-                                tempField(i-nghost, j-nghost, k-nghost).real(
-                                      fview(i, j, k).real());
-                                tempField(i-nghost, j-nghost, k-nghost).imag(
-                                      fview(i, j, k).imag());
-                            });
-
-
-
-
-       if ( direction == 1 )
-       {
-           heffte_m->forward(tempField.data(), tempField.data(), workspace_m.data(),
-                             heffte::scale::full);
-       }
-       else if ( direction == -1 )
-       {
-           heffte_m->backward(tempField.data(), tempField.data(), workspace_m.data(),
-                              heffte::scale::none);
-       }
-       else
-       {
-           throw std::logic_error(
-                "Only 1:forward and -1:backward are allowed as directions");
-       }
-
-
-       Kokkos::parallel_for("copy to Kokkos FFT",
-                            mdrange_type({nghost, nghost, nghost},
-                                         {fview.extent(0) - nghost,
-                                          fview.extent(1) - nghost,
-                                          fview.extent(2) - nghost
-                                         }),
-                            KOKKOS_LAMBDA(const size_t i,
-                                          const size_t j,
-                                          const size_t k)
-                            {
-                                fview(i, j, k).real() =
-                                tempField(i-nghost, j-nghost, k-nghost).real();
-                                fview(i, j, k).imag() =
-                                tempField(i-nghost, j-nghost, k-nghost).imag();
-                            });
+        auto fview = f.getView();
+        const int nghost = f.getNghost();
+
+        /**
+         *This copy to a temporary Kokkos view is needed because of following
+         *reasons:
+         *1) heffte wants the input and output fields without ghost layers
+         *2) heffte accepts data in layout left (by default) eventhough this
+         *can be changed during heffte box creation
+        */
+        //Kokkos::View<Complex_t***,Kokkos::LayoutLeft>
+        //    tempField("tempField", fview.extent(0) - 2*nghost,
+        //                           fview.extent(1) - 2*nghost,
+        //                           fview.extent(2) - 2*nghost);
+
+        auto tempField = tempField_m;
+        using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
+
+        Kokkos::parallel_for("copy from Kokkos FFT",
+                             mdrange_type({nghost, nghost, nghost},
+                                          {fview.extent(0) - nghost,
+                                           fview.extent(1) - nghost,
+                                           fview.extent(2) - nghost
+                                          }),
+                             KOKKOS_LAMBDA(const size_t i,
+                                           const size_t j,
+                                           const size_t k)
+                             {
+                                 tempField(i-nghost, j-nghost, k-nghost).real(
+                                       fview(i, j, k).real());
+                                 tempField(i-nghost, j-nghost, k-nghost).imag(
+                                       fview(i, j, k).imag());
+                             });
+
+
+
+
+        if ( direction == 1 )
+        {
+            heffte_m->forward(tempField.data(), tempField.data(), workspace_m.data(),
+                              heffte::scale::full);
+        }
+        else if ( direction == -1 )
+        {
+            heffte_m->backward(tempField.data(), tempField.data(), workspace_m.data(),
+                               heffte::scale::none);
+        }
+        else
+        {
+            throw std::logic_error(
+                 "Only 1:forward and -1:backward are allowed as directions");
+        }
+
+
+        Kokkos::parallel_for("copy to Kokkos FFT",
+                             mdrange_type({nghost, nghost, nghost},
+                                          {fview.extent(0) - nghost,
+                                           fview.extent(1) - nghost,
+                                           fview.extent(2) - nghost
+                                          }),
+                             KOKKOS_LAMBDA(const size_t i,
+                                           const size_t j,
+                                           const size_t k)
+                             {
+                                 fview(i, j, k).real() =
+                                 tempField(i-nghost, j-nghost, k-nghost).real();
+                                 fview(i, j, k).imag() =
+                                 tempField(i-nghost, j-nghost, k-nghost).imag();
+                             });
 
     }
 
@@ -259,6 +263,14 @@ namespace ippl {
                             lDomOutput[d].first() - 1);
         }
 
+
+        if(tempFieldf_m.size() < lDomInput.size()) {
+            Kokkos::realloc(tempFieldf_m, lDomInput[0].length(), lDomInput[1].length(), lDomInput[2].length());
+        }
+        if(tempFieldg_m.size() < lDomOutput.size()) {
+            Kokkos::realloc(tempFieldg_m, lDomOutput[0].length(), lDomOutput[1].length(), lDomOutput[2].length());
+        }
+
         setup(lowInput, highInput, lowOutput, highOutput, params);
     }
 
@@ -275,17 +287,17 @@ namespace ippl {
                                   const ParameterList& params)
     {
 
-         heffte::box3d<long long> inbox  = {lowInput, highInput};
-         heffte::box3d<long long> outbox = {lowOutput, highOutput};
+        heffte::box3d<long long> inbox  = {lowInput, highInput};
+        heffte::box3d<long long> outbox = {lowOutput, highOutput};
 
-         heffte::plan_options heffteOptions = 
-             heffte::default_options<heffteBackend>();
+        heffte::plan_options heffteOptions = 
+            heffte::default_options<heffteBackend>();
         
-         if(!params.get<bool>("use_heffte_defaults")) {
-            heffteOptions.use_pencils = params.get<bool>("use_pencils");
-            heffteOptions.use_reorder = params.get<bool>("use_reorder");
+        if(!params.get<bool>("use_heffte_defaults")) {
+           heffteOptions.use_pencils = params.get<bool>("use_pencils");
+           heffteOptions.use_reorder = params.get<bool>("use_reorder");
 #ifdef Heffte_ENABLE_GPU
-            heffteOptions.use_gpu_aware = params.get<bool>("use_gpu_aware");
+           heffteOptions.use_gpu_aware = params.get<bool>("use_gpu_aware");
 #endif
 
             switch (params.get<int>("comm")) {
@@ -309,12 +321,12 @@ namespace ippl {
          }
 
          heffte_m = std::make_shared<heffte::fft3d_r2c<heffteBackend, long long>>
-                    (inbox, outbox, params.get<int>("r2c_direction"), Ippl::getComm(),
+                    (inbox, outbox, params.get<int>("r2c_direction"), MPI_COMM_SELF,
                      heffteOptions);
         
-         //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count());
-         if(workspace_m.size() < heffte_m->size_workspace())
-            workspace_m = workspace_t(heffte_m->size_workspace());
+        //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count());
+        if(workspace_m.size() < heffte_m->size_workspace())
+           workspace_m = workspace_t(heffte_m->size_workspace());
 
     }
 
@@ -325,104 +337,106 @@ namespace ippl {
         typename FFT<RCTransform,Dim,T>::RealField_t& f,
         typename FFT<RCTransform,Dim,T>::ComplexField_t& g)
     {
-       auto fview = f.getView();
-       auto gview = g.getView();
-       const int nghostf = f.getNghost();
-       const int nghostg = g.getNghost();
-
-       /**
-        *This copy to a temporary Kokkos view is needed because of following
-        *reasons:
-        *1) heffte wants the input and output fields without ghost layers
-        *2) heffte accepts data in layout left (by default) eventhough this
-        *can be changed during heffte box creation
-       */
-       Kokkos::View<T***, Kokkos::LayoutLeft>
-           tempFieldf("tempFieldf", fview.extent(0) - 2*nghostf,
-                                    fview.extent(1) - 2*nghostf,
-                                    fview.extent(2) - 2*nghostf);
-
-       Kokkos::View<Complex_t***, Kokkos::LayoutLeft>
-           tempFieldg("tempFieldg", gview.extent(0) - 2*nghostg,
-                                    gview.extent(1) - 2*nghostg,
-                                    gview.extent(2) - 2*nghostg);
-
-       using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
-
-       Kokkos::parallel_for("copy from Kokkos f field in FFT",
-                            mdrange_type({nghostf, nghostf, nghostf},
-                                         {fview.extent(0) - nghostf,
-                                          fview.extent(1) - nghostf,
-                                          fview.extent(2) - nghostf
-                                         }),
-                            KOKKOS_LAMBDA(const size_t i,
-                                          const size_t j,
-                                          const size_t k)
-       {
-            tempFieldf(i-nghostf, j-nghostf, k-nghostf) = fview(i, j, k);
-       });
-       Kokkos::parallel_for("copy from Kokkos g field in FFT",
-                            mdrange_type({nghostg, nghostg, nghostg},
-                                         {gview.extent(0) - nghostg,
-                                          gview.extent(1) - nghostg,
-                                          gview.extent(2) - nghostg
-                                         }),
-                            KOKKOS_LAMBDA(const size_t i,
-                                          const size_t j,
-                                          const size_t k)
-                            {
-                                tempFieldg(i-nghostg, j-nghostg, k-nghostg).real(
-                                      gview(i, j, k).real());
-                                tempFieldg(i-nghostg, j-nghostg, k-nghostg).imag(
-                                      gview(i, j, k).imag());
-                            });
+        auto fview = f.getView();
+        auto gview = g.getView();
+        const int nghostf = f.getNghost();
+        const int nghostg = g.getNghost();
+
+        /**
+         *This copy to a temporary Kokkos view is needed because of following
+         *reasons:
+         *1) heffte wants the input and output fields without ghost layers
+         *2) heffte accepts data in layout left (by default) eventhough this
+         *can be changed during heffte box creation
+        */
+        //Kokkos::View<T***, Kokkos::LayoutLeft>
+        //    tempFieldf("tempFieldf", fview.extent(0) - 2*nghostf,
+        //                             fview.extent(1) - 2*nghostf,
+        //                             fview.extent(2) - 2*nghostf);
+
+        //Kokkos::View<Complex_t***, Kokkos::LayoutLeft>
+        //    tempFieldg("tempFieldg", gview.extent(0) - 2*nghostg,
+        //                             gview.extent(1) - 2*nghostg,
+        //                             gview.extent(2) - 2*nghostg);
+
+        auto tempFieldf = tempFieldf_m;
+        auto tempFieldg = tempFieldg_m;
+        using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
+
+        Kokkos::parallel_for("copy from Kokkos f field in FFT",
+                             mdrange_type({nghostf, nghostf, nghostf},
+                                          {fview.extent(0) - nghostf,
+                                           fview.extent(1) - nghostf,
+                                           fview.extent(2) - nghostf
+                                          }),
+                             KOKKOS_LAMBDA(const size_t i,
+                                           const size_t j,
+                                           const size_t k)
+        {
+             tempFieldf(i-nghostf, j-nghostf, k-nghostf) = fview(i, j, k);
+        });
+        Kokkos::parallel_for("copy from Kokkos g field in FFT",
+                             mdrange_type({nghostg, nghostg, nghostg},
+                                          {gview.extent(0) - nghostg,
+                                           gview.extent(1) - nghostg,
+                                           gview.extent(2) - nghostg
+                                          }),
+                             KOKKOS_LAMBDA(const size_t i,
+                                           const size_t j,
+                                           const size_t k)
+                             {
+                                 tempFieldg(i-nghostg, j-nghostg, k-nghostg).real(
+                                       gview(i, j, k).real());
+                                 tempFieldg(i-nghostg, j-nghostg, k-nghostg).imag(
+                                       gview(i, j, k).imag());
+                             });
 
       
-       if ( direction == 1 )
-       {
-           heffte_m->forward( tempFieldf.data(), tempFieldg.data(), workspace_m.data(),
-                              heffte::scale::full );
-       }
-       else if ( direction == -1 )
-       {
-           heffte_m->backward( tempFieldg.data(), tempFieldf.data(), workspace_m.data(),
-                               heffte::scale::none );
-       }
-       else
-       {
-           throw std::logic_error(
-                "Only 1:forward and -1:backward are allowed as directions");
-       }
-
-
-       Kokkos::parallel_for("copy to Kokkos f field FFT",
-                            mdrange_type({nghostf, nghostf, nghostf},
-                                         {fview.extent(0) - nghostf,
-                                          fview.extent(1) - nghostf,
-                                          fview.extent(2) - nghostf
-                                         }),
-                            KOKKOS_LAMBDA(const size_t i,
-                                          const size_t j,
-                                          const size_t k)
-       {
-            fview(i, j, k) = tempFieldf(i-nghostf, j-nghostf, k-nghostf);
-       });
-
-       Kokkos::parallel_for("copy to Kokkos g field FFT",
-                            mdrange_type({nghostg, nghostg, nghostg},
-                                         {gview.extent(0) - nghostg,
-                                          gview.extent(1) - nghostg,
-                                          gview.extent(2) - nghostg
-                                         }),
-                            KOKKOS_LAMBDA(const size_t i,
-                                          const size_t j,
-                                          const size_t k)
-                            {
-                                gview(i, j, k).real() =
-                                tempFieldg(i-nghostg, j-nghostg, k-nghostg).real();
-                                gview(i, j, k).imag() =
-                                tempFieldg(i-nghostg, j-nghostg, k-nghostg).imag();
-                            });
+        if ( direction == 1 )
+        {
+            heffte_m->forward( tempFieldf.data(), tempFieldg.data(), workspace_m.data(),
+                               heffte::scale::full );
+        }
+        else if ( direction == -1 )
+        {
+            heffte_m->backward( tempFieldg.data(), tempFieldf.data(), workspace_m.data(),
+                                heffte::scale::none );
+        }
+        else
+        {
+            throw std::logic_error(
+                 "Only 1:forward and -1:backward are allowed as directions");
+        }
+
+
+        Kokkos::parallel_for("copy to Kokkos f field FFT",
+                             mdrange_type({nghostf, nghostf, nghostf},
+                                          {fview.extent(0) - nghostf,
+                                           fview.extent(1) - nghostf,
+                                           fview.extent(2) - nghostf
+                                          }),
+                             KOKKOS_LAMBDA(const size_t i,
+                                           const size_t j,
+                                           const size_t k)
+        {
+             fview(i, j, k) = tempFieldf(i-nghostf, j-nghostf, k-nghostf);
+        });
+
+        Kokkos::parallel_for("copy to Kokkos g field FFT",
+                             mdrange_type({nghostg, nghostg, nghostg},
+                                          {gview.extent(0) - nghostg,
+                                           gview.extent(1) - nghostg,
+                                           gview.extent(2) - nghostg
+                                          }),
+                             KOKKOS_LAMBDA(const size_t i,
+                                           const size_t j,
+                                           const size_t k)
+                             {
+                                 gview(i, j, k).real() =
+                                 tempFieldg(i-nghostg, j-nghostg, k-nghostg).real();
+                                 gview(i, j, k).imag() =
+                                 tempFieldg(i-nghostg, j-nghostg, k-nghostg).imag();
+                             });
 
     }
 
@@ -446,8 +460,8 @@ namespace ippl {
          * 1D FFTs we just have to make the length in other
          * dimensions to be 1.
          */
-        std::array<long long, Dim> low;
-        std::array<long long, Dim> high;
+        std::array<long long, 3> low;
+        std::array<long long, 3> high;
 
         const NDIndex<Dim>& lDom = layout.getLocalNDIndex();
 
@@ -463,6 +477,9 @@ namespace ippl {
             high[d] = static_cast<long long>(lDom[d].length() + lDom[d].first() - 1);
         }
 
+        if(tempField_m.size() < lDom.size()) {
+            Kokkos::realloc(tempField_m, lDom[0].length(), lDom[1].length(), lDom[2].length());
+        }
         setup(low, high, params);
     }
 
@@ -477,44 +494,44 @@ namespace ippl {
                                   const ParameterList& params)
     {
 
-         heffte::box3d<long long> inbox  = {low, high};
-         heffte::box3d<long long> outbox = {low, high};
+        heffte::box3d<long long> inbox  = {low, high};
+        heffte::box3d<long long> outbox = {low, high};
 
-         heffte::plan_options heffteOptions =
-             heffte::default_options<heffteBackend>();
+        heffte::plan_options heffteOptions =
+            heffte::default_options<heffteBackend>();
 
-         if(!params.get<bool>("use_heffte_defaults")) {
-            heffteOptions.use_pencils = params.get<bool>("use_pencils");
-            heffteOptions.use_reorder = params.get<bool>("use_reorder");
+        if(!params.get<bool>("use_heffte_defaults")) {
+           heffteOptions.use_pencils = params.get<bool>("use_pencils");
+           heffteOptions.use_reorder = params.get<bool>("use_reorder");
 #ifdef Heffte_ENABLE_GPU
-            heffteOptions.use_gpu_aware = params.get<bool>("use_gpu_aware");
+           heffteOptions.use_gpu_aware = params.get<bool>("use_gpu_aware");
 #endif
-            switch (params.get<int>("comm")) {
-            
-               case a2a:
-                   heffteOptions.algorithm = heffte::reshape_algorithm::alltoall;
-                   break;
-               case a2av:
-                   heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv;
-                   break;
-               case p2p:
-                   heffteOptions.algorithm = heffte::reshape_algorithm::p2p;
-                   break;
-               case p2p_pl:
-                   heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined;
-                   break;
-               default:
-                   throw IpplException("FFT::setup",
-                                       "Unrecognized heffte communication type");
-            }
-         }
+           switch (params.get<int>("comm")) {
+           
+              case a2a:
+                  heffteOptions.algorithm = heffte::reshape_algorithm::alltoall;
+                  break;
+              case a2av:
+                  heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv;
+                  break;
+              case p2p:
+                  heffteOptions.algorithm = heffte::reshape_algorithm::p2p;
+                  break;
+              case p2p_pl:
+                  heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined;
+                  break;
+              default:
+                  throw IpplException("FFT::setup",
+                                      "Unrecognized heffte communication type");
+           }
+        }
 
-         heffte_m = std::make_shared<heffte::fft3d<heffteBackend, long long>>
-                    (inbox, outbox, Ippl::getComm(), heffteOptions);
+        heffte_m = std::make_shared<heffte::fft3d<heffteBackend, long long>>
+                   (inbox, outbox, Ippl::getComm(), heffteOptions);
 
-         //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count());
-         if(workspace_m.size() < heffte_m->size_workspace())
-            workspace_m = workspace_t(heffte_m->size_workspace());
+        //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count());
+        if(workspace_m.size() < heffte_m->size_workspace())
+           workspace_m = workspace_t(heffte_m->size_workspace());
 
     }
 
@@ -524,66 +541,67 @@ namespace ippl {
         int direction,
         typename FFT<SineTransform,Dim,T>::Field_t& f)
     {
-       auto fview = f.getView();
-       const int nghost = f.getNghost();
-       
-       /**
-        *This copy to a temporary Kokkos view is needed because of following
-        *reasons:
-        *1) heffte wants the input and output fields without ghost layers
-        *2) heffte accepts data in layout left (by default) eventhough this
-        *can be changed during heffte box creation
-       */
-       Kokkos::View<T***,Kokkos::LayoutLeft>
-           tempField("tempField", fview.extent(0) - 2*nghost,
-                                  fview.extent(1) - 2*nghost,
-                                  fview.extent(2) - 2*nghost);
-
-       using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
-
-       Kokkos::parallel_for("copy from Kokkos FFT",
-                            mdrange_type({nghost, nghost, nghost},
-                                         {fview.extent(0) - nghost,
-                                          fview.extent(1) - nghost,
-                                          fview.extent(2) - nghost
-                                         }),
-                            KOKKOS_LAMBDA(const size_t i,
-                                          const size_t j,
-                                          const size_t k)
-                            {
-                              tempField(i-nghost, j-nghost, k-nghost) = 
-                              fview(i, j, k);
-                            });
-
-       if ( direction == 1 )
-       {
-           heffte_m->forward(tempField.data(), tempField.data(), workspace_m.data(),
-                             heffte::scale::full);
-       }
-       else if ( direction == -1 )
-       {
-           heffte_m->backward(tempField.data(), tempField.data(), workspace_m.data(),
-                              heffte::scale::none);
-       }
-       else
-       {
-           throw std::logic_error(
-                "Only 1:forward and -1:backward are allowed as directions");
-       }
-
-       Kokkos::parallel_for("copy to Kokkos FFT",
-                            mdrange_type({nghost, nghost, nghost},
-                                         {fview.extent(0) - nghost,
-                                          fview.extent(1) - nghost,
-                                          fview.extent(2) - nghost
-                                         }),
-                            KOKKOS_LAMBDA(const size_t i,
-                                          const size_t j,
-                                          const size_t k)
-                            {
-                              fview(i, j, k) =
-                              tempField(i-nghost, j-nghost, k-nghost);
-                            });
+        auto fview = f.getView();
+        const int nghost = f.getNghost();
+        
+        /**
+         *This copy to a temporary Kokkos view is needed because of following
+         *reasons:
+         *1) heffte wants the input and output fields without ghost layers
+         *2) heffte accepts data in layout left (by default) eventhough this
+         *can be changed during heffte box creation
+        */
+        //Kokkos::View<T***,Kokkos::LayoutLeft>
+        //    tempField("tempField", fview.extent(0) - 2*nghost,
+        //                           fview.extent(1) - 2*nghost,
+        //                           fview.extent(2) - 2*nghost);
+
+        auto tempField = tempField_m;
+        using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
+
+        Kokkos::parallel_for("copy from Kokkos FFT",
+                             mdrange_type({nghost, nghost, nghost},
+                                          {fview.extent(0) - nghost,
+                                           fview.extent(1) - nghost,
+                                           fview.extent(2) - nghost
+                                          }),
+                             KOKKOS_LAMBDA(const size_t i,
+                                           const size_t j,
+                                           const size_t k)
+                             {
+                               tempField(i-nghost, j-nghost, k-nghost) = 
+                               fview(i, j, k);
+                             });
+
+        if ( direction == 1 )
+        {
+            heffte_m->forward(tempField.data(), tempField.data(), workspace_m.data(),
+                              heffte::scale::full);
+        }
+        else if ( direction == -1 )
+        {
+            heffte_m->backward(tempField.data(), tempField.data(), workspace_m.data(),
+                               heffte::scale::none);
+        }
+        else
+        {
+            throw std::logic_error(
+                 "Only 1:forward and -1:backward are allowed as directions");
+        }
+
+        Kokkos::parallel_for("copy to Kokkos FFT",
+                             mdrange_type({nghost, nghost, nghost},
+                                          {fview.extent(0) - nghost,
+                                           fview.extent(1) - nghost,
+                                           fview.extent(2) - nghost
+                                          }),
+                             KOKKOS_LAMBDA(const size_t i,
+                                           const size_t j,
+                                           const size_t k)
+                             {
+                               fview(i, j, k) =
+                               tempField(i-nghost, j-nghost, k-nghost);
+                             });
 
     }
 
@@ -607,8 +625,8 @@ namespace ippl {
          * 1D FFTs we just have to make the length in other
          * dimensions to be 1.
          */
-        std::array<long long, Dim> low;
-        std::array<long long, Dim> high;
+        std::array<long long, 3> low;
+        std::array<long long, 3> high;
 
         const NDIndex<Dim>& lDom = layout.getLocalNDIndex();
 
@@ -624,6 +642,9 @@ namespace ippl {
             high[d] = static_cast<long long>(lDom[d].length() + lDom[d].first() - 1);
         }
 
+        if(tempField_m.size() < lDom.size()) {
+            Kokkos::realloc(tempField_m, lDom[0].length(), lDom[1].length(), lDom[2].length());
+        }
         setup(low, high, params);
     }
 
@@ -638,44 +659,44 @@ namespace ippl {
                                   const ParameterList& params)
     {
 
-         heffte::box3d<long long> inbox  = {low, high};
-         heffte::box3d<long long> outbox = {low, high};
+        heffte::box3d<long long> inbox  = {low, high};
+        heffte::box3d<long long> outbox = {low, high};
 
-         heffte::plan_options heffteOptions =
-             heffte::default_options<heffteBackend>();
+        heffte::plan_options heffteOptions =
+            heffte::default_options<heffteBackend>();
 
-         if(!params.get<bool>("use_heffte_defaults")) {
-            heffteOptions.use_pencils = params.get<bool>("use_pencils");
-            heffteOptions.use_reorder = params.get<bool>("use_reorder");
+        if(!params.get<bool>("use_heffte_defaults")) {
+           heffteOptions.use_pencils = params.get<bool>("use_pencils");
+           heffteOptions.use_reorder = params.get<bool>("use_reorder");
 #ifdef Heffte_ENABLE_GPU
-            heffteOptions.use_gpu_aware = params.get<bool>("use_gpu_aware");
+           heffteOptions.use_gpu_aware = params.get<bool>("use_gpu_aware");
 #endif
-            switch (params.get<int>("comm")) {
-            
-               case a2a:
-                   heffteOptions.algorithm = heffte::reshape_algorithm::alltoall;
-                   break;
-               case a2av:
-                   heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv;
-                   break;
-               case p2p:
-                   heffteOptions.algorithm = heffte::reshape_algorithm::p2p;
-                   break;
-               case p2p_pl:
-                   heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined;
-                   break;
-               default:
-                   throw IpplException("FFT::setup",
-                                       "Unrecognized heffte communication type");
-            }
-         }
+           switch (params.get<int>("comm")) {
+           
+              case a2a:
+                  heffteOptions.algorithm = heffte::reshape_algorithm::alltoall;
+                  break;
+              case a2av:
+                  heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv;
+                  break;
+              case p2p:
+                  heffteOptions.algorithm = heffte::reshape_algorithm::p2p;
+                  break;
+              case p2p_pl:
+                  heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined;
+                  break;
+              default:
+                  throw IpplException("FFT::setup",
+                                      "Unrecognized heffte communication type");
+           }
+        }
 
-         heffte_m = std::make_shared<heffte::fft3d<heffteBackend, long long>>
-                    (inbox, outbox, Ippl::getComm(), heffteOptions);
+        heffte_m = std::make_shared<heffte::fft3d<heffteBackend, long long>>
+                   (inbox, outbox, Ippl::getComm(), heffteOptions);
 
-         //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count());
-         if(workspace_m.size() < heffte_m->size_workspace())
-            workspace_m = workspace_t(heffte_m->size_workspace());
+        //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count());
+        if(workspace_m.size() < heffte_m->size_workspace())
+           workspace_m = workspace_t(heffte_m->size_workspace());
 
     }
 
@@ -686,68 +707,277 @@ namespace ippl {
         int direction,
         typename FFT<CosTransform,Dim,T>::Field_t& f)
     {
-       auto fview = f.getView();
-       const int nghost = f.getNghost();
-
-       /**
-        *This copy to a temporary Kokkos view is needed because of following
-        *reasons:
-        *1) heffte wants the input and output fields without ghost layers
-        *2) heffte accepts data in layout left (by default) eventhough this
-        *can be changed during heffte box creation
-       */
-       Kokkos::View<T***,Kokkos::LayoutLeft>
-           tempField("tempField", fview.extent(0) - 2*nghost,
-                                  fview.extent(1) - 2*nghost,
-                                  fview.extent(2) - 2*nghost);
-
-       using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
-
-       Kokkos::parallel_for("copy from Kokkos FFT",
-                            mdrange_type({nghost, nghost, nghost},
-                                         {fview.extent(0) - nghost,
-                                          fview.extent(1) - nghost,
-                                          fview.extent(2) - nghost
-                                         }),
-                            KOKKOS_LAMBDA(const size_t i,
-                                          const size_t j,
-                                          const size_t k)
-                            {
-                              tempField(i-nghost, j-nghost, k-nghost) = 
-                              fview(i, j, k);
-                            });
-
-       if ( direction == 1 )
-       {
-           heffte_m->forward(tempField.data(), tempField.data(), workspace_m.data(),
-                             heffte::scale::full);
-       }
-       else if ( direction == -1 )
-       {
-           heffte_m->backward(tempField.data(), tempField.data(), workspace_m.data(),
-                              heffte::scale::none);
-       }
-       else
-       {
-           throw std::logic_error(
-                "Only 1:forward and -1:backward are allowed as directions");
-       }
-
-       Kokkos::parallel_for("copy to Kokkos FFT",
-                            mdrange_type({nghost, nghost, nghost},
-                                         {fview.extent(0) - nghost,
-                                          fview.extent(1) - nghost,
-                                          fview.extent(2) - nghost
-                                         }),
-                            KOKKOS_LAMBDA(const size_t i,
-                                          const size_t j,
-                                          const size_t k)
-                            {
-                              fview(i, j, k) =
-                              tempField(i-nghost, j-nghost, k-nghost);
-                            });
+        auto fview = f.getView();
+        const int nghost = f.getNghost();
+
+        /**
+         *This copy to a temporary Kokkos view is needed because of following
+         *reasons:
+         *1) heffte wants the input and output fields without ghost layers
+         *2) heffte accepts data in layout left (by default) eventhough this
+         *can be changed during heffte box creation
+        */
+        //Kokkos::View<T***,Kokkos::LayoutLeft>
+        //    tempField("tempField", fview.extent(0) - 2*nghost,
+        //                           fview.extent(1) - 2*nghost,
+        //                           fview.extent(2) - 2*nghost);
+
+        auto tempField = tempField_m;
+        using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
+
+        Kokkos::parallel_for("copy from Kokkos FFT",
+                             mdrange_type({nghost, nghost, nghost},
+                                          {fview.extent(0) - nghost,
+                                           fview.extent(1) - nghost,
+                                           fview.extent(2) - nghost
+                                          }),
+                             KOKKOS_LAMBDA(const size_t i,
+                                           const size_t j,
+                                           const size_t k)
+                             {
+                               tempField(i-nghost, j-nghost, k-nghost) = 
+                               fview(i, j, k);
+                             });
+
+        if ( direction == 1 )
+        {
+            heffte_m->forward(tempField.data(), tempField.data(), workspace_m.data(),
+                              heffte::scale::full);
+        }
+        else if ( direction == -1 )
+        {
+            heffte_m->backward(tempField.data(), tempField.data(), workspace_m.data(),
+                               heffte::scale::none);
+        }
+        else
+        {
+            throw std::logic_error(
+                 "Only 1:forward and -1:backward are allowed as directions");
+        }
+
+        Kokkos::parallel_for("copy to Kokkos FFT",
+                             mdrange_type({nghost, nghost, nghost},
+                                          {fview.extent(0) - nghost,
+                                           fview.extent(1) - nghost,
+                                           fview.extent(2) - nghost
+                                          }),
+                             KOKKOS_LAMBDA(const size_t i,
+                                           const size_t j,
+                                           const size_t k)
+                             {
+                               fview(i, j, k) =
+                               tempField(i-nghost, j-nghost, k-nghost);
+                             });
+
+    }
+
+
+#ifdef KOKKOS_ENABLE_CUDA
+    //=========================================================================
+    // FFT NUFFTransform Constructors
+    //=========================================================================
+
+    /**
+       Create a new FFT object of type NUFFTransform, with a
+       given layout and cuFINUFFT parameters.
+    */
+
+    template <size_t Dim, class T>
+    FFT<NUFFTransform,Dim,T>::FFT(const Layout_t& layout,
+                                  const detail::size_type& localNp,
+                                  int type,
+                                  const ParameterList& params)
+    {
+        /**
+         * cuFINUFFT requires to pass a 3D array even for 2D and
+         * 1D FFTs we just have to fill in other
+         * dimensions to be 1. Note this is different from Heffte
+         * where we fill 0.
+         */
+        
+        std::array<int64_t, 3> nmodes;
+
+        const NDIndex<Dim>& lDom = layout.getLocalNDIndex();
 
+        nmodes.fill(1);
+    
+        for(size_t d = 0; d < Dim; ++d) {
+            nmodes[d] = lDom[d].length();;
+        }
+
+        type_m = type;
+        if(tempField_m.size() < lDom.size()) {
+            Kokkos::realloc(tempField_m, lDom[0].length(), lDom[1].length(), lDom[2].length());
+        }
+        for(size_t d = 0; d < Dim; ++d) {
+            if(tempR_m[d].size() < localNp) {
+                Kokkos::realloc(tempR_m[d], localNp);
+            }
+        }
+        if(tempQ_m.size() < localNp) {
+            Kokkos::realloc(tempQ_m, localNp);
+        }
+        setup(nmodes, params);
     }
+
+
+    /**
+        setup performs the initialization necessary.
+    */
+    template <size_t Dim, class T>
+    void
+    FFT<NUFFTransform,Dim,T>::setup(std::array<int64_t, 3>& nmodes,
+                                    const ParameterList& params)
+    {
+
+        cufinufft_opts opts;
+	    cufinufft_default_opts(&opts);
+        tol_m = 1e-6;
+
+        if(!params.get<bool>("use_cufinufft_defaults")) {
+           tol_m = params.get<T>("tolerance");
+           opts.gpu_method = params.get<int>("gpu_method");
+           opts.gpu_sort = params.get<int>("gpu_sort");
+           opts.gpu_kerevalmeth = params.get<int>("gpu_kerevalmeth");
+        }
+
+	    opts.gpu_maxbatchsize = 0; //default option. ignored for ntransf = 1 which
+                                   // is our case
+	    //For Perlmutter since the mask to hide the other GPUs in the node is 
+        //somehow not working there
+        //opts.gpu_device_id = (int)(Ippl::Comm->rank() % 4);
+
+        int iflag;
+        
+        if(type_m == 1) {
+            iflag = -1;
+        }
+        else if(type_m == 2) {
+            iflag = 1;
+        }
+        else {
+            throw std::logic_error("Only type 1 and type 2 NUFFT are allowed now");
+        }
+
+        //dim in cufinufft is int
+        int dim = static_cast<int>(Dim);
+        ier_m = nufft_m.makeplan(type_m, dim, nmodes.data(), iflag, 1, tol_m,
+                       		 &plan_m, &opts);  
+
+    }
+
+
+
+    template <size_t Dim, class T>
+    template<class... Properties>
+    void
+    FFT<NUFFTransform,Dim,T>::transform(const ParticleAttrib< Vector<T, Dim>, Properties... >& R,
+                                        ParticleAttrib<T, Properties... >& Q,
+                                        typename FFT<NUFFTransform,Dim,T>::ComplexField_t& f)
+    {
+        
+        
+        auto fview = f.getView();
+        auto Rview = R.getView();
+        auto Qview = Q.getView();
+        const int nghost = f.getNghost();
+
+        auto localNp = R.getParticleCount();
+
+        const Layout_t& layout = f.getLayout(); 
+        const UniformCartesian<T, Dim>& mesh = f.get_mesh();
+        const Vector<T, Dim>& dx = mesh.getMeshSpacing();
+        const Vector<T, Dim>& origin = mesh.getOrigin();
+        const auto& domain = layout.getDomain();
+        Vector<T, Dim> Len;
+        Vector<int, Dim> N;
+
+        for (unsigned d=0; d < Dim; ++d) {
+            N[d] = domain[d].length();
+            Len[d] = dx[d] * N[d];
+        }
+
+        const double pi = std::acos(-1.0);
+
+        auto tempField = tempField_m;
+        auto tempQ = tempQ_m;
+        Kokkos::View<T*,Kokkos::LayoutLeft> tempR[3] = {};
+        for(size_t d = 0; d < Dim; ++d) {
+            tempR[d] = tempR_m[d];
+        }
+        using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
+
+        Kokkos::parallel_for("copy from field data NUFFT",
+                             mdrange_type({nghost, nghost, nghost},
+                                          {fview.extent(0) - nghost,
+                                           fview.extent(1) - nghost,
+                                           fview.extent(2) - nghost
+                                          }),
+                             KOKKOS_LAMBDA(const size_t i,
+                                           const size_t j,
+                                           const size_t k)
+                             {
+                                 tempField(i-nghost, j-nghost, k-nghost).x =
+                                       fview(i, j, k).real();
+                                 tempField(i-nghost, j-nghost, k-nghost).y = 
+                                       fview(i, j, k).imag();
+                             });
+
+
+        Kokkos::parallel_for("copy from particle data NUFFT",
+                             localNp,
+                             KOKKOS_LAMBDA(const size_t i)
+                             {
+                                 for(size_t d = 0; d < Dim; ++d) {
+                                    //tempR[d](i) = (Rview(i)[d] - (twopiFactor * 2.0 * pi)) * (2.0 * pi / Len[d]);
+                                    tempR[d](i) = Rview(i)[d] * (2.0 * pi / Len[d]);
+                                    //tempR[d](i) = Rview(i)[d];
+                                 }
+                                 tempQ(i).x = Qview(i);
+                                 tempQ(i).y = 0.0;
+                             });
+
+        ier_m = nufft_m.setpts(plan_m, localNp, tempR[0].data(), tempR[1].data(), tempR[2].data(), 0, 
+                     NULL, NULL, NULL);
+
+        ier_m = nufft_m.execute(plan_m, tempQ.data(), tempField.data());
+        Kokkos::fence();
+
+
+        if(type_m == 1) { 
+            Kokkos::parallel_for("copy to field data NUFFT",
+                                 mdrange_type({nghost, nghost, nghost},
+                                              {fview.extent(0) - nghost,
+                                               fview.extent(1) - nghost,
+                                               fview.extent(2) - nghost
+                                              }),
+                                 KOKKOS_LAMBDA(const size_t i,
+                                               const size_t j,
+                                               const size_t k)
+                                 {
+                                     fview(i, j, k).real() =
+                                     tempField(i-nghost, j-nghost, k-nghost).x;
+                                     fview(i, j, k).imag() =
+                                     tempField(i-nghost, j-nghost, k-nghost).y;
+                                 });
+        }
+        else if(type_m == 2) {
+            Kokkos::parallel_for("copy to particle data NUFFT",
+                                 localNp,
+                                 KOKKOS_LAMBDA(const size_t i)
+                                 {
+                                     Qview(i) = tempQ(i).x;
+                                 });
+        }
+    }
+    
+    template <size_t Dim, class T>
+    FFT<NUFFTransform,Dim,T>::~FFT() {
+        
+        ier_m = nufft_m.destroy(plan_m);
+
+    }
+#endif
 }
 
 // vi: set et ts=4 sw=4 sts=4:
diff --git a/src/Field/BareField.hpp b/src/Field/BareField.hpp
index 685e6a751..36886e86d 100644
--- a/src/Field/BareField.hpp
+++ b/src/Field/BareField.hpp
@@ -92,7 +92,14 @@ namespace ippl {
 
     template <typename T, unsigned Dim>
     void BareField<T, Dim>::fillHalo() {
-        if(Ippl::Comm->size() > 1) {
+        
+        bool isAllSerial = true;
+        
+        for (unsigned d = 0; d < Dim; ++d) {
+            isAllSerial = isAllSerial && (layout_m->getRequestedDistribution(d) == SERIAL);
+        }
+        
+        if((Ippl::Comm->size() > 1) && (!isAllSerial)) {
             halo_m.fillHalo(dview_m, layout_m);
         }
         if(layout_m->isAllPeriodic_m) {
@@ -106,7 +113,14 @@ namespace ippl {
 
     template <typename T, unsigned Dim>
     void BareField<T, Dim>::accumulateHalo() {
-        if(Ippl::Comm->size() > 1) {
+
+        bool isAllSerial = true;
+        
+        for (unsigned d = 0; d < Dim; ++d) {
+            isAllSerial = isAllSerial && (layout_m->getRequestedDistribution(d) == SERIAL);
+        }
+        
+        if((Ippl::Comm->size() > 1) && (!isAllSerial)) {
             halo_m.accumulateHalo(dview_m, layout_m);
         }
         if(layout_m->isAllPeriodic_m) {
diff --git a/src/FieldLayout/FieldLayout.hpp b/src/FieldLayout/FieldLayout.hpp
index 34cf92e0e..9e129e497 100644
--- a/src/FieldLayout/FieldLayout.hpp
+++ b/src/FieldLayout/FieldLayout.hpp
@@ -132,15 +132,6 @@ namespace ippl {
 
         isAllPeriodic_m = isAllPeriodic;
 
-        if (nRanks < 2) {
-            Kokkos::resize(dLocalDomains_m, nRanks);
-            Kokkos::resize(hLocalDomains_m, nRanks);
-            hLocalDomains_m(0) = domain;
-            Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m);
-            return;
-        }
-
-
         // If the user did not specify parallel/serial flags then make all parallel.
         long totparelems = 1;
         for (unsigned d = 0; d < Dim; ++d) {
@@ -154,6 +145,22 @@ namespace ippl {
             }
         }
 
+        bool isAllSerial = true;
+        
+        for (unsigned d = 0; d < Dim; ++d) {
+            isAllSerial = isAllSerial && (requestedLayout_m[d] == SERIAL);
+        }
+
+        if ((nRanks < 2) || isAllSerial) {
+            Kokkos::resize(dLocalDomains_m,nRanks);
+            Kokkos::resize(hLocalDomains_m,nRanks);
+            for (int r = 0; r < nRanks; ++r) {
+                hLocalDomains_m(r) = domain;
+            }
+            Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m);
+            return;
+        }
+
         /* Check to see if we have too few elements to partition.  If so, reduce
          * the number of ranks (if necessary) to just the number of elements along
          * parallel dims.
diff --git a/src/Ippl.cpp b/src/Ippl.cpp
index 420af917a..26b5b30a6 100644
--- a/src/Ippl.cpp
+++ b/src/Ippl.cpp
@@ -98,7 +98,7 @@ Ippl::Ippl(int& argc, char**& argv, MPI_Comm mpicomm)
 
         if (infoLevel > 0 && Comm->myNode() == 0) {
             for (auto& l : notparsed) {
-                std::cout << "Warning: Option '" << l << "' is not parsed by Ippl." << std::endl;
+                std::cout << "Option '" << l << "' is not parsed by Ippl. Make sure your application parses it." << std::endl;
             }
         }
 
diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h
index 99276a82c..bfa089d29 100644
--- a/src/Particle/ParticleAttrib.h
+++ b/src/Particle/ParticleAttrib.h
@@ -31,6 +31,17 @@
 
 #include "Expression/IpplExpressions.h"
 #include "Particle/ParticleAttribBase.h"
+#include "FFT/FFT.h"
+#include "Utility/ParameterList.h"
+
+namespace Kokkos { //reduction identity must be defined in Kokkos namespace
+    template<>
+    struct reduction_identity< ippl::Vector<double, 3> > {
+        KOKKOS_FORCEINLINE_FUNCTION static ippl::Vector<double, 3> sum() {
+            return ippl::Vector<double, 3>();
+        }
+    };
+}
 
 namespace ippl {
 
@@ -50,6 +61,7 @@ namespace ippl {
 
         using size_type = detail::size_type;
 
+
         // Create storage for M particle attributes.  The storage is uninitialized.
         // New items are appended to the end of the array.
         void create(size_type) override;
@@ -127,7 +139,6 @@ namespace ippl {
         /*!
          * Assign the same value to the whole attribute.
          */
-	//KOKKOS_INLINE_FUNCTION
         ParticleAttrib<T, Properties...>& operator=(T x);
 
         /*!
@@ -138,22 +149,49 @@ namespace ippl {
          * @param expr is the expression
          */
         template <typename E, size_t N>
-	//KOKKOS_INLINE_FUNCTION
         ParticleAttrib<T, Properties...>& operator=(detail::Expression<E, N> const& expr);
 
 
-        //     // scatter the data from this attribute onto the given Field, using
-//     // the given Position attribute
+        // scatter the data from this attribute onto the given Field, using
+        // the given Position attribute
         template <unsigned Dim, class M, class C, typename P2>
         void
         scatter(Field<T, Dim, M, C>& f,
-                const ParticleAttrib<Vector<P2, Dim>, Properties... >& pp) const;
-        
+                const ParticleAttrib<Vector<P2, Dim>, Properties... >& pp,
+                const MPI_Comm& spaceComm) const;
+
+        template <unsigned Dim, class M, class C, typename P2, typename P3, typename P4>
+        void
+        scatterPIFNUDFT(Field<P2, Dim, M, C>& f, Field<P3, Dim, M, C>& Sk,
+                const ParticleAttrib<Vector<P4, Dim>, Properties... >& pp,
+                const MPI_Comm& spaceComm) const;
+
         template <unsigned Dim, class M, class C, typename P2>
         void
         gather(Field<T, Dim, M, C>& f,
                const ParticleAttrib<Vector<P2, Dim>, Properties...>& pp);
 
+        template <unsigned Dim, class M, class C, typename P2, typename P3, typename P4>
+        void
+        gatherPIFNUDFT(Field<P2, Dim, M, C>& f, Field<P3, Dim, M, C>& Sk,
+                const ParticleAttrib<Vector<P4, Dim>, Properties... >& pp);
+
+#ifdef KOKKOS_ENABLE_CUDA
+        template <unsigned Dim, class M, class C, typename P2, typename P3, typename P4>
+        void
+        scatterPIFNUFFT(Field<P2, Dim, M, C>& f, Field<P3, Dim, M, C>& Sk,
+                const ParticleAttrib<Vector<P4, Dim>, Properties... >& pp,
+                FFT<NUFFTransform, 3, P3>* nufft,
+                const MPI_Comm& spaceComm) const;
+        
+        template <unsigned Dim, class M, class C, typename P2, typename P3, typename P4>
+        void
+        gatherPIFNUFFT(Field<P2, Dim, M, C>& f, Field<P3, Dim, M, C>& Sk,
+                const ParticleAttrib<Vector<P4, Dim>, Properties... >& pp,
+                FFT<NUFFTransform, 3, P3>* nufft,
+                ParticleAttrib<P4, Properties... >& q);
+#endif
+
         T sum();
         T max();
         T min();
diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp
index 824cd50aa..724bc2913 100644
--- a/src/Particle/ParticleAttrib.hpp
+++ b/src/Particle/ParticleAttrib.hpp
@@ -30,6 +30,7 @@
 #include "Communicate/DataTypes.h"
 #include "Utility/IpplTimings.h"
 
+
 namespace ippl {
 
     template<typename T, class... Properties>
@@ -138,14 +139,16 @@ namespace ippl {
     template<typename T, class... Properties>
     template <unsigned Dim, class M, class C, class PT>
     void ParticleAttrib<T, Properties...>::scatter(Field<T,Dim,M,C>& f,
-                                                   const ParticleAttrib< Vector<PT,Dim>, Properties... >& pp)
+                                                   const ParticleAttrib< Vector<PT,Dim>, Properties... >& pp,
+                                                   const MPI_Comm& spaceComm)
     const
     {
-        static IpplTimings::TimerRef scatterTimer = IpplTimings::getTimer("scatter");           
-        IpplTimings::startTimer(scatterTimer);                                               
+        static IpplTimings::TimerRef scatterPICTimer = IpplTimings::getTimer("ScatterPIC");           
+        IpplTimings::startTimer(scatterPICTimer);                                               
+        
         typename Field<T, Dim, M, C>::view_type view = f.getView();
 
-        const M& mesh = f.get_mesh();
+        M& mesh = f.get_mesh();
 
         using vector_type = typename M::vector_type;
         using value_type  = typename ParticleAttrib<T, Properties...>::value_type;
@@ -154,10 +157,19 @@ namespace ippl {
         const vector_type& origin = mesh.getOrigin();
         const vector_type invdx = 1.0 / dx;
 
-        const FieldLayout<Dim>& layout = f.getLayout(); 
+        FieldLayout<Dim>& layout = f.getLayout(); 
         const NDIndex<Dim>& lDom = layout.getLocalNDIndex();
         const int nghost = f.getNghost();
 
+
+        Field<T, Dim, M, C> tempField;
+
+        tempField.initialize(mesh, layout);
+
+        tempField = 0.0;
+        
+        typename Field<T, Dim, M, C>::view_type viewLocal = tempField.getView();
+
         Kokkos::parallel_for(
             "ParticleAttrib::scatter",
             *(this->localNum_mp),
@@ -169,29 +181,137 @@ namespace ippl {
                 Vector<double, Dim> whi = l - index;
                 Vector<double, Dim> wlo = 1.0 - whi;
 
-                const size_t i = index[0] - lDom[0].first() + nghost;
-                const size_t j = index[1] - lDom[1].first() + nghost;
-                const size_t k = index[2] - lDom[2].first() + nghost;
-
+                const int i = index[0] - lDom[0].first() + nghost;
+                const int j = index[1] - lDom[1].first() + nghost;
+                const int k = index[2] - lDom[2].first() + nghost;
 
                 // scatter
                 const value_type& val = dview_m(idx);
-                Kokkos::atomic_add(&view(i-1, j-1, k-1), wlo[0] * wlo[1] * wlo[2] * val);
-                Kokkos::atomic_add(&view(i-1, j-1, k  ), wlo[0] * wlo[1] * whi[2] * val);
-                Kokkos::atomic_add(&view(i-1, j,   k-1), wlo[0] * whi[1] * wlo[2] * val);
-                Kokkos::atomic_add(&view(i-1, j,   k  ), wlo[0] * whi[1] * whi[2] * val);
-                Kokkos::atomic_add(&view(i,   j-1, k-1), whi[0] * wlo[1] * wlo[2] * val);
-                Kokkos::atomic_add(&view(i,   j-1, k  ), whi[0] * wlo[1] * whi[2] * val);
-                Kokkos::atomic_add(&view(i,   j,   k-1), whi[0] * whi[1] * wlo[2] * val);
-                Kokkos::atomic_add(&view(i,   j,   k  ), whi[0] * whi[1] * whi[2] * val);
+                Kokkos::atomic_add(&viewLocal(i-1, j-1, k-1), wlo[0] * wlo[1] * wlo[2] * val);
+                Kokkos::atomic_add(&viewLocal(i-1, j-1, k  ), wlo[0] * wlo[1] * whi[2] * val);
+                Kokkos::atomic_add(&viewLocal(i-1, j,   k-1), wlo[0] * whi[1] * wlo[2] * val);
+                Kokkos::atomic_add(&viewLocal(i-1, j,   k  ), wlo[0] * whi[1] * whi[2] * val);
+                Kokkos::atomic_add(&viewLocal(i,   j-1, k-1), whi[0] * wlo[1] * wlo[2] * val);
+                Kokkos::atomic_add(&viewLocal(i,   j-1, k  ), whi[0] * wlo[1] * whi[2] * val);
+                Kokkos::atomic_add(&viewLocal(i,   j,   k-1), whi[0] * whi[1] * wlo[2] * val);
+                Kokkos::atomic_add(&viewLocal(i,   j,   k  ), whi[0] * whi[1] * whi[2] * val);
             }
         );
-        IpplTimings::stopTimer(scatterTimer);
             
-        static IpplTimings::TimerRef accumulateHaloTimer = IpplTimings::getTimer("accumulateHalo");           
-        IpplTimings::startTimer(accumulateHaloTimer);                                               
-        f.accumulateHalo();
-        IpplTimings::stopTimer(accumulateHaloTimer);                                               
+        tempField.accumulateHalo();
+
+        IpplTimings::stopTimer(scatterPICTimer);
+
+        static IpplTimings::TimerRef scatterAllReducePICTimer = IpplTimings::getTimer("scatterAllReducePIC");           
+        IpplTimings::startTimer(scatterAllReducePICTimer);                                               
+        int viewSize = view.extent(0) * view.extent(1) * view.extent(2);
+        MPI_Allreduce(viewLocal.data(), view.data(), viewSize, 
+                      MPI_DOUBLE, MPI_SUM, spaceComm);  
+        IpplTimings::stopTimer(scatterAllReducePICTimer);
+    }
+
+
+    template<typename T, class... Properties>
+    template <unsigned Dim, class M, class C, class FT, class ST, class PT>
+    void ParticleAttrib<T, Properties...>::scatterPIFNUDFT(Field<FT,Dim,M,C>& f, Field<ST,Dim,M,C>& Sk,
+                                                   const ParticleAttrib< Vector<PT,Dim>, Properties... >& pp,
+                                                   const MPI_Comm& spaceComm)
+    const
+    {
+        
+        static IpplTimings::TimerRef scatterPIFNUDFTTimer = IpplTimings::getTimer("ScatterPIFNUDFT");           
+        IpplTimings::startTimer(scatterPIFNUDFTTimer);
+        
+        using view_type = typename Field<FT, Dim, M, C>::view_type;
+        using vector_type = typename M::vector_type;
+        using value_type  = typename ParticleAttrib<T, Properties...>::value_type;
+        view_type fview = f.getView();
+        typename Field<ST, Dim, M, C>::view_type Skview = Sk.getView();
+        const int nghost = f.getNghost();
+        const FieldLayout<Dim>& layout = f.getLayout(); 
+        const M& mesh = f.get_mesh();
+        const vector_type& dx = mesh.getMeshSpacing();
+        const auto& domain = layout.getDomain();
+        vector_type Len;
+        Vector<int, Dim> N;
+
+
+        for (unsigned d=0; d < Dim; ++d) {
+            N[d] = domain[d].length();
+            Len[d] = dx[d] * N[d];
+        }
+        
+        typedef Kokkos::TeamPolicy<> team_policy;
+        typedef Kokkos::TeamPolicy<>::member_type member_type;
+
+        using view_type_temp = typename detail::ViewType<FT, 3>::view_type;
+
+        view_type_temp viewLocal("viewLocal",fview.extent(0),fview.extent(1),fview.extent(2));
+        
+        double pi = std::acos(-1.0);
+        Kokkos::complex<double> imag = {0.0, 1.0};
+
+        size_t Np = *(this->localNum_mp);
+
+        size_t flatN = N[0]*N[1]*N[2];
+
+        Kokkos::parallel_for("ParticleAttrib::scatterPIFNUDFT compute",
+                team_policy(flatN, Kokkos::AUTO),
+                KOKKOS_CLASS_LAMBDA(const member_type& teamMember) {
+                const size_t flatIndex = teamMember.league_rank();
+               
+#ifdef KOKKOS_ENABLE_CUDA
+                const int k = (int)(flatIndex / (N[0] * N[1]));
+                const int flatIndex2D = flatIndex - (k * N[0] * N[1]);
+                const int i = flatIndex2D % N[0];
+                const int j = (int)(flatIndex2D / N[0]);
+#else
+
+                const int i = (int)(flatIndex / (N[0] * N[1]));
+                const int flatIndex2D = flatIndex - (i * N[0] * N[1]);
+                const int k = flatIndex2D % N[0];
+                const int j = (int)(flatIndex2D / N[0]);
+#endif
+                
+                FT reducedValue = 0.0;
+                Vector<int, 3> iVec = {i, j, k};
+                vector_type kVec;
+                for(size_t d = 0; d < Dim; ++d) {
+                    //bool shift = (iVec[d] > (N[d]/2));
+                    //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]);
+                    kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2));
+                }
+                auto Sk = Skview(i+nghost, j+nghost, k+nghost);
+                Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, Np),
+                [=](const size_t idx, FT& innerReduce)
+                {
+                    double arg = 0.0;
+                    for(size_t d = 0; d < Dim; ++d) {
+                        arg += kVec[d]*pp(idx)[d];
+                    }
+                    const value_type& val = dview_m(idx);
+
+                    innerReduce += Sk * (Kokkos::cos(arg) 
+                                - imag * Kokkos::sin(arg)) * val;
+                }, Kokkos::Sum<FT>(reducedValue));
+
+                if(teamMember.team_rank() == 0) {
+                    viewLocal(i+nghost,j+nghost,k+nghost) = reducedValue;
+                    //fview(i+nghost,j+nghost,k+nghost) = reducedValue;
+                }
+
+                }
+        );
+
+        IpplTimings::stopTimer(scatterPIFNUDFTTimer);
+
+        static IpplTimings::TimerRef scatterAllReducePIFTimer = IpplTimings::getTimer("scatterAllReducePIF");           
+        IpplTimings::startTimer(scatterAllReducePIFTimer);                                               
+        int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2);
+        MPI_Allreduce(viewLocal.data(), fview.data(), viewSize, 
+                      MPI_C_DOUBLE_COMPLEX, MPI_SUM, spaceComm);  
+        IpplTimings::stopTimer(scatterAllReducePIFTimer);
+
     }
 
 
@@ -201,13 +321,11 @@ namespace ippl {
                                                   const ParticleAttrib<Vector<P2, Dim>, Properties...>& pp)
     {
 
-        static IpplTimings::TimerRef fillHaloTimer = IpplTimings::getTimer("fillHalo");           
-        IpplTimings::startTimer(fillHaloTimer);                                               
-        f.fillHalo();
-        IpplTimings::stopTimer(fillHaloTimer);                                               
+        static IpplTimings::TimerRef gatherPICTimer = IpplTimings::getTimer("GatherPIC");           
+        IpplTimings::startTimer(gatherPICTimer);                                               
 
-        static IpplTimings::TimerRef gatherTimer = IpplTimings::getTimer("gather");           
-        IpplTimings::startTimer(gatherTimer);                                               
+        f.fillHalo();
+        
         const typename Field<T, Dim, M, C>::view_type view = f.getView();
 
         const M& mesh = f.get_mesh();
@@ -250,25 +368,318 @@ namespace ippl {
                     + whi[0] * whi[1] * whi[2] * view(i,   j,   k  );
             }
         );
-        IpplTimings::stopTimer(gatherTimer);                                               
+        IpplTimings::stopTimer(gatherPICTimer);                                               
+    }
+
+    template<typename T, class... Properties>
+    template <unsigned Dim, class M, class C, class FT, class ST, class PT>
+    void ParticleAttrib<T, Properties...>::gatherPIFNUDFT(Field<FT,Dim,M,C>& f, Field<ST,Dim,M,C>& Sk,
+                                                   const ParticleAttrib< Vector<PT,Dim>, Properties... >& pp)
+    {
+        static IpplTimings::TimerRef gatherPIFNUDFTTimer = IpplTimings::getTimer("GatherPIFNUDFT");           
+        IpplTimings::startTimer(gatherPIFNUDFTTimer);
+        
+        using view_type = typename Field<FT, Dim, M, C>::view_type;
+        using vector_type = typename M::vector_type;
+        using value_type  = typename ParticleAttrib<T, Properties...>::value_type;
+        view_type fview = f.getView();
+        typename Field<ST, Dim, M, C>::view_type Skview = Sk.getView();
+        const int nghost = f.getNghost();
+        const FieldLayout<Dim>& layout = f.getLayout(); 
+        const M& mesh = f.get_mesh();
+        const vector_type& dx = mesh.getMeshSpacing();
+        const auto& domain = layout.getDomain();
+        vector_type Len;
+        Vector<int, Dim> N;
+
+        for (unsigned d=0; d < Dim; ++d) {
+            N[d] = domain[d].length();
+            Len[d] = dx[d] * N[d];
+        }
+
+        typedef Kokkos::TeamPolicy<> team_policy;
+        typedef Kokkos::TeamPolicy<>::member_type member_type;
+
+        double pi = std::acos(-1.0);
+        Kokkos::complex<double> imag = {0.0, 1.0};
+
+        size_t Np = *(this->localNum_mp);
+
+        size_t flatN = N[0]*N[1]*N[2];
+
+        Kokkos::parallel_for("ParticleAttrib::gatherPIFNUDFT",
+                team_policy(Np, Kokkos::AUTO),
+                KOKKOS_CLASS_LAMBDA(const member_type& teamMember) {
+                const size_t idx = teamMember.league_rank();
+
+                value_type reducedValue = 0.0;
+                Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, flatN),
+                [=](const size_t flatIndex, value_type& innerReduce)
+                {
+                    
+#ifdef KOKKOS_ENABLE_CUDA
+                    const int k = (int)(flatIndex / (N[0] * N[1]));
+                    const int flatIndex2D = flatIndex - (k * N[0] * N[1]);
+                    const int i = flatIndex2D % N[0];
+                    const int j = (int)(flatIndex2D / N[0]);
+#else
+                    const int i = (int)(flatIndex / (N[0] * N[1]));
+                    const int flatIndex2D = flatIndex - (i * N[0] * N[1]);
+                    const int k = flatIndex2D % N[0];
+                    const int j = (int)(flatIndex2D / N[0]);
+#endif
+
+                    Vector<int, 3> iVec = {i, j, k};
+                    vector_type kVec;
+                    double Dr = 0.0, arg = 0.0;
+                    for(size_t d = 0; d < Dim; ++d) {
+                        //bool shift = (iVec[d] > (N[d]/2));
+                        //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]);
+                        //kVec[d] = 2 * pi / Len[d] * iVec[d];
+                        kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d]/2));
+                        Dr += kVec[d] * kVec[d];
+                        arg += kVec[d]*pp(idx)[d];
+                    }
+                    
+
+                    FT Ek = 0.0;
+                    value_type Ex = 0.0;
+                    auto rho = fview(i+nghost,j+nghost,k+nghost);
+                    auto Sk = Skview(i+nghost,j+nghost,k+nghost);
+                    for(size_t d = 0; d < Dim; ++d) {
+                        
+                        bool isNotZero = (Dr != 0.0);
+                        double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); 
+                        Ek = -(imag * kVec[d] * rho * factor);
+                        
+                        //Inverse Fourier transform when the lhs is real. Use when 
+                        //we choose k \in [0 K) instead of from [-K/2+1 K/2] 
+                        //Ex[d] = 2.0 * (Ek.real() * Kokkos::cos(arg) 
+                        //        - Ek.imag() * Kokkos::sin(arg));
+                        Ek *= Sk * (Kokkos::cos(arg) 
+                                + imag * Kokkos::sin(arg));
+                        Ex[d] = Ek.real();
+                    }
+                    
+                    innerReduce += Ex;
+                }, Kokkos::Sum<value_type>(reducedValue));
+
+                teamMember.team_barrier();
+
+                if(teamMember.team_rank() == 0) {
+                    dview_m(idx) = reducedValue;
+                }
+
+                }
+        );
+
+        
+        IpplTimings::stopTimer(gatherPIFNUDFTTimer);
+
     }
 
+#ifdef KOKKOS_ENABLE_CUDA
+    
+    template<typename T, class... Properties>
+    template <unsigned Dim, class M, class C, class FT, class ST, class PT>
+    void ParticleAttrib<T, Properties...>::scatterPIFNUFFT(Field<FT,Dim,M,C>& f, Field<ST,Dim,M,C>& Sk,
+                                                   const ParticleAttrib< Vector<PT,Dim>, Properties... >& pp,
+                                                   FFT<NUFFTransform, 3, ST>* nufft,
+                                                   const MPI_Comm& spaceComm)
+    const
+    {
+        
+        static IpplTimings::TimerRef scatterPIFNUFFTTimer = IpplTimings::getTimer("ScatterPIFNUFFT");           
+        IpplTimings::startTimer(scatterPIFNUFFTTimer);
+
+        auto q = *this;
+        
+        Field<FT,Dim,M,C> tempField;
+
+        FieldLayout<Dim>& layout = f.getLayout(); 
+        M& mesh = f.get_mesh();
 
+        tempField.initialize(mesh, layout);
+
+        tempField = 0.0;
+        
+        nufft->transform(pp, q, tempField);
+        
+        using view_type = typename Field<FT, Dim, M, C>::view_type;
+        view_type fview = f.getView();
+        view_type viewLocal = tempField.getView();
+        typename Field<ST, Dim, M, C>::view_type Skview = Sk.getView();
+        const int nghost = f.getNghost();
+        
+        IpplTimings::stopTimer(scatterPIFNUFFTTimer);
+
+        static IpplTimings::TimerRef scatterAllReducePIFTimer = IpplTimings::getTimer("scatterAllReducePIF");           
+        IpplTimings::startTimer(scatterAllReducePIFTimer);                                               
+        int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2);
+        MPI_Allreduce(viewLocal.data(), fview.data(), viewSize, 
+                      MPI_C_DOUBLE_COMPLEX, MPI_SUM, spaceComm);  
+        IpplTimings::stopTimer(scatterAllReducePIFTimer);
+
+        IpplTimings::startTimer(scatterPIFNUFFTTimer);
+
+        using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
+        Kokkos::parallel_for("Multiply with shape functions",
+                            mdrange_type({nghost, nghost, nghost},
+                                         {fview.extent(0) - nghost, 
+                                          fview.extent(1) - nghost,
+                                          fview.extent(2) - nghost}),
+                            KOKKOS_LAMBDA(const size_t i,
+                                          const size_t j,
+                                          const size_t k)
+        {
+            fview(i, j, k) *= Skview(i, j, k);    
+        });
+
+        IpplTimings::stopTimer(scatterPIFNUFFTTimer);
+    }
+
+
+    template<typename T, class... Properties>
+    template <unsigned Dim, class M, class C, class FT, class ST, class PT>
+    void ParticleAttrib<T, Properties...>::gatherPIFNUFFT(Field<FT,Dim,M,C>& f, Field<ST,Dim,M,C>& Sk,
+                                                   const ParticleAttrib< Vector<PT,Dim>, Properties... >& pp, 
+                                                   FFT<NUFFTransform, 3, ST>* nufft,
+                                                   ParticleAttrib<PT, Properties... >& q)
+    {
+        static IpplTimings::TimerRef gatherPIFNUFFTTimer = IpplTimings::getTimer("GatherPIFNUFFT");           
+        IpplTimings::startTimer(gatherPIFNUFFTTimer);
+
+        Field<FT,Dim,M,C> tempField;
+
+        FieldLayout<Dim>& layout = f.getLayout(); 
+        M& mesh = f.get_mesh();
+
+        tempField.initialize(mesh, layout);
+        
+        using view_type = typename Field<FT, Dim, M, C>::view_type;
+        using vector_type = typename M::vector_type;
+        view_type fview = f.getView();
+        view_type tempview = tempField.getView();
+        auto qview = q.getView();
+        typename Field<ST, Dim, M, C>::view_type Skview = Sk.getView();
+        const int nghost = f.getNghost();
+        const vector_type& dx = mesh.getMeshSpacing();
+        const auto& domain = layout.getDomain();
+        vector_type Len;
+        Vector<int, Dim> N;
+
+        for (unsigned d=0; d < Dim; ++d) {
+            N[d] = domain[d].length();
+            Len[d] = dx[d] * N[d];
+        }
+
+
+        double pi = std::acos(-1.0);
+        Kokkos::complex<double> imag = {0.0, 1.0};
+        size_t Np = *(this->localNum_mp);
+
+        
+        using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
+        
+        for(size_t gd = 0; gd < Dim; ++gd) {
+            Kokkos::parallel_for("Gather NUFFT",
+                                mdrange_type({nghost, nghost, nghost},
+                                             {fview.extent(0) - nghost,
+                                              fview.extent(1) - nghost,
+                                              fview.extent(2) - nghost}),
+                                KOKKOS_LAMBDA(const int i,
+                                              const int j,
+                                              const int k)
+            {
+                Vector<int, 3> iVec = {i-nghost, j-nghost, k-nghost};
+                Vector<double, 3> kVec;
+
+                double Dr = 0.0;
+                for(size_t d = 0; d < Dim; ++d) {
+                    kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2));
+                    Dr += kVec[d] * kVec[d];
+                }
+
+                tempview(i, j, k) = fview(i, j, k);
+                
+                bool isNotZero = (Dr != 0.0);
+                double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); 
+                
+                tempview(i, j, k) *= -Skview(i, j, k) * (imag * kVec[gd] * factor);
+            });
+
+            nufft->transform(pp, q, tempField);
+
+            Kokkos::parallel_for("Assign E gather NUFFT",
+                                Np,
+                                KOKKOS_CLASS_LAMBDA(const size_t i)
+            {
+                dview_m(i)[gd] = qview(i);
+            });
+        }
+        
+        IpplTimings::stopTimer(gatherPIFNUFFTTimer);
+
+    }
+#endif
 
     /*
-     * Non-class function
+     * Non-class functions
      *
      */
 
+    template<typename P1, unsigned Dim, class M, class C, typename P2, typename P3, typename P4, class... Properties>
+    inline
+    void scatterPIFNUFFT(const ParticleAttrib<P1, Properties...>& attrib, Field<P2, Dim, M, C>& f,
+                 Field<P3, Dim, M, C>& Sk, const ParticleAttrib<Vector<P4, Dim>, Properties...>& pp,
+                 FFT<NUFFTransform, 3, P3>* nufft,
+                 const MPI_Comm& spaceComm = MPI_COMM_WORLD)
+    {
+#ifdef KOKKOS_ENABLE_CUDA
+        attrib.scatterPIFNUFFT(f, Sk, pp, nufft, spaceComm);
+#else
+        //throw IpplException("scatterPIFNUFFT", "The NUFFT library cuFINUFFT currently only works with CUDA and hence Kokkos needs to 
+        //                     be compiled with CUDA. Otherwise use scatterPIFNUDFT.");
+#endif
+    }
+
+    template<typename P1, unsigned Dim, class M, class C, typename P2, typename P3, typename P4, class... Properties>
+    inline
+    void gatherPIFNUFFT(ParticleAttrib<P1, Properties...>& attrib, Field<P2, Dim, M, C>& f,
+                 Field<P3, Dim, M, C>& Sk, const ParticleAttrib<Vector<P4, Dim>, Properties...>& pp, 
+                 FFT<NUFFTransform, 3, P3>* nufft,
+                 ParticleAttrib<P4, Properties... >& q)
+    {
+#ifdef KOKKOS_ENABLE_CUDA
+        attrib.gatherPIFNUFFT(f, Sk, pp, nufft, q);
+#else
+        //throw IpplException("gatherPIFNUFFT",
+        //                    "The NUFFT library cuFINUFFT currently only works with CUDA and hence Kokkos needs to 
+        //                     be compiled with CUDA. Otherwise use gatherPIFNUDFT.");
+#endif
+    }
+
+
 
     template<typename P1, unsigned Dim, class M, class C, typename P2, class... Properties>
     inline
     void scatter(const ParticleAttrib<P1, Properties...>& attrib, Field<P1, Dim, M, C>& f,
-                 const ParticleAttrib<Vector<P2, Dim>, Properties...>& pp)
+                 const ParticleAttrib<Vector<P2, Dim>, Properties...>& pp,
+                 const MPI_Comm& spaceComm = MPI_COMM_WORLD)
     {
-        attrib.scatter(f, pp);
+        attrib.scatter(f, pp, spaceComm);
     }
 
+    template<typename P1, unsigned Dim, class M, class C, typename P2, typename P3, typename P4, class... Properties>
+    inline
+    void scatterPIFNUDFT(const ParticleAttrib<P1, Properties...>& attrib, Field<P2, Dim, M, C>& f,
+                 Field<P3, Dim, M, C>& Sk, const ParticleAttrib<Vector<P4, Dim>, Properties...>& pp,
+                 const MPI_Comm& spaceComm = MPI_COMM_WORLD)
+    {
+        attrib.scatterPIFNUDFT(f, Sk, pp, spaceComm);
+    }
+
+
 
     template<typename P1, unsigned Dim, class M, class C, typename P2, class... Properties>
     inline
@@ -278,6 +689,15 @@ namespace ippl {
         attrib.gather(f, pp);
     }
 
+    template<typename P1, unsigned Dim, class M, class C, typename P2, typename P3, typename P4, class... Properties>
+    inline
+    void gatherPIFNUDFT(ParticleAttrib<P1, Properties...>& attrib, Field<P2, Dim, M, C>& f,
+                 Field<P3, Dim, M, C>& Sk, const ParticleAttrib<Vector<P4, Dim>, Properties...>& pp)
+    {
+        attrib.gatherPIFNUDFT(f, Sk, pp);
+    }
+
+
     #define DefineParticleReduction(fun, name, op, MPI_Op)                                                   \
     template<typename T, class... Properties>                                                                \
     T ParticleAttrib<T, Properties...>::name() {                                                             \
diff --git a/src/Particle/ParticleBC.h b/src/Particle/ParticleBC.h
index 275f04e00..dfd5aa5a0 100644
--- a/src/Particle/ParticleBC.h
+++ b/src/Particle/ParticleBC.h
@@ -77,8 +77,11 @@ namespace ippl {
         struct PeriodicBC : public ParticleBC<T, Dim, ViewType> {
             using value_type = typename ParticleBC<T, Dim, ViewType>::value_type;
 
-            using ParticleBC<T, Dim, ViewType>::extent_m;
-            using ParticleBC<T, Dim, ViewType>::middle_m;
+            //using ParticleBC<T, Dim, ViewType>::extent_m;
+            //using ParticleBC<T, Dim, ViewType>::middle_m;
+            using ParticleBC<T, Dim, ViewType>::maxval_m;
+            using ParticleBC<T, Dim, ViewType>::minval_m;
+            using ParticleBC<T, Dim, ViewType>::isUpper_m;
 
             KOKKOS_DEFAULTED_FUNCTION
             PeriodicBC() = default;
@@ -94,7 +97,16 @@ namespace ippl {
             KOKKOS_INLINE_FUNCTION
             void operator()(const size_t& i) const {
                 value_type& value = this->view_m(i)[this->dim_m];
-                value = value - extent_m * (int)((value - middle_m) * 2 / extent_m);
+                //value = value - this->extent_m * (int)((value - this->middle_m) * 2 / extent_m);
+                //if ((value < this->minval_m) && (!this->isUpper_m))
+                //    value = (this->maxval_m - (this->minval_m - value));
+                //else if ((value >= this->maxval_m) && (this->isUpper_m))
+                //    value = (this->minval_m + (value - this->maxval_m));
+                bool tooHigh = value >= maxval_m;
+                bool tooLow = value < minval_m;
+
+                value += tooHigh * (minval_m - maxval_m) +
+                         tooLow * (maxval_m - minval_m);
             }
 
             KOKKOS_DEFAULTED_FUNCTION
diff --git a/src/Solver/FFTPeriodicPoissonSolver.hpp b/src/Solver/FFTPeriodicPoissonSolver.hpp
index 015400e9a..73d7d2a2c 100644
--- a/src/Solver/FFTPeriodicPoissonSolver.hpp
+++ b/src/Solver/FFTPeriodicPoissonSolver.hpp
@@ -113,12 +113,11 @@ namespace ippl {
 
                   double Dr = kVec[0] * kVec[0] +
                               kVec[1] * kVec[1] + kVec[2] * kVec[2];
-
-                  //It would be great if we can remove this conditional
-                  if(Dr != 0.0)
-                      view(i, j, k) *= 1 / Dr;
-                  else
-                      view(i, j, k) = 0.0;
+                  
+                  bool isNotZero = (Dr != 0.0);
+                  double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); 
+                  
+                  view(i, j, k) *= factor;
               });
 
               fft_mp->transform(-1, *this->rhs_mp, fieldComplex_m);
@@ -158,6 +157,8 @@ namespace ippl {
                             const double Len = rmax[d] - origin[d];
                             bool shift = (iVec[d] > (N[d]/2));
                             bool notMid = (iVec[d] != (N[d]/2));
+                            //For the noMid part see 
+                            //https://math.mit.edu/~stevenj/fft-deriv.pdf Algorithm 1
                             kVec[d] = notMid * 2 * pi / Len * (iVec[d] - shift * N[d]);
                         }
 
@@ -166,11 +167,10 @@ namespace ippl {
 
                         tempview(i, j, k) = view(i, j, k);
                         
-                        //It would be great if we can remove this conditional
-                        if(Dr != 0.0)
-                            tempview(i, j, k) *= -(imag * kVec[gd] / Dr);
-                        else
-                            tempview(i, j, k) = 0.0;
+                        bool isNotZero = (Dr != 0.0);
+                        double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); 
+                        
+                        tempview(i, j, k) *= -(imag * kVec[gd] * factor);
                     });
 
                     fft_mp->transform(-1, *this->rhs_mp, tempFieldComplex_m);
diff --git a/src/Types/ViewTypes.h b/src/Types/ViewTypes.h
index 179cc4056..a8877926d 100644
--- a/src/Types/ViewTypes.h
+++ b/src/Types/ViewTypes.h
@@ -19,6 +19,7 @@
 #define IPPL_VIEW_TYPES_H
 
 #include <Kokkos_Core.hpp>
+#include <iostream>
 
 namespace ippl {
     /**
@@ -54,7 +55,7 @@ namespace ippl {
         };
 
         /*!
-         * Specialized view type for thee dimensions.
+         * Specialized view type for three dimensions.
          */
         template <typename T, class... Properties>
         struct ViewType<T, 3, Properties...> {
diff --git a/src/Utility/IpplTimings.cpp b/src/Utility/IpplTimings.cpp
index 7cc1079c4..c52c53b51 100644
--- a/src/Utility/IpplTimings.cpp
+++ b/src/Utility/IpplTimings.cpp
@@ -115,7 +115,7 @@ void Timing::print() {
     msg << level1
         << "---------------------------------------------";
     msg << "\n";
-    msg << "     Timing results for " << Ippl::Comm->getNodes() << " nodes:" << "\n";
+    msg << "     Timing results for " << Ippl::Comm->getNodes() << " ranks:" << "\n";
     msg << "---------------------------------------------";
     msg << "\n";
 
@@ -178,7 +178,7 @@ void Timing::print(const std::string &fn, const std::map<std::string, unsigned i
         *msg << endl;
     }
 
-    *msg << std::setw(27) << "num Nodes"
+    *msg << std::setw(27) << "num Ranks"
          << std::setw(11) << "Wall tot\n"
          << std::string().assign(37,'=')
          << "\n";
@@ -198,7 +198,7 @@ void Timing::print(const std::string &fn, const std::map<std::string, unsigned i
     }
 
     *msg << "\n"
-         << std::setw(27) << "num Nodes"
+         << std::setw(27) << "num Ranks"
          << std::setw(10) << "Wall max"
          << std::setw(10) << "Wall min"
          << std::setw(11) << "Wall avg\n"
diff --git a/test/FFT/CMakeLists.txt b/test/FFT/CMakeLists.txt
index 5d0332166..4d3e5fe90 100644
--- a/test/FFT/CMakeLists.txt
+++ b/test/FFT/CMakeLists.txt
@@ -39,6 +39,18 @@ target_link_libraries (
     ${IPPL_LIBS}
     ${MPI_CXX_LIBRARIES}
 )
+add_executable (TestNUFFT1 TestNUFFT1.cpp)
+target_link_libraries (
+    TestNUFFT1
+    ${IPPL_LIBS}
+    ${MPI_CXX_LIBRARIES}
+)
+add_executable (TestNUFFT2 TestNUFFT2.cpp)
+target_link_libraries (
+    TestNUFFT2
+    ${IPPL_LIBS}
+    ${MPI_CXX_LIBRARIES}
+)
 # vi: set et ts=4 sw=4 sts=4:
 
 # Local Variables:
diff --git a/test/FFT/TestNUFFT1.cpp b/test/FFT/TestNUFFT1.cpp
new file mode 100644
index 000000000..0f6a6ba3e
--- /dev/null
+++ b/test/FFT/TestNUFFT1.cpp
@@ -0,0 +1,281 @@
+#include "Ippl.h"
+#include "Utility/ParameterList.h"
+
+#include <iostream>
+#include <typeinfo>
+#include <array>
+#include<Kokkos_Random.hpp>
+#include <random>
+
+template<class PLayout>
+struct Bunch : public ippl::ParticleBase<PLayout>
+{
+
+    Bunch(PLayout& playout)
+    : ippl::ParticleBase<PLayout>(playout)
+    {
+        this->addAttribute(Q);
+    }
+
+    ~Bunch(){ }
+    
+    typedef ippl::ParticleAttrib<double> charge_container_type;
+    charge_container_type Q;
+
+};
+
+template <typename T, class GeneratorPool, unsigned Dim>
+struct generate_random {
+
+  using view_type = typename ippl::detail::ViewType<T, 1>::view_type;
+  using value_type  = typename T::value_type;
+  using view_type_scalar = typename ippl::detail::ViewType<value_type, 1>::view_type;
+  // Output View for the random numbers
+  view_type x;
+
+  view_type_scalar Q;
+
+  // The GeneratorPool
+  GeneratorPool rand_pool;
+
+  T minU, maxU;
+
+  // Initialize all members
+  generate_random(view_type x_,view_type_scalar Q_,  GeneratorPool rand_pool_, 
+                  T& minU_, T& maxU_)
+      : x(x_), Q(Q_), rand_pool(rand_pool_), 
+        minU(minU_), maxU(maxU_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t i) const {
+    // Get a random number state from the pool for the active thread
+    typename GeneratorPool::generator_type rand_gen = rand_pool.get_state();
+
+    for (unsigned d = 0; d < Dim; ++d) {
+        x(i)[d] = rand_gen.drand(minU[d], maxU[d]);
+    }
+    Q(i) = rand_gen.drand(0.0, 1.0);
+
+    // Give the state back, which will allow another thread to acquire it
+    rand_pool.free_state(rand_gen);
+  }
+};
+
+
+int main(int argc, char *argv[]) {
+
+    Ippl ippl(argc,argv);
+
+    constexpr unsigned int dim = 3;
+    const double pi = std::acos(-1.0);
+
+    typedef ippl::ParticleSpatialLayout<double, 3> playout_type;
+    typedef Bunch<playout_type> bunch_type;
+
+    
+    ippl::Vector<int, dim> pt = {16, 16, 16};
+    ippl::Index I(pt[0]);
+    ippl::Index J(pt[1]);
+    ippl::Index K(pt[2]);
+    ippl::NDIndex<dim> owned(I, J, K);
+
+    ippl::e_dim_tag decomp[dim];    // Specifies SERIAL, PARALLEL dims
+    for (unsigned int d=0; d<dim; d++)
+        decomp[d] = ippl::SERIAL;
+
+    ippl::FieldLayout<dim> layout(owned, decomp);
+
+    typedef ippl::Vector<double, 3> Vector_t;
+    Vector_t minU = {-pi, -pi, -pi};
+    Vector_t maxU = {pi, pi, pi};
+    //Vector_t minU = {0.0, 0.0, 0.0};
+    //Vector_t maxU = {25.0, 25.0, 25.0};
+    
+    std::array<double, dim> dx = {
+    (maxU[0] - minU[0]) / double(pt[0]),
+    (maxU[1] - minU[1]) / double(pt[1]),
+    (maxU[2] - minU[2]) / double(pt[2]),
+    };
+    
+
+    Vector_t hx = {dx[0], dx[1], dx[2]};
+    Vector_t origin = {minU[0], minU[1], minU[2]};
+    ippl::UniformCartesian<double, 3> mesh(owned, hx, origin);
+
+    playout_type pl(layout, mesh);
+
+    bunch_type bunch(pl);
+    bunch.setParticleBC(ippl::BC::PERIODIC);
+   
+    using size_type = ippl::detail::size_type;
+
+
+    size_type Np = std::pow(16,3);
+    
+    typedef ippl::Field<Kokkos::complex<double>, dim> field_type;
+
+    field_type field(mesh, layout);
+    field_type field_dft(mesh, layout);
+
+    ippl::ParameterList fftParams;
+
+    fftParams.add("gpu_method", 1);
+    fftParams.add("gpu_sort", 0);
+    fftParams.add("gpu_kerevalmeth", 1);
+    fftParams.add("tolerance", 1e-12);
+
+    fftParams.add("use_cufinufft_defaults", false);  
+    
+    typedef ippl::FFT<ippl::NUFFTransform, 3, double> FFT_type;
+
+    std::unique_ptr<FFT_type> fft;
+    
+    int type = 1;
+    
+
+    size_type nloc = Np/Ippl::Comm->size();
+
+    bunch.create(nloc);
+    fft = std::make_unique<FFT_type>(layout, nloc, type, fftParams);
+    Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42));
+    Kokkos::parallel_for(nloc,
+                         generate_random<Vector_t, Kokkos::Random_XorShift64_Pool<>, dim>(
+                         bunch.R.getView(), bunch.Q.getView(), rand_pool64, minU, maxU));
+    
+
+    fft->transform(bunch.R, bunch.Q, field);
+    
+    auto field_result = Kokkos::create_mirror_view_and_copy(
+                        Kokkos::HostSpace(), field.getView());
+
+
+    //Pick some mode to check. We choose it same as cuFINUFFT testcase cufinufft3d1_test.cu
+    ippl::Vector<int, 3> kVec;
+    kVec[0] = (int)(0.37 * pt[0]);
+    kVec[1] = (int)(0.26 * pt[1]);
+    kVec[2] = (int)(0.13 * pt[2]);
+
+    const int nghost = field.getNghost();
+
+    int iInd = (pt[0]/2 + kVec[0] + nghost);
+    int jInd = (pt[1]/2 + kVec[1] + nghost);
+    int kInd = (pt[2]/2 + kVec[2] + nghost);
+
+
+    Kokkos::complex<double> reducedValue(0.0, 0.0);
+
+    auto Rview = bunch.R.getView();
+    auto Qview = bunch.Q.getView();
+
+    Kokkos::complex<double> imag = {0.0, 1.0};
+    size_t flatN = pt[0] * pt[1] * pt[2];
+    auto fview = field_dft.getView();
+  
+
+
+    typedef Kokkos::TeamPolicy<> team_policy;
+    typedef Kokkos::TeamPolicy<>::member_type member_type;
+
+    Kokkos::parallel_for("NUDFT type 1",
+           team_policy(flatN, Kokkos::AUTO),
+           KOKKOS_LAMBDA(const member_type& teamMember) {
+           const size_t flatIndex = teamMember.league_rank();
+          
+           const int k = (int)(flatIndex / (pt[0] * pt[1]));
+           const int flatIndex2D = flatIndex - (k * pt[0] * pt[1]);
+           const int i = flatIndex2D % pt[0];
+           const int j = (int)(flatIndex2D / pt[0]);
+           
+           Kokkos::complex<double> reducedValue = 0.0;
+           ippl::Vector<int, 3> iVec = {i, j, k};
+           ippl::Vector<double, 3>kVec;
+           for(size_t d = 0; d < 3; ++d) {
+               kVec[d] = (2.0 * pi / (maxU[d] - minU[d])) * (iVec[d] - (pt[d] / 2));
+           }
+           Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, nloc),
+           [=](const size_t idx, Kokkos::complex<double>& innerReduce)
+           {
+               double arg = 0.0;
+               for(size_t d = 0; d < 3; ++d) {
+                   arg += kVec[d]*Rview(idx)[d];
+               }
+               const double& val = Qview(idx);
+
+               innerReduce += (Kokkos::cos(arg) 
+                           - imag * Kokkos::sin(arg)) * val;
+           }, Kokkos::Sum<Kokkos::complex<double>>(reducedValue));
+
+           if(teamMember.team_rank() == 0) {
+               fview(i+nghost,j+nghost,k+nghost) = reducedValue;
+           }
+
+           });
+    
+    typename field_type::HostMirror rhoNUDFT_host = field_dft.getHostMirror();
+    Kokkos::deep_copy(rhoNUDFT_host, field_dft.getView());
+    std::stringstream pname;
+    pname << "data/FieldFFT_";
+    pname << Ippl::Comm->rank();
+    pname << ".csv";
+    Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank());
+    pcsvout.precision(10);
+    pcsvout.setf(std::ios::scientific, std::ios::floatfield);
+    pcsvout << "rho" << endl;
+    for (int i = 0; i< pt[0]; i++) {
+         for (int j = 0; j< pt[1]; j++) {
+             for (int k = 0; k< pt[2]; k++) {
+                 pcsvout << field_result(i+nghost,j+nghost, k+nghost) << endl;
+             }
+         }
+    }
+    std::stringstream pname2;
+    pname2 << "data/FieldDFT_";
+    pname2 << Ippl::Comm->rank();
+    pname2 << ".csv";
+    Inform pcsvout2(NULL, pname2.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank());
+    pcsvout2.precision(10);
+    pcsvout2.setf(std::ios::scientific, std::ios::floatfield);
+    pcsvout2 << "rho" << endl;
+    for (int i = 0; i< pt[0]; i++) {
+         for (int j = 0; j< pt[1]; j++) {
+             for (int k = 0; k< pt[2]; k++) {
+                 pcsvout2 << rhoNUDFT_host(i+nghost,j+nghost, k+nghost) << endl;
+             }
+         }
+       }
+       Ippl::Comm->barrier();
+    
+    
+    
+    Kokkos::parallel_reduce("NUDFT type1", nloc,
+                             KOKKOS_LAMBDA(const size_t idx, Kokkos::complex<double>& valL) {
+
+                                double arg = 0.0;
+                                for(size_t d = 0; d < dim; ++d) {
+                                    arg += (2 * pi / (hx[d] * pt[d])) * kVec[d] * Rview(idx)[d];
+                                }
+
+                                valL += (Kokkos::cos(arg) 
+                                - imag * Kokkos::sin(arg)) * Qview(idx);
+                            }, Kokkos::Sum<Kokkos::complex<double>>(reducedValue));
+    
+    double abs_error_real = std::fabs(reducedValue.real() - field_result(iInd, jInd, kInd).real());
+    double rel_error_real = std::fabs(reducedValue.real() - field_result(iInd, jInd, kInd).real()) /std::fabs(reducedValue.real());
+    double abs_error_imag = std::fabs(reducedValue.imag() - field_result(iInd, jInd, kInd).imag());
+    double rel_error_imag = std::fabs(reducedValue.imag() - field_result(iInd, jInd, kInd).imag()) /std::fabs(reducedValue.imag());
+ 
+    std::cout << "Abs Error in real part: " << std::setprecision(16) 
+              << abs_error_real << " Rel. error in real part: " << std::setprecision(16) << rel_error_real << std::endl;
+    std::cout << "Abs Error in imag part: " << std::setprecision(16) 
+              << abs_error_imag << " Rel. error in imag part: " << std::setprecision(16) << rel_error_imag << std::endl;
+    std::cout << "Field result: " << std::setprecision(16) 
+              << field_result(iInd,jInd,kInd).real() << " " << std::setprecision(16) << field_result(iInd,jInd,kInd).imag() 
+              << "index: " << iInd << "," << jInd << "," << kInd << std::endl;
+
+
+    //Kokkos::complex<double> max_error(0.0, 0.0);
+    //MPI_Reduce(&max_error_local, &max_error, 1, 
+    //           MPI_C_DOUBLE_COMPLEX, MPI_MAX, 0, Ippl::getComm());
+
+    return 0;
+}
diff --git a/test/FFT/TestNUFFT2.cpp b/test/FFT/TestNUFFT2.cpp
new file mode 100644
index 000000000..9d47a9607
--- /dev/null
+++ b/test/FFT/TestNUFFT2.cpp
@@ -0,0 +1,231 @@
+#include "Ippl.h"
+#include "Utility/ParameterList.h"
+
+#include <iostream>
+#include <typeinfo>
+#include <array>
+#include<Kokkos_Random.hpp>
+#include <random>
+
+template<class PLayout>
+struct Bunch : public ippl::ParticleBase<PLayout>
+{
+
+    Bunch(PLayout& playout)
+    : ippl::ParticleBase<PLayout>(playout)
+    {
+        this->addAttribute(Q);
+    }
+
+    ~Bunch(){ }
+    
+    typedef ippl::ParticleAttrib<double> charge_container_type;
+    charge_container_type Q;
+
+};
+
+template <typename T, class GeneratorPool, unsigned Dim>
+struct generate_random_particles {
+
+  using view_type = typename ippl::detail::ViewType<T, 1>::view_type;
+  using value_type  = typename T::value_type;
+  // Output View for the random numbers
+  view_type x;
+
+  // The GeneratorPool
+  GeneratorPool rand_pool;
+
+  T minU, maxU;
+
+  // Initialize all members
+  generate_random_particles(view_type x_, GeneratorPool rand_pool_, 
+                  T& minU_, T& maxU_)
+      : x(x_), rand_pool(rand_pool_), 
+        minU(minU_), maxU(maxU_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t i) const {
+    // Get a random number state from the pool for the active thread
+    typename GeneratorPool::generator_type rand_gen = rand_pool.get_state();
+
+    for (unsigned d = 0; d < Dim; ++d) {
+        x(i)[d] = rand_gen.drand(minU[d], maxU[d]);
+    }
+
+    // Give the state back, which will allow another thread to acquire it
+    rand_pool.free_state(rand_gen);
+  }
+};
+
+template <typename T, class GeneratorPool, unsigned Dim>
+struct generate_random_field {
+
+  using view_type = typename ippl::detail::ViewType<T, Dim>::view_type;
+  view_type f;
+
+  // The GeneratorPool
+  GeneratorPool rand_pool;
+
+  // Initialize all members
+  generate_random_field(view_type f_, GeneratorPool rand_pool_)
+      : f(f_), rand_pool(rand_pool_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t i, const size_t j, const size_t k) const {
+    // Get a random number state from the pool for the active thread
+    typename GeneratorPool::generator_type rand_gen = rand_pool.get_state();
+
+    f(i, j, k).real() = rand_gen.drand(0.0, 1.0);
+    f(i, j, k).imag() = rand_gen.drand(0.0, 1.0);
+
+    // Give the state back, which will allow another thread to acquire it
+    rand_pool.free_state(rand_gen);
+  }
+};
+
+int main(int argc, char *argv[]) {
+
+    Ippl ippl(argc,argv);
+
+    constexpr unsigned int dim = 3;
+    const double pi = std::acos(-1.0);
+
+    typedef ippl::ParticleSpatialLayout<double, 3> playout_type;
+    typedef Bunch<playout_type> bunch_type;
+
+    
+    ippl::Vector<int, dim> pt = {32, 32, 32};
+    ippl::Index I(pt[0]);
+    ippl::Index J(pt[1]);
+    ippl::Index K(pt[2]);
+    ippl::NDIndex<dim> owned(I, J, K);
+
+    ippl::e_dim_tag decomp[dim];    // Specifies SERIAL, PARALLEL dims
+    for (unsigned int d=0; d<dim; d++)
+        decomp[d] = ippl::SERIAL;
+
+    ippl::FieldLayout<dim> layout(owned, decomp);
+
+    typedef ippl::Vector<double, 3> Vector_t;
+    Vector_t minU = {-pi, -pi, -pi};
+    Vector_t maxU = {pi, pi, pi};
+    //Vector_t minU = {0.0, 0.0, 0.0};
+    //Vector_t maxU = {25.0, 25.0, 25.0};
+
+    std::array<double, dim> dx = {
+    (maxU[0] - minU[0]) / double(pt[0]),
+    (maxU[1] - minU[1]) / double(pt[1]),
+    (maxU[2] - minU[2]) / double(pt[2]),
+    };
+    //typedef ippl::Vector<Kokkos::complex<double>, 3> CxVector_t;
+
+    Vector_t hx = {dx[0], dx[1], dx[2]};
+    Vector_t origin = {minU[0], minU[1], minU[2]};
+    ippl::UniformCartesian<double, 3> mesh(owned, hx, origin);
+
+    playout_type pl(layout, mesh);
+
+    bunch_type bunch(pl);
+    bunch.setParticleBC(ippl::BC::PERIODIC);
+   
+    using size_type = ippl::detail::size_type;
+
+
+    size_type Np = std::pow(32,3) * 20;
+    
+    typedef ippl::Field<Kokkos::complex<double>, dim> field_type;
+
+    field_type field(mesh, layout);
+
+    ippl::ParameterList fftParams;
+
+    fftParams.add("gpu_method", 1);
+    fftParams.add("gpu_sort", 1);
+    fftParams.add("gpu_kerevalmeth", 1);
+    fftParams.add("tolerance", 1e-10);
+
+    fftParams.add("use_cufinufft_defaults", false);  
+    
+    typedef ippl::FFT<ippl::NUFFTransform, 3, double> FFT_type;
+
+    std::unique_ptr<FFT_type> fft;
+    
+    int type = 2;
+    
+
+    size_type nloc = Np/Ippl::Comm->size();
+
+
+    fft = std::make_unique<FFT_type>(layout, nloc, type, fftParams);
+
+
+
+    const int nghost = field.getNghost();
+    using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<3>>;
+    auto fview = field.getView();
+    bunch.create(nloc);
+    Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42));
+    Kokkos::parallel_for(nloc,
+                         generate_random_particles<Vector_t, Kokkos::Random_XorShift64_Pool<>, dim>(
+                         bunch.R.getView(), rand_pool64, minU, maxU));
+    
+    Kokkos::parallel_for(mdrange_type({nghost, nghost, nghost},
+                                      {fview.extent(0) - nghost,
+                                       fview.extent(1) - nghost,
+                                       fview.extent(2) - nghost}),
+                         generate_random_field<Kokkos::complex<double>, Kokkos::Random_XorShift64_Pool<>, dim>(
+                         field.getView(), rand_pool64));
+
+    fft->transform(bunch.R, bunch.Q, field);
+    
+    auto Q_result = Kokkos::create_mirror_view_and_copy(
+                        Kokkos::HostSpace(), bunch.Q.getView());
+
+
+    //Pick some target point to check. We choose it same as cuFINUFFT testcase cufinufft3d2_test.cu
+    
+    int idx = nloc/2;
+
+    Kokkos::complex<double> reducedValue(0.0, 0.0);
+
+    auto Rview = bunch.R.getView();
+
+    Kokkos::complex<double> imag = {0.0, 1.0};
+
+    Kokkos::parallel_reduce("NUDFT type2",
+                            mdrange_type({0, 0, 0},
+                                         {fview.extent(0) - 2 * nghost,
+                                          fview.extent(1) - 2 * nghost,
+                                          fview.extent(2) - 2 * nghost}),
+                             KOKKOS_LAMBDA(const int i,
+                                           const int j,
+                                           const int k,
+                                           Kokkos::complex<double>& valL) 
+                             {
+                                ippl::Vector<int, 3> iVec = {i, j, k};
+                                double arg = 0.0;
+                                for(size_t d = 0; d < dim; ++d) {
+                                    arg += (2 * pi / (hx[d] * pt[d])) * (iVec[d] - (pt[d]/2)) * Rview(idx)[d];
+                                }
+
+                                valL += (Kokkos::cos(arg) 
+                                + imag * Kokkos::sin(arg)) * fview(i + nghost, j + nghost, k + nghost);
+                            }, Kokkos::Sum<Kokkos::complex<double>>(reducedValue));
+    
+    double abs_error_real = std::fabs(reducedValue.real() - Q_result(idx));
+    double rel_error_real = std::fabs(reducedValue.real() - Q_result(idx)) /std::fabs(reducedValue.real());
+    //double abs_error_imag = std::fabs(reducedValue.imag() - Q_result(idx).imag());
+    //double rel_error_imag = std::fabs(reducedValue.imag() - Q_result(idx).imag()) /std::fabs(reducedValue.imag());
+ 
+    std::cout << "Abs Error in real part: " << std::setprecision(16) 
+              << abs_error_real << " Rel. error in real part: " << std::setprecision(16) << rel_error_real << std::endl;
+    //std::cout << "Abs Error in imag part: " << std::setprecision(16) 
+    //          << abs_error_imag << " Rel. error in imag part: " << std::setprecision(16) << rel_error_imag << std::endl;
+
+
+    //Kokkos::complex<double> max_error(0.0, 0.0);
+    //MPI_Reduce(&max_error_local, &max_error, 1, 
+    //           MPI_C_DOUBLE_COMPLEX, MPI_MAX, 0, Ippl::getComm());
+
+    return 0;
+}