diff --git a/alpine/AlpineManager.h b/alpine/AlpineManager.h
index 94a7855ce..69360d551 100644
--- a/alpine/AlpineManager.h
+++ b/alpine/AlpineManager.h
@@ -60,6 +60,9 @@ class AlpineManager
     ippl::NDIndex<Dim> domain_m;
     std::array<bool, Dim> decomp_m;
    double rhoNorm_m;
+    Kokkos::View<int*> index_m;
+    Kokkos::View<int*> start_m;
+    Kokkos::View<int*> permuteTemp_m;
 
 public:
     size_type getTotalP() const { return totalP_m; }
@@ -129,6 +132,7 @@ class AlpineManager
         Vector_t<double, Dim> hr = hr_m;
 
         scatter(*q, *rho, *R);
+        double relError = std::fabs((Q-(*rho).sum())/Q);
 
         m << relError << endl;
 
diff --git a/alpine/ExamplesWithoutPicManager/test.cpp b/alpine/ExamplesWithoutPicManager/test.cpp
new file mode 100644
index 000000000..94ccb1cee
--- /dev/null
+++ b/alpine/ExamplesWithoutPicManager/test.cpp
@@ -0,0 +1,240 @@
+// Uniform Plasma Test
+//   Usage:
+//     srun ./UniformPlasmaTest <nx> [<ny>...] <Np> <Nt> <stype> <lbfreq>
+//                              --overallocate <ovfactor> --info 10
+//     nx       = No. cell-centered points in the x-direction
+//     ny...    = No. cell-centered points in the y-, z-, ...direction
+//     Np       = Total no. of macro-particles in the simulation
+//     Nt       = Number of time steps
+//     stype    = Field solver type (FFT, CG, P3M, and OPEN supported)
+//     lbfreq   = Load balancing frequency i.e., Number of time steps after which particle
+//                load balancing should happen
+//     ovfactor = Over-allocation factor for the buffers used in the communication. Typical
+//                values are 1.0, 2.0. Value 1.0 means no over-allocation.
+//     Example:
+//     srun ./UniformPlasmaTest 128 128 128 10000 10 FFT 10 --overallocate 1.0 --info 10
+//
+#include <Ippl.h>
+
+#include <Kokkos_Random.hpp>
+#include <chrono>
+#include <iostream>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "Utility/IpplTimings.h"
+
+#include "ChargedParticles.hpp"
+
+constexpr unsigned Dim = 3;
+
+const char* TestName = "UniformPlasmaTest";
+
+int main(int argc, char* argv[]) {
+    ippl::initialize(argc, argv);
+    {
+        setSignalHandler();
+
+        Inform msg("UniformPlasmaTest");
+        Inform msg2all(argv[0], INFORM_ALL_NODES);
+
+        auto start = std::chrono::high_resolution_clock::now();
+        int arg    = 1;
+
+        Vector_t<int, Dim> nr;
+        for (unsigned d = 0; d < Dim; d++)
+            nr[d] = std::atoi(argv[arg++]);
+
+        static IpplTimings::TimerRef mainTimer           = IpplTimings::getTimer("total");
+        static IpplTimings::TimerRef particleCreation    = IpplTimings::getTimer("particlesCreation");
+        static IpplTimings::TimerRef dumpDataTimer       = IpplTimings::getTimer("dumpData");
+        static IpplTimings::TimerRef PTimer              = IpplTimings::getTimer("pushVelocity");
+        static IpplTimings::TimerRef temp                = IpplTimings::getTimer("randomMove");
+        static IpplTimings::TimerRef RTimer              = IpplTimings::getTimer("pushPosition");
+        static IpplTimings::TimerRef updateTimer         = IpplTimings::getTimer("update");
+        static IpplTimings::TimerRef DummySolveTimer     = IpplTimings::getTimer("solveWarmup");
+        static IpplTimings::TimerRef SolveTimer          = IpplTimings::getTimer("solve");
+        static IpplTimings::TimerRef domainDecomposition = IpplTimings::getTimer("loadBalance");
+
+        IpplTimings::startTimer(mainTimer);
+
+        const size_type totalP = std::atoll(argv[arg++]);
+        const unsigned int nt  = std::atoi(argv[arg++]);
+
+        msg << "Uniform Plasma Test" << endl
+            << "nt " << nt << " Np= " << totalP << " grid = " << nr << endl;
+
+        using bunch_type = ChargedParticles<PLayout_t<double, Dim>, double, Dim>;
+
+        std::unique_ptr<bunch_type> P;
+
+        ippl::NDIndex<Dim> domain;
+        for (unsigned i = 0; i < Dim; i++) {
+            domain[i] = ippl::Index(nr[i]);
+        }
+
+        // create mesh and layout objects for this problem domain
+        Vector_t<double, Dim> rmin(0.0);
+        Vector_t<double, Dim> rmax(20.0);
+
+        Vector_t<double, Dim> hr     = rmax / nr;
+        Vector_t<double, Dim> origin = rmin;
+        const double dt              = 1.0;
+
+        std::array<bool, Dim> isParallel;
+        isParallel.fill(true);
+
+        const bool isAllPeriodic = true;
+        Mesh_t<Dim> mesh(domain, hr, origin);
+        FieldLayout_t<Dim> FL(MPI_COMM_WORLD, domain, isParallel, isAllPeriodic);
+        PLayout_t<double, Dim> PL(FL, mesh);
+
+        double Q = -1562.5;
+        std::string solver = argv[arg++];
+        P = std::make_unique<bunch_type>(PL, hr, rmin, rmax, isParallel, Q, solver);
+
+        P->nr_m = nr;
+        size_type nloc = totalP / ippl::Comm->size();
+
+        int rest = (int)(totalP - nloc * ippl::Comm->size());
+
+        if (ippl::Comm->rank() < rest)
+            ++nloc;
+
+        IpplTimings::startTimer(particleCreation);
+        P->create(nloc);
+
+        const ippl::NDIndex<Dim>& lDom = FL.getLocalNDIndex();
+        Vector_t<double, Dim> Rmin, Rmax;
+        for (unsigned d = 0; d < Dim; ++d) {
+            Rmin[d] = origin[d] + lDom[d].first() * hr[d];
+            Rmax[d] = origin[d] + (lDom[d].last() + 1) * hr[d];
+        }
+
+        Kokkos::View<ippl::Vector<double, Dim>*> particle_pos;
+        auto Rview = P->R.getView();  // view of the particle positions, filled deterministically below
+
+        Kokkos::parallel_for(nloc,
+            KOKKOS_LAMBDA(size_t i) {
+                Rview(i) = i * hr[0];
+            });
+        Kokkos::fence();
+        P->q = P->Q_m / totalP;
+        P->P = 0.0;
+        IpplTimings::stopTimer(particleCreation);
+
+        P->initializeFields(mesh, FL);
+
+        IpplTimings::startTimer(updateTimer);
+        P->update();
+        IpplTimings::stopTimer(updateTimer);
+
+        msg << "particles created and initial conditions assigned " << endl;
+
+        P->initSolver();
+        P->time_m            = 0.0;
+        P->loadbalancefreq_m = std::atoi(argv[arg++]);
+
+        IpplTimings::startTimer(DummySolveTimer);
+        P->rho_m = 0.0;
+        P->runSolver();
+        IpplTimings::stopTimer(DummySolveTimer);
+
+        P->scatterCIC(totalP, 0, hr);
+        P->initializeORB(FL, mesh);
+        bool fromAnalyticDensity = false;
+
+        IpplTimings::startTimer(SolveTimer);
+        P->runSolver();
+        IpplTimings::stopTimer(SolveTimer);
+
+        P->gatherCIC();
+
+        IpplTimings::startTimer(dumpDataTimer);
+        P->dumpData();
+        P->gatherStatistics(totalP);
+        IpplTimings::stopTimer(dumpDataTimer);
+
+        // begin main timestep loop
+        msg << "Starting iterations ..." << endl;
+        // P->gatherStatistics(totalP);
+
+        // random pool used by the per-step random kicks below
+        Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100 * ippl::Comm->rank()));
+
+        for (unsigned int it = 0; it < nt; it++) {
+            // LeapFrog time stepping https://en.wikipedia.org/wiki/Leapfrog_integration
+            // Here, we assume a constant charge-to-mass ratio of -1 for
+            // all the particles hence eliminating the need to store mass as
+            // an attribute
+            // kick
+            IpplTimings::startTimer(PTimer);
+            P->P = P->P - 0.5 * dt * P->E;
+            IpplTimings::stopTimer(PTimer);
+
+            IpplTimings::startTimer(temp);
+            Kokkos::parallel_for(
+                P->getLocalNum(),
+                generate_random<Vector_t<double, Dim>, Kokkos::Random_XorShift64_Pool<>, Dim>(
+                    P->P.getView(), rand_pool64, -hr, hr));
+            Kokkos::fence();
+            IpplTimings::stopTimer(temp);
+
+            // drift
+            IpplTimings::startTimer(RTimer);
+            P->R = P->R + dt * P->P;
+            IpplTimings::stopTimer(RTimer);
+
+            // Since the particles have moved spatially update them to correct processors
+            IpplTimings::startTimer(updateTimer);
+            P->update();
+            IpplTimings::stopTimer(updateTimer);
+
+            // Domain Decomposition
+            if (P->balance(totalP, it + 1)) {
+                msg << "Starting repartition" << endl;
+                IpplTimings::startTimer(domainDecomposition);
+                P->repartition(FL, mesh, fromAnalyticDensity);
+                IpplTimings::stopTimer(domainDecomposition);
+            }
+
+            // scatter the charge onto the underlying grid
+            P->scatterCIC(totalP, it + 1, hr);
+
+            // Field solve
+            IpplTimings::startTimer(SolveTimer);
+            P->runSolver();
+            IpplTimings::stopTimer(SolveTimer);
+
+            // gather E field
+            P->gatherCIC();
+
+            // kick
+            IpplTimings::startTimer(PTimer);
+            P->P = P->P - 0.5 * dt * P->E;
+            IpplTimings::stopTimer(PTimer);
+
+            P->time_m += dt;
+            IpplTimings::startTimer(dumpDataTimer);
+            P->dumpData();
+            P->gatherStatistics(totalP);
+            IpplTimings::stopTimer(dumpDataTimer);
+            msg << "Finished time step: " << it + 1 << " time: " << P->time_m << endl;
+
+            if (checkSignalHandler()) {
+                msg << "Aborting timestepping loop due to signal " << interruptSignalReceived
+                    << endl;
+                break;
+            }
+        }
+
+        msg << "Uniform Plasma Test: End." << endl;
+        IpplTimings::stopTimer(mainTimer);
+        IpplTimings::print();
+        IpplTimings::print(std::string("timing.dat"));
+        auto end = std::chrono::high_resolution_clock::now();
+
+        std::chrono::duration<double> time_chrono =
+            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        std::cout << "Elapsed time: " << time_chrono.count() << std::endl;
+    }
+    ippl::finalize();
+
+    return 0;
+}
diff --git a/alpine/LandauDampingManager.h b/alpine/LandauDampingManager.h
index 39d38f630..50624367f 100644
--- a/alpine/LandauDampingManager.h
+++ b/alpine/LandauDampingManager.h
@@ -1,6 +1,8 @@
 #ifndef IPPL_LANDAU_DAMPING_MANAGER_H
 #define IPPL_LANDAU_DAMPING_MANAGER_H
 
+#include <nvToolsExt.h>
+
 #include <memory>
 
 #include "FieldContainer.hpp"
@@ -222,28 +224,36 @@ class LandauDampingManager : public AlpineManager<T, Dim> {
         // Here, we assume a constant charge-to-mass ratio of -1 for
         // all the particles hence eliminating the need to store mass as
         // an attribute
-        static IpplTimings::TimerRef PTimer      = IpplTimings::getTimer("pushVelocity");
-        static IpplTimings::TimerRef RTimer      = IpplTimings::getTimer("pushPosition");
-        static IpplTimings::TimerRef updateTimer = IpplTimings::getTimer("update");
+        static IpplTimings::TimerRef PTimer              = IpplTimings::getTimer("pushVelocity");
+        static IpplTimings::TimerRef RTimer              = IpplTimings::getTimer("pushPosition");
+        static IpplTimings::TimerRef updateTimer         = IpplTimings::getTimer("update");
         static IpplTimings::TimerRef domainDecomposition = IpplTimings::getTimer("loadBalance");
-        static IpplTimings::TimerRef SolveTimer  = IpplTimings::getTimer("solve");
+        static IpplTimings::TimerRef SolveTimer          = IpplTimings::getTimer("solve");
+        static IpplTimings::TimerRef SortTimer           = IpplTimings::getTimer("sort");
+        static IpplTimings::TimerRef PermuteTimer        = IpplTimings::getTimer("permute");
 
         double dt                               = this->dt_m;
         std::shared_ptr<ParticleContainer_t> pc = this->pcontainer_m;
         std::shared_ptr<FieldContainer_t> fc    = this->fcontainer_m;
 
         IpplTimings::startTimer(PTimer);
+        nvtxRangePush("pushVelocity1");
         pc->P = pc->P - 0.5 * dt * pc->E;
+        nvtxRangePop();
         IpplTimings::stopTimer(PTimer);
 
         // drift
         IpplTimings::startTimer(RTimer);
+        nvtxRangePush("pushPosition");
         pc->R = pc->R + dt * pc->P;
+        nvtxRangePop();
         IpplTimings::stopTimer(RTimer);
 
         // Since the particles have moved spatially update them to correct processors
         IpplTimings::startTimer(updateTimer);
+        nvtxRangePush("update");
         pc->update();
+        nvtxRangePop();
         IpplTimings::stopTimer(updateTimer);
 
         size_type totalP = this->totalP_m;
@@ -257,23 +267,263 @@ class LandauDampingManager : public AlpineManager<T, Dim> {
             IpplTimings::stopTimer(domainDecomposition);
         }
 
+        Kokkos::fence();
+
+        // sort the particle positions for over-decomposition
+        IpplTimings::startTimer(SortTimer);
+        nvtxRangePush("sort");
+
+        auto *FL = &this->fcontainer_m->getFL();
+        const ippl::NDIndex<Dim>& ldom = FL->getLocalNDIndex();
+
+        size_t ncells = 1;
+        for (unsigned int i = 0; i < Dim; ++i) {
+            ncells *= ldom[i].length();
+        }
+
+        if (this->index_m.extent(0) < pc->getLocalNum()) {
+            this->index_m = Kokkos::View<int*>("resize_index", pc->getLocalNum());
+        }
+        if (this->start_m.extent(0) < ncells + 1) {
+            this->start_m = Kokkos::View<int*>("resize_start", ncells + 1);
+        }
+
+        this->sort(this->index_m, this->start_m, ncells);
+
+        nvtxRangePop();
+        IpplTimings::stopTimer(SortTimer);
+
+        IpplTimings::startTimer(PermuteTimer);
+        nvtxRangePush("permute");
+        auto Rview = pc->R.getView();
+        this->permute<Kokkos::View<ippl::Vector<T, Dim>*>>(Rview, this->index_m);
+        nvtxRangePop();
+        IpplTimings::stopTimer(PermuteTimer);
+
         // scatter the charge onto the underlying grid
-        this->par2grid();
+        nvtxRangePush("scatter");
+        //this->par2grid();
+        scatter_new(this->start_m);
+        nvtxRangePop();
 
         // Field solve
         IpplTimings::startTimer(SolveTimer);
+        nvtxRangePush("solve");
         this->fsolver_m->runSolver();
+        nvtxRangePop();
         IpplTimings::stopTimer(SolveTimer);
 
         // gather E field
+        nvtxRangePush("gather");
         this->grid2par();
+        nvtxRangePop();
 
         // kick
         IpplTimings::startTimer(PTimer);
+        nvtxRangePush("pushVelocity2");
         pc->P = pc->P - 0.5 * dt * pc->E;
+        nvtxRangePop();
         IpplTimings::stopTimer(PTimer);
     }
 
+    void sort(Kokkos::View<int*> index, Kokkos::View<int*> start, size_t ncells) {
+        // given a particle container with positions, return the index in the grid
+
+        std::shared_ptr<ParticleContainer_t> pc = this->pcontainer_m;
+        int npart  = pc->getLocalNum();
+        auto Rview = pc->R.getView();
+
+        Vector_t<double, Dim> hr     = this->hr_m;
+        Vector_t<double, Dim> origin = this->origin_m;
+
+        auto *FL = &this->fcontainer_m->getFL();
+        const ippl::NDIndex<Dim>& ldom = FL->getLocalNDIndex();
+
+        Kokkos::deep_copy(start, 0);
+
+        // count the particles per cell: start(c + 1) accumulates the population of cell c
+        Kokkos::parallel_for("Get cell index", npart,
+            KOKKOS_LAMBDA(size_t idx) {
+                Vector_t<T, Dim> pos = Rview(idx);
+                size_t serialized = ((pos[0] - origin[0]) * (1.0 / hr[0]));
+
+                for (unsigned int i = 1; i < Dim; ++i) {
+                    // compute index for each dim
+                    size_t index_i = ((pos[i] - origin[i]) * (1.0 / hr[i]));
+                    index_i = index_i - ldom[i].first();
+
+                    // serialize to get global cell index
+                    size_t length = 1;
+                    for (unsigned int j = 0; j < i; ++j) {
+                        length *= ldom[j].length();
+                    }
+                    serialized += index_i * length;
+                }
+
+                Kokkos::atomic_add(&start(serialized+1), 1);
+            });
+
+        // exclusive prefix sum over the counts: start(c + 1) becomes the first slot of cell c
+        Kokkos::parallel_scan(ncells,
+            KOKKOS_LAMBDA(int i, int &sum, bool is_final) {
+                int tmp;
+                if (is_final) {
+                    tmp = sum;
+                }
+                sum += start(i+1);
+                if (is_final) {
+                    start(i+1) = tmp;
+                }
+            });
+
+        // assign each particle a unique slot inside its cell's range
+        Kokkos::parallel_for(npart,
+            KOKKOS_LAMBDA(int idx) {
+                Vector_t<T, Dim> pos = Rview(idx);
+                size_t serialized = ((pos[0] - origin[0]) * (1.0 / hr[0]));
+
+                for (unsigned int i = 1; i < Dim; ++i) {
+                    // compute index for each dim
+                    size_t index_i = ((pos[i] - origin[i]) * (1.0 / hr[i]));
+                    index_i = index_i - ldom[i].first();
+
+                    // serialize to get global cell index
+                    size_t length = 1;
+                    for (unsigned int j = 0; j < i; ++j) {
+                        length *= ldom[j].length();
+                    }
+                    serialized += index_i * length;
+                }
+
+                int loc = Kokkos::atomic_fetch_add(&start(serialized+1), 1);
+                index(idx) = loc;
+            });
+    }
+
+    template <typename Attrib>
+    void permute(Attrib orig_attrib, Kokkos::View<int*> index) {
+        std::shared_ptr<ParticleContainer_t> pc = this->pcontainer_m;
+        int npart = pc->getLocalNum();
+
+        using type = typename Attrib::value_type;
+
+        // size of temp should be (overalloc factor * npart) to match Rview
+        int overalloc = ippl::Comm->getDefaultOverallocation();
+        size_t val = sizeof(type)/sizeof(int);
+        if (this->permuteTemp_m.extent(0) < npart * val) {
+            this->permuteTemp_m = Kokkos::View<int*>("permute", npart * val);
+        }
+        Kokkos::View<type*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> temp((type*)this->permuteTemp_m.data(), npart);
+
+        // scatter each value into its sorted slot ...
+        Kokkos::parallel_for("Permute", npart,
+            KOKKOS_LAMBDA(int i) {
+                temp(index(i)) = orig_attrib(i);
+            });
+
+        // ... then copy the reordered values back in place
+        Kokkos::parallel_for("Permute", npart,
+            KOKKOS_LAMBDA(int i) {
+                orig_attrib(i) = temp(i);
+            });
+
+        Kokkos::fence();
+    }
+
+    void scatter_new(Kokkos::View<int*> start) {
+        Inform m("scatter_new ");
+
+        std::shared_ptr<ParticleContainer_t> pc = this->pcontainer_m;
+        ippl::detail::size_type npart = pc->getLocalNum();
+
+        ippl::ParticleAttrib<double> *q               = &pc->q;
+        ippl::ParticleAttrib<ippl::Vector<T, Dim>> *R = &pc->R;
+        Field_t<Dim> *rho = &this->fcontainer_m->getRho();
+
+        (*rho) = 0.0;
+
+        double Q = this->Q_m;
+        Vector_t<double, Dim> rmin   = this->rmin_m;
+        Vector_t<double, Dim> rmax   = this->rmax_m;
+        Vector_t<double, Dim> hr     = this->hr_m;
+        Vector_t<double, Dim> origin = this->origin_m;
+        Vector_t<double, Dim> invdx  = 1.0 / hr;
+        auto mesh = (*rho).get_mesh();
+
+        auto qview      = (*q).getView();
+        auto Rview      = (*R).getView();
+        auto view_field = (*rho).getView();
+
+        auto layout = (*rho).getLayout();
+        auto lDom   = layout.getLocalNDIndex();
+        int nghost  = (*rho).getNghost();
+
+        Kokkos::parallel_for("new_scatter", start.extent(0) - 1,
+            KOKKOS_CLASS_LAMBDA(const int i) {
+
+                int idx = start(i);
+
+                Vector_t<int, Dim> index = (Rview(idx) - origin) * invdx + 0.5;
+                Vector_t<int, Dim> args  = index - lDom.first() + nghost;
+
+                Vector_t<Vector_t<int, Dim>, (1 << Dim)> interpol_idx;
+
+                // interpolation indices
+                for (int ScatterPoint = 0; ScatterPoint < (1 << Dim); ++ScatterPoint) {
+                    for (unsigned int k = 0; k < Dim; ++k) {
+                        interpol_idx[ScatterPoint][k] = (ScatterPoint & (1 << k)) ? args[k] - 1 : args[k];
+                    }
+                }
+
+                // array of local values scattered in my cell
+                double local_vals[1 << Dim] = { 0.0 };
+
+                // loop over the particles in my cell (using start array)
+                // to compute local interpolation values
+                for (int j = start(i); j < start(i+1); ++j) {
+
+                    // find nearest grid point
+                    Vector_t<double, Dim> l   = (Rview(j) - origin) * invdx + 0.5;
+                    Vector_t<double, Dim> whi = l - index;
+                    Vector_t<double, Dim> wlo = 1.0 - whi;
+
+                    // scatter
+                    const double& val = qview(j);
+
+                    for (int ScatterPoint = 0; ScatterPoint < (1 << Dim); ++ScatterPoint) {
+                        double weights = 1.0;
+                        for (unsigned int k = 0; k < Dim; ++k) {
+                            weights *= (ScatterPoint & (1 << k)) ? wlo[k] : whi[k];
+                        }
+                        local_vals[ScatterPoint] += val * weights;
+                    }
+                }
+
+                // add each local cell interpolated value atomically to field vals
+                for (int ScatterPoint = 0; ScatterPoint < (1 << Dim); ++ScatterPoint) {
+                    Kokkos::atomic_add(&view_field(interpol_idx[ScatterPoint][0],
+                                           interpol_idx[ScatterPoint][1], interpol_idx[ScatterPoint][2]),
+                                       local_vals[ScatterPoint]);
+                }
+            });
+        Kokkos::fence();
+
+        (*rho).accumulateHalo();
+
+        double relError = std::fabs((Q-(*rho).sum())/Q);
+
+        m << relError << endl;
+
+        ippl::detail::size_type TotalParticles = 0;
+
+        ippl::Comm->reduce(npart, TotalParticles, 1, std::plus<ippl::detail::size_type>());
+
+        if (ippl::Comm->rank() == 0) {
+            if (TotalParticles != this->getTotalP() || relError > 1e-10) {
+                m << "Time step: " << this->it_m << endl;
+                m << "Total particles in the sim. " << this->getTotalP() << " "
+                  << "after update: " << TotalParticles << endl;
+                m << "Rel. error in charge conservation: " << relError << endl;
+                ippl::Comm->abort();
+            }
+        }
+    }
+
     void dump() override {
         static IpplTimings::TimerRef dumpDataTimer = IpplTimings::getTimer("dumpData");
         IpplTimings::startTimer(dumpDataTimer);
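The new sort()/permute()/scatter_new() path in LandauDampingManager.h rests on a counting sort over grid cells: a per-cell histogram, an exclusive prefix sum that turns the counts into cell offsets, and an atomic slot assignment, so that start(c) .. start(c+1) brackets the particles of cell c and scatter_new() can accumulate a whole cell locally before issuing one atomic add per interpolation point. Below is a minimal standalone sketch of that start/index bookkeeping in plain Kokkos, without the IPPL particle container; the names (cell, index, start) and the demo sizes are illustrative only and not part of the patch.

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
    Kokkos::initialize(argc, argv);
    {
        const int npart  = 16;  // illustrative sizes
        const int ncells = 4;

        Kokkos::View<int*> cell("cell", npart);    // cell id of each particle
        Kokkos::View<int*> index("index", npart);  // destination slot of each particle
        Kokkos::View<int*> start("start", ncells + 1);  // zero-initialized by Kokkos

        // stand-in for the position-to-cell computation done in sort()
        Kokkos::parallel_for("fill", npart, KOKKOS_LAMBDA(const int i) { cell(i) = i % ncells; });

        // 1) histogram: start(c + 1) counts the particles in cell c
        Kokkos::parallel_for("count", npart, KOKKOS_LAMBDA(const int i) {
            Kokkos::atomic_add(&start(cell(i) + 1), 1);
        });

        // 2) exclusive prefix sum, shifted by one: start(c + 1) becomes the first slot of cell c
        Kokkos::parallel_scan("offsets", ncells,
            KOKKOS_LAMBDA(const int c, int& sum, const bool final) {
                const int count = start(c + 1);
                if (final) {
                    start(c + 1) = sum;
                }
                sum += count;
            });

        // 3) slot assignment: each particle gets a unique position inside its cell's range;
        //    afterwards start(c + 1) has advanced to the end of cell c, i.e. the begin of cell c + 1
        Kokkos::parallel_for("slots", npart, KOKKOS_LAMBDA(const int i) {
            index(i) = Kokkos::atomic_fetch_add(&start(cell(i) + 1), 1);
        });
        Kokkos::fence();

        auto h_start = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), start);
        for (int c = 0; c < ncells; ++c) {
            std::printf("cell %d: slots [%d, %d)\n", c, h_start(c), h_start(c + 1));
        }
    }
    Kokkos::finalize();
    return 0;
}

After step 3, gathering attribute i into slot index(i) (what permute() does through the reused permuteTemp_m buffer) leaves the particles grouped by cell, which is exactly the layout scatter_new() assumes when it walks start(i) .. start(i+1).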