diff --git a/alpine/AlpineManager.h b/alpine/AlpineManager.h
index 94a7855ce..69360d551 100644
--- a/alpine/AlpineManager.h
+++ b/alpine/AlpineManager.h
@@ -60,6 +60,9 @@ class AlpineManager
     ippl::NDIndex<Dim> domain_m;
     std::array<bool, Dim> decomp_m;
    double rhoNorm_m;
+    Kokkos::View<int*> index_m;
+    Kokkos::View<int*> start_m;
+    Kokkos::View<int*> permuteTemp_m;
 
 public:
     size_type getTotalP() const { return totalP_m; }
@@ -129,6 +132,7 @@ class AlpineManager
         Vector_t<double, Dim> hr = hr_m;
 
         scatter(*q, *rho, *R);
+        double relError = std::fabs((Q-(*rho).sum())/Q);
 
         m << relError << endl;
 
diff --git a/alpine/ExamplesWithoutPicManager/test.cpp b/alpine/ExamplesWithoutPicManager/test.cpp
new file mode 100644
index 000000000..94ccb1cee
--- /dev/null
+++ b/alpine/ExamplesWithoutPicManager/test.cpp
@@ -0,0 +1,240 @@
+// Uniform Plasma Test
+//   Usage:
+//     srun ./UniformPlasmaTest <nx> [<ny>...] <Np> <Nt> <stype> <lbfreq>
+//                              --overallocate <ovfactor> --info 10
+//     nx       = No. cell-centered points in the x-direction
+//     ny...    = No. cell-centered points in the y-, z-, ...direction
+//     Np       = Total no. of macro-particles in the simulation
+//     Nt       = Number of time steps
+//     stype    = Field solver type (FFT, CG, P3M, and OPEN supported)
+//     lbfreq   = Load balancing frequency i.e., Number of time steps after which particle
+//                load balancing should happen
+//     ovfactor = Over-allocation factor for the buffers used in the communication. Typical
+//                values are 1.0, 2.0. Value 1.0 means no over-allocation.
+//     Example:
+//     srun ./UniformPlasmaTest 128 128 128 10000 10 FFT 10 --overallocate 1.0 --info 10
+//
+#include <Ippl.h>
+
+#include <Kokkos_Random.hpp>
+#include <chrono>
+#include <iostream>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "Utility/IpplTimings.h"
+
+#include "ChargedParticles.hpp"
+
+constexpr unsigned Dim = 3;
+
+const char* TestName = "UniformPlasmaTest";
+
+int main(int argc, char* argv[]) {
+    ippl::initialize(argc, argv);
+    {
+        setSignalHandler();
+
+        Inform msg("UniformPlasmaTest");
+        Inform msg2all(argv[0], INFORM_ALL_NODES);
+
+        auto start = std::chrono::high_resolution_clock::now();
+        int arg    = 1;
+
+        Vector_t<int, Dim> nr;
+        for (unsigned d = 0; d < Dim; d++)
+            nr[d] = std::atoi(argv[arg++]);
+
+        static IpplTimings::TimerRef mainTimer           = IpplTimings::getTimer("total");
+        static IpplTimings::TimerRef particleCreation    = IpplTimings::getTimer("particlesCreation");
+        static IpplTimings::TimerRef dumpDataTimer       = IpplTimings::getTimer("dumpData");
+        static IpplTimings::TimerRef PTimer              = IpplTimings::getTimer("pushVelocity");
+        static IpplTimings::TimerRef temp                = IpplTimings::getTimer("randomMove");
+        static IpplTimings::TimerRef RTimer              = IpplTimings::getTimer("pushPosition");
+        static IpplTimings::TimerRef updateTimer         = IpplTimings::getTimer("update");
+        static IpplTimings::TimerRef DummySolveTimer     = IpplTimings::getTimer("solveWarmup");
+        static IpplTimings::TimerRef SolveTimer          = IpplTimings::getTimer("solve");
+        static IpplTimings::TimerRef domainDecomposition = IpplTimings::getTimer("loadBalance");
+
+        IpplTimings::startTimer(mainTimer);
+
+        const size_type totalP = std::atoll(argv[arg++]);
+        const unsigned int nt  = std::atoi(argv[arg++]);
+
+        msg << "Uniform Plasma Test" << endl
+            << "nt " << nt << " Np= " << totalP << " grid = " << nr << endl;
+
+        using bunch_type = ChargedParticles<PLayout_t<double, Dim>, double, Dim>;
+
+        std::unique_ptr<bunch_type> P;
+
+        ippl::NDIndex<Dim> domain;
+        for (unsigned i = 0; i < Dim; i++) {
+            domain[i] = ippl::Index(nr[i]);
+        }
+
+        // create mesh and layout objects for this problem domain
+        Vector_t<double, Dim> rmin(0.0);
+        Vector_t<double, Dim> rmax(20.0);
+
+        Vector_t<double, Dim> hr     = rmax / nr;
+        Vector_t<double, Dim> origin = rmin;
+        const double dt              = 1.0;
+
+        std::array<bool, Dim> isParallel;
+        isParallel.fill(true);
+
+        const bool isAllPeriodic = true;
+        Mesh_t<Dim> mesh(domain, hr, origin);
+        FieldLayout_t<Dim> FL(MPI_COMM_WORLD, domain, isParallel, isAllPeriodic);
+        PLayout_t<double, Dim> PL(FL, mesh);
+
+        double Q = -1562.5;
+        std::string solver = argv[arg++];
+        P = std::make_unique<bunch_type>(PL, hr, rmin, rmax, isParallel, Q, solver);
+
+        P->nr_m = nr;
+        size_type nloc = totalP / ippl::Comm->size();
+
+        int rest = (int)(totalP - nloc * ippl::Comm->size());
+
+        if (ippl::Comm->rank() < rest)
+            ++nloc;
+
+        IpplTimings::startTimer(particleCreation);
+        P->create(nloc);
+
+        const ippl::NDIndex<Dim>& lDom = FL.getLocalNDIndex();
+        Vector_t<double, Dim> Rmin, Rmax;
+        for (unsigned d = 0; d < Dim; ++d) {
+            Rmin[d] = origin[d] + lDom[d].first() * hr[d];
+            Rmax[d] = origin[d] + (lDom[d].last() + 1) * hr[d];
+        }
+
+        Kokkos::View<ippl::Vector<double, Dim>*> particle_pos;
+        auto Rview = P->R.getView();  // view of the particle positions, filled deterministically below
+
+        Kokkos::parallel_for(nloc,
+            KOKKOS_LAMBDA(size_t i) {
+                Rview(i) = i * hr[0];
+            });
+        Kokkos::fence();
+        P->q = P->Q_m / totalP;
+        P->P = 0.0;
+        IpplTimings::stopTimer(particleCreation);
+
+        P->initializeFields(mesh, FL);
+
+        IpplTimings::startTimer(updateTimer);
+        P->update();
+        IpplTimings::stopTimer(updateTimer);
+
+        msg << "particles created and initial conditions assigned " << endl;
+
+        P->initSolver();
+        P->time_m            = 0.0;
+        P->loadbalancefreq_m = std::atoi(argv[arg++]);
+
+        IpplTimings::startTimer(DummySolveTimer);
+        P->rho_m = 0.0;
+        P->runSolver();
+        IpplTimings::stopTimer(DummySolveTimer);
+
+        P->scatterCIC(totalP, 0, hr);
+        P->initializeORB(FL, mesh);
+        bool fromAnalyticDensity = false;
+
+        IpplTimings::startTimer(SolveTimer);
+        P->runSolver();
+        IpplTimings::stopTimer(SolveTimer);
+
+        P->gatherCIC();
+
+        IpplTimings::startTimer(dumpDataTimer);
+        P->dumpData();
+        P->gatherStatistics(totalP);
+        IpplTimings::stopTimer(dumpDataTimer);
+
+        // begin main timestep loop
+        msg << "Starting iterations ..." << endl;
+        // P->gatherStatistics(totalP);
+
+        // random pool used by the per-step random kicks below
+        Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100 * ippl::Comm->rank()));
+
+        for (unsigned int it = 0; it < nt; it++) {
+            // LeapFrog time stepping https://en.wikipedia.org/wiki/Leapfrog_integration
+            // Here, we assume a constant charge-to-mass ratio of -1 for
+            // all the particles hence eliminating the need to store mass as
+            // an attribute
+            // kick
+            IpplTimings::startTimer(PTimer);
+            P->P = P->P - 0.5 * dt * P->E;
+            IpplTimings::stopTimer(PTimer);
+
+            IpplTimings::startTimer(temp);
+            Kokkos::parallel_for(
+                P->getLocalNum(),
+                generate_random<Vector_t<double, Dim>, Kokkos::Random_XorShift64_Pool<>, Dim>(
+                    P->P.getView(), rand_pool64, -hr, hr));
+            Kokkos::fence();
+            IpplTimings::stopTimer(temp);
+
+            // drift
+            IpplTimings::startTimer(RTimer);
+            P->R = P->R + dt * P->P;
+            IpplTimings::stopTimer(RTimer);
+
+            // Since the particles have moved spatially update them to correct processors
+            IpplTimings::startTimer(updateTimer);
+            P->update();
+            IpplTimings::stopTimer(updateTimer);
+
+            // Domain Decomposition
+            if (P->balance(totalP, it + 1)) {
+                msg << "Starting repartition" << endl;
+                IpplTimings::startTimer(domainDecomposition);
+                P->repartition(FL, mesh, fromAnalyticDensity);
+                IpplTimings::stopTimer(domainDecomposition);
+            }
+
+            // scatter the charge onto the underlying grid
+            P->scatterCIC(totalP, it + 1, hr);
+
+            // Field solve
+            IpplTimings::startTimer(SolveTimer);
+            P->runSolver();
+            IpplTimings::stopTimer(SolveTimer);
+
+            // gather E field
+            P->gatherCIC();
+
+            // kick
+            IpplTimings::startTimer(PTimer);
+            P->P = P->P - 0.5 * dt * P->E;
+            IpplTimings::stopTimer(PTimer);
+
+            P->time_m += dt;
+            IpplTimings::startTimer(dumpDataTimer);
+            P->dumpData();
+            P->gatherStatistics(totalP);
+            IpplTimings::stopTimer(dumpDataTimer);
+            msg << "Finished time step: " << it + 1 << " time: " << P->time_m << endl;
+
+            if (checkSignalHandler()) {
+                msg << "Aborting timestepping loop due to signal " << interruptSignalReceived
+                    << endl;
+                break;
+            }
+        }
+
+        msg << "Uniform Plasma Test: End." << endl;
+        IpplTimings::stopTimer(mainTimer);
+        IpplTimings::print();
+        IpplTimings::print(std::string("timing.dat"));
+        auto end = std::chrono::high_resolution_clock::now();
+
+        std::chrono::duration<double> time_chrono =
+            std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+        std::cout << "Elapsed time: " << time_chrono.count() << std::endl;
+    }
+    ippl::finalize();
+
+    return 0;
+}
diff --git a/alpine/LandauDampingManager.h b/alpine/LandauDampingManager.h
index 39d38f630..50624367f 100644
--- a/alpine/LandauDampingManager.h
+++ b/alpine/LandauDampingManager.h
@@ -1,6 +1,8 @@
 #ifndef IPPL_LANDAU_DAMPING_MANAGER_H
 #define IPPL_LANDAU_DAMPING_MANAGER_H
 
+#include <nvToolsExt.h>
+
 #include <memory>
 
 #include "FieldContainer.hpp"
@@ -222,28 +224,36 @@ class LandauDampingManager : public AlpineManager<T, Dim> {
         // Here, we assume a constant charge-to-mass ratio of -1 for
         // all the particles hence eliminating the need to store mass as
         // an attribute
-        static IpplTimings::TimerRef PTimer      = IpplTimings::getTimer("pushVelocity");
-        static IpplTimings::TimerRef RTimer      = IpplTimings::getTimer("pushPosition");
-        static IpplTimings::TimerRef updateTimer = IpplTimings::getTimer("update");
+        static IpplTimings::TimerRef PTimer              = IpplTimings::getTimer("pushVelocity");
+        static IpplTimings::TimerRef RTimer              = IpplTimings::getTimer("pushPosition");
+        static IpplTimings::TimerRef updateTimer         = IpplTimings::getTimer("update");
         static IpplTimings::TimerRef domainDecomposition = IpplTimings::getTimer("loadBalance");
-        static IpplTimings::TimerRef SolveTimer  = IpplTimings::getTimer("solve");
+        static IpplTimings::TimerRef SolveTimer          = IpplTimings::getTimer("solve");
+        static IpplTimings::TimerRef SortTimer           = IpplTimings::getTimer("sort");
+        static IpplTimings::TimerRef PermuteTimer        = IpplTimings::getTimer("permute");
 
         double dt                               = this->dt_m;
         std::shared_ptr<ParticleContainer_t> pc = this->pcontainer_m;
         std::shared_ptr<FieldContainer_t> fc    = this->fcontainer_m;
 
         IpplTimings::startTimer(PTimer);
+        nvtxRangePush("pushVelocity1");
         pc->P = pc->P - 0.5 * dt * pc->E;
+        nvtxRangePop();
         IpplTimings::stopTimer(PTimer);
 
         // drift
         IpplTimings::startTimer(RTimer);
+        nvtxRangePush("pushPosition");
         pc->R = pc->R + dt * pc->P;
+        nvtxRangePop();
         IpplTimings::stopTimer(RTimer);
 
         // Since the particles have moved spatially update them to correct processors
         IpplTimings::startTimer(updateTimer);
+        nvtxRangePush("update");
         pc->update();
+        nvtxRangePop();
         IpplTimings::stopTimer(updateTimer);
 
         size_type totalP = this->totalP_m;
@@ -257,23 +267,263 @@ class LandauDampingManager : public AlpineManager<T, Dim> {
             IpplTimings::stopTimer(domainDecomposition);
         }
 
+        Kokkos::fence();
+
+        // sort the particle positions for over-decomposition
+        IpplTimings::startTimer(SortTimer);
+        nvtxRangePush("sort");
+
+        auto *FL = &this->fcontainer_m->getFL();
+        const ippl::NDIndex<Dim>& ldom = FL->getLocalNDIndex();
+
+        size_t ncells = 1;
+        for (unsigned int i = 0; i < Dim; ++i) {
+            ncells *= ldom[i].length();
+        }
+
+        if (this->index_m.extent(0) < pc->getLocalNum()) {
+            this->index_m = Kokkos::View<int*>("resize_index", pc->getLocalNum());
+        }
+        if (this->start_m.extent(0) < ncells + 1) {
+            this->start_m = Kokkos::View<int*>("resize_start", ncells + 1);
+        }
+
+        this->sort(this->index_m, this->start_m, ncells);
+
+        nvtxRangePop();
+        IpplTimings::stopTimer(SortTimer);
+
+        IpplTimings::startTimer(PermuteTimer);
+        nvtxRangePush("permute");
+        auto Rview = pc->R.getView();
+        this->permute<Kokkos::View<ippl::Vector<T, Dim>*>>(Rview, this->index_m);
+        nvtxRangePop();
+        IpplTimings::stopTimer(PermuteTimer);
+
         // scatter the charge onto the underlying grid
-        this->par2grid();
+        nvtxRangePush("scatter");
+        //this->par2grid();
+        scatter_new(this->start_m);
+        nvtxRangePop();
 
         // Field solve
         IpplTimings::startTimer(SolveTimer);
+        nvtxRangePush("solve");
         this->fsolver_m->runSolver();
+        nvtxRangePop();
         IpplTimings::stopTimer(SolveTimer);
 
         // gather E field
+        nvtxRangePush("gather");
         this->grid2par();
+        nvtxRangePop();
 
         // kick
         IpplTimings::startTimer(PTimer);
+        nvtxRangePush("pushVelocity2");
         pc->P = pc->P - 0.5 * dt * pc->E;
+        nvtxRangePop();
         IpplTimings::stopTimer(PTimer);
     }
 
+    void sort(Kokkos::View<int*> index, Kokkos::View<int*> start, size_t ncells) {
+        // given a particle container with positions, return the index in the grid
+
+        std::shared_ptr<ParticleContainer_t> pc = this->pcontainer_m;
+        int npart  = pc->getLocalNum();
+        auto Rview = pc->R.getView();
+
+        Vector_t<double, Dim> hr     = this->hr_m;
+        Vector_t<double, Dim> origin = this->origin_m;
+
+        auto *FL = &this->fcontainer_m->getFL();
+        const ippl::NDIndex<Dim>& ldom = FL->getLocalNDIndex();
+
+        Kokkos::deep_copy(start, 0);
+
+        // count the particles per cell: start(c + 1) accumulates the population of cell c
+        Kokkos::parallel_for("Get cell index", npart,
+            KOKKOS_LAMBDA(size_t idx) {
+                Vector_t<T, Dim> pos = Rview(idx);
+                size_t serialized = ((pos[0] - origin[0]) * (1.0 / hr[0]));
+
+                for (unsigned int i = 1; i < Dim; ++i) {
+                    // compute index for each dim
+                    size_t index_i = ((pos[i] - origin[i]) * (1.0 / hr[i]));
+                    index_i = index_i - ldom[i].first();
+
+                    // serialize to get global cell index
+                    size_t length = 1;
+                    for (unsigned int j = 0; j < i; ++j) {
+                        length *= ldom[j].length();
+                    }
+                    serialized += index_i * length;
+                }
+
+                Kokkos::atomic_add(&start(serialized+1), 1);
+            });
+
+        // exclusive prefix sum over the counts: start(c + 1) becomes the first slot of cell c
+        Kokkos::parallel_scan(ncells,
+            KOKKOS_LAMBDA(int i, int &sum, bool is_final) {
+                int tmp;
+                if (is_final) {
+                    tmp = sum;
+                }
+                sum += start(i+1);
+                if (is_final) {
+                    start(i+1) = tmp;
+                }
+            });
+
+        // assign each particle a unique slot inside its cell's range
+        Kokkos::parallel_for(npart,
+            KOKKOS_LAMBDA(int idx) {
+                Vector_t<T, Dim> pos = Rview(idx);
+                size_t serialized = ((pos[0] - origin[0]) * (1.0 / hr[0]));
+
+                for (unsigned int i = 1; i < Dim; ++i) {
+                    // compute index for each dim
+                    size_t index_i = ((pos[i] - origin[i]) * (1.0 / hr[i]));
+                    index_i = index_i - ldom[i].first();
+
+                    // serialize to get global cell index
+                    size_t length = 1;
+                    for (unsigned int j = 0; j < i; ++j) {
+                        length *= ldom[j].length();
+                    }
+                    serialized += index_i * length;
+                }
+
+                int loc = Kokkos::atomic_fetch_add(&start(serialized+1), 1);
+                index(idx) = loc;
+            });
+    }
+
+    template <typename Attrib>
+    void permute(Attrib orig_attrib, Kokkos::View<int*> index) {
+        std::shared_ptr<ParticleContainer_t> pc = this->pcontainer_m;
+        int npart = pc->getLocalNum();
+
+        using type = typename Attrib::value_type;
+
+        // size of temp should be (overalloc factor * npart) to match Rview
+        int overalloc = ippl::Comm->getDefaultOverallocation();
+        size_t val = sizeof(type)/sizeof(int);
+        if (this->permuteTemp_m.extent(0) < npart * val) {
+            this->permuteTemp_m = Kokkos::View<int*>("permute", npart * val);
+        }
+        Kokkos::View<type*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> temp((type*)this->permuteTemp_m.data(), npart);
+
+        // scatter each value into its sorted slot ...
+        Kokkos::parallel_for("Permute", npart,
+            KOKKOS_LAMBDA(int i) {
+                temp(index(i)) = orig_attrib(i);
+            });
+
+        // ... then copy the reordered values back in place
+        Kokkos::parallel_for("Permute", npart,
+            KOKKOS_LAMBDA(int i) {
+                orig_attrib(i) = temp(i);
+            });
+
+        Kokkos::fence();
+    }
+
+    void scatter_new(Kokkos::View<int*> start) {
+        Inform m("scatter_new ");
+
+        std::shared_ptr<ParticleContainer_t> pc = this->pcontainer_m;
+        ippl::detail::size_type npart = pc->getLocalNum();
+
+        ippl::ParticleAttrib<double> *q               = &pc->q;
+        ippl::ParticleAttrib<ippl::Vector<T, Dim>> *R = &pc->R;
+        Field_t<Dim> *rho = &this->fcontainer_m->getRho();
+
+        (*rho) = 0.0;
+
+        double Q = this->Q_m;
+        Vector_t<double, Dim> rmin   = this->rmin_m;
+        Vector_t<double, Dim> rmax   = this->rmax_m;
+        Vector_t<double, Dim> hr     = this->hr_m;
+        Vector_t<double, Dim> origin = this->origin_m;
+        Vector_t<double, Dim> invdx  = 1.0 / hr;
+        auto mesh = (*rho).get_mesh();
+
+        auto qview      = (*q).getView();
+        auto Rview      = (*R).getView();
+        auto view_field = (*rho).getView();
+
+        auto layout = (*rho).getLayout();
+        auto lDom   = layout.getLocalNDIndex();
+        int nghost  = (*rho).getNghost();
+
+        Kokkos::parallel_for("new_scatter", start.extent(0) - 1,
+            KOKKOS_CLASS_LAMBDA(const int i) {
+
+                int idx = start(i);
+
+                Vector_t<int, Dim> index = (Rview(idx) - origin) * invdx + 0.5;
+                Vector_t<int, Dim> args  = index - lDom.first() + nghost;
+
+                Vector_t<Vector_t<int, Dim>, (1 << Dim)> interpol_idx;
+
+                // interpolation indices
+                for (int ScatterPoint = 0; ScatterPoint < (1 << Dim); ++ScatterPoint) {
+                    for (unsigned int k = 0; k < Dim; ++k) {
+                        interpol_idx[ScatterPoint][k] = (ScatterPoint & (1 << k)) ? args[k] - 1 : args[k];
+                    }
+                }
+
+                // array of local values scattered in my cell
+                double local_vals[1 << Dim] = { 0.0 };
+
+                // loop over the particles in my cell (using start array)
+                // to compute local interpolation values
+                for (int j = start(i); j < start(i+1); ++j) {
+
+                    // find nearest grid point
+                    Vector_t<double, Dim> l   = (Rview(j) - origin) * invdx + 0.5;
+                    Vector_t<double, Dim> whi = l - index;
+                    Vector_t<double, Dim> wlo = 1.0 - whi;
+
+                    // scatter
+                    const double& val = qview(j);
+
+                    for (int ScatterPoint = 0; ScatterPoint < (1 << Dim); ++ScatterPoint) {
+                        double weights = 1.0;
+                        for (unsigned int k = 0; k < Dim; ++k) {
+                            weights *= (ScatterPoint & (1 << k)) ? wlo[k] : whi[k];
+                        }
+                        local_vals[ScatterPoint] += val * weights;
+                    }
+                }
+
+                // add each local cell interpolated value atomically to field vals
+                for (int ScatterPoint = 0; ScatterPoint < (1 << Dim); ++ScatterPoint) {
+                    Kokkos::atomic_add(&view_field(interpol_idx[ScatterPoint][0],
+                                           interpol_idx[ScatterPoint][1], interpol_idx[ScatterPoint][2]),
+                                       local_vals[ScatterPoint]);
+                }
+            });
+        Kokkos::fence();
+
+        (*rho).accumulateHalo();
+
+        double relError = std::fabs((Q-(*rho).sum())/Q);
+
+        m << relError << endl;
+
+        ippl::detail::size_type TotalParticles = 0;
+
+        ippl::Comm->reduce(npart, TotalParticles, 1, std::plus<ippl::detail::size_type>());
+
+        if (ippl::Comm->rank() == 0) {
+            if (TotalParticles != this->getTotalP() || relError > 1e-10) {
+                m << "Time step: " << this->it_m << endl;
+                m << "Total particles in the sim. " << this->getTotalP() << " "
+                  << "after update: " << TotalParticles << endl;
+                m << "Rel. error in charge conservation: " << relError << endl;
+                ippl::Comm->abort();
+            }
+        }
+    }
+
     void dump() override {
         static IpplTimings::TimerRef dumpDataTimer = IpplTimings::getTimer("dumpData");
         IpplTimings::startTimer(dumpDataTimer);
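The new sort()/permute()/scatter_new() path in LandauDampingManager.h rests on a counting sort over grid cells: a per-cell histogram, an exclusive prefix sum that turns the counts into cell offsets, and an atomic slot assignment, so that start(c) .. start(c+1) brackets the particles of cell c and scatter_new() can accumulate a whole cell locally before issuing one atomic add per interpolation point. Below is a minimal standalone sketch of that start/index bookkeeping in plain Kokkos, without the IPPL particle container; the names (cell, index, start) and the demo sizes are illustrative only and not part of the patch.

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
    Kokkos::initialize(argc, argv);
    {
        const int npart  = 16;  // illustrative sizes
        const int ncells = 4;

        Kokkos::View<int*> cell("cell", npart);    // cell id of each particle
        Kokkos::View<int*> index("index", npart);  // destination slot of each particle
        Kokkos::View<int*> start("start", ncells + 1);  // zero-initialized by Kokkos

        // stand-in for the position-to-cell computation done in sort()
        Kokkos::parallel_for("fill", npart, KOKKOS_LAMBDA(const int i) { cell(i) = i % ncells; });

        // 1) histogram: start(c + 1) counts the particles in cell c
        Kokkos::parallel_for("count", npart, KOKKOS_LAMBDA(const int i) {
            Kokkos::atomic_add(&start(cell(i) + 1), 1);
        });

        // 2) exclusive prefix sum, shifted by one: start(c + 1) becomes the first slot of cell c
        Kokkos::parallel_scan("offsets", ncells,
            KOKKOS_LAMBDA(const int c, int& sum, const bool final) {
                const int count = start(c + 1);
                if (final) {
                    start(c + 1) = sum;
                }
                sum += count;
            });

        // 3) slot assignment: each particle gets a unique position inside its cell's range;
        //    afterwards start(c + 1) has advanced to the end of cell c, i.e. the begin of cell c + 1
        Kokkos::parallel_for("slots", npart, KOKKOS_LAMBDA(const int i) {
            index(i) = Kokkos::atomic_fetch_add(&start(cell(i) + 1), 1);
        });
        Kokkos::fence();

        auto h_start = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), start);
        for (int c = 0; c < ncells; ++c) {
            std::printf("cell %d: slots [%d, %d)\n", c, h_start(c), h_start(c + 1));
        }
    }
    Kokkos::finalize();
    return 0;
}

After step 3, gathering attribute i into slot index(i) (what permute() does through the reused permuteTemp_m buffer) leaves the particles grouped by cell, which is exactly the layout scatter_new() assumes when it walks start(i) .. start(i+1).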