@@ -345,6 +345,9 @@ void mParticle::InteractWithEuler(MultiFab &EulerVel,
345345 // for 1 -> Ns是
346346 int loop = ParticleProperties::loop_ns;
347347
348+ // Sync kernel data to device memory for GPU ParallelFor
349+ syncKernelsToDevice ();
350+
348351 BL_ASSERT (loop > 0 );
349352 while (loop > 0 ){
350353 if (verbose) Print () << " [Particle] Ns loop index : " << loop << " \n " ;
@@ -477,8 +480,7 @@ void mParticle::InitParticles(const Vector<Real>& x,
477480 if ( Ml > max_largrangian_num ) max_largrangian_num = Ml;
478481
479482 Real phiK = 0 ;
480- auto phiKdata = new Vector<Real>();
481- auto thetaKdata = new Vector<Real>();
483+ Gpu::HostVector<Real> h_phiK, h_thetaK;
482484 for (int marker_index = 0 ; marker_index < Ml; marker_index++){
483485 const Real Hk = -1.0 + 2.0 * (marker_index) / ( Ml - 1.0 );
484486 Real thetaK = std::acos (Hk);
@@ -487,13 +489,14 @@ void mParticle::InitParticles(const Vector<Real>& x,
487489 }else {
488490 phiK = std::fmod ( phiK + 3.809 / std::sqrt (Ml) / std::sqrt ( 1 - Math::powi<2 >(Hk)) , 2 * Math::pi<Real>());
489491 }
490- phiKdata-> push_back (phiK);
491- thetaKdata-> push_back (thetaK);
492+ h_phiK. push_back (phiK);
493+ h_thetaK. push_back (thetaK);
492494 }
493- phiKdata->shrink_to_fit ();
494- thetaKdata->shrink_to_fit ();
495- mKernel .phiK = phiKdata->data ();
496- mKernel .thetaK = thetaKdata->data ();
495+ mKernel .phiK .resize (Ml);
496+ mKernel .thetaK .resize (Ml);
497+ Gpu::copyAsync (Gpu::hostToDevice, h_phiK.begin (), h_phiK.end (), mKernel .phiK .begin ());
498+ Gpu::copyAsync (Gpu::hostToDevice, h_thetaK.begin (), h_thetaK.end (), mKernel .thetaK .begin ());
499+ Gpu::streamSynchronize ();
497500
498501 if (verbose) Print () << " h: " << h << " , Ml: " << Ml << " , D: " << Math::powi<3 >(h) << " gravity : " << gravity << " \n "
499502 << " Kernel : " << index << " : Location (" << x[index] << " , " << y[index] << " , " << z[index]
@@ -510,6 +513,26 @@ void mParticle::InitParticles(const Vector<Real>& x,
510513 m_Collision.SetGeometry (RealVect (ParticleProperties::GLO), RealVect (ParticleProperties::GHI),particle_kernels[0 ].radius , h);
511514}
512515
516+ void mParticle::syncKernelsToDevice ()
517+ {
518+ const int nk = static_cast <int >(particle_kernels.size ());
519+ Gpu::HostVector<kernel_gpu> h_kg (nk);
520+ for (int i = 0 ; i < nk; ++i) {
521+ auto const & pk = particle_kernels[i];
522+ h_kg[i].location = pk.location ;
523+ h_kg[i].velocity = pk.velocity ;
524+ h_kg[i].omega = pk.omega ;
525+ h_kg[i].radius = pk.radius ;
526+ h_kg[i].dv = pk.dv ;
527+ h_kg[i].phiK = pk.phiK .data ();
528+ h_kg[i].thetaK = pk.thetaK .data ();
529+ h_kg[i].start_id = pk.start_id ;
530+ }
531+ d_kernels.resize (nk);
532+ Gpu::copyAsync (Gpu::hostToDevice, h_kg.begin (), h_kg.end (), d_kernels.begin ());
533+ Gpu::streamSynchronize ();
534+ }
535+
513536void mParticle::UpdateLagrangianMarker () {
514537 if (verbose) Print () << " \t mParticle::UpdateLagrangianMarker\n " ;
515538 // update lagrangian marker attributions
@@ -530,11 +553,12 @@ void mParticle::UpdateLagrangianMarker() {
530553 auto *const myP_ptr = attri[P_ATTR_REAL::My_Marker].data ();
531554 auto *const mzP_ptr = attri[P_ATTR_REAL::Mz_Marker].data ();
532555
533- const auto *const ps = particle_kernels.data ();
556+ const auto *const ps = d_kernels.data ();
557+ const bool do_RKPM_l = do_RKPM;
534558
535559 ParallelFor (np,
536560 [=] AMREX_GPU_DEVICE (const int i) noexcept {
537- if (!do_RKPM ) {
561+ if (!do_RKPM_l ) {
538562 const auto id = ids[i];
539563 const auto m_id = particles[i].id ();
540564 const auto location = ps[id].location ;
@@ -571,7 +595,7 @@ void mParticle::UpdateLagrangianMarker() {
571595
572596template <typename P = Particle<num_Real, num_Int>>
573597AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
574- void VelocityInterpolation_cir (int p_iter, P const & p, Real & Up, Real & Vp, Real & Wp,
598+ void VelocityInterpolation_cir (int p_iter, P const & p, ParticleReal & Up, ParticleReal & Vp, ParticleReal & Wp,
575599 Array4<Real const > const & E, int EulerVIndex,
576600 const int *lo, const int *hi,
577601 GpuArray<Real, AMREX_SPACEDIM> const & plo,
@@ -615,10 +639,10 @@ template<typename P>
615639AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
616640void VelocityInterpolationRKPM_cir (
617641 P p,
618- Real & U,
619- Real & V,
620- Real & W,
621- Vector< MAP_INFO> RKPM_MAP ,
642+ ParticleReal & U,
643+ ParticleReal & V,
644+ ParticleReal & W,
645+ MAP_INFO const * rkpm_data ,
622646 Array4<Real const > const & E,
623647 GpuArray<Real, AMREX_SPACEDIM> const & plo,
624648 GpuArray<Real, AMREX_SPACEDIM> const & dx,
@@ -640,10 +664,10 @@ void VelocityInterpolationRKPM_cir(
640664 for (int ii = -1 ; ii < 2 ; ii++){
641665 for (int jj = -1 ; jj < 2 ; jj++){
642666 for (int kk = -1 ; kk < 2 ; kk ++){
643- auto RKPM = RKPM_MAP. at ( cell_index++) ;
644- U += RKPM .weight * RKPM .Vcell * E (i + ii, j + jj, k + kk, EulerVIndex );
645- V += RKPM .weight * RKPM .Vcell * E (i + ii, j + jj, k + kk, EulerVIndex + 1 );
646- W += RKPM .weight * RKPM .Vcell * E (i + ii, j + jj, k + kk, EulerVIndex + 2 );
667+ auto rkpm = rkpm_data[ cell_index++] ;
668+ U += rkpm .weight * rkpm .Vcell * E (i + ii, j + jj, k + kk, EulerVIndex );
669+ V += rkpm .weight * rkpm .Vcell * E (i + ii, j + jj, k + kk, EulerVIndex + 1 );
670+ W += rkpm .weight * rkpm .Vcell * E (i + ii, j + jj, k + kk, EulerVIndex + 2 );
647671 }
648672 }
649673 }
@@ -680,10 +704,13 @@ void mParticle::VelocityInterpolation(MultiFab &EulerVel,
680704 const auto & E = EulerVel.array (pti);
681705
682706 if (do_RKPM) {
707+ const MAP_INFO* rkpm_ptr = d_rkpm_flat.data ();
708+ constexpr int STENCIL = mParticle ::RKPM_STENCIL_SIZE;
683709 ParallelFor (np,
684710 [=] AMREX_GPU_DEVICE (const int i) {
685711 const auto id = p_ptr[i].id () - 1 ;
686- VelocityInterpolationRKPM_cir (p_ptr[i], Up[i], Vp[i], Wp[i], RKPM_MAP[id], E, plo, dx, EulerVelocityIndex);
712+ VelocityInterpolationRKPM_cir (p_ptr[i], Up[i], Vp[i], Wp[i],
713+ rkpm_ptr + id * STENCIL, E, plo, dx, EulerVelocityIndex);
687714 });
688715 }else {
689716 ParallelFor (np,
@@ -714,12 +741,12 @@ void mParticle::ComputeLagrangianForce(Real dt)
714741 auto * FzP = attri[P_ATTR_REAL::Fz_Marker].data ();
715742
716743 const auto p_ids = pti.GetIDs ().data ();
717- const auto ps = particle_kernels .data ();
744+ const auto ps = d_kernels .data ();
718745
719746 ParallelFor (np,
720747 [=] AMREX_GPU_DEVICE (const int i) noexcept {
721748 const auto p_id = p_ids[i];
722- auto p = ps[p_id];
749+ const auto & p = ps[p_id];
723750 const Real Ub = p.velocity [0 ];
724751 const Real Vb = p.velocity [1 ];
725752 const Real Wb = p.velocity [2 ];
@@ -783,9 +810,9 @@ void ForceSpreading_cic (P const& p,
783810 deltaFunction ( p.pos (1 ), yj, dx[1 ], tV, type);
784811 deltaFunction ( p.pos (2 ), kz, dx[2 ], tW, type);
785812 Real delta_value = tU * tV * tW;
786- Gpu ::Atomic::AddNoRet (&E (i + ii, j + jj, k + kk, EulerForceIndex ), delta_value * fxP);
787- Gpu ::Atomic::AddNoRet (&E (i + ii, j + jj, k + kk, EulerForceIndex+1 ), delta_value * fyP);
788- Gpu ::Atomic::AddNoRet (&E (i + ii, j + jj, k + kk, EulerForceIndex+2 ), delta_value * fzP);
813+ HostDevice ::Atomic::Add (&E (i + ii, j + jj, k + kk, EulerForceIndex ), Real ( delta_value * fxP) );
814+ HostDevice ::Atomic::Add (&E (i + ii, j + jj, k + kk, EulerForceIndex+1 ), Real ( delta_value * fyP) );
815+ HostDevice ::Atomic::Add (&E (i + ii, j + jj, k + kk, EulerForceIndex+2 ), Real ( delta_value * fzP) );
789816 }
790817 }
791818 }
@@ -798,13 +825,13 @@ void ForceSpreadingRKPM_cir(
798825 Real Px,
799826 Real Py,
800827 Real Pz,
801- Real & fxP,
802- Real & fyP,
803- Real & fzP,
804- Real & mxP,
805- Real & myP,
806- Real & mzP,
807- Vector< MAP_INFO> RKPM_MAP ,
828+ ParticleReal & fxP,
829+ ParticleReal & fyP,
830+ ParticleReal & fzP,
831+ ParticleReal & mxP,
832+ ParticleReal & myP,
833+ ParticleReal & mzP,
834+ MAP_INFO const * rkpm_data ,
808835 Real dv,
809836 Array4<Real> const &E,
810837 GpuArray<Real,AMREX_SPACEDIM> const & plo,
@@ -822,7 +849,8 @@ void ForceSpreadingRKPM_cir(
822849 fxP *= dv;
823850 fyP *= dv;
824851 fzP *= dv;
825- RealVect moment = RealVect (p.pos (0 ) - Px, p.pos (1 ) - Py, p.pos (2 ) - Pz).crossProduct (RealVect (fxP, fyP, fzP));
852+ RealVect moment = RealVect (Real (p.pos (0 ) - Px), Real (p.pos (1 ) - Py), Real (p.pos (2 ) - Pz)).crossProduct (
853+ RealVect (Real (fxP), Real (fyP), Real (fzP)));
826854 mxP = moment[0 ];
827855 myP = moment[1 ];
828856 mzP = moment[2 ];
@@ -831,10 +859,10 @@ void ForceSpreadingRKPM_cir(
831859 for (int ii = -1 ; ii < 2 ; ii++){
832860 for (int jj = -1 ; jj < 2 ; jj++){
833861 for (int kk = -1 ; kk < 2 ; kk ++){
834- auto RKPM = RKPM_MAP. at ( cell_index++) ;
835- Gpu ::Atomic::AddNoRet (&E (i + ii, j + jj, k + kk, EulerForceIndex ), RKPM .weight * fxP);
836- Gpu ::Atomic::AddNoRet (&E (i + ii, j + jj, k + kk, EulerForceIndex + 1 ), RKPM .weight * fyP);
837- Gpu ::Atomic::AddNoRet (&E (i + ii, j + jj, k + kk, EulerForceIndex + 2 ), RKPM .weight * fzP);
862+ auto rkpm = rkpm_data[ cell_index++] ;
863+ HostDevice ::Atomic::Add (&E (i + ii, j + jj, k + kk, EulerForceIndex ), Real (rkpm .weight * fxP) );
864+ HostDevice ::Atomic::Add (&E (i + ii, j + jj, k + kk, EulerForceIndex + 1 ), Real (rkpm .weight * fyP) );
865+ HostDevice ::Atomic::Add (&E (i + ii, j + jj, k + kk, EulerForceIndex + 2 ), Real (rkpm .weight * fzP) );
838866 }
839867 }
840868 }
@@ -864,19 +892,21 @@ void mParticle::ForceSpreading(MultiFab & EulerForce,
864892 const auto *const p_ptr = particles ().data ();
865893
866894 auto force_index = ParticleProperties::euler_force_index;
867- const auto ps = particle_kernels .data ();
895+ const auto ps = d_kernels .data ();
868896
869897 if (do_RKPM) {
898+ const MAP_INFO* rkpm_ptr = d_rkpm_flat.data ();
899+ constexpr int STENCIL = mParticle ::RKPM_STENCIL_SIZE;
870900 ParallelFor (np,
871901 [=] AMREX_GPU_DEVICE (const int i) noexcept {
872902 const auto p_id = p_ptr[i].id () - 1 ; // lagrangian marker's id
873903 const auto id = ids[i]; // particle's id
874904 auto loc_ptr = ps[id].location ;
875- auto dv = RKPM_MAP [p_id][ 0 ].eps ;
905+ auto dv = rkpm_ptr [p_id * STENCIL ].eps ;
876906 ForceSpreadingRKPM_cir (p_ptr[i], loc_ptr[0 ], loc_ptr[1 ], loc_ptr[2 ],
877907 fxP_ptr[i], fyP_ptr[i], fzP_ptr[i],
878908 mxP_ptr[i], myP_ptr[i], mzP_ptr[i],
879- RKPM_MAP[ p_id] , dv, Uarray, plo, dxi, force_index);
909+ rkpm_ptr + p_id * STENCIL , dv, Uarray, plo, dxi, force_index);
880910 });
881911 }else {
882912 ParallelFor (np,
@@ -967,8 +997,8 @@ void mParticle::ForceSpreading(MultiFab & EulerForce,
967997 ParallelAllReduce::Sum (my, ParallelDescriptor::Communicator ());
968998 ParallelAllReduce::Sum (mz, ParallelDescriptor::Communicator ());
969999
970- cur_p.ib_force += {fx, fy, fz };
971- cur_p.ib_moment += {mx, my, mz };
1000+ cur_p.ib_force += {Real (fx), Real (fy), Real (fz) };
1001+ cur_p.ib_moment += {Real (mx), Real (my), Real (mz) };
9721002 }
9731003
9741004 EulerForce.SumBoundary (ParticleProperties::euler_force_index, 3 , gm.periodicity ());
@@ -1278,6 +1308,23 @@ void mParticle::ResolveWithRPKM(std::string RKPM_file) {
12781308 }
12791309 }
12801310 Print () << " \t RKPM mapping size : " << RKPM_MAP.size () << " \n " ;
1311+
1312+ // Flatten RKPM_MAP into a contiguous device-accessible array for GPU ParallelFor
1313+ int max_key = 0 ;
1314+ for (const auto & [key, vec] : RKPM_MAP) {
1315+ max_key = std::max (max_key, key);
1316+ AMREX_ALWAYS_ASSERT (int (vec.size ()) <= RKPM_STENCIL_SIZE);
1317+ }
1318+ Gpu::HostVector<MAP_INFO> h_rkpm_flat ((max_key + 1 ) * RKPM_STENCIL_SIZE);
1319+ for (const auto & [key, vec] : RKPM_MAP) {
1320+ for (int j = 0 ; j < int (vec.size ()); ++j) {
1321+ h_rkpm_flat[key * RKPM_STENCIL_SIZE + j] = vec[j];
1322+ }
1323+ }
1324+ d_rkpm_flat.resize (h_rkpm_flat.size ());
1325+ Gpu::copyAsync (Gpu::hostToDevice, h_rkpm_flat.begin (), h_rkpm_flat.end (),
1326+ d_rkpm_flat.begin ());
1327+ Gpu::streamSynchronize ();
12811328}
12821329
12831330int mParticle::StartOfLagrangianMarker (size_t index) {
0 commit comments