Skip to content

Commit bbd155e

Browse files
author
yzeng
committed
fix some compilation issues using floating numbers on gpus
1 parent 52ffb65 commit bbd155e

File tree

7 files changed

+144
-70
lines changed

7 files changed

+144
-70
lines changed

Source/DiffusedIB.cpp

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -519,9 +519,9 @@ void mParticle::InitialWithLargrangianPoints(const kernel& current_kernel){
519519

520520
template <typename P = Particle<numAttri>>
521521
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
522-
void VelocityInterpolation_cir(int p_iter, P const& p, Real& Up, Real& Vp, Real& Wp,
522+
void VelocityInterpolation_cir(int p_iter, P const& p, ParticleReal& Up, ParticleReal& Vp, ParticleReal& Wp,
523523
Array4<Real const> const& E, int EulerVIndex,
524-
const int *lo, const int *hi,
524+
const int *lo, const int *hi,
525525
GpuArray<Real, AMREX_SPACEDIM> const& plo,
526526
GpuArray<Real, AMREX_SPACEDIM> const& dx,
527527
DELTA_FUNCTION_TYPE type)
@@ -654,9 +654,9 @@ void ForceSpreading_cic (P const& p,
654654
deltaFunction( p.pos(1), yj, dx[1], tV, type);
655655
deltaFunction( p.pos(2), kz, dx[2], tW, type);
656656
Real delta_value = tU * tV * tW;
657-
Gpu::Atomic::AddNoRet(&E(i + ii, j + jj, k + kk, EulerForceIndex ), delta_value * fxP);
658-
Gpu::Atomic::AddNoRet(&E(i + ii, j + jj, k + kk, EulerForceIndex+1), delta_value * fyP);
659-
Gpu::Atomic::AddNoRet(&E(i + ii, j + jj, k + kk, EulerForceIndex+2), delta_value * fzP);
657+
HostDevice::Atomic::Add(&E(i + ii, j + jj, k + kk, EulerForceIndex ), Real(delta_value * fxP));
658+
HostDevice::Atomic::Add(&E(i + ii, j + jj, k + kk, EulerForceIndex+1), Real(delta_value * fyP));
659+
HostDevice::Atomic::Add(&E(i + ii, j + jj, k + kk, EulerForceIndex+2), Real(delta_value * fzP));
660660
}
661661
}
662662
}
@@ -714,8 +714,8 @@ void mParticle::ForceSpreading(MultiFab & EulerForce,
714714
amrex::ParallelAllReduce::Sum(my, ParallelDescriptor::Communicator());
715715
amrex::ParallelAllReduce::Sum(mz, ParallelDescriptor::Communicator());
716716

717-
kernel.ib_force = {fx, fy, fz};
718-
kernel.ib_moment = {mx, my, mz};
717+
kernel.ib_force = {Real(fx), Real(fy), Real(fz)};
718+
kernel.ib_moment = {Real(mx), Real(my), Real(mz)};
719719

720720
EulerForce.SumBoundary(ParticleProperties::euler_force_index, 3, gm.periodicity());
721721

@@ -980,6 +980,7 @@ void mParticle::ComputeLagrangianForce(Real dt,
980980
Real Px = kernel.location[0];
981981
Real Py = kernel.location[1];
982982
Real Pz = kernel.location[2];
983+
RealVect omega_local = kernel.omega;
983984

984985
for(mParIter pti(*mContainer, LOCAL_LEVEL); pti.isValid(); ++pti){
985986
const Long np = pti.numParticles();
@@ -995,7 +996,7 @@ void mParticle::ComputeLagrangianForce(Real dt,
995996

996997
amrex::ParallelFor(np,
997998
[=] AMREX_GPU_DEVICE (int i) noexcept{
998-
auto Ur = (kernel.omega).crossProduct(RealVect(p_ptr[i].pos(0) - Px, p_ptr[i].pos(1) - Py, p_ptr[i].pos(2) - Pz));
999+
auto Ur = omega_local.crossProduct(RealVect(p_ptr[i].pos(0) - Px, p_ptr[i].pos(1) - Py, p_ptr[i].pos(2) - Pz));
9991000
FxP[i] = (Ub + Ur[0] - Up[i])/dt; //
10001001
FyP[i] = (Vb + Ur[1] - Vp[i])/dt; //
10011002
FzP[i] = (Wb + Ur[2] - Wp[i])/dt; //

Source/DiffusedIB_Parallel.cpp

Lines changed: 89 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,9 @@ void mParticle::InteractWithEuler(MultiFab &EulerVel,
345345
//for 1 -> Ns是
346346
int loop = ParticleProperties::loop_ns;
347347

348+
// Sync kernel data to device memory for GPU ParallelFor
349+
syncKernelsToDevice();
350+
348351
BL_ASSERT(loop > 0);
349352
while(loop > 0){
350353
if(verbose) Print() << "[Particle] Ns loop index : " << loop << "\n";
@@ -477,8 +480,7 @@ void mParticle::InitParticles(const Vector<Real>& x,
477480
if( Ml > max_largrangian_num ) max_largrangian_num = Ml;
478481

479482
Real phiK = 0;
480-
auto phiKdata = new Vector<Real>();
481-
auto thetaKdata = new Vector<Real>();
483+
Gpu::HostVector<Real> h_phiK, h_thetaK;
482484
for(int marker_index = 0; marker_index < Ml; marker_index++){
483485
const Real Hk = -1.0 + 2.0 * (marker_index) / ( Ml - 1.0);
484486
Real thetaK = std::acos(Hk);
@@ -487,13 +489,14 @@ void mParticle::InitParticles(const Vector<Real>& x,
487489
}else {
488490
phiK = std::fmod( phiK + 3.809 / std::sqrt(Ml) / std::sqrt( 1 - Math::powi<2>(Hk)) , 2 * Math::pi<Real>());
489491
}
490-
phiKdata->push_back(phiK);
491-
thetaKdata->push_back(thetaK);
492+
h_phiK.push_back(phiK);
493+
h_thetaK.push_back(thetaK);
492494
}
493-
phiKdata->shrink_to_fit();
494-
thetaKdata->shrink_to_fit();
495-
mKernel.phiK = phiKdata->data();
496-
mKernel.thetaK = thetaKdata->data();
495+
mKernel.phiK.resize(Ml);
496+
mKernel.thetaK.resize(Ml);
497+
Gpu::copyAsync(Gpu::hostToDevice, h_phiK.begin(), h_phiK.end(), mKernel.phiK.begin());
498+
Gpu::copyAsync(Gpu::hostToDevice, h_thetaK.begin(), h_thetaK.end(), mKernel.thetaK.begin());
499+
Gpu::streamSynchronize();
497500

498501
if (verbose) Print() << "h: " << h << ", Ml: " << Ml << ", D: " << Math::powi<3>(h) << " gravity : " << gravity << "\n"
499502
<< "Kernel : " << index << ": Location (" << x[index] << ", " << y[index] << ", " << z[index]
@@ -510,6 +513,26 @@ void mParticle::InitParticles(const Vector<Real>& x,
510513
m_Collision.SetGeometry(RealVect(ParticleProperties::GLO), RealVect(ParticleProperties::GHI),particle_kernels[0].radius, h);
511514
}
512515

516+
void mParticle::syncKernelsToDevice ()
517+
{
518+
const int nk = static_cast<int>(particle_kernels.size());
519+
Gpu::HostVector<kernel_gpu> h_kg(nk);
520+
for (int i = 0; i < nk; ++i) {
521+
auto const& pk = particle_kernels[i];
522+
h_kg[i].location = pk.location;
523+
h_kg[i].velocity = pk.velocity;
524+
h_kg[i].omega = pk.omega;
525+
h_kg[i].radius = pk.radius;
526+
h_kg[i].dv = pk.dv;
527+
h_kg[i].phiK = pk.phiK.data();
528+
h_kg[i].thetaK = pk.thetaK.data();
529+
h_kg[i].start_id = pk.start_id;
530+
}
531+
d_kernels.resize(nk);
532+
Gpu::copyAsync(Gpu::hostToDevice, h_kg.begin(), h_kg.end(), d_kernels.begin());
533+
Gpu::streamSynchronize();
534+
}
535+
513536
void mParticle::UpdateLagrangianMarker() {
514537
if (verbose) Print() << "\tmParticle::UpdateLagrangianMarker\n";
515538
// update lagrangian marker attributions
@@ -530,11 +553,12 @@ void mParticle::UpdateLagrangianMarker() {
530553
auto *const myP_ptr = attri[P_ATTR_REAL::My_Marker].data();
531554
auto *const mzP_ptr = attri[P_ATTR_REAL::Mz_Marker].data();
532555

533-
const auto *const ps = particle_kernels.data();
556+
const auto *const ps = d_kernels.data();
557+
const bool do_RKPM_l = do_RKPM;
534558

535559
ParallelFor(np,
536560
[=] AMREX_GPU_DEVICE (const int i) noexcept {
537-
if (!do_RKPM) {
561+
if (!do_RKPM_l) {
538562
const auto id = ids[i];
539563
const auto m_id = particles[i].id();
540564
const auto location = ps[id].location;
@@ -571,7 +595,7 @@ void mParticle::UpdateLagrangianMarker() {
571595

572596
template <typename P = Particle<num_Real, num_Int>>
573597
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
574-
void VelocityInterpolation_cir(int p_iter, P const& p, Real& Up, Real& Vp, Real& Wp,
598+
void VelocityInterpolation_cir(int p_iter, P const& p, ParticleReal& Up, ParticleReal& Vp, ParticleReal& Wp,
575599
Array4<Real const> const& E, int EulerVIndex,
576600
const int *lo, const int *hi,
577601
GpuArray<Real, AMREX_SPACEDIM> const& plo,
@@ -615,10 +639,10 @@ template<typename P>
615639
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
616640
void VelocityInterpolationRKPM_cir(
617641
P p,
618-
Real& U,
619-
Real& V,
620-
Real& W,
621-
Vector<MAP_INFO> RKPM_MAP,
642+
ParticleReal& U,
643+
ParticleReal& V,
644+
ParticleReal& W,
645+
MAP_INFO const* rkpm_data,
622646
Array4<Real const> const& E,
623647
GpuArray<Real, AMREX_SPACEDIM> const& plo,
624648
GpuArray<Real, AMREX_SPACEDIM> const& dx,
@@ -640,10 +664,10 @@ void VelocityInterpolationRKPM_cir(
640664
for(int ii = -1; ii < 2; ii++){
641665
for(int jj = -1; jj < 2; jj++){
642666
for(int kk = -1; kk < 2; kk ++){
643-
auto RKPM = RKPM_MAP.at(cell_index++);
644-
U += RKPM.weight * RKPM.Vcell * E(i + ii, j + jj, k + kk, EulerVIndex );
645-
V += RKPM.weight * RKPM.Vcell * E(i + ii, j + jj, k + kk, EulerVIndex + 1);
646-
W += RKPM.weight * RKPM.Vcell * E(i + ii, j + jj, k + kk, EulerVIndex + 2);
667+
auto rkpm = rkpm_data[cell_index++];
668+
U += rkpm.weight * rkpm.Vcell * E(i + ii, j + jj, k + kk, EulerVIndex );
669+
V += rkpm.weight * rkpm.Vcell * E(i + ii, j + jj, k + kk, EulerVIndex + 1);
670+
W += rkpm.weight * rkpm.Vcell * E(i + ii, j + jj, k + kk, EulerVIndex + 2);
647671
}
648672
}
649673
}
@@ -680,10 +704,13 @@ void mParticle::VelocityInterpolation(MultiFab &EulerVel,
680704
const auto& E = EulerVel.array(pti);
681705

682706
if (do_RKPM) {
707+
const MAP_INFO* rkpm_ptr = d_rkpm_flat.data();
708+
constexpr int STENCIL = mParticle::RKPM_STENCIL_SIZE;
683709
ParallelFor(np,
684710
[=] AMREX_GPU_DEVICE (const int i) {
685711
const auto id = p_ptr[i].id() - 1;
686-
VelocityInterpolationRKPM_cir(p_ptr[i], Up[i], Vp[i], Wp[i], RKPM_MAP[id], E, plo, dx, EulerVelocityIndex);
712+
VelocityInterpolationRKPM_cir(p_ptr[i], Up[i], Vp[i], Wp[i],
713+
rkpm_ptr + id * STENCIL, E, plo, dx, EulerVelocityIndex);
687714
});
688715
}else {
689716
ParallelFor(np,
@@ -714,12 +741,12 @@ void mParticle::ComputeLagrangianForce(Real dt)
714741
auto* FzP = attri[P_ATTR_REAL::Fz_Marker].data();
715742

716743
const auto p_ids = pti.GetIDs().data();
717-
const auto ps = particle_kernels.data();
744+
const auto ps = d_kernels.data();
718745

719746
ParallelFor(np,
720747
[=] AMREX_GPU_DEVICE (const int i) noexcept{
721748
const auto p_id = p_ids[i];
722-
auto p = ps[p_id];
749+
const auto& p = ps[p_id];
723750
const Real Ub = p.velocity[0];
724751
const Real Vb = p.velocity[1];
725752
const Real Wb = p.velocity[2];
@@ -783,9 +810,9 @@ void ForceSpreading_cic (P const& p,
783810
deltaFunction( p.pos(1), yj, dx[1], tV, type);
784811
deltaFunction( p.pos(2), kz, dx[2], tW, type);
785812
Real delta_value = tU * tV * tW;
786-
Gpu::Atomic::AddNoRet(&E(i + ii, j + jj, k + kk, EulerForceIndex ), delta_value * fxP);
787-
Gpu::Atomic::AddNoRet(&E(i + ii, j + jj, k + kk, EulerForceIndex+1), delta_value * fyP);
788-
Gpu::Atomic::AddNoRet(&E(i + ii, j + jj, k + kk, EulerForceIndex+2), delta_value * fzP);
813+
HostDevice::Atomic::Add(&E(i + ii, j + jj, k + kk, EulerForceIndex ), Real(delta_value * fxP));
814+
HostDevice::Atomic::Add(&E(i + ii, j + jj, k + kk, EulerForceIndex+1), Real(delta_value * fyP));
815+
HostDevice::Atomic::Add(&E(i + ii, j + jj, k + kk, EulerForceIndex+2), Real(delta_value * fzP));
789816
}
790817
}
791818
}
@@ -798,13 +825,13 @@ void ForceSpreadingRKPM_cir(
798825
Real Px,
799826
Real Py,
800827
Real Pz,
801-
Real& fxP,
802-
Real& fyP,
803-
Real& fzP,
804-
Real& mxP,
805-
Real& myP,
806-
Real& mzP,
807-
Vector<MAP_INFO> RKPM_MAP,
828+
ParticleReal& fxP,
829+
ParticleReal& fyP,
830+
ParticleReal& fzP,
831+
ParticleReal& mxP,
832+
ParticleReal& myP,
833+
ParticleReal& mzP,
834+
MAP_INFO const* rkpm_data,
808835
Real dv,
809836
Array4<Real> const &E,
810837
GpuArray<Real,AMREX_SPACEDIM> const& plo,
@@ -822,7 +849,8 @@ void ForceSpreadingRKPM_cir(
822849
fxP *= dv;
823850
fyP *= dv;
824851
fzP *= dv;
825-
RealVect moment = RealVect(p.pos(0) - Px, p.pos(1) - Py, p.pos(2) - Pz).crossProduct(RealVect(fxP, fyP, fzP));
852+
RealVect moment = RealVect(Real(p.pos(0) - Px), Real(p.pos(1) - Py), Real(p.pos(2) - Pz)).crossProduct(
853+
RealVect(Real(fxP), Real(fyP), Real(fzP)));
826854
mxP = moment[0];
827855
myP = moment[1];
828856
mzP = moment[2];
@@ -831,10 +859,10 @@ void ForceSpreadingRKPM_cir(
831859
for(int ii = -1; ii < 2; ii++){
832860
for(int jj = -1; jj < 2; jj++){
833861
for(int kk = -1; kk < 2; kk ++){
834-
auto RKPM = RKPM_MAP.at(cell_index++);
835-
Gpu::Atomic::AddNoRet(&E(i + ii, j + jj, k + kk, EulerForceIndex ), RKPM.weight * fxP);
836-
Gpu::Atomic::AddNoRet(&E(i + ii, j + jj, k + kk, EulerForceIndex + 1), RKPM.weight * fyP);
837-
Gpu::Atomic::AddNoRet(&E(i + ii, j + jj, k + kk, EulerForceIndex + 2), RKPM.weight * fzP);
862+
auto rkpm = rkpm_data[cell_index++];
863+
HostDevice::Atomic::Add(&E(i + ii, j + jj, k + kk, EulerForceIndex ), Real(rkpm.weight * fxP));
864+
HostDevice::Atomic::Add(&E(i + ii, j + jj, k + kk, EulerForceIndex + 1), Real(rkpm.weight * fyP));
865+
HostDevice::Atomic::Add(&E(i + ii, j + jj, k + kk, EulerForceIndex + 2), Real(rkpm.weight * fzP));
838866
}
839867
}
840868
}
@@ -864,19 +892,21 @@ void mParticle::ForceSpreading(MultiFab & EulerForce,
864892
const auto *const p_ptr = particles().data();
865893

866894
auto force_index = ParticleProperties::euler_force_index;
867-
const auto ps = particle_kernels.data();
895+
const auto ps = d_kernels.data();
868896

869897
if (do_RKPM) {
898+
const MAP_INFO* rkpm_ptr = d_rkpm_flat.data();
899+
constexpr int STENCIL = mParticle::RKPM_STENCIL_SIZE;
870900
ParallelFor(np,
871901
[=] AMREX_GPU_DEVICE (const int i) noexcept{
872902
const auto p_id = p_ptr[i].id() - 1; // lagrangian marker's id
873903
const auto id = ids[i]; // particle's id
874904
auto loc_ptr = ps[id].location;
875-
auto dv = RKPM_MAP[p_id][0].eps;
905+
auto dv = rkpm_ptr[p_id * STENCIL].eps;
876906
ForceSpreadingRKPM_cir(p_ptr[i], loc_ptr[0], loc_ptr[1], loc_ptr[2],
877907
fxP_ptr[i], fyP_ptr[i], fzP_ptr[i],
878908
mxP_ptr[i], myP_ptr[i], mzP_ptr[i],
879-
RKPM_MAP[p_id], dv, Uarray, plo, dxi, force_index);
909+
rkpm_ptr + p_id * STENCIL, dv, Uarray, plo, dxi, force_index);
880910
});
881911
}else {
882912
ParallelFor(np,
@@ -967,8 +997,8 @@ void mParticle::ForceSpreading(MultiFab & EulerForce,
967997
ParallelAllReduce::Sum(my, ParallelDescriptor::Communicator());
968998
ParallelAllReduce::Sum(mz, ParallelDescriptor::Communicator());
969999

970-
cur_p.ib_force += {fx, fy, fz};
971-
cur_p.ib_moment += {mx, my, mz};
1000+
cur_p.ib_force += {Real(fx), Real(fy), Real(fz)};
1001+
cur_p.ib_moment += {Real(mx), Real(my), Real(mz)};
9721002
}
9731003

9741004
EulerForce.SumBoundary(ParticleProperties::euler_force_index, 3, gm.periodicity());
@@ -1278,6 +1308,23 @@ void mParticle::ResolveWithRPKM(std::string RKPM_file) {
12781308
}
12791309
}
12801310
Print() << "\tRKPM mapping size : " << RKPM_MAP.size() << "\n";
1311+
1312+
// Flatten RKPM_MAP into a contiguous device-accessible array for GPU ParallelFor
1313+
int max_key = 0;
1314+
for (const auto& [key, vec] : RKPM_MAP) {
1315+
max_key = std::max(max_key, key);
1316+
AMREX_ALWAYS_ASSERT(int(vec.size()) <= RKPM_STENCIL_SIZE);
1317+
}
1318+
Gpu::HostVector<MAP_INFO> h_rkpm_flat((max_key + 1) * RKPM_STENCIL_SIZE);
1319+
for (const auto& [key, vec] : RKPM_MAP) {
1320+
for (int j = 0; j < int(vec.size()); ++j) {
1321+
h_rkpm_flat[key * RKPM_STENCIL_SIZE + j] = vec[j];
1322+
}
1323+
}
1324+
d_rkpm_flat.resize(h_rkpm_flat.size());
1325+
Gpu::copyAsync(Gpu::hostToDevice, h_rkpm_flat.begin(), h_rkpm_flat.end(),
1326+
d_rkpm_flat.begin());
1327+
Gpu::streamSynchronize();
12811328
}
12821329

12831330
int mParticle::StartOfLagrangianMarker(size_t index) {

Source/DiffusedIB_Parallel.h

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,12 +119,25 @@ struct kernel{
119119
RealVect Fcp{0.0,0.0,0.0};
120120
RealVect Tcp{0.0,0.0,0.0};
121121

122-
Real* phiK;
123-
Real* thetaK;
122+
Gpu::DeviceVector<Real> phiK;
123+
Gpu::DeviceVector<Real> thetaK;
124124

125125
IntVect TL{0, 0, 0}, RL{0, 0, 0};
126126
};
127127

128+
/// Lightweight POD subset of kernel data, safe for GPU device code.
129+
/// Stored in managed/device memory via Gpu::DeviceVector.
130+
struct kernel_gpu {
131+
RealVect location;
132+
RealVect velocity;
133+
RealVect omega;
134+
Real radius;
135+
Real dv;
136+
Real const* phiK; ///< device pointer into kernel::phiK
137+
Real const* thetaK; ///< device pointer into kernel::thetaK
138+
int start_id;
139+
};
140+
128141
using CusParIter = ParIter<0, 0, num_Real, num_Int>;
129142
// lagrangian marker manager
130143
using mParticleContainer = ParticleContainer<0, 0, num_Real, num_Int>;
@@ -328,6 +341,15 @@ class mParticle
328341

329342
Vector<kernel> particle_kernels;
330343

344+
/// GPU-accessible copy of kernel data for use in device lambdas.
345+
Gpu::DeviceVector<kernel_gpu> d_kernels;
346+
347+
/** \brief Sync particle_kernels host data to d_kernels device memory.
348+
* Must be called after any host-side modification to particle_kernels
349+
* and before GPU ParallelFor that reads kernel data.
350+
*/
351+
void syncKernelsToDevice ();
352+
331353
mParticleContainer *mContainer{nullptr};
332354

333355
ParticleCollision m_Collision;
@@ -340,6 +362,10 @@ class mParticle
340362

341363
std::map<int, Vector<MAP_INFO>> RKPM_MAP;
342364

365+
/// GPU-friendly flattened RKPM stencil data: d_rkpm_flat[id * RKPM_STENCIL_SIZE + cell_index]
366+
static constexpr int RKPM_STENCIL_SIZE = 27;
367+
Gpu::DeviceVector<MAP_INFO> d_rkpm_flat;
368+
343369
int max_largrangian_num = 0;
344370

345371
uint32_t ib_force_file_index = 0;

0 commit comments

Comments
 (0)