Skip to content

Commit d884f44

Browse files
ax3l and atmyers authored
Pure SoA Particle: Separate Array for IdCPU (#3585)
## Summary

This addresses a regression we see when moving to pure SoA particles:

- slightly slower read/write to Ids when needed, e.g., for sorting
- issues going up to the full 64-bit range

## Additional background

Once finished, this will close #3569.

## Checklist

The proposed changes:

- [ ] fix a bug or incorrect behavior in AMReX
- [x] add new capabilities to AMReX
- [ ] changes answers in the test suite to more than roundoff level
- [ ] are likely to significantly affect the results of downstream AMReX users
- [ ] include documentation in the code and/or rst files, if appropriate

---------

Co-authored-by: Andrew Myers <[email protected]>
1 parent 43d71da commit d884f44

File tree

8 files changed

+212
-56
lines changed

8 files changed

+212
-56
lines changed

Src/Particle/AMReX_ParticleContainerI.H

Lines changed: 58 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -18,10 +18,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
1818
if (h_redistribute_int_comp[i]) {++num_int_comm_comps;}
1919
}
2020

21-
if constexpr(!ParticleType::is_soa_particle) {
22-
particle_size = sizeof(ParticleType);
21+
if constexpr (ParticleType::is_soa_particle) {
22+
particle_size = sizeof(uint64_t); // idcpu
2323
} else {
24-
particle_size = 0;
24+
particle_size = sizeof(ParticleType);
2525
}
2626
superparticle_size = particle_size +
2727
num_real_comm_comps*sizeof(ParticleReal) + num_int_comm_comps*sizeof(int);
@@ -1095,7 +1095,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
10951095
const size_t np_total = np + ptile.numNeighborParticles();
10961096

10971097
if (memEfficientSort) {
1098-
if constexpr(!ParticleType::is_soa_particle) {
1098+
if constexpr (!ParticleType::is_soa_particle) {
10991099
static_assert(sizeof(ParticleType)%4 == 0 && sizeof(uint32_t) == 4);
11001100
using tmp_t = std::conditional_t<sizeof(ParticleType)%8 == 0,
11011101
uint64_t, uint32_t>;
@@ -1530,7 +1530,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
15301530
unsigned npart = ptile_ptrs[pmap_it]->numParticles();
15311531
ParticleLocData pld;
15321532

1533-
if constexpr(!ParticleType::is_soa_particle){
1533+
if constexpr (!ParticleType::is_soa_particle){
15341534

15351535
if (npart != 0) {
15361536
Long last = npart - 1;
@@ -1647,7 +1647,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
16471647
}
16481648
}
16491649

1650-
} else{ // soa particle
1650+
} else { // soa particle
16511651

16521652
auto particle_tile = ptile_ptrs[pmap_it];
16531653
if (npart != 0) {
@@ -1663,6 +1663,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
16631663
}
16641664

16651665
if (p.id() < 0){
1666+
soa.GetIdCPUData()[pindex] = soa.GetIdCPUData()[last];
16661667
for (int comp = 0; comp < NumRealComps(); comp++) {
16671668
soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
16681669
}
@@ -1679,6 +1680,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
16791680
particlePostLocate(p, pld, lev);
16801681

16811682
if (p.id() < 0) {
1683+
soa.GetIdCPUData()[pindex] = soa.GetIdCPUData()[last];
16821684
for (int comp = 0; comp < NumRealComps(); comp++) {
16831685
soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
16841686
}
@@ -1696,6 +1698,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
16961698
// We own it but must shift it to another place.
16971699
auto index = std::make_pair(pld.m_grid, pld.m_tile);
16981700
AMREX_ASSERT(soa_local[pld.m_lev][index].size() == num_threads);
1701+
{
1702+
auto& arr = soa_local[pld.m_lev][index][thread_num].GetIdCPUData();
1703+
arr.push_back(soa.GetIdCPUData()[pindex]);
1704+
}
16991705
for (int comp = 0; comp < NumRealComps(); ++comp) {
17001706
RealVector& arr = soa_local[pld.m_lev][index][thread_num].GetRealData(comp);
17011707
arr.push_back(soa.GetRealData(comp)[pindex]);
@@ -1715,6 +1721,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
17151721
particles_to_send.resize(new_size);
17161722

17171723
char* dst = &particles_to_send[old_size];
1724+
{
1725+
std::memcpy(dst, &soa.GetIdCPUData()[pindex], sizeof(uint64_t));
1726+
dst += sizeof(uint64_t);
1727+
}
17181728
int array_comp_start = AMREX_SPACEDIM + NStructReal;
17191729
for (int comp = 0; comp < NumRealComps(); comp++) {
17201730
if (h_redistribute_real_comp[array_comp_start + comp]) {
@@ -1733,6 +1743,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
17331743
}
17341744

17351745
if (p.id() < 0){
1746+
soa.GetIdCPUData()[pindex] = soa.GetIdCPUData()[last];
17361747
for (int comp = 0; comp < NumRealComps(); comp++) {
17371748
soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
17381749
}
@@ -1747,6 +1758,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
17471758
++pindex;
17481759
}
17491760

1761+
{
1762+
auto& iddata = soa.GetIdCPUData();
1763+
iddata.erase(iddata.begin() + last + 1, iddata.begin() + npart);
1764+
}
17501765
for (int comp = 0; comp < NumRealComps(); comp++) {
17511766
RealVector& rdata = soa.GetRealData(comp);
17521767
rdata.erase(rdata.begin() + last + 1, rdata.begin() + npart);
@@ -1828,6 +1843,12 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
18281843
auto& soa = ptile.GetStructOfArrays();
18291844
auto& soa_tmp = soa_local[lev][index];
18301845
for (int i = 0; i < num_threads; ++i) {
1846+
{
1847+
auto& arr = soa.GetIdCPUData();
1848+
auto& tmp = soa_tmp[i].GetIdCPUData();
1849+
arr.insert(arr.end(), tmp.begin(), tmp.end());
1850+
tmp.erase(tmp.begin(), tmp.end());
1851+
}
18311852
for (int comp = 0; comp < NumRealComps(); ++comp) {
18321853
RealVector& arr = soa.GetRealData(comp);
18331854
RealVector& tmp = soa_tmp[i].GetRealData(comp);
@@ -2045,20 +2066,16 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
20452066

20462067
Particle<NStructReal, NStructInt> p;
20472068

2048-
if constexpr (!ParticleType::is_soa_particle) {
2049-
std::memcpy(&p, pbuf, sizeof(ParticleType));
2050-
} else {
2069+
if constexpr (ParticleType::is_soa_particle) {
2070+
std::memcpy(&p.m_idcpu, pbuf, sizeof(uint64_t));
2071+
20512072
ParticleReal pos[AMREX_SPACEDIM];
2052-
std::memcpy(&pos[0], pbuf, AMREX_SPACEDIM*sizeof(ParticleReal));
2073+
std::memcpy(&pos[0], pbuf + sizeof(uint64_t), AMREX_SPACEDIM*sizeof(ParticleReal));
20532074
AMREX_D_TERM(p.pos(0) = pos[0];,
20542075
p.pos(1) = pos[1];,
20552076
p.pos(2) = pos[2]);
2056-
2057-
int idcpu[2];
2058-
std::memcpy(&idcpu[0], pbuf + NumRealComps()*sizeof(ParticleReal), 2*sizeof(int));
2059-
2060-
p.id() = idcpu[0];
2061-
p.cpu() = idcpu[1];
2077+
} else {
2078+
std::memcpy(&p, pbuf, sizeof(ParticleType));
20622079
}
20632080

20642081
bool success = Where(p, pld, lev_min, lev_max, 0);
@@ -2097,7 +2114,12 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
20972114
rcv_tile[ipart])];
20982115
char* pbuf = ((char*) &recvdata[offset]) + j*superparticle_size;
20992116

2100-
if constexpr(! ParticleType::is_soa_particle) {
2117+
if constexpr (ParticleType::is_soa_particle) {
2118+
uint64_t idcpudata;
2119+
std::memcpy(&idcpudata, pbuf, sizeof(uint64_t));
2120+
pbuf += sizeof(uint64_t);
2121+
ptile.GetStructOfArrays().GetIdCPUData().push_back(idcpudata);
2122+
} else {
21012123
ParticleType p;
21022124
std::memcpy(&p, pbuf, sizeof(ParticleType));
21032125
pbuf += sizeof(ParticleType);
@@ -2146,6 +2168,10 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
21462168
host_int_attribs.reserve(15);
21472169
host_int_attribs.resize(finestLevel()+1);
21482170

2171+
Vector<std::map<std::pair<int, int>, Gpu::HostVector<uint64_t> > > host_idcpu;
2172+
host_idcpu.reserve(15);
2173+
host_idcpu.resize(finestLevel()+1);
2174+
21492175
ipart = 0;
21502176
for (int i = 0; i < nrcvs; ++i)
21512177
{
@@ -2159,7 +2185,15 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
21592185

21602186
char* pbuf = ((char*) &recvdata[offset]) + j*superparticle_size;
21612187

2162-
if constexpr(! ParticleType::is_soa_particle) {
2188+
host_real_attribs[lev][ind].resize(NumRealComps());
2189+
host_int_attribs[lev][ind].resize(NumIntComps());
2190+
2191+
if constexpr (ParticleType::is_soa_particle) {
2192+
uint64_t idcpudata;
2193+
std::memcpy(&idcpudata, pbuf, sizeof(uint64_t));
2194+
pbuf += sizeof(uint64_t);
2195+
host_idcpu[lev][ind].push_back(idcpudata);
2196+
} else {
21632197
ParticleType p;
21642198
std::memcpy(&p, pbuf, sizeof(ParticleType));
21652199
pbuf += sizeof(ParticleType);
@@ -2210,7 +2244,12 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
22102244
auto new_size = old_size + src_tile.size();
22112245
dst_tile.resize(new_size);
22122246

2213-
if constexpr(! ParticleType::is_soa_particle) {
2247+
if constexpr (ParticleType::is_soa_particle) {
2248+
Gpu::copyAsync(Gpu::hostToDevice,
2249+
host_idcpu[host_lev][std::make_pair(grid,tile)].begin(),
2250+
host_idcpu[host_lev][std::make_pair(grid,tile)].end(),
2251+
dst_tile.GetStructOfArrays().GetIdCPUData().begin() + old_size);
2252+
} else {
22142253
Gpu::copyAsync(Gpu::hostToDevice,
22152254
src_tile.begin(), src_tile.end(),
22162255
dst_tile.GetArrayOfStructs().begin() + old_size);

Src/Particle/AMReX_ParticleIO.H

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -954,6 +954,10 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
954954
host_int_attribs.reserve(15);
955955
host_int_attribs.resize(finest_level_in_file+1);
956956

957+
Vector<std::map<std::pair<int, int>, Gpu::HostVector<uint64_t> > > host_idcpu;
958+
host_idcpu.reserve(15);
959+
host_idcpu.resize(finestLevel()+1);
960+
957961
for (int i = 0; i < cnt; i++) {
958962
// note: for pure SoA particle layouts, we do write the id, cpu and positions as a struct
959963
// for backwards compatibility with readers
@@ -1021,8 +1025,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
10211025
host_real_attribs[pld.m_lev][ind][j].push_back(ptemp.pos(j));
10221026
}
10231027

1024-
host_int_attribs[pld.m_lev][ind][0].push_back(ptemp.id());
1025-
host_int_attribs[pld.m_lev][ind][1].push_back(ptemp.cpu());
1028+
host_idcpu[pld.m_lev][ind].push_back(ptemp.m_idcpu);
10261029

10271030
// read all other SoA
10281031
// add the real...
@@ -1032,7 +1035,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
10321035
}
10331036

10341037
// ... and int array data
1035-
for (int icomp = 2; icomp < NumIntComps(); icomp++) {
1038+
for (int icomp = 0; icomp < NumIntComps(); icomp++) {
10361039
host_int_attribs[lev][ind][icomp].push_back(*iptr);
10371040
++iptr;
10381041
}
@@ -1061,6 +1064,11 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
10611064
{
10621065
Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(),
10631066
dst_tile.GetArrayOfStructs().begin() + old_size);
1067+
} else {
1068+
Gpu::copyAsync(Gpu::hostToDevice,
1069+
host_idcpu[host_lev][std::make_pair(grid,tile)].begin(),
1070+
host_idcpu[host_lev][std::make_pair(grid,tile)].end(),
1071+
dst_tile.GetStructOfArrays().GetIdCPUData().begin() + old_size);
10641072
}
10651073

10661074
for (int i = 0; i < NumRealComps(); ++i) { // NOLINT(readability-misleading-indentation)

Src/Particle/AMReX_ParticleInit.H

Lines changed: 24 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1062,6 +1062,10 @@ InitRandom (Long icount,
10621062
host_int_attribs.reserve(15);
10631063
host_int_attribs.resize(finestLevel()+1);
10641064

1065+
Vector<std::map<std::pair<int, int>, Gpu::HostVector<uint64_t> > > host_idcpu;
1066+
host_idcpu.reserve(15);
1067+
host_idcpu.resize(finestLevel()+1);
1068+
10651069
for (Long j = 0; j < icount; j++)
10661070
{
10671071
Particle<0, 0> ptest;
@@ -1117,8 +1121,9 @@ InitRandom (Long icount,
11171121
host_real_attribs[pld.m_lev][ind][i].push_back(pos[j*AMREX_SPACEDIM+i]);
11181122
}
11191123

1120-
host_int_attribs[pld.m_lev][ind][0].push_back(ParticleType::NextID());
1121-
host_int_attribs[pld.m_lev][ind][1].push_back(MyProc);
1124+
host_idcpu[pld.m_lev][ind].push_back(0);
1125+
ParticleIDWrapper(host_idcpu[pld.m_lev][ind].back()) = ParticleType::NextID();
1126+
ParticleCPUWrapper(host_idcpu[pld.m_lev][ind].back()) = ParallelDescriptor::MyProc();
11221127

11231128
host_particles[pld.m_lev][ind];
11241129

@@ -1157,6 +1162,11 @@ InitRandom (Long icount,
11571162
{
11581163
Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(),
11591164
dst_tile.GetArrayOfStructs().begin() + old_size);
1165+
} else {
1166+
Gpu::copyAsync(Gpu::hostToDevice,
1167+
host_idcpu[host_lev][std::make_pair(grid,tile)].begin(),
1168+
host_idcpu[host_lev][std::make_pair(grid,tile)].end(),
1169+
dst_tile.GetStructOfArrays().GetIdCPUData().begin() + old_size);
11601170
}
11611171

11621172
for (int i = 0; i < NArrayReal; ++i) { // NOLINT(readability-misleading-indentation)
@@ -1201,6 +1211,10 @@ InitRandom (Long icount,
12011211
host_int_attribs.reserve(15);
12021212
host_int_attribs.resize(finestLevel()+1);
12031213

1214+
Vector<std::map<std::pair<int, int>, Gpu::HostVector<uint64_t> > > host_idcpu;
1215+
host_idcpu.reserve(15);
1216+
host_idcpu.resize(finestLevel()+1);
1217+
12041218
for (Long icnt = 0; icnt < M; icnt++) {
12051219
Particle<0, 0> ptest;
12061220
for (int i = 0; i < AMREX_SPACEDIM; i++) {
@@ -1261,8 +1275,9 @@ InitRandom (Long icount,
12611275
host_real_attribs[pld.m_lev][ind][i].push_back(ptest.pos(i));
12621276
}
12631277

1264-
host_int_attribs[pld.m_lev][ind][0].push_back(ptest.id());
1265-
host_int_attribs[pld.m_lev][ind][1].push_back(ptest.cpu());
1278+
host_idcpu[pld.m_lev][ind].push_back(0);
1279+
ParticleIDWrapper(host_idcpu[pld.m_lev][ind].back()) = ParticleType::NextID();
1280+
ParticleCPUWrapper(host_idcpu[pld.m_lev][ind].back()) = ParallelDescriptor::MyProc();
12661281

12671282
host_particles[pld.m_lev][ind];
12681283

@@ -1300,6 +1315,11 @@ InitRandom (Long icount,
13001315
{
13011316
Gpu::copyAsync(Gpu::hostToDevice, src_tile.begin(), src_tile.end(),
13021317
dst_tile.GetArrayOfStructs().begin() + old_size);
1318+
} else {
1319+
Gpu::copyAsync(Gpu::hostToDevice,
1320+
host_idcpu[host_lev][std::make_pair(grid,tile)].begin(),
1321+
host_idcpu[host_lev][std::make_pair(grid,tile)].end(),
1322+
dst_tile.GetStructOfArrays().GetIdCPUData().begin() + old_size);
13031323
}
13041324

13051325
for (int i = 0; i < NArrayReal; ++i) { // NOLINT(readability-misleading-indentation)

0 commit comments

Comments
 (0)