Skip to content

Commit 2b7a20b

Browse files
Felix Schlepperf3sch
authored andcommitted
ITS: GPU: avoid host copies
1 parent 68c9913 commit 2b7a20b

File tree

5 files changed

+226
-155
lines changed

5 files changed

+226
-155
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include <gsl/gsl>
1717
#include <bitset>
18+
#include <vector>
1819

1920
#include "ITStracking/BoundedAllocator.h"
2021
#include "ITStracking/TimeFrame.h"
@@ -57,6 +58,10 @@ class TimeFrameGPU : public TimeFrame<nLayers>
5758
void loadMultiplicityCutMask(const int);
5859
void loadVertices(const int);
5960

61+
// Device configuration
62+
void loadConfigMinPt(const std::vector<float>&);
63+
float* getConfigMinPtDevice() { return mConfigMinPtDevice; }
64+
6065
///
6166
void createTrackletsLUTDevice(const int, const int);
6267
void createTrackletsLUTDeviceArray(const int);
@@ -65,7 +70,6 @@ class TimeFrameGPU : public TimeFrame<nLayers>
6570
void loadCellsDevice();
6671
void loadCellsLUTDevice();
6772
void loadTrackSeedsDevice();
68-
void loadTrackSeedsChi2Device();
6973
void loadRoadsDevice();
7074
void loadTrackSeedsDevice(bounded_vector<CellSeedN>&);
7175
void createTrackletsBuffers(const int);
@@ -78,8 +82,9 @@ class TimeFrameGPU : public TimeFrame<nLayers>
7882
void createNeighboursIndexTablesDevice(const int);
7983
void createNeighboursDevice(const unsigned int layer);
8084
void createNeighboursLUTDevice(const int, const unsigned int);
81-
void createTrackITSExtDevice(bounded_vector<CellSeedN>&);
85+
void createTrackITSExtDevice(const int);
8286
void downloadTrackITSExtDevice(bounded_vector<CellSeedN>&);
87+
void downloadTrackITSExtDevice(const int);
8388
void downloadCellsNeighboursDevice(std::vector<bounded_vector<std::pair<int, int>>>&, const int);
8489
void downloadNeighboursLUTDevice(bounded_vector<int>&, const int);
8590
void downloadCellsDevice();
@@ -130,8 +135,9 @@ class TimeFrameGPU : public TimeFrame<nLayers>
130135
int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; }
131136
CellSeedN** getDeviceArrayCells() { return mCellsDeviceArray; }
132137
CellSeedN* getDeviceTrackSeeds() { return mTrackSeedsDevice; }
138+
auto& getDeviceCellSeeds() { return mCellSeedsCandidates; }
139+
void clearDeviceCellSeeds();
133140
o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; }
134-
float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; }
135141
int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; }
136142
uint8_t* getDeviceMultCutMask() { return mMultMaskDevice; }
137143

@@ -158,6 +164,7 @@ class TimeFrameGPU : public TimeFrame<nLayers>
158164
private:
159165
void allocMemAsync(void**, size_t, Stream&, bool); // Abstract owned and unowned memory allocations on specific stream
160166
void allocMem(void**, size_t, bool); // Abstract owned and unowned memory allocations on default stream
167+
void deallocMem(void*, size_t, bool); // Abstract owned and unowned memory deallocations on default stream
161168
TimeFrameGPUParameters mGpuParams;
162169

163170
// Host-available device buffer sizes
@@ -198,8 +205,6 @@ class TimeFrameGPU : public TimeFrame<nLayers>
198205
CellSeedN* mTrackSeedsDevice{nullptr};
199206
std::array<o2::track::TrackParCovF*, nLayers - 2> mCellSeedsDevice;
200207
o2::track::TrackParCovF** mCellSeedsDeviceArray;
201-
std::array<float*, nLayers - 2> mCellSeedsChi2Device;
202-
float** mCellSeedsChi2DeviceArray;
203208

204209
Road<nLayers - 2>* mRoadsDevice;
205210
TrackITSExt* mTrackITSExtDevice;
@@ -218,7 +223,12 @@ class TimeFrameGPU : public TimeFrame<nLayers>
218223
std::bitset<nLayers + 1> mPinnedTrackingFrameInfo{0};
219224

220225
// Temporary buffer for storing output tracks from GPU tracking
226+
std::vector<gpuPair<CellSeedN*, size_t>> mCellSeedsCandidates;
221227
bounded_vector<TrackITSExt> mTrackITSExt;
228+
229+
// Configuration
230+
// TODO eventually put into permanent storage
231+
float* mConfigMinPtDevice{nullptr};
222232
};
223233

224234
template <int nLayers>

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -187,31 +187,31 @@ int filterCellNeighboursHandler(gpuPair<int, int>*,
187187
o2::its::ExternalAllocator* = nullptr);
188188

189189
template <int nLayers = 7>
190-
void processNeighboursHandler(const int startLayer,
191-
const int startLevel,
192-
CellSeed<nLayers>** allCellSeeds,
193-
CellSeed<nLayers>* currentCellSeeds,
194-
std::array<int, nLayers - 2>& nCells,
195-
const unsigned char** usedClusters,
196-
std::array<int*, nLayers - 2>& neighbours,
197-
gsl::span<int*> neighboursDeviceLUTs,
198-
const TrackingFrameInfo** foundTrackingFrameInfo,
199-
bounded_vector<CellSeed<nLayers>>& seedsHost,
200-
const float bz,
201-
const float MaxChi2ClusterAttachment,
202-
const float maxChi2NDF,
203-
const o2::base::Propagator* propagator,
204-
const o2::base::PropagatorF::MatCorrType matCorrType,
205-
o2::its::ExternalAllocator* alloc,
206-
const int nBlocks,
207-
const int nThreads);
190+
int processNeighboursHandler(const int startLayer,
191+
const int startLevel,
192+
CellSeed<nLayers>** allCellSeeds,
193+
CellSeed<nLayers>* currentCellSeeds,
194+
std::array<int, nLayers - 2>& nCells,
195+
const unsigned char** usedClusters,
196+
std::array<int*, nLayers - 2>& neighbours,
197+
gsl::span<int*> neighboursDeviceLUTs,
198+
const TrackingFrameInfo** foundTrackingFrameInfo,
199+
std::vector<gpuPair<CellSeed<nLayers>*, size_t>>& cellSeeds,
200+
const float bz,
201+
const float MaxChi2ClusterAttachment,
202+
const float maxChi2NDF,
203+
const o2::base::Propagator* propagator,
204+
const o2::base::PropagatorF::MatCorrType matCorrType,
205+
o2::its::ExternalAllocator* alloc,
206+
const int nBlocks,
207+
const int nThreads);
208208

209209
template <int nLayers = 7>
210-
void trackSeedHandler(CellSeed<nLayers>* trackSeeds,
210+
void trackSeedHandler(gpuPair<CellSeed<nLayers>*, size_t> trackSeeds,
211211
const TrackingFrameInfo** foundTrackingFrameInfo,
212212
o2::its::TrackITSExt* tracks,
213-
std::vector<float>& minPtsHost,
214-
const unsigned int nSeeds,
213+
const float* minPts,
214+
const int offset,
215215
const float Bz,
216216
const int startLevel,
217217
float maxChi2ClusterAttachment,
@@ -220,5 +220,10 @@ void trackSeedHandler(CellSeed<nLayers>* trackSeeds,
220220
const o2::base::PropagatorF::MatCorrType matCorrType,
221221
const int nBlocks,
222222
const int nThreads);
223+
224+
void sortTrackITSExtDevice(o2::its::TrackITSExt* tracks,
225+
int n,
226+
o2::its::ExternalAllocator* alloc);
227+
223228
} // namespace o2::its
224229
#endif // ITSTRACKINGGPU_TRACKINGKERNELS_H_

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,25 @@ void TimeFrameGPU<nLayers>::allocMem(void** ptr, size_t size, bool extAllocator)
5656
}
5757
}
5858

59+
template <int nLayers>
60+
void TimeFrameGPU<nLayers>::deallocMem(void* ptr, size_t size, bool extAllocator)
61+
{
62+
if (extAllocator) {
63+
this->mAllocator->deallocate(reinterpret_cast<char*>(ptr), size);
64+
} else {
65+
GPULog("Calling default CUDA deallocator");
66+
GPUChkErrS(cudaFree(ptr));
67+
}
68+
}
69+
70+
template <int nLayers>
71+
void TimeFrameGPU<nLayers>::loadConfigMinPt(const std::vector<float>& minPts)
72+
{
73+
GPUTimer timer("loading config minpts");
74+
allocMem(reinterpret_cast<void**>(&mConfigMinPtDevice), minPts.size() * sizeof(float), this->getExtAllocator());
75+
GPUChkErrS(cudaMemcpy(mConfigMinPtDevice, minPts.data(), minPts.size() * sizeof(float), cudaMemcpyHostToDevice));
76+
}
77+
5978
template <int nLayers>
6079
void TimeFrameGPU<nLayers>::loadIndexTableUtils(const int iteration)
6180
{
@@ -450,14 +469,14 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer)
450469
}
451470

452471
template <int nLayers>
453-
void TimeFrameGPU<nLayers>::createTrackITSExtDevice(bounded_vector<CellSeedN>& seeds)
472+
void TimeFrameGPU<nLayers>::createTrackITSExtDevice(const int total)
454473
{
455474
GPUTimer timer("reserving tracks");
456-
mTrackITSExt = bounded_vector<TrackITSExt>(seeds.size(), {}, this->getMemoryPool().get());
457-
GPULog("gpu-allocation: reserving {} tracks, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / constants::MB);
458-
allocMem(reinterpret_cast<void**>(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), this->getExtAllocator());
459-
GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt)));
460-
GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable));
475+
mTrackITSExt = bounded_vector<TrackITSExt>(total, {}, this->getMemoryPool().get());
476+
GPULog("gpu-allocation: reserving {} tracks, for {:.2f} MB.", total, total * sizeof(o2::its::TrackITSExt) / constants::MB);
477+
allocMem(reinterpret_cast<void**>(&mTrackITSExtDevice), total * sizeof(o2::its::TrackITSExt), this->getExtAllocator());
478+
GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, total * sizeof(o2::its::TrackITSExt)));
479+
// GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), total * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable));
461480
}
462481

463482
template <int nLayers>
@@ -508,6 +527,14 @@ void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(bounded_vector<CellSeedN>&
508527
GPUChkErrS(cudaHostUnregister(seeds.data()));
509528
}
510529

530+
template <int nLayers>
531+
void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(const int total)
532+
{
533+
GPUTimer timer("downloading tracks");
534+
GPULog("gpu-transfer: downloading {} tracks, for {:.2f} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / constants::MB);
535+
GPUChkErrS(cudaMemcpy(mTrackITSExt.data(), mTrackITSExtDevice, total * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost));
536+
}
537+
511538
template <int nLayers>
512539
void TimeFrameGPU<nLayers>::unregisterHostMemory(const int maxLayers)
513540
{
@@ -543,6 +570,16 @@ void TimeFrameGPU<nLayers>::unregisterHostMemory(const int maxLayers)
543570
checkedUnregisterArray(mPinnedROFramesClusters, mROFramesClustersDevice);
544571
}
545572

573+
template <int nLayers>
574+
void TimeFrameGPU<nLayers>::clearDeviceCellSeeds()
575+
{
576+
GPUTimer timer("clearing device seeds");
577+
for (auto& p : mCellSeedsCandidates) {
578+
deallocMem(reinterpret_cast<void*>(p.first), p.second, this->getExtAllocator());
579+
}
580+
mCellSeedsCandidates.clear();
581+
}
582+
546583
template <int nLayers>
547584
void TimeFrameGPU<nLayers>::initialise(const int iteration,
548585
const TrackingParameters& trkParam,

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 46 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
2929
{
3030
mTimeFrameGPU->initialise(iteration, this->mTrkParams[iteration], nLayers);
3131

32+
// Configuration
33+
// TODO only do once and put these into permanent storage
34+
mTimeFrameGPU->loadConfigMinPt(this->mTrkParams[iteration].MinPt);
35+
3236
// on default stream
3337
mTimeFrameGPU->loadVertices(iteration);
3438
mTimeFrameGPU->loadIndexTableUtils(iteration);
@@ -290,56 +294,55 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
290294
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
291295
for (int startLevel{this->mTrkParams[iteration].CellsPerRoad()}; startLevel >= this->mTrkParams[iteration].CellMinimumLevel(); --startLevel) {
292296
const int minimumLayer{startLevel - 1};
293-
bounded_vector<CellSeed<nLayers>> trackSeeds(this->getMemoryPool().get());
294-
for (int startLayer{this->mTrkParams[iteration].CellsPerRoad() - 1}; startLayer >= minimumLayer; --startLayer) {
297+
int totalCellSeeds{0};
298+
for (int startLayer{this->mTrkParams[iteration].NeighboursPerRoad()}; startLayer >= minimumLayer; --startLayer) {
295299
if ((this->mTrkParams[iteration].StartLayerMask & (1 << (startLayer + 2))) == 0) {
296300
continue;
297301
}
298-
processNeighboursHandler<nLayers>(startLayer,
299-
startLevel,
300-
mTimeFrameGPU->getDeviceArrayCells(),
301-
mTimeFrameGPU->getDeviceCells()[startLayer],
302-
mTimeFrameGPU->getArrayNCells(),
303-
mTimeFrameGPU->getDeviceArrayUsedClusters(),
304-
mTimeFrameGPU->getDeviceNeighboursAll(),
305-
mTimeFrameGPU->getDeviceNeighboursLUTs(),
306-
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
307-
trackSeeds,
308-
this->mBz,
309-
this->mTrkParams[0].MaxChi2ClusterAttachment,
310-
this->mTrkParams[0].MaxChi2NDF,
311-
mTimeFrameGPU->getDevicePropagator(),
312-
this->mTrkParams[0].CorrType,
313-
mTimeFrameGPU->getExternalAllocator(),
314-
conf.nBlocksProcessNeighbours[iteration],
315-
conf.nThreadsProcessNeighbours[iteration]);
302+
totalCellSeeds += processNeighboursHandler<nLayers>(startLayer,
303+
startLevel,
304+
mTimeFrameGPU->getDeviceArrayCells(),
305+
mTimeFrameGPU->getDeviceCells()[startLayer],
306+
mTimeFrameGPU->getArrayNCells(),
307+
mTimeFrameGPU->getDeviceArrayUsedClusters(),
308+
mTimeFrameGPU->getDeviceNeighboursAll(),
309+
mTimeFrameGPU->getDeviceNeighboursLUTs(),
310+
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
311+
mTimeFrameGPU->getDeviceCellSeeds(),
312+
this->mBz,
313+
this->mTrkParams[0].MaxChi2ClusterAttachment,
314+
this->mTrkParams[0].MaxChi2NDF,
315+
mTimeFrameGPU->getDevicePropagator(),
316+
this->mTrkParams[0].CorrType,
317+
mTimeFrameGPU->getExternalAllocator(),
318+
conf.nBlocksProcessNeighbours[iteration],
319+
conf.nThreadsProcessNeighbours[iteration]);
316320
}
317-
// fixme: I don't want to move tracks back and forth, but I need a way to use a thrust::allocator that is aware of our managed memory.
318-
if (trackSeeds.empty()) {
319-
LOGP(debug, "No track seeds found, skipping track finding");
321+
if (!totalCellSeeds) {
320322
continue;
321323
}
322-
mTimeFrameGPU->createTrackITSExtDevice(trackSeeds);
323-
mTimeFrameGPU->loadTrackSeedsDevice(trackSeeds);
324-
325-
trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed* trackSeeds
326-
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo
327-
mTimeFrameGPU->getDeviceTrackITSExt(), // o2::its::TrackITSExt* tracks
328-
this->mTrkParams[iteration].MinPt, // std::vector<float>& minPtsHost,
329-
trackSeeds.size(), // const size_t nSeeds
330-
this->mBz, // const float Bz
331-
startLevel, // const int startLevel,
332-
this->mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment
333-
this->mTrkParams[0].MaxChi2NDF, // float maxChi2NDF
334-
mTimeFrameGPU->getDevicePropagator(), // const o2::base::Propagator* propagator
335-
this->mTrkParams[0].CorrType, // o2::base::PropagatorImpl<float>::MatCorrType
336-
conf.nBlocksTracksSeeds[iteration],
337-
conf.nThreadsTracksSeeds[iteration]);
338-
339-
mTimeFrameGPU->downloadTrackITSExtDevice(trackSeeds);
324+
mTimeFrameGPU->createTrackITSExtDevice(totalCellSeeds);
325+
int offset{0};
326+
for (auto& p : mTimeFrameGPU->getDeviceCellSeeds()) {
327+
trackSeedHandler<nLayers>(p,
328+
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
329+
mTimeFrameGPU->getDeviceTrackITSExt(),
330+
mTimeFrameGPU->getConfigMinPtDevice(),
331+
offset,
332+
this->mBz,
333+
startLevel,
334+
this->mTrkParams[0].MaxChi2ClusterAttachment,
335+
this->mTrkParams[0].MaxChi2NDF,
336+
mTimeFrameGPU->getDevicePropagator(),
337+
this->mTrkParams[0].CorrType,
338+
conf.nBlocksTracksSeeds[iteration],
339+
conf.nThreadsTracksSeeds[iteration]);
340+
offset += p.second;
341+
}
342+
sortTrackITSExtDevice(mTimeFrameGPU->getDeviceTrackITSExt(), offset, mTimeFrameGPU->getExternalAllocator());
343+
mTimeFrameGPU->downloadTrackITSExtDevice(offset);
340344

341345
auto& tracks = mTimeFrameGPU->getTrackITSExt();
342-
343346
for (auto& track : tracks) {
344347
if (!track.getChi2()) {
345348
continue; // this is to skip the unset tracks that are put at the beginning of the vector by the sorting. To see if this can be optimised.
@@ -382,6 +385,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
382385
}
383386
mTimeFrameGPU->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
384387
}
388+
mTimeFrameGPU->clearDeviceCellSeeds();
385389
mTimeFrameGPU->loadUsedClustersDevice();
386390
}
387391
};

0 commit comments

Comments
 (0)