3131#include " ITStracking/Configuration.h"
3232#include " ITStracking/IndexTableUtils.h"
3333#include " ITStracking/MathUtils.h"
34+ #include " ITStracking/ExternalAllocator.h"
3435#include " DataFormatsITS/TrackITS.h"
3536#include " ReconstructionDataFormats/Vertex.h"
3637
3738#include " ITStrackingGPU/TrackerTraitsGPU.h"
3839#include " ITStrackingGPU/TrackingKernels.h"
3940#include " ITStrackingGPU/Utils.h"
4041
41- #include " GPUCommonHelpers.h"
42-
4342#ifndef __HIPCC__
4443#define THRUST_NAMESPACE thrust::cuda
4544#else
@@ -67,6 +66,37 @@ GPUd() float Sq(float v)
6766namespace gpu
6867{
6968
69+ template <typename T>
70+ class TypedAllocator : public thrust ::device_allocator<T>
71+ {
72+ public:
73+ using value_type = T;
74+ using pointer = T*;
75+
76+ template <typename U>
77+ struct rebind {
78+ using other = TypedAllocator<U>;
79+ };
80+
81+ explicit TypedAllocator (ExternalAllocator* allocPtr)
82+ : mInternalAllocator(allocPtr) {}
83+
84+ T* allocate (size_t n)
85+ {
86+ return reinterpret_cast <T*>(mInternalAllocator ->allocate (n * sizeof (T)));
87+ }
88+
89+ void deallocate (T* p, size_t n)
90+ {
91+ char * raw_ptr = reinterpret_cast <char *>(p);
92+ size_t bytes = n * sizeof (T);
93+ mInternalAllocator ->deallocate (raw_ptr, bytes); // redundant as internal dealloc is no-op.
94+ }
95+
96+ private:
97+ ExternalAllocator* mInternalAllocator ;
98+ };
99+
70100GPUd () const int4 getBinsRect (const Cluster& currentCluster, const int layerIndex,
71101 const o2::its::IndexTableUtils& utils,
72102 const float z1, const float z2, float maxdeltaz, float maxdeltaphi)
@@ -1146,18 +1176,19 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
11461176int filterCellNeighboursHandler (std::vector<int >& neighHost, // TODO: eventually remove this!
11471177 gpuPair<int , int >* cellNeighbourPairs,
11481178 int * cellNeighbours,
1149- unsigned int nNeigh)
1179+ unsigned int nNeigh,
1180+ o2::its::ExternalAllocator* allocator)
11501181{
11511182 thrust::device_ptr<gpuPair<int , int >> neighVectorPairs (cellNeighbourPairs);
11521183 thrust::device_ptr<int > validNeighs (cellNeighbours);
1153- thrust::device_vector<int > keys (nNeigh); // TODO: externally allocate.
1154- thrust::device_vector<int > vals (nNeigh); // TODO: externally allocate.
1184+ thrust::device_vector<int > keys (nNeigh, gpu::TypedAllocator< int >(allocator));
1185+ thrust::device_vector<int > vals (nNeigh, gpu::TypedAllocator< int >(allocator));
11551186 thrust::copy (thrust::make_transform_iterator (neighVectorPairs, gpu::pair_to_second<int , int >()),
11561187 thrust::make_transform_iterator (neighVectorPairs + nNeigh, gpu::pair_to_second<int , int >()),
11571188 keys.begin ());
11581189 thrust::sequence (vals.begin (), vals.end ());
11591190 thrust::sort_by_key (keys.begin (), keys.end (), vals.begin ());
1160- thrust::device_vector<gpuPair<int , int >> sortedNeigh (nNeigh);
1191+ thrust::device_vector<gpuPair<int , int >> sortedNeigh (nNeigh, gpu::TypedAllocator<gpuPair< int , int >>(allocator) );
11611192 thrust::copy (thrust::make_permutation_iterator (neighVectorPairs, vals.begin ()),
11621193 thrust::make_permutation_iterator (neighVectorPairs, vals.end ()),
11631194 sortedNeigh.begin ());
@@ -1182,6 +1213,7 @@ void processNeighboursHandler(const int startLayer,
11821213 gsl::span<int *> neighboursDeviceLUTs,
11831214 const TrackingFrameInfo** foundTrackingFrameInfo,
11841215 std::vector<CellSeed>& seedsHost,
1216+ o2::its::ExternalAllocator* allocator,
11851217 const float bz,
11861218 const float maxChi2ClusterAttachment,
11871219 const float maxChi2NDF,
@@ -1190,11 +1222,13 @@ void processNeighboursHandler(const int startLayer,
11901222 const int nBlocks,
11911223 const int nThreads)
11921224{
1193- thrust::device_vector<int > foundSeedsTable (nCells[startLayer] + 1 ); // Shortcut: device_vector skips central memory management, we are relying on the contingency. TODO: fix this.
1194- // thrust::device_vector<int> lastCellIds(lastCellIdHost);
1195- // thrust::device_vector<CellSeed> lastCellSeed(lastCellSeedHost);
1196- thrust::device_vector<int > lastCellId, updatedCellId;
1197- thrust::device_vector<CellSeed> lastCellSeed, updatedCellSeed;
1225+ // thrust::device_vector<int> lastCellIds(lastCellIdHost);
1226+ // thrust::device_vector<CellSeed> lastCellSeed(lastCellSeedHost);
1227+ auto allocInt = gpu::TypedAllocator<int >(allocator);
1228+ auto allocCellSeed = gpu::TypedAllocator<CellSeed>(allocator);
1229+ thrust::device_vector<int > foundSeedsTable (nCells[startLayer] + 1 );
1230+ thrust::device_vector<int , gpu::TypedAllocator<int >> lastCellId (allocInt), updatedCellId (allocInt);
1231+ thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> lastCellSeed (allocCellSeed), updatedCellSeed (allocCellSeed);
11981232 gpu::processNeighboursKernel<true ><<<nBlocks, nThreads>>> (startLayer,
11991233 startLevel,
12001234 allCellSeeds,
@@ -1255,8 +1289,8 @@ void processNeighboursHandler(const int startLayer,
12551289 temp_storage_bytes = 0 ;
12561290 lastCellSeed.swap (updatedCellSeed);
12571291 lastCellId.swap (updatedCellId);
1258- thrust::device_vector<CellSeed>( ).swap (updatedCellSeed);
1259- thrust::device_vector<int >( ).swap (updatedCellId);
1292+ thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>>(allocCellSeed ).swap (updatedCellSeed);
1293+ thrust::device_vector<int , gpu::TypedAllocator< int >>(allocInt ).swap (updatedCellId);
12601294 auto lastCellSeedSize{lastCellSeed.size ()};
12611295 foundSeedsTable.resize (nCells[iLayer] + 1 );
12621296 thrust::fill (foundSeedsTable.begin (), foundSeedsTable.end (), 0 );
@@ -1316,7 +1350,7 @@ void processNeighboursHandler(const int startLayer,
13161350 matCorrType);
13171351 GPUChkErrS (cudaFree (d_temp_storage));
13181352 }
1319- thrust::device_vector<CellSeed> outSeeds (updatedCellSeed.size ());
1353+ thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> outSeeds (updatedCellSeed.size (), allocCellSeed );
13201354 auto end = thrust::copy_if (updatedCellSeed.begin (), updatedCellSeed.end (), outSeeds.begin (), gpu::seed_selector (1 .e3 , maxChi2NDF * ((startLevel + 2 ) * 2 - 5 )));
13211355 auto s{end - outSeeds.begin ()};
13221356 std::vector<CellSeed> outSeedsHost (s);
@@ -1429,6 +1463,7 @@ template void processNeighboursHandler<7>(const int startLayer,
14291463 gsl::span<int *> neighboursDeviceLUTs,
14301464 const TrackingFrameInfo** foundTrackingFrameInfo,
14311465 std::vector<CellSeed>& seedsHost,
1466+ o2::its::ExternalAllocator*,
14321467 const float bz,
14331468 const float maxChi2ClusterAttachment,
14341469 const float maxChi2NDF,
0 commit comments