Skip to content

Commit a0a2c48

Browse files
committed
Checkpoint
1 parent b0b0fa3 commit a0a2c48

File tree

8 files changed

+70
-25
lines changed

8 files changed

+70
-25
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,12 @@
1616
#include "DetectorsBase/Propagator.h"
1717
#include "GPUCommonDef.h"
1818

19-
namespace o2::its
19+
namespace o2
20+
{
21+
namespace its
2022
{
2123
class CellSeed;
24+
class ExternalAllocator;
2225
namespace gpu
2326
{
2427
#ifdef GPUCA_GPUCODE // GPUg() global kernels must only when compiled by GPU compiler
@@ -179,7 +182,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
179182
int filterCellNeighboursHandler(std::vector<int>&,
180183
gpuPair<int, int>*,
181184
int*,
182-
unsigned int);
185+
unsigned int,
186+
o2::its::ExternalAllocator* = nullptr);
183187

184188
template <int nLayers = 7>
185189
void processNeighboursHandler(const int startLayer,
@@ -192,6 +196,7 @@ void processNeighboursHandler(const int startLayer,
192196
gsl::span<int*> neighboursDeviceLUTs,
193197
const TrackingFrameInfo** foundTrackingFrameInfo,
194198
std::vector<CellSeed>& seedsHost,
199+
o2::its::ExternalAllocator*,
195200
const float bz,
196201
const float MaxChi2ClusterAttachment,
197202
const float maxChi2NDF,
@@ -213,5 +218,6 @@ void trackSeedHandler(CellSeed* trackSeeds,
213218
const o2::base::PropagatorF::MatCorrType matCorrType,
214219
const int nBlocks,
215220
const int nThreads);
216-
} // namespace o2::its
221+
} // namespace its
222+
} // namespace o2
217223
#endif // ITSTRACKINGGPU_TRACKINGKERNELS_H_

Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
if(CUDA_ENABLED)
1414
find_package(CUDAToolkit)
1515
message(STATUS "Building ITS CUDA tracker")
16-
# add_compile_options(-O0 -g -lineinfo -fPIC)
16+
add_compile_options(-O0 -g -lineinfo -fPIC)
1717
# add_compile_definitions(ITS_MEASURE_GPU_TIME)
1818
o2_add_library(ITStrackingCUDA
1919
SOURCES ClusterLinesGPU.cu

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "ITStrackingGPU/TrackerTraitsGPU.h"
2222
#include "ITStrackingGPU/TrackingKernels.h"
2323
#include "ITStracking/TrackingConfigParam.h"
24+
2425
namespace o2::its
2526
{
2627
constexpr int UnusedIndex{-1};
@@ -253,7 +254,8 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
253254
filterCellNeighboursHandler(mTimeFrameGPU->getCellsNeighbours()[iLayer],
254255
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
255256
mTimeFrameGPU->getDeviceNeighbours(iLayer),
256-
nNeigh);
257+
nNeigh,
258+
mTimeFrameGPU->getExternalAllocator());
257259
}
258260
mTimeFrameGPU->createNeighboursDeviceArray();
259261
mTimeFrameGPU->unregisterRest();
@@ -283,6 +285,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
283285
mTimeFrameGPU->getDeviceNeighboursLUTs(),
284286
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
285287
trackSeeds,
288+
mTimeFrameGPU->getExternalAllocator(),
286289
mBz,
287290
mTrkParams[0].MaxChi2ClusterAttachment,
288291
mTrkParams[0].MaxChi2NDF,

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu

Lines changed: 49 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,14 @@
3131
#include "ITStracking/Configuration.h"
3232
#include "ITStracking/IndexTableUtils.h"
3333
#include "ITStracking/MathUtils.h"
34+
#include "ITStracking/ExternalAllocator.h"
3435
#include "DataFormatsITS/TrackITS.h"
3536
#include "ReconstructionDataFormats/Vertex.h"
3637

3738
#include "ITStrackingGPU/TrackerTraitsGPU.h"
3839
#include "ITStrackingGPU/TrackingKernels.h"
3940
#include "ITStrackingGPU/Utils.h"
4041

41-
#include "GPUCommonHelpers.h"
42-
4342
#ifndef __HIPCC__
4443
#define THRUST_NAMESPACE thrust::cuda
4544
#else
@@ -67,6 +66,37 @@ GPUd() float Sq(float v)
6766
namespace gpu
6867
{
6968

69+
template <typename T>
70+
class TypedAllocator : public thrust::device_allocator<T>
71+
{
72+
public:
73+
using value_type = T;
74+
using pointer = T*;
75+
76+
template <typename U>
77+
struct rebind {
78+
using other = TypedAllocator<U>;
79+
};
80+
81+
explicit TypedAllocator(ExternalAllocator* allocPtr)
82+
: mInternalAllocator(allocPtr) {}
83+
84+
T* allocate(size_t n)
85+
{
86+
return reinterpret_cast<T*>(mInternalAllocator->allocate(n * sizeof(T)));
87+
}
88+
89+
void deallocate(T* p, size_t n)
90+
{
91+
char* raw_ptr = reinterpret_cast<char*>(p);
92+
size_t bytes = n * sizeof(T);
93+
mInternalAllocator->deallocate(raw_ptr, bytes); // redundant as internal dealloc is no-op.
94+
}
95+
96+
private:
97+
ExternalAllocator* mInternalAllocator;
98+
};
99+
70100
GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex,
71101
const o2::its::IndexTableUtils& utils,
72102
const float z1, const float z2, float maxdeltaz, float maxdeltaphi)
@@ -1146,18 +1176,19 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
11461176
int filterCellNeighboursHandler(std::vector<int>& neighHost, // TODO: eventually remove this!
11471177
gpuPair<int, int>* cellNeighbourPairs,
11481178
int* cellNeighbours,
1149-
unsigned int nNeigh)
1179+
unsigned int nNeigh,
1180+
o2::its::ExternalAllocator* allocator)
11501181
{
11511182
thrust::device_ptr<gpuPair<int, int>> neighVectorPairs(cellNeighbourPairs);
11521183
thrust::device_ptr<int> validNeighs(cellNeighbours);
1153-
thrust::device_vector<int> keys(nNeigh); // TODO: externally allocate.
1154-
thrust::device_vector<int> vals(nNeigh); // TODO: externally allocate.
1184+
thrust::device_vector<int> keys(nNeigh, gpu::TypedAllocator<int>(allocator));
1185+
thrust::device_vector<int> vals(nNeigh, gpu::TypedAllocator<int>(allocator));
11551186
thrust::copy(thrust::make_transform_iterator(neighVectorPairs, gpu::pair_to_second<int, int>()),
11561187
thrust::make_transform_iterator(neighVectorPairs + nNeigh, gpu::pair_to_second<int, int>()),
11571188
keys.begin());
11581189
thrust::sequence(vals.begin(), vals.end());
11591190
thrust::sort_by_key(keys.begin(), keys.end(), vals.begin());
1160-
thrust::device_vector<gpuPair<int, int>> sortedNeigh(nNeigh);
1191+
thrust::device_vector<gpuPair<int, int>> sortedNeigh(nNeigh, gpu::TypedAllocator<gpuPair<int, int>>(allocator));
11611192
thrust::copy(thrust::make_permutation_iterator(neighVectorPairs, vals.begin()),
11621193
thrust::make_permutation_iterator(neighVectorPairs, vals.end()),
11631194
sortedNeigh.begin());
@@ -1182,6 +1213,7 @@ void processNeighboursHandler(const int startLayer,
11821213
gsl::span<int*> neighboursDeviceLUTs,
11831214
const TrackingFrameInfo** foundTrackingFrameInfo,
11841215
std::vector<CellSeed>& seedsHost,
1216+
o2::its::ExternalAllocator* allocator,
11851217
const float bz,
11861218
const float maxChi2ClusterAttachment,
11871219
const float maxChi2NDF,
@@ -1190,11 +1222,13 @@ void processNeighboursHandler(const int startLayer,
11901222
const int nBlocks,
11911223
const int nThreads)
11921224
{
1193-
thrust::device_vector<int> foundSeedsTable(nCells[startLayer] + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency. TODO: fix this.
1194-
// thrust::device_vector<int> lastCellIds(lastCellIdHost);
1195-
// thrust::device_vector<CellSeed> lastCellSeed(lastCellSeedHost);
1196-
thrust::device_vector<int> lastCellId, updatedCellId;
1197-
thrust::device_vector<CellSeed> lastCellSeed, updatedCellSeed;
1225+
// thrust::device_vector<int> lastCellIds(lastCellIdHost);
1226+
// thrust::device_vector<CellSeed> lastCellSeed(lastCellSeedHost);
1227+
auto allocInt = gpu::TypedAllocator<int>(allocator);
1228+
auto allocCellSeed = gpu::TypedAllocator<CellSeed>(allocator);
1229+
thrust::device_vector<int> foundSeedsTable(nCells[startLayer] + 1);
1230+
thrust::device_vector<int, gpu::TypedAllocator<int>> lastCellId(allocInt), updatedCellId(allocInt);
1231+
thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> lastCellSeed(allocCellSeed), updatedCellSeed(allocCellSeed);
11981232
gpu::processNeighboursKernel<true><<<nBlocks, nThreads>>>(startLayer,
11991233
startLevel,
12001234
allCellSeeds,
@@ -1255,8 +1289,8 @@ void processNeighboursHandler(const int startLayer,
12551289
temp_storage_bytes = 0;
12561290
lastCellSeed.swap(updatedCellSeed);
12571291
lastCellId.swap(updatedCellId);
1258-
thrust::device_vector<CellSeed>().swap(updatedCellSeed);
1259-
thrust::device_vector<int>().swap(updatedCellId);
1292+
thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>>(allocCellSeed).swap(updatedCellSeed);
1293+
thrust::device_vector<int, gpu::TypedAllocator<int>>(allocInt).swap(updatedCellId);
12601294
auto lastCellSeedSize{lastCellSeed.size()};
12611295
foundSeedsTable.resize(nCells[iLayer] + 1);
12621296
thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0);
@@ -1316,7 +1350,7 @@ void processNeighboursHandler(const int startLayer,
13161350
matCorrType);
13171351
GPUChkErrS(cudaFree(d_temp_storage));
13181352
}
1319-
thrust::device_vector<CellSeed> outSeeds(updatedCellSeed.size());
1353+
thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> outSeeds(updatedCellSeed.size(), allocCellSeed);
13201354
auto end = thrust::copy_if(updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5)));
13211355
auto s{end - outSeeds.begin()};
13221356
std::vector<CellSeed> outSeedsHost(s);
@@ -1429,6 +1463,7 @@ template void processNeighboursHandler<7>(const int startLayer,
14291463
gsl::span<int*> neighboursDeviceLUTs,
14301464
const TrackingFrameInfo** foundTrackingFrameInfo,
14311465
std::vector<CellSeed>& seedsHost,
1466+
o2::its::ExternalAllocator*,
14321467
const float bz,
14331468
const float maxChi2ClusterAttachment,
14341469
const float maxChi2NDF,

Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,12 @@
2828
#endif
2929

3030
#if defined(__CUDACC__) || defined(__HIPCC__)
31-
#ifdef __CUDACC__
32-
#include <cuda_runtime.h>
31+
#if defined(__CUDACC__)
3332
typedef cudaStream_t GPUStream;
34-
#else // __HIPCC__
33+
#else
34+
#ifndef GPUCA_GPUCODE_DEVICE
3535
#include <hip/hip_runtime.h>
36+
#endif
3637
typedef hipStream_t GPUStream;
3738
#endif
3839
#else

Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ class ExternalAllocator
2323
{
2424
public:
2525
virtual void* allocate(size_t) = 0;
26+
virtual void deallocate(char*, size_t) = 0;
2627
};
27-
2828
} // namespace o2::its
2929

3030
#endif

Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ class TimeFrame
232232
LOGP(debug, "External allocator is currently only supported for GPU");
233233
}
234234
}
235-
235+
ExternalAllocator* getExternalAllocator() { return mAllocator; }
236236
virtual void setDevicePropagator(const o2::base::PropagatorImpl<float>*)
237237
{
238238
return;

GPU/GPUTracking/Global/GPUChainITS.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class GPUFrameworkExternalAllocator final : public o2::its::ExternalAllocator
2929
{
3030
return mFWReco->AllocateUnmanagedMemory(size, GPUMemoryResource::MEMORY_GPU);
3131
}
32-
32+
void deallocate(char* ptr, size_t) {}
3333
void setReconstructionFramework(o2::gpu::GPUReconstruction* fwr) { mFWReco = fwr; }
3434

3535
private:

0 commit comments

Comments
 (0)