Skip to content

Commit e9fcf75

Browse files
authored
Merge a129ab1 into sapling-pr-archive-ktf
2 parents 0e1076d + a129ab1 commit e9fcf75

File tree

30 files changed

+1019
-194
lines changed

30 files changed

+1019
-194
lines changed
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2+
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3+
// All rights not expressly granted are reserved.
4+
//
5+
// This software is distributed under the terms of the GNU General Public
6+
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7+
//
8+
// In applying this license CERN does not waive the privileges and immunities
9+
// granted to it by virtue of its status as an Intergovernmental Organization
10+
// or submit itself to any jurisdiction.
11+
12+
/// \brief Helper interface to the GPU device, meant to be compatible with manual allocation/streams and GPUReconstruction ones.
13+
/// \author [email protected]
14+
15+
#ifndef DCAFITTER_GPU_INTERFACE
16+
#define DCAFITTER_GPU_INTERFACE
17+
18+
#include <thread>
19+
#include <vector>
20+
#include <atomic>
21+
22+
namespace o2
23+
{
24+
namespace vertexing
25+
{
26+
namespace device
27+
{
28+
29+
#if !defined(__HIPCC__) && !defined(__CUDACC__)
30+
typedef struct _dummyStream {
31+
} Stream;
32+
#else
33+
#ifdef __HIPCC__
34+
typedef hipStream_t Stream;
35+
#else
36+
typedef cudaStream_t Stream;
37+
#endif
38+
#endif
39+
40+
class GPUInterface
41+
{
42+
public:
43+
GPUInterface(GPUInterface& other) = delete;
44+
void operator=(const GPUInterface&) = delete;
45+
46+
static GPUInterface* Instance();
47+
48+
// APIs
49+
void registerBuffer(void*, size_t);
50+
void unregisterBuffer(void* addr);
51+
void allocDevice(void**, size_t);
52+
void freeDevice(void*);
53+
Stream& getStream(unsigned short N = 0);
54+
Stream& getNextStream();
55+
56+
protected:
57+
GPUInterface(size_t N = 1);
58+
~GPUInterface();
59+
60+
void resize(size_t);
61+
62+
std::atomic<unsigned short> mLastUsedStream{0};
63+
static GPUInterface* sGPUInterface;
64+
std::vector<std::thread> mPool{};
65+
std::vector<Stream> mStreams{};
66+
};
67+
68+
} // namespace device
69+
} // namespace vertexing
70+
} // namespace o2
71+
#endif

Common/DCAFitter/GPU/cuda/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,15 @@
1212
o2_add_library(DCAFitterCUDA
1313
TARGETVARNAME targetName
1414
SOURCES DCAFitterN.cu
15+
GPUInterface.cu
1516
PUBLIC_INCLUDE_DIRECTORIES ../../include
17+
PUBLIC_INCLUDE_DIRECTORIES ../
1618
PUBLIC_LINK_LIBRARIES O2::MathUtils
1719
O2::ReconstructionDataFormats
1820
O2::DetectorsBase
1921
PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider)
2022
set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
23+
# add_compile_options(-lineinfo)
2124

2225
o2_add_test(DCAFitterNCUDA
2326
SOURCES test/testDCAFitterNGPU.cxx

Common/DCAFitter/GPU/cuda/DCAFitterN.cu

Lines changed: 128 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@
1515
#include <cuda.h>
1616
#endif
1717

18+
#include <numeric>
19+
1820
#include "GPUCommonDef.h"
1921
#include "DCAFitter/DCAFitterN.h"
20-
// #include "MathUtils/SMatrixGPU.h"
22+
#include "DeviceInterface/GPUInterface.h"
2123

2224
#define gpuCheckError(x) \
2325
{ \
@@ -54,17 +56,25 @@ GPUg() void printKernel(Fitter* fitter)
5456
}
5557
}
5658

59+
template <typename Fitter>
60+
GPUg() void initFitters(Fitter* fitters, unsigned int off, unsigned int N)
61+
{
62+
for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x + 1}; iThread < N; iThread += blockDim.x * gridDim.x) {
63+
fitters[iThread + off] = fitters[off];
64+
}
65+
}
66+
5767
template <typename Fitter, typename... Tr>
5868
GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
5969
{
6070
*res = fitter->process(*tracks...);
6171
}
6272

6373
template <typename Fitter, typename... Tr>
64-
GPUg() void processBulkKernel(Fitter* fitters, int* results, unsigned int N, Tr*... tracks)
74+
GPUg() void processBatchKernel(Fitter* fitters, int* results, unsigned int off, unsigned int N, Tr*... tracks)
6575
{
6676
for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
67-
results[iThread] = fitters[iThread].process(tracks[iThread]...);
77+
results[iThread + off] = fitters[iThread + off].process(tracks[iThread + off]...);
6878
}
6979
}
7080

@@ -131,64 +141,138 @@ int process(const int nBlocks,
131141
}
132142

133143
template <typename Fitter, class... Tr>
134-
std::vector<int> processBulk(const int nBlocks,
135-
const int nThreads,
136-
std::vector<Fitter>& fitters,
137-
std::vector<Tr>&... args)
144+
void processBulk(const int nBlocks,
145+
const int nThreads,
146+
const int nBatches,
147+
std::vector<Fitter>& fitters,
148+
std::vector<int>& results,
149+
std::vector<Tr>&... args)
138150
{
139-
kernel::warmUpGpuKernel<<<1, 1>>>();
151+
auto* gpuInterface = GPUInterface::Instance();
152+
kernel::warmUpGpuKernel<<<1, 1, 0, gpuInterface->getNextStream()>>>();
140153
141-
cudaEvent_t start, stop;
142-
gpuCheckError(cudaEventCreate(&start));
143-
gpuCheckError(cudaEventCreate(&stop));
144-
const auto nFits{fitters.size()}; // for clarity: size of all the vectors needs to be equal, not enforcing it here yet.
145-
std::vector<int> results(nFits);
146-
int* results_device;
147-
Fitter* fitters_device;
148-
std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
154+
// Benchmarking events
155+
std::vector<float> ioUp(nBatches), ioDown(nBatches), kerElapsed(nBatches);
156+
std::vector<cudaEvent_t> startIOUp(nBatches), endIOUp(nBatches), startIODown(nBatches), endIODown(nBatches), startKer(nBatches), endKer(nBatches);
157+
for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
158+
gpuCheckError(cudaEventCreate(&startIOUp[iBatch]));
159+
gpuCheckError(cudaEventCreate(&endIOUp[iBatch]));
160+
gpuCheckError(cudaEventCreate(&startIODown[iBatch]));
161+
gpuCheckError(cudaEventCreate(&endIODown[iBatch]));
162+
gpuCheckError(cudaEventCreate(&startKer[iBatch]));
163+
gpuCheckError(cudaEventCreate(&endKer[iBatch]));
164+
}
149165
166+
// Tracks
167+
std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
150168
int iArg{0};
151169
([&] {
152-
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size()));
153-
gpuCheckError(cudaMemcpy(tracks_device[iArg], args.data(), sizeof(Tr) * args.size(), cudaMemcpyHostToDevice));
170+
gpuInterface->registerBuffer(reinterpret_cast<void*>(args.data()), sizeof(Tr) * args.size());
171+
gpuInterface->allocDevice(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size());
154172
++iArg;
155173
}(),
156174
...);
157-
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&results_device), sizeof(int) * nFits));
158-
gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * nFits));
159-
gpuCheckError(cudaMemcpy(fitters_device, fitters.data(), sizeof(Fitter) * nFits, cudaMemcpyHostToDevice));
160175
161-
gpuCheckError(cudaEventRecord(start));
162-
std::apply([&](auto&&... args) { kernel::processBulkKernel<<<nBlocks, nThreads>>>(fitters_device, results_device, nFits, args...); }, tracks_device);
163-
gpuCheckError(cudaEventRecord(stop));
176+
// Fitters
177+
gpuInterface->registerBuffer(reinterpret_cast<void*>(fitters.data()), sizeof(Fitter) * fitters.size());
178+
Fitter* fitters_device;
179+
gpuInterface->allocDevice(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * fitters.size());
164180
165-
gpuCheckError(cudaPeekAtLastError());
166-
gpuCheckError(cudaDeviceSynchronize());
181+
// Results
182+
gpuInterface->registerBuffer(reinterpret_cast<void*>(results.data()), sizeof(int) * fitters.size());
183+
int* results_device;
184+
gpuInterface->allocDevice(reinterpret_cast<void**>(&results_device), sizeof(int) * fitters.size());
167185
168-
gpuCheckError(cudaMemcpy(results.data(), results_device, sizeof(int) * results.size(), cudaMemcpyDeviceToHost));
169-
gpuCheckError(cudaMemcpy(fitters.data(), fitters_device, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost));
186+
// R.R. Computation
187+
int totalSize = fitters.size();
188+
int batchSize = totalSize / nBatches;
189+
int remainder = totalSize % nBatches;
170190
171-
iArg = 0;
172-
([&] {
173-
gpuCheckError(cudaMemcpy(args.data(), tracks_device[iArg], sizeof(Tr) * args.size(), cudaMemcpyDeviceToHost));
174-
gpuCheckError(cudaFree(tracks_device[iArg]));
175-
++iArg;
176-
}(),
177-
...);
191+
for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
192+
auto& stream = gpuInterface->getNextStream();
193+
auto offset = iBatch * batchSize + std::min(iBatch, remainder);
194+
auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
195+
196+
gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
197+
gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) /* * nFits */, cudaMemcpyHostToDevice, stream)); // copying just the first element of the buffer
198+
iArg = 0;
199+
([&] {
200+
gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream));
201+
++iArg;
202+
}(),
203+
...);
204+
gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
205+
206+
gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
207+
kernel::initFitters<<<nBlocks, nThreads, 0, stream>>>(fitters_device, offset, nFits);
208+
std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
209+
gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
210+
211+
gpuCheckError(cudaPeekAtLastError());
212+
iArg = 0;
213+
gpuCheckError(cudaEventRecord(startIODown[iBatch], stream));
214+
([&] {
215+
gpuCheckError(cudaMemcpyAsync(args.data() + offset, tracks_device[iArg] + offset, sizeof(Tr) * nFits, cudaMemcpyDeviceToHost, stream));
216+
++iArg;
217+
}(),
218+
...);
219+
220+
gpuCheckError(cudaMemcpyAsync(fitters.data() + offset, fitters_device + offset, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost, stream));
221+
gpuCheckError(cudaMemcpyAsync(results.data() + offset, results_device + offset, sizeof(int) * nFits, cudaMemcpyDeviceToHost, stream));
222+
gpuCheckError(cudaEventRecord(endIODown[iBatch], stream));
223+
}
224+
225+
([&] { gpuInterface->unregisterBuffer(args.data()); }(), ...);
226+
227+
for (auto* tracksD : tracks_device) {
228+
gpuInterface->freeDevice(tracksD);
229+
}
178230
179-
gpuCheckError(cudaFree(fitters_device));
180-
gpuCheckError(cudaFree(results_device));
181-
gpuCheckError(cudaEventSynchronize(stop));
231+
gpuInterface->freeDevice(fitters_device);
232+
gpuInterface->freeDevice(results_device);
233+
gpuInterface->unregisterBuffer(fitters.data());
234+
gpuInterface->unregisterBuffer(results.data());
182235
183-
float milliseconds = 0;
184-
gpuCheckError(cudaEventElapsedTime(&milliseconds, start, stop));
236+
// Do benchmarks
237+
gpuCheckError(cudaDeviceSynchronize());
238+
for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
239+
gpuCheckError(cudaEventElapsedTime(&ioUp[iBatch], startIOUp[iBatch], endIOUp[iBatch]));
240+
gpuCheckError(cudaEventElapsedTime(&kerElapsed[iBatch], startKer[iBatch], endKer[iBatch]));
241+
gpuCheckError(cudaEventElapsedTime(&ioDown[iBatch], startIODown[iBatch], endIODown[iBatch]));
242+
}
185243
186-
LOGP(info, "Kernel run in: {} ms using {} blocks and {} threads.", milliseconds, nBlocks, nThreads);
187-
return results;
244+
float totalUp = std::accumulate(ioUp.begin(), ioUp.end(), 0.f);
245+
float totalDown = std::accumulate(ioDown.begin(), ioDown.end(), 0.f);
246+
float totalKernels = std::accumulate(kerElapsed.begin(), kerElapsed.end(), 0.f);
247+
LOGP(info, "Config: {} batches, {} blocks, {} threads", nBatches, nBlocks, nThreads);
248+
LOGP(info, "Total I/O time: Up {} ms Avg {} ms, Down {} ms Avg {} ms", totalUp, totalUp / float(nBatches), totalDown, totalDown / (float)nBatches);
249+
LOGP(info, "Total Kernel time: {} ms Avg {} ms", totalKernels, totalKernels / (float)nBatches);
250+
251+
for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
252+
gpuCheckError(cudaEventDestroy(startIOUp[iBatch]));
253+
gpuCheckError(cudaEventDestroy(endIOUp[iBatch]));
254+
gpuCheckError(cudaEventDestroy(startIODown[iBatch]));
255+
gpuCheckError(cudaEventDestroy(endIODown[iBatch]));
256+
gpuCheckError(cudaEventDestroy(startKer[iBatch]));
257+
gpuCheckError(cudaEventDestroy(endKer[iBatch]));
258+
}
188259
}
189260
190-
template std::vector<int> processBulk(const int, const int, std::vector<o2::vertexing::DCAFitterN<2>>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&);
191-
template std::vector<int> processBulk(const int, const int, std::vector<o2::vertexing::DCAFitterN<3>>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&);
261+
template void processBulk(const int,
262+
const int,
263+
const int,
264+
std::vector<o2::vertexing::DCAFitterN<2>>&,
265+
std::vector<int>&,
266+
std::vector<o2::track::TrackParCov>&,
267+
std::vector<o2::track::TrackParCov>&);
268+
template void processBulk(const int,
269+
const int,
270+
const int,
271+
std::vector<o2::vertexing::DCAFitterN<3>>&,
272+
std::vector<int>&,
273+
std::vector<o2::track::TrackParCov>&,
274+
std::vector<o2::track::TrackParCov>&,
275+
std::vector<o2::track::TrackParCov>&);
192276
template int process(const int, const int, o2::vertexing::DCAFitterN<2>&, o2::track::TrackParCov&, o2::track::TrackParCov&);
193277
template int process(const int, const int, o2::vertexing::DCAFitterN<3>&, o2::track::TrackParCov&, o2::track::TrackParCov&, o2::track::TrackParCov&);
194278
template void print(const int, const int, o2::vertexing::DCAFitterN<2>&);

0 commit comments

Comments
 (0)