 #include <cuda.h>
 #endif

+#include <numeric>
+
 #include "GPUCommonDef.h"
 #include "DCAFitter/DCAFitterN.h"
-// #include "MathUtils/SMatrixGPU.h"
+#include "DeviceInterface/GPUInterface.h"

 #define gpuCheckError(x) \
   { \
@@ -54,17 +56,25 @@ GPUg() void printKernel(Fitter* fitter) |
   }
 }

+template <typename Fitter>
+GPUg() void initFitters(Fitter* fitters, unsigned int off, unsigned int N)
+{
+  for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x + 1}; iThread < N; iThread += blockDim.x * gridDim.x) {
+    fitters[iThread + off] = fitters[off];
+  }
+}
+
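The `initFitters` kernel pairs with the per-batch host copy further down, which uploads only `sizeof(Fitter)` (the first element of each batch window): the kernel then fans that prototype out to the remaining `N - 1` slots on-device. The grid-stride loop starts at thread index + 1 because slot `off` already holds the host-initialised object. A minimal, self-contained sketch of this replicate-one-to-many pattern, using a hypothetical trivially copyable `PodFitter` in place of `DCAFitterN`:

```cuda
#include <cuda_runtime.h>

struct PodFitter { // hypothetical stand-in for the real fitter
  float bz;        // e.g. a configuration value shared by all fits
};

__global__ void initFitters(PodFitter* fitters, unsigned int off, unsigned int N)
{
  // Start at +1: slot `off` already holds the host-initialised prototype.
  for (auto i{blockIdx.x * blockDim.x + threadIdx.x + 1}; i < N; i += blockDim.x * gridDim.x) {
    fitters[i + off] = fitters[off];
  }
}

int main()
{
  constexpr unsigned int N{1024};
  PodFitter proto{5.f};
  PodFitter* fitters_device;
  cudaMalloc(&fitters_device, N * sizeof(PodFitter));
  // Upload only the first element; the kernel replicates it on-device.
  cudaMemcpy(fitters_device, &proto, sizeof(PodFitter), cudaMemcpyHostToDevice);
  initFitters<<<4, 256>>>(fitters_device, 0, N);
  cudaDeviceSynchronize();
  cudaFree(fitters_device);
  return 0;
}
```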
 template <typename Fitter, typename... Tr>
 GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks)
 {
   *res = fitter->process(*tracks...);
 }

 template <typename Fitter, typename... Tr>
-GPUg() void processBulkKernel(Fitter* fitters, int* results, unsigned int N, Tr*... tracks)
+GPUg() void processBatchKernel(Fitter* fitters, int* results, unsigned int off, unsigned int N, Tr*... tracks)
 {
   for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) {
-    results[iThread] = fitters[iThread].process(tracks[iThread]...);
+    results[iThread + off] = fitters[iThread + off].process(tracks[iThread + off]...);
   }
 }

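The renamed kernel now operates on a window `[off, off + N)` of the full device buffers: each thread's batch-local index is shifted by the batch offset, while the grid-stride keeps the kernel correct for any launch geometry. An illustrative kernel (not from this PR) reduced to the bare offset grid-stride pattern:

```cuda
// Each thread strides over the batch window [off, off + N) of a larger buffer.
__global__ void scaleBatch(float* data, unsigned int off, unsigned int N, float k)
{
  for (auto i{blockIdx.x * blockDim.x + threadIdx.x}; i < N; i += blockDim.x * gridDim.x) {
    data[i + off] *= k; // global index = batch-local index + batch offset
  }
}
```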
@@ -131,64 +141,138 @@ int process(const int nBlocks, |
 }

 template <typename Fitter, class... Tr>
-std::vector<int> processBulk(const int nBlocks,
-                             const int nThreads,
-                             std::vector<Fitter>& fitters,
-                             std::vector<Tr>&... args)
+void processBulk(const int nBlocks,
+                 const int nThreads,
+                 const int nBatches,
+                 std::vector<Fitter>& fitters,
+                 std::vector<int>& results,
+                 std::vector<Tr>&... args)
 {
-  kernel::warmUpGpuKernel<<<1, 1>>>();
+  auto* gpuInterface = GPUInterface::Instance();
+  kernel::warmUpGpuKernel<<<1, 1, 0, gpuInterface->getNextStream()>>>();

-  cudaEvent_t start, stop;
-  gpuCheckError(cudaEventCreate(&start));
-  gpuCheckError(cudaEventCreate(&stop));
-  const auto nFits{fitters.size()}; // for clarity: size of all the vectors needs to be equal, not enforcing it here yet.
-  std::vector<int> results(nFits);
-  int* results_device;
-  Fitter* fitters_device;
-  std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
+  // Benchmarking events
+  std::vector<float> ioUp(nBatches), ioDown(nBatches), kerElapsed(nBatches);
+  std::vector<cudaEvent_t> startIOUp(nBatches), endIOUp(nBatches), startIODown(nBatches), endIODown(nBatches), startKer(nBatches), endKer(nBatches);
+  for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
+    gpuCheckError(cudaEventCreate(&startIOUp[iBatch]));
+    gpuCheckError(cudaEventCreate(&endIOUp[iBatch]));
+    gpuCheckError(cudaEventCreate(&startIODown[iBatch]));
+    gpuCheckError(cudaEventCreate(&endIODown[iBatch]));
+    gpuCheckError(cudaEventCreate(&startKer[iBatch]));
+    gpuCheckError(cudaEventCreate(&endKer[iBatch]));
+  }

+  // Tracks
+  std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device;
   int iArg{0};
   ([&] {
-    gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size()));
-    gpuCheckError(cudaMemcpy(tracks_device[iArg], args.data(), sizeof(Tr) * args.size(), cudaMemcpyHostToDevice));
+    gpuInterface->registerBuffer(reinterpret_cast<void*>(args.data()), sizeof(Tr) * args.size());
+    gpuInterface->allocDevice(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size());
     ++iArg;
   }(),
    ...);
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&results_device), sizeof(int) * nFits));
-  gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * nFits));
-  gpuCheckError(cudaMemcpy(fitters_device, fitters.data(), sizeof(Fitter) * nFits, cudaMemcpyHostToDevice));

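Replacing the blocking `cudaMemcpy` calls with `registerBuffer` plus per-batch `cudaMemcpyAsync` matters because asynchronous copies only overlap with other work when the host memory is page-locked; with pageable memory they silently degrade to staged, effectively synchronous transfers. The exact implementation of `GPUInterface::registerBuffer` is not shown in this diff, but a plausible shape (assumption, not the confirmed O2 implementation) is a thin wrapper over `cudaHostRegister`:

```cpp
// Assumed sketch of what registerBuffer/unregisterBuffer may wrap;
// the actual GPUInterface implementation may differ.
void registerBuffer(void* addr, size_t numBytes)
{
  gpuCheckError(cudaHostRegister(addr, numBytes, cudaHostRegisterDefault)); // page-lock existing host memory
}

void unregisterBuffer(void* addr)
{
  gpuCheckError(cudaHostUnregister(addr)); // undo the page-locking
}
```

Registering an existing `std::vector` buffer avoids reallocating the callers' data into `cudaMallocHost` memory, at the cost of requiring that the vectors are not resized while registered.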
-  gpuCheckError(cudaEventRecord(start));
-  std::apply([&](auto&&... args) { kernel::processBulkKernel<<<nBlocks, nThreads>>>(fitters_device, results_device, nFits, args...); }, tracks_device);
-  gpuCheckError(cudaEventRecord(stop));
+  // Fitters
+  gpuInterface->registerBuffer(reinterpret_cast<void*>(fitters.data()), sizeof(Fitter) * fitters.size());
+  Fitter* fitters_device;
+  gpuInterface->allocDevice(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * fitters.size());

-  gpuCheckError(cudaPeekAtLastError());
-  gpuCheckError(cudaDeviceSynchronize());
+  // Results
+  gpuInterface->registerBuffer(reinterpret_cast<void*>(results.data()), sizeof(int) * fitters.size());
+  int* results_device;
+  gpuInterface->allocDevice(reinterpret_cast<void**>(&results_device), sizeof(int) * fitters.size());

-  gpuCheckError(cudaMemcpy(results.data(), results_device, sizeof(int) * results.size(), cudaMemcpyDeviceToHost));
-  gpuCheckError(cudaMemcpy(fitters.data(), fitters_device, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost));
+  // Round-robin (R.R.) batch partitioning
+  int totalSize = fitters.size();
+  int batchSize = totalSize / nBatches;
+  int remainder = totalSize % nBatches;

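The partitioning gives the first `remainder` batches one extra fit each, so the windows tile the whole buffer with sizes differing by at most one. A worked check of the arithmetic used in the loop below, with totalSize = 10 and nBatches = 3 (batchSize = 3, remainder = 1, hence batches of 4, 3, 3 fits at offsets 0, 4, 7):

```cpp
#include <algorithm>
#include <cstdio>

int main()
{
  int totalSize = 10, nBatches = 3;
  int batchSize = totalSize / nBatches; // 3
  int remainder = totalSize % nBatches; // 1
  for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
    int offset = iBatch * batchSize + std::min(iBatch, remainder); // 0, 4, 7
    int nFits = batchSize + (iBatch < remainder ? 1 : 0);          // 4, 3, 3
    std::printf("batch %d: offset %d, nFits %d\n", iBatch, offset, nFits);
  }
  return 0;
}
```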
-  iArg = 0;
-  ([&] {
-    gpuCheckError(cudaMemcpy(args.data(), tracks_device[iArg], sizeof(Tr) * args.size(), cudaMemcpyDeviceToHost));
-    gpuCheckError(cudaFree(tracks_device[iArg]));
-    ++iArg;
-  }(),
-   ...);
+  for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
+    auto& stream = gpuInterface->getNextStream();
+    auto offset = iBatch * batchSize + std::min(iBatch, remainder);
+    auto nFits = batchSize + (iBatch < remainder ? 1 : 0);
+
+    gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream));
+    gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) /* * nFits */, cudaMemcpyHostToDevice, stream)); // copying just the first element of the buffer
+    iArg = 0;
+    ([&] {
+      gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream));
+      ++iArg;
+    }(),
+     ...);
+    gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream));
+
+    gpuCheckError(cudaEventRecord(startKer[iBatch], stream));
+    kernel::initFitters<<<nBlocks, nThreads, 0, stream>>>(fitters_device, offset, nFits);
+    std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device);
+    gpuCheckError(cudaEventRecord(endKer[iBatch], stream));
+
+    gpuCheckError(cudaPeekAtLastError());
+    iArg = 0;
+    gpuCheckError(cudaEventRecord(startIODown[iBatch], stream));
+    ([&] {
+      gpuCheckError(cudaMemcpyAsync(args.data() + offset, tracks_device[iArg] + offset, sizeof(Tr) * nFits, cudaMemcpyDeviceToHost, stream));
+      ++iArg;
+    }(),
+     ...);
+
+    gpuCheckError(cudaMemcpyAsync(fitters.data() + offset, fitters_device + offset, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost, stream));
+    gpuCheckError(cudaMemcpyAsync(results.data() + offset, results_device + offset, sizeof(int) * nFits, cudaMemcpyDeviceToHost, stream));
+    gpuCheckError(cudaEventRecord(endIODown[iBatch], stream));
+  }
+
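Each batch enqueues its upload, kernels, and download on a single stream, so ordering within a batch is guaranteed, while batches on different streams can overlap transfers and compute (given the pinned host buffers registered above). The diff does not show how `getNextStream()` cycles streams; a hypothetical round-robin pool in that spirit (an assumption, not the real GPUInterface) could look like:

```cpp
#include <array>
#include <cstddef>
#include <cuda_runtime.h>

// Hypothetical stream pool handing out streams round-robin, mirroring the
// `auto& stream = gpuInterface->getNextStream();` usage in the loop above.
class StreamPool
{
 public:
  StreamPool()
  {
    for (auto& s : mStreams) {
      cudaStreamCreate(&s); // non-default streams, so batches can overlap
    }
  }
  ~StreamPool()
  {
    for (auto& s : mStreams) {
      cudaStreamDestroy(s);
    }
  }
  cudaStream_t& getNextStream() { return mStreams[mNext++ % mStreams.size()]; }

 private:
  std::array<cudaStream_t, 8> mStreams{}; // pool size is an arbitrary choice here
  std::size_t mNext{0};
};
```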
+  ([&] { gpuInterface->unregisterBuffer(args.data()); }(), ...);
+
+  for (auto* tracksD : tracks_device) {
+    gpuInterface->freeDevice(tracksD);
+  }

-  gpuCheckError(cudaFree(fitters_device));
-  gpuCheckError(cudaFree(results_device));
-  gpuCheckError(cudaEventSynchronize(stop));
+  gpuInterface->freeDevice(fitters_device);
+  gpuInterface->freeDevice(results_device);
+  gpuInterface->unregisterBuffer(fitters.data());
+  gpuInterface->unregisterBuffer(results.data());

-  float milliseconds = 0;
-  gpuCheckError(cudaEventElapsedTime(&milliseconds, start, stop));
+  // Collect benchmark timings
+  gpuCheckError(cudaDeviceSynchronize());
+  for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
+    gpuCheckError(cudaEventElapsedTime(&ioUp[iBatch], startIOUp[iBatch], endIOUp[iBatch]));
+    gpuCheckError(cudaEventElapsedTime(&kerElapsed[iBatch], startKer[iBatch], endKer[iBatch]));
+    gpuCheckError(cudaEventElapsedTime(&ioDown[iBatch], startIODown[iBatch], endIODown[iBatch]));
+  }

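The single `cudaDeviceSynchronize()` before the loop is what makes the per-batch queries legal: `cudaEventElapsedTime` fails with `cudaErrorNotReady` if either event has not yet completed. This is the same record/synchronize/measure pattern the removed single-pair code used, reduced here to one self-contained measurement:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

__global__ void busyKernel() {}

int main()
{
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);  // marker enqueued before the work
  busyKernel<<<1, 1>>>();
  cudaEventRecord(stop);   // marker enqueued after the work
  cudaDeviceSynchronize(); // both events must have completed before querying
  float ms{0.f};
  cudaEventElapsedTime(&ms, start, stop); // elapsed GPU time in milliseconds
  std::printf("kernel took %f ms\n", ms);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return 0;
}
```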
-  LOGP(info, "Kernel run in: {} ms using {} blocks and {} threads.", milliseconds, nBlocks, nThreads);
-  return results;
+  float totalUp = std::accumulate(ioUp.begin(), ioUp.end(), 0.f);
+  float totalDown = std::accumulate(ioDown.begin(), ioDown.end(), 0.f);
+  float totalKernels = std::accumulate(kerElapsed.begin(), kerElapsed.end(), 0.f);
+  LOGP(info, "Config: {} batches, {} blocks, {} threads", nBatches, nBlocks, nThreads);
+  LOGP(info, "Total I/O time: Up {} ms Avg {} ms, Down {} ms Avg {} ms", totalUp, totalUp / float(nBatches), totalDown, totalDown / float(nBatches));
+  LOGP(info, "Total Kernel time: {} ms Avg {} ms", totalKernels, totalKernels / float(nBatches));
+
+  for (int iBatch{0}; iBatch < nBatches; ++iBatch) {
+    gpuCheckError(cudaEventDestroy(startIOUp[iBatch]));
+    gpuCheckError(cudaEventDestroy(endIOUp[iBatch]));
+    gpuCheckError(cudaEventDestroy(startIODown[iBatch]));
+    gpuCheckError(cudaEventDestroy(endIODown[iBatch]));
+    gpuCheckError(cudaEventDestroy(startKer[iBatch]));
+    gpuCheckError(cudaEventDestroy(endKer[iBatch]));
+  }
 }

-template std::vector<int> processBulk(const int, const int, std::vector<o2::vertexing::DCAFitterN<2>>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&);
-template std::vector<int> processBulk(const int, const int, std::vector<o2::vertexing::DCAFitterN<3>>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&);
+template void processBulk(const int,
+                          const int,
+                          const int,
+                          std::vector<o2::vertexing::DCAFitterN<2>>&,
+                          std::vector<int>&,
+                          std::vector<o2::track::TrackParCov>&,
+                          std::vector<o2::track::TrackParCov>&);
+template void processBulk(const int,
+                          const int,
+                          const int,
+                          std::vector<o2::vertexing::DCAFitterN<3>>&,
+                          std::vector<int>&,
+                          std::vector<o2::track::TrackParCov>&,
+                          std::vector<o2::track::TrackParCov>&,
+                          std::vector<o2::track::TrackParCov>&);
 template int process(const int, const int, o2::vertexing::DCAFitterN<2>&, o2::track::TrackParCov&, o2::track::TrackParCov&);
 template int process(const int, const int, o2::vertexing::DCAFitterN<3>&, o2::track::TrackParCov&, o2::track::TrackParCov&, o2::track::TrackParCov&);
 template void print(const int, const int, o2::vertexing::DCAFitterN<2>&);
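With the new signature, `processBulk` no longer allocates or returns the results vector; the caller owns it and additionally chooses a batch count. A hedged usage sketch for the two-prong instantiation (setup and namespace qualification elided; block/thread/batch values are arbitrary examples):

```cpp
std::vector<o2::vertexing::DCAFitterN<2>> fitters(nFits);
std::vector<o2::track::TrackParCov> tracksA(nFits), tracksB(nFits);
std::vector<int> results(nFits); // sized by the caller, filled by the call
// ... configure fitters and fill both track vectors ...
processBulk(20 /*blocks*/, 512 /*threads*/, 8 /*batches*/, fitters, results, tracksA, tracksB);
```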