|
15 | 15 | #include <cuda.h> |
16 | 16 | #endif |
17 | 17 |
|
| 18 | +#include <numeric> |
| 19 | + |
18 | 20 | #include "GPUCommonDef.h" |
19 | 21 | #include "DCAFitter/DCAFitterN.h" |
20 | | -// #include "MathUtils/SMatrixGPU.h" |
| 22 | +#include "DeviceInterface/GPUInterface.h" |
21 | 23 |
|
22 | 24 | #define gpuCheckError(x) \ |
23 | 25 | { \ |
@@ -61,10 +63,10 @@ GPUg() void processKernel(Fitter* fitter, int* res, Tr*... tracks) |
61 | 63 | } |
62 | 64 |
|
63 | 65 | template <typename Fitter, typename... Tr> |
64 | | -GPUg() void processBulkKernel(Fitter* fitters, int* results, unsigned int N, Tr*... tracks) |
| 66 | +GPUg() void processBatchKernel(Fitter* fitters, int* results, size_t off, size_t N, Tr*... tracks) |
65 | 67 | { |
66 | 68 | for (auto iThread{blockIdx.x * blockDim.x + threadIdx.x}; iThread < N; iThread += blockDim.x * gridDim.x) { |
67 | | - results[iThread] = fitters[iThread].process(tracks[iThread]...); |
| 69 | + results[iThread + off] = fitters[iThread + off].process(tracks[iThread + off]...); |
68 | 70 | } |
69 | 71 | } |
70 | 72 |
|
@@ -131,64 +133,137 @@ int process(const int nBlocks, |
131 | 133 | } |
132 | 134 |
|
133 | 135 | template <typename Fitter, class... Tr> |
134 | | -std::vector<int> processBulk(const int nBlocks, |
135 | | - const int nThreads, |
136 | | - std::vector<Fitter>& fitters, |
137 | | - std::vector<Tr>&... args) |
| 136 | +void processBulk(const int nBlocks, |
| 137 | + const int nThreads, |
| 138 | + const int nBatches, |
| 139 | + std::vector<Fitter>& fitters, |
| 140 | + std::vector<int>& results, |
| 141 | + std::vector<Tr>&... args) |
138 | 142 | { |
139 | | - kernel::warmUpGpuKernel<<<1, 1>>>(); |
| 143 | + auto* gpuInterface = GPUInterface::Instance(); |
| 144 | + kernel::warmUpGpuKernel<<<1, 1, 0, gpuInterface->getNextStream()>>>(); |
140 | 145 |
|
141 | | - cudaEvent_t start, stop; |
142 | | - gpuCheckError(cudaEventCreate(&start)); |
143 | | - gpuCheckError(cudaEventCreate(&stop)); |
144 | | - const auto nFits{fitters.size()}; // for clarity: size of all the vectors needs to be equal, not enforcing it here yet. |
145 | | - std::vector<int> results(nFits); |
146 | | - int* results_device; |
147 | | - Fitter* fitters_device; |
148 | | - std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device; |
| 146 | + // Benchmarking events |
| 147 | + std::vector<float> ioUp(nBatches), ioDown(nBatches), kerElapsed(nBatches); |
| 148 | + std::vector<cudaEvent_t> startIOUp(nBatches), endIOUp(nBatches), startIODown(nBatches), endIODown(nBatches), startKer(nBatches), endKer(nBatches); |
| 149 | + for (int iBatch{0}; iBatch < nBatches; ++iBatch) { |
| 150 | + gpuCheckError(cudaEventCreate(&startIOUp[iBatch])); |
| 151 | + gpuCheckError(cudaEventCreate(&endIOUp[iBatch])); |
| 152 | + gpuCheckError(cudaEventCreate(&startIODown[iBatch])); |
| 153 | + gpuCheckError(cudaEventCreate(&endIODown[iBatch])); |
| 154 | + gpuCheckError(cudaEventCreate(&startKer[iBatch])); |
| 155 | + gpuCheckError(cudaEventCreate(&endKer[iBatch])); |
| 156 | + } |
149 | 157 |
|
| 158 | + // Tracks |
| 159 | + std::array<o2::track::TrackParCov*, Fitter::getNProngs()> tracks_device; |
150 | 160 | int iArg{0}; |
151 | 161 | ([&] { |
152 | | - gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size())); |
153 | | - gpuCheckError(cudaMemcpy(tracks_device[iArg], args.data(), sizeof(Tr) * args.size(), cudaMemcpyHostToDevice)); |
| 162 | + gpuInterface->registerBuffer(reinterpret_cast<void*>(args.data()), sizeof(Tr) * args.size()); |
| 163 | + gpuInterface->allocDevice(reinterpret_cast<void**>(&(tracks_device[iArg])), sizeof(Tr) * args.size()); |
154 | 164 | ++iArg; |
155 | 165 | }(), |
156 | 166 | ...); |
157 | | - gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&results_device), sizeof(int) * nFits)); |
158 | | - gpuCheckError(cudaMalloc(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * nFits)); |
159 | | - gpuCheckError(cudaMemcpy(fitters_device, fitters.data(), sizeof(Fitter) * nFits, cudaMemcpyHostToDevice)); |
160 | 167 |
|
161 | | - gpuCheckError(cudaEventRecord(start)); |
162 | | - std::apply([&](auto&&... args) { kernel::processBulkKernel<<<nBlocks, nThreads>>>(fitters_device, results_device, nFits, args...); }, tracks_device); |
163 | | - gpuCheckError(cudaEventRecord(stop)); |
| 168 | + // Fitters |
| 169 | + gpuInterface->registerBuffer(reinterpret_cast<void*>(fitters.data()), sizeof(Fitter) * fitters.size()); |
| 170 | + Fitter* fitters_device; |
| 171 | + gpuInterface->allocDevice(reinterpret_cast<void**>(&fitters_device), sizeof(Fitter) * fitters.size()); |
164 | 172 |
|
165 | | - gpuCheckError(cudaPeekAtLastError()); |
166 | | - gpuCheckError(cudaDeviceSynchronize()); |
| 173 | + // Results |
| 174 | + gpuInterface->registerBuffer(reinterpret_cast<void*>(results.data()), sizeof(int) * fitters.size()); |
| 175 | + int* results_device; |
| 176 | + gpuInterface->allocDevice(reinterpret_cast<void**>(&results_device), sizeof(int) * fitters.size()); |
167 | 177 |
|
168 | | - gpuCheckError(cudaMemcpy(results.data(), results_device, sizeof(int) * results.size(), cudaMemcpyDeviceToHost)); |
169 | | - gpuCheckError(cudaMemcpy(fitters.data(), fitters_device, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost)); |
| 178 | + // R.R. Computation |
| 179 | + int totalSize = fitters.size(); |
| 180 | + int batchSize = totalSize / nBatches; |
| 181 | + int remainder = totalSize % nBatches; |
170 | 182 |
|
171 | | - iArg = 0; |
172 | | - ([&] { |
173 | | - gpuCheckError(cudaMemcpy(args.data(), tracks_device[iArg], sizeof(Tr) * args.size(), cudaMemcpyDeviceToHost)); |
174 | | - gpuCheckError(cudaFree(tracks_device[iArg])); |
175 | | - ++iArg; |
176 | | - }(), |
177 | | - ...); |
| 183 | + for (int iBatch{0}; iBatch < nBatches; ++iBatch) { |
| 184 | + auto& stream = gpuInterface->getNextStream(); |
| 185 | + auto offset = iBatch * batchSize + std::min(iBatch, remainder); |
| 186 | + auto nFits = batchSize + (iBatch < remainder ? 1 : 0); |
| 187 | +
|
| 188 | + gpuCheckError(cudaEventRecord(startIOUp[iBatch], stream)); |
| 189 | + gpuCheckError(cudaMemcpyAsync(fitters_device + offset, fitters.data() + offset, sizeof(Fitter) * nFits, cudaMemcpyHostToDevice, stream)); |
| 190 | + iArg = 0; |
| 191 | + ([&] { |
| 192 | + gpuCheckError(cudaMemcpyAsync(tracks_device[iArg] + offset, args.data() + offset, sizeof(Tr) * nFits, cudaMemcpyHostToDevice, stream)); |
| 193 | + ++iArg; |
| 194 | + }(), |
| 195 | + ...); |
| 196 | + gpuCheckError(cudaEventRecord(endIOUp[iBatch], stream)); |
| 197 | +
|
| 198 | + gpuCheckError(cudaEventRecord(startKer[iBatch], stream)); |
| 199 | + std::apply([&](auto&&... args) { kernel::processBatchKernel<<<nBlocks, nThreads, 0, stream>>>(fitters_device, results_device, offset, nFits, args...); }, tracks_device); |
| 200 | + gpuCheckError(cudaEventRecord(endKer[iBatch], stream)); |
| 201 | +
|
| 202 | + gpuCheckError(cudaPeekAtLastError()); |
| 203 | + iArg = 0; |
| 204 | + gpuCheckError(cudaEventRecord(startIODown[iBatch], stream)); |
| 205 | + ([&] { |
| 206 | + gpuCheckError(cudaMemcpyAsync(args.data() + offset, tracks_device[iArg] + offset, sizeof(Tr) * nFits, cudaMemcpyDeviceToHost, stream)); |
| 207 | + ++iArg; |
| 208 | + }(), |
| 209 | + ...); |
| 210 | +
|
| 211 | + gpuCheckError(cudaMemcpyAsync(fitters.data() + offset, fitters_device + offset, sizeof(Fitter) * nFits, cudaMemcpyDeviceToHost, stream)); |
| 212 | + gpuCheckError(cudaMemcpyAsync(results.data() + offset, results_device + offset, sizeof(int) * nFits, cudaMemcpyDeviceToHost, stream)); |
| 213 | + gpuCheckError(cudaEventRecord(endIODown[iBatch], stream)); |
| 214 | + } |
178 | 215 |
|
179 | | - gpuCheckError(cudaFree(fitters_device)); |
180 | | - gpuCheckError(cudaFree(results_device)); |
181 | | - gpuCheckError(cudaEventSynchronize(stop)); |
| 216 | + ([&] { gpuInterface->unregisterBuffer(args.data()); }(), ...); |
182 | 217 |
|
183 | | - float milliseconds = 0; |
184 | | - gpuCheckError(cudaEventElapsedTime(&milliseconds, start, stop)); |
| 218 | + for (auto* tracksD : tracks_device) { |
| 219 | + gpuInterface->freeDevice(tracksD); |
| 220 | + } |
| 221 | +
|
| 222 | + gpuInterface->freeDevice(fitters_device); |
| 223 | + gpuInterface->freeDevice(results_device); |
| 224 | + gpuInterface->unregisterBuffer(fitters.data()); |
| 225 | + gpuInterface->unregisterBuffer(results.data()); |
185 | 226 |
|
186 | | - LOGP(info, "Kernel run in: {} ms using {} blocks and {} threads.", milliseconds, nBlocks, nThreads); |
187 | | - return results; |
| 227 | + // Do benchmarks |
| 228 | + gpuCheckError(cudaDeviceSynchronize()); |
| 229 | + for (int iBatch{0}; iBatch < nBatches; ++iBatch) { |
| 230 | + gpuCheckError(cudaEventElapsedTime(&ioUp[iBatch], startIOUp[iBatch], endIOUp[iBatch])); |
| 231 | + gpuCheckError(cudaEventElapsedTime(&kerElapsed[iBatch], startKer[iBatch], endKer[iBatch])); |
| 232 | + gpuCheckError(cudaEventElapsedTime(&ioDown[iBatch], startIODown[iBatch], endIODown[iBatch])); |
| 233 | + } |
| 234 | +
|
| 235 | + float totalUp = std::accumulate(ioUp.begin(), ioUp.end(), 0.f); |
| 236 | + float totalDown = std::accumulate(ioDown.begin(), ioDown.end(), 0.f); |
| 237 | + float totalKernels = std::accumulate(kerElapsed.begin(), kerElapsed.end(), 0.f); |
| 238 | + LOGP(info, "Config: {} batches, {} blocks, {} threads", nBatches, nBlocks, nThreads); |
| 239 | + LOGP(info, "Total I/O time: Up {} ms Avg {} ms, Down {} ms Avg {} ms", totalUp, totalUp / float(nBatches), totalDown, totalDown / (float)nBatches); |
| 240 | + LOGP(info, "Total Kernel time: {} ms Avg {} ms", totalKernels, totalKernels / (float)nBatches); |
| 241 | +
|
| 242 | + for (int iBatch{0}; iBatch < nBatches; ++iBatch) { |
| 243 | + gpuCheckError(cudaEventDestroy(startIOUp[iBatch])); |
| 244 | + gpuCheckError(cudaEventDestroy(endIOUp[iBatch])); |
| 245 | + gpuCheckError(cudaEventDestroy(startIODown[iBatch])); |
| 246 | + gpuCheckError(cudaEventDestroy(endIODown[iBatch])); |
| 247 | + gpuCheckError(cudaEventDestroy(startKer[iBatch])); |
| 248 | + gpuCheckError(cudaEventDestroy(endKer[iBatch])); |
| 249 | + } |
188 | 250 | } |
189 | 251 |
|
190 | | -template std::vector<int> processBulk(const int, const int, std::vector<o2::vertexing::DCAFitterN<2>>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&); |
191 | | -template std::vector<int> processBulk(const int, const int, std::vector<o2::vertexing::DCAFitterN<3>>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&, std::vector<o2::track::TrackParCov>&); |
| 252 | +template void processBulk(const int, |
| 253 | + const int, |
| 254 | + const int, |
| 255 | + std::vector<o2::vertexing::DCAFitterN<2>>&, |
| 256 | + std::vector<int>&, |
| 257 | + std::vector<o2::track::TrackParCov>&, |
| 258 | + std::vector<o2::track::TrackParCov>&); |
| 259 | +template void processBulk(const int, |
| 260 | + const int, |
| 261 | + const int, |
| 262 | + std::vector<o2::vertexing::DCAFitterN<3>>&, |
| 263 | + std::vector<int>&, |
| 264 | + std::vector<o2::track::TrackParCov>&, |
| 265 | + std::vector<o2::track::TrackParCov>&, |
| 266 | + std::vector<o2::track::TrackParCov>&); |
192 | 267 | template int process(const int, const int, o2::vertexing::DCAFitterN<2>&, o2::track::TrackParCov&, o2::track::TrackParCov&); |
193 | 268 | template int process(const int, const int, o2::vertexing::DCAFitterN<3>&, o2::track::TrackParCov&, o2::track::TrackParCov&, o2::track::TrackParCov&); |
194 | 269 | template void print(const int, const int, o2::vertexing::DCAFitterN<2>&); |
|
0 commit comments