|
17 | 17 | #include <filesystem>
|
18 | 18 | #include <fstream>
|
19 | 19 | #include <iostream>
|
| 20 | +#include <map> |
| 21 | +#include <mutex> |
| 22 | +#include <time.h> |
20 | 23 |
|
21 | 24 | #include "cuda.h"
|
22 | 25 | #include "cuda_runtime.h"
|
@@ -76,6 +79,112 @@ class ScopedContext {
|
76 | 79 | ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
|
77 | 80 | };
|
78 | 81 |
|
| 82 | +class PGOState { |
| 83 | +public: |
| 84 | + enum Type { Start, End }; |
| 85 | + struct State { |
| 86 | + struct timespec start_clock; |
| 87 | + }; |
| 88 | + |
| 89 | + // TODO define these in cmake files (depending on target OS and used in the |
| 90 | + // compiler too) |
| 91 | + static constexpr const char *dirname = POLYGEIST_PGO_DATA_DIR; |
| 92 | + static constexpr const char *alternativeEnvVar = |
| 93 | + "POLYGEIST_PGO_KERNEL_ALTERNATIVE"; |
| 94 | + |
| 95 | + inline static int alternative; |
| 96 | + inline thread_local static std::mutex mutex; |
| 97 | + inline thread_local static std::map<std::string, State *> states; |
| 98 | + |
| 99 | + std::string kernelId; |
| 100 | + int totalAlternatives; |
| 101 | + |
| 102 | + PGOState(const char *kernelId, int totalAlternatives) |
| 103 | + : kernelId(kernelId), totalAlternatives(totalAlternatives) {} |
| 104 | + void end() { |
| 105 | + struct timespec end_clock; |
| 106 | + cudaDeviceSynchronize(); |
| 107 | + clock_gettime(CLOCK_MONOTONIC, &end_clock); |
| 108 | + |
| 109 | + std::unique_lock<std::mutex> lock(mutex); |
| 110 | + if (states.count(kernelId) == 0) { |
| 111 | + std::cerr << "No kernel with id " << kernelId << "running" << std::endl; |
| 112 | + exit(1); |
| 113 | + } |
| 114 | + State *state = states[kernelId]; |
| 115 | + struct timespec tmp_clock { |
| 116 | + end_clock.tv_sec - state->start_clock.tv_sec, |
| 117 | + end_clock.tv_nsec - state->start_clock.tv_nsec |
| 118 | + }; |
| 119 | + double elapsed = |
| 120 | + (tmp_clock.tv_sec + ((double)tmp_clock.tv_nsec) * .000000001); |
| 121 | + |
| 122 | + // Only write to file if we are profiling a valid alternative |
| 123 | + if (0 <= alternative && alternative < totalAlternatives) { |
| 124 | + // TODO error handling |
| 125 | + std::ofstream ofile; |
| 126 | + ofile.open(std::string(dirname) + kernelId, |
| 127 | + std::ios::out | std::ios::app); |
| 128 | + ofile << alternative << " " << elapsed << std::endl; |
| 129 | + ofile.close(); |
| 130 | + } |
| 131 | + |
| 132 | + delete state; |
| 133 | + states.erase(states.find(kernelId)); |
| 134 | + } |
| 135 | + |
| 136 | + void start() { |
| 137 | + std::unique_lock<std::mutex> lock(mutex); |
| 138 | + State *state = new State(); |
| 139 | + if (states.count(kernelId) == 1) { |
| 140 | + std::cerr << "Two kernels with id " << kernelId |
| 141 | + << "running at the same time" << std::endl; |
| 142 | + exit(1); |
| 143 | + } |
| 144 | + states[kernelId] = state; |
| 145 | + // Start timing |
| 146 | + cudaDeviceSynchronize(); |
| 147 | + clock_gettime(CLOCK_MONOTONIC, &state->start_clock); |
| 148 | + } |
| 149 | + |
| 150 | + int getAlternative() { |
| 151 | + static int init = [&] { |
| 152 | + if (char *i = getenv(alternativeEnvVar)) { |
| 153 | + this->alternative = atoi(i); |
| 154 | + } else { |
| 155 | + std::cerr << alternativeEnvVar << " not defined" << std::endl; |
| 156 | + exit(1); |
| 157 | + } |
| 158 | + std::filesystem::create_directories(dirname); |
| 159 | + return 0; |
| 160 | + }(); |
| 161 | + if (0 <= alternative && alternative < totalAlternatives) |
| 162 | + return alternative; |
| 163 | + else |
| 164 | + return 0; |
| 165 | + } |
| 166 | + |
| 167 | + ~PGOState() {} |
| 168 | +}; |
| 169 | + |
| 170 | +extern "C" MLIR_CUDA_WRAPPERS_EXPORT int32_t |
| 171 | +mgpurtPGOGetAlternative(const char *kernelID, int totalAlternatives) { |
| 172 | + PGOState pgoState(kernelID, totalAlternatives); |
| 173 | + return pgoState.getAlternative(); |
| 174 | +} |
| 175 | + |
| 176 | +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void |
| 177 | +mgpurtPGOStart(const char *kernelID, int totalAlternatives) { |
| 178 | + PGOState pgoState(kernelID, totalAlternatives); |
| 179 | + pgoState.start(); |
| 180 | +} |
| 181 | + |
| 182 | +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpurtPGOEnd(const char *kernelID, |
| 183 | + int totalAlternatives) { |
| 184 | + PGOState pgoState(kernelID, totalAlternatives); |
| 185 | + pgoState.end(); |
| 186 | +} |
| 187 | + |
79 | 188 | extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
|
80 | 189 | mgpurtLaunchKernel(void *function, intptr_t gridX, intptr_t gridY,
|
81 | 190 | intptr_t gridZ, intptr_t blockX, intptr_t blockY,
|
|
0 commit comments