Skip to content

Commit 887bbd5

Browse files
committed
Squashed commit of the following:
* More flexible specification of coarsening factors * Fix device-side get_global ops * For PGO, output the accumulated runtime of kernels, not individual timings * Allow for granular block coarsening factors * Various GPU bug fixes * Remove unneeded kernels from the gpu binary * Collect kernel statistics * Add polygeist::UndefOp for cases when the type is not LLVM::
1 parent 423dd17 commit 887bbd5

28 files changed

+1444
-270
lines changed

include/polygeist/Passes/Passes.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ std::unique_ptr<Pass> createParallelLowerPass(
3737
std::unique_ptr<Pass> createConvertCudaRTtoCPUPass();
3838
std::unique_ptr<Pass> createConvertCudaRTtoGPUPass();
3939
std::unique_ptr<Pass> createConvertCudaRTtoHipRTPass();
40+
std::unique_ptr<Pass> createFixGPUFuncPass();
4041
std::unique_ptr<Pass> createSCFParallelLoopUnrollPass(int unrollFactor = 2);
4142
std::unique_ptr<Pass>
4243
createConvertPolygeistToLLVMPass(const LowerToLLVMOptions &options,
@@ -49,6 +50,8 @@ createConvertParallelToGPUPass1(std::string arch = "sm_60");
4950
std::unique_ptr<Pass>
5051
createConvertParallelToGPUPass2(bool emitGPUKernelLaunchBounds = true);
5152
std::unique_ptr<Pass> createMergeGPUModulesPass();
53+
std::unique_ptr<Pass> createLowerAlternativesPass();
54+
std::unique_ptr<Pass> createCollectKernelStatisticsPass();
5255
std::unique_ptr<Pass> createGpuSerializeToCubinPass(
5356
StringRef arch, StringRef features, int llvmOptLevel, int ptxasOptLevel,
5457
std::string ptxasPath, std::string libDevicePath, bool outputIntermediate);

include/polygeist/Passes/Passes.td

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,31 @@ def SCFParallelLoopUnroll : Pass<"scf-parallel-loop-unroll"> {
2323
];
2424
}
2525

26+
// Module-level pass registered under the `collect-kernel-statistics` flag.
// It declares no dependent dialects and is constructed through
// mlir::polygeist::createCollectKernelStatisticsPass().
def CollectKernelStatistics : Pass<"collect-kernel-statistics", "mlir::ModuleOp"> {
  // The summary must describe this pass; it is what shows up in `--help`
  // output and generated pass documentation.
  let summary = "Collect kernel statistics";
  let dependentDialects = [];
  let constructor = "mlir::polygeist::createCollectKernelStatisticsPass()";
}
31+
32+
def LowerAlternatives : Pass<"lower-alternatives", "mlir::ModuleOp"> {
33+
let summary = "Lower alternatives if in opt mode";
34+
let dependentDialects = [];
35+
let constructor = "mlir::polygeist::createLowerAlternativesPass()";
36+
}
37+
2638
def ConvertCudaRTtoCPU : Pass<"convert-cudart-to-cpu", "mlir::ModuleOp"> {
2739
let summary = "Lower cudart functions to cpu versions";
2840
let dependentDialects =
2941
["memref::MemRefDialect", "func::FuncDialect", "LLVM::LLVMDialect"];
3042
let constructor = "mlir::polygeist::createConvertCudaRTtoCPUPass()";
3143
}
3244

45+
def FixGPUFunc : Pass<"fix-gpu-func", "mlir::gpu::GPUModuleOp"> {
46+
let summary = "Fix nested calls to gpu functions we generate in the frontend";
47+
let dependentDialects = ["func::FuncDialect", "LLVM::LLVMDialect", "gpu::GPUDialect"];
48+
let constructor = "mlir::polygeist::createFixGPUFuncPass()";
49+
}
50+
3351
def ConvertCudaRTtoGPU : Pass<"convert-cudart-to-gpu", "mlir::ModuleOp"> {
3452
let summary = "Lower cudart functions to generic gpu versions";
3553
let dependentDialects =

include/polygeist/PolygeistOps.td

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,18 @@ include "mlir/IR/SymbolInterfaces.td"
1818
include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
1919
include "mlir/Dialect/LLVMIR/LLVMOpsInterfaces.td"
2020

21+
def UndefOp
22+
: Polygeist_Op<"undef", [Pure]> {
23+
let summary = "More flexible undef op";
24+
let skipDefaultBuilders = 1;
25+
let results = (outs AnyType:$result);
26+
let builders = [
27+
OpBuilder<(ins "Type":$type), [{
28+
$_state.types.push_back(type);
29+
}]>];
30+
let hasCanonicalizer = true;
31+
}
32+
2133
def NoopOp
2234
: Polygeist_Op<"noop",
2335
[DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
@@ -29,6 +41,16 @@ def NoopOp
2941
let description = [{}];
3042
}
3143

44+
// Op that takes a flat symbol reference `name` and yields a statically shaped
// memref. Memory-effect and symbol-user interface methods are declared here
// and implemented in C++.
// NOTE(review): presumably `name` refers to a global that lives on the
// device (inferred from the op mnemonic only) — confirm against the
// SymbolUserOpInterface verifier and the lowering.
def GetDeviceGlobalOp
    : Polygeist_Op<"get_device_global",
                   [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
                    DeclareOpInterfaceMethods<SymbolUserOpInterface>]> {
  let summary = "Get a memref for the global referenced by symbol `name`";
  let arguments = (ins FlatSymbolRefAttr:$name);
  let results = (outs AnyStaticShapeMemRef:$result);
  let description = [{}];
}
53+
3254
def CacheLoad
3355
: Polygeist_Op<"cacheload"> {
3456

lib/polygeist/ExecutionEngine/PGORuntime.h

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,15 @@
44
// PGO functions which should know whether the code in the alternatives op is
55
// GPU code - we can add an attrib to the alternatives op for that
66

7+
#include <cstdlib>
78
#include <ctime>
89
#include <filesystem>
910
#include <fstream>
1011
#include <iostream>
1112
#include <map>
1213
#include <mutex>
14+
#include <numeric>
15+
#include <vector>
1316

1417
extern "C" int32_t mgpurtDeviceSynchronizeErr(void);
1518

@@ -26,26 +29,38 @@ class PGOState {
2629
struct timespec start_clock;
2730
};
2831

32+
struct Logger {
33+
std::map<std::string, std::vector<double>> timings;
34+
~Logger() { PGOState::writeResults(); }
35+
};
36+
2937
inline static int alternative;
3038
inline static std::string dirname;
3139
inline thread_local static std::mutex mutex;
3240
inline thread_local static std::map<std::string, State *> states;
41+
inline static Logger logger;
3342

34-
std::string kernelId;
43+
const char *kernelId_c;
3544
int totalAlternatives;
3645

37-
PGOState(const char *kernelId_c, int totalAlternatives)
38-
: totalAlternatives(totalAlternatives) {
39-
kernelId = kernelId_c;
46+
std::string getKernelId() {
47+
std::string kernelId = kernelId_c;
4048
for (char &c : kernelId)
4149
if (c == '/')
4250
c = '+';
51+
return kernelId;
52+
}
53+
54+
PGOState(const char *kernelId_c, int totalAlternatives)
55+
: totalAlternatives(totalAlternatives) {
56+
this->kernelId_c = kernelId_c;
4357
}
4458
void end() {
4559
struct timespec end_clock;
4660
mgpurtDeviceSynchronizeErr();
4761
clock_gettime(CLOCK_MONOTONIC, &end_clock);
4862

63+
auto kernelId = getKernelId();
4964
std::unique_lock<std::mutex> lock(mutex);
5065
if (states.count(kernelId) == 0) {
5166
std::cerr << "No kernel with id " << kernelId << "running" << std::endl;
@@ -59,21 +74,16 @@ class PGOState {
5974
double elapsed =
6075
(tmp_clock.tv_sec + ((double)tmp_clock.tv_nsec) * .000000001);
6176

62-
// Only write to file if we are profiling a valid alternative
63-
if (0 <= alternative && alternative < totalAlternatives) {
64-
// TODO error handling
65-
std::ofstream ofile;
66-
ofile.open(std::string(dirname) + "/" + kernelId,
67-
std::ios::out | std::ios::app);
68-
ofile << alternative << " " << elapsed << std::endl;
69-
ofile.close();
70-
}
77+
if (states.count(kernelId) == 0)
78+
logger.timings[kernelId] = {};
79+
logger.timings[kernelId].push_back(elapsed);
7180

7281
delete state;
7382
states.erase(states.find(kernelId));
7483
}
7584

7685
void start() {
86+
auto kernelId = getKernelId();
7787
std::unique_lock<std::mutex> lock(mutex);
7888
State *state = new State();
7989
if (states.count(kernelId) == 1) {
@@ -87,6 +97,21 @@ class PGOState {
8797
clock_gettime(CLOCK_MONOTONIC, &state->start_clock);
8898
}
8999

100+
static void writeResults() {
101+
// Only write to file if we are profiling a valid alternative
102+
for (auto &pair : logger.timings) {
103+
auto &kernelId = std::get<0>(pair);
104+
auto &timings = std::get<1>(pair);
105+
auto elapsed = std::accumulate(timings.begin(), timings.end(), 0.0f);
106+
// TODO error handling
107+
std::ofstream ofile;
108+
ofile.open(std::string(dirname) + "/" + kernelId,
109+
std::ios::out | std::ios::app);
110+
ofile << alternative << " " << elapsed << std::endl;
111+
ofile.close();
112+
}
113+
}
114+
90115
int getAlternative() {
91116
static int init = [&] {
92117
if (char *i = getenv(POLYGEIST_PGO_ALTERNATIVE_ENV_VAR)) {
@@ -102,12 +127,10 @@ class PGOState {
102127
this->dirname = POLYGEIST_PGO_DEFAULT_DATA_DIR;
103128
}
104129
std::filesystem::create_directories(dirname);
130+
105131
return 0;
106132
}();
107-
if (0 <= alternative && alternative < totalAlternatives)
108-
return alternative;
109-
else
110-
return 0;
133+
return alternative % totalAlternatives;
111134
}
112135

113136
~PGOState() {}

0 commit comments

Comments
 (0)