Skip to content

Commit 1184b8d

Browse files
authored
[CIR][CUDA] Generate registration function (Part 1) (#1415)
The generation is quite complicated, so I plan to split it into several parts. The registration function should look like this:

```cpp
const char *__cuda_fatbin_str = /* Raw content of file in -fcuda-include-gpubinary */;
struct {
  int magicNum, version;
  void *binaryData, *unused;
} __cuda_fatbin_wrapper = { /*CUDA Magic Num*/, 1, __cuda_fatbin_str, nullptr };

void __cuda_module_ctor() {
  handle = __cudaRegisterFatBinary(&__cuda_fatbin_wrapper);
  __cuda_register_globals();
}
```

In this PR, we generate everything except the `__cuda_register_globals` function. OG doesn't give a name to `__cuda_fatbin_str`, which isn't allowed for cir::GlobalOp, so I invented a name for it. Other names are kept consistent with OG.
1 parent 0e7b6c7 commit 1184b8d

File tree

3 files changed

+227
-2
lines changed

3 files changed

+227
-2
lines changed

clang/include/clang/CIR/MissingFeatures.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,8 @@ struct MissingFeatures {
252252
static bool emitEmptyRecordCheck() { return false; }
253253
static bool isPPC_FP128Ty() { return false; }
254254
static bool createLaunderInvariantGroup() { return false; }
255+
static bool hipModuleCtor() { return false; }
256+
static bool checkMacOSXTriple() { return false; }
255257

256258
// Inline assembly
257259
static bool asmGoto() { return false; }

clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "clang/AST/ASTContext.h"
1414
#include "clang/AST/CharUnits.h"
1515
#include "clang/AST/Mangle.h"
16+
#include "clang/Basic/Cuda.h"
1617
#include "clang/Basic/Module.h"
1718
#include "clang/Basic/TargetInfo.h"
1819
#include "clang/CIR/Dialect/Builder/CIRBaseBuilder.h"
@@ -27,6 +28,7 @@
2728
#include "llvm/ADT/Twine.h"
2829
#include "llvm/Support/ErrorHandling.h"
2930
#include "llvm/Support/Path.h"
31+
#include "llvm/Support/VirtualFileSystem.h"
3032

3133
#include <memory>
3234

@@ -117,6 +119,17 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
117119
/// has an empty name, and prevent collisions.
118120
uint64_t annonGlobalConstArrayCount = 0;
119121

122+
///
123+
/// CUDA related
124+
/// ------------
125+
126+
// Maps CUDA device stub name to kernel name.
127+
llvm::DenseMap<llvm::StringRef, std::string> cudaKernelMap;
128+
129+
void buildCUDAModuleCtor();
130+
void buildCUDAModuleDtor();
131+
std::optional<FuncOp> buildCUDARegisterGlobals();
132+
120133
///
121134
/// AST related
122135
/// -----------
@@ -964,6 +977,146 @@ void LoweringPreparePass::buildCXXGlobalInitFunc() {
964977
builder.create<ReturnOp>(f.getLoc());
965978
}
966979

980+
void LoweringPreparePass::buildCUDAModuleCtor() {
981+
if (astCtx->getLangOpts().HIP)
982+
assert(!cir::MissingFeatures::hipModuleCtor());
983+
if (astCtx->getLangOpts().GPURelocatableDeviceCode)
984+
llvm_unreachable("NYI");
985+
986+
// There's no device-side binary, so no need to proceed for CUDA.
987+
// HIP has to create an external symbol in this case, which is NYI.
988+
auto cudaBinaryHandleAttr =
989+
theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName());
990+
if (!cudaBinaryHandleAttr) {
991+
if (astCtx->getLangOpts().HIP)
992+
assert(!cir::MissingFeatures::hipModuleCtor());
993+
return;
994+
}
995+
std::string cudaGPUBinaryName =
996+
cast<CUDABinaryHandleAttr>(cudaBinaryHandleAttr).getName();
997+
998+
llvm::StringRef prefix = "cuda";
999+
1000+
constexpr unsigned cudaFatMagic = 0x466243b1;
1001+
constexpr unsigned hipFatMagic = 0x48495046; // "HIPF"
1002+
1003+
const unsigned fatMagic =
1004+
astCtx->getLangOpts().HIP ? hipFatMagic : cudaFatMagic;
1005+
1006+
auto addUnderscoredPrefix = [&](llvm::StringRef name) -> std::string {
1007+
return ("__" + prefix + name).str();
1008+
};
1009+
1010+
// MAC OS X needs special care, but we haven't supported that in CIR yet.
1011+
assert(!cir::MissingFeatures::checkMacOSXTriple());
1012+
1013+
CIRBaseBuilderTy builder(getContext());
1014+
builder.setInsertionPointToStart(theModule.getBody());
1015+
1016+
mlir::Location loc = theModule.getLoc();
1017+
1018+
// Extract types from the module.
1019+
auto typeSizesAttr = cast<TypeSizeInfoAttr>(
1020+
theModule->getAttr(CIRDialect::getTypeSizeInfoAttrName()));
1021+
1022+
auto voidTy = VoidType::get(&getContext());
1023+
auto voidPtrTy = PointerType::get(voidTy);
1024+
auto voidPtrPtrTy = PointerType::get(voidPtrTy);
1025+
auto intTy = typeSizesAttr.getIntType(&getContext());
1026+
auto charTy = typeSizesAttr.getCharType(&getContext());
1027+
1028+
// Read the GPU binary and create a constant array for it.
1029+
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> cudaGPUBinaryOrErr =
1030+
llvm::MemoryBuffer::getFile(cudaGPUBinaryName);
1031+
if (std::error_code ec = cudaGPUBinaryOrErr.getError()) {
1032+
theModule->emitError("cannot open file: " + cudaGPUBinaryName +
1033+
ec.message());
1034+
return;
1035+
}
1036+
std::unique_ptr<llvm::MemoryBuffer> cudaGPUBinary =
1037+
std::move(cudaGPUBinaryOrErr.get());
1038+
1039+
// The section names are different for MAC OS X.
1040+
llvm::StringRef fatbinConstName = ".nv_fatbin";
1041+
llvm::StringRef fatbinSectionName = ".nvFatBinSegment";
1042+
1043+
// Create a global variable with the contents of GPU binary.
1044+
auto fatbinType =
1045+
ArrayType::get(&getContext(), charTy, cudaGPUBinary->getBuffer().size());
1046+
1047+
// OG gives an empty name to this global constant,
1048+
// which is not allowed in CIR.
1049+
std::string fatbinStrName = addUnderscoredPrefix("_fatbin_str");
1050+
GlobalOp fatbinStr = builder.create<GlobalOp>(
1051+
loc, fatbinStrName, fatbinType, /*isConstant=*/true,
1052+
/*linkage=*/cir::GlobalLinkageKind::PrivateLinkage);
1053+
fatbinStr.setAlignment(8);
1054+
fatbinStr.setInitialValueAttr(cir::ConstArrayAttr::get(
1055+
fatbinType, builder.getStringAttr(cudaGPUBinary->getBuffer())));
1056+
fatbinStr.setSection(fatbinConstName);
1057+
fatbinStr.setPrivate();
1058+
1059+
// Create a struct FatbinWrapper, pointing to the GPU binary.
1060+
// Struct layout:
1061+
// struct { int magicNum; int version; void *fatbin; void *unused; };
1062+
// This will be initialized in the module ctor below.
1063+
auto fatbinWrapperType = StructType::get(
1064+
&getContext(), {intTy, intTy, voidPtrTy, voidPtrTy}, /*packed=*/false,
1065+
/*padded=*/false, StructType::RecordKind::Struct);
1066+
1067+
std::string fatbinWrapperName = addUnderscoredPrefix("_fatbin_wrapper");
1068+
GlobalOp fatbinWrapper = builder.create<GlobalOp>(
1069+
loc, fatbinWrapperName, fatbinWrapperType, /*isConstant=*/false,
1070+
/*linkage=*/cir::GlobalLinkageKind::InternalLinkage);
1071+
fatbinWrapper.setPrivate();
1072+
fatbinWrapper.setSection(fatbinSectionName);
1073+
1074+
auto magicInit = IntAttr::get(intTy, fatMagic);
1075+
auto versionInit = IntAttr::get(intTy, 1);
1076+
// `fatbinInit` is only a placeholder. The value will be initialized at the
1077+
// beginning of module ctor.
1078+
auto fatbinInit = builder.getConstNullPtrAttr(voidPtrTy);
1079+
auto unusedInit = builder.getConstNullPtrAttr(voidPtrTy);
1080+
fatbinWrapper.setInitialValueAttr(cir::ConstStructAttr::get(
1081+
fatbinWrapperType,
1082+
ArrayAttr::get(&getContext(),
1083+
{magicInit, versionInit, fatbinInit, unusedInit})));
1084+
1085+
// Declare this function:
1086+
// void **__{cuda|hip}RegisterFatBinary(void *);
1087+
1088+
std::string regFuncName = addUnderscoredPrefix("RegisterFatBinary");
1089+
auto regFuncType = FuncType::get({voidPtrTy}, voidPtrPtrTy);
1090+
auto regFunc = buildRuntimeFunction(builder, regFuncName, loc, regFuncType);
1091+
1092+
// Create the module constructor.
1093+
1094+
std::string moduleCtorName = addUnderscoredPrefix("_module_ctor");
1095+
auto moduleCtor = buildRuntimeFunction(builder, moduleCtorName, loc,
1096+
FuncType::get({}, voidTy),
1097+
GlobalLinkageKind::InternalLinkage);
1098+
globalCtorList.push_back(GlobalCtorAttr::get(&getContext(), moduleCtorName));
1099+
builder.setInsertionPointToStart(moduleCtor.addEntryBlock());
1100+
1101+
auto wrapper = builder.createGetGlobal(fatbinWrapper);
1102+
// Put fatbinStr inside fatbinWrapper.
1103+
mlir::Value fatbinStrValue = builder.createGetGlobal(fatbinStr);
1104+
mlir::Value fatbinField = builder.createGetMemberOp(loc, wrapper, "", 2);
1105+
builder.createStore(loc, fatbinStrValue, fatbinField);
1106+
1107+
// Register binary with CUDA runtime. This is substantially different in
1108+
// default mode vs. separate compilation.
1109+
// Corresponding code:
1110+
// gpuBinaryHandle = __cudaRegisterFatBinary(&fatbinWrapper);
1111+
auto fatbinVoidPtr = builder.createBitcast(wrapper, voidPtrTy);
1112+
auto gpuBinaryHandle = builder.createCallOp(loc, regFunc, fatbinVoidPtr);
1113+
1114+
// This is currently incomplete.
1115+
// TODO(cir): create __cuda_register_globals(), and call it here.
1116+
1117+
builder.create<cir::ReturnOp>(loc);
1118+
}
1119+
9671120
void LoweringPreparePass::lowerDynamicCastOp(DynamicCastOp op) {
9681121
CIRBaseBuilderTy builder(getContext());
9691122
builder.setInsertionPointAfter(op);
@@ -1224,6 +1377,13 @@ void LoweringPreparePass::runOnOp(Operation *op) {
12241377
} else if (auto globalDtor = fnOp.getGlobalDtorAttr()) {
12251378
globalDtorList.push_back(globalDtor);
12261379
}
1380+
if (auto attr = fnOp.getExtraAttrs().getElements().get(
1381+
CIRDialect::getCUDABinaryHandleAttrName())) {
1382+
auto cudaBinaryAttr = dyn_cast<CUDABinaryHandleAttr>(attr);
1383+
std::string kernelName = cudaBinaryAttr.getName();
1384+
llvm::StringRef stubName = fnOp.getSymName();
1385+
cudaKernelMap[stubName] = kernelName;
1386+
}
12271387
if (std::optional<mlir::ArrayAttr> annotations = fnOp.getAnnotations())
12281388
addGlobalAnnotations(fnOp, annotations.value());
12291389
} else if (auto throwOp = dyn_cast<cir::ThrowOp>(op)) {
@@ -1251,6 +1411,10 @@ void LoweringPreparePass::runOnOperation() {
12511411
for (auto *o : opsToTransform)
12521412
runOnOp(o);
12531413

1414+
if (astCtx->getLangOpts().CUDA && !astCtx->getLangOpts().CUDAIsDevice) {
1415+
buildCUDAModuleCtor();
1416+
}
1417+
12541418
buildCXXGlobalInitFunc();
12551419
buildGlobalCtorDtorList();
12561420
buildGlobalAnnotationValues();
Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,68 @@
11
#include "../Inputs/cuda.h"
22

3+
// RUN: echo "sample fatbin" > %t.fatbin
34
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
45
// RUN: -x cuda -emit-cir -target-sdk-version=12.3 \
5-
// RUN: -fcuda-include-gpubinary fatbin.o\
6+
// RUN: -fcuda-include-gpubinary %t.fatbin \
67
// RUN: %s -o %t.cir
78
// RUN: FileCheck --check-prefix=CIR-HOST --input-file=%t.cir %s
89

9-
// CIR-HOST: module @"{{.*}}" attributes{{.*}}cir.cu.binary_handle = #cir.cu.binary_handle<fatbin.o>{{.*}}
10+
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
11+
// RUN: -x cuda -emit-llvm -target-sdk-version=12.3 \
12+
// RUN: -fcuda-include-gpubinary %t.fatbin \
13+
// RUN: %s -o %t.ll
14+
// RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s
15+
16+
// COM: OG doesn't emit anything if there is nothing to register.
17+
// COM: Here we still emit the template for test purposes,
18+
// COM: and the behaviour will be fixed later.
19+
20+
// CIR-HOST: module @"{{.*}}" attributes {
21+
// CIR-HOST: cir.cu.binary_handle = #cir.cu.binary_handle<{{.*}}.fatbin>,
22+
// CIR-HOST: cir.global_ctors = [#cir.global_ctor<"__cuda_module_ctor", {{[0-9]+}}>]
23+
// CIR-HOST: }
24+
25+
// The content in const array should be the same as echoed above,
26+
// with a trailing line break ('\n', 0x0A).
27+
// CIR-HOST: cir.global "private" constant cir_private @__cuda_fatbin_str =
28+
// CIR-HOST-SAME: #cir.const_array<"sample fatbin\0A">
29+
// CIR-HOST-SAME: {{.*}}section = ".nv_fatbin"
30+
31+
// LLVM-HOST: @__cuda_fatbin_str = private constant [14 x i8] c"sample fatbin\0A", section ".nv_fatbin"
32+
33+
// The first value is the CUDA fatbin magic number.
34+
// CIR-HOST: cir.global "private" internal @__cuda_fatbin_wrapper
35+
// CIR-HOST: = #cir.const_struct<{
36+
// CIR-HOST: #cir.int<1180844977> : !s32i,
37+
// CIR-HOST: #cir.int<1> : !s32i,
38+
// CIR-HOST: #cir.ptr<null> : !cir.ptr<!void>,
39+
// CIR-HOST: #cir.ptr<null> : !cir.ptr<!void>
40+
// CIR-HOST: }>
41+
// CIR-HOST-SAME: {{.*}}section = ".nvFatBinSegment"
42+
43+
// COM: @__cuda_fatbin_wrapper is constant for OG.
44+
// COM: However, as we don't have a way to put @__cuda_fatbin_str directly
45+
// COM: to its third field in Clang IR, we can't mark this variable as
46+
// COM: constant: we need to initialize it later, at the beginning
47+
// COM: of @__cuda_module_ctor.
48+
49+
// LLVM-HOST: @__cuda_fatbin_wrapper = internal global {
50+
// LLVM-HOST: i32 1180844977, i32 1, ptr null, ptr null
51+
// LLVM-HOST: }
52+
53+
// LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__cuda_module_ctor
54+
55+
// CIR-HOST: cir.func private @__cudaRegisterFatBinary
56+
// CIR-HOST: cir.func {{.*}} @__cuda_module_ctor() {
57+
// CIR-HOST: %[[#F0:]] = cir.get_global @__cuda_fatbin_wrapper
58+
// CIR-HOST: %[[#F1:]] = cir.get_global @__cuda_fatbin_str
59+
// CIR-HOST: %[[#F2:]] = cir.get_member %[[#F0]][2]
60+
// CIR-HOST: %[[#F3:]] = cir.cast(bitcast, %[[#F2]]
61+
// CIR-HOST: cir.store %[[#F1]], %[[#F3]]
62+
// CIR-HOST: cir.call @__cudaRegisterFatBinary
63+
// CIR-HOST: }
64+
65+
// LLVM-HOST: define internal void @__cuda_module_ctor() {
66+
// LLVM-HOST: store ptr @__cuda_fatbin_str, ptr getelementptr {{.*}}, ptr @__cuda_fatbin_wrapper
67+
// LLVM-HOST: call ptr @__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper)
68+
// LLVM-HOST: }

0 commit comments

Comments
 (0)