|
13 | 13 | #include "clang/AST/ASTContext.h"
|
14 | 14 | #include "clang/AST/CharUnits.h"
|
15 | 15 | #include "clang/AST/Mangle.h"
|
| 16 | +#include "clang/Basic/Cuda.h" |
16 | 17 | #include "clang/Basic/Module.h"
|
17 | 18 | #include "clang/Basic/TargetInfo.h"
|
18 | 19 | #include "clang/CIR/Dialect/Builder/CIRBaseBuilder.h"
|
|
27 | 28 | #include "llvm/ADT/Twine.h"
|
28 | 29 | #include "llvm/Support/ErrorHandling.h"
|
29 | 30 | #include "llvm/Support/Path.h"
|
| 31 | +#include "llvm/Support/VirtualFileSystem.h" |
30 | 32 |
|
31 | 33 | #include <memory>
|
32 | 34 |
|
@@ -117,6 +119,17 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
|
117 | 119 | /// has an empty name, and prevent collisions.
|
118 | 120 | uint64_t annonGlobalConstArrayCount = 0;
|
119 | 121 |
|
| 122 | + /// |
| 123 | + /// CUDA related |
| 124 | + /// ------------ |
| 125 | + |
| 126 | + // Maps CUDA device stub name to kernel name. |
| 127 | + llvm::DenseMap<llvm::StringRef, std::string> cudaKernelMap; |
| 128 | + |
| 129 | + void buildCUDAModuleCtor(); |
| 130 | + void buildCUDAModuleDtor(); |
| 131 | + std::optional<FuncOp> buildCUDARegisterGlobals(); |
| 132 | + |
120 | 133 | ///
|
121 | 134 | /// AST related
|
122 | 135 | /// -----------
|
@@ -964,6 +977,146 @@ void LoweringPreparePass::buildCXXGlobalInitFunc() {
|
964 | 977 | builder.create<ReturnOp>(f.getLoc());
|
965 | 978 | }
|
966 | 979 |
|
| 980 | +void LoweringPreparePass::buildCUDAModuleCtor() { |
| 981 | + if (astCtx->getLangOpts().HIP) |
| 982 | + assert(!cir::MissingFeatures::hipModuleCtor()); |
| 983 | + if (astCtx->getLangOpts().GPURelocatableDeviceCode) |
| 984 | + llvm_unreachable("NYI"); |
| 985 | + |
| 986 | + // There's no device-side binary, so no need to proceed for CUDA. |
| 987 | + // HIP has to create an external symbol in this case, which is NYI. |
| 988 | + auto cudaBinaryHandleAttr = |
| 989 | + theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName()); |
| 990 | + if (!cudaBinaryHandleAttr) { |
| 991 | + if (astCtx->getLangOpts().HIP) |
| 992 | + assert(!cir::MissingFeatures::hipModuleCtor()); |
| 993 | + return; |
| 994 | + } |
| 995 | + std::string cudaGPUBinaryName = |
| 996 | + cast<CUDABinaryHandleAttr>(cudaBinaryHandleAttr).getName(); |
| 997 | + |
| 998 | + llvm::StringRef prefix = "cuda"; |
| 999 | + |
| 1000 | + constexpr unsigned cudaFatMagic = 0x466243b1; |
| 1001 | + constexpr unsigned hipFatMagic = 0x48495046; // "HIPF" |
| 1002 | + |
| 1003 | + const unsigned fatMagic = |
| 1004 | + astCtx->getLangOpts().HIP ? hipFatMagic : cudaFatMagic; |
| 1005 | + |
| 1006 | + auto addUnderscoredPrefix = [&](llvm::StringRef name) -> std::string { |
| 1007 | + return ("__" + prefix + name).str(); |
| 1008 | + }; |
| 1009 | + |
| 1010 | + // MAC OS X needs special care, but we haven't supported that in CIR yet. |
| 1011 | + assert(!cir::MissingFeatures::checkMacOSXTriple()); |
| 1012 | + |
| 1013 | + CIRBaseBuilderTy builder(getContext()); |
| 1014 | + builder.setInsertionPointToStart(theModule.getBody()); |
| 1015 | + |
| 1016 | + mlir::Location loc = theModule.getLoc(); |
| 1017 | + |
| 1018 | + // Extract types from the module. |
| 1019 | + auto typeSizesAttr = cast<TypeSizeInfoAttr>( |
| 1020 | + theModule->getAttr(CIRDialect::getTypeSizeInfoAttrName())); |
| 1021 | + |
| 1022 | + auto voidTy = VoidType::get(&getContext()); |
| 1023 | + auto voidPtrTy = PointerType::get(voidTy); |
| 1024 | + auto voidPtrPtrTy = PointerType::get(voidPtrTy); |
| 1025 | + auto intTy = typeSizesAttr.getIntType(&getContext()); |
| 1026 | + auto charTy = typeSizesAttr.getCharType(&getContext()); |
| 1027 | + |
| 1028 | + // Read the GPU binary and create a constant array for it. |
| 1029 | + llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> cudaGPUBinaryOrErr = |
| 1030 | + llvm::MemoryBuffer::getFile(cudaGPUBinaryName); |
| 1031 | + if (std::error_code ec = cudaGPUBinaryOrErr.getError()) { |
| 1032 | + theModule->emitError("cannot open file: " + cudaGPUBinaryName + |
| 1033 | + ec.message()); |
| 1034 | + return; |
| 1035 | + } |
| 1036 | + std::unique_ptr<llvm::MemoryBuffer> cudaGPUBinary = |
| 1037 | + std::move(cudaGPUBinaryOrErr.get()); |
| 1038 | + |
| 1039 | + // The section names are different for MAC OS X. |
| 1040 | + llvm::StringRef fatbinConstName = ".nv_fatbin"; |
| 1041 | + llvm::StringRef fatbinSectionName = ".nvFatBinSegment"; |
| 1042 | + |
| 1043 | + // Create a global variable with the contents of GPU binary. |
| 1044 | + auto fatbinType = |
| 1045 | + ArrayType::get(&getContext(), charTy, cudaGPUBinary->getBuffer().size()); |
| 1046 | + |
| 1047 | + // OG gives an empty name to this global constant, |
| 1048 | + // which is not allowed in CIR. |
| 1049 | + std::string fatbinStrName = addUnderscoredPrefix("_fatbin_str"); |
| 1050 | + GlobalOp fatbinStr = builder.create<GlobalOp>( |
| 1051 | + loc, fatbinStrName, fatbinType, /*isConstant=*/true, |
| 1052 | + /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage); |
| 1053 | + fatbinStr.setAlignment(8); |
| 1054 | + fatbinStr.setInitialValueAttr(cir::ConstArrayAttr::get( |
| 1055 | + fatbinType, builder.getStringAttr(cudaGPUBinary->getBuffer()))); |
| 1056 | + fatbinStr.setSection(fatbinConstName); |
| 1057 | + fatbinStr.setPrivate(); |
| 1058 | + |
| 1059 | + // Create a struct FatbinWrapper, pointing to the GPU binary. |
| 1060 | + // Struct layout: |
| 1061 | + // struct { int magicNum; int version; void *fatbin; void *unused; }; |
| 1062 | + // This will be initialized in the module ctor below. |
| 1063 | + auto fatbinWrapperType = StructType::get( |
| 1064 | + &getContext(), {intTy, intTy, voidPtrTy, voidPtrTy}, /*packed=*/false, |
| 1065 | + /*padded=*/false, StructType::RecordKind::Struct); |
| 1066 | + |
| 1067 | + std::string fatbinWrapperName = addUnderscoredPrefix("_fatbin_wrapper"); |
| 1068 | + GlobalOp fatbinWrapper = builder.create<GlobalOp>( |
| 1069 | + loc, fatbinWrapperName, fatbinWrapperType, /*isConstant=*/false, |
| 1070 | + /*linkage=*/cir::GlobalLinkageKind::InternalLinkage); |
| 1071 | + fatbinWrapper.setPrivate(); |
| 1072 | + fatbinWrapper.setSection(fatbinSectionName); |
| 1073 | + |
| 1074 | + auto magicInit = IntAttr::get(intTy, fatMagic); |
| 1075 | + auto versionInit = IntAttr::get(intTy, 1); |
| 1076 | + // `fatbinInit` is only a placeholder. The value will be initialized at the |
| 1077 | + // beginning of module ctor. |
| 1078 | + auto fatbinInit = builder.getConstNullPtrAttr(voidPtrTy); |
| 1079 | + auto unusedInit = builder.getConstNullPtrAttr(voidPtrTy); |
| 1080 | + fatbinWrapper.setInitialValueAttr(cir::ConstStructAttr::get( |
| 1081 | + fatbinWrapperType, |
| 1082 | + ArrayAttr::get(&getContext(), |
| 1083 | + {magicInit, versionInit, fatbinInit, unusedInit}))); |
| 1084 | + |
| 1085 | + // Declare this function: |
| 1086 | + // void **__{cuda|hip}RegisterFatBinary(void *); |
| 1087 | + |
| 1088 | + std::string regFuncName = addUnderscoredPrefix("RegisterFatBinary"); |
| 1089 | + auto regFuncType = FuncType::get({voidPtrTy}, voidPtrPtrTy); |
| 1090 | + auto regFunc = buildRuntimeFunction(builder, regFuncName, loc, regFuncType); |
| 1091 | + |
| 1092 | + // Create the module constructor. |
| 1093 | + |
| 1094 | + std::string moduleCtorName = addUnderscoredPrefix("_module_ctor"); |
| 1095 | + auto moduleCtor = buildRuntimeFunction(builder, moduleCtorName, loc, |
| 1096 | + FuncType::get({}, voidTy), |
| 1097 | + GlobalLinkageKind::InternalLinkage); |
| 1098 | + globalCtorList.push_back(GlobalCtorAttr::get(&getContext(), moduleCtorName)); |
| 1099 | + builder.setInsertionPointToStart(moduleCtor.addEntryBlock()); |
| 1100 | + |
| 1101 | + auto wrapper = builder.createGetGlobal(fatbinWrapper); |
| 1102 | + // Put fatbinStr inside fatbinWrapper. |
| 1103 | + mlir::Value fatbinStrValue = builder.createGetGlobal(fatbinStr); |
| 1104 | + mlir::Value fatbinField = builder.createGetMemberOp(loc, wrapper, "", 2); |
| 1105 | + builder.createStore(loc, fatbinStrValue, fatbinField); |
| 1106 | + |
| 1107 | + // Register binary with CUDA runtime. This is substantially different in |
| 1108 | + // default mode vs. separate compilation. |
| 1109 | + // Corresponding code: |
| 1110 | + // gpuBinaryHandle = __cudaRegisterFatBinary(&fatbinWrapper); |
| 1111 | + auto fatbinVoidPtr = builder.createBitcast(wrapper, voidPtrTy); |
| 1112 | + auto gpuBinaryHandle = builder.createCallOp(loc, regFunc, fatbinVoidPtr); |
| 1113 | + |
| 1114 | + // This is currently incomplete. |
| 1115 | + // TODO(cir): create __cuda_register_globals(), and call it here. |
| 1116 | + |
| 1117 | + builder.create<cir::ReturnOp>(loc); |
| 1118 | +} |
| 1119 | + |
967 | 1120 | void LoweringPreparePass::lowerDynamicCastOp(DynamicCastOp op) {
|
968 | 1121 | CIRBaseBuilderTy builder(getContext());
|
969 | 1122 | builder.setInsertionPointAfter(op);
|
@@ -1224,6 +1377,13 @@ void LoweringPreparePass::runOnOp(Operation *op) {
|
1224 | 1377 | } else if (auto globalDtor = fnOp.getGlobalDtorAttr()) {
|
1225 | 1378 | globalDtorList.push_back(globalDtor);
|
1226 | 1379 | }
|
| 1380 | + if (auto attr = fnOp.getExtraAttrs().getElements().get( |
| 1381 | + CIRDialect::getCUDABinaryHandleAttrName())) { |
| 1382 | + auto cudaBinaryAttr = dyn_cast<CUDABinaryHandleAttr>(attr); |
| 1383 | + std::string kernelName = cudaBinaryAttr.getName(); |
| 1384 | + llvm::StringRef stubName = fnOp.getSymName(); |
| 1385 | + cudaKernelMap[stubName] = kernelName; |
| 1386 | + } |
1227 | 1387 | if (std::optional<mlir::ArrayAttr> annotations = fnOp.getAnnotations())
|
1228 | 1388 | addGlobalAnnotations(fnOp, annotations.value());
|
1229 | 1389 | } else if (auto throwOp = dyn_cast<cir::ThrowOp>(op)) {
|
@@ -1251,6 +1411,10 @@ void LoweringPreparePass::runOnOperation() {
|
1251 | 1411 | for (auto *o : opsToTransform)
|
1252 | 1412 | runOnOp(o);
|
1253 | 1413 |
|
| 1414 | + if (astCtx->getLangOpts().CUDA && !astCtx->getLangOpts().CUDAIsDevice) { |
| 1415 | + buildCUDAModuleCtor(); |
| 1416 | + } |
| 1417 | + |
1254 | 1418 | buildCXXGlobalInitFunc();
|
1255 | 1419 | buildGlobalCtorDtorList();
|
1256 | 1420 | buildGlobalAnnotationValues();
|
|
0 commit comments