[CIR][CUDA] Generate registration function (Part 1) (#1415)

AdUhTkJm · web-flow · commit 1184b8d205ff · 2025-03-05T10:37:32.000-08:00
The generation is quite complicated so I plan to separate it into
several parts.

The registration function should be like:
```cpp
const char *__cuda_fatbin_str = /* Raw content of file in -fcuda-include-gpubinary */;
struct {
  int magicNum, version;
  void *binaryData, *unused;
} __cuda_fatbin_wrapper = { /*CUDA Magic Num*/, 1, __cuda_fatbin_str, nullptr };

void __cuda_module_ctor() {
  handle = __cudaRegisterFatBinary(&__cuda_fatbin_wrapper);
  __cuda_register_globals();
}
```
In this PR, we generate everything except the `__cuda_register_globals`
function.

OG doesn't give a name to `__cuda_fatbin_str`, which isn't allowed for
cir::GlobalOp, so I invented a name for it. Other names are kept
consistent with OG.
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
@@ -252,6 +252,8 @@ struct MissingFeatures {
   static bool emitEmptyRecordCheck() { return false; }
   static bool isPPC_FP128Ty() { return false; }
   static bool createLaunderInvariantGroup() { return false; }
+  static bool hipModuleCtor() { return false; }
+  static bool checkMacOSXTriple() { return false; }
 
   // Inline assembly
   static bool asmGoto() { return false; }
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -13,6 +13,7 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/CharUnits.h"
 #include "clang/AST/Mangle.h"
+#include "clang/Basic/Cuda.h"
 #include "clang/Basic/Module.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/CIR/Dialect/Builder/CIRBaseBuilder.h"
@@ -27,6 +28,7 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/VirtualFileSystem.h"
 
 #include <memory>
 
@@ -117,6 +119,17 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
   /// has an empty name, and prevent collisions.
   uint64_t annonGlobalConstArrayCount = 0;
 
+  ///
+  /// CUDA related
+  /// ------------
+
+  // Maps CUDA device stub name to kernel name.
+  llvm::DenseMap<llvm::StringRef, std::string> cudaKernelMap;
+
+  void buildCUDAModuleCtor();
+  void buildCUDAModuleDtor();
+  std::optional<FuncOp> buildCUDARegisterGlobals();
+
   ///
   /// AST related
   /// -----------
@@ -964,6 +977,146 @@ void LoweringPreparePass::buildCXXGlobalInitFunc() {
   builder.create<ReturnOp>(f.getLoc());
 }
 
+/// Emit the host-side CUDA registration scaffolding: a constant global with
+/// the GPU binary contents, the fatbin wrapper global, a declaration of
+/// __cudaRegisterFatBinary, and an internal __cuda_module_ctor (appended to
+/// globalCtorList) that registers the wrapper. Returns early when the module
+/// carries no CUDA binary handle or when the GPU binary file cannot be read.
+void LoweringPreparePass::buildCUDAModuleCtor() {
+  if (astCtx->getLangOpts().HIP)
+    assert(!cir::MissingFeatures::hipModuleCtor());
+  if (astCtx->getLangOpts().GPURelocatableDeviceCode)
+    llvm_unreachable("NYI");
+
+  // There's no device-side binary, so no need to proceed for CUDA.
+  // HIP has to create an external symbol in this case, which is NYI.
+  auto cudaBinaryHandleAttr =
+      theModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName());
+  if (!cudaBinaryHandleAttr) {
+    if (astCtx->getLangOpts().HIP)
+      assert(!cir::MissingFeatures::hipModuleCtor());
+    return;
+  }
+  std::string cudaGPUBinaryName =
+      cast<CUDABinaryHandleAttr>(cudaBinaryHandleAttr).getName();
+
+  llvm::StringRef prefix = "cuda";
+  constexpr unsigned cudaFatMagic = 0x466243b1;
+  constexpr unsigned hipFatMagic = 0x48495046; // "HIPF"
+  const unsigned fatMagic =
+      astCtx->getLangOpts().HIP ? hipFatMagic : cudaFatMagic;
+
+  auto addUnderscoredPrefix = [&](llvm::StringRef name) -> std::string {
+    return ("__" + prefix + name).str();
+  };
+
+  // macOS needs special care, but we haven't supported that in CIR yet.
+  assert(!cir::MissingFeatures::checkMacOSXTriple());
+
+  CIRBaseBuilderTy builder(getContext());
+  builder.setInsertionPointToStart(theModule.getBody());
+  mlir::Location loc = theModule.getLoc();
+
+  // Extract types from the module.
+  auto typeSizesAttr = cast<TypeSizeInfoAttr>(
+      theModule->getAttr(CIRDialect::getTypeSizeInfoAttrName()));
+
+  auto voidTy = VoidType::get(&getContext());
+  auto voidPtrTy = PointerType::get(voidTy);
+  auto voidPtrPtrTy = PointerType::get(voidPtrTy);
+  auto intTy = typeSizesAttr.getIntType(&getContext());
+  auto charTy = typeSizesAttr.getCharType(&getContext());
+
+  // Read the GPU binary and create a constant array for it.
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> cudaGPUBinaryOrErr =
+      llvm::MemoryBuffer::getFile(cudaGPUBinaryName);
+  if (std::error_code ec = cudaGPUBinaryOrErr.getError()) {
+    theModule->emitError("cannot open file: " + cudaGPUBinaryName + ": " +
+                         ec.message());
+    return;
+  }
+  std::unique_ptr<llvm::MemoryBuffer> cudaGPUBinary =
+      std::move(cudaGPUBinaryOrErr.get());
+
+  // The section names are different for macOS.
+  llvm::StringRef fatbinConstName = ".nv_fatbin";
+  llvm::StringRef fatbinSectionName = ".nvFatBinSegment";
+
+  // Create a global variable with the contents of GPU binary.
+  auto fatbinType =
+      ArrayType::get(&getContext(), charTy, cudaGPUBinary->getBuffer().size());
+
+  // OG gives an empty name to this global constant,
+  // which is not allowed in CIR.
+  std::string fatbinStrName = addUnderscoredPrefix("_fatbin_str");
+  GlobalOp fatbinStr = builder.create<GlobalOp>(
+      loc, fatbinStrName, fatbinType, /*isConstant=*/true,
+      /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage);
+  fatbinStr.setAlignment(8);
+  fatbinStr.setInitialValueAttr(cir::ConstArrayAttr::get(
+      fatbinType, builder.getStringAttr(cudaGPUBinary->getBuffer())));
+  fatbinStr.setSection(fatbinConstName);
+  fatbinStr.setPrivate();
+
+  // Create a struct FatbinWrapper, pointing to the GPU binary.
+  // Struct layout:
+  //    struct { int magicNum; int version; void *fatbin; void *unused; };
+  // This will be initialized in the module ctor below.
+  auto fatbinWrapperType = StructType::get(
+      &getContext(), {intTy, intTy, voidPtrTy, voidPtrTy}, /*packed=*/false,
+      /*padded=*/false, StructType::RecordKind::Struct);
+
+  std::string fatbinWrapperName = addUnderscoredPrefix("_fatbin_wrapper");
+  GlobalOp fatbinWrapper = builder.create<GlobalOp>(
+      loc, fatbinWrapperName, fatbinWrapperType, /*isConstant=*/false,
+      /*linkage=*/cir::GlobalLinkageKind::InternalLinkage);
+  fatbinWrapper.setPrivate();
+  fatbinWrapper.setSection(fatbinSectionName);
+
+  auto magicInit = IntAttr::get(intTy, fatMagic);
+  auto versionInit = IntAttr::get(intTy, 1);
+  // `fatbinInit` is only a placeholder. The value will be initialized at the
+  // beginning of module ctor.
+  auto fatbinInit = builder.getConstNullPtrAttr(voidPtrTy);
+  auto unusedInit = builder.getConstNullPtrAttr(voidPtrTy);
+  fatbinWrapper.setInitialValueAttr(cir::ConstStructAttr::get(
+      fatbinWrapperType,
+      ArrayAttr::get(&getContext(),
+                     {magicInit, versionInit, fatbinInit, unusedInit})));
+
+  // Declare this function:
+  //    void **__{cuda|hip}RegisterFatBinary(void *);
+  std::string regFuncName = addUnderscoredPrefix("RegisterFatBinary");
+  auto regFuncType = FuncType::get({voidPtrTy}, voidPtrPtrTy);
+  auto regFunc = buildRuntimeFunction(builder, regFuncName, loc, regFuncType);
+
+  // Create the module constructor.
+  std::string moduleCtorName = addUnderscoredPrefix("_module_ctor");
+  auto moduleCtor = buildRuntimeFunction(builder, moduleCtorName, loc,
+                                         FuncType::get({}, voidTy),
+                                         GlobalLinkageKind::InternalLinkage);
+  globalCtorList.push_back(GlobalCtorAttr::get(&getContext(), moduleCtorName));
+  builder.setInsertionPointToStart(moduleCtor.addEntryBlock());
+
+  auto wrapper = builder.createGetGlobal(fatbinWrapper);
+  // Put fatbinStr inside fatbinWrapper.
+  mlir::Value fatbinStrValue = builder.createGetGlobal(fatbinStr);
+  mlir::Value fatbinField = builder.createGetMemberOp(loc, wrapper, "", 2);
+  builder.createStore(loc, fatbinStrValue, fatbinField);
+
+  // Register binary with CUDA runtime. This is substantially different in
+  // default mode vs. separate compilation.
+  // Corresponding code:
+  //     gpuBinaryHandle = __cudaRegisterFatBinary(&fatbinWrapper);
+  auto fatbinVoidPtr = builder.createBitcast(wrapper, voidPtrTy);
+  [[maybe_unused]] auto gpuBinaryHandle =
+      builder.createCallOp(loc, regFunc, fatbinVoidPtr);
+
+  // This is currently incomplete: gpuBinaryHandle is not consumed yet.
+  // TODO(cir): create __cuda_register_globals(), and call it here.
+  builder.create<cir::ReturnOp>(loc);
+}
+
 void LoweringPreparePass::lowerDynamicCastOp(DynamicCastOp op) {
   CIRBaseBuilderTy builder(getContext());
   builder.setInsertionPointAfter(op);
@@ -1224,6 +1377,13 @@ void LoweringPreparePass::runOnOp(Operation *op) {
     } else if (auto globalDtor = fnOp.getGlobalDtorAttr()) {
       globalDtorList.push_back(globalDtor);
     }
+    if (auto attr = fnOp.getExtraAttrs().getElements().get(
+            CIRDialect::getCUDABinaryHandleAttrName())) {
+      auto cudaBinaryAttr = dyn_cast<CUDABinaryHandleAttr>(attr);
+      std::string kernelName = cudaBinaryAttr.getName();
+      llvm::StringRef stubName = fnOp.getSymName();
+      cudaKernelMap[stubName] = kernelName;
+    }
     if (std::optional<mlir::ArrayAttr> annotations = fnOp.getAnnotations())
       addGlobalAnnotations(fnOp, annotations.value());
   } else if (auto throwOp = dyn_cast<cir::ThrowOp>(op)) {
@@ -1251,6 +1411,10 @@ void LoweringPreparePass::runOnOperation() {
   for (auto *o : opsToTransform)
     runOnOp(o);
 
+  if (astCtx->getLangOpts().CUDA && !astCtx->getLangOpts().CUDAIsDevice) {
+    buildCUDAModuleCtor();
+  }
+
   buildCXXGlobalInitFunc();
   buildGlobalCtorDtorList();
   buildGlobalAnnotationValues();
diff --git a/clang/test/CIR/CodeGen/CUDA/registration.cu b/clang/test/CIR/CodeGen/CUDA/registration.cu
@@ -1,9 +1,68 @@
 #include "../Inputs/cuda.h"
 
+// RUN: echo "sample fatbin" > %t.fatbin
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
 // RUN:            -x cuda -emit-cir -target-sdk-version=12.3 \
-// RUN:            -fcuda-include-gpubinary fatbin.o\
+// RUN:            -fcuda-include-gpubinary %t.fatbin \
 // RUN:            %s -o %t.cir
 // RUN: FileCheck --check-prefix=CIR-HOST --input-file=%t.cir %s
 
-// CIR-HOST: module @"{{.*}}" attributes{{.*}}cir.cu.binary_handle = #cir.cu.binary_handle<fatbin.o>{{.*}}
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
+// RUN:            -x cuda -emit-llvm -target-sdk-version=12.3 \
+// RUN:            -fcuda-include-gpubinary %t.fatbin \
+// RUN:            %s -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s
+
+// COM: OG doesn't emit anything if there is nothing to register.
+// COM: Here we still emit the template for test purposes,
+// COM: and the behaviour will be fixed later.
+
+// CIR-HOST: module @"{{.*}}" attributes {
+// CIR-HOST:   cir.cu.binary_handle = #cir.cu.binary_handle<{{.*}}.fatbin>,
+// CIR-HOST:   cir.global_ctors = [#cir.global_ctor<"__cuda_module_ctor", {{[0-9]+}}>]
+// CIR-HOST: }
+
+// The content in const array should be the same as echoed above,
+// with a trailing line break ('\n', 0x0A).
+// CIR-HOST: cir.global "private" constant cir_private @__cuda_fatbin_str =
+// CIR-HOST-SAME: #cir.const_array<"sample fatbin\0A">
+// CIR-HOST-SAME: {{.*}}section = ".nv_fatbin"
+
+// LLVM-HOST: @__cuda_fatbin_str = private constant [14 x i8] c"sample fatbin\0A", section ".nv_fatbin"
+
+// The first value is CUDA file head magic number.
+// CIR-HOST: cir.global "private" internal @__cuda_fatbin_wrapper
+// CIR-HOST: = #cir.const_struct<{
+// CIR-HOST:   #cir.int<1180844977> : !s32i,
+// CIR-HOST:   #cir.int<1> : !s32i,
+// CIR-HOST:   #cir.ptr<null> : !cir.ptr<!void>,
+// CIR-HOST:   #cir.ptr<null> : !cir.ptr<!void>
+// CIR-HOST: }>
+// CIR-HOST-SAME: {{.*}}section = ".nvFatBinSegment"
+
+// COM: @__cuda_fatbin_wrapper is constant for OG.
+// COM: However, as we don't have a way to put @__cuda_fatbin_str directly
+// COM: to its third field in Clang IR, we can't mark this variable as 
+// COM: constant: we need to initialize it later, at the beginning
+// COM: of @__cuda_module_ctor.
+
+// LLVM-HOST: @__cuda_fatbin_wrapper = internal global {
+// LLVM-HOST:   i32 1180844977, i32 1, ptr null, ptr null
+// LLVM-HOST: }
+
+// LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__cuda_module_ctor
+
+// CIR-HOST: cir.func private @__cudaRegisterFatBinary
+// CIR-HOST: cir.func {{.*}} @__cuda_module_ctor() {
+// CIR-HOST:   %[[#F0:]] = cir.get_global @__cuda_fatbin_wrapper
+// CIR-HOST:   %[[#F1:]] = cir.get_global @__cuda_fatbin_str
+// CIR-HOST:   %[[#F2:]] = cir.get_member %[[#F0]][2]
+// CIR-HOST:   %[[#F3:]] = cir.cast(bitcast, %[[#F2]]
+// CIR-HOST:   cir.store %[[#F1]], %[[#F3]]
+// CIR-HOST:   cir.call @__cudaRegisterFatBinary
+// CIR-HOST: }
+
+// LLVM-HOST: define internal void @__cuda_module_ctor() {
+// LLVM-HOST:   store ptr @__cuda_fatbin_str, ptr getelementptr {{.*}}, ptr @__cuda_fatbin_wrapper
+// LLVM-HOST:   call ptr @__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper)
+// LLVM-HOST: }