[CIR][CUDA] Handle clang builtin functions (#1496)

AdUhTkJm · lanza · commit 3af8b3fadc8e · 2025-04-09T15:41:20.000-07:00
Clang relies on `llvm::Intrinsic::getOrInsertDeclaration` to handle
functions marked as `ClangBuiltin` in TableGen. That function receives a
`CodeGenModule*`, so CIR cannot use it directly; instead, we need to
re-implement the relevant parts of it for CIR.
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -467,6 +467,46 @@ static bool isMemBuiltinOutOfBoundPossible(const clang::Expr *sizeArg,
   return size.ugt(dstSize);
 }
 
+static mlir::Type
+decodeFixedType(ArrayRef<llvm::Intrinsic::IITDescriptor> &infos,
+                mlir::MLIRContext *context) {
+  using namespace llvm::Intrinsic;
+  // Pops the leading descriptor off `infos` and returns the type it encodes.
+  IITDescriptor descriptor = infos.front(); // assumes `infos` is non-empty
+  infos = infos.slice(1); // `infos` is a reference: the caller's view advances
+  // Maps void, integer (always signed), float and double; other kinds are NYI.
+  switch (descriptor.Kind) {
+  case IITDescriptor::Void:
+    return VoidType::get(context);
+  case IITDescriptor::Integer:
+    return IntType::get(context, descriptor.Integer_Width, /*signed=*/true);
+  case IITDescriptor::Float:
+    return SingleType::get(context);
+  case IITDescriptor::Double:
+    return DoubleType::get(context);
+  default:
+    llvm_unreachable("NYI");
+  }
+}
+
+// CIR counterpart of llvm::Intrinsic::getType, which only takes an LLVMContext.
+static cir::FuncType getIntrinsicType(mlir::MLIRContext *context,
+                                      llvm::Intrinsic::ID id) {
+  using namespace llvm::Intrinsic;
+  // Fetch the IIT descriptor entries recorded for intrinsic `id`.
+  SmallVector<IITDescriptor, 8> table;
+  getIntrinsicInfoTableEntries(id, table);
+  // The first entry decodes to the result type; each decode advances tableRef.
+  ArrayRef<IITDescriptor> tableRef = table;
+  mlir::Type resultTy = decodeFixedType(tableRef, context);
+  // Every remaining entry is decoded, in order, as one argument type.
+  SmallVector<mlir::Type, 8> argTypes;
+  while (!tableRef.empty())
+    argTypes.push_back(decodeFixedType(tableRef, context));
+
+  return FuncType::get(argTypes, resultTy);
+}
+
 RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
                                        const CallExpr *E,
                                        ReturnValueSlot ReturnValue) {
@@ -2526,25 +2566,58 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
 
   // See if we have a target specific intrinsic.
   std::string Name = getContext().BuiltinInfo.getName(BuiltinID);
-  Intrinsic::ID IntrinsicID = Intrinsic::not_intrinsic;
+  Intrinsic::ID intrinsicID = Intrinsic::not_intrinsic;
   StringRef Prefix =
       llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch());
   if (!Prefix.empty()) {
-    IntrinsicID = Intrinsic::getIntrinsicForClangBuiltin(Prefix.data(), Name);
+    intrinsicID = Intrinsic::getIntrinsicForClangBuiltin(Prefix.data(), Name);
     // NOTE we don't need to perform a compatibility flag check here since the
     // intrinsics are declared in Builtins*.def via LANGBUILTIN which filter the
     // MS builtins via ALL_MS_LANGUAGES and are filtered earlier.
-    if (IntrinsicID == Intrinsic::not_intrinsic)
-      IntrinsicID = Intrinsic::getIntrinsicForMSBuiltin(Prefix.data(), Name);
+    if (intrinsicID == Intrinsic::not_intrinsic)
+      intrinsicID = Intrinsic::getIntrinsicForMSBuiltin(Prefix.data(), Name);
   }
 
-  if (IntrinsicID != Intrinsic::not_intrinsic) {
+  if (intrinsicID != Intrinsic::not_intrinsic) {
     unsigned iceArguments = 0;
     ASTContext::GetBuiltinTypeError error;
     getContext().GetBuiltinType(BuiltinID, error, &iceArguments);
     assert(error == ASTContext::GE_None && "Should not codegen an error");
-    if (iceArguments > 0)
+
+    llvm::StringRef name = llvm::Intrinsic::getName(intrinsicID);
+    // cir::LLVMIntrinsicCallOp expects intrinsic name to not have prefix
+    // "llvm." For example, `llvm.nvvm.barrier0` should be passed as
+    // `nvvm.barrier0`.
+    if (!name.consume_front("llvm."))
+      assert(false && "bad intrinsic name!");
+
+    cir::FuncType intrinsicType =
+        getIntrinsicType(&getMLIRContext(), intrinsicID);
+
+    SmallVector<mlir::Value> args;
+    for (unsigned i = 0; i < E->getNumArgs(); i++) {
+      mlir::Value arg = emitScalarOrConstFoldImmArg(iceArguments, i, E);
+      mlir::Type argType = arg.getType();
+      if (argType != intrinsicType.getInput(i))
+        llvm_unreachable("NYI");
+
+      args.push_back(arg);
+    }
+
+    auto intrinsicCall = builder.create<cir::LLVMIntrinsicCallOp>(
+        getLoc(E->getExprLoc()), builder.getStringAttr(name),
+        intrinsicType.getReturnType(), args);
+
+    mlir::Type builtinReturnType = intrinsicCall.getResult().getType();
+    mlir::Type retTy = intrinsicType.getReturnType();
+
+    if (builtinReturnType != retTy)
       llvm_unreachable("NYI");
+
+    if (isa<cir::VoidType>(retTy))
+      return RValue::get(nullptr);
+
+    return RValue::get(intrinsicCall.getResult());
   }
 
   // Some target-specific builtins can have aggregate return values, e.g.
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp
@@ -40,44 +40,11 @@ mlir::Value CIRGenFunction::emitNVPTXBuiltinExpr(unsigned builtinId,
         .getResult();
   };
   switch (builtinId) {
-  case NVPTX::BI__nvvm_read_ptx_sreg_tid_x:
-    return getIntrinsic("nvvm.read.ptx.sreg.tid.x");
-  case NVPTX::BI__nvvm_read_ptx_sreg_tid_y:
-    return getIntrinsic("nvvm.read.ptx.sreg.tid.y");
-  case NVPTX::BI__nvvm_read_ptx_sreg_tid_z:
-    return getIntrinsic("nvvm.read.ptx.sreg.tid.z");
-  case NVPTX::BI__nvvm_read_ptx_sreg_tid_w:
-    return getIntrinsic("nvvm.read.ptx.sreg.tid.w");
-
-  case NVPTX::BI__nvvm_read_ptx_sreg_ntid_x:
-    return getIntrinsic("nvvm.read.ptx.sreg.ntid.x");
-  case NVPTX::BI__nvvm_read_ptx_sreg_ntid_y:
-    return getIntrinsic("nvvm.read.ptx.sreg.ntid.y");
-  case NVPTX::BI__nvvm_read_ptx_sreg_ntid_z:
-    return getIntrinsic("nvvm.read.ptx.sreg.ntid.z");
-  case NVPTX::BI__nvvm_read_ptx_sreg_ntid_w:
-    return getIntrinsic("nvvm.read.ptx.sreg.ntid.w");
-
-  case NVPTX::BI__nvvm_read_ptx_sreg_ctaid_x:
-    return getIntrinsic("nvvm.read.ptx.sreg.ctaid.x");
-  case NVPTX::BI__nvvm_read_ptx_sreg_ctaid_y:
-    return getIntrinsic("nvvm.read.ptx.sreg.ctaid.y");
-  case NVPTX::BI__nvvm_read_ptx_sreg_ctaid_z:
-    return getIntrinsic("nvvm.read.ptx.sreg.ctaid.z");
-  case NVPTX::BI__nvvm_read_ptx_sreg_ctaid_w:
-    return getIntrinsic("nvvm.read.ptx.sreg.ctaid.w");
-
-  case NVPTX::BI__nvvm_read_ptx_sreg_nctaid_x:
-    return getIntrinsic("nvvm.read.ptx.sreg.nctaid.x");
-  case NVPTX::BI__nvvm_read_ptx_sreg_nctaid_y:
-    return getIntrinsic("nvvm.read.ptx.sreg.nctaid.y");
-  case NVPTX::BI__nvvm_read_ptx_sreg_nctaid_z:
-    return getIntrinsic("nvvm.read.ptx.sreg.nctaid.z");
-  case NVPTX::BI__nvvm_read_ptx_sreg_nctaid_w:
-    return getIntrinsic("nvvm.read.ptx.sreg.nctaid.w");
-
   default:
-    llvm_unreachable("NYI");
+    // Returning nullptr means the intrinsic is not implemented.
+    // This will be checked in `emitBuiltinExpr`, and will cause clang to output
+    // "unsupported builtin" diagnostics.
+    return nullptr;
   }
 }
 
diff --git a/clang/test/CIR/CodeGen/CUDA/builtin-functions.cu b/clang/test/CIR/CodeGen/CUDA/builtin-functions.cu
@@ -0,0 +1,64 @@
+#include "../Inputs/cuda.h"
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \
+// RUN:            -fcuda-is-device -emit-cir -target-sdk-version=12.3 \
+// RUN:            %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \
+// RUN:            -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \
+// RUN:            %s -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+__device__ void builtins() {
+  float f1, f2; // never initialized; the checks below wildcard the operands
+  double d1, d2;
+  // Single-precision builtins: expect (float...) -> float intrinsic calls.
+  // CIR: cir.llvm.intrinsic "nvvm.fmax.f" {{.*}} : (!cir.float, !cir.float) -> !cir.float
+  // LLVM: call float @llvm.nvvm.fmax.f(float {{.*}}, float {{.*}})
+  float t1 = __nvvm_fmax_f(f1, f2);
+  // CIR: cir.llvm.intrinsic "nvvm.fmin.f" {{.*}} : (!cir.float, !cir.float) -> !cir.float
+  // LLVM: call float @llvm.nvvm.fmin.f(float {{.*}}, float {{.*}})
+  float t2 = __nvvm_fmin_f(f1, f2);
+  // CIR: cir.llvm.intrinsic "nvvm.sqrt.rn.f" {{.*}} : (!cir.float) -> !cir.float
+  // LLVM: call float @llvm.nvvm.sqrt.rn.f(float {{.*}})
+  float t3 = __nvvm_sqrt_rn_f(f1);
+  // CIR: cir.llvm.intrinsic "nvvm.rcp.rn.f" {{.*}} : (!cir.float) -> !cir.float
+  // LLVM: call float @llvm.nvvm.rcp.rn.f(float {{.*}})
+  float t4 = __nvvm_rcp_rn_f(f2);
+  // CIR: cir.llvm.intrinsic "nvvm.add.rn.f" {{.*}} : (!cir.float, !cir.float) -> !cir.float
+  // LLVM: call float @llvm.nvvm.add.rn.f(float {{.*}}, float {{.*}})
+  float t5 = __nvvm_add_rn_f(f1, f2);
+  // Double-precision builtins: expect (double...) -> double intrinsic calls.
+  // CIR: cir.llvm.intrinsic "nvvm.fmax.d" {{.*}} : (!cir.double, !cir.double) -> !cir.double
+  // LLVM: call double @llvm.nvvm.fmax.d(double {{.*}}, double {{.*}})
+  double td1 = __nvvm_fmax_d(d1, d2);
+  // CIR: cir.llvm.intrinsic "nvvm.fmin.d" {{.*}} : (!cir.double, !cir.double) -> !cir.double
+  // LLVM: call double @llvm.nvvm.fmin.d(double {{.*}}, double {{.*}})
+  double td2 = __nvvm_fmin_d(d1, d2);
+  // CIR: cir.llvm.intrinsic "nvvm.sqrt.rn.d" {{.*}} : (!cir.double) -> !cir.double
+  // LLVM: call double @llvm.nvvm.sqrt.rn.d(double {{.*}})
+  double td3 = __nvvm_sqrt_rn_d(d1);
+  // CIR: cir.llvm.intrinsic "nvvm.rcp.rn.d" {{.*}} : (!cir.double) -> !cir.double
+  // LLVM: call double @llvm.nvvm.rcp.rn.d(double {{.*}})
+  double td4 = __nvvm_rcp_rn_d(d2);
+  // Integer builtin: int operands are expected to appear as !s32i / i32.
+  int i1, i2;
+
+  // CIR: cir.llvm.intrinsic "nvvm.mulhi.i" {{.*}} : (!s32i, !s32i) -> !s32i
+  // LLVM: call i32 @llvm.nvvm.mulhi.i(i32 {{.*}}, i32 {{.*}})
+  int ti1 = __nvvm_mulhi_i(i1, i2);
+  // Fence and barrier builtins: no operands, checked as void intrinsic calls.
+  // CIR: cir.llvm.intrinsic "nvvm.membar.cta"
+  // LLVM: call void @llvm.nvvm.membar.cta()
+  __nvvm_membar_cta();
+  // CIR: cir.llvm.intrinsic "nvvm.membar.gl"
+  // LLVM: call void @llvm.nvvm.membar.gl()
+  __nvvm_membar_gl();
+  // CIR: cir.llvm.intrinsic "nvvm.membar.sys"
+  // LLVM: call void @llvm.nvvm.membar.sys()
+  __nvvm_membar_sys();
+  // CIR: cir.llvm.intrinsic "nvvm.barrier0"
+  // LLVM: call void @llvm.nvvm.barrier0()
+  __syncthreads();
+}