Commit 90a5b61

[CIR][CUDA] Initial support for host compilation (#1309)
Adds support for `__host__` and `__device__` functions when compiling for CUDA host. The conditions checked against are taken from OG.
1 parent 5373f42 commit 90a5b61
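
For illustration, here is a minimal CUDA translation unit showing the host-compilation behavior this change enables. This is a sketch, not part of the commit; the function names are hypothetical, while the committed test clang/test/CIR/CodeGen/CUDA/simple.cu exercises the same rule.

    #include "../Inputs/cuda.h" // the test stub added in this commit

    // Emitted when compiling for host: an ordinary host function, handled like normal C++.
    __host__ void on_host(int *p) {}

    // Skipped when compiling for host: it carries CUDADeviceAttr but not CUDAHostAttr,
    // so CIRGenModule::emitGlobal now returns early instead of emitting it.
    __device__ void on_device(int *p) {}

    // Still emitted when compiling for host: CUDAHostAttr is present, so the
    // device-only skip condition does not apply.
    __host__ __device__ int on_both(int x) { return x + 1; }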

4 files changed, +167 -5 lines changed


clang/lib/CIR/CodeGen/CIRGenModule.cpp

Lines changed: 21 additions & 5 deletions
@@ -514,7 +514,19 @@ void CIRGenModule::emitGlobal(GlobalDecl GD) {
 
   assert(!Global->hasAttr<IFuncAttr>() && "NYI");
   assert(!Global->hasAttr<CPUDispatchAttr>() && "NYI");
-  assert(!langOpts.CUDA && "NYI");
+
+  if (langOpts.CUDA) {
+    if (langOpts.CUDAIsDevice)
+      llvm_unreachable("NYI");
+
+    if (dyn_cast<VarDecl>(Global))
+      llvm_unreachable("NYI");
+
+    // We must skip __device__ functions when compiling for host.
+    if (!Global->hasAttr<CUDAHostAttr>() && Global->hasAttr<CUDADeviceAttr>()) {
+      return;
+    }
+  }
 
   if (langOpts.OpenMP) {
     // If this is OpenMP, check if it is legal to emit this global normally.
@@ -557,6 +569,7 @@ void CIRGenModule::emitGlobal(GlobalDecl GD) {
       return;
     }
   } else {
+    assert(!langOpts.CUDA && "NYI");
     const auto *VD = cast<VarDecl>(Global);
     assert(VD->isFileVarDecl() && "Cannot emit local var decl as global.");
     if (VD->isThisDeclarationADefinition() != VarDecl::Definition &&
@@ -2322,7 +2335,13 @@ cir::FuncOp CIRGenModule::GetAddrOfFunction(clang::GlobalDecl GD, mlir::Type Ty,
   auto F = GetOrCreateCIRFunction(MangledName, Ty, GD, ForVTable, DontDefer,
                                   /*IsThunk=*/false, IsForDefinition);
 
-  assert(!langOpts.CUDA && "NYI");
+  // As __global__ functions always reside on device,
+  // we need special care when accessing them from host;
+  // otherwise, CUDA functions behave as normal functions
+  if (langOpts.CUDA && !langOpts.CUDAIsDevice &&
+      cast<FunctionDecl>(GD.getDecl())->hasAttr<CUDAGlobalAttr>()) {
+    llvm_unreachable("NYI");
+  }
 
   return F;
 }
@@ -3164,9 +3183,6 @@ void CIRGenModule::Release() {
   assert(!MissingFeatures::registerGlobalDtorsWithAtExit());
   assert(!MissingFeatures::emitCXXThreadLocalInitFunc());
   assert(!MissingFeatures::objCRuntime());
-  if (astContext.getLangOpts().CUDA) {
-    llvm_unreachable("NYI");
-  }
   assert(!MissingFeatures::openMPRuntime());
   assert(!MissingFeatures::pgoReader());
   assert(!MissingFeatures::emitCtorList()); // GlobalCtors, GlobalDtors

clang/lib/CIR/CodeGen/TargetInfo.cpp

Lines changed: 56 additions & 0 deletions
@@ -305,6 +305,30 @@ class SPIRVTargetCIRGenInfo : public CommonSPIRTargetCIRGenInfo {
 
 } // namespace
 
+//===----------------------------------------------------------------------===//
+// NVPTX ABI Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class NVPTXABIInfo : public ABIInfo {
+public:
+  NVPTXABIInfo(CIRGenTypes &cgt) : ABIInfo(cgt) {}
+
+  cir::ABIArgInfo classifyReturnType(QualType retTy) const;
+  cir::ABIArgInfo classifyArgumentType(QualType ty) const;
+
+  void computeInfo(CIRGenFunctionInfo &fnInfo) const override;
+};
+
+class NVPTXTargetCIRGenInfo : public TargetCIRGenInfo {
+public:
+  NVPTXTargetCIRGenInfo(CIRGenTypes &cgt)
+      : TargetCIRGenInfo(std::make_unique<NVPTXABIInfo>(cgt)) {}
+};
+
+} // namespace
+
 // TODO(cir): remove the attribute once this gets used.
 LLVM_ATTRIBUTE_UNUSED
 static bool classifyReturnType(const CIRGenCXXABI &CXXABI,
@@ -443,6 +467,34 @@ cir::ABIArgInfo X86_64ABIInfo::classifyArgumentType(QualType Ty,
   return cir::ABIArgInfo::getDirect(ResType);
 }
 
+// Skeleton only. Implement when used in TargetLower stage.
+cir::ABIArgInfo NVPTXABIInfo::classifyReturnType(QualType retTy) const {
+  llvm_unreachable("not yet implemented");
+}
+
+cir::ABIArgInfo NVPTXABIInfo::classifyArgumentType(QualType ty) const {
+  llvm_unreachable("not yet implemented");
+}
+
+void NVPTXABIInfo::computeInfo(CIRGenFunctionInfo &fnInfo) const {
+  // Top level CIR has unlimited arguments and return types. Lowering for ABI
+  // specific concerns should happen during a lowering phase. Assume everything
+  // is direct for now.
+  for (CIRGenFunctionInfo::arg_iterator it = fnInfo.arg_begin(),
+                                        ie = fnInfo.arg_end();
+       it != ie; ++it) {
+    if (testIfIsVoidTy(it->type))
+      it->info = cir::ABIArgInfo::getIgnore();
+    else
+      it->info = cir::ABIArgInfo::getDirect(CGT.convertType(it->type));
+  }
+  auto retTy = fnInfo.getReturnType();
+  if (testIfIsVoidTy(retTy))
+    fnInfo.getReturnInfo() = cir::ABIArgInfo::getIgnore();
+  else
+    fnInfo.getReturnInfo() = cir::ABIArgInfo::getDirect(CGT.convertType(retTy));
+}
+
 ABIInfo::~ABIInfo() {}
 
 bool ABIInfo::isPromotableIntegerTypeForABI(QualType Ty) const {
@@ -634,5 +686,9 @@ const TargetCIRGenInfo &CIRGenModule::getTargetCIRGenInfo() {
   case llvm::Triple::spirv64: {
    return SetCIRGenInfo(new SPIRVTargetCIRGenInfo(genTypes));
  }
+
+  case llvm::Triple::nvptx64: {
+    return SetCIRGenInfo(new NVPTXTargetCIRGenInfo(genTypes));
+  }
  }
}

clang/test/CIR/CodeGen/CUDA/simple.cu

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+#include "../Inputs/cuda.h"
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \
+// RUN:            -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+
+// This should emit as a normal C++ function.
+__host__ void host_fn(int *a, int *b, int *c) {}
+
+// CIR: cir.func @_Z7host_fnPiS_S_
+
+// This shouldn't emit.
+__device__ void device_fn(int* a, double b, float c) {}
+
+// CHECK-NOT: cir.func @_Z9device_fnPidf
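
A hypothetical extension of this test (not part of the commit) would cover the remaining branch of the skip condition: a function marked both __host__ and __device__ keeps CUDAHostAttr, so emitGlobal does not skip it during host compilation and a cir.func should still be produced.

    // Hypothetical addition, following the same FileCheck conventions as above.
    __host__ __device__ void both_fn(int *a) {}

    // CIR: cir.func @_Z7both_fnPi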

clang/test/CIR/CodeGen/Inputs/cuda.h

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+/* Minimal declarations for CUDA support. Testing purposes only. */
+/* From test/CodeGenCUDA/Inputs/cuda.h. */
+#include <stddef.h>
+
+#if __HIP__ || __CUDA__
+#define __constant__ __attribute__((constant))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __host__ __attribute__((host))
+#define __shared__ __attribute__((shared))
+#if __HIP__
+#define __managed__ __attribute__((managed))
+#endif
+#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __grid_constant__ __attribute__((grid_constant))
+#else
+#define __constant__
+#define __device__
+#define __global__
+#define __host__
+#define __shared__
+#define __managed__
+#define __launch_bounds__(...)
+#define __grid_constant__
+#endif
+
+struct dim3 {
+  unsigned x, y, z;
+  __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
+};
+
+#if __HIP__ || HIP_PLATFORM
+typedef struct hipStream *hipStream_t;
+typedef enum hipError {} hipError_t;
+int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
+                     hipStream_t stream = 0);
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                                 size_t sharedSize = 0,
+                                                 hipStream_t stream = 0);
+#ifndef __HIP_API_PER_THREAD_DEFAULT_STREAM__
+extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
+#else
+extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim,
+                                          dim3 blockDim, void **args,
+                                          size_t sharedMem,
+                                          hipStream_t stream);
+#endif // __HIP_API_PER_THREAD_DEFAULT_STREAM__
+#elif __OFFLOAD_VIA_LLVM__
+extern "C" unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+                                                size_t sharedMem = 0, void *stream = 0);
+extern "C" unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+                                     void **args, size_t sharedMem = 0, void *stream = 0);
+#else
+typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+                                 size_t sharedSize = 0,
+                                 cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                           size_t sharedSize = 0,
+                                           cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
+extern "C" cudaError_t cudaLaunchKernel_ptsz(const void *func, dim3 gridDim,
+                                             dim3 blockDim, void **args,
+                                             size_t sharedMem, cudaStream_t stream);
+
+#endif
+
+extern "C" __device__ int printf(const char*, ...);
