From 4069ee96ef8a1a86edeef371b5a30fdc2cd8f3d0 Mon Sep 17 00:00:00 2001 From: David Rivera Date: Thu, 20 Nov 2025 17:53:03 -0500 Subject: [PATCH 1/3] [CIR][CUDA] Add Support for stream per thread --- clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp | 18 ++++++++++++------ clang/test/CIR/CodeGen/CUDA/simple.cu | 23 +++++++++++++++++++++++ 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp index 2b64997de866..5f0d4ea1757a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp @@ -161,12 +161,18 @@ void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf, // The default stream is usually stream 0 (the legacy default stream). // For per-thread default stream, we need a different LaunchKernel function. + std::string kernelLaunchAPI = "LaunchKernel"; if (cgm.getLangOpts().GPUDefaultStream == - LangOptions::GPUDefaultStreamKind::PerThread) - llvm_unreachable("NYI"); + LangOptions::GPUDefaultStreamKind::PerThread) { + if (cgf.getLangOpts().HIP) + kernelLaunchAPI = kernelLaunchAPI + "_spt"; + else if (cgf.getLangOpts().CUDA) + kernelLaunchAPI = kernelLaunchAPI + "_ptsz"; + } - std::string launchAPI = addPrefixToName("LaunchKernel"); - const IdentifierInfo &launchII = cgm.getASTContext().Idents.get(launchAPI); + std::string launchKernelName = addPrefixToName(kernelLaunchAPI); + const IdentifierInfo &launchII = + cgm.getASTContext().Idents.get(launchKernelName); FunctionDecl *launchFD = nullptr; for (auto *result : dc->lookup(&launchII)) { if (FunctionDecl *fd = dyn_cast(result)) @@ -175,7 +181,7 @@ void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf, if (launchFD == nullptr) { cgm.Error(cgf.CurFuncDecl->getLocation(), - "Can't find declaration for " + launchAPI); + "Can't find declaration for " + launchKernelName); return; } @@ -257,7 +263,7 @@ void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf, mlir::Type launchTy = cgm.getTypes().convertType(launchFD->getType()); mlir::Operation *launchFn = - cgm.createRuntimeFunction(cast(launchTy), launchAPI); + cgm.createRuntimeFunction(cast(launchTy), launchKernelName); const auto &callInfo = cgm.getTypes().arrangeFunctionDeclaration(launchFD); cgf.emitCall(callInfo, CIRGenCallee::forDirect(launchFn), ReturnValueSlot(), launchArgs); diff --git a/clang/test/CIR/CodeGen/CUDA/simple.cu b/clang/test/CIR/CodeGen/CUDA/simple.cu index 2d4f12da39e0..1c55a89a0670 100644 --- a/clang/test/CIR/CodeGen/CUDA/simple.cu +++ b/clang/test/CIR/CodeGen/CUDA/simple.cu @@ -30,6 +30,26 @@ // RUN: %s -o %t.ll // RUN: FileCheck --check-prefix=OGCG-DEVICE --input-file=%t.ll %s +// Per Thread Stream test cases: + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ +// RUN: -x cuda -emit-cir -target-sdk-version=12.3 \ +// RUN: -fgpu-default-stream=per-thread -DCUDA_API_PER_THREAD_DEFAULT_STREAM \ +// RUN: %s -o %t.cir +// RUN: FileCheck --check-prefixes=CIR-HOST-PTH --input-file=%t.cir %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ +// RUN: -x cuda -emit-llvm -target-sdk-version=12.3 \ +// RUN: -fgpu-default-stream=per-thread -DCUDA_API_PER_THREAD_DEFAULT_STREAM \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefixes=LLVM-HOST-PTH --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \ +// RUN: -x cuda -emit-llvm -target-sdk-version=12.3 \ +// RUN: -fgpu-default-stream=per-thread -DCUDA_API_PER_THREAD_DEFAULT_STREAM \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefixes=OGCG-HOST-PTH --input-file=%t.ll %s + // Attribute for global_fn // CIR-HOST: [[Kernel:#[a-zA-Z_0-9]+]] = {{.*}}#cir.cu.kernel_name<_Z9global_fni>{{.*}} @@ -54,6 +74,7 @@ __global__ void global_fn(int a) {} // CIR-HOST: cir.call @__cudaPopCallConfiguration // CIR-HOST: cir.get_global @_Z24__device_stub__global_fni // CIR-HOST: cir.call @cudaLaunchKernel +// CIR-HOST-PTH: cir.call @cudaLaunchKernel_ptsz // LLVM-HOST: void @_Z24__device_stub__global_fni // LLVM-HOST: %[[#KernelArgs:]] = alloca [1 x ptr], i64 1, align 16 @@ -61,12 +82,14 @@ __global__ void global_fn(int a) {} // LLVM-HOST: %[[#GEP2:]] = getelementptr [1 x ptr], ptr %[[#KernelArgs]], i32 0, i64 0 // LLVM-HOST: call i32 @__cudaPopCallConfiguration // LLVM-HOST: call i32 @cudaLaunchKernel(ptr @_Z24__device_stub__global_fni +// LLVM-HOST-PTH: call i32 @cudaLaunchKernel_ptsz(ptr @_Z24__device_stub__global_fni // OGCG-HOST: void @_Z24__device_stub__global_fni // OGCG-HOST: %kernel_args = alloca ptr, i64 1, align 16 // OGCG-HOST: getelementptr ptr, ptr %kernel_args, i32 0 // OGCG-HOST: call i32 @__cudaPopCallConfiguration // OGCG-HOST: call noundef i32 @cudaLaunchKernel(ptr noundef @_Z24__device_stub__global_fni +// OGCG-HOST-PTH: call noundef i32 @cudaLaunchKernel_ptsz(ptr noundef @_Z24__device_stub__global_fni int main() { From d40f8ce42d82be59e72a83e4b75237ed892f36de Mon Sep 17 00:00:00 2001 From: David Rivera Date: Thu, 20 Nov 2025 18:05:19 -0500 Subject: [PATCH 2/3] Fix format --- clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp index 5f0d4ea1757a..4377d27d438d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp @@ -262,8 +262,8 @@ void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf, launchFD->getParamDecl(5)->getType()); mlir::Type launchTy = cgm.getTypes().convertType(launchFD->getType()); - mlir::Operation *launchFn = - cgm.createRuntimeFunction(cast(launchTy), launchKernelName); + mlir::Operation *launchFn = cgm.createRuntimeFunction( + cast(launchTy), launchKernelName); const auto &callInfo = cgm.getTypes().arrangeFunctionDeclaration(launchFD); cgf.emitCall(callInfo, CIRGenCallee::forDirect(launchFn), ReturnValueSlot(), launchArgs); From 09bf69ea36820571c663c04bfd5c2f46bfaa978c Mon Sep 17 00:00:00 2001 From: David Rivera Date: Fri, 21 Nov 2025 08:45:22 -0500 Subject: [PATCH 3/3] Add hip tests cases --- clang/test/CIR/CodeGen/HIP/simple.cpp | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/clang/test/CIR/CodeGen/HIP/simple.cpp b/clang/test/CIR/CodeGen/HIP/simple.cpp index 1f1049856d8d..f9146bada144 100644 --- a/clang/test/CIR/CodeGen/HIP/simple.cpp +++ b/clang/test/CIR/CodeGen/HIP/simple.cpp @@ -30,6 +30,26 @@ // RUN: -I%S/../Inputs/ -emit-llvm %s -o %t.ll // RUN: FileCheck --check-prefix=OGCG-DEVICE --input-file=%t.ll %s +// Per Thread Stream test cases: + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ +// RUN: -fgpu-default-stream=per-thread -DHIP_API_PER_THREAD_DEFAULT_STREAM \ +// RUN: -x hip -fhip-new-launch-api \ +// RUN: -I%S/../Inputs/ -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR-HOST-PTH --input-file=%t.cir %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ +// RUN: -fgpu-default-stream=per-thread -DHIP_API_PER_THREAD_DEFAULT_STREAM \ +// RUN: -x hip -emit-llvm -fhip-new-launch-api \ +// RUN: -I%S/../Inputs/ %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM-HOST-PTH --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \ +// RUN: -fgpu-default-stream=per-thread -DHIP_API_PER_THREAD_DEFAULT_STREAM \ +// RUN: -x hip -emit-llvm -fhip-new-launch-api \ +// RUN: -I%S/../Inputs/ %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG-HOST-PTH --input-file=%t.ll %s + // Attribute for global_fn // CIR-HOST: [[Kernel:#[a-zA-Z_0-9]+]] = {{.*}}#cir.cu.kernel_name<_Z9global_fni>{{.*}} @@ -53,7 +73,7 @@ __global__ void global_fn(int a) {} // CIR-HOST: %[[#Decayed:]] = cir.cast array_to_ptrdecay %[[#CIRKernelArgs]] // CIR-HOST: cir.call @__hipPopCallConfiguration // CIR-HOST: cir.get_global @_Z9global_fni : !cir.ptr>> -// CIR-HOST: cir.call @hipLaunchKernel +// CIR-HOST-PTH: cir.call @hipLaunchKernel_spt // LLVM-HOST: void @_Z24__device_stub__global_fni // LLVM-HOST: %[[#KernelArgs:]] = alloca [1 x ptr], i64 1, align 16 @@ -61,12 +81,14 @@ __global__ void global_fn(int a) {} // LLVM-HOST: %[[#GEP2:]] = getelementptr [1 x ptr], ptr %[[#KernelArgs]], i32 0, i64 0 // LLVM-HOST: call i32 @__hipPopCallConfiguration // LLVM-HOST: call i32 @hipLaunchKernel(ptr @_Z9global_fni +// LLVM-HOST-PTH: call i32 @hipLaunchKernel_spt(ptr @_Z9global_fni // // OGCG-HOST: define dso_local void @_Z24__device_stub__global_fni // OGCG-HOST: %kernel_args = alloca ptr, i64 1, align 16 // OGCG-HOST: getelementptr ptr, ptr %kernel_args, i32 0 // OGCG-HOST: call i32 @__hipPopCallConfiguration // OGCG-HOST: %call = call noundef i32 @hipLaunchKernel(ptr noundef @_Z9global_fni +// OGCG-HOST-PTH: %call = call noundef i32 @hipLaunchKernel_spt(ptr noundef @_Z9global_fni int main() {