From a3e883a4dc55029b8a7e3c591c3bc85289869686 Mon Sep 17 00:00:00 2001
From: "boxu.zhang" <boxu.zhang@hotmail.com>
Date: Wed, 20 Dec 2023 17:35:25 +0800
Subject: [PATCH 1/3] Make 'UnrollMaxUpperBound' to be overridable by target.
 The default value is still 8 and the command line argument
 '--unroll-max-upperbound' takes final effect if provided.

---
 llvm/include/llvm/Analysis/TargetTransformInfo.h | 4 ++++
 llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp    | 9 ++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f5114fa40c70a..735be3680aea0 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -560,6 +560,10 @@ class TargetTransformInfo {
     // (set to UINT_MAX to disable). This does not apply in cases where the
     // loop is being fully unrolled.
     unsigned MaxCount;
+    /// Set the maximum upper bound of trip count. Allowing the MaxUpperBound
+    /// to be overrided by a target gives more flexiblity on certain cases.
+    /// By default, MaxUpperBound uses UnrollMaxUpperBound which value is 8.
+    unsigned MaxUpperBound;
     /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but
     /// applies even if full unrolling is selected. This allows a target to fall
     /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount.
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index f14541a1a037e..7cfeb019af972 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -200,6 +200,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
   UP.Count = 0;
   UP.DefaultUnrollRuntimeCount = 8;
   UP.MaxCount = std::numeric_limits<unsigned>::max();
+  UP.MaxUpperBound = UnrollMaxUpperBound;
   UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
   UP.BEInsns = 2;
   UP.Partial = false;
@@ -237,6 +238,8 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
     UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
   if (UnrollMaxCount.getNumOccurrences() > 0)
     UP.MaxCount = UnrollMaxCount;
+  if (UnrollMaxUpperBound.getNumOccurrences() > 0)
+    UP.MaxUpperBound = UnrollMaxUpperBound;
   if (UnrollFullMaxCount.getNumOccurrences() > 0)
     UP.FullUnrollMaxCount = UnrollFullMaxCount;
   if (UnrollAllowPartial.getNumOccurrences() > 0)
@@ -777,7 +780,7 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
     return TripCount;
 
   if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount &&
-      MaxTripCount <= UnrollMaxUpperBound)
+      MaxTripCount <= UP.MaxUpperBound)
     return MaxTripCount;
 
   // if didn't return until here, should continue to other priorties
@@ -952,7 +955,7 @@ bool llvm::computeUnrollCount(
   // cost of exact full unrolling.  As such, if we have an exact count and
   // found it unprofitable, we'll never chose to bounded unroll.
   if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
-      MaxTripCount <= UnrollMaxUpperBound) {
+      MaxTripCount <= UP.MaxUpperBound) {
     UP.Count = MaxTripCount;
     if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
                                              MaxTripCount, UCE, UP)) {
@@ -1026,7 +1029,7 @@ bool llvm::computeUnrollCount(
   }
 
   // Don't unroll a small upper bound loop unless user or TTI asked to do so.
-  if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
+  if (MaxTripCount && !UP.Force && MaxTripCount < UP.MaxUpperBound) {
     UP.Count = 0;
     return false;
   }

From ecc3e286cfe97ff033c00806658f6e69844c5434 Mon Sep 17 00:00:00 2001
From: "boxu.zhang" <boxu.zhang@hotmail.com>
Date: Tue, 20 May 2025 12:22:45 +0800
Subject: [PATCH 2/3] [Clang][OpenCL][NVPTX] precommit test for using vprintf
 to implement builtin printf on OpenCL with NVPTX

---
 clang/test/CodeGenOpenCL/test-printf-nvptx.cl | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 clang/test/CodeGenOpenCL/test-printf-nvptx.cl

diff --git a/clang/test/CodeGenOpenCL/test-printf-nvptx.cl b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl
new file mode 100644
index 0000000000000..6b8aa873a47b6
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl
@@ -0,0 +1,14 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -cl-std=CL3.0 -triple nvptx-- -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=NV %s
+
+int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)));
+
+// NV-LABEL: define dso_local spir_kernel void @test_printf(
+// NV-SAME: ) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !4 {
+// NV-NEXT:  entry:
+// NV-NEXT:    [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str) #[[ATTR2:[0-9]+]]
+// NV-NEXT:    ret void
+//
+__kernel void test_printf() {
+  printf("hello, printf on nvptx.");
+}

From 64c686105c830eab5a83cbdc62a67b07b0314072 Mon Sep 17 00:00:00 2001
From: "boxu.zhang" <boxu.zhang@hotmail.com>
Date: Tue, 20 May 2025 12:28:02 +0800
Subject: [PATCH 3/3] [Clang][OpenCL][NVPTX] using vprintf to implement builtin
 printf on OpenCL with NVPTX

---
 clang/lib/AST/ASTContext.cpp                  |  5 +++++
 clang/lib/AST/Decl.cpp                        |  4 +++-
 clang/lib/CodeGen/CGGPUBuiltin.cpp            | 11 +++++++++--
 clang/lib/Sema/SemaDecl.cpp                   |  8 ++++++--
 clang/test/CodeGenOpenCL/test-printf-nvptx.cl |  2 +-
 5 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 0395b3e47ab6f..8e3e6d3331b63 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -11481,6 +11481,11 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
     // FIXME: There's no way to have a built-in with an rvalue ref arg.
     case 'C':
       Type = Type.withConst();
+      // adjust 'const char *' to 'const char __constant *' on OpenCL
+      if (Context.getLangOpts().OpenCL &&
+          Type.getTypePtr() == Context.CharTy.getTypePtr()) {
+        Type = Context.getAddrSpaceQualType(Type, LangAS::opencl_constant);
+      }
       break;
     case 'D':
       Type = Context.getVolatileType(Type);
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index c2ea155679193..a0b334e4323c8 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -3600,8 +3600,10 @@ unsigned FunctionDecl::getBuiltinID(bool ConsiderWrapperFunctions) const {
 
   // OpenCL v1.2 s6.9.f - The library functions defined in
   // the C99 standard headers are not available.
+  // EXCEPTION: printf is supported for AMDGPU
   if (Context.getLangOpts().OpenCL &&
-      Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID))
+      Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID) &&
+      BuiltinID != Builtin::BIprintf)
     return 0;
 
   // CUDA does not have device-side standard library. printf and malloc are the
diff --git a/clang/lib/CodeGen/CGGPUBuiltin.cpp b/clang/lib/CodeGen/CGGPUBuiltin.cpp
index e465789a003eb..aa7fa5426bff7 100644
--- a/clang/lib/CodeGen/CGGPUBuiltin.cpp
+++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp
@@ -135,6 +135,7 @@ bool containsNonScalarVarargs(CodeGenFunction *CGF, const CallArgList &Args) {
 RValue EmitDevicePrintfCallExpr(const CallExpr *E, CodeGenFunction *CGF,
                                 llvm::Function *Decl, bool WithSizeArg) {
   CodeGenModule &CGM = CGF->CGM;
+  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
   CGBuilderTy &Builder = CGF->Builder;
   assert(E->getBuiltinCallee() == Builtin::BIprintf);
   assert(E->getNumArgs() >= 1); // printf always has at least one arg.
@@ -155,9 +156,15 @@ RValue EmitDevicePrintfCallExpr(const CallExpr *E, CodeGenFunction *CGF,
 
   auto r = packArgsIntoNVPTXFormatBuffer(CGF, Args);
   llvm::Value *BufferPtr = r.first;
+  llvm::Value *Fmt = Args[0].getRValue(*CGF).getScalarVal();
 
-  llvm::SmallVector<llvm::Value *, 3> Vec = {
-      Args[0].getRValue(*CGF).getScalarVal(), BufferPtr};
+  // For OpenCL, the default addrspace of 'format' argument is LangAS::opencl_constant,
+  // however, the 'vprintf' requires it to be unqualified 'ptr' type. Do pointer cast if
+  // it's the case.
+  if (CGM.getContext().getLangOpts().OpenCL)
+    Fmt = Builder.CreatePointerCast(Fmt, llvm::PointerType::getUnqual(Ctx));
+
+  llvm::SmallVector<llvm::Value *, 3> Vec = {Fmt, BufferPtr};
   if (WithSizeArg) {
     // Passing > 32bit of data as a local alloca doesn't work for nvptx or
     // amdgpu
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index ffbe317d55999..6792c768977c4 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -7008,8 +7008,12 @@ bool Sema::inferObjCARCLifetime(ValueDecl *decl) {
 }
 
 void Sema::deduceOpenCLAddressSpace(ValueDecl *Decl) {
-  if (Decl->getType().hasAddressSpace())
-    return;
+  // Address space is only meaningful for pointer type
+  if (Decl->getType()->isPointerType()) {
+    const PointerType *T = dyn_cast<PointerType>(Decl->getType().getTypePtr());
+    if (T->getPointeeType().hasAddressSpace())
+      return;
+  }
   if (Decl->getType()->isDependentType())
     return;
   if (VarDecl *Var = dyn_cast<VarDecl>(Decl)) {
diff --git a/clang/test/CodeGenOpenCL/test-printf-nvptx.cl b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl
index 6b8aa873a47b6..4a674c396c23c 100644
--- a/clang/test/CodeGenOpenCL/test-printf-nvptx.cl
+++ b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl
@@ -6,7 +6,7 @@ int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)))
 // NV-LABEL: define dso_local spir_kernel void @test_printf(
 // NV-SAME: ) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !4 {
 // NV-NEXT:  entry:
-// NV-NEXT:    [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str) #[[ATTR2:[0-9]+]]
+// NV-NEXT:    [[TMP0:%.*]] = call i32 @vprintf(ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr null)
 // NV-NEXT:    ret void
 //
 __kernel void test_printf() {