From a3e883a4dc55029b8a7e3c591c3bc85289869686 Mon Sep 17 00:00:00 2001 From: "boxu.zhang" Date: Wed, 20 Dec 2023 17:35:25 +0800 Subject: [PATCH 1/3] Make 'UnrollMaxUpperBound' to be overridable by target. The default value is still 8 and the command line argument '--unroll-max-upperbound' takes final effect if provided. --- llvm/include/llvm/Analysis/TargetTransformInfo.h | 4 ++++ llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 9 ++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index f5114fa40c70a..735be3680aea0 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -560,6 +560,10 @@ class TargetTransformInfo { // (set to UINT_MAX to disable). This does not apply in cases where the // loop is being fully unrolled. unsigned MaxCount; + /// Set the maximum upper bound of trip count. Allowing the MaxUpperBound + /// to be overrided by a target gives more flexiblity on certain cases. + /// By default, MaxUpperBound uses UnrollMaxUpperBound which value is 8. + unsigned MaxUpperBound; /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but /// applies even if full unrolling is selected. This allows a target to fall /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount. diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index f14541a1a037e..7cfeb019af972 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -200,6 +200,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.Count = 0; UP.DefaultUnrollRuntimeCount = 8; UP.MaxCount = std::numeric_limits::max(); + UP.MaxUpperBound = UnrollMaxUpperBound; UP.FullUnrollMaxCount = std::numeric_limits::max(); UP.BEInsns = 2; UP.Partial = false; @@ -237,6 +238,8 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost; if (UnrollMaxCount.getNumOccurrences() > 0) UP.MaxCount = UnrollMaxCount; + if (UnrollMaxUpperBound.getNumOccurrences() > 0) + UP.MaxUpperBound = UnrollMaxUpperBound; if (UnrollFullMaxCount.getNumOccurrences() > 0) UP.FullUnrollMaxCount = UnrollFullMaxCount; if (UnrollAllowPartial.getNumOccurrences() > 0) @@ -777,7 +780,7 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo, return TripCount; if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount && - MaxTripCount <= UnrollMaxUpperBound) + MaxTripCount <= UP.MaxUpperBound) return MaxTripCount; // if didn't return until here, should continue to other priorties @@ -952,7 +955,7 @@ bool llvm::computeUnrollCount( // cost of exact full unrolling. As such, if we have an exact count and // found it unprofitable, we'll never chose to bounded unroll. if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) && - MaxTripCount <= UnrollMaxUpperBound) { + MaxTripCount <= UP.MaxUpperBound) { UP.Count = MaxTripCount; if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues, MaxTripCount, UCE, UP)) { @@ -1026,7 +1029,7 @@ bool llvm::computeUnrollCount( } // Don't unroll a small upper bound loop unless user or TTI asked to do so. - if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) { + if (MaxTripCount && !UP.Force && MaxTripCount < UP.MaxUpperBound) { UP.Count = 0; return false; } From ecc3e286cfe97ff033c00806658f6e69844c5434 Mon Sep 17 00:00:00 2001 From: "boxu.zhang" Date: Tue, 20 May 2025 12:22:45 +0800 Subject: [PATCH 2/3] [Clang][OpenCL][NVPTX] precommit test for using vprintf to implement builtin printf on OpenCL with NVPTX --- clang/test/CodeGenOpenCL/test-printf-nvptx.cl | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 clang/test/CodeGenOpenCL/test-printf-nvptx.cl diff --git a/clang/test/CodeGenOpenCL/test-printf-nvptx.cl b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl new file mode 100644 index 0000000000000..6b8aa873a47b6 --- /dev/null +++ b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl @@ -0,0 +1,14 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -cl-std=CL3.0 -triple nvptx-- -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=NV %s + +int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))); + +// NV-LABEL: define dso_local spir_kernel void @test_printf( +// NV-SAME: ) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !4 { +// NV-NEXT: entry: +// NV-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str) #[[ATTR2:[0-9]+]] +// NV-NEXT: ret void +// +__kernel void test_printf() { + printf("hello, printf on nvptx."); +} From 64c686105c830eab5a83cbdc62a67b07b0314072 Mon Sep 17 00:00:00 2001 From: "boxu.zhang" Date: Tue, 20 May 2025 12:28:02 +0800 Subject: [PATCH 3/3] [Clang][OpenCL][NVPTX] using vprintf to implement builtin printf on OpenCL with NVPTX --- clang/lib/AST/ASTContext.cpp | 5 +++++ clang/lib/AST/Decl.cpp | 4 +++- clang/lib/CodeGen/CGGPUBuiltin.cpp | 11 +++++++++-- clang/lib/Sema/SemaDecl.cpp | 8 ++++++-- clang/test/CodeGenOpenCL/test-printf-nvptx.cl | 2 +- 5 files changed, 24 insertions(+), 6 deletions(-) diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 0395b3e47ab6f..8e3e6d3331b63 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -11481,6 +11481,11 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, // FIXME: There's no way to have a built-in with an rvalue ref arg. case 'C': Type = Type.withConst(); + // adjust 'const char *' to 'const char __constant *' on OpenCL + if (Context.getLangOpts().OpenCL && + Type.getTypePtr() == Context.CharTy.getTypePtr()) { + Type = Context.getAddrSpaceQualType(Type, LangAS::opencl_constant); + } break; case 'D': Type = Context.getVolatileType(Type); diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index c2ea155679193..a0b334e4323c8 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3600,8 +3600,10 @@ unsigned FunctionDecl::getBuiltinID(bool ConsiderWrapperFunctions) const { // OpenCL v1.2 s6.9.f - The library functions defined in // the C99 standard headers are not available. + // EXCEPTION: printf is supported for AMDGPU if (Context.getLangOpts().OpenCL && - Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID)) + Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID) && + BuiltinID != Builtin::BIprintf) return 0; // CUDA does not have device-side standard library. printf and malloc are the diff --git a/clang/lib/CodeGen/CGGPUBuiltin.cpp b/clang/lib/CodeGen/CGGPUBuiltin.cpp index e465789a003eb..aa7fa5426bff7 100644 --- a/clang/lib/CodeGen/CGGPUBuiltin.cpp +++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp @@ -135,6 +135,7 @@ bool containsNonScalarVarargs(CodeGenFunction *CGF, const CallArgList &Args) { RValue EmitDevicePrintfCallExpr(const CallExpr *E, CodeGenFunction *CGF, llvm::Function *Decl, bool WithSizeArg) { CodeGenModule &CGM = CGF->CGM; + llvm::LLVMContext &Ctx = CGM.getLLVMContext(); CGBuilderTy &Builder = CGF->Builder; assert(E->getBuiltinCallee() == Builtin::BIprintf); assert(E->getNumArgs() >= 1); // printf always has at least one arg. @@ -155,9 +156,15 @@ RValue EmitDevicePrintfCallExpr(const CallExpr *E, CodeGenFunction *CGF, auto r = packArgsIntoNVPTXFormatBuffer(CGF, Args); llvm::Value *BufferPtr = r.first; + llvm::Value *Fmt = Args[0].getRValue(*CGF).getScalarVal(); - llvm::SmallVector Vec = { - Args[0].getRValue(*CGF).getScalarVal(), BufferPtr}; + // For OpenCL, the default addrspace of 'format' argument is LangAS::opencl_constant, + // however, the 'vprintf' requires it to be unqualified 'ptr' type. Do pointer cast if + // it's the case. + if (CGM.getContext().getLangOpts().OpenCL) + Fmt = Builder.CreatePointerCast(Fmt, llvm::PointerType::getUnqual(Ctx)); + + llvm::SmallVector Vec = {Fmt, BufferPtr}; if (WithSizeArg) { // Passing > 32bit of data as a local alloca doesn't work for nvptx or // amdgpu diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index ffbe317d55999..6792c768977c4 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -7008,8 +7008,12 @@ bool Sema::inferObjCARCLifetime(ValueDecl *decl) { } void Sema::deduceOpenCLAddressSpace(ValueDecl *Decl) { - if (Decl->getType().hasAddressSpace()) - return; + // Address space is only meaningful for pointer type + if (Decl->getType()->isPointerType()) { + const PointerType *T = dyn_cast(Decl->getType().getTypePtr()); + if (T->getPointeeType().hasAddressSpace()) + return; + } if (Decl->getType()->isDependentType()) return; if (VarDecl *Var = dyn_cast(Decl)) { diff --git a/clang/test/CodeGenOpenCL/test-printf-nvptx.cl b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl index 6b8aa873a47b6..4a674c396c23c 100644 --- a/clang/test/CodeGenOpenCL/test-printf-nvptx.cl +++ b/clang/test/CodeGenOpenCL/test-printf-nvptx.cl @@ -6,7 +6,7 @@ int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))) // NV-LABEL: define dso_local spir_kernel void @test_printf( // NV-SAME: ) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !4 { // NV-NEXT: entry: -// NV-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str) #[[ATTR2:[0-9]+]] +// NV-NEXT: [[TMP0:%.*]] = call i32 @vprintf(ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr null) // NV-NEXT: ret void // __kernel void test_printf() {