Skip to content

Commit aa00f44

Browse files
authored
Merge branch 'main' into fmv-add-feature-cssc
2 parents 5ddbfa1 + 577631f commit aa00f44

File tree

19 files changed

+462
-199
lines changed

19 files changed

+462
-199
lines changed

clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ struct ModuleDeps {
153153

154154
/// Get (or compute) the compiler invocation that can be used to build this
155155
/// module. Does not include argv[0].
156-
const std::vector<std::string> &getBuildArguments();
156+
const std::vector<std::string> &getBuildArguments() const;
157157

158158
private:
159159
friend class ModuleDepCollector;
@@ -166,7 +166,8 @@ struct ModuleDeps {
166166
/// including transitive dependencies.
167167
std::vector<std::string> FileDeps;
168168

169-
std::variant<std::monostate, CowCompilerInvocation, std::vector<std::string>>
169+
mutable std::variant<std::monostate, CowCompilerInvocation,
170+
std::vector<std::string>>
170171
BuildInfo;
171172
};
172173

clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@ void ModuleDeps::forEachFileDep(llvm::function_ref<void(StringRef)> Cb) const {
3131
}
3232
}
3333

34-
const std::vector<std::string> &ModuleDeps::getBuildArguments() {
34+
const std::vector<std::string> &ModuleDeps::getBuildArguments() const {
35+
// FIXME: this operation is not thread safe and is expected to be called
36+
// on a single thread. Otherwise it should be protected with a lock.
3537
assert(!std::holds_alternative<std::monostate>(BuildInfo) &&
3638
"Using uninitialized ModuleDeps");
3739
if (const auto *CI = std::get_if<CowCompilerInvocation>(&BuildInfo))

llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "MCTargetDesc/NVPTXBaseInfo.h"
1414
#include "NVPTX.h"
1515
#include "llvm/Analysis/ValueTracking.h"
16+
#include "llvm/IR/InlineAsm.h"
1617
#include "llvm/IR/Instructions.h"
1718
#include "llvm/Support/CommandLine.h"
1819

@@ -115,3 +116,29 @@ ModRefInfo NVPTXAAResult::getModRefInfoMask(const MemoryLocation &Loc,
115116

116117
return ModRefInfo::ModRef;
117118
}
119+
120+
MemoryEffects NVPTXAAResult::getMemoryEffects(const CallBase *Call,
121+
AAQueryInfo &AAQI) {
122+
// Inline assembly with no side-effect or memory clobbers should not
123+
// indirectly access memory in the PTX specification.
124+
if (const auto *IA = dyn_cast<InlineAsm>(Call->getCalledOperand())) {
125+
// Volatile is translated as side-effects.
126+
if (IA->hasSideEffects())
127+
return MemoryEffects::unknown();
128+
129+
for (const InlineAsm::ConstraintInfo &Constraint : IA->ParseConstraints()) {
130+
// Indirect constraints (e.g. =*m) are unsupported in inline PTX.
131+
if (Constraint.isIndirect)
132+
return MemoryEffects::unknown();
133+
134+
// Memory clobbers prevent optimization.
135+
if ((Constraint.Type & InlineAsm::ConstraintPrefix::isClobber) &&
136+
any_of(Constraint.Codes,
137+
[](const auto &Code) { return Code == "{memory}"; }))
138+
return MemoryEffects::unknown();
139+
}
140+
return MemoryEffects::none();
141+
}
142+
143+
return MemoryEffects::unknown();
144+
}

llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ class NVPTXAAResult : public AAResultBase {
3636

3737
ModRefInfo getModRefInfoMask(const MemoryLocation &Loc, AAQueryInfo &AAQI,
3838
bool IgnoreLocals);
39+
40+
MemoryEffects getMemoryEffects(const CallBase *Call, AAQueryInfo &AAQI);
41+
42+
MemoryEffects getMemoryEffects(const Function *F) {
43+
return MemoryEffects::unknown();
44+
}
3945
};
4046

4147
/// Analysis pass providing a never-invalidated alias analysis result.

llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "NVPTXTargetTransformInfo.h"
1010
#include "NVPTXUtilities.h"
11+
#include "llvm/ADT/STLExtras.h"
1112
#include "llvm/Analysis/LoopInfo.h"
1213
#include "llvm/Analysis/TargetTransformInfo.h"
1314
#include "llvm/Analysis/ValueTracking.h"
@@ -483,6 +484,35 @@ NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
483484
return std::nullopt;
484485
}
485486

487+
InstructionCost
488+
NVPTXTTIImpl::getInstructionCost(const User *U,
489+
ArrayRef<const Value *> Operands,
490+
TTI::TargetCostKind CostKind) {
491+
if (const auto *CI = dyn_cast<CallInst>(U))
492+
if (const auto *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
493+
// Without this implementation getCallCost() would return the number
494+
// of arguments+1 as the cost. Because the cost-model assumes it is a call
495+
// since it is classified as a call in the IR. A better cost model would
496+
// be to return the number of asm instructions embedded in the asm
497+
// string.
498+
auto &AsmStr = IA->getAsmString();
499+
const unsigned InstCount =
500+
count_if(split(AsmStr, ';'), [](StringRef AsmInst) {
501+
// Trim off scopes denoted by '{' and '}' as these can be ignored
502+
AsmInst = AsmInst.trim().ltrim("{} \t\n\v\f\r");
503+
// This is pretty coarse but does a reasonably good job of
504+
// identifying things that look like instructions, possibly with a
505+
// predicate ("@").
506+
return !AsmInst.empty() &&
507+
(AsmInst[0] == '@' || isAlpha(AsmInst[0]) ||
508+
AsmInst.find(".pragma") != StringRef::npos);
509+
});
510+
return InstCount * TargetTransformInfo::TCC_Basic;
511+
}
512+
513+
return BaseT::getInstructionCost(U, Operands, CostKind);
514+
}
515+
486516
InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
487517
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
488518
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,

llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,10 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
9494
// calls are particularly expensive in NVPTX.
9595
unsigned getInliningThresholdMultiplier() const { return 11; }
9696

97+
InstructionCost getInstructionCost(const User *U,
98+
ArrayRef<const Value *> Operands,
99+
TTI::TargetCostKind CostKind);
100+
97101
InstructionCost getArithmeticInstrCost(
98102
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
99103
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 81 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2247,6 +2247,36 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
22472247
}
22482248
}
22492249

2250+
/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
2251+
/// converted to a narrower recipe. \p V is used by a wide recipe \p WideMember
2252+
/// that feeds a store interleave group at index \p Idx, \p WideMember0 is the
2253+
/// recipe feeding the same interleave group at index 0. A VPWidenLoadRecipe can
2254+
/// be narrowed to an index-independent load if it feeds all wide ops at all
2255+
/// indices (checked via the operands of the wide recipe at lane0, \p
2256+
/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
2257+
/// is defined at \p Idx of a load interleave group.
2258+
static bool canNarrowLoad(VPWidenRecipe *WideMember0, VPWidenRecipe *WideMember,
2259+
VPValue *V, unsigned Idx) {
2260+
auto *DefR = V->getDefiningRecipe();
2261+
if (!DefR)
2262+
return false;
2263+
if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
2264+
return !W->getMask() &&
2265+
all_of(zip(WideMember0->operands(), WideMember->operands()),
2266+
[V](const auto P) {
2267+
// V must be at the same places in both WideMember0 and
2268+
// WideMember.
2269+
const auto &[WideMember0Op, WideMemberOp] = P;
2270+
return (WideMember0Op == V) == (WideMemberOp == V);
2271+
});
2272+
2273+
if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
2274+
return IR->getInterleaveGroup()->getFactor() ==
2275+
IR->getInterleaveGroup()->getNumMembers() &&
2276+
IR->getVPValue(Idx) == V;
2277+
return false;
2278+
}
2279+
22502280
/// Returns true if \p IR is a full interleave group with factor and number of
22512281
/// members both equal to \p VF. The interleave group must also access the full
22522282
/// vector width \p VectorRegWidth.
@@ -2284,7 +2314,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
22842314
unsigned VectorRegWidth) {
22852315
using namespace llvm::VPlanPatternMatch;
22862316
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
2287-
if (VF.isScalable() || !VectorLoop)
2317+
if (VF.isScalable() || !VectorLoop || Plan.getUF() != 1)
22882318
return;
22892319

22902320
VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
@@ -2309,6 +2339,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
23092339
if (R.mayWriteToMemory() && !InterleaveR)
23102340
return;
23112341

2342+
// All other ops are allowed, but we reject uses that cannot be converted
2343+
// when checking all allowed consumers (store interleave groups) below.
23122344
if (!InterleaveR)
23132345
continue;
23142346

@@ -2323,7 +2355,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
23232355

23242356
// For now, we only support full interleave groups storing load interleave
23252357
// groups.
2326-
if (!all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
2358+
if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
23272359
VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
23282360
if (!DefR)
23292361
return false;
@@ -2333,31 +2365,67 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
23332365
IR->getInterleaveGroup()->getNumMembers() &&
23342366
IR->getVPValue(Op.index()) == Op.value();
23352367
})) {
2368+
StoreGroups.push_back(InterleaveR);
2369+
continue;
2370+
}
2371+
2372+
// Check if all values feeding InterleaveR are matching wide recipes, which
2373+
// operands that can be narrowed.
2374+
auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
2375+
InterleaveR->getStoredValues()[0]->getDefiningRecipe());
2376+
if (!WideMember0)
23362377
return;
2378+
for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
2379+
auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
2380+
if (!R || R->getOpcode() != WideMember0->getOpcode() ||
2381+
R->getNumOperands() > 2)
2382+
return;
2383+
if (any_of(R->operands(), [WideMember0, Idx = I, R](VPValue *V) {
2384+
return !canNarrowLoad(WideMember0, R, V, Idx);
2385+
}))
2386+
return;
23372387
}
23382388
StoreGroups.push_back(InterleaveR);
23392389
}
23402390

23412391
if (StoreGroups.empty())
23422392
return;
23432393

2344-
// Convert InterleaveGroup R to a single VPWidenLoadRecipe.
2394+
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
23452395
auto NarrowOp = [](VPRecipeBase *R) -> VPValue * {
2346-
auto *LoadGroup = cast<VPInterleaveRecipe>(R);
2347-
// Narrow interleave group to wide load, as transformed VPlan will only
2396+
if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
2397+
// Narrow interleave group to wide load, as transformed VPlan will only
2398+
// process one original iteration.
2399+
auto *L = new VPWidenLoadRecipe(
2400+
*cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
2401+
LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
2402+
/*Reverse=*/false, LoadGroup->getDebugLoc());
2403+
L->insertBefore(LoadGroup);
2404+
return L;
2405+
}
2406+
2407+
auto *WideLoad = cast<VPWidenLoadRecipe>(R);
2408+
2409+
// Narrow wide load to uniform scalar load, as transformed VPlan will only
23482410
// process one original iteration.
2349-
auto *L = new VPWidenLoadRecipe(
2350-
*cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
2351-
LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
2352-
/*Reverse=*/false, LoadGroup->getDebugLoc());
2353-
L->insertBefore(LoadGroup);
2354-
return L;
2411+
auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
2412+
WideLoad->operands(), /*IsUniform*/ true);
2413+
N->insertBefore(WideLoad);
2414+
return N;
23552415
};
23562416

23572417
// Narrow operation tree rooted at store groups.
23582418
for (auto *StoreGroup : StoreGroups) {
2359-
VPValue *Res =
2360-
NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
2419+
VPValue *Res = nullptr;
2420+
if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(
2421+
StoreGroup->getStoredValues()[0]->getDefiningRecipe())) {
2422+
for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
2423+
WideMember0->setOperand(
2424+
Idx, NarrowOp(WideMember0->getOperand(Idx)->getDefiningRecipe()));
2425+
Res = WideMember0;
2426+
} else {
2427+
Res = NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
2428+
}
23612429

23622430
auto *S = new VPWidenStoreRecipe(
23632431
*cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
2+
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output < %s | FileCheck %s
3+
4+
target triple = "nvptx64-nvidia-cuda"
5+
6+
define void @test1() {
7+
; CHECK-LABEL: 'test1'
8+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call double asm "rsqrt.approx.ftz.f64 $0, $1;", "=d,d"(double 1.000000e+00)
9+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call { i32, i32 } asm "{\0A\09mad.lo.cc.u32 $0, $2, $3, $4;\0A\09madc.hi.u32 $1, $2, $3, 0;\0A\09}", "=r,=r,r,r,r"(i32 2, i32 3, i32 3)
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09vote.ballot.b32 \09$0, %p1; \0A\09}", "=r,r"(i32 0)
11+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09@%p1 exit; \0A\09}", "=r,r"(i32 0)
12+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void asm sideeffect ".pragma \22nounroll\22;\0A\09", "~{memory}"()
13+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
14+
;
15+
%1 = call double asm "rsqrt.approx.ftz.f64 $0, $1;", "=d,d"(double 1.0)
16+
%2 = call { i32, i32 } asm "{\0A\09mad.lo.cc.u32 $0, $2, $3, $4;\0A\09madc.hi.u32 $1, $2, $3, 0;\0A\09}", "=r,=r,r,r,r"(i32 2, i32 3, i32 3)
17+
%3 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09vote.ballot.b32 \09$0, %p1; \0A\09}", "=r,r"(i32 0)
18+
%4 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09@%p1 exit; \0A\09}", "=r,r"(i32 0)
19+
call void asm sideeffect ".pragma \22nounroll\22;\0A\09", "~{memory}"()
20+
ret void
21+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
if not "NVPTX" in config.root.targets:
2+
config.unsupported = True
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
; RUN: opt -passes=aa-eval -aa-pipeline=nvptx-aa,basic-aa -print-all-alias-modref-info < %s -disable-output 2>&1 \
2+
; RUN: | FileCheck %s --check-prefixes CHECK-ALIAS
3+
4+
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
;;CHECK-ALIAS-LABEL: Function: test_sideeffect
8+
;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> call
9+
define void @test_sideeffect(ptr %out) {
10+
entry:
11+
%0 = addrspacecast ptr %out to ptr addrspace(1)
12+
call void asm sideeffect "membar.gl;", ""()
13+
store i32 5, ptr addrspace(1) %0, align 4
14+
ret void
15+
}
16+
17+
;;CHECK-ALIAS-LABEL: Function: test_indirect
18+
;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> %1 = call
19+
define i32 @test_indirect(ptr %out) {
20+
entry:
21+
%0 = addrspacecast ptr %out to ptr addrspace(1)
22+
store i32 0, ptr addrspace(1) %0, align 4
23+
%1 = call i32 asm "ld.global.u32 $0, [$1];", "=r,*m"(ptr addrspace(1) elementtype(i32) %0)
24+
store i32 0, ptr addrspace(1) %0, align 4
25+
ret i32 %1
26+
}
27+
28+
;;CHECK-ALIAS-LABEL: Function: test_memory
29+
;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> %1 = call
30+
define i32 @test_memory(ptr %out) {
31+
entry:
32+
%0 = addrspacecast ptr %out to ptr addrspace(1)
33+
store i32 0, ptr addrspace(1) %0, align 4
34+
%1 = call i32 asm "ld.global.u32 $0, [$1];", "=r,l,~{memory}"(ptr addrspace(1) %0)
35+
store i32 0, ptr addrspace(1) %0, align 4
36+
ret i32 %1
37+
}
38+
39+
;;CHECK-ALIAS-LABEL: Function: test_no_sideeffect
40+
;;CHECK-ALIAS: NoModRef: Ptr: i32* %0 <-> %1 = call
41+
define void @test_no_sideeffect(ptr %in, ptr %out) {
42+
entry:
43+
%0 = addrspacecast ptr %out to ptr addrspace(1)
44+
%1 = call i32 asm "cvt.u32.u64 $0, $1;", "=r,l"(ptr %in)
45+
store i32 %1, ptr addrspace(1) %0, align 4
46+
ret void
47+
}

0 commit comments

Comments
 (0)