Skip to content

Commit aa00f44

Browse files
authored
Merge branch 'main' into fmv-add-feature-cssc
2 parents 5ddbfa1 + 577631f commit aa00f44

File tree

19 files changed

+462
-199
lines changed

19 files changed

+462
-199
lines changed

clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ struct ModuleDeps {
153153

154154
/// Get (or compute) the compiler invocation that can be used to build this
155155
/// module. Does not include argv[0].
156-
const std::vector<std::string> &getBuildArguments();
156+
const std::vector<std::string> &getBuildArguments() const;
157157

158158
private:
159159
friend class ModuleDepCollector;
@@ -166,7 +166,8 @@ struct ModuleDeps {
166166
/// including transitive dependencies.
167167
std::vector<std::string> FileDeps;
168168

169-
std::variant<std::monostate, CowCompilerInvocation, std::vector<std::string>>
169+
mutable std::variant<std::monostate, CowCompilerInvocation,
170+
std::vector<std::string>>
170171
BuildInfo;
171172
};
172173

clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@ void ModuleDeps::forEachFileDep(llvm::function_ref<void(StringRef)> Cb) const {
3131
}
3232
}
3333

34-
const std::vector<std::string> &ModuleDeps::getBuildArguments() {
34+
const std::vector<std::string> &ModuleDeps::getBuildArguments() const {
35+
// FIXME: this operation is not thread safe and is expected to be called
36+
// on a single thread. Otherwise it should be protected with a lock.
3537
assert(!std::holds_alternative<std::monostate>(BuildInfo) &&
3638
"Using uninitialized ModuleDeps");
3739
if (const auto *CI = std::get_if<CowCompilerInvocation>(&BuildInfo))

llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "MCTargetDesc/NVPTXBaseInfo.h"
1414
#include "NVPTX.h"
1515
#include "llvm/Analysis/ValueTracking.h"
16+
#include "llvm/IR/InlineAsm.h"
1617
#include "llvm/IR/Instructions.h"
1718
#include "llvm/Support/CommandLine.h"
1819

@@ -115,3 +116,29 @@ ModRefInfo NVPTXAAResult::getModRefInfoMask(const MemoryLocation &Loc,
115116

116117
return ModRefInfo::ModRef;
117118
}
119+
120+
MemoryEffects NVPTXAAResult::getMemoryEffects(const CallBase *Call,
121+
AAQueryInfo &AAQI) {
122+
// Inline assembly with no side-effect or memory clobbers should not
123+
// indirectly access memory in the PTX specification.
124+
if (const auto *IA = dyn_cast<InlineAsm>(Call->getCalledOperand())) {
125+
// Volatile is translated as side-effects.
126+
if (IA->hasSideEffects())
127+
return MemoryEffects::unknown();
128+
129+
for (const InlineAsm::ConstraintInfo &Constraint : IA->ParseConstraints()) {
130+
// Indirect constraints (e.g. =*m) are unsupported in inline PTX.
131+
if (Constraint.isIndirect)
132+
return MemoryEffects::unknown();
133+
134+
// Memory clobbers prevent optimization.
135+
if ((Constraint.Type & InlineAsm::ConstraintPrefix::isClobber) &&
136+
any_of(Constraint.Codes,
137+
[](const auto &Code) { return Code == "{memory}"; }))
138+
return MemoryEffects::unknown();
139+
}
140+
return MemoryEffects::none();
141+
}
142+
143+
return MemoryEffects::unknown();
144+
}

llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ class NVPTXAAResult : public AAResultBase {
3636

3737
ModRefInfo getModRefInfoMask(const MemoryLocation &Loc, AAQueryInfo &AAQI,
3838
bool IgnoreLocals);
39+
40+
MemoryEffects getMemoryEffects(const CallBase *Call, AAQueryInfo &AAQI);
41+
42+
MemoryEffects getMemoryEffects(const Function *F) {
43+
return MemoryEffects::unknown();
44+
}
3945
};
4046

4147
/// Analysis pass providing a never-invalidated alias analysis result.

llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "NVPTXTargetTransformInfo.h"
1010
#include "NVPTXUtilities.h"
11+
#include "llvm/ADT/STLExtras.h"
1112
#include "llvm/Analysis/LoopInfo.h"
1213
#include "llvm/Analysis/TargetTransformInfo.h"
1314
#include "llvm/Analysis/ValueTracking.h"
@@ -483,6 +484,35 @@ NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
483484
return std::nullopt;
484485
}
485486

487+
InstructionCost
488+
NVPTXTTIImpl::getInstructionCost(const User *U,
489+
ArrayRef<const Value *> Operands,
490+
TTI::TargetCostKind CostKind) {
491+
if (const auto *CI = dyn_cast<CallInst>(U))
492+
if (const auto *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
493+
// Without this implementation getCallCost() would return the number
494+
// of arguments+1 as the cost. Because the cost-model assumes it is a call
495+
// since it is classified as a call in the IR. A better cost model would
496+
// be to return the number of asm instructions embedded in the asm
497+
// string.
498+
auto &AsmStr = IA->getAsmString();
499+
const unsigned InstCount =
500+
count_if(split(AsmStr, ';'), [](StringRef AsmInst) {
501+
// Trim off scopes denoted by '{' and '}' as these can be ignored
502+
AsmInst = AsmInst.trim().ltrim("{} \t\n\v\f\r");
503+
// This is pretty coarse but does a reasonably good job of
504+
// identifying things that look like instructions, possibly with a
505+
// predicate ("@").
506+
return !AsmInst.empty() &&
507+
(AsmInst[0] == '@' || isAlpha(AsmInst[0]) ||
508+
AsmInst.find(".pragma") != StringRef::npos);
509+
});
510+
return InstCount * TargetTransformInfo::TCC_Basic;
511+
}
512+
513+
return BaseT::getInstructionCost(U, Operands, CostKind);
514+
}
515+
486516
InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
487517
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
488518
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,

llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,10 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
9494
// calls are particularly expensive in NVPTX.
9595
unsigned getInliningThresholdMultiplier() const { return 11; }
9696

97+
InstructionCost getInstructionCost(const User *U,
98+
ArrayRef<const Value *> Operands,
99+
TTI::TargetCostKind CostKind);
100+
97101
InstructionCost getArithmeticInstrCost(
98102
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
99103
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 81 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2247,6 +2247,36 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
22472247
}
22482248
}
22492249

2250+
/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
2251+
/// converted to a narrower recipe. \p V is used by a wide recipe \p WideMember
2252+
/// that feeds a store interleave group at index \p Idx, \p WideMember0 is the
2253+
/// recipe feeding the same interleave group at index 0. A VPWidenLoadRecipe can
2254+
/// be narrowed to an index-independent load if it feeds all wide ops at all
2255+
/// indices (checked via the operands of the wide recipe at lane0, \p
2256+
/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
2257+
/// is defined at \p Idx of a load interleave group.
2258+
static bool canNarrowLoad(VPWidenRecipe *WideMember0, VPWidenRecipe *WideMember,
2259+
VPValue *V, unsigned Idx) {
2260+
auto *DefR = V->getDefiningRecipe();
2261+
if (!DefR)
2262+
return false;
2263+
if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
2264+
return !W->getMask() &&
2265+
all_of(zip(WideMember0->operands(), WideMember->operands()),
2266+
[V](const auto P) {
2267+
// V must be at the same places in both WideMember0 and
2268+
// WideMember.
2269+
const auto &[WideMember0Op, WideMemberOp] = P;
2270+
return (WideMember0Op == V) == (WideMemberOp == V);
2271+
});
2272+
2273+
if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
2274+
return IR->getInterleaveGroup()->getFactor() ==
2275+
IR->getInterleaveGroup()->getNumMembers() &&
2276+
IR->getVPValue(Idx) == V;
2277+
return false;
2278+
}
2279+
22502280
/// Returns true if \p IR is a full interleave group with factor and number of
22512281
/// members both equal to \p VF. The interleave group must also access the full
22522282
/// vector width \p VectorRegWidth.
@@ -2284,7 +2314,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
22842314
unsigned VectorRegWidth) {
22852315
using namespace llvm::VPlanPatternMatch;
22862316
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
2287-
if (VF.isScalable() || !VectorLoop)
2317+
if (VF.isScalable() || !VectorLoop || Plan.getUF() != 1)
22882318
return;
22892319

22902320
VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
@@ -2309,6 +2339,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
23092339
if (R.mayWriteToMemory() && !InterleaveR)
23102340
return;
23112341

2342+
// All other ops are allowed, but we reject uses that cannot be converted
2343+
// when checking all allowed consumers (store interleave groups) below.
23122344
if (!InterleaveR)
23132345
continue;
23142346

@@ -2323,7 +2355,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
23232355

23242356
// For now, we only support full interleave groups storing load interleave
23252357
// groups.
2326-
if (!all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
2358+
if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
23272359
VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
23282360
if (!DefR)
23292361
return false;
@@ -2333,31 +2365,67 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
23332365
IR->getInterleaveGroup()->getNumMembers() &&
23342366
IR->getVPValue(Op.index()) == Op.value();
23352367
})) {
2368+
StoreGroups.push_back(InterleaveR);
2369+
continue;
2370+
}
2371+
2372+
// Check if all values feeding InterleaveR are matching wide recipes, which
2373+
// operands that can be narrowed.
2374+
auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
2375+
InterleaveR->getStoredValues()[0]->getDefiningRecipe());
2376+
if (!WideMember0)
23362377
return;
2378+
for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
2379+
auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
2380+
if (!R || R->getOpcode() != WideMember0->getOpcode() ||
2381+
R->getNumOperands() > 2)
2382+
return;
2383+
if (any_of(R->operands(), [WideMember0, Idx = I, R](VPValue *V) {
2384+
return !canNarrowLoad(WideMember0, R, V, Idx);
2385+
}))
2386+
return;
23372387
}
23382388
StoreGroups.push_back(InterleaveR);
23392389
}
23402390

23412391
if (StoreGroups.empty())
23422392
return;
23432393

2344-
// Convert InterleaveGroup R to a single VPWidenLoadRecipe.
2394+
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
23452395
auto NarrowOp = [](VPRecipeBase *R) -> VPValue * {
2346-
auto *LoadGroup = cast<VPInterleaveRecipe>(R);
2347-
// Narrow interleave group to wide load, as transformed VPlan will only
2396+
if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
2397+
// Narrow interleave group to wide load, as transformed VPlan will only
2398+
// process one original iteration.
2399+
auto *L = new VPWidenLoadRecipe(
2400+
*cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
2401+
LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
2402+
/*Reverse=*/false, LoadGroup->getDebugLoc());
2403+
L->insertBefore(LoadGroup);
2404+
return L;
2405+
}
2406+
2407+
auto *WideLoad = cast<VPWidenLoadRecipe>(R);
2408+
2409+
// Narrow wide load to uniform scalar load, as transformed VPlan will only
23482410
// process one original iteration.
2349-
auto *L = new VPWidenLoadRecipe(
2350-
*cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
2351-
LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
2352-
/*Reverse=*/false, LoadGroup->getDebugLoc());
2353-
L->insertBefore(LoadGroup);
2354-
return L;
2411+
auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
2412+
WideLoad->operands(), /*IsUniform*/ true);
2413+
N->insertBefore(WideLoad);
2414+
return N;
23552415
};
23562416

23572417
// Narrow operation tree rooted at store groups.
23582418
for (auto *StoreGroup : StoreGroups) {
2359-
VPValue *Res =
2360-
NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
2419+
VPValue *Res = nullptr;
2420+
if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(
2421+
StoreGroup->getStoredValues()[0]->getDefiningRecipe())) {
2422+
for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
2423+
WideMember0->setOperand(
2424+
Idx, NarrowOp(WideMember0->getOperand(Idx)->getDefiningRecipe()));
2425+
Res = WideMember0;
2426+
} else {
2427+
Res = NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
2428+
}
23612429

23622430
auto *S = new VPWidenStoreRecipe(
23632431
*cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
2+
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output < %s | FileCheck %s
3+
4+
target triple = "nvptx64-nvidia-cuda"
5+
6+
define void @test1() {
7+
; CHECK-LABEL: 'test1'
8+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call double asm "rsqrt.approx.ftz.f64 $0, $1;", "=d,d"(double 1.000000e+00)
9+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call { i32, i32 } asm "{\0A\09mad.lo.cc.u32 $0, $2, $3, $4;\0A\09madc.hi.u32 $1, $2, $3, 0;\0A\09}", "=r,=r,r,r,r"(i32 2, i32 3, i32 3)
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09vote.ballot.b32 \09$0, %p1; \0A\09}", "=r,r"(i32 0)
11+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09@%p1 exit; \0A\09}", "=r,r"(i32 0)
12+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void asm sideeffect ".pragma \22nounroll\22;\0A\09", "~{memory}"()
13+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
14+
;
15+
%1 = call double asm "rsqrt.approx.ftz.f64 $0, $1;", "=d,d"(double 1.0)
16+
%2 = call { i32, i32 } asm "{\0A\09mad.lo.cc.u32 $0, $2, $3, $4;\0A\09madc.hi.u32 $1, $2, $3, 0;\0A\09}", "=r,=r,r,r,r"(i32 2, i32 3, i32 3)
17+
%3 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09vote.ballot.b32 \09$0, %p1; \0A\09}", "=r,r"(i32 0)
18+
%4 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09@%p1 exit; \0A\09}", "=r,r"(i32 0)
19+
call void asm sideeffect ".pragma \22nounroll\22;\0A\09", "~{memory}"()
20+
ret void
21+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
if not "NVPTX" in config.root.targets:
2+
config.unsupported = True
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
; RUN: opt -passes=aa-eval -aa-pipeline=nvptx-aa,basic-aa -print-all-alias-modref-info < %s -disable-output 2>&1 \
2+
; RUN: | FileCheck %s --check-prefixes CHECK-ALIAS
3+
4+
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
;;CHECK-ALIAS-LABEL: Function: test_sideeffect
8+
;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> call
9+
define void @test_sideeffect(ptr %out) {
10+
entry:
11+
%0 = addrspacecast ptr %out to ptr addrspace(1)
12+
call void asm sideeffect "membar.gl;", ""()
13+
store i32 5, ptr addrspace(1) %0, align 4
14+
ret void
15+
}
16+
17+
;;CHECK-ALIAS-LABEL: Function: test_indirect
18+
;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> %1 = call
19+
define i32 @test_indirect(ptr %out) {
20+
entry:
21+
%0 = addrspacecast ptr %out to ptr addrspace(1)
22+
store i32 0, ptr addrspace(1) %0, align 4
23+
%1 = call i32 asm "ld.global.u32 $0, [$1];", "=r,*m"(ptr addrspace(1) elementtype(i32) %0)
24+
store i32 0, ptr addrspace(1) %0, align 4
25+
ret i32 %1
26+
}
27+
28+
;;CHECK-ALIAS-LABEL: Function: test_memory
29+
;;CHECK-ALIAS: Both ModRef: Ptr: i32* %0 <-> %1 = call
30+
define i32 @test_memory(ptr %out) {
31+
entry:
32+
%0 = addrspacecast ptr %out to ptr addrspace(1)
33+
store i32 0, ptr addrspace(1) %0, align 4
34+
%1 = call i32 asm "ld.global.u32 $0, [$1];", "=r,l,~{memory}"(ptr addrspace(1) %0)
35+
store i32 0, ptr addrspace(1) %0, align 4
36+
ret i32 %1
37+
}
38+
39+
;;CHECK-ALIAS-LABEL: Function: test_no_sideeffect
40+
;;CHECK-ALIAS: NoModRef: Ptr: i32* %0 <-> %1 = call
41+
define void @test_no_sideeffect(ptr %in, ptr %out) {
42+
entry:
43+
%0 = addrspacecast ptr %out to ptr addrspace(1)
44+
%1 = call i32 asm "cvt.u32.u64 $0, $1;", "=r,l"(ptr %in)
45+
store i32 %1, ptr addrspace(1) %0, align 4
46+
ret void
47+
}

0 commit comments

Comments
 (0)