[NVPTX] Vectorize loads when lowering of byval calls, misc. cleanup #151070

AlexMaclean · 2025-07-29T02:04:00Z

This change rewrites LowerCall handling of byval arguments to vectorize the loads in addition to the stores. In addition various minor NFC updates and cleanups are made to reduce code duplication.

llvmbot · 2025-07-29T02:04:32Z

@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

This change rewrites LowerCall handling of byval arguments to vectorize the loads in addition to the stores. In addition various minor NFC updates and cleanups are made to reduce code duplication.

Patch is 41.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151070.diff

6 Files Affected:

(modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+179-185)
(modified) llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (-6)
(added) llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll (+38)
(modified) llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll (+12-12)
(modified) llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll (+33-33)
(modified) llvm/test/CodeGen/NVPTX/param-vectorize-device.ll (+11-11)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index f79b8629f01e2..0a4726212bf12 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -382,6 +382,51 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
   }
 }
 
+static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {
+  if (N == 1)
+    return VT;
+
+  const unsigned PackingAmt = VT.isVector() ? VT.getVectorNumElements() : 1;
+  return EVT::getVectorVT(C, VT.getScalarType(), N * PackingAmt);
+}
+
+static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT,
+                                         const SDLoc &dl, SelectionDAG &DAG) {
+  if (V.getValueType() == VT) {
+    assert(I == 0 && "Index must be 0 for scalar value");
+    return V;
+  }
+
+  if (!VT.isVector())
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, V,
+                       DAG.getVectorIdxConstant(I, dl));
+
+  return DAG.getNode(
+      ISD::EXTRACT_SUBVECTOR, dl, VT, V,
+      DAG.getVectorIdxConstant(I * VT.getVectorNumElements(), dl));
+}
+
+template <typename T>
+static inline SDValue getBuildVectorizedValue(T GetElement, unsigned N,
+                                              const SDLoc &dl,
+                                              SelectionDAG &DAG) {
+  if (N == 1)
+    return GetElement(0);
+
+  SmallVector<SDValue, 8> Values;
+  for (const unsigned I : llvm::seq(N)) {
+    SDValue Val = GetElement(I);
+    if (Val.getValueType().isVector())
+      DAG.ExtractVectorElements(Val, Values);
+    else
+      Values.push_back(Val);
+  }
+
+  EVT VT = EVT::getVectorVT(*DAG.getContext(), Values[0].getValueType(),
+                            Values.size());
+  return DAG.getBuildVector(VT, dl, Values);
+}
+
 /// PromoteScalarIntegerPTX
 /// Used to make sure the arguments/returns are suitable for passing
 /// and promote them to a larger size if they're not.
@@ -420,9 +465,10 @@ static EVT promoteScalarIntegerPTX(const EVT VT) {
 // parameter starting at index Idx using a single vectorized op of
 // size AccessSize. If so, it returns the number of param pieces
 // covered by the vector op. Otherwise, it returns 1.
-static unsigned CanMergeParamLoadStoresStartingAt(
+template <typename T>
+static unsigned canMergeParamLoadStoresStartingAt(
     unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
-    const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
+    const SmallVectorImpl<T> &Offsets, Align ParamAlignment) {
 
   // Can't vectorize if param alignment is not sufficient.
   if (ParamAlignment < AccessSize)
@@ -472,10 +518,11 @@ static unsigned CanMergeParamLoadStoresStartingAt(
 // of the same size as ValueVTs indicating how each piece should be
 // loaded/stored (i.e. as a scalar, or as part of a vector
 // load/store).
+template <typename T>
 static SmallVector<unsigned, 16>
 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
-                     const SmallVectorImpl<uint64_t> &Offsets,
-                     Align ParamAlignment, bool IsVAArg = false) {
+                     const SmallVectorImpl<T> &Offsets, Align ParamAlignment,
+                     bool IsVAArg = false) {
   // Set vector size to match ValueVTs and mark all elements as
   // scalars by default.
 
@@ -486,7 +533,7 @@ VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
 
   const auto GetNumElts = [&](unsigned I) -> unsigned {
     for (const unsigned AccessSize : {16, 8, 4, 2}) {
-      const unsigned NumElts = CanMergeParamLoadStoresStartingAt(
+      const unsigned NumElts = canMergeParamLoadStoresStartingAt(
           I, AccessSize, ValueVTs, Offsets, ParamAlignment);
       assert((NumElts == 1 || NumElts == 2 || NumElts == 4) &&
              "Unexpected vectorization size");
@@ -1476,15 +1523,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     const SDValue ParamSymbol =
         getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
 
-    SmallVector<EVT, 16> VTs;
-    SmallVector<uint64_t, 16> Offsets;
-
     assert((!IsByVal || Arg.IndirectType) &&
            "byval arg must have indirect type");
     Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty);
-    ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
-    assert(VTs.size() == Offsets.size() && "Size mismatch");
-    assert((IsByVal || VTs.size() == ArgOuts.size()) && "Size mismatch");
 
     const Align ArgAlign = [&]() {
       if (IsByVal) {
@@ -1492,17 +1533,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         // so we don't need to worry whether it's naturally aligned or not.
         // See TargetLowering::LowerCallTo().
         const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
-        const Align ByValAlign = getFunctionByValParamAlign(
+        return getFunctionByValParamAlign(
             CB->getCalledFunction(), ETy, InitialAlign, DL);
-        if (IsVAArg)
-          VAOffset = alignTo(VAOffset, ByValAlign);
-        return ByValAlign;
       }
       return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
     }();
 
-    const unsigned TypeSize = DL.getTypeAllocSize(ETy);
-    assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) &&
+    const unsigned TySize = DL.getTypeAllocSize(ETy);
+    assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) &&
            "type size mismatch");
 
     const SDValue ArgDeclare = [&]() {
@@ -1510,105 +1548,120 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         return VADeclareParam;
 
       if (IsByVal || shouldPassAsArray(Arg.Ty))
-        return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TypeSize);
+        return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize);
 
       assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
       assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
              "Only int and float types are supported as non-array arguments");
 
-      return MakeDeclareScalarParam(ParamSymbol, TypeSize);
+      return MakeDeclareScalarParam(ParamSymbol, TySize);
     }();
 
-    // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
-    // than 32-bits are sign extended or zero extended, depending on
-    // whether they are signed or unsigned types. This case applies
-    // only to scalar parameters and not to aggregate values.
-    const bool ExtendIntegerParam =
-        Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
-
-    const auto GetStoredValue = [&](const unsigned I, EVT EltVT,
-                                    const MaybeAlign PartAlign) {
-      if (IsByVal) {
-        SDValue Ptr = ArgOutVals[0];
-        auto MPI = refinePtrAS(Ptr, DAG, DL, *this);
-        SDValue SrcAddr =
-            DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(Offsets[I]));
+    if (IsByVal) {
+      assert(ArgOutVals.size() == 1 && "We must pass only one value as byval");
+      SDValue SrcPtr = ArgOutVals[0];
+      auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this);
 
-        return DAG.getLoad(EltVT, dl, CallChain, SrcAddr, MPI, PartAlign);
-      }
-      SDValue StVal = ArgOutVals[I];
-      assert(promoteScalarIntegerPTX(StVal.getValueType()) ==
-                 StVal.getValueType() &&
-             "OutVal type should always be legal");
+      if (IsVAArg)
+        VAOffset = alignTo(VAOffset, ArgAlign);
 
-      const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
-      const EVT StoreVT =
-          ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
+      SmallVector<EVT, 4> ValueVTs, MemVTs;
+      SmallVector<TypeSize, 4> Offsets;
+      ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets);
 
-      return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
-    };
+      unsigned J = 0;
+      const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg);
+      for (const unsigned NumElts : VI) {
+        const Align Alignment = commonAlignment(ArgAlign, Offsets[J]);
+        const EVT LoadVT =
+            getVectorizedVT(MemVTs[J], NumElts, *DAG.getContext());
 
-    const auto VectorInfo =
-        VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
+        SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]);
+        SDValue Load =
+            DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, Alignment);
 
-    unsigned J = 0;
-    for (const unsigned NumElts : VectorInfo) {
-      const int CurOffset = Offsets[J];
-      const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
+        SDValue ByvalAddr = DAG.getObjectPtrOffset(
+            dl, ParamSymbol, Offsets[J].getWithIncrement(VAOffset));
 
-      if (IsVAArg && !IsByVal)
-        // Align each part of the variadic argument to their type.
-        VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
+        SDValue StoreParam =
+            DAG.getStore(ArgDeclare, dl, Load, ByvalAddr,
+                         MachinePointerInfo(ADDRESS_SPACE_PARAM), Alignment);
+        CallPrereqs.push_back(StoreParam);
 
-      assert((IsVAArg || VAOffset == 0) &&
-             "VAOffset must be 0 for non-VA args");
+        J += NumElts;
+      }
+      if (IsVAArg)
+        VAOffset += TySize;
+    } else {
+      SmallVector<EVT, 16> VTs;
+      SmallVector<uint64_t, 16> Offsets;
+      ComputePTXValueVTs(*this, DL, Arg.Ty, VTs, &Offsets, VAOffset);
+      assert(VTs.size() == Offsets.size() && "Size mismatch");
+      assert(VTs.size() == ArgOuts.size() && "Size mismatch");
+
+      // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
+      // than 32-bits are sign extended or zero extended, depending on
+      // whether they are signed or unsigned types. This case applies
+      // only to scalar parameters and not to aggregate values.
+      const bool ExtendIntegerParam =
+          Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
+
+      const auto GetStoredValue = [&](const unsigned I) {
+        SDValue StVal = ArgOutVals[I];
+        assert(promoteScalarIntegerPTX(StVal.getValueType()) ==
+                   StVal.getValueType() &&
+               "OutVal type should always be legal");
+
+        const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
+        const EVT StoreVT =
+            ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
+
+        return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
+      };
+
+      unsigned J = 0;
+      const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
+      for (const unsigned NumElts : VI) {
+        const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
+
+        unsigned Offset;
+        if (IsVAArg) {
+          // TODO: We may need to support vector types that can be passed
+          // as scalars in variadic arguments.
+          assert(NumElts == 1 &&
+                 "Vectorization should be disabled for vaargs.");
+
+          // Align each part of the variadic argument to their type.
+          VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
+          Offset = VAOffset;
+
+          const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
+          VAOffset += DL.getTypeAllocSize(
+              TheStoreType.getTypeForEVT(*DAG.getContext()));
+        } else {
+          assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args");
+          Offset = Offsets[J];
+        }
 
-      const unsigned Offset =
-          (VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset));
-      SDValue Ptr =
-          DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
+        SDValue Ptr =
+            DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
 
-      const MaybeAlign CurrentAlign = ExtendIntegerParam
-                                          ? MaybeAlign(std::nullopt)
-                                          : commonAlignment(ArgAlign, Offset);
+        const MaybeAlign CurrentAlign = ExtendIntegerParam
+                                            ? MaybeAlign(std::nullopt)
+                                            : commonAlignment(ArgAlign, Offset);
 
-      SDValue Val;
-      if (NumElts == 1) {
-        Val = GetStoredValue(J, EltVT, CurrentAlign);
-      } else {
-        SmallVector<SDValue, 8> StoreVals;
-        for (const unsigned K : llvm::seq(NumElts)) {
-          SDValue ValJ = GetStoredValue(J + K, EltVT, CurrentAlign);
-          if (ValJ.getValueType().isVector())
-            DAG.ExtractVectorElements(ValJ, StoreVals);
-          else
-            StoreVals.push_back(ValJ);
-        }
+        SDValue Val = getBuildVectorizedValue(
+            [&](unsigned K) { return GetStoredValue(J + K); }, NumElts, dl,
+            DAG);
 
-        EVT VT = EVT::getVectorVT(
-            *DAG.getContext(), StoreVals[0].getValueType(), StoreVals.size());
-        Val = DAG.getBuildVector(VT, dl, StoreVals);
-      }
+        SDValue StoreParam =
+            DAG.getStore(ArgDeclare, dl, Val, Ptr,
+                         MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
+        CallPrereqs.push_back(StoreParam);
 
-      SDValue StoreParam =
-          DAG.getStore(ArgDeclare, dl, Val, Ptr,
-                       MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
-      CallPrereqs.push_back(StoreParam);
-
-      // TODO: We may need to support vector types that can be passed
-      // as scalars in variadic arguments.
-      if (IsVAArg && !IsByVal) {
-        assert(NumElts == 1 &&
-               "Vectorization is expected to be disabled for variadics.");
-        const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
-        VAOffset +=
-            DL.getTypeAllocSize(TheStoreType.getTypeForEVT(*DAG.getContext()));
+        J += NumElts;
       }
-
-      J += NumElts;
     }
-    if (IsVAArg && IsByVal)
-      VAOffset += TypeSize;
   }
 
   // Handle Result
@@ -1676,17 +1729,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     CallPrereqs.push_back(PrototypeDeclare);
   }
 
-  if (ConvertToIndirectCall) {
-    // Copy the function ptr to a ptx register and use the register to call the
-    // function.
-    const MVT DestVT = Callee.getValueType().getSimpleVT();
-    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    Register DestReg = MRI.createVirtualRegister(TLI.getRegClassFor(DestVT));
-    auto RegCopy = DAG.getCopyToReg(DAG.getEntryNode(), dl, DestReg, Callee);
-    Callee = DAG.getCopyFromReg(RegCopy, dl, DestReg, DestVT);
-  }
-
   const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0;
   const unsigned NumArgs =
       std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
@@ -1703,10 +1745,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   if (!Ins.empty()) {
     SmallVector<EVT, 16> VTs;
     SmallVector<uint64_t, 16> Offsets;
-    ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
+    ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
     assert(VTs.size() == Ins.size() && "Bad value decomposition");
 
     const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
+    const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
 
     // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
     // 32-bits are sign extended or zero extended, depending on whether
@@ -1714,9 +1757,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     const bool ExtendIntegerRetVal =
         RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
 
-    const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
     unsigned I = 0;
-    for (const unsigned NumElts : VectorInfo) {
+    const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
+    for (const unsigned NumElts : VI) {
       const MaybeAlign CurrentAlign =
           ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
                               : commonAlignment(RetAlign, Offsets[I]);
@@ -1724,16 +1767,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
       const EVT LoadVT =
           ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
-
-      const unsigned PackingAmt =
-          LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
-
-      const EVT VecVT = NumElts == 1 ? LoadVT
-                                     : EVT::getVectorVT(*DAG.getContext(),
-                                                        LoadVT.getScalarType(),
-                                                        NumElts * PackingAmt);
-
-      const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
+      const EVT VecVT = getVectorizedVT(LoadVT, NumElts, *DAG.getContext());
       SDValue Ptr =
           DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
 
@@ -1742,17 +1776,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                       MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
 
       LoadChains.push_back(R.getValue(1));
-
-      if (NumElts == 1)
-        ProxyRegOps.push_back(R);
-      else
-        for (const unsigned J : llvm::seq(NumElts)) {
-          SDValue Elt = DAG.getNode(
-              LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR
-                                : ISD::EXTRACT_VECTOR_ELT,
-              dl, LoadVT, R, DAG.getVectorIdxConstant(J * PackingAmt, dl));
-          ProxyRegOps.push_back(Elt);
-        }
+      for (const unsigned J : llvm::seq(NumElts))
+        ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG));
       I += NumElts;
     }
   }
@@ -3227,11 +3252,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
-  MachineFunction &MF = DAG.getMachineFunction();
   const DataLayout &DL = DAG.getDataLayout();
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
-  const Function *F = &MF.getFunction();
+  const Function &F = DAG.getMachineFunction().getFunction();
 
   SDValue Root = DAG.getRoot();
   SmallVector<SDValue, 16> OutChains;
@@ -3247,7 +3271,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
   // See similar issue in LowerCall.
 
   auto AllIns = ArrayRef(Ins);
-  for (const auto &Arg : F->args()) {
+  for (const auto &Arg : F.args()) {
     const auto ArgIns = AllIns.take_while(
         [&](auto I) { return I.OrigArgIndex == Arg.getArgNo(); });
     AllIns = AllIns.drop_front(ArgIns.size());
@@ -3287,7 +3311,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
       assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer");
 
       SDValue P;
-      if (isKernelFunction(*F)) {
+      if (isKernelFunction(F)) {
         P = ArgSymbol;
         P.getNode()->setIROrder(Arg.getArgNo() + 1);
       } else {
@@ -3305,43 +3329,27 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
       assert(VTs.size() == Offsets.size() && "Size mismatch");
 
       const Align ArgAlign = getFunctionArgumentAlignment(
-          F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
+          &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
 
-      const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
       unsigned I = 0;
-      for (const unsigned NumElts : VectorInfo) {
+      const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
+      for (const unsigned NumElts : VI) {
         // i1 is loaded/stored as i8
         const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
-        // If the element is a packed type (ex. v2f16, v4i8, etc) holding
-        // multiple elements.
-        const unsigned PackingAmt =
-            LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
-
-        const EVT VecVT =
-            NumElts == 1
-                ? LoadVT
-                : EVT::getVectorVT(F->getContext(), LoadVT.getScalarType(),
-                                   NumElts * PackingAmt);
+        const EVT VecVT = getVectorizedVT(...
[truncated]

github-actions · 2025-07-29T02:07:03Z

✅ With the latest revision this PR passed the C/C++ code formatter.

Artem-B

LGTM overall, with a few style nits.

Artem-B · 2025-07-29T23:22:35Z

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

  }
 }

+static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {


This could use a comment.

We return an EVT that can hold N VTs
If the VT is a vector, the resulting EVT is a flat vector with the same element type as VT's element type (as opposed to N-element vector of VT vectors).

If may be better for readability to restructure the code so we don't need to explain it as a comment:

return VT.isVector() ? EVT::getVectorVT(C, VT.getScalarType(), VT.getVectorNumElements() * N) : EVT::getVectorVT(C, VT, N);

Nice. Switched to doing it like this.

Artem-B · 2025-07-29T23:26:46Z

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

+}
+
+template <typename T>
+static inline SDValue getBuildVectorizedValue(T GetElement, unsigned N,


Nit: callables are somewhat more readable when they are passed as the last argument. Otherwise subsequent function argument are somewhat lost in the noise of closing brackets of the passed lambda.

Artem-B · 2025-07-29T23:35:23Z

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

+          assert(NumElts == 1 &&
+                 "Vectorization should be disabled for vaargs.");


Can we hit this code path now? How do we handle it before this patch?

If this is a constraint on the user input, the check should probably be done somewhere in the verifier, and failing in a way that does not rely on the assertions enabled/disabled.

This check was in here before, I just moved it around. All we're doing is confirming that we didn't attempt to vectorize the vaargs stores, we can still handle vector types so this is not a constraint on user input, just confirming we did the right thing internally.

Artem-B · 2025-07-29T23:36:59Z

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

-            StoreVals.push_back(ValJ);
-        }
+        SDValue Val = getBuildVectorizedValue(
+            [&](unsigned K) { return GetStoredValue(J + K); }, NumElts, dl,


Nit: I'd use [=] here to indicate that we're not intending to modify the captured value.

Artem-B · 2025-07-29T23:44:34Z

llvm/test/CodeGen/NVPTX/param-vectorize-device.ll

  ; CHECK-LABEL: .visible .func caller_St4x3(
  ; CHECK:               .param .align 4 .b8 caller_St4x3_param_0[12],
  ; CHECK:               .param .b64 caller_St4x3_param_1
  ; CHECK:       )


Perhaps it's good opportunity to convert it to automatic checks, too. We do want to see that the values are passed in correct order and not just in a right kind of register matched by a regex.

This test seems to be checking for the defined function signature as well as some of the contents:

; CHECK-LABEL: .visible .func caller_St4x1( ; CHECK: .param .align 4 .b8 caller_St4x1_param_0[4], ; CHECK: .param .b64 caller_St4x1_param_1 ; CHECK: )

If we were to auto-generate the checks we'd miss this validation. Do you still think it makes sense to auto-generate?

llvm-ci · 2025-08-01T22:13:07Z

LLVM Buildbot has detected a new failure on builder clang-hip-vega20 running on hip-vega20-0 while building llvm at step 3 "annotate".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/123/builds/24548

Here is the relevant piece of the build log for the reference

Step 3 (annotate) failure: '../llvm-zorg/zorg/buildbot/builders/annotated/hip-build.sh --jobs=' (failure)
...
[57/59] Linking CXX executable External/HIP/math_h-hip-6.3.0
[58/59] Building CXX object External/HIP/CMakeFiles/TheNextWeek-hip-6.3.0.dir/workload/ray-tracing/TheNextWeek/main.cc.o
[59/59] Linking CXX executable External/HIP/TheNextWeek-hip-6.3.0
+ build_step 'Testing HIP test-suite'
+ echo '@@@BUILD_STEP Testing HIP test-suite@@@'
+ ninja check-hip-simple
@@@BUILD_STEP Testing HIP test-suite@@@
[0/1] cd /home/botworker/bbot/clang-hip-vega20/botworker/clang-hip-vega20/test-suite-build/External/HIP && /home/botworker/bbot/clang-hip-vega20/botworker/clang-hip-vega20/llvm/bin/llvm-lit -sv array-hip-6.3.0.test empty-hip-6.3.0.test with-fopenmp-hip-6.3.0.test saxpy-hip-6.3.0.test memmove-hip-6.3.0.test split-kernel-args-hip-6.3.0.test builtin-logb-scalbn-hip-6.3.0.test TheNextWeek-hip-6.3.0.test algorithm-hip-6.3.0.test cmath-hip-6.3.0.test complex-hip-6.3.0.test math_h-hip-6.3.0.test new-hip-6.3.0.test blender.test
-- Testing: 14 tests, 14 workers --
Testing:  0.. 10.. 20.. 30.. 40.. 50.. 60.. 70.. 80.. 90
FAIL: test-suite :: External/HIP/blender.test (14 of 14)
******************** TEST 'test-suite :: External/HIP/blender.test' FAILED ********************

/home/botworker/bbot/clang-hip-vega20/botworker/clang-hip-vega20/test-suite-build/tools/timeit-target --timeout 7200 --limit-core 0 --limit-cpu 7200 --limit-file-size 209715200 --limit-rss-size 838860800 --append-exitstatus --redirect-output /home/botworker/bbot/clang-hip-vega20/botworker/clang-hip-vega20/test-suite-build/External/HIP/Output/blender.test.out --redirect-input /dev/null --summary /home/botworker/bbot/clang-hip-vega20/botworker/clang-hip-vega20/test-suite-build/External/HIP/Output/blender.test.time /bin/bash test_blender.sh
/bin/bash verify_blender.sh /home/botworker/bbot/clang-hip-vega20/botworker/clang-hip-vega20/test-suite-build/External/HIP/Output/blender.test.out
Begin Blender test.
TEST_SUITE_HIP_ROOT=/opt/botworker/llvm/External/hip
Render /opt/botworker/llvm/External/hip/Blender_Scenes/290skydemo_release.blend
Blender 4.1.1 (hash e1743a0317bc built 2024-04-15 23:47:45)
Read blend: "/opt/botworker/llvm/External/hip/Blender_Scenes/290skydemo_release.blend"
Could not open as Ogawa file from provided streams.
Unable to open /opt/botworker/llvm/External/hip/Blender_Scenes/290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.004", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.002", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.003", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.001", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
Could not open as Ogawa file from provided streams.
Unable to open /opt/botworker/llvm/External/hip/Blender_Scenes/290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.003", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.001", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.002", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.004", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
I0801 22:07:09.132745 2693369 device.cpp:39] HIPEW initialization succeeded
I0801 22:07:09.134615 2693369 device.cpp:45] Found HIPCC hipcc
I0801 22:07:09.197247 2693369 device.cpp:207] Device has compute preemption or is not used for display.
I0801 22:07:09.197312 2693369 device.cpp:211] Added device "" with id "HIP__0000:a3:00".
I0801 22:07:09.197398 2693369 device.cpp:568] Mapped host memory limit set to 536,444,985,344 bytes. (499.60G)
I0801 22:07:09.197654 2693369 device_impl.cpp:63] Using AVX2 CPU kernels.
Fra:1 Mem:524.00M (Peak 524.70M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Eyepiece_rim
Fra:1 Mem:524.00M (Peak 524.70M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Rivets.014
Fra:1 Mem:524.00M (Peak 524.70M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Rivets.019
Fra:1 Mem:524.14M (Peak 524.70M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Rivets.023
Fra:1 Mem:524.25M (Peak 524.70M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Curve_Cables.004
Fra:1 Mem:525.03M (Peak 525.03M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Curve_Wires
Fra:1 Mem:525.02M (Peak 525.03M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Rivets.025
Fra:1 Mem:525.10M (Peak 525.10M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Rivets.026
Fra:1 Mem:525.20M (Peak 525.21M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Rivets.039
Step 12 (Testing HIP test-suite) failure: Testing HIP test-suite (failure)
@@@BUILD_STEP Testing HIP test-suite@@@
[0/1] cd /home/botworker/bbot/clang-hip-vega20/botworker/clang-hip-vega20/test-suite-build/External/HIP && /home/botworker/bbot/clang-hip-vega20/botworker/clang-hip-vega20/llvm/bin/llvm-lit -sv array-hip-6.3.0.test empty-hip-6.3.0.test with-fopenmp-hip-6.3.0.test saxpy-hip-6.3.0.test memmove-hip-6.3.0.test split-kernel-args-hip-6.3.0.test builtin-logb-scalbn-hip-6.3.0.test TheNextWeek-hip-6.3.0.test algorithm-hip-6.3.0.test cmath-hip-6.3.0.test complex-hip-6.3.0.test math_h-hip-6.3.0.test new-hip-6.3.0.test blender.test
-- Testing: 14 tests, 14 workers --
Testing:  0.. 10.. 20.. 30.. 40.. 50.. 60.. 70.. 80.. 90
FAIL: test-suite :: External/HIP/blender.test (14 of 14)
******************** TEST 'test-suite :: External/HIP/blender.test' FAILED ********************

/home/botworker/bbot/clang-hip-vega20/botworker/clang-hip-vega20/test-suite-build/tools/timeit-target --timeout 7200 --limit-core 0 --limit-cpu 7200 --limit-file-size 209715200 --limit-rss-size 838860800 --append-exitstatus --redirect-output /home/botworker/bbot/clang-hip-vega20/botworker/clang-hip-vega20/test-suite-build/External/HIP/Output/blender.test.out --redirect-input /dev/null --summary /home/botworker/bbot/clang-hip-vega20/botworker/clang-hip-vega20/test-suite-build/External/HIP/Output/blender.test.time /bin/bash test_blender.sh
/bin/bash verify_blender.sh /home/botworker/bbot/clang-hip-vega20/botworker/clang-hip-vega20/test-suite-build/External/HIP/Output/blender.test.out
Begin Blender test.
TEST_SUITE_HIP_ROOT=/opt/botworker/llvm/External/hip
Render /opt/botworker/llvm/External/hip/Blender_Scenes/290skydemo_release.blend
Blender 4.1.1 (hash e1743a0317bc built 2024-04-15 23:47:45)
Read blend: "/opt/botworker/llvm/External/hip/Blender_Scenes/290skydemo_release.blend"
Could not open as Ogawa file from provided streams.
Unable to open /opt/botworker/llvm/External/hip/Blender_Scenes/290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.004", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.002", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.003", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.001", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
Could not open as Ogawa file from provided streams.
Unable to open /opt/botworker/llvm/External/hip/Blender_Scenes/290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.003", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.001", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.002", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
WARN (bke.modifier): source/blender/blenkernel/intern/modifier.cc:425 BKE_modifier_set_error: Object: "GEO-flag.004", Modifier: "MeshSequenceCache", Could not create reader for file //290skydemo2_flags.abc
I0801 22:07:09.132745 2693369 device.cpp:39] HIPEW initialization succeeded
I0801 22:07:09.134615 2693369 device.cpp:45] Found HIPCC hipcc
I0801 22:07:09.197247 2693369 device.cpp:207] Device has compute preemption or is not used for display.
I0801 22:07:09.197312 2693369 device.cpp:211] Added device "" with id "HIP__0000:a3:00".
I0801 22:07:09.197398 2693369 device.cpp:568] Mapped host memory limit set to 536,444,985,344 bytes. (499.60G)
I0801 22:07:09.197654 2693369 device_impl.cpp:63] Using AVX2 CPU kernels.
Fra:1 Mem:524.00M (Peak 524.70M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Eyepiece_rim
Fra:1 Mem:524.00M (Peak 524.70M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Rivets.014
Fra:1 Mem:524.00M (Peak 524.70M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Rivets.019
Fra:1 Mem:524.14M (Peak 524.70M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Rivets.023
Fra:1 Mem:524.25M (Peak 524.70M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Curve_Cables.004
Fra:1 Mem:525.03M (Peak 525.03M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Curve_Wires
Fra:1 Mem:525.02M (Peak 525.03M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Rivets.025
Fra:1 Mem:525.10M (Peak 525.10M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Rivets.026
Fra:1 Mem:525.20M (Peak 525.21M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Rivets.039
Fra:1 Mem:525.25M (Peak 525.25M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Curve_Connectors.001
Fra:1 Mem:526.21M (Peak 526.21M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Curve_wires
Fra:1 Mem:526.49M (Peak 526.49M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | ENV-fog
Fra:1 Mem:526.46M (Peak 526.49M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Ground
Fra:1 Mem:550.37M (Peak 550.43M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Curve_Connectors.002
Fra:1 Mem:550.48M (Peak 550.48M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Curve_Connectors.004
Fra:1 Mem:550.48M (Peak 550.48M) | Time:00:00.70 | Mem:0.00M, Peak:0.00M | Scene, View Layer | Synchronizing object | GEO-Curve_Connectors.007

llvm-ci · 2025-08-01T22:18:26Z

LLVM Buildbot has detected a new failure on builder llvm-clang-x86_64-sie-win running on sie-win-worker while building llvm at step 7 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/46/builds/21082

Here is the relevant piece of the build log for the reference

Step 7 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'lld :: COFF/import_weak_alias.test' FAILED ********************
Exit Code: 3221225477

Command Output (stdout):
--
# RUN: at line 3
split-file Z:\b\llvm-clang-x86_64-sie-win\llvm-project\lld\test\COFF\import_weak_alias.test Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.dir
# executed command: split-file 'Z:\b\llvm-clang-x86_64-sie-win\llvm-project\lld\test\COFF\import_weak_alias.test' 'Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.dir'
# note: command had no output on stdout or stderr
# RUN: at line 4
z:\b\llvm-clang-x86_64-sie-win\build\bin\llvm-mc.exe --filetype=obj -triple=x86_64-windows-msvc Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.dir/foo.s -o Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.foo.obj
# executed command: 'z:\b\llvm-clang-x86_64-sie-win\build\bin\llvm-mc.exe' --filetype=obj -triple=x86_64-windows-msvc 'Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.dir/foo.s' -o 'Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.foo.obj'
# note: command had no output on stdout or stderr
# RUN: at line 5
z:\b\llvm-clang-x86_64-sie-win\build\bin\llvm-mc.exe --filetype=obj -triple=x86_64-windows-msvc Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.dir/qux.s -o Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.qux.obj
# executed command: 'z:\b\llvm-clang-x86_64-sie-win\build\bin\llvm-mc.exe' --filetype=obj -triple=x86_64-windows-msvc 'Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.dir/qux.s' -o 'Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.qux.obj'
# note: command had no output on stdout or stderr
# RUN: at line 6
z:\b\llvm-clang-x86_64-sie-win\build\bin\lld-link.exe Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.qux.obj Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.foo.obj -out:Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.dll -dll
# executed command: 'z:\b\llvm-clang-x86_64-sie-win\build\bin\lld-link.exe' 'Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.qux.obj' 'Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.foo.obj' '-out:Z:\b\llvm-clang-x86_64-sie-win\build\tools\lld\test\COFF\Output\import_weak_alias.test.tmp.dll' -dll
# .---command stderr------------
# | PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
# | Stack dump:
# | 0.	Program arguments: z:\\b\\llvm-clang-x86_64-sie-win\\build\\bin\\lld-link.exe Z:\\b\\llvm-clang-x86_64-sie-win\\build\\tools\\lld\\test\\COFF\\Output\\import_weak_alias.test.tmp.qux.obj Z:\\b\\llvm-clang-x86_64-sie-win\\build\\tools\\lld\\test\\COFF\\Output\\import_weak_alias.test.tmp.foo.obj -out:Z:\\b\\llvm-clang-x86_64-sie-win\\build\\tools\\lld\\test\\COFF\\Output\\import_weak_alias.test.tmp.dll -dll
# | Exception Code: 0xC0000005
# | #0 0x00007ff8e5361b39 (C:\Windows\System32\KERNELBASE.dll+0x41b39)
# | #1 0x00007ff794ffbb18 (z:\b\llvm-clang-x86_64-sie-win\build\bin\lld-link.exe+0xcbb18)
# | #2 0x00007ff7950832db (z:\b\llvm-clang-x86_64-sie-win\build\bin\lld-link.exe+0x1532db)
# | #3 0x00007ff794fdd9aa (z:\b\llvm-clang-x86_64-sie-win\build\bin\lld-link.exe+0xad9aa)
# | #4 0x00007ff794fdda14 (z:\b\llvm-clang-x86_64-sie-win\build\bin\lld-link.exe+0xada14)
# | #5 0x00007ff797750454 (z:\b\llvm-clang-x86_64-sie-win\build\bin\lld-link.exe+0x2820454)
# | #6 0x00007ff8e7947ac4 (C:\Windows\System32\KERNEL32.DLL+0x17ac4)
# | #7 0x00007ff8e879a8c1 (C:\Windows\SYSTEM32\ntdll.dll+0x5a8c1)
# `-----------------------------
# error: command failed with exit status: 0xc0000005

--

********************

pre-commit tests

b286129

AlexMaclean requested review from Artem-B, justinfargnoli and kalxr July 29, 2025 02:04

AlexMaclean self-assigned this Jul 29, 2025

llvmbot added the backend:NVPTX label Jul 29, 2025

AlexMaclean force-pushed the dev/amaclean/byval-align branch from 3937042 to cdfc0e3 Compare July 29, 2025 17:26

[NVPTX] Vectorize loads when lowering of byval calls, misc. cleanup

cdfc0e3

Artem-B approved these changes Jul 29, 2025

View reviewed changes

address comments

ca22392

AlexMaclean merged commit 66e8163 into llvm:main Aug 1, 2025
9 checks passed

		assert(NumElts == 1 &&
		"Vectorization should be disabled for vaargs.");

[NVPTX] Vectorize loads when lowering of byval calls, misc. cleanup #151070

[NVPTX] Vectorize loads when lowering of byval calls, misc. cleanup #151070

Uh oh!

Conversation

AlexMaclean commented Jul 29, 2025

Uh oh!

llvmbot commented Jul 29, 2025

Uh oh!

github-actions bot commented Jul 29, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Artem-B left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

llvm-ci commented Aug 1, 2025

Uh oh!

llvm-ci commented Aug 1, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

github-actions bot commented Jul 29, 2025 •

edited

Loading