Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23445,6 +23445,65 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
// inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT))
return DAG.getSplat(VT, DL, InVal);

if (TLI.getTypeAction(*DAG.getContext(), VT) ==
TargetLowering::TypeSplitVector) {
// For dynamic insertelts, the type legalizer may lower the entire
// vector through a stack temporary (store, element store, reload). For a
// chain of dynamic insertelts this round trip is repeated per insert,
// which is inefficient and bad for compile time. If each insertelt is
// only fed into the next, the vector is write-only across this chain,
// and one store/reload round trip suffices.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't spilling

SmallVector<SDNode *> Seq{N};
while (true) {
SDValue InVec = Seq.back()->getOperand(0);
if (InVec.getOpcode() != ISD::INSERT_VECTOR_ELT ||
isa<ConstantSDNode>(InVec.getOperand(2)))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you have mixed constant and dynamic indexes, I'd assume you want to collect and handle them as well

break;
Seq.push_back(InVec.getNode());
}

// Only care about chains, otherwise this instruction can be handled by
// the type legalizer just fine.
Comment on lines +23465 to +23466
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you rephrase this comment? This makes it sounds like it's working around a correctness issue in the legalizer

if (Seq.size() > 1) {
// In cases where the vector is illegal it will be broken down into
// parts and stored in parts - we should use the alignment for the
// smallest part.
Align SmallestAlign = DAG.getReducedAlign(VT, /*UseABI=*/false);
SDValue StackPtr =
DAG.CreateStackTemporary(VT.getStoreSize(), SmallestAlign);
Comment on lines +23472 to +23473
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dynamic vector inserts are not always lowered to the stack, that just happens to be the default operation action.

Or is the vector type legalizer code in a worse state than the LegalizeDAG handling for this?

auto &MF = DAG.getMachineFunction();
auto FrameIndex =
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No auto

cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);

// Store the incoming source vector to the stack temporary.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not spilling

SDValue InVec = Seq.back()->getOperand(0);
SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr,
PtrInfo, SmallestAlign);

// Lower each dynamic insertelt to a store
for (SDNode *N : reverse(Seq)) {
SDValue Elmnt = N->getOperand(1);
SDValue Index = N->getOperand(2);

// Store the new element. This may be larger than the vector element
// type, so use a truncating store.
SDValue EltPtr =
TLI.getVectorElementPointer(DAG, StackPtr, VT, Index);
EVT EltVT = Elmnt.getValueType();
Store = DAG.getTruncStore(
Store, DL, Elmnt, EltPtr, MachinePointerInfo::getUnknownStack(MF),
EltVT,
commonAlignment(SmallestAlign, EltVT.getFixedSizeInBits() / 8));
}

// Reload the updated vector from the stack temporary.
SDValue Load =
DAG.getLoad(VT, DL, Store, StackPtr, PtrInfo, SmallestAlign);
return Load.getValue(0);
}
}

return SDValue();
}

Expand Down
207 changes: 207 additions & 0 deletions llvm/test/CodeGen/NVPTX/vector-spill.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

; COM: The whole insertelement chain should go through the stack slot with a
; COM: single store/reload round trip.
define ptx_kernel void @spill_once(ptr addrspace(3) %shared.mem, <8 x double> %vector, i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) local_unnamed_addr {
; CHECK-LABEL: spill_once(
; CHECK: {
; CHECK-NEXT: .local .align 8 .b8 __local_depot0[64];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b64 %rd<39>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: mov.b64 %SPL, __local_depot0;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b64 %rd1, [spill_once_param_0];
; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [spill_once_param_1];
; CHECK-NEXT: ld.param.v2.b64 {%rd6, %rd7}, [spill_once_param_1+16];
; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [spill_once_param_1+32];
; CHECK-NEXT: ld.param.v2.b64 {%rd10, %rd11}, [spill_once_param_1+48];
; CHECK-NEXT: ld.shared.b64 %rd12, [%rd1+16];
; CHECK-NEXT: ld.shared.b64 %rd13, [%rd1+24];
; CHECK-NEXT: ld.param.b32 %rd14, [spill_once_param_2];
; CHECK-NEXT: and.b64 %rd15, %rd14, 7;
; CHECK-NEXT: shl.b64 %rd16, %rd15, 3;
; CHECK-NEXT: add.u64 %rd17, %SP, 0;
; CHECK-NEXT: add.s64 %rd18, %rd17, %rd16;
; CHECK-NEXT: ld.param.b32 %rd19, [spill_once_param_3];
; CHECK-NEXT: and.b64 %rd20, %rd19, 7;
; CHECK-NEXT: shl.b64 %rd21, %rd20, 3;
; CHECK-NEXT: add.s64 %rd22, %rd17, %rd21;
; CHECK-NEXT: ld.param.b32 %rd23, [spill_once_param_4];
; CHECK-NEXT: and.b64 %rd24, %rd23, 7;
; CHECK-NEXT: shl.b64 %rd25, %rd24, 3;
; CHECK-NEXT: add.s64 %rd26, %rd17, %rd25;
; CHECK-NEXT: st.b64 [%SP+56], %rd11;
; CHECK-NEXT: st.b64 [%SP+48], %rd10;
; CHECK-NEXT: st.b64 [%SP+40], %rd9;
; CHECK-NEXT: st.b64 [%SP+32], %rd8;
; CHECK-NEXT: st.b64 [%SP+24], %rd7;
; CHECK-NEXT: st.b64 [%SP+16], %rd6;
; CHECK-NEXT: st.b64 [%SP+8], %rd5;
; CHECK-NEXT: st.b64 [%SP], %rd4;
; CHECK-NEXT: st.b64 [%rd18], %rd2;
; CHECK-NEXT: st.b64 [%rd22], %rd3;
; CHECK-NEXT: st.b64 [%rd26], %rd12;
; CHECK-NEXT: ld.param.b32 %rd27, [spill_once_param_5];
; CHECK-NEXT: and.b64 %rd28, %rd27, 7;
; CHECK-NEXT: shl.b64 %rd29, %rd28, 3;
; CHECK-NEXT: add.s64 %rd30, %rd17, %rd29;
; CHECK-NEXT: st.b64 [%rd30], %rd13;
; CHECK-NEXT: ld.b64 %rd31, [%SP+8];
; CHECK-NEXT: ld.b64 %rd32, [%SP];
; CHECK-NEXT: ld.b64 %rd33, [%SP+24];
; CHECK-NEXT: ld.b64 %rd34, [%SP+16];
; CHECK-NEXT: ld.b64 %rd35, [%SP+40];
; CHECK-NEXT: ld.b64 %rd36, [%SP+32];
; CHECK-NEXT: ld.b64 %rd37, [%SP+56];
; CHECK-NEXT: ld.b64 %rd38, [%SP+48];
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1072], {%rd38, %rd37};
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1056], {%rd36, %rd35};
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1040], {%rd34, %rd33};
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1024], {%rd32, %rd31};
; CHECK-NEXT: ret;
entry:
; Load four scalar doubles from shared memory; these are the values that
; will be inserted into %vector at runtime-variable positions.
%offset.0 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 0
%element.0 = load double, ptr addrspace(3) %offset.0, align 64
%offset.1 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 8
%element.1 = load double, ptr addrspace(3) %offset.1, align 8
%offset.2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 16
%element.2 = load double, ptr addrspace(3) %offset.2, align 8
%offset.3 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 24
%element.3 = load double, ptr addrspace(3) %offset.3, align 8
; A single chain of insertelements with non-constant indices (%idx0..%idx3),
; each insert feeding only the next one in the chain.
%vector.build0 = insertelement <8 x double> %vector, double %element.0, i32 %idx0
%vector.build1 = insertelement <8 x double> %vector.build0, double %element.1, i32 %idx1
%vector.build2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
%vector.build3 = insertelement <8 x double> %vector.build2, double %element.3, i32 %idx3
; Write the fully built vector back to shared memory at byte offset 1024.
%location = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1024
store <8 x double> %vector.build3, ptr addrspace(3) %location, align 64
ret void
}

; COM: Store/reload the vector through the stack twice. Because these are in two different slots, the
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't spilling?

; resulting store/reload sequences may be non-overlapping even though the insertelt
; sequences overlap.
define ptx_kernel void @spill_twice(ptr addrspace(3) %shared.mem, <8 x double> %vector, i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) local_unnamed_addr {
; CHECK-LABEL: spill_twice(
; CHECK: {
; CHECK-NEXT: .local .align 8 .b8 __local_depot1[128];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b64 %rd<51>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: mov.b64 %SPL, __local_depot1;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b64 %rd1, [spill_twice_param_0];
; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [spill_twice_param_1];
; CHECK-NEXT: ld.param.v2.b64 {%rd6, %rd7}, [spill_twice_param_1+16];
; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [spill_twice_param_1+32];
; CHECK-NEXT: ld.param.v2.b64 {%rd10, %rd11}, [spill_twice_param_1+48];
; CHECK-NEXT: ld.shared.b64 %rd12, [%rd1+16];
; CHECK-NEXT: ld.shared.b64 %rd13, [%rd1+24];
; CHECK-NEXT: ld.param.b32 %rd14, [spill_twice_param_2];
; CHECK-NEXT: and.b64 %rd15, %rd14, 7;
; CHECK-NEXT: shl.b64 %rd16, %rd15, 3;
; CHECK-NEXT: add.u64 %rd17, %SP, 0;
; CHECK-NEXT: add.s64 %rd18, %rd17, %rd16;
; CHECK-NEXT: add.u64 %rd19, %SP, 64;
; CHECK-NEXT: add.s64 %rd20, %rd19, %rd16;
; CHECK-NEXT: ld.param.b32 %rd21, [spill_twice_param_3];
; CHECK-NEXT: and.b64 %rd22, %rd21, 7;
; CHECK-NEXT: shl.b64 %rd23, %rd22, 3;
; CHECK-NEXT: add.s64 %rd24, %rd17, %rd23;
; CHECK-NEXT: add.s64 %rd25, %rd19, %rd23;
; CHECK-NEXT: st.b64 [%SP+120], %rd11;
; CHECK-NEXT: st.b64 [%SP+112], %rd10;
; CHECK-NEXT: st.b64 [%SP+104], %rd9;
; CHECK-NEXT: st.b64 [%SP+96], %rd8;
; CHECK-NEXT: st.b64 [%SP+88], %rd7;
; CHECK-NEXT: st.b64 [%SP+80], %rd6;
; CHECK-NEXT: st.b64 [%SP+72], %rd5;
; CHECK-NEXT: st.b64 [%SP+64], %rd4;
; CHECK-NEXT: st.b64 [%rd20], %rd2;
; CHECK-NEXT: st.b64 [%rd25], %rd3;
; CHECK-NEXT: ld.param.b32 %rd26, [spill_twice_param_4];
; CHECK-NEXT: and.b64 %rd27, %rd26, 7;
; CHECK-NEXT: shl.b64 %rd28, %rd27, 3;
; CHECK-NEXT: add.s64 %rd29, %rd19, %rd28;
; CHECK-NEXT: st.b64 [%rd29], %rd12;
; CHECK-NEXT: add.s64 %rd30, %rd17, %rd28;
; CHECK-NEXT: ld.b64 %rd31, [%SP+72];
; CHECK-NEXT: ld.b64 %rd32, [%SP+64];
; CHECK-NEXT: ld.b64 %rd33, [%SP+88];
; CHECK-NEXT: ld.b64 %rd34, [%SP+80];
; CHECK-NEXT: ld.b64 %rd35, [%SP+104];
; CHECK-NEXT: ld.b64 %rd36, [%SP+96];
; CHECK-NEXT: ld.b64 %rd37, [%SP+120];
; CHECK-NEXT: ld.b64 %rd38, [%SP+112];
; CHECK-NEXT: st.b64 [%SP+56], %rd11;
; CHECK-NEXT: st.b64 [%SP+48], %rd10;
; CHECK-NEXT: st.b64 [%SP+40], %rd9;
; CHECK-NEXT: st.b64 [%SP+32], %rd8;
; CHECK-NEXT: st.b64 [%SP+24], %rd7;
; CHECK-NEXT: st.b64 [%SP+16], %rd6;
; CHECK-NEXT: st.b64 [%SP+8], %rd5;
; CHECK-NEXT: st.b64 [%SP], %rd4;
; CHECK-NEXT: st.b64 [%rd18], %rd2;
; CHECK-NEXT: st.b64 [%rd24], %rd3;
; CHECK-NEXT: st.b64 [%rd30], %rd12;
; CHECK-NEXT: ld.param.b32 %rd39, [spill_twice_param_5];
; CHECK-NEXT: and.b64 %rd40, %rd39, 7;
; CHECK-NEXT: shl.b64 %rd41, %rd40, 3;
; CHECK-NEXT: add.s64 %rd42, %rd17, %rd41;
; CHECK-NEXT: st.b64 [%rd42], %rd13;
; CHECK-NEXT: ld.b64 %rd43, [%SP+8];
; CHECK-NEXT: ld.b64 %rd44, [%SP];
; CHECK-NEXT: ld.b64 %rd45, [%SP+24];
; CHECK-NEXT: ld.b64 %rd46, [%SP+16];
; CHECK-NEXT: ld.b64 %rd47, [%SP+40];
; CHECK-NEXT: ld.b64 %rd48, [%SP+32];
; CHECK-NEXT: ld.b64 %rd49, [%SP+56];
; CHECK-NEXT: ld.b64 %rd50, [%SP+48];
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1072], {%rd50, %rd49};
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1056], {%rd48, %rd47};
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1040], {%rd46, %rd45};
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1024], {%rd44, %rd43};
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1144], {%rd38, %rd37};
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1128], {%rd36, %rd35};
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1112], {%rd34, %rd33};
; CHECK-NEXT: st.shared.v2.b64 [%rd1+1096], {%rd32, %rd31};
; CHECK-NEXT: ret;
entry:
; Load four scalar doubles from shared memory; these are the values to be
; inserted at runtime-variable positions.
%offset.0 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 0
%element.0 = load double, ptr addrspace(3) %offset.0, align 64
%offset.1 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 8
%element.1 = load double, ptr addrspace(3) %offset.1, align 8
%offset.2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 16
%element.2 = load double, ptr addrspace(3) %offset.2, align 8
%offset.3 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 24
%element.3 = load double, ptr addrspace(3) %offset.3, align 8

; COM: begin chain 1
%vector.build0 = insertelement <8 x double> %vector, double %element.0, i32 %idx0
%vector.build1 = insertelement <8 x double> %vector.build0, double %element.1, i32 %idx1

; COM: interleave a second chain of insertelements
; Note: chain 2 branches off %vector.build1, so the two chains share a
; common prefix but diverge afterwards.
%vector.build1-2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2

; COM: continue chain 1
%vector.build2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
%vector.build3 = insertelement <8 x double> %vector.build2, double %element.3, i32 %idx3

; COM: save chain 1
%location = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1024
store <8 x double> %vector.build3, ptr addrspace(3) %location, align 64

; COM: save chain 2
%location-2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1096
store <8 x double> %vector.build1-2, ptr addrspace(3) %location-2, align 64
ret void
}