diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 309f1bea8b77c..4cab72810b485 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23445,6 +23445,65 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
     // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
     if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT))
       return DAG.getSplat(VT, DL, InVal);
+
+    if (TLI.getTypeAction(*DAG.getContext(), VT) ==
+        TargetLowering::TypeSplitVector) {
+      // For dynamic insertelts, the type legalizer may spill the entire
+      // vector. For a chain of dynamic insertelts, this can be really
+      // inefficient and bad for compile time. If each insertelt is only fed
+      // into the next, the vector is write-only across this chain, and we can
+      // just spill once.
+      SmallVector<SDNode *> Seq{N};
+      while (true) {
+        SDValue InVec = Seq.back()->getOperand(0);
+        if (InVec.getOpcode() != ISD::INSERT_VECTOR_ELT ||
+            isa<ConstantSDNode>(InVec.getOperand(2)))
+          break;
+        Seq.push_back(InVec.getNode());
+      }
+
+      // Only care about chains, otherwise this instruction can be handled by
+      // the type legalizer just fine.
+      if (Seq.size() > 1) {
+        // In cases where the vector is illegal it will be broken down into
+        // parts and stored in parts - we should use the alignment for the
+        // smallest part.
+        Align SmallestAlign = DAG.getReducedAlign(VT, /*UseABI=*/false);
+        SDValue StackPtr =
+            DAG.CreateStackTemporary(VT.getStoreSize(), SmallestAlign);
+        auto &MF = DAG.getMachineFunction();
+        auto FrameIndex =
+            cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+        auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+        // Begin spilling
+        SDValue InVec = Seq.back()->getOperand(0);
+        SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr,
+                                     PtrInfo, SmallestAlign);
+
+        // Lower each dynamic insertelt to a store
+        for (SDNode *N : reverse(Seq)) {
+          SDValue Elmnt = N->getOperand(1);
+          SDValue Index = N->getOperand(2);
+
+          // Store the new element. This may be larger than the vector element
+          // type, so use a truncating store.
+          SDValue EltPtr =
+              TLI.getVectorElementPointer(DAG, StackPtr, VT, Index);
+          EVT EltVT = Elmnt.getValueType();
+          Store = DAG.getTruncStore(
+              Store, DL, Elmnt, EltPtr, MachinePointerInfo::getUnknownStack(MF),
+              EltVT,
+              commonAlignment(SmallestAlign, EltVT.getFixedSizeInBits() / 8));
+        }
+
+        // Load the spilled vector
+        SDValue Load =
+            DAG.getLoad(VT, DL, Store, StackPtr, PtrInfo, SmallestAlign);
+        return Load.getValue(0);
+      }
+    }
+
     return SDValue();
   }
 
diff --git a/llvm/test/CodeGen/NVPTX/vector-spill.ll b/llvm/test/CodeGen/NVPTX/vector-spill.ll
new file mode 100644
index 0000000000000..a0b00d340ec03
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/vector-spill.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; COM: Spill the vector once.
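+; COM: All four insertelements below use non-constant indices and feed only
+; COM: into one another, so the combine above should emit a single 64-byte
+; COM: stack slot (__local_depot0[64]) rather than one spill per insertelement.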
+define ptx_kernel void @spill_once(ptr addrspace(3) %shared.mem, <8 x double> %vector, i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) local_unnamed_addr {
+; CHECK-LABEL: spill_once(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 8 .b8 __local_depot0[64];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b64 %rd<39>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot0;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b64 %rd1, [spill_once_param_0];
+; CHECK-NEXT:    ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd4, %rd5}, [spill_once_param_1];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd6, %rd7}, [spill_once_param_1+16];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd8, %rd9}, [spill_once_param_1+32];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd10, %rd11}, [spill_once_param_1+48];
+; CHECK-NEXT:    ld.shared.b64 %rd12, [%rd1+16];
+; CHECK-NEXT:    ld.shared.b64 %rd13, [%rd1+24];
+; CHECK-NEXT:    ld.param.b32 %rd14, [spill_once_param_2];
+; CHECK-NEXT:    and.b64 %rd15, %rd14, 7;
+; CHECK-NEXT:    shl.b64 %rd16, %rd15, 3;
+; CHECK-NEXT:    add.u64 %rd17, %SP, 0;
+; CHECK-NEXT:    add.s64 %rd18, %rd17, %rd16;
+; CHECK-NEXT:    ld.param.b32 %rd19, [spill_once_param_3];
+; CHECK-NEXT:    and.b64 %rd20, %rd19, 7;
+; CHECK-NEXT:    shl.b64 %rd21, %rd20, 3;
+; CHECK-NEXT:    add.s64 %rd22, %rd17, %rd21;
+; CHECK-NEXT:    ld.param.b32 %rd23, [spill_once_param_4];
+; CHECK-NEXT:    and.b64 %rd24, %rd23, 7;
+; CHECK-NEXT:    shl.b64 %rd25, %rd24, 3;
+; CHECK-NEXT:    add.s64 %rd26, %rd17, %rd25;
+; CHECK-NEXT:    st.b64 [%SP+56], %rd11;
+; CHECK-NEXT:    st.b64 [%SP+48], %rd10;
+; CHECK-NEXT:    st.b64 [%SP+40], %rd9;
+; CHECK-NEXT:    st.b64 [%SP+32], %rd8;
+; CHECK-NEXT:    st.b64 [%SP+24], %rd7;
+; CHECK-NEXT:    st.b64 [%SP+16], %rd6;
+; CHECK-NEXT:    st.b64 [%SP+8], %rd5;
+; CHECK-NEXT:    st.b64 [%SP], %rd4;
+; CHECK-NEXT:    st.b64 [%rd18], %rd2;
+; CHECK-NEXT:    st.b64 [%rd22], %rd3;
+; CHECK-NEXT:    st.b64 [%rd26], %rd12;
+; CHECK-NEXT:    ld.param.b32 %rd27, [spill_once_param_5];
+; CHECK-NEXT:    and.b64 %rd28, %rd27, 7;
+; CHECK-NEXT:    shl.b64 %rd29, %rd28, 3;
+; CHECK-NEXT:    add.s64 %rd30, %rd17, %rd29;
+; CHECK-NEXT:    st.b64 [%rd30], %rd13;
+; CHECK-NEXT:    ld.b64 %rd31, [%SP+8];
+; CHECK-NEXT:    ld.b64 %rd32, [%SP];
+; CHECK-NEXT:    ld.b64 %rd33, [%SP+24];
+; CHECK-NEXT:    ld.b64 %rd34, [%SP+16];
+; CHECK-NEXT:    ld.b64 %rd35, [%SP+40];
+; CHECK-NEXT:    ld.b64 %rd36, [%SP+32];
+; CHECK-NEXT:    ld.b64 %rd37, [%SP+56];
+; CHECK-NEXT:    ld.b64 %rd38, [%SP+48];
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1+1072], {%rd38, %rd37};
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1+1056], {%rd36, %rd35};
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1+1040], {%rd34, %rd33};
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1+1024], {%rd32, %rd31};
+; CHECK-NEXT:    ret;
+entry:
+  %offset.0 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 0
+  %element.0 = load double, ptr addrspace(3) %offset.0, align 64
+  %offset.1 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 8
+  %element.1 = load double, ptr addrspace(3) %offset.1, align 8
+  %offset.2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 16
+  %element.2 = load double, ptr addrspace(3) %offset.2, align 8
+  %offset.3 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 24
+  %element.3 = load double, ptr addrspace(3) %offset.3, align 8
+  %vector.build0 = insertelement <8 x double> %vector, double %element.0, i32 %idx0
+  %vector.build1 = insertelement <8 x double> %vector.build0, double %element.1, i32 %idx1
+  %vector.build2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
+  %vector.build3 = insertelement <8 x double> %vector.build2, double %element.3, i32 %idx3
+  %location = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1024
+  store <8 x double> %vector.build3, ptr addrspace(3) %location, align 64
+  ret void
+}
+
+; COM: Spill the vector twice. Because these are in two different slots, the
+; resulting spill codes may be non-overlapping even though the insertelt
+; sequences overlap.
+define ptx_kernel void @spill_twice(ptr addrspace(3) %shared.mem, <8 x double> %vector, i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) local_unnamed_addr {
+; CHECK-LABEL: spill_twice(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 8 .b8 __local_depot1[128];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b64 %rd<51>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    mov.b64 %SPL, __local_depot1;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.param.b64 %rd1, [spill_twice_param_0];
+; CHECK-NEXT:    ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd4, %rd5}, [spill_twice_param_1];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd6, %rd7}, [spill_twice_param_1+16];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd8, %rd9}, [spill_twice_param_1+32];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd10, %rd11}, [spill_twice_param_1+48];
+; CHECK-NEXT:    ld.shared.b64 %rd12, [%rd1+16];
+; CHECK-NEXT:    ld.shared.b64 %rd13, [%rd1+24];
+; CHECK-NEXT:    ld.param.b32 %rd14, [spill_twice_param_2];
+; CHECK-NEXT:    and.b64 %rd15, %rd14, 7;
+; CHECK-NEXT:    shl.b64 %rd16, %rd15, 3;
+; CHECK-NEXT:    add.u64 %rd17, %SP, 0;
+; CHECK-NEXT:    add.s64 %rd18, %rd17, %rd16;
+; CHECK-NEXT:    add.u64 %rd19, %SP, 64;
+; CHECK-NEXT:    add.s64 %rd20, %rd19, %rd16;
+; CHECK-NEXT:    ld.param.b32 %rd21, [spill_twice_param_3];
+; CHECK-NEXT:    and.b64 %rd22, %rd21, 7;
+; CHECK-NEXT:    shl.b64 %rd23, %rd22, 3;
+; CHECK-NEXT:    add.s64 %rd24, %rd17, %rd23;
+; CHECK-NEXT:    add.s64 %rd25, %rd19, %rd23;
+; CHECK-NEXT:    st.b64 [%SP+120], %rd11;
+; CHECK-NEXT:    st.b64 [%SP+112], %rd10;
+; CHECK-NEXT:    st.b64 [%SP+104], %rd9;
+; CHECK-NEXT:    st.b64 [%SP+96], %rd8;
+; CHECK-NEXT:    st.b64 [%SP+88], %rd7;
+; CHECK-NEXT:    st.b64 [%SP+80], %rd6;
+; CHECK-NEXT:    st.b64 [%SP+72], %rd5;
+; CHECK-NEXT:    st.b64 [%SP+64], %rd4;
+; CHECK-NEXT:    st.b64 [%rd20], %rd2;
+; CHECK-NEXT:    st.b64 [%rd25], %rd3;
+; CHECK-NEXT:    ld.param.b32 %rd26, [spill_twice_param_4];
+; CHECK-NEXT:    and.b64 %rd27, %rd26, 7;
+; CHECK-NEXT:    shl.b64 %rd28, %rd27, 3;
+; CHECK-NEXT:    add.s64 %rd29, %rd19, %rd28;
+; CHECK-NEXT:    st.b64 [%rd29], %rd12;
+; CHECK-NEXT:    add.s64 %rd30, %rd17, %rd28;
+; CHECK-NEXT:    ld.b64 %rd31, [%SP+72];
+; CHECK-NEXT:    ld.b64 %rd32, [%SP+64];
+; CHECK-NEXT:    ld.b64 %rd33, [%SP+88];
+; CHECK-NEXT:    ld.b64 %rd34, [%SP+80];
+; CHECK-NEXT:    ld.b64 %rd35, [%SP+104];
+; CHECK-NEXT:    ld.b64 %rd36, [%SP+96];
+; CHECK-NEXT:    ld.b64 %rd37, [%SP+120];
+; CHECK-NEXT:    ld.b64 %rd38, [%SP+112];
+; CHECK-NEXT:    st.b64 [%SP+56], %rd11;
+; CHECK-NEXT:    st.b64 [%SP+48], %rd10;
+; CHECK-NEXT:    st.b64 [%SP+40], %rd9;
+; CHECK-NEXT:    st.b64 [%SP+32], %rd8;
+; CHECK-NEXT:    st.b64 [%SP+24], %rd7;
+; CHECK-NEXT:    st.b64 [%SP+16], %rd6;
+; CHECK-NEXT:    st.b64 [%SP+8], %rd5;
+; CHECK-NEXT:    st.b64 [%SP], %rd4;
+; CHECK-NEXT:    st.b64 [%rd18], %rd2;
+; CHECK-NEXT:    st.b64 [%rd24], %rd3;
+; CHECK-NEXT:    st.b64 [%rd30], %rd12;
+; CHECK-NEXT:    ld.param.b32 %rd39, [spill_twice_param_5];
+; CHECK-NEXT:    and.b64 %rd40, %rd39, 7;
+; CHECK-NEXT:    shl.b64 %rd41, %rd40, 3;
+; CHECK-NEXT:    add.s64 %rd42, %rd17, %rd41;
+; CHECK-NEXT:    st.b64 [%rd42], %rd13;
+; CHECK-NEXT:    ld.b64 %rd43, [%SP+8];
+; CHECK-NEXT:    ld.b64 %rd44, [%SP];
+; CHECK-NEXT:    ld.b64 %rd45, [%SP+24];
+; CHECK-NEXT:    ld.b64 %rd46, [%SP+16];
+; CHECK-NEXT:    ld.b64 %rd47, [%SP+40];
+; CHECK-NEXT:    ld.b64 %rd48, [%SP+32];
+; CHECK-NEXT:    ld.b64 %rd49, [%SP+56];
+; CHECK-NEXT:    ld.b64 %rd50, [%SP+48];
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1+1072], {%rd50, %rd49};
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1+1056], {%rd48, %rd47};
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1+1040], {%rd46, %rd45};
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1+1024], {%rd44, %rd43};
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1+1144], {%rd38, %rd37};
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1+1128], {%rd36, %rd35};
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1+1112], {%rd34, %rd33};
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1+1096], {%rd32, %rd31};
+; CHECK-NEXT:    ret;
+entry:
+  %offset.0 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 0
+  %element.0 = load double, ptr addrspace(3) %offset.0, align 64
+  %offset.1 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 8
+  %element.1 = load double, ptr addrspace(3) %offset.1, align 8
+  %offset.2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 16
+  %element.2 = load double, ptr addrspace(3) %offset.2, align 8
+  %offset.3 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 24
+  %element.3 = load double, ptr addrspace(3) %offset.3, align 8
+
+; COM: begin chain 1
+  %vector.build0 = insertelement <8 x double> %vector, double %element.0, i32 %idx0
+  %vector.build1 = insertelement <8 x double> %vector.build0, double %element.1, i32 %idx1
+
+; COM: interleave a second chain of insertelements
+  %vector.build1-2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
+
+; COM: continue chain 1
+  %vector.build2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
+  %vector.build3 = insertelement <8 x double> %vector.build2, double %element.3, i32 %idx3
+
+; COM: save chain 1
+  %location = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1024
+  store <8 x double> %vector.build3, ptr addrspace(3) %location, align 64
+
+; COM: save chain 2
+  %location-2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1096
+  store <8 x double> %vector.build1-2, ptr addrspace(3) %location-2, align 64
+  ret void
+}
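For reference, the net effect of the combine can be sketched at the IR level. The transform itself runs on SelectionDAG nodes during DAGCombine, so this is only an illustrative sketch (function names are hypothetical, and the real code uses the reduced alignment of the split parts and a truncating store when the inserted scalar is wider than the element type): a chain of dynamic insertelements such as @chain is lowered as if the vector were spilled to one stack slot, written element-wise, and reloaded once, as in @chain.spilled.once.

define <8 x double> @chain(<8 x double> %v, double %a, double %b, i32 %i, i32 %j) {
  %v0 = insertelement <8 x double> %v, double %a, i32 %i
  %v1 = insertelement <8 x double> %v0, double %b, i32 %j
  ret <8 x double> %v1
}

define <8 x double> @chain.spilled.once(<8 x double> %v, double %a, double %b, i32 %i, i32 %j) {
  %slot = alloca <8 x double>, align 8          ; one stack temporary for the whole chain
  store <8 x double> %v, ptr %slot, align 8     ; spill the incoming vector once
  %i.clamped = and i32 %i, 7                    ; getVectorElementPointer clamps dynamic indices
  %p.i = getelementptr double, ptr %slot, i32 %i.clamped
  store double %a, ptr %p.i, align 8            ; each insertelt in the chain becomes a scalar store
  %j.clamped = and i32 %j, 7
  %p.j = getelementptr double, ptr %slot, i32 %j.clamped
  store double %b, ptr %p.j, align 8
  %res = load <8 x double>, ptr %slot, align 8  ; reload the vector once, at the end of the chain
  ret <8 x double> %res
}

Without the combine, type legalization of the illegal <8 x double> would create a fresh spill of the whole vector for every dynamic insertelement in the chain; the and/shl address arithmetic in the CHECK lines above corresponds to the clamped and scaled per-element indices.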