Skip to content

Commit b0fa117

Browse files
committed
[SelectionDAG][RISCV] Add support for splitting vp.splice
Use a stack-based expansion similar to the non-VP splice. This code has been in our downstream for a while, though I don't know how often it is exercised. Our downstream was missing clamping of the immediate value to keep it in range of the stack object, so I've added it. As I'm writing this, I realize we may also need to clamp EVL1 when we create the pointer used to store the second vector.
1 parent 6c8c816 commit b0fa117

File tree

3 files changed

+204
-0
lines changed

3 files changed

+204
-0
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -985,6 +985,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
985985
void SplitVecRes_VECTOR_INTERLEAVE(SDNode *N);
986986
void SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi);
987987
void SplitVecRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo, SDValue &Hi);
988+
void SplitVecRes_VP_SPLICE(SDNode *N, SDValue &Lo, SDValue &Hi);
988989
void SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo, SDValue &Hi);
989990
void SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, SDValue &Hi);
990991
void SplitVecRes_GET_ACTIVE_LANE_MASK(SDNode *N, SDValue &Lo, SDValue &Hi);

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1382,6 +1382,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
13821382
case ISD::UDIVFIXSAT:
13831383
SplitVecRes_FIX(N, Lo, Hi);
13841384
break;
1385+
case ISD::EXPERIMENTAL_VP_SPLICE:
1386+
SplitVecRes_VP_SPLICE(N, Lo, Hi);
1387+
break;
13851388
case ISD::EXPERIMENTAL_VP_REVERSE:
13861389
SplitVecRes_VP_REVERSE(N, Lo, Hi);
13871390
break;
@@ -3209,6 +3212,79 @@ void DAGTypeLegalizer::SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo,
32093212
std::tie(Lo, Hi) = DAG.SplitVector(Load, DL);
32103213
}
32113214

3215+
// Split the result of an EXPERIMENTAL_VP_SPLICE whose vector type is not
// legal, using a stack-based expansion (mirroring the non-VP splice
// lowering): store V1 and V2 contiguously to a stack temporary sized for
// two vectors, then load the spliced result from an offset into that
// buffer and split the loaded value in half.
void DAGTypeLegalizer::SplitVecRes_VP_SPLICE(SDNode *N, SDValue &Lo,
                                             SDValue &Hi) {
  EVT VT = N->getValueType(0);
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  int64_t Imm = cast<ConstantSDNode>(N->getOperand(2))->getSExtValue();
  SDValue Mask = N->getOperand(3);
  SDValue EVL1 = N->getOperand(4);
  SDValue EVL2 = N->getOperand(5);
  SDLoc DL(N);

  // Since EVL2 is considered the real VL it gets promoted during
  // SelectionDAGBuilder. Promote EVL1 here if needed.
  if (getTypeAction(EVL1.getValueType()) == TargetLowering::TypePromoteInteger)
    EVL1 = ZExtPromotedInteger(EVL1);

  Align Alignment = DAG.getReducedAlign(VT, /*UseABI=*/false);

  // The stack temporary holds both source vectors back to back.
  EVT MemVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
                               VT.getVectorElementCount() * 2);
  SDValue StackPtr = DAG.CreateStackTemporary(MemVT.getStoreSize(), Alignment);
  EVT PtrVT = StackPtr.getValueType();
  auto &MF = DAG.getMachineFunction();
  auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);

  // Accesses are at runtime-variable offsets from the frame base, so use an
  // unknown location size rather than a fixed one.
  MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(),
      Alignment);
  MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(),
      Alignment);

  unsigned EltWidth = VT.getScalarSizeInBits() / 8;

  // V2 is stored immediately after the first EVL1 elements of V1. Clamp the
  // element count used to form that pointer so it stays within the region
  // reserved for V1 even if EVL1 exceeds the vector length; the VP intrinsic
  // semantics only define behavior for EVL <= the vector length, but keep the
  // generated pointer inside the stack object regardless.
  SDValue EVL1Elts = DAG.getZExtOrTrunc(EVL1, DL, PtrVT);
  SDValue MaxElts = DAG.getElementCount(DL, PtrVT, VT.getVectorElementCount());
  EVL1Elts = DAG.getNode(ISD::UMIN, DL, PtrVT, EVL1Elts, MaxElts);
  SDValue OffsetToV2 =
      DAG.getNode(ISD::MUL, DL, PtrVT, EVL1Elts,
                  DAG.getConstant(EltWidth, DL, PtrVT));
  SDValue StackPtr2 = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, OffsetToV2);

  // Both stores write all active lanes; masking is applied on the final load.
  SDValue TrueMask = DAG.getBoolConstant(true, DL, Mask.getValueType(), VT);
  SDValue StoreV1 = DAG.getStoreVP(DAG.getEntryNode(), DL, V1, StackPtr,
                                   DAG.getUNDEF(PtrVT), TrueMask, EVL1,
                                   V1.getValueType(), StoreMMO, ISD::UNINDEXED);

  SDValue StoreV2 =
      DAG.getStoreVP(StoreV1, DL, V2, StackPtr2, DAG.getUNDEF(PtrVT), TrueMask,
                     EVL2, V2.getValueType(), StoreMMO, ISD::UNINDEXED);

  SDValue Load;
  if (Imm >= 0) {
    // Positive offset: result starts Imm elements into the buffer.
    // getVectorElementPointer clamps the index to keep it inside VT's extent,
    // so the access stays within the 2*VT stack object.
    StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VT, N->getOperand(2));
    Load = DAG.getLoadVP(VT, DL, StoreV2, StackPtr, Mask, EVL2, LoadMMO);
  } else {
    // Negative offset: result starts TrailingElts before the start of V2.
    uint64_t TrailingElts = -Imm;
    SDValue TrailingBytes = DAG.getConstant(TrailingElts * EltWidth, DL, PtrVT);

    // Make sure TrailingBytes doesn't exceed the size of vec1.
    TrailingBytes =
        DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, OffsetToV2);

    // Calculate the start address of the spliced result.
    StackPtr2 = DAG.getNode(ISD::SUB, DL, PtrVT, StackPtr2, TrailingBytes);
    Load = DAG.getLoadVP(VT, DL, StoreV2, StackPtr2, Mask, EVL2, LoadMMO);
  }

  // Split the loaded splice result into the low and high halves.
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, Load,
                   DAG.getVectorIdxConstant(0, DL));
  Hi =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, Load,
                  DAG.getVectorIdxConstant(LoVT.getVectorMinNumElements(), DL));
}
3287+
32123288
void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo,
32133289
SDValue &Hi) {
32143290
SDLoc DL(N);

llvm/test/CodeGen/RISCV/rvv/vp-splice.ll

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,3 +286,130 @@ define <vscale x 2 x float> @test_vp_splice_nxv2f32_masked(<vscale x 2 x float>
286286
%v = call <vscale x 2 x float> @llvm.experimental.vp.splice.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, i32 5, <vscale x 2 x i1> %mask, i32 %evla, i32 %evlb)
287287
ret <vscale x 2 x float> %v
288288
}
289+
290+
; nxv16i64 must be split by type legalization (it is wider than the largest
; legal RVV register group), so this exercises the stack-based
; SplitVecRes_VP_SPLICE expansion with a positive splice offset (5).
; CHECK lines are generated by update_llc_test_checks.py — regenerate, don't
; hand-edit.
define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) nounwind {
; CHECK-LABEL: test_vp_splice_nxv16i64:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a4, vlenb
; CHECK-NEXT: slli a1, a4, 3
; CHECK-NEXT: add a5, a0, a1
; CHECK-NEXT: vl8re64.v v24, (a5)
; CHECK-NEXT: mv a5, a2
; CHECK-NEXT: bltu a2, a4, .LBB21_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a5, a4
; CHECK-NEXT: .LBB21_2:
; CHECK-NEXT: addi sp, sp, -80
; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; CHECK-NEXT: addi s0, sp, 80
; CHECK-NEXT: csrr a6, vlenb
; CHECK-NEXT: slli a6, a6, 5
; CHECK-NEXT: sub sp, sp, a6
; CHECK-NEXT: andi sp, sp, -64
; CHECK-NEXT: vl8re64.v v0, (a0)
; CHECK-NEXT: addi a0, sp, 64
; CHECK-NEXT: sub a6, a2, a4
; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a0)
; CHECK-NEXT: sltu a5, a2, a6
; CHECK-NEXT: addi a5, a5, -1
; CHECK-NEXT: and a5, a5, a6
; CHECK-NEXT: add a6, a0, a1
; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v16, (a6)
; CHECK-NEXT: sub a5, a3, a4
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sltu a6, a3, a5
; CHECK-NEXT: add a2, a0, a2
; CHECK-NEXT: addi a0, a6, -1
; CHECK-NEXT: add a6, a2, a1
; CHECK-NEXT: and a0, a0, a5
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v24, (a6)
; CHECK-NEXT: bltu a3, a4, .LBB21_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: mv a3, a4
; CHECK-NEXT: .LBB21_4:
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v0, (a2)
; CHECK-NEXT: addi a2, sp, 104
; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a2)
; CHECK-NEXT: addi sp, s0, -80
; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; CHECK-NEXT: addi sp, sp, 80
; CHECK-NEXT: ret
  %v = call <vscale x 16 x i64> @llvm.experimental.vp.splice.nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 5, <vscale x 16 x i1> splat (i1 1), i32 %evla, i32 %evlb)
  ret <vscale x 16 x i64> %v
}
350+
351+
; Same split-type expansion as above, but with a negative splice offset (-1):
; the result is read starting TrailingBytes before the second vector, with
; the offset clamped (li a4, 8 / bltu sequence) so it cannot reach before the
; start of the stack object.
; CHECK lines are generated by update_llc_test_checks.py — regenerate, don't
; hand-edit.
define <vscale x 16 x i64> @test_vp_splice_nxv16i64_negative_offset(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) nounwind {
; CHECK-LABEL: test_vp_splice_nxv16i64_negative_offset:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a4, vlenb
; CHECK-NEXT: slli a1, a4, 3
; CHECK-NEXT: add a5, a0, a1
; CHECK-NEXT: vl8re64.v v0, (a5)
; CHECK-NEXT: mv a5, a2
; CHECK-NEXT: bltu a2, a4, .LBB22_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a5, a4
; CHECK-NEXT: .LBB22_2:
; CHECK-NEXT: addi sp, sp, -80
; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; CHECK-NEXT: addi s0, sp, 80
; CHECK-NEXT: csrr a6, vlenb
; CHECK-NEXT: slli a6, a6, 5
; CHECK-NEXT: sub sp, sp, a6
; CHECK-NEXT: andi sp, sp, -64
; CHECK-NEXT: vl8re64.v v24, (a0)
; CHECK-NEXT: addi a0, sp, 64
; CHECK-NEXT: sub a6, a2, a4
; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a0)
; CHECK-NEXT: sltu a5, a2, a6
; CHECK-NEXT: addi a5, a5, -1
; CHECK-NEXT: and a5, a5, a6
; CHECK-NEXT: add a6, a0, a1
; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v16, (a6)
; CHECK-NEXT: sub a6, a3, a4
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sltu a7, a3, a6
; CHECK-NEXT: add a5, a0, a2
; CHECK-NEXT: addi a7, a7, -1
; CHECK-NEXT: and a0, a7, a6
; CHECK-NEXT: add a6, a5, a1
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v0, (a6)
; CHECK-NEXT: bltu a3, a4, .LBB22_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: mv a3, a4
; CHECK-NEXT: .LBB22_4:
; CHECK-NEXT: li a4, 8
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v24, (a5)
; CHECK-NEXT: bltu a2, a4, .LBB22_6
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: li a2, 8
; CHECK-NEXT: .LBB22_6:
; CHECK-NEXT: sub a5, a5, a2
; CHECK-NEXT: add a1, a5, a1
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a5)
; CHECK-NEXT: addi sp, s0, -80
; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; CHECK-NEXT: addi sp, sp, 80
; CHECK-NEXT: ret
  %v = call <vscale x 16 x i64> @llvm.experimental.vp.splice.nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 -1, <vscale x 16 x i1> splat (i1 1), i32 %evla, i32 %evlb)
  ret <vscale x 16 x i64> %v
}

0 commit comments

Comments
 (0)