Skip to content

Commit 73245b0

Browse files
authored
[RISCV] Rewrite deinterleave load as vlse optimization as DAG combine (llvm#150049)
This reworks an existing optimization on the fixed vector (shuffle based) deinterleave lowering into a DAG combine. This allows the optimization to kick in much more widely - in particular on the deinterleave intrinsic (i.e. scalable) path and on deinterleaveN (without load) lowering, as well as on the intrinsic lowering paths.
1 parent fa6965f commit 73245b0

File tree

6 files changed

+86
-41
lines changed

6 files changed

+86
-41
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20843,6 +20843,62 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
2084320843
}
2084420844
break;
2084520845
}
20846+
case RISCVISD::TUPLE_EXTRACT: {
20847+
EVT VT = N->getValueType(0);
20848+
SDValue Tuple = N->getOperand(0);
20849+
unsigned Idx = N->getConstantOperandVal(1);
20850+
if (!Tuple.hasOneUse() || Tuple.getOpcode() != ISD::INTRINSIC_W_CHAIN)
20851+
break;
20852+
20853+
unsigned NF = 0;
20854+
switch (Tuple.getConstantOperandVal(1)) {
20855+
default:
20856+
break;
20857+
case Intrinsic::riscv_vlseg2_mask:
20858+
case Intrinsic::riscv_vlseg3_mask:
20859+
case Intrinsic::riscv_vlseg4_mask:
20860+
case Intrinsic::riscv_vlseg5_mask:
20861+
case Intrinsic::riscv_vlseg6_mask:
20862+
case Intrinsic::riscv_vlseg7_mask:
20863+
case Intrinsic::riscv_vlseg8_mask:
20864+
NF = Tuple.getValueType().getRISCVVectorTupleNumFields();
20865+
break;
20866+
}
20867+
20868+
if (!NF || Subtarget.hasOptimizedSegmentLoadStore(NF))
20869+
break;
20870+
20871+
unsigned SEW = VT.getScalarSizeInBits();
20872+
assert(Log2_64(SEW) == Tuple.getConstantOperandVal(7) &&
20873+
"Type mismatch without bitcast?");
20874+
unsigned Stride = SEW / 8 * NF;
20875+
unsigned Offset = SEW / 8 * Idx;
20876+
20877+
SDValue Ops[] = {
20878+
/*Chain=*/Tuple.getOperand(0),
20879+
/*IntID=*/DAG.getTargetConstant(Intrinsic::riscv_vlse_mask, DL, XLenVT),
20880+
/*Passthru=*/Tuple.getOperand(2),
20881+
/*Ptr=*/
20882+
DAG.getNode(ISD::ADD, DL, XLenVT, Tuple.getOperand(3),
20883+
DAG.getConstant(Offset, DL, XLenVT)),
20884+
/*Stride=*/DAG.getConstant(Stride, DL, XLenVT),
20885+
/*Mask=*/Tuple.getOperand(4),
20886+
/*VL=*/Tuple.getOperand(5),
20887+
/*Policy=*/Tuple.getOperand(6)};
20888+
20889+
auto TupleMemSD = cast<MemIntrinsicSDNode>(Tuple);
20890+
// Match getTgtMemIntrinsic for non-unit stride case
20891+
EVT MemVT = TupleMemSD->getMemoryVT().getScalarType();
20892+
MachineFunction &MF = DAG.getMachineFunction();
20893+
MachineMemOperand *MMO = MF.getMachineMemOperand(
20894+
TupleMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize);
20895+
20896+
SDVTList VTs = DAG.getVTList({VT, MVT::Other});
20897+
SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs,
20898+
Ops, MemVT, MMO);
20899+
DAG.ReplaceAllUsesOfValueWith(Tuple.getValue(1), Result.getValue(1));
20900+
return Result.getValue(0);
20901+
}
2084620902
}
2084720903

2084820904
return SDValue();

llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -216,29 +216,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
216216
if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
217217
return false;
218218

219-
// If the segment load is going to be performed segment at a time anyways
220-
// and there's only one element used, use a strided load instead. This
221-
// will be equally fast, and create less vector register pressure.
222-
if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) {
223-
unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
224-
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
225-
Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
226-
Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
227-
// For rv64, need to truncate i64 to i32 to match signature. As VL is at most
228-
// the number of active lanes (which is bounded by i32) this is safe.
229-
VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
230-
231-
CallInst *CI =
232-
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
233-
{VTy, BasePtr->getType(), Stride->getType()},
234-
{BasePtr, Stride, Mask, VL});
235-
Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes);
236-
CI->addParamAttr(0,
237-
Attribute::getWithAlignment(CI->getContext(), Alignment));
238-
Shuffles[0]->replaceAllUsesWith(CI);
239-
return true;
240-
};
241-
242219
CallInst *VlsegN = Builder.CreateIntrinsic(
243220
FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
244221

llvm/test/CodeGen/RISCV/rvv/pr141907.ll

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,27 +9,29 @@ define void @pr141907(ptr %0) nounwind {
99
; CHECK-NEXT: slli a1, a1, 2
1010
; CHECK-NEXT: sub sp, sp, a1
1111
; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, ma
12-
; CHECK-NEXT: vmv.v.i v9, 0
12+
; CHECK-NEXT: vmv.v.i v8, 0
1313
; CHECK-NEXT: vmclr.m v0
1414
; CHECK-NEXT: li a1, 0
15-
; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
16-
; CHECK-NEXT: vmv.v.i v12, 0
15+
; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
16+
; CHECK-NEXT: vmv.v.i v10, 0
1717
; CHECK-NEXT: addi a2, sp, 16
18+
; CHECK-NEXT: addi a3, sp, 20
19+
; CHECK-NEXT: li a4, 12
1820
; CHECK-NEXT: .LBB0_1: # %vector.body
1921
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2022
; CHECK-NEXT: vs4r.v v8, (a2)
2123
; CHECK-NEXT: vsetvli a1, a1, e8, mf8, ta, ma
2224
; CHECK-NEXT: vsetivli zero, 0, e16, mf2, ta, ma
23-
; CHECK-NEXT: vnsrl.wi v11, v9, 0, v0.t
24-
; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
25-
; CHECK-NEXT: vlseg3e32.v v8, (a2)
25+
; CHECK-NEXT: vnsrl.wi v9, v8, 0, v0.t
26+
; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma
27+
; CHECK-NEXT: vlse32.v v8, (a3), a4
2628
; CHECK-NEXT: vsetivli zero, 0, e16, mf2, ta, ma
27-
; CHECK-NEXT: vsseg2e16.v v11, (zero)
29+
; CHECK-NEXT: vsseg2e16.v v9, (zero)
2830
; CHECK-NEXT: bnez a1, .LBB0_1
2931
; CHECK-NEXT: .LBB0_2: # %while.body5
3032
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3133
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
32-
; CHECK-NEXT: vse16.v v9, (a0)
34+
; CHECK-NEXT: vse16.v v8, (a0)
3335
; CHECK-NEXT: j .LBB0_2
3436
entry:
3537
br label %vector.body

llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -407,8 +407,9 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
407407
define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive(ptr %p) {
408408
; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive:
409409
; CHECK: # %bb.0:
410-
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
411-
; CHECK-NEXT: vlseg4e8.v v8, (a0)
410+
; CHECK-NEXT: li a1, 4
411+
; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
412+
; CHECK-NEXT: vlse8.v v8, (a0), a1
412413
; CHECK-NEXT: ret
413414
%vec = load <vscale x 32 x i8>, ptr %p
414415
%d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
@@ -419,8 +420,10 @@ define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive(ptr %p) {
419420
define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive2(ptr %p) {
420421
; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive2:
421422
; CHECK: # %bb.0:
422-
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
423-
; CHECK-NEXT: vlseg4e8.v v5, (a0)
423+
; CHECK-NEXT: addi a0, a0, 3
424+
; CHECK-NEXT: li a1, 4
425+
; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
426+
; CHECK-NEXT: vlse8.v v8, (a0), a1
424427
; CHECK-NEXT: ret
425428
%vec = load <vscale x 32 x i8>, ptr %p
426429
%d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)

llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3712,8 +3712,9 @@ define <vscale x 1 x float> @vector_deinterleave_nxv1f32_nxv8f32_oneactive(<vsca
37123712
; CHECK-NEXT: sub sp, sp, a0
37133713
; CHECK-NEXT: addi a0, sp, 16
37143714
; CHECK-NEXT: vs4r.v v8, (a0)
3715-
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
3716-
; CHECK-NEXT: vlseg8e32.v v8, (a0)
3715+
; CHECK-NEXT: li a1, 32
3716+
; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
3717+
; CHECK-NEXT: vlse32.v v8, (a0), a1
37173718
; CHECK-NEXT: csrr a0, vlenb
37183719
; CHECK-NEXT: slli a0, a0, 2
37193720
; CHECK-NEXT: add sp, sp, a0
@@ -3732,9 +3733,11 @@ define <vscale x 1 x float> @vector_deinterleave_nxv1f32_nxv8f32_oneactive2(<vsc
37323733
; CHECK-NEXT: slli a0, a0, 2
37333734
; CHECK-NEXT: sub sp, sp, a0
37343735
; CHECK-NEXT: addi a0, sp, 16
3736+
; CHECK-NEXT: addi a1, sp, 36
37353737
; CHECK-NEXT: vs4r.v v8, (a0)
3736-
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
3737-
; CHECK-NEXT: vlseg8e32.v v3, (a0)
3738+
; CHECK-NEXT: li a0, 32
3739+
; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
3740+
; CHECK-NEXT: vlse32.v v8, (a1), a0
37383741
; CHECK-NEXT: csrr a0, vlenb
37393742
; CHECK-NEXT: slli a0, a0, 2
37403743
; CHECK-NEXT: add sp, sp, a0

llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -674,16 +674,20 @@ define <vscale x 2 x i32> @load_factor2_oneactive(ptr %ptr, i32 %evl) {
674674
define <vscale x 2 x i32> @load_factor5_oneactive(ptr %ptr, i32 %evl) {
675675
; RV32-LABEL: load_factor5_oneactive:
676676
; RV32: # %bb.0:
677+
; RV32-NEXT: addi a0, a0, 12
678+
; RV32-NEXT: li a2, 20
677679
; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
678-
; RV32-NEXT: vlseg5e32.v v5, (a0)
680+
; RV32-NEXT: vlse32.v v8, (a0), a2
679681
; RV32-NEXT: ret
680682
;
681683
; RV64-LABEL: load_factor5_oneactive:
682684
; RV64: # %bb.0:
683685
; RV64-NEXT: slli a1, a1, 32
686+
; RV64-NEXT: addi a0, a0, 12
684687
; RV64-NEXT: srli a1, a1, 32
688+
; RV64-NEXT: li a2, 20
685689
; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
686-
; RV64-NEXT: vlseg5e32.v v5, (a0)
690+
; RV64-NEXT: vlse32.v v8, (a0), a2
687691
; RV64-NEXT: ret
688692
%rvl = mul nuw i32 %evl, 5
689693
%wide.masked.load = call <vscale x 10 x i32> @llvm.vp.load(ptr %ptr, <vscale x 10 x i1> splat (i1 true), i32 %rvl)

0 commit comments

Comments
 (0)