[RISCV] Use vcompress in deinterleave2 intrinsic lowering #118325
Conversation
This is analogous to febbf91, which added shuffle lowering using vcompress; we can do the same thing in the deinterleave2 lowering path, which is used for scalable vectors. Note that we can further improve this for high LMUL usage by adjusting how we materialize the mask (whose result is at most m1 with a known bit pattern). I am deliberately staging the work so that the changes to reduce register pressure are more easily evaluated on their own merit.
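To make the transform concrete, here is a small scalar model of what the compress-based lowering computes (plain C++, illustrative only; the element values and helper names are made up and this is not the SelectionDAG code from the patch):

```cpp
// Scalar model of deinterleave2 via two compress operations.
#include <cstdint>
#include <iostream>
#include <vector>

// vcompress semantics: keep the source elements whose mask bit is set,
// packed contiguously at the low end of the destination.
static std::vector<uint64_t> compress(const std::vector<uint64_t> &Src,
                                      const std::vector<bool> &Mask) {
  std::vector<uint64_t> Dst;
  for (size_t I = 0; I < Src.size(); ++I)
    if (Mask[I])
      Dst.push_back(Src[I]);
  return Dst;
}

int main() {
  // Concatenation of the two input halves (the interleaved data):
  // a0, b0, a1, b1, ...
  std::vector<uint64_t> Concat = {10, 11, 20, 21, 30, 31, 40, 41};

  // Even mask = (step & 1) == 0; the odd mask is its complement, so the
  // (potentially wide) step vector does not have to stay live.
  std::vector<bool> EvenMask(Concat.size()), OddMask(Concat.size());
  for (size_t I = 0; I < Concat.size(); ++I) {
    EvenMask[I] = (I & 1) == 0;
    OddMask[I] = !EvenMask[I];
  }

  // The low half of each compressed vector is one deinterleaved result.
  for (uint64_t V : compress(Concat, EvenMask))
    std::cout << V << ' '; // 10 20 30 40
  std::cout << '\n';
  for (uint64_t V : compress(Concat, OddMask))
    std::cout << V << ' '; // 11 21 31 41
  std::cout << '\n';
  return 0;
}
```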
@llvm/pr-subscribers-backend-risc-v

Author: Philip Reames (preames)

Changes:

This is analogous to febbf91, which added shuffle lowering using vcompress; we can do the same thing in the deinterleave2 lowering path, which is used for scalable vectors. Note that we can further improve this for high LMUL usage by adjusting how we materialize the mask (whose result is at most m1 with a known bit pattern). I am deliberately staging the work so that the changes to reduce register pressure are more easily evaluated on their own merit.

Full diff: https://github.com/llvm/llvm-project/pull/118325.diff

3 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 78dc3cb27a6988..9229ca64a17391 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -10756,9 +10756,6 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT,
Op.getOperand(0), Op.getOperand(1));
- // We want to operate on all lanes, so get the mask and VL and mask for it
- auto [Mask, VL] = getDefaultScalableVLOps(ConcatVT, DL, DAG, Subtarget);
- SDValue Passthru = DAG.getUNDEF(ConcatVT);
// We can deinterleave through vnsrl.wi if the element type is smaller than
// ELEN
@@ -10771,19 +10768,28 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
}
// For the indices, use the same SEW to avoid an extra vsetvli
+ // TODO: If container type is larger than m1, we can consider using a splat
+ // of a constant instead of the following sequence
+
+ // Create a vector of even indices {0, 1, 2, ...}
MVT IdxVT = ConcatVT.changeVectorElementTypeToInteger();
- // Create a vector of even indices {0, 2, 4, ...}
- SDValue EvenIdx =
- DAG.getStepVector(DL, IdxVT, APInt(IdxVT.getScalarSizeInBits(), 2));
- // Create a vector of odd indices {1, 3, 5, ... }
- SDValue OddIdx =
- DAG.getNode(ISD::ADD, DL, IdxVT, EvenIdx, DAG.getConstant(1, DL, IdxVT));
-
- // Gather the even and odd elements into two separate vectors
- SDValue EvenWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, ConcatVT,
- Concat, EvenIdx, Passthru, Mask, VL);
- SDValue OddWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, ConcatVT,
- Concat, OddIdx, Passthru, Mask, VL);
+ SDValue StepVec = DAG.getStepVector(DL, IdxVT);
+ // 0, 1, 0, 1, 0, 1
+ SDValue ZeroOnes = DAG.getNode(ISD::AND, DL, IdxVT, StepVec,
+ DAG.getConstant(1, DL, IdxVT));
+ MVT MaskVT = ConcatVT.changeVectorElementType(MVT::i1);
+ SDValue EvenMask = DAG.getSetCC(DL, MaskVT, ZeroOnes,
+ DAG.getConstant(0, DL, IdxVT),
+ ISD::CondCode::SETEQ);
+ // Have the latter be the not of the former to minimize the live range of
+ // the index vector since that might be large.
+ SDValue OddMask = DAG.getLogicalNOT(DL, EvenMask, MaskVT);
+
+ // vcompress the even and odd elements into two separate vectors
+ SDValue EvenWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat,
+ EvenMask, DAG.getUNDEF(ConcatVT));
+ SDValue OddWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat,
+ OddMask, DAG.getUNDEF(ConcatVT));
// Extract the result half of the gather for even and odd
SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 39a1bfcda3d83f..4338d1f61af728 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -106,95 +106,55 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 40
+; CHECK-NEXT: li a2, 24
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: vl8re64.v v16, (a0)
; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; CHECK-NEXT: vid.v v8
; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: vadd.vv v24, v8, v8
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vand.vi v8, v8, 1
; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vmseq.vi v24, v8, 0
; CHECK-NEXT: vl8re64.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vadd.vi v8, v24, 1
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vrgather.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vv v24, v16, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmnot.m v6, v24
+; CHECK-NEXT: vcompress.vm v8, v16, v24
+; CHECK-NEXT: vmv1r.v v13, v24
+; CHECK-NEXT: vcompress.vm v24, v16, v6
+; CHECK-NEXT: vmv1r.v v12, v6
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vv v24, v16, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vcompress.vm v0, v16, v13
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vv v16, v24, v0
+; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vcompress.vm v0, v16, v12
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v12, v16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v28, v16
; CHECK-NEXT: vmv8r.v v16, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 40
+; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index f20a90a4223139..99743066c79a82 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -73,12 +73,13 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vid.v v12
-; CHECK-NEXT: vadd.vv v16, v12, v12
-; CHECK-NEXT: vrgather.vv v12, v8, v16
-; CHECK-NEXT: vadd.vi v16, v16, 1
-; CHECK-NEXT: vrgather.vv v20, v8, v16
+; CHECK-NEXT: vand.vi v12, v12, 1
+; CHECK-NEXT: vmseq.vi v16, v12, 0
+; CHECK-NEXT: vcompress.vm v12, v8, v16
+; CHECK-NEXT: vmnot.m v14, v16
+; CHECK-NEXT: vcompress.vm v16, v8, v14
; CHECK-NEXT: vmv2r.v v8, v12
-; CHECK-NEXT: vmv2r.v v10, v20
+; CHECK-NEXT: vmv2r.v v10, v16
; CHECK-NEXT: ret
%retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
@@ -89,12 +90,13 @@ define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_nxv4i64_nxv
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vid.v v16
-; CHECK-NEXT: vadd.vv v24, v16, v16
-; CHECK-NEXT: vrgather.vv v16, v8, v24
-; CHECK-NEXT: vadd.vi v24, v24, 1
-; CHECK-NEXT: vrgather.vv v0, v8, v24
+; CHECK-NEXT: vand.vi v16, v16, 1
+; CHECK-NEXT: vmseq.vi v24, v16, 0
+; CHECK-NEXT: vcompress.vm v16, v8, v24
+; CHECK-NEXT: vmnot.m v20, v24
+; CHECK-NEXT: vcompress.vm v24, v8, v20
; CHECK-NEXT: vmv4r.v v8, v16
-; CHECK-NEXT: vmv4r.v v12, v0
+; CHECK-NEXT: vmv4r.v v12, v24
; CHECK-NEXT: ret
%retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
@@ -180,66 +182,50 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv8r.v v24, v8
-; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT: vid.v v8
-; CHECK-NEXT: vadd.vv v0, v8, v8
-; CHECK-NEXT: vrgather.vv v8, v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vv v16, v8, v0
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vadd.vi v8, v0, 1
-; CHECK-NEXT: vrgather.vv v0, v24, v8
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vv v16, v24, v8
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vid.v v16
+; CHECK-NEXT: vand.vi v24, v16, 1
+; CHECK-NEXT: vmseq.vi v16, v24, 0
+; CHECK-NEXT: vcompress.vm v24, v8, v16
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmnot.m v17, v16
+; CHECK-NEXT: vcompress.vm v0, v8, v17
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vcompress.vm v24, v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vcompress.vm v24, v8, v17
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v20, v8
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmv4r.v v4, v8
+; CHECK-NEXT: vmv4r.v v4, v24
; CHECK-NEXT: vmv8r.v v8, v16
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
@@ -366,12 +352,13 @@ define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vid.v v12
-; CHECK-NEXT: vadd.vv v16, v12, v12
-; CHECK-NEXT: vrgather.vv v12, v8, v16
-; CHECK-NEXT: vadd.vi v16, v16, 1
-; CHECK-NEXT: vrgather.vv v20, v8, v16
+; CHECK-NEXT: vand.vi v12, v12, 1
+; CHECK-NEXT: vmseq.vi v16, v12, 0
+; CHECK-NEXT: vcompress.vm v12, v8, v16
+; CHECK-NEXT: vmnot.m v14, v16
+; CHECK-NEXT: vcompress.vm v16, v8, v14
; CHECK-NEXT: vmv2r.v v8, v12
-; CHECK-NEXT: vmv2r.v v10, v20
+; CHECK-NEXT: vmv2r.v v10, v16
; CHECK-NEXT: ret
%retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
@@ -436,66 +423,50 @@ define {<vscale x 8 x double>, <vscale x 8 x double>} @vector_deinterleave_nxv8f
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv8r.v v24, v8
-; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT: vid.v v8
-; CHECK-NEXT: vadd.vv v0, v8, v8
-; CHECK-NEXT: vrgather.vv v8, v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vv v16, v8, v0
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vadd.vi v8, v0, 1
-; CHECK-NEXT: vrgather.vv v0, v24, v8
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vv v16, v24, v8
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vid.v v16
+; CHECK-NEXT: vand.vi v24, v16, 1
+; CHECK-NEXT: vmseq.vi v16, v24, 0
+; CHECK-NEXT: vcompress.vm v24, v8, v16
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmnot.m v17, v16
+; CHECK-NEXT: vcompress.vm v0, v8, v17
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vcompress.vm v24, v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vcompress.vm v24, v8, v17
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v20, v8
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmv4r.v v4, v8
+; CHECK-NEXT: vmv4r.v v4, v24
; CHECK-NEXT: vmv8r.v v8, v16
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
✅ With the latest revision this PR passed the C/C++ code formatter.
// TODO: If container type is larger than m1, we can consider using a splat
// of a constant instead of the following sequence

// Create a vector of even indices {0, 1, 2, ...}
Is this comment still needed? Since we don't have indices anymore, just a mask
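Side note on that TODO: the even-lane mask is a fixed repeating bit pattern, which is what would let a splat of a constant replace the vid/vand/vmseq sequence in a follow-up. A tiny sketch of the equivalence (plain C++; the helper names and the 0x55 byte-splat encoding are assumptions, not the planned implementation):

```cpp
// Illustrative check that the even-lane mask built by the new sequence
// (vid.v; vand.vi 1; vmseq.vi 0) matches a splat of the byte 0b01010101.
#include <cassert>
#include <cstddef>
#include <vector>

std::vector<bool> evenMaskFromSteps(size_t N) {
  std::vector<bool> M(N);
  for (size_t I = 0; I < N; ++I)
    M[I] = (I & 1) == 0; // step vector, AND with 1, compare against 0
  return M;
}

std::vector<bool> evenMaskFromSplat(size_t N) {
  std::vector<bool> M(N);
  for (size_t I = 0; I < N; ++I)
    M[I] = (0x55u >> (I % 8)) & 1; // mask register filled with 0x55 bytes
  return M;
}

int main() {
  // Same fixed 1,0,1,0,... pattern either way.
  assert(evenMaskFromSteps(64) == evenMaskFromSplat(64));
  return 0;
}
```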
lukel97 left a comment
LGTM