Commit d1b44a1

[RISCV] Use vcompress in deinterleave2 intrinsic lowering
This is analogous to febbf91, which added shuffle lowering via vcompress; we can do the same thing in the deinterleave2 lowering path, which is used for scalable vectors. Note that we can further improve this for high-LMUL cases by adjusting how we materialize the mask (whose result is at most m1, with a known bit pattern). I am deliberately staging the work so that the changes to reduce register pressure can be evaluated on their own merits.
1 parent c6f2d35 commit d1b44a1
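
For intuition, here is a minimal scalar model of the strategy. It is illustrative only: the names compress and deinterleave2 below are invented for this sketch, and the real change builds SelectionDAG nodes (a step vector, an AND with 1, a SETEQ against 0, a logical NOT, and two ISD::VECTOR_COMPRESS nodes) rather than looping over std::vector.

// Scalar sketch of deinterleave2-via-vcompress. Illustrative only; these
// helper names are hypothetical and do not exist in the LLVM tree.
#include <cassert>
#include <cstdint>
#include <vector>

// vcompress.vm semantics: pack the source elements whose mask bit is set,
// in source order, into the low positions of the destination.
static std::vector<uint64_t> compress(const std::vector<uint64_t> &Src,
                                      const std::vector<bool> &Mask) {
  std::vector<uint64_t> Dst;
  for (size_t I = 0; I < Src.size(); ++I)
    if (Mask[I])
      Dst.push_back(Src[I]);
  return Dst;
}

// Deinterleave the concatenated operand pair: the even mask is 1,0,1,0,...
// (vid.v; vand.vi 1; vmseq.vi 0 in the lowering) and the odd mask is its
// complement (vmnot.m), which ends the index vector's live range early.
static void deinterleave2(const std::vector<uint64_t> &Concat,
                          std::vector<uint64_t> &Even,
                          std::vector<uint64_t> &Odd) {
  std::vector<bool> EvenMask(Concat.size()), OddMask(Concat.size());
  for (size_t I = 0; I < Concat.size(); ++I) {
    EvenMask[I] = (I & 1) == 0;
    OddMask[I] = !EvenMask[I];
  }
  // Each compress leaves its results in the low half of the wide type; the
  // lowering then extracts that half with EXTRACT_SUBVECTOR.
  Even = compress(Concat, EvenMask);
  Odd = compress(Concat, OddMask);
}

int main() {
  std::vector<uint64_t> Even, Odd;
  deinterleave2({10, 11, 20, 21, 30, 31}, Even, Odd);
  assert((Even == std::vector<uint64_t>{10, 20, 30}));
  assert((Odd == std::vector<uint64_t>{11, 21, 31}));
  return 0;
}

The masks here are exactly what the new CHECK lines below compute with vid.v + vand.vi + vmseq.vi + vmnot.m ahead of the two vcompress.vm instructions.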

3 files changed (+101, -164 lines)

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 21 additions & 15 deletions
@@ -10756,9 +10756,6 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT,
                                Op.getOperand(0), Op.getOperand(1));
 
-  // We want to operate on all lanes, so get the mask and VL and mask for it
-  auto [Mask, VL] = getDefaultScalableVLOps(ConcatVT, DL, DAG, Subtarget);
-  SDValue Passthru = DAG.getUNDEF(ConcatVT);
 
   // We can deinterleave through vnsrl.wi if the element type is smaller than
   // ELEN
@@ -10771,19 +10768,28 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
   }
 
   // For the indices, use the same SEW to avoid an extra vsetvli
+  // TODO: If container type is larger than m1, we can consider using a splat
+  // of a constant instead of the following sequence
+
+  // Create a vector of even indices {0, 1, 2, ...}
   MVT IdxVT = ConcatVT.changeVectorElementTypeToInteger();
-  // Create a vector of even indices {0, 2, 4, ...}
-  SDValue EvenIdx =
-      DAG.getStepVector(DL, IdxVT, APInt(IdxVT.getScalarSizeInBits(), 2));
-  // Create a vector of odd indices {1, 3, 5, ... }
-  SDValue OddIdx =
-      DAG.getNode(ISD::ADD, DL, IdxVT, EvenIdx, DAG.getConstant(1, DL, IdxVT));
-
-  // Gather the even and odd elements into two separate vectors
-  SDValue EvenWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, ConcatVT,
-                                 Concat, EvenIdx, Passthru, Mask, VL);
-  SDValue OddWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, ConcatVT,
-                                Concat, OddIdx, Passthru, Mask, VL);
+  SDValue StepVec = DAG.getStepVector(DL, IdxVT);
+  // 0, 1, 0, 1, 0, 1
+  SDValue ZeroOnes = DAG.getNode(ISD::AND, DL, IdxVT, StepVec,
+                                 DAG.getConstant(1, DL, IdxVT));
+  MVT MaskVT = ConcatVT.changeVectorElementType(MVT::i1);
+  SDValue EvenMask = DAG.getSetCC(DL, MaskVT, ZeroOnes,
+                                  DAG.getConstant(0, DL, IdxVT),
+                                  ISD::CondCode::SETEQ);
+  // Have the later be the not of the former to minimize the live range of
+  // the index vector since that might be large.
+  SDValue OddMask = DAG.getLogicalNOT(DL, EvenMask, MaskVT);
+
+  // vcompress the even and odd elements into two separate vectors
+  SDValue EvenWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat,
+                                 EvenMask, DAG.getUNDEF(ConcatVT));
+  SDValue OddWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat,
+                                OddMask, DAG.getUNDEF(ConcatVT));
 
   // Extract the result half of the gather for even and odd
   SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide,

llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll

Lines changed: 18 additions & 58 deletions
@@ -106,95 +106,55 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 40
+; CHECK-NEXT:    li a2, 24
 ; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    vadd.vv v24, v8, v8
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vmseq.vi v24, v8, 0
 ; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vadd.vi v8, v24, 1
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vrgather.vv v8, v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vrgather.vv v24, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmnot.m v6, v24
+; CHECK-NEXT:    vcompress.vm v8, v16, v24
+; CHECK-NEXT:    vmv1r.v v13, v24
+; CHECK-NEXT:    vcompress.vm v24, v16, v6
+; CHECK-NEXT:    vmv1r.v v12, v6
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vrgather.vv v24, v16, v0
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vcompress.vm v0, v16, v13
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vrgather.vv v16, v24, v0
+; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vcompress.vm v0, v16, v12
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmv4r.v v12, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmv4r.v v28, v16
 ; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 40
+; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16

llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll

Lines changed: 62 additions & 91 deletions
@@ -73,12 +73,13 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vadd.vv v16, v12, v12
-; CHECK-NEXT:    vrgather.vv v12, v8, v16
-; CHECK-NEXT:    vadd.vi v16, v16, 1
-; CHECK-NEXT:    vrgather.vv v20, v8, v16
+; CHECK-NEXT:    vand.vi v12, v12, 1
+; CHECK-NEXT:    vmseq.vi v16, v12, 0
+; CHECK-NEXT:    vcompress.vm v12, v8, v16
+; CHECK-NEXT:    vmnot.m v14, v16
+; CHECK-NEXT:    vcompress.vm v16, v8, v14
 ; CHECK-NEXT:    vmv2r.v v8, v12
-; CHECK-NEXT:    vmv2r.v v10, v20
+; CHECK-NEXT:    vmv2r.v v10, v16
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
@@ -89,12 +90,13 @@ define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_nxv4i64_nxv
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vadd.vv v24, v16, v16
-; CHECK-NEXT:    vrgather.vv v16, v8, v24
-; CHECK-NEXT:    vadd.vi v24, v24, 1
-; CHECK-NEXT:    vrgather.vv v0, v8, v24
+; CHECK-NEXT:    vand.vi v16, v16, 1
+; CHECK-NEXT:    vmseq.vi v24, v16, 0
+; CHECK-NEXT:    vcompress.vm v16, v8, v24
+; CHECK-NEXT:    vmnot.m v20, v24
+; CHECK-NEXT:    vcompress.vm v24, v8, v20
 ; CHECK-NEXT:    vmv4r.v v8, v16
-; CHECK-NEXT:    vmv4r.v v12, v0
+; CHECK-NEXT:    vmv4r.v v12, v24
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
   ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
@@ -180,66 +182,50 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v24, v8
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vadd.vv v0, v8, v8
-; CHECK-NEXT:    vrgather.vv v8, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vrgather.vv v16, v8, v0
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vadd.vi v8, v0, 1
-; CHECK-NEXT:    vrgather.vv v0, v24, v8
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vrgather.vv v16, v24, v8
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    vand.vi v24, v16, 1
+; CHECK-NEXT:    vmseq.vi v16, v24, 0
+; CHECK-NEXT:    vcompress.vm v24, v8, v16
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmnot.m v17, v16
+; CHECK-NEXT:    vcompress.vm v0, v8, v17
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vcompress.vm v24, v8, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vcompress.vm v24, v8, v17
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmv4r.v v20, v8
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v4, v8
+; CHECK-NEXT:    vmv4r.v v4, v24
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -366,12 +352,13 @@ define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vadd.vv v16, v12, v12
-; CHECK-NEXT:    vrgather.vv v12, v8, v16
-; CHECK-NEXT:    vadd.vi v16, v16, 1
-; CHECK-NEXT:    vrgather.vv v20, v8, v16
+; CHECK-NEXT:    vand.vi v12, v12, 1
+; CHECK-NEXT:    vmseq.vi v16, v12, 0
+; CHECK-NEXT:    vcompress.vm v12, v8, v16
+; CHECK-NEXT:    vmnot.m v14, v16
+; CHECK-NEXT:    vcompress.vm v16, v8, v14
 ; CHECK-NEXT:    vmv2r.v v8, v12
-; CHECK-NEXT:    vmv2r.v v10, v20
+; CHECK-NEXT:    vmv2r.v v10, v16
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
   ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
@@ -436,66 +423,50 @@ define {<vscale x 8 x double>, <vscale x 8 x double>} @vector_deinterleave_nxv8f
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v24, v8
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vadd.vv v0, v8, v8
-; CHECK-NEXT:    vrgather.vv v8, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vrgather.vv v16, v8, v0
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vadd.vi v8, v0, 1
-; CHECK-NEXT:    vrgather.vv v0, v24, v8
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vrgather.vv v16, v24, v8
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    vand.vi v24, v16, 1
+; CHECK-NEXT:    vmseq.vi v16, v24, 0
+; CHECK-NEXT:    vcompress.vm v24, v8, v16
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmnot.m v17, v16
+; CHECK-NEXT:    vcompress.vm v0, v8, v17
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vcompress.vm v24, v8, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vcompress.vm v24, v8, v17
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmv4r.v v20, v8
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v4, v8
+; CHECK-NEXT:    vmv4r.v v4, v24
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
