Skip to content

Commit b52efa5

Browse files
[LLVM][CodeGen][SVE] Add lowering for 3-way VECTOR_(DE)INTERLEAVE operations. (#162502)
SVE has no in-register instructions to do this, but we can perform the operation through memory by using ld3/st3.
1 parent 6f1ce2b commit b52efa5

File tree

3 files changed

+653
-28
lines changed

3 files changed

+653
-28
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30606,6 +30606,43 @@ AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
3060630606
assert(OpVT.isScalableVector() &&
3060730607
"Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
3060830608

30609+
if (Op->getNumOperands() == 3) {
30610+
// aarch64_sve_ld3 only supports packed datatypes.
30611+
EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
30612+
Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
30613+
SDValue StackPtr =
30614+
DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
30615+
30616+
// Write out unmodified operands.
30617+
SmallVector<SDValue, 3> Chains;
30618+
for (unsigned I = 0; I < 3; ++I) {
30619+
SDValue Ptr =
30620+
DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
30621+
SDValue V = getSVESafeBitCast(PackedVT, Op.getOperand(I), DAG);
30622+
Chains.push_back(
30623+
DAG.getStore(DAG.getEntryNode(), DL, V, Ptr, MachinePointerInfo()));
30624+
}
30625+
30626+
Intrinsic::ID IntID = Intrinsic::aarch64_sve_ld3_sret;
30627+
EVT PredVT = PackedVT.changeVectorElementType(MVT::i1);
30628+
30629+
SmallVector<SDValue, 7> Ops;
30630+
Ops.push_back(DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains));
30631+
Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
30632+
Ops.push_back(DAG.getConstant(1, DL, PredVT));
30633+
Ops.push_back(StackPtr);
30634+
30635+
// Read back and deinterleave data.
30636+
SDVTList VTs = DAG.getVTList(PackedVT, PackedVT, PackedVT, MVT::Other);
30637+
SDValue LD3 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops);
30638+
30639+
SmallVector<SDValue, 3> Results;
30640+
Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(0), DAG));
30641+
Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(1), DAG));
30642+
Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(2), DAG));
30643+
return DAG.getMergeValues(Results, DL);
30644+
}
30645+
3060930646
// Are multi-register uzp instructions available?
3061030647
if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
3061130648
OpVT.getVectorElementType() != MVT::i1) {
@@ -30647,6 +30684,42 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
3064730684
assert(OpVT.isScalableVector() &&
3064830685
"Expected scalable vector in LowerVECTOR_INTERLEAVE.");
3064930686

30687+
if (Op->getNumOperands() == 3) {
30688+
// aarch64_sve_st3 only supports packed datatypes.
30689+
EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
30690+
SmallVector<SDValue, 3> InVecs;
30691+
for (SDValue V : Op->ops())
30692+
InVecs.push_back(getSVESafeBitCast(PackedVT, V, DAG));
30693+
30694+
Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
30695+
SDValue StackPtr =
30696+
DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
30697+
30698+
Intrinsic::ID IntID = Intrinsic::aarch64_sve_st3;
30699+
EVT PredVT = PackedVT.changeVectorElementType(MVT::i1);
30700+
30701+
SmallVector<SDValue, 7> Ops;
30702+
Ops.push_back(DAG.getEntryNode());
30703+
Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
30704+
Ops.append(InVecs);
30705+
Ops.push_back(DAG.getConstant(1, DL, PredVT));
30706+
Ops.push_back(StackPtr);
30707+
30708+
// Interleave operands and store.
30709+
SDValue Chain = DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops);
30710+
30711+
// Read back the interleaved data.
30712+
SmallVector<SDValue, 3> Results;
30713+
for (unsigned I = 0; I < 3; ++I) {
30714+
SDValue Ptr =
30715+
DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
30716+
SDValue L = DAG.getLoad(PackedVT, DL, Chain, Ptr, MachinePointerInfo());
30717+
Results.push_back(getSVESafeBitCast(OpVT, L, DAG));
30718+
}
30719+
30720+
return DAG.getMergeValues(Results, DL);
30721+
}
30722+
3065030723
// Are multi-register zip instructions available?
3065130724
if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
3065230725
OpVT.getVectorElementType() != MVT::i1) {

llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll

Lines changed: 268 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,274 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
231231
ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
232232
}
233233

234+
define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv6f16(<vscale x 6 x half> %vec) {
235+
; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv6f16:
236+
; CHECK: // %bb.0:
237+
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
238+
; CHECK-NEXT: addvl sp, sp, #-3
239+
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
240+
; CHECK-NEXT: .cfi_offset w29, -16
241+
; CHECK-NEXT: uunpkhi z1.s, z0.h
242+
; CHECK-NEXT: uunpklo z0.s, z0.h
243+
; CHECK-NEXT: ptrue p0.d
244+
; CHECK-NEXT: uunpklo z1.d, z1.s
245+
; CHECK-NEXT: uunpkhi z2.d, z0.s
246+
; CHECK-NEXT: uunpklo z0.d, z0.s
247+
; CHECK-NEXT: str z1, [sp, #2, mul vl]
248+
; CHECK-NEXT: str z2, [sp, #1, mul vl]
249+
; CHECK-NEXT: str z0, [sp]
250+
; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
251+
; CHECK-NEXT: addvl sp, sp, #3
252+
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
253+
; CHECK-NEXT: ret
254+
%retval = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave3.nxv6f16(<vscale x 6 x half> %vec)
255+
ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %retval
256+
}
257+
258+
define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv12f16(<vscale x 12 x half> %vec) {
259+
; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv12f16:
260+
; CHECK: // %bb.0:
261+
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
262+
; CHECK-NEXT: addvl sp, sp, #-3
263+
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
264+
; CHECK-NEXT: .cfi_offset w29, -16
265+
; CHECK-NEXT: uunpklo z1.s, z1.h
266+
; CHECK-NEXT: uunpkhi z2.s, z0.h
267+
; CHECK-NEXT: uunpklo z0.s, z0.h
268+
; CHECK-NEXT: ptrue p0.s
269+
; CHECK-NEXT: str z1, [sp, #2, mul vl]
270+
; CHECK-NEXT: str z2, [sp, #1, mul vl]
271+
; CHECK-NEXT: str z0, [sp]
272+
; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp]
273+
; CHECK-NEXT: addvl sp, sp, #3
274+
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
275+
; CHECK-NEXT: ret
276+
%retval = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave3.nxv12f16(<vscale x 12 x half> %vec)
277+
ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %retval
278+
}
279+
280+
define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv24f16(<vscale x 24 x half> %vec) {
281+
; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv24f16:
282+
; CHECK: // %bb.0:
283+
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
284+
; CHECK-NEXT: addvl sp, sp, #-3
285+
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
286+
; CHECK-NEXT: .cfi_offset w29, -16
287+
; CHECK-NEXT: ptrue p0.h
288+
; CHECK-NEXT: str z2, [sp, #2, mul vl]
289+
; CHECK-NEXT: str z1, [sp, #1, mul vl]
290+
; CHECK-NEXT: str z0, [sp]
291+
; CHECK-NEXT: ld3h { z0.h - z2.h }, p0/z, [sp]
292+
; CHECK-NEXT: addvl sp, sp, #3
293+
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
294+
; CHECK-NEXT: ret
295+
%retval = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave3.nxv24f16(<vscale x 24 x half> %vec)
296+
ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %retval
297+
}
298+
299+
define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv6f32(<vscale x 6 x float> %vec) {
300+
; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv6f32:
301+
; CHECK: // %bb.0:
302+
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
303+
; CHECK-NEXT: addvl sp, sp, #-3
304+
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
305+
; CHECK-NEXT: .cfi_offset w29, -16
306+
; CHECK-NEXT: uunpklo z1.d, z1.s
307+
; CHECK-NEXT: uunpkhi z2.d, z0.s
308+
; CHECK-NEXT: uunpklo z0.d, z0.s
309+
; CHECK-NEXT: ptrue p0.d
310+
; CHECK-NEXT: str z1, [sp, #2, mul vl]
311+
; CHECK-NEXT: str z2, [sp, #1, mul vl]
312+
; CHECK-NEXT: str z0, [sp]
313+
; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
314+
; CHECK-NEXT: addvl sp, sp, #3
315+
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
316+
; CHECK-NEXT: ret
317+
%retval = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave3.nxv6f32(<vscale x 6 x float> %vec)
318+
ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %retval
319+
}
320+
321+
define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv12f32(<vscale x 12 x float> %vec) {
322+
; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv12f32:
323+
; CHECK: // %bb.0:
324+
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
325+
; CHECK-NEXT: addvl sp, sp, #-3
326+
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
327+
; CHECK-NEXT: .cfi_offset w29, -16
328+
; CHECK-NEXT: ptrue p0.s
329+
; CHECK-NEXT: str z2, [sp, #2, mul vl]
330+
; CHECK-NEXT: str z1, [sp, #1, mul vl]
331+
; CHECK-NEXT: str z0, [sp]
332+
; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp]
333+
; CHECK-NEXT: addvl sp, sp, #3
334+
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
335+
; CHECK-NEXT: ret
336+
%retval = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave3.nxv12f32(<vscale x 12 x float> %vec)
337+
ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %retval
338+
}
339+
340+
define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv6f64(<vscale x 6 x double> %vec) {
341+
; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv6f64:
342+
; CHECK: // %bb.0:
343+
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
344+
; CHECK-NEXT: addvl sp, sp, #-3
345+
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
346+
; CHECK-NEXT: .cfi_offset w29, -16
347+
; CHECK-NEXT: ptrue p0.d
348+
; CHECK-NEXT: str z2, [sp, #2, mul vl]
349+
; CHECK-NEXT: str z1, [sp, #1, mul vl]
350+
; CHECK-NEXT: str z0, [sp]
351+
; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
352+
; CHECK-NEXT: addvl sp, sp, #3
353+
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
354+
; CHECK-NEXT: ret
355+
%retval = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave3.nxv6f64(<vscale x 6 x double> %vec)
356+
ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %retval
357+
}
358+
359+
define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv6bf16(<vscale x 6 x bfloat> %vec) {
360+
; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv6bf16:
361+
; CHECK: // %bb.0:
362+
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
363+
; CHECK-NEXT: addvl sp, sp, #-3
364+
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
365+
; CHECK-NEXT: .cfi_offset w29, -16
366+
; CHECK-NEXT: uunpkhi z1.s, z0.h
367+
; CHECK-NEXT: uunpklo z0.s, z0.h
368+
; CHECK-NEXT: ptrue p0.d
369+
; CHECK-NEXT: uunpklo z1.d, z1.s
370+
; CHECK-NEXT: uunpkhi z2.d, z0.s
371+
; CHECK-NEXT: uunpklo z0.d, z0.s
372+
; CHECK-NEXT: str z1, [sp, #2, mul vl]
373+
; CHECK-NEXT: str z2, [sp, #1, mul vl]
374+
; CHECK-NEXT: str z0, [sp]
375+
; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
376+
; CHECK-NEXT: addvl sp, sp, #3
377+
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
378+
; CHECK-NEXT: ret
379+
%retval = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave3.nxv6bf16(<vscale x 6 x bfloat> %vec)
380+
ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %retval
381+
}
382+
383+
define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv12bf16(<vscale x 12 x bfloat> %vec) {
384+
; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv12bf16:
385+
; CHECK: // %bb.0:
386+
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
387+
; CHECK-NEXT: addvl sp, sp, #-3
388+
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
389+
; CHECK-NEXT: .cfi_offset w29, -16
390+
; CHECK-NEXT: uunpklo z1.s, z1.h
391+
; CHECK-NEXT: uunpkhi z2.s, z0.h
392+
; CHECK-NEXT: uunpklo z0.s, z0.h
393+
; CHECK-NEXT: ptrue p0.s
394+
; CHECK-NEXT: str z1, [sp, #2, mul vl]
395+
; CHECK-NEXT: str z2, [sp, #1, mul vl]
396+
; CHECK-NEXT: str z0, [sp]
397+
; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp]
398+
; CHECK-NEXT: addvl sp, sp, #3
399+
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
400+
; CHECK-NEXT: ret
401+
%retval = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave3.nxv12bf16(<vscale x 12 x bfloat> %vec)
402+
ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %retval
403+
}
404+
405+
define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8bf16_nxv24bf16(<vscale x 24 x bfloat> %vec) {
406+
; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv24bf16:
407+
; CHECK: // %bb.0:
408+
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
409+
; CHECK-NEXT: addvl sp, sp, #-3
410+
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
411+
; CHECK-NEXT: .cfi_offset w29, -16
412+
; CHECK-NEXT: ptrue p0.h
413+
; CHECK-NEXT: str z2, [sp, #2, mul vl]
414+
; CHECK-NEXT: str z1, [sp, #1, mul vl]
415+
; CHECK-NEXT: str z0, [sp]
416+
; CHECK-NEXT: ld3h { z0.h - z2.h }, p0/z, [sp]
417+
; CHECK-NEXT: addvl sp, sp, #3
418+
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
419+
; CHECK-NEXT: ret
420+
%retval = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave3.nxv24bf16(<vscale x 24 x bfloat> %vec)
421+
ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %retval
422+
}
423+
424+
; Integers
425+
426+
define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv48i8(<vscale x 48 x i8> %vec) {
427+
; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv48i8:
428+
; CHECK: // %bb.0:
429+
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
430+
; CHECK-NEXT: addvl sp, sp, #-3
431+
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
432+
; CHECK-NEXT: .cfi_offset w29, -16
433+
; CHECK-NEXT: ptrue p0.b
434+
; CHECK-NEXT: str z2, [sp, #2, mul vl]
435+
; CHECK-NEXT: str z1, [sp, #1, mul vl]
436+
; CHECK-NEXT: str z0, [sp]
437+
; CHECK-NEXT: ld3b { z0.b - z2.b }, p0/z, [sp]
438+
; CHECK-NEXT: addvl sp, sp, #3
439+
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
440+
; CHECK-NEXT: ret
441+
%retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave3.nxv48i8(<vscale x 48 x i8> %vec)
442+
ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
443+
}
444+
445+
define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv24i16(<vscale x 24 x i16> %vec) {
446+
; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv24i16:
447+
; CHECK: // %bb.0:
448+
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
449+
; CHECK-NEXT: addvl sp, sp, #-3
450+
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
451+
; CHECK-NEXT: .cfi_offset w29, -16
452+
; CHECK-NEXT: ptrue p0.h
453+
; CHECK-NEXT: str z2, [sp, #2, mul vl]
454+
; CHECK-NEXT: str z1, [sp, #1, mul vl]
455+
; CHECK-NEXT: str z0, [sp]
456+
; CHECK-NEXT: ld3h { z0.h - z2.h }, p0/z, [sp]
457+
; CHECK-NEXT: addvl sp, sp, #3
458+
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
459+
; CHECK-NEXT: ret
460+
%retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave3.nxv24i16(<vscale x 24 x i16> %vec)
461+
ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
462+
}
463+
464+
define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv12i32(<vscale x 12 x i32> %vec) {
465+
; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv12i32:
466+
; CHECK: // %bb.0:
467+
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
468+
; CHECK-NEXT: addvl sp, sp, #-3
469+
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
470+
; CHECK-NEXT: .cfi_offset w29, -16
471+
; CHECK-NEXT: ptrue p0.s
472+
; CHECK-NEXT: str z2, [sp, #2, mul vl]
473+
; CHECK-NEXT: str z1, [sp, #1, mul vl]
474+
; CHECK-NEXT: str z0, [sp]
475+
; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp]
476+
; CHECK-NEXT: addvl sp, sp, #3
477+
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
478+
; CHECK-NEXT: ret
479+
%retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %vec)
480+
ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
481+
}
482+
483+
define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv6i64(<vscale x 6 x i64> %vec) {
484+
; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv6i64:
485+
; CHECK: // %bb.0:
486+
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
487+
; CHECK-NEXT: addvl sp, sp, #-3
488+
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
489+
; CHECK-NEXT: .cfi_offset w29, -16
490+
; CHECK-NEXT: ptrue p0.d
491+
; CHECK-NEXT: str z2, [sp, #2, mul vl]
492+
; CHECK-NEXT: str z1, [sp, #1, mul vl]
493+
; CHECK-NEXT: str z0, [sp]
494+
; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
495+
; CHECK-NEXT: addvl sp, sp, #3
496+
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
497+
; CHECK-NEXT: ret
498+
%retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> %vec)
499+
ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
500+
}
501+
234502
define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv64i8(<vscale x 64 x i8> %vec) {
235503
; SVE-LABEL: vector_deinterleave_nxv16i8_nxv64i8:
236504
; SVE: // %bb.0:
@@ -599,31 +867,3 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_nxv2i32_nxv
599867
%retval = call {<vscale x 2 x i32>,<vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %vec)
600868
ret {<vscale x 2 x i32>, <vscale x 2 x i32>} %retval
601869
}
602-
603-
; Floating declarations
604-
declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
605-
declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
606-
declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
607-
declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
608-
declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
609-
declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
610-
611-
; Integer declarations
612-
declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
613-
declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
614-
declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
615-
declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
616-
617-
; Predicated declarations
618-
declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
619-
declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1>)
620-
declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1>)
621-
declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1>)
622-
623-
; Illegal size type
624-
declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
625-
declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
626-
627-
declare {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
628-
declare {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
629-
declare {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)

0 commit comments

Comments
 (0)