Commit 57b797f
[LLVM][CodeGen][SVE] Add lowering for ISD::VECREDUCE_MUL/FMUL. (llvm#161842)
We might be able to do better by using SVE2, and perhaps even NEON for the final stages, but this version works everywhere, so it seems like a good place to start.

Fixes llvm#155468
1 parent d2a8486 commit 57b797f

4 files changed: +333 -3 lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 32 additions & 0 deletions
@@ -1561,6 +1561,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
       setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
       setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
       setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
@@ -1717,6 +1718,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
       setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
       setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
@@ -7775,6 +7777,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   case ISD::VECREDUCE_FMAXIMUM:
   case ISD::VECREDUCE_FMINIMUM:
     return LowerVECREDUCE(Op, DAG);
+  case ISD::VECREDUCE_MUL:
+  case ISD::VECREDUCE_FMUL:
+    return LowerVECREDUCE_MUL(Op, DAG);
   case ISD::ATOMIC_LOAD_AND:
     return LowerATOMIC_LOAD_AND(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
@@ -16794,6 +16799,33 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
   }
 }
 
+SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Src = Op.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  assert(SrcVT.isScalableVector() && "Unexpected operand type!");
+
+  SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
+  unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
+  SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags());
+
+  // Whilst we don't know the size of the vector we do know the maximum size,
+  // so we can perform a tree reduction with an identity vector, which means
+  // once we arrive at the result the remaining stages (when the vector is
+  // smaller than the maximum) have no effect.
+
+  unsigned Segments = AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
+  unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());
+
+  for (unsigned I = 0; I < Stages; ++I) {
+    Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
+    Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
+  }
+
+  return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
+}
+
 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
                                                     SelectionDAG &DAG) const {
   auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
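Each iteration of the loop above deinterleaves the pair (Src, Identity) and multiplies the two halves, folding adjacent live lanes together while back-filling the tail of the register with the multiplicative identity. Below is a minimal standalone C++ model of the emitted uzp1/uzp2/mul sequence; it is a sketch rather than part of the patch: reduceMulModel is a hypothetical name, Z.size() stands in for the runtime lane count, and the two constants mirror AArch64::SVEMaxBitsPerVector and AArch64::SVEBitsPerBlock.

#include <cassert>
#include <cstdint>
#include <vector>

// Architectural SVE limits used by the lowering: register size is a
// multiple of 128 bits, up to 2048 bits, so at most 16 segments.
constexpr unsigned SVEMaxBitsPerVector = 2048;
constexpr unsigned SVEBitsPerBlock = 128;

// Scalar model of the code generated for @llvm.vector.reduce.mul.nxv2i64.
// Z.size() is the runtime lane count (2 * vscale); MinElts is the type's
// minimum element count (2 for nxv2i64).
uint64_t reduceMulModel(std::vector<uint64_t> Z, unsigned MinElts) {
  const unsigned VL = Z.size();
  const unsigned Segments = SVEMaxBitsPerVector / SVEBitsPerBlock;
  unsigned Stages = 0; // Log2_32(Segments * MinElts)
  for (unsigned N = Segments * MinElts; N > 1; N /= 2)
    ++Stages;
  const std::vector<uint64_t> Ones(VL, 1); // identity vector: mov z1.d, #1
  for (unsigned S = 0; S < Stages; ++S) {
    // uzp1/uzp2: even/odd lanes of concat(Z, Ones), i.e. the two results
    // of ISD::VECTOR_DEINTERLEAVE on (Src, Identity).
    std::vector<uint64_t> Lo(VL), Hi(VL);
    for (unsigned I = 0; I < VL; ++I) {
      Lo[I] = 2 * I < VL ? Z[2 * I] : Ones[2 * I - VL];
      Hi[I] = 2 * I + 1 < VL ? Z[2 * I + 1] : Ones[2 * I + 1 - VL];
    }
    for (unsigned I = 0; I < VL; ++I)
      Z[I] = Lo[I] * Hi[I]; // mul z0.d, p0/m, z0.d, z2.d
  }
  return Z[0]; // extract lane 0
}

int main() {
  // A 256-bit implementation gives four i64 lanes; the stage count is
  // computed for the 2048-bit worst case and the extra stages are no-ops.
  assert(reduceMulModel({2, 3, 5, 7}, 2) == 210);
}

Once the live data has collapsed into lane 0, every remaining stage multiplies each lane by 1, which is why running the worst-case stage count is correct for any implemented vector length.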

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -752,6 +752,7 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVECREDUCE_MUL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;

llvm/test/CodeGen/AArch64/sve-fp-reduce.ll

Lines changed: 175 additions & 3 deletions
@@ -359,12 +359,177 @@ define float @fadd_reduct_reassoc_v4v8f32(<vscale x 4 x float> %a, <vscale x 8 x
   ret float %r
 }
 
+; No FMULV instruction so use knowledge about the architectural maximum size of
+; an SVE register to "scalarise" the reduction.
+
+define half @fmulv_nxv2f16(half %init, <vscale x 2 x half> %a) {
+; CHECK-LABEL: fmulv_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z2.h, #1.00000000
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    fmul h0, h0, h1
+; CHECK-NEXT:    ret
+  %res = call fast half @llvm.vector.reduce.fmul.nxv2f16(half %init, <vscale x 2 x half> %a)
+  ret half %res
+}
+
+define half @fmulv_nxv4f16(half %init, <vscale x 4 x half> %a) {
+; CHECK-LABEL: fmulv_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z2.h, #1.00000000
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    fmul h0, h0, h1
+; CHECK-NEXT:    ret
+  %res = call fast half @llvm.vector.reduce.fmul.nxv4f16(half %init, <vscale x 4 x half> %a)
+  ret half %res
+}
+
+define half @fmulv_nxv8f16(half %init, <vscale x 8 x half> %a) {
+; CHECK-LABEL: fmulv_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z2.h, #1.00000000
+; CHECK-NEXT:    uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    fmul z1.h, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    fmul z1.h, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    fmul z1.h, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    fmul z1.h, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    fmul z1.h, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    fmul z1.h, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    fmul z1.h, z1.h, z3.h
+; CHECK-NEXT:    fmul h0, h0, h1
+; CHECK-NEXT:    ret
+  %res = call fast half @llvm.vector.reduce.fmul.nxv8f16(half %init, <vscale x 8 x half> %a)
+  ret half %res
+}
+
+define float @fmulv_nxv2f32(float %init, <vscale x 2 x float> %a) {
+; CHECK-LABEL: fmulv_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z2.s, #1.00000000
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    fmul s0, s0, s1
+; CHECK-NEXT:    ret
+  %res = call fast float @llvm.vector.reduce.fmul.nxv2f32(float %init, <vscale x 2 x float> %a)
+  ret float %res
+}
+
+define float @fmulv_nxv4f32(float %init, <vscale x 4 x float> %a) {
+; CHECK-LABEL: fmulv_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z2.s, #1.00000000
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z1.s, z3.s
+; CHECK-NEXT:    fmul s0, s0, s1
+; CHECK-NEXT:    ret
+  %res = call fast float @llvm.vector.reduce.fmul.nxv4f32(float %init, <vscale x 4 x float> %a)
+  ret float %res
+}
+
+define double @fmulv_nxv2f64(double %init, <vscale x 2 x double> %a) {
+; CHECK-LABEL: fmulv_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z2.d, #1.00000000
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z3.d
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z3.d
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z3.d
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z3.d
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z3.d
+; CHECK-NEXT:    fmul d0, d0, d1
+; CHECK-NEXT:    ret
+  %res = call fast double @llvm.vector.reduce.fmul.nxv2f64(double %init, <vscale x 2 x double> %a)
+  ret double %res
+}
+
 declare half @llvm.vector.reduce.fadd.nxv2f16(half, <vscale x 2 x half>)
 declare half @llvm.vector.reduce.fadd.nxv4f16(half, <vscale x 4 x half>)
 declare half @llvm.vector.reduce.fadd.nxv8f16(half, <vscale x 8 x half>)
-declare half @llvm.vector.reduce.fadd.nxv6f16(half, <vscale x 6 x half>)
-declare half @llvm.vector.reduce.fadd.nxv10f16(half, <vscale x 10 x half>)
-declare half @llvm.vector.reduce.fadd.nxv12f16(half, <vscale x 12 x half>)
 declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
 declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
 declare float @llvm.vector.reduce.fadd.nxv8f32(float, <vscale x 8 x float>)
@@ -397,3 +562,10 @@ declare half @llvm.vector.reduce.fminimum.nxv8f16(<vscale x 8 x half>)
 declare float @llvm.vector.reduce.fminimum.nxv2f32(<vscale x 2 x float>)
 declare float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float>)
 declare double @llvm.vector.reduce.fminimum.nxv2f64(<vscale x 2 x double>)
+
+declare half @llvm.vector.reduce.fmul.nxv2f16(half, <vscale x 2 x half>)
+declare half @llvm.vector.reduce.fmul.nxv4f16(half, <vscale x 4 x half>)
+declare half @llvm.vector.reduce.fmul.nxv8f16(half, <vscale x 8 x half>)
+declare float @llvm.vector.reduce.fmul.nxv2f32(float, <vscale x 2 x float>)
+declare float @llvm.vector.reduce.fmul.nxv4f32(float, <vscale x 4 x float>)
+declare double @llvm.vector.reduce.fmul.nxv2f64(double, <vscale x 2 x double>)
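Each uzp2/uzp1/fmul triple in the CHECK lines above is one reduction stage, and the stage count is Log2_32(Segments * MinElts) with Segments = 2048 / 128 = 16. A small standalone sketch (illustrative, not part of the patch) that reproduces the per-type counts:

#include <cstdio>

int main() {
  // Segments = SVEMaxBitsPerVector / SVEBitsPerBlock = 2048 / 128.
  const unsigned Segments = 2048 / 128;
  const struct { const char *Ty; unsigned MinElts; } Tests[] = {
      {"nxv2f16", 2}, {"nxv4f16", 4}, {"nxv8f16", 8},
      {"nxv2f32", 2}, {"nxv4f32", 4}, {"nxv2f64", 2}};
  for (const auto &T : Tests) {
    unsigned Stages = 0; // Log2_32(Segments * MinElts)
    for (unsigned N = Segments * T.MinElts; N > 1; N /= 2)
      ++Stages;
    std::printf("%s: %u stages\n", T.Ty, Stages);
  }
  return 0;
}

This prints 5, 6, 7, 5, 6 and 5, matching the number of z-register fmul instructions in each test above.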

llvm/test/CodeGen/AArch64/sve-int-reduce.ll

Lines changed: 125 additions & 0 deletions
@@ -369,6 +369,131 @@ define i64 @smax_nxv2i64(<vscale x 2 x i64> %a) {
   ret i64 %res
 }
 
+; No MULV instruction so use knowledge about the architectural maximum size of
+; an SVE register to "scalarise" the reduction.
+
+define i8 @mulv_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: mulv_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.b, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.mul.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @mulv_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: mulv_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.h, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.mul.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @mulv_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: mulv_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.s, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @mulv_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: mulv_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.mul.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
 ; Test widen vector reduce type
 declare i8 @llvm.vector.reduce.smin.nxv10i8(<vscale x 10 x i8>)
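To see why the trailing stages are harmless, here is a hand trace of mulv_nxv2i64 on a hypothetical 256-bit implementation, where each z register holds four doublewords while the stage count is computed for the 2048-bit maximum, log2(16 * 2) = 5:

z0 = { 2, 3, 5, 7 }              z1 = { 1, 1, 1, 1 }   // identity
stage 1: uzp1 { 2, 5, 1, 1 }, uzp2 { 3, 7, 1, 1 }, mul { 6, 35, 1, 1 }
stage 2: uzp1 { 6, 1, 1, 1 }, uzp2 { 35, 1, 1, 1 }, mul { 210, 1, 1, 1 }
stages 3-5: every lane is now paired with 1, so lane 0 stays 210
fmov x0, d0 returns 210 = 2 * 3 * 5 * 7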
374499
